author     Sergei Golubchik <vuvova@gmail.com>    2015-05-04 19:15:28 +0200
committer  Sergei Golubchik <vuvova@gmail.com>    2015-05-04 19:15:28 +0200
commit     14a142fca67b9e1fb3f0250fda093f5b967f0138 (patch)
tree       dd49e0666c863d80b5c50642e36a9c945ea12b8a /storage/xtradb
parent     dfb001edcd4b16bd4370b08b0176df78c4c5523f (diff)
download   mariadb-git-14a142fca67b9e1fb3f0250fda093f5b967f0138.tar.gz
move to storage/xtradb
Diffstat (limited to 'storage/xtradb')
365 files changed, 327935 insertions, 0 deletions
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
new file mode 100644
index 00000000000..452f5c8e35d
--- /dev/null
+++ b/storage/xtradb/CMakeLists.txt
@@ -0,0 +1,435 @@
+# Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+# This is the CMakeLists for InnoDB
+
+INCLUDE(CheckFunctionExists)
+INCLUDE(CheckCSourceCompiles)
+INCLUDE(CheckCSourceRuns)
+
+# OS tests
+IF(UNIX)
+  IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
+    CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
+    ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
+    IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
+      ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
+      LINK_LIBRARIES(aio)
+    ENDIF()
+  ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
+    ADD_DEFINITIONS("-DUNIV_HPUX")
+  ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
+    ADD_DEFINITIONS("-DUNIV_AIX")
+  ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+    ADD_DEFINITIONS("-DUNIV_SOLARIS")
+  ENDIF()
+ENDIF()
+
+IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+# After: WL#5825 Using C++ Standard Library with MySQL code
+#        we no longer use -fno-exceptions
+#  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
+ENDIF()
+
+# Enable InnoDB's UNIV_DEBUG and UNIV_SYNC_DEBUG in debug builds
+SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG -DUNIV_SYNC_DEBUG")
+
+# Add -Wconversion if compiling with GCC
+## As of Mar 15 2011 this flag causes 3573+ warnings. If you are reading this
+## please fix them and enable the following code:
+#IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+#  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion")
+#ENDIF()
+
+CHECK_FUNCTION_EXISTS(sched_getcpu HAVE_SCHED_GETCPU)
+
+IF(NOT MSVC)
+# either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
+IF(NOT CMAKE_CROSSCOMPILING)
+  CHECK_C_SOURCE_RUNS(
+  "
+  int main()
+  {
+    long x;
+    long y;
+    long res;
+
+    x = 10;
+    y = 123;
+    res = __sync_bool_compare_and_swap(&x, x, y);
+    if (!res || x != y) {
+      return(1);
+    }
+
+    x = 10;
+    y = 123;
+    res = __sync_bool_compare_and_swap(&x, x + 1, y);
+    if (res || x != 10) {
+      return(1);
+    }
+    x = 10;
+    y = 123;
+    res = __sync_add_and_fetch(&x, y);
+    if (res != 123 + 10 || x != 123 + 10) {
+      return(1);
+    }
+    return(0);
+  }"
+  HAVE_IB_GCC_ATOMIC_BUILTINS
+  )
+  CHECK_C_SOURCE_RUNS(
+  "
+  int main()
+  {
+    long res;
+    char c;
+
+    c = 10;
+    res = __sync_lock_test_and_set(&c, 123);
+    if (res != 10 || c != 123) {
+      return(1);
+    }
+    return(0);
+  }"
+  HAVE_IB_GCC_ATOMIC_BUILTINS_BYTE
+  )
+  CHECK_C_SOURCE_RUNS(
+  "#include<stdint.h>
+  int main()
+  {
+    int64_t x,y,res;
+
+    x = 10;
+    y = 123;
+    res = __sync_sub_and_fetch(&y, x);
+    if (res != y || y != 113) {
+      return(1);
+    }
+    res = __sync_add_and_fetch(&y, x);
+    if (res != y || y != 123) {
+      return(1);
+    }
+    return(0);
+  }"
+  HAVE_IB_GCC_ATOMIC_BUILTINS_64
+  )
+  CHECK_C_SOURCE_RUNS(
+  "#include<stdint.h>
+  int main()
+  {
+    __sync_synchronize();
+    return(0);
+  }"
+  HAVE_IB_GCC_SYNC_SYNCHRONISE
+  )
+  CHECK_C_SOURCE_RUNS(
+  "#include<stdint.h>
+  int main()
+  {
+    __atomic_thread_fence(__ATOMIC_ACQUIRE);
+    __atomic_thread_fence(__ATOMIC_RELEASE);
+    return(0);
+  }"
+  HAVE_IB_GCC_ATOMIC_THREAD_FENCE
+  )
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS)
+  ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS_BYTE)
+  ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_BYTE=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS_64)
+  ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_64=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_SYNC_SYNCHRONISE)
+  ADD_DEFINITIONS(-DHAVE_IB_GCC_SYNC_SYNCHRONISE=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_THREAD_FENCE)
+  ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_THREAD_FENCE=1)
+ENDIF()
+
+# either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
+IF(NOT CMAKE_CROSSCOMPILING)
+  CHECK_C_SOURCE_RUNS(
+  "
+  #include <pthread.h>
+  #include <string.h>
+
+  int main() {
+    pthread_t x1;
+    pthread_t x2;
+    pthread_t x3;
+
+    memset(&x1, 0x0, sizeof(x1));
+    memset(&x2, 0x0, sizeof(x2));
+    memset(&x3, 0x0, sizeof(x3));
+
+    __sync_bool_compare_and_swap(&x1, x2, x3);
+
+    return(0);
+  }"
+  HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+ENDIF()
+IF(HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+  ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_GCC=1)
+ENDIF()
+
+ENDIF(NOT MSVC)
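These probes are run, not merely compiled (CHECK_C_SOURCE_RUNS), so the result reflects what the build machine's toolchain and CPU actually support. The standalone sketch below is an editorial illustration, not part of the commit; it exercises the same GCC builtins the checks above detect and should exit 0 wherever those checks would set their HAVE_IB_GCC_* variables.

    /* Sketch: build with `gcc demo.c && ./demo`; exit status 0
       corresponds to the configure probes above succeeding. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        long x = 10;

        /* CAS succeeds only when *ptr equals the expected value. */
        if (!__sync_bool_compare_and_swap(&x, 10, 123) || x != 123)
            return 1;

        int64_t v = 123;                /* 64-bit atomic arithmetic */
        if (__sync_sub_and_fetch(&v, 10) != 113)
            return 1;

        char c = 10;                    /* byte-wide atomic exchange */
        if (__sync_lock_test_and_set(&c, 123) != 10 || c != 123)
            return 1;

        __sync_synchronize();           /* full memory barrier */
        printf("GCC atomic builtins usable\n");
        return 0;
    }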
+
+CHECK_FUNCTION_EXISTS(asprintf HAVE_ASPRINTF)
+CHECK_FUNCTION_EXISTS(vasprintf HAVE_VASPRINTF)
+
+# Solaris atomics
+IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+  CHECK_FUNCTION_EXISTS(atomic_cas_ulong HAVE_ATOMIC_CAS_ULONG)
+  CHECK_FUNCTION_EXISTS(atomic_cas_32 HAVE_ATOMIC_CAS_32)
+  CHECK_FUNCTION_EXISTS(atomic_cas_64 HAVE_ATOMIC_CAS_64)
+  CHECK_FUNCTION_EXISTS(atomic_add_long_nv HAVE_ATOMIC_ADD_LONG_NV)
+  CHECK_FUNCTION_EXISTS(atomic_swap_uchar HAVE_ATOMIC_SWAP_UCHAR)
+  IF(HAVE_ATOMIC_CAS_ULONG AND
+     HAVE_ATOMIC_CAS_32 AND
+     HAVE_ATOMIC_CAS_64 AND
+     HAVE_ATOMIC_ADD_LONG_NV AND
+     HAVE_ATOMIC_SWAP_UCHAR)
+    SET(HAVE_IB_SOLARIS_ATOMICS 1)
+  ENDIF()
+
+  IF(HAVE_IB_SOLARIS_ATOMICS)
+    ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1)
+  ENDIF()
+
+  IF(NOT CMAKE_CROSSCOMPILING)
+  # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
+  CHECK_C_SOURCE_COMPILES(
+  "#include <pthread.h>
+   #include <string.h>
+
+   int main(int argc, char** argv) {
+     pthread_t x1;
+     pthread_t x2;
+     pthread_t x3;
+
+     memset(&x1, 0x0, sizeof(x1));
+     memset(&x2, 0x0, sizeof(x2));
+     memset(&x3, 0x0, sizeof(x3));
+
+     if (sizeof(pthread_t) == 4) {
+
+       atomic_cas_32(&x1, x2, x3);
+
+     } else if (sizeof(pthread_t) == 8) {
+
+       atomic_cas_64(&x1, x2, x3);
+
+     } else {
+
+       return(1);
+     }
+
+     return(0);
+   }
+  " HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+  CHECK_C_SOURCE_COMPILES(
+  "#include <mbarrier.h>
+   int main() {
+     __machine_r_barrier();
+     __machine_w_barrier();
+     return(0);
+   }"
+   HAVE_IB_MACHINE_BARRIER_SOLARIS)
+  ENDIF()
+  IF(HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+    ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_SOLARIS=1)
+  ENDIF()
+  IF(HAVE_IB_MACHINE_BARRIER_SOLARIS)
+    ADD_DEFINITIONS(-DHAVE_IB_MACHINE_BARRIER_SOLARIS=1)
+  ENDIF()
+ENDIF()
+
+
+IF(UNIX)
+# this is needed to know which one of atomic_cas_32() or atomic_cas_64()
+# to use in the source
+SET(CMAKE_EXTRA_INCLUDE_FILES pthread.h)
+CHECK_TYPE_SIZE(pthread_t SIZEOF_PTHREAD_T)
+SET(CMAKE_EXTRA_INCLUDE_FILES)
+ENDIF()
+
+IF(SIZEOF_PTHREAD_T)
+  ADD_DEFINITIONS(-DSIZEOF_PTHREAD_T=${SIZEOF_PTHREAD_T})
+ENDIF()
+
+IF(MSVC)
+  ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS)
+  ADD_DEFINITIONS(-DHAVE_WINDOWS_MM_FENCE)
+ENDIF()
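SIZEOF_PTHREAD_T is exported as a define precisely so the source can pick atomic_cas_32() or atomic_cas_64() when treating an opaque pthread_t as an atomic word on Solaris. The sketch below is an editorial illustration of that dispatch, not code from the commit: os_cas_thread_id is a hypothetical helper name, and it compiles only where Solaris <atomic.h> is available and the build supplies -DSIZEOF_PTHREAD_T.

    /* Sketch: compile-time dispatch on pthread_t width, mirroring the
       configure probe above. Solaris-only. */
    #include <atomic.h>
    #include <pthread.h>
    #include <stdint.h>

    /* Returns nonzero when the swap from old_id to new_id happened.
       SIZEOF_PTHREAD_T comes from the CMake check above. */
    static int
    os_cas_thread_id(pthread_t* ptr, pthread_t old_id, pthread_t new_id)
    {
    #if SIZEOF_PTHREAD_T == 4
        return(atomic_cas_32((volatile uint32_t*) ptr,
                             (uint32_t) old_id, (uint32_t) new_id)
               == (uint32_t) old_id);
    #elif SIZEOF_PTHREAD_T == 8
        return(atomic_cas_64((volatile uint64_t*) ptr,
                             (uint64_t) old_id, (uint64_t) new_id)
               == (uint64_t) old_id);
    #else
    # error "unexpected SIZEOF_PTHREAD_T"
    #endif
    }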
+
+
+# Include directories under innobase
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include
+                    ${CMAKE_SOURCE_DIR}/storage/innobase/handler)
+
+# Sun Studio bug with -xO2
+IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro"
+   AND CMAKE_CXX_FLAGS_RELEASE MATCHES "O2"
+   AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  # Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
+  # -xO3
+  SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.cc
+    PROPERTIES COMPILE_FLAGS -xO3)
+ENDIF()
+
+# Removing compiler optimizations for innodb/mem/* files on 64-bit Windows
+# due to 64-bit compiler error, See MySQL Bug #19424, #36366, #34297
+IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+  SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.cc mem/mem0pool.cc
+    PROPERTIES COMPILE_FLAGS -Od)
+ENDIF()
+
+SET(INNOBASE_SOURCES
+  api/api0api.cc
+  api/api0misc.cc
+  btr/btr0btr.cc
+  btr/btr0cur.cc
+  btr/btr0pcur.cc
+  btr/btr0sea.cc
+  buf/buf0buddy.cc
+  buf/buf0buf.cc
+  buf/buf0dblwr.cc
+  buf/buf0checksum.cc
+  buf/buf0dump.cc
+  buf/buf0flu.cc
+  buf/buf0lru.cc
+  buf/buf0rea.cc
+  data/data0data.cc
+  data/data0type.cc
+  dict/dict0boot.cc
+  dict/dict0crea.cc
+  dict/dict0dict.cc
+  dict/dict0load.cc
+  dict/dict0mem.cc
+  dict/dict0stats.cc
+  dict/dict0stats_bg.cc
+  dyn/dyn0dyn.cc
+  eval/eval0eval.cc
+  eval/eval0proc.cc
+  fil/fil0fil.cc
+  fsp/fsp0fsp.cc
+  fut/fut0fut.cc
+  fut/fut0lst.cc
+  ha/ha0ha.cc
+  ha/ha0storage.cc
+  ha/hash0hash.cc
+  fts/fts0fts.cc
+  fts/fts0ast.cc
+  fts/fts0blex.cc
+  fts/fts0config.cc
+  fts/fts0opt.cc
+  fts/fts0pars.cc
+  fts/fts0que.cc
+  fts/fts0sql.cc
+  fts/fts0tlex.cc
+  handler/ha_innodb.cc
+  handler/handler0alter.cc
+  handler/i_s.cc
+  handler/xtradb_i_s.cc
+  ibuf/ibuf0ibuf.cc
+  lock/lock0iter.cc
+  lock/lock0lock.cc
+  lock/lock0wait.cc
+  log/log0log.cc
+  log/log0online.cc
+  log/log0recv.cc
+  mach/mach0data.cc
+  mem/mem0mem.cc
+  mem/mem0pool.cc
+  mtr/mtr0log.cc
+  mtr/mtr0mtr.cc
+  os/os0file.cc
+  os/os0proc.cc
+  os/os0sync.cc
+  os/os0thread.cc
+  page/page0cur.cc
+  page/page0page.cc
+  page/page0zip.cc
+  pars/lexyy.cc
+  pars/pars0grm.cc
+  pars/pars0opt.cc
+  pars/pars0pars.cc
+  pars/pars0sym.cc
+  que/que0que.cc
+  read/read0read.cc
+  rem/rem0cmp.cc
+  rem/rem0rec.cc
+  row/row0ext.cc
+  row/row0ftsort.cc
+  row/row0import.cc
+  row/row0ins.cc
+  row/row0merge.cc
+  row/row0mysql.cc
+  row/row0log.cc
+  row/row0purge.cc
+  row/row0row.cc
+  row/row0sel.cc
+  row/row0uins.cc
+  row/row0umod.cc
+  row/row0undo.cc
+  row/row0upd.cc
+  row/row0quiesce.cc
+  row/row0vers.cc
+  srv/srv0conc.cc
+  srv/srv0mon.cc
+  srv/srv0srv.cc
+  srv/srv0start.cc
+  sync/sync0arr.cc
+  sync/sync0rw.cc
+  sync/sync0sync.cc
+  trx/trx0i_s.cc
+  trx/trx0purge.cc
+  trx/trx0rec.cc
+  trx/trx0roll.cc
+  trx/trx0rseg.cc
+  trx/trx0sys.cc
+  trx/trx0trx.cc
+  trx/trx0undo.cc
+  usr/usr0sess.cc
+  ut/ut0bh.cc
+  ut/ut0byte.cc
+  ut/ut0crc32.cc
+  ut/ut0dbg.cc
+  ut/ut0list.cc
+  ut/ut0mem.cc
+  ut/ut0rbt.cc
+  ut/ut0rnd.cc
+  ut/ut0ut.cc
+  ut/ut0vec.cc
+  ut/ut0wqueue.cc)
+
+IF(WITH_INNODB)
+  # Legacy option
+  SET(WITH_INNOBASE_STORAGE_ENGINE TRUE)
+ENDIF()
+
+MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
+  DEFAULT
+  MODULE_OUTPUT_NAME ha_innodb
+  LINK_LIBRARIES ${ZLIB_LIBRARY})
diff --git a/storage/xtradb/COPYING.Google b/storage/xtradb/COPYING.Google
new file mode 100644
index 00000000000..5ade2b0e381
--- /dev/null
+++ b/storage/xtradb/COPYING.Google
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Google, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials
+    provided with the distribution.
+  * Neither the name of the Google Inc. nor the names of its
+    contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/COPYING.Percona b/storage/xtradb/COPYING.Percona
new file mode 100644
index 00000000000..8c786811719
--- /dev/null
+++ b/storage/xtradb/COPYING.Percona
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Percona, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, 2009, Percona Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials
+    provided with the distribution.
+  * Neither the name of the Percona Inc. nor the names of its
+    contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/Doxyfile b/storage/xtradb/Doxyfile
new file mode 100644
index 00000000000..7cf5048fa52
--- /dev/null
+++ b/storage/xtradb/Doxyfile
@@ -0,0 +1,1419 @@
+# Doxyfile 1.5.6
+
+# Usage: SVNVERSION=-r$(svnversion) doxygen
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = "InnoDB Plugin"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER = 1.0$(SVNVERSION)
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = dox
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek,
+# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish,
+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish,
+# and Ukrainian.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful is your file systems
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member
+# documentation.
+
+DETAILS_AT_TOP = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen to replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = YES
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespace are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST = YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page. This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be abled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = . include/univ.i
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS = *.c *.ic *.h
+
+# The RECURSIVE tag can be used to turn specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain image that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentstion.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to FRAME, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature. Other possible values
+# for this tag are: HIERARCHIES, which will generate the Groups, Directories,
+# and Class Hiererachy pages using a tree view instead of an ordered list;
+# ALL, which combines the behavior of FRAME and HIERARCHIES; and NONE, which
+# disables this behavior completely. For backwards compatibility with previous
+# releases of Doxygen, the values YES and NO are equivalent to FRAME and NONE
+# respectively.
+
+GENERATE_TREEVIEW = NONE
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. 
+ +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. 
+ +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = YES + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = YES + +# If the SEARCH_INCLUDES tag is set to YES (the default) the include files +# in the INCLUDE_PATH (see below) will be searched if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used.
+ +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = DOXYGEN UNIV_DEBUG UNIV_SYNC_DEBUG __attribute__()= + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = UT_LIST_BASE_NODE_T UT_LIST_NODE_T + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = NO + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback.
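To make the PREDEFINED entry above concrete: the `__attribute__()=` definition erases GCC attribute clauses before doxygen parses a declaration. A hedged C++ illustration (the function below is hypothetical, not from this tree):

// What the source says:
void fatal_exit(int code) __attribute__((noreturn));
// What doxygen parses once PREDEFINED defines __attribute__() away:
void fatal_exit(int code);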
It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default). + +HAVE_DOT = YES + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a different-looking font) you can specify the font name +# using DOT_FONTNAME. You need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +DOT_FONTNAME = FreeSans + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class reference variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct group dependencies. + +GROUP_GRAPHS = NO + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file.
+ +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will show a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then doxygen will not show the graph at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lie further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 3 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is enabled by default, which results in a transparent +# background. Warning: Depending on the platform used, enabling this option +# may lead to badly anti-aliased labels on the edges of a graph (i.e. they +# become hard to read). + +DOT_TRANSPARENT = YES + +# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line).
This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to the search engine +#--------------------------------------------------------------------------- + +# The SEARCHENGINE tag specifies whether or not a search engine should be +# used. If set to NO the values of all tags below this one will be ignored. + +SEARCHENGINE = NO diff --git a/storage/xtradb/api/api0api.cc b/storage/xtradb/api/api0api.cc new file mode 100644 index 00000000000..2f5999e9a3a --- /dev/null +++ b/storage/xtradb/api/api0api.cc @@ -0,0 +1,4061 @@ +/***************************************************************************** + +Copyright (c) 2008, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file api/api0api.cc +InnoDB Native API + +2008-08-01 Created Sunny Bains +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +*******************************************************/ + +#include "univ.i" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#include "api0api.h" +#include "api0misc.h" +#include "srv0start.h" +#include "dict0dict.h" +#include "btr0pcur.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0vers.h" +#include "trx0roll.h" +#include "dict0crea.h" +#include "row0merge.h" +#include "pars0pars.h" +#include "lock0types.h" +#include "row0sel.h" +#include "lock0lock.h" +#include "rem0cmp.h" +#include "ut0dbg.h" +#include "dict0priv.h" +#include "ut0ut.h" +#include "ha_prototypes.h" +#include "trx0roll.h" + +/** configure variable for binlog option with InnoDB APIs */ +my_bool ib_binlog_enabled = FALSE; + +/** configure variable for MDL option with InnoDB APIs */ +my_bool ib_mdl_enabled = FALSE; + +/** configure variable for disable rowlock with InnoDB APIs */ +my_bool ib_disable_row_lock = FALSE; + +/** configure variable for Transaction isolation levels */ +ulong ib_trx_level_setting = IB_TRX_READ_UNCOMMITTED; + +/** configure variable for background commit interval in seconds */ +ulong ib_bk_commit_interval = 0; + +/** InnoDB tuple types. 
*/ +enum ib_tuple_type_t{ + TPL_TYPE_ROW, /*!< Data row tuple */ + TPL_TYPE_KEY /*!< Index key tuple */ +}; + +/** Query types supported. */ +enum ib_qry_type_t{ + QRY_NON, /*!< None/Sentinel */ + QRY_INS, /*!< Insert operation */ + QRY_UPD, /*!< Update operation */ + QRY_SEL /*!< Select operation */ +}; + +/** Query graph types. */ +struct ib_qry_grph_t { + que_fork_t* ins; /*!< Innobase SQL query graph used + in inserts */ + que_fork_t* upd; /*!< Innobase SQL query graph used + in updates or deletes */ + que_fork_t* sel; /*!< dummy query graph used in + selects */ +}; + +/** Query node types. */ +struct ib_qry_node_t { + ins_node_t* ins; /*!< Innobase SQL insert node + used to perform inserts to the table */ + upd_node_t* upd; /*!< Innobase SQL update node + used to perform updates and deletes */ + sel_node_t* sel; /*!< Innobase SQL select node + used to perform selects on the table */ +}; + +/** Query processing fields. */ +struct ib_qry_proc_t { + + ib_qry_node_t node; /*!< Query node */ + + ib_qry_grph_t grph; /*!< Query graph */ +}; + +/** Cursor instance for traversing tables/indexes. This will eventually +become row_prebuilt_t. */ +struct ib_cursor_t { + mem_heap_t* heap; /*!< Instance heap */ + + mem_heap_t* query_heap; /*!< Heap to use for query graphs */ + + ib_qry_proc_t q_proc; /*!< Query processing info */ + + ib_match_mode_t match_mode; /*!< ib_cursor_moveto match mode */ + + row_prebuilt_t* prebuilt; /*!< For reading rows */ + + bool valid_trx; /*!< Valid transaction attached */ +}; + +/** InnoDB table columns used during table and index schema creation. */ +struct ib_col_t { + const char* name; /*!< Name of column */ + + ib_col_type_t ib_col_type; /*!< Main type of the column */ + + ulint len; /*!< Length of the column */ + + ib_col_attr_t ib_col_attr; /*!< Column attributes */ + +}; + +/** InnoDB index columns used during index and index schema creation. */ +struct ib_key_col_t { + const char* name; /*!< Name of column */ + + ulint prefix_len; /*!< Column index prefix len or 0 */ +}; + +struct ib_table_def_t; + +/** InnoDB index schema used during index creation */ +struct ib_index_def_t { + mem_heap_t* heap; /*!< Heap used to build this and all + its columns in the list */ + + const char* name; /*!< Index name */ + + dict_table_t* table; /*!< Parent InnoDB table */ + + ib_table_def_t* schema; /*!< Parent table schema that owns + this instance */ + + ibool clustered; /*!< True if clustered index */ + + ibool unique; /*!< True if unique index */ + + ib_vector_t* cols; /*!< Vector of columns */ + + trx_t* usr_trx; /*!< User transaction covering the + DDL operations */ +}; + +/** InnoDB table schema used during table creation */ +struct ib_table_def_t { + mem_heap_t* heap; /*!< Heap used to build this and all + its columns in the list */ + const char* name; /*!< Table name */ + + ib_tbl_fmt_t ib_tbl_fmt; /*!< Row format */ + + ulint page_size; /*!< Page size */ + + ib_vector_t* cols; /*!< Vector of columns */ + + ib_vector_t* indexes; /*!< Vector of indexes */ + + dict_table_t* table; /* Table read from or NULL */ +}; + +/** InnoDB tuple used for key operations. */ +struct ib_tuple_t { + mem_heap_t* heap; /*!< Heap used to build + this and for copying + the column values. */ + + ib_tuple_type_t type; /*!< Tuple discriminator. */ + + const dict_index_t* index; /*!< Index for tuple can be either + secondary or cluster index.
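The type discriminator above decides how fields are addressed when a record is read back: TPL_TYPE_ROW tuples are indexed by table column number, TPL_TYPE_KEY tuples by position within the index (see ib_read_tuple() below).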
*/ + + dtuple_t* ptr; /*!< The internal tuple + instance */ +}; + +/** The following counter is used to convey information to InnoDB +about server activity: in case of normal DML ops it is not +sensible to call srv_active_wake_master_thread after each +operation, we only do it every INNOBASE_WAKE_INTERVAL'th step. */ + +#define INNOBASE_WAKE_INTERVAL 32 + +/*****************************************************************//** +Check whether the Innodb persistent cursor is positioned. +@return IB_TRUE if positioned */ +UNIV_INLINE +ib_bool_t +ib_btr_cursor_is_positioned( +/*========================*/ + btr_pcur_t* pcur) /*!< in: InnoDB persistent cursor */ +{ + return(pcur->old_stored == BTR_PCUR_OLD_STORED + && (pcur->pos_state == BTR_PCUR_IS_POSITIONED + || pcur->pos_state == BTR_PCUR_WAS_POSITIONED)); +} + + +/********************************************************************//** +Open a table using the table id, if found then increment table ref count. +@return table instance if found */ +static +dict_table_t* +ib_open_table_by_id( +/*================*/ + ib_id_u64_t tid, /*!< in: table id to lookup */ + ib_bool_t locked) /*!< in: TRUE if own dict mutex */ +{ + dict_table_t* table; + table_id_t table_id; + + table_id = tid; + + if (!locked) { + dict_mutex_enter_for_mysql(); + } + + table = dict_table_open_on_id(table_id, FALSE, DICT_TABLE_OP_NORMAL); + + if (table != NULL && table->ibd_file_missing) { + table = NULL; + } + + if (!locked) { + dict_mutex_exit_for_mysql(); + } + + return(table); +} + +/********************************************************************//** +Open a table using the table name, if found then increment table ref count. +@return table instance if found */ +UNIV_INTERN +void* +ib_open_table_by_name( +/*==================*/ + const char* name) /*!< in: table name to lookup */ +{ + dict_table_t* table; + + table = dict_table_open_on_name(name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); + + if (table != NULL && table->ibd_file_missing) { + table = NULL; + } + + return(table); +} + +/********************************************************************//** +Find table using table name. +@return table instance if found */ +static +dict_table_t* +ib_lookup_table_by_name( +/*====================*/ + const char* name) /*!< in: table name to lookup */ +{ + dict_table_t* table; + + table = dict_table_get_low(name); + + if (table != NULL && table->ibd_file_missing) { + table = NULL; + } + + return(table); +} + +/********************************************************************//** +Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth +time calls srv_active_wake_master_thread. This function should be used +when a single database operation may introduce a small need for +server utility activity, like checkpointing. */ +UNIV_INLINE +void +ib_wake_master_thread(void) +/*=======================*/ +{ + static ulint ib_signal_counter = 0; + + ++ib_signal_counter; + + if ((ib_signal_counter % INNOBASE_WAKE_INTERVAL) == 0) { + srv_active_wake_master_thread(); + } +} + +/*********************************************************************//** +Calculate the max row size of the columns in a cluster index. +@return max row length */ +UNIV_INLINE +ulint +ib_get_max_row_len( +/*===============*/ + dict_index_t* cluster) /*!< in: cluster index */ +{ + ulint i; + ulint max_len = 0; + ulint n_fields = cluster->n_fields; + + /* Add the size of the ordering columns in the + clustered index. 
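For instance (a hypothetical illustration, not a schema from this file), a 4-byte INT field contributes 4 and a VARCHAR(32) field contributes 32 to the total, since the loop below sums dict_col_get_max_size() over every field of the index.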
*/ + for (i = 0; i < n_fields; ++i) { + const dict_col_t* col; + + col = dict_index_get_nth_col(cluster, i); + + /* Use the maximum output size of + mach_write_compressed(), although the encoded + length should always fit in 2 bytes. */ + max_len += dict_col_get_max_size(col); + } + + return(max_len); +} + +/*****************************************************************//** +Read the columns from a rec into a tuple. */ +static +void +ib_read_tuple( +/*==========*/ + const rec_t* rec, /*!< in: Record to read */ + ib_bool_t page_format, /*!< in: IB_TRUE if compressed format */ + ib_tuple_t* tuple, /*!< in: tuple to read into */ + void** rec_buf, /*!< in/out: row buffer */ + ulint* len) /*!< in/out: buffer len */ +{ + ulint i; + void* ptr; + rec_t* copy; + ulint rec_meta_data; + ulint n_index_fields; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + dtuple_t* dtuple = tuple->ptr; + const dict_index_t* index = tuple->index; + ulint offset_size; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, &tuple->heap); + + rec_meta_data = rec_get_info_bits(rec, page_format); + dtuple_set_info_bits(dtuple, rec_meta_data); + + offset_size = rec_offs_size(offsets); + + if (rec_buf && *rec_buf) { + if (*len < offset_size) { + free(*rec_buf); + *rec_buf = malloc(offset_size); + *len = offset_size; + } + ptr = *rec_buf; + } else { + /* Make a copy of the rec. */ + ptr = mem_heap_alloc(tuple->heap, offset_size); + } + + copy = rec_copy(ptr, rec, offsets); + + n_index_fields = ut_min( + rec_offs_n_fields(offsets), dtuple_get_n_fields(dtuple)); + + for (i = 0; i < n_index_fields; ++i) { + ulint len; + const byte* data; + dfield_t* dfield; + + if (tuple->type == TPL_TYPE_ROW) { + const dict_col_t* col; + ulint col_no; + const dict_field_t* index_field; + + index_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(index_field); + col_no = dict_col_get_no(col); + + dfield = dtuple_get_nth_field(dtuple, col_no); + } else { + dfield = dtuple_get_nth_field(dtuple, i); + } + + data = rec_get_nth_field(copy, offsets, i, &len); + + /* Fetch and copy any externally stored column. */ + if (rec_offs_nth_extern(offsets, i)) { + + ulint zip_size; + + zip_size = dict_table_zip_size(index->table); + + data = btr_rec_copy_externally_stored_field( + copy, offsets, zip_size, i, &len, + tuple->heap); + + ut_a(len != UNIV_SQL_NULL); + } + + dfield_set_data(dfield, data, len); + } +} + +/*****************************************************************//** +Create an InnoDB key tuple. +@return tuple instance created, or NULL */ +static +ib_tpl_t +ib_key_tuple_new_low( +/*=================*/ + const dict_index_t* index, /*!< in: index for which tuple + required */ + ulint n_cols, /*!< in: no. of user defined cols */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ib_tuple_t* tuple; + ulint i; + ulint n_cmp_cols; + + tuple = static_cast<ib_tuple_t*>( + mem_heap_alloc(heap, sizeof(*tuple))); + + if (tuple == NULL) { + mem_heap_free(heap); + return(NULL); + } + + tuple->heap = heap; + tuple->index = index; + tuple->type = TPL_TYPE_KEY; + + /* Is it a generated clustered index ? */ + if (n_cols == 0) { + ++n_cols; + } + + tuple->ptr = dtuple_create(heap, n_cols); + + /* Copy types and set to SQL_NULL. 
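Note that n_cols counts only user-defined key columns; for a generated clustered index it arrived as 0 and was bumped to 1 above, so the tuple can still carry the internal row id that orders such an index.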
*/ + dict_index_copy_types(tuple->ptr, index, n_cols); + + for (i = 0; i < n_cols; i++) { + + dfield_t* dfield; + + dfield = dtuple_get_nth_field(tuple->ptr, i); + dfield_set_null(dfield); + } + + n_cmp_cols = dict_index_get_n_ordering_defined_by_user(index); + + dtuple_set_n_fields_cmp(tuple->ptr, n_cmp_cols); + + return((ib_tpl_t) tuple); +} + +/*****************************************************************//** +Create an InnoDB key tuple. +@return tuple instance created, or NULL */ +static +ib_tpl_t +ib_key_tuple_new( +/*=============*/ + const dict_index_t* index, /*!< in: index of tuple */ + ulint n_cols) /*!< in: no. of user defined cols */ +{ + mem_heap_t* heap; + + heap = mem_heap_create(64); + + if (heap == NULL) { + return(NULL); + } + + return(ib_key_tuple_new_low(index, n_cols, heap)); +} + +/*****************************************************************//** +Create an InnoDB row tuple. +@return tuple instance, or NULL */ +static +ib_tpl_t +ib_row_tuple_new_low( +/*=================*/ + const dict_index_t* index, /*!< in: index of tuple */ + ulint n_cols, /*!< in: no. of cols in tuple */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ib_tuple_t* tuple; + + tuple = static_cast<ib_tuple_t*>(mem_heap_alloc(heap, sizeof(*tuple))); + + if (tuple == NULL) { + mem_heap_free(heap); + return(NULL); + } + + tuple->heap = heap; + tuple->index = index; + tuple->type = TPL_TYPE_ROW; + + tuple->ptr = dtuple_create(heap, n_cols); + + /* Copy types and set to SQL_NULL. */ + dict_table_copy_types(tuple->ptr, index->table); + + return((ib_tpl_t) tuple); +} + +/*****************************************************************//** +Create an InnoDB row tuple. +@return tuple instance, or NULL */ +static +ib_tpl_t +ib_row_tuple_new( +/*=============*/ + const dict_index_t* index, /*!< in: index of tuple */ + ulint n_cols) /*!< in: no. of cols in tuple */ +{ + mem_heap_t* heap; + + heap = mem_heap_create(64); + + if (heap == NULL) { + return(NULL); + } + + return(ib_row_tuple_new_low(index, n_cols, heap)); +} + +/*****************************************************************//** +Begin a transaction. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_trx_start( +/*=========*/ + ib_trx_t ib_trx, /*!< in: transaction to restart */ + ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */ + ib_bool_t read_write, /*!< in: true if read write + transaction */ + ib_bool_t auto_commit, /*!< in: auto commit after each + single DML */ + void* thd) /*!< in: THD */ +{ + ib_err_t err = DB_SUCCESS; + trx_t* trx = (trx_t*) ib_trx; + + ut_a(ib_trx_level <= IB_TRX_SERIALIZABLE); + + trx->api_trx = true; + trx->api_auto_commit = auto_commit; + trx->read_write = read_write; + + trx_start_if_not_started(trx); + + trx->isolation_level = ib_trx_level; + + /* FIXME: This is a placeholder, we should add an arg that comes + from the client. */ + trx->mysql_thd = static_cast<THD*>(thd); + + return(err); +} + +/*****************************************************************//** +Begin a transaction. This will allocate a new transaction handle and +put the transaction in the active state.
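As a usage sketch only (the isolation level and error handling are arbitrary choices, not mandated by this file), a caller pairs ib_trx_begin() with ib_trx_commit() or ib_trx_rollback(), then frees the handle with ib_trx_release(), since ib_trx_commit() below only commits:

ib_trx_t trx = ib_trx_begin(IB_TRX_REPEATABLE_READ,
                            IB_TRUE,   // read-write
                            IB_FALSE); // no auto-commit
// ... cursor operations ...
ib_err_t err = ib_trx_commit(trx); // or ib_trx_rollback(trx)
ib_trx_release(trx);               // free the handle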
+@return innobase txn handle */ +UNIV_INTERN +ib_trx_t +ib_trx_begin( +/*=========*/ + ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */ + ib_bool_t read_write, /*!< in: true if read write + transaction */ + ib_bool_t auto_commit) /*!< in: auto commit after each + single DML */ +{ + trx_t* trx; + ib_bool_t started; + + trx = trx_allocate_for_mysql(); + + started = ib_trx_start(static_cast<ib_trx_t>(trx), ib_trx_level, + read_write, auto_commit, NULL); + ut_a(started); + + return(static_cast<ib_trx_t>(trx)); +} + +/*****************************************************************//** +Get the transaction's state. +@return transaction state */ +UNIV_INTERN +ib_trx_state_t +ib_trx_state( +/*=========*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + trx_t* trx = (trx_t*) ib_trx; + + return((ib_trx_state_t) trx->state); +} + +/*****************************************************************//** +Get a trx start time. +@return trx start_time */ +UNIV_INTERN +ib_u64_t +ib_trx_get_start_time( +/*==================*/ + ib_trx_t ib_trx) /*!< in: transaction */ +{ + trx_t* trx = (trx_t*) ib_trx; + return(static_cast<ib_u64_t>(trx->start_time)); +} +/*****************************************************************//** +Release the resources of the transaction. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_trx_release( +/*===========*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + trx_t* trx = (trx_t*) ib_trx; + + ut_ad(trx != NULL); + trx_free_for_mysql(trx); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Commit a transaction. This function will also release the schema +latches too. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_commit( +/*==========*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + ib_err_t err = DB_SUCCESS; + trx_t* trx = (trx_t*) ib_trx; + + if (trx->state == TRX_STATE_NOT_STARTED) { + return(err); + } + + trx_commit(trx); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Rollback a transaction. This function will also release the schema +latches too. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_trx_rollback( +/*============*/ + ib_trx_t ib_trx) /*!< in: trx handle */ +{ + ib_err_t err; + trx_t* trx = (trx_t*) ib_trx; + + err = static_cast<ib_err_t>(trx_rollback_for_mysql(trx)); + + /* It should always succeed */ + ut_a(err == DB_SUCCESS); + + return(err); +} + +/*****************************************************************//** +Find an index definition from the index vector using index name. +@return index def. if found else NULL */ +UNIV_INLINE +const ib_index_def_t* +ib_table_find_index( +/*================*/ + ib_vector_t* indexes, /*!< in: vector of indexes */ + const char* name) /*!< in: index name */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(indexes); ++i) { + const ib_index_def_t* index_def; + + index_def = (ib_index_def_t*) ib_vector_get(indexes, i); + + if (innobase_strcasecmp(name, index_def->name) == 0) { + return(index_def); + } + } + + return(NULL); +} + +/*****************************************************************//** +Get the InnoDB internal precise type from the schema column definition. 
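For instance, a column whose attributes are (IB_COL_UNSIGNED | IB_COL_NOT_NULL) maps to a precise type of DATA_UNSIGNED | DATA_NOT_NULL below; the unsigned flag is only legal on IB_INT columns, which the assertion enforces.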
+@return precise type in api format */ +UNIV_INLINE +ulint +ib_col_get_prtype( +/*==============*/ + const ib_col_t* ib_col) /*!< in: column definition */ +{ + ulint prtype = 0; + + if (ib_col->ib_col_attr & IB_COL_UNSIGNED) { + prtype |= DATA_UNSIGNED; + + ut_a(ib_col->ib_col_type == IB_INT); + } + + if (ib_col->ib_col_attr & IB_COL_NOT_NULL) { + prtype |= DATA_NOT_NULL; + } + + return(prtype); +} + +/*****************************************************************//** +Get the InnoDB internal main type from the schema column definition. +@return column main type */ +UNIV_INLINE +ulint +ib_col_get_mtype( +/*==============*/ + const ib_col_t* ib_col) /*!< in: column definition */ +{ + /* Note: The api0api.h types should map directly to + the internal numeric codes. */ + return(ib_col->ib_col_type); +} + +/*****************************************************************//** +Find a column in the column vector with the same name. +@return col. def. if found else NULL */ +UNIV_INLINE +const ib_col_t* +ib_table_find_col( +/*==============*/ + const ib_vector_t* cols, /*!< in: column list head */ + const char* name) /*!< in: column name to find */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(cols); ++i) { + const ib_col_t* ib_col; + + ib_col = static_cast<const ib_col_t*>( + ib_vector_get((ib_vector_t*) cols, i)); + + if (innobase_strcasecmp(ib_col->name, name) == 0) { + return(ib_col); + } + } + + return(NULL); +} + +/*****************************************************************//** +Find a column in the column list with the same name. +@return col. def. if found else NULL */ +UNIV_INLINE +const ib_key_col_t* +ib_index_find_col( +/*==============*/ + ib_vector_t* cols, /*!< in: column list head */ + const char* name) /*!< in: column name to find */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(cols); ++i) { + const ib_key_col_t* ib_col; + + ib_col = static_cast<ib_key_col_t*>(ib_vector_get(cols, i)); + + if (innobase_strcasecmp(ib_col->name, name) == 0) { + return(ib_col); + } + } + + return(NULL); +} + +#ifdef __WIN__ +/*****************************************************************//** +Convert a string to lower case. */ +static +void +ib_to_lower_case( +/*=============*/ + char* ptr) /*!< string to convert to lower case */ +{ + while (*ptr) { + *ptr = tolower(*ptr); + ++ptr; + } +} +#endif /* __WIN__ */ + +/*****************************************************************//** +Normalizes a table name string. A normalized name consists of the +database name catenated to '/' and table name. An example: +test/mytable. On Windows, normalization always puts both the database name +and the table name to lower case. This function can be called for system +tables and they don't have a database component. For tables that don't have +a database component, we don't normalize them to lower case on Windows. +The assumption is that they are system tables that reside in the system +table space. */ +static +void +ib_normalize_table_name( +/*====================*/ + char* norm_name, /*!< out: normalized name as a + null-terminated string */ + const char* name) /*!< in: table name string */ +{ + const char* ptr = name; + + /* Scan name from the end */ + + ptr += ut_strlen(name) - 1; + + /* Find the start of the table name. */ + while (ptr >= name && *ptr != '\\' && *ptr != '/' && ptr > name) { + --ptr; + } + + + /* For system tables there is no '/' or dbname.
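For example, "test/t1" normalizes to "test/t1" (lower-cased on Windows), while a name such as "SYS_TABLES" has no '/' component and is copied through unchanged by the else branch below.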
*/ + ut_a(ptr >= name); + + if (ptr > name) { + const char* db_name; + const char* table_name; + + table_name = ptr + 1; + + --ptr; + + while (ptr >= name && *ptr != '\\' && *ptr != '/') { + ptr--; + } + + db_name = ptr + 1; + + memcpy(norm_name, db_name, + ut_strlen(name) + 1 - (db_name - name)); + + norm_name[table_name - db_name - 1] = '/'; +#ifdef __WIN__ + ib_to_lower_case(norm_name); +#endif + } else { + ut_strcpy(norm_name, name); + } +} + +/*****************************************************************//** +Check whether the table name conforms to our requirements. Currently +we only do a simple check for the presence of a '/'. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_table_name_check( +/*================*/ + const char* name) /*!< in: table name to check */ +{ + const char* slash = NULL; + ulint len = ut_strlen(name); + + if (len < 2 + || *name == '/' + || name[len - 1] == '/' + || (name[0] == '.' && name[1] == '/') + || (name[0] == '.' && name[1] == '.' && name[2] == '/')) { + + return(DB_DATA_MISMATCH); + } + + for ( ; *name; ++name) { +#ifdef __WIN__ + /* Check for reserved characters in DOS filenames. */ + switch (*name) { + case ':': + case '|': + case '"': + case '*': + case '<': + case '>': + return(DB_DATA_MISMATCH); + } +#endif /* __WIN__ */ + if (*name == '/') { + if (slash) { + return(DB_DATA_MISMATCH); + } + slash = name; + } + } + + return(slash ? DB_SUCCESS : DB_DATA_MISMATCH); +} + + + +/*****************************************************************//** +Get an index definition that is tagged as a clustered index. +@return cluster index schema */ +UNIV_INLINE +ib_index_def_t* +ib_find_clustered_index( +/*====================*/ + ib_vector_t* indexes) /*!< in: index defs. to search */ +{ + ulint i; + ulint n_indexes; + + n_indexes = ib_vector_size(indexes); + + for (i = 0; i < n_indexes; ++i) { + ib_index_def_t* ib_index_def; + + ib_index_def = static_cast<ib_index_def_t*>( + ib_vector_get(indexes, i)); + + if (ib_index_def->clustered) { + return(ib_index_def); + } + } + + return(NULL); +} + +/*****************************************************************//** +Get a table id. The caller must have acquired the dictionary mutex. +@return DB_SUCCESS if found */ +static +ib_err_t +ib_table_get_id_low( +/*================*/ + const char* table_name, /*!< in: table to find */ + ib_id_u64_t* table_id) /*!< out: table id if found */ +{ + dict_table_t* table; + ib_err_t err = DB_TABLE_NOT_FOUND; + + *table_id = 0; + + table = ib_lookup_table_by_name(table_name); + + if (table != NULL) { + *table_id = (table->id); + + err = DB_SUCCESS; + } + + return(err); +} + +/*****************************************************************//** +Create an internal cursor instance. 
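Tying back to ib_table_name_check() above: "db/table" returns DB_SUCCESS, while "table" (no '/'), "/table" and "db/" (leading/trailing '/'), and "a/b/c" (a second '/') all return DB_DATA_MISMATCH.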
+@return DB_SUCCESS or err code */ +static +ib_err_t +ib_create_cursor( +/*=============*/ + ib_crsr_t* ib_crsr, /*!< out: InnoDB cursor */ + dict_table_t* table, /*!< in: table instance */ + dict_index_t* index, /*!< in: index to use */ + trx_t* trx) /*!< in: transaction */ +{ + mem_heap_t* heap; + ib_cursor_t* cursor; + ib_err_t err = DB_SUCCESS; + + heap = mem_heap_create(sizeof(*cursor) * 2); + + if (heap != NULL) { + row_prebuilt_t* prebuilt; + + cursor = static_cast<ib_cursor_t*>( + mem_heap_zalloc(heap, sizeof(*cursor))); + + cursor->heap = heap; + + cursor->query_heap = mem_heap_create(64); + + if (cursor->query_heap == NULL) { + mem_heap_free(heap); + + return(DB_OUT_OF_MEMORY); + } + + cursor->prebuilt = row_create_prebuilt(table, 0); + + prebuilt = cursor->prebuilt; + + prebuilt->trx = trx; + + cursor->valid_trx = TRUE; + + prebuilt->table = table; + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->innodb_api = TRUE; + + prebuilt->index = index; + + ut_a(prebuilt->index != NULL); + + if (prebuilt->trx != NULL) { + ++prebuilt->trx->n_mysql_tables_in_use; + + prebuilt->index_usable = + row_merge_is_index_usable( + prebuilt->trx, prebuilt->index); + + /* Assign a read view if the transaction does + not have it yet */ + + trx_assign_read_view(prebuilt->trx); + } + + *ib_crsr = (ib_crsr_t) cursor; + } else { + err = DB_OUT_OF_MEMORY; + } + + return(err); +} + +/*****************************************************************//** +Create an internal cursor instance, and set prebuilt->index to index +with supplied index_id. +@return DB_SUCCESS or err code */ +static +ib_err_t +ib_create_cursor_with_index_id( +/*===========================*/ + ib_crsr_t* ib_crsr, /*!< out: InnoDB cursor */ + dict_table_t* table, /*!< in: table instance */ + ib_id_u64_t index_id, /*!< in: index id or 0 */ + trx_t* trx) /*!< in: transaction */ +{ + dict_index_t* index; + + if (index_id != 0) { + mutex_enter(&dict_sys->mutex); + index = dict_index_find_on_id_low(index_id); + mutex_exit(&dict_sys->mutex); + } else { + index = dict_table_get_first_index(table); + } + + return(ib_create_cursor(ib_crsr, table, index, trx)); +} + +/*****************************************************************//** +Open an InnoDB table and return a cursor handle to it. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_open_table_using_id( +/*==========================*/ + ib_id_u64_t table_id, /*!< in: table id of table to open */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr) /*!< out,own: InnoDB cursor */ +{ + ib_err_t err; + dict_table_t* table; + + if (ib_trx == NULL || !ib_schema_lock_is_exclusive(ib_trx)) { + table = ib_open_table_by_id(table_id, FALSE); + } else { + table = ib_open_table_by_id(table_id, TRUE); + } + + if (table == NULL) { + + return(DB_TABLE_NOT_FOUND); + } + + err = ib_create_cursor_with_index_id(ib_crsr, table, 0, + (trx_t*) ib_trx); + + return(err); +} + +/*****************************************************************//** +Open an InnoDB index and return a cursor handle to it. 
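Note that the 64-bit index id packs the owning table id into its high 32 bits; the function below recovers it with index_id >> 32 before opening the table.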
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_open_index_using_id( +/*==========================*/ + ib_id_u64_t index_id, /*!< in: index id of index to open */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr) /*!< out: InnoDB cursor */ +{ + ib_err_t err; + dict_table_t* table; + ulint table_id = (ulint)( index_id >> 32); + + if (ib_trx == NULL || !ib_schema_lock_is_exclusive(ib_trx)) { + table = ib_open_table_by_id(table_id, FALSE); + } else { + table = ib_open_table_by_id(table_id, TRUE); + } + + if (table == NULL) { + + return(DB_TABLE_NOT_FOUND); + } + + /* We only return the lower 32 bits of the dulint. */ + err = ib_create_cursor_with_index_id( + ib_crsr, table, index_id, (trx_t*) ib_trx); + + if (ib_crsr != NULL) { + const ib_cursor_t* cursor; + + cursor = *(ib_cursor_t**) ib_crsr; + + if (cursor->prebuilt->index == NULL) { + ib_err_t crsr_err; + + crsr_err = ib_cursor_close(*ib_crsr); + ut_a(crsr_err == DB_SUCCESS); + + *ib_crsr = NULL; + } + } + + return(err); +} + +/*****************************************************************//** +Open an InnoDB secondary index cursor and return a cursor handle to it. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_open_index_using_name( +/*============================*/ + ib_crsr_t ib_open_crsr, /*!< in: open/active cursor */ + const char* index_name, /*!< in: secondary index name */ + ib_crsr_t* ib_crsr, /*!< out,own: InnoDB index cursor */ + int* idx_type, /*!< out: index is cluster index */ + ib_id_u64_t* idx_id) /*!< out: index id */ +{ + dict_table_t* table; + dict_index_t* index; + index_id_t index_id = 0; + ib_err_t err = DB_TABLE_NOT_FOUND; + ib_cursor_t* cursor = (ib_cursor_t*) ib_open_crsr; + + *idx_type = 0; + *idx_id = 0; + *ib_crsr = NULL; + + /* We want to increment the ref count, so we do a redundant search. */ + table = dict_table_open_on_id(cursor->prebuilt->table->id, + FALSE, DICT_TABLE_OP_NORMAL); + ut_a(table != NULL); + + /* The first index is always the cluster index. */ + index = dict_table_get_first_index(table); + + /* Traverse the user defined indexes. */ + while (index != NULL) { + if (innobase_strcasecmp(index->name, index_name) == 0) { + index_id = index->id; + *idx_type = index->type; + *idx_id = index_id; + break; + } + index = UT_LIST_GET_NEXT(indexes, index); + } + + if (!index_id) { + dict_table_close(table, FALSE, FALSE); + return(DB_ERROR); + } + + if (index_id > 0) { + ut_ad(index->id == index_id); + err = ib_create_cursor( + ib_crsr, table, index, cursor->prebuilt->trx); + } + + if (*ib_crsr != NULL) { + const ib_cursor_t* cursor; + + cursor = *(ib_cursor_t**) ib_crsr; + + if (cursor->prebuilt->index == NULL) { + err = ib_cursor_close(*ib_crsr); + ut_a(err == DB_SUCCESS); + *ib_crsr = NULL; + } + } + + return(err); +} + +/*****************************************************************//** +Open an InnoDB table and return a cursor handle to it. 
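A hedged end-to-end sketch of the open/close cycle implemented below (the table name is hypothetical and error handling is abbreviated); the name is passed in the database-qualified form that ib_normalize_table_name() expects:

ib_crsr_t crsr;
ib_trx_t  trx = ib_trx_begin(IB_TRX_REPEATABLE_READ, IB_TRUE, IB_FALSE);
ib_err_t  err = ib_cursor_open_table("test/t1", trx, &crsr);
if (err == DB_SUCCESS) {
        // ... reads and writes through the cursor ...
        err = ib_cursor_close(crsr); // frees the cursor
}
ib_trx_commit(trx);
ib_trx_release(trx);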
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_open_table( +/*=================*/ + const char* name, /*!< in: table name */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr) /*!< out,own: InnoDB cursor */ +{ + ib_err_t err; + dict_table_t* table; + char* normalized_name; + + normalized_name = static_cast<char*>(mem_alloc(ut_strlen(name) + 1)); + ib_normalize_table_name(normalized_name, name); + + if (ib_trx != NULL) { + if (!ib_schema_lock_is_exclusive(ib_trx)) { + table = (dict_table_t*)ib_open_table_by_name( + normalized_name); + } else { + /* NOTE: We do not acquire MySQL metadata lock */ + table = ib_lookup_table_by_name(normalized_name); + } + } else { + table = (dict_table_t*)ib_open_table_by_name(normalized_name); + } + + mem_free(normalized_name); + normalized_name = NULL; + + /* It can happen that another thread has created the table but + not the cluster index or it's a broken table definition. Refuse to + open if that's the case. */ + if (table != NULL && dict_table_get_first_index(table) == NULL) { + table = NULL; + } + + if (table != NULL) { + err = ib_create_cursor_with_index_id(ib_crsr, table, 0, + (trx_t*) ib_trx); + } else { + err = DB_TABLE_NOT_FOUND; + } + + return(err); +} + +/********************************************************************//** +Free a context struct for a table handle. */ +static +void +ib_qry_proc_free( +/*=============*/ + ib_qry_proc_t* q_proc) /*!< in, own: qproc struct */ +{ + que_graph_free_recursive(q_proc->grph.ins); + que_graph_free_recursive(q_proc->grph.upd); + que_graph_free_recursive(q_proc->grph.sel); + + memset(q_proc, 0x0, sizeof(*q_proc)); +} + +/*****************************************************************//** +set a cursor trx to NULL */ +UNIV_INTERN +void +ib_cursor_clear_trx( +/*================*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + cursor->prebuilt->trx = NULL; +} + +/*****************************************************************//** +Reset the cursor. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_reset( +/*============*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (cursor->valid_trx && prebuilt->trx != NULL + && prebuilt->trx->n_mysql_tables_in_use > 0) { + + --prebuilt->trx->n_mysql_tables_in_use; + } + + /* The fields in this data structure are allocated from + the query heap and so need to be reset too. 
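The cached query graphs were built in query_heap (see ib_insert_query_graph_create() below), which is why freeing them recursively and emptying the heap go together here.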
*/ + ib_qry_proc_free(&cursor->q_proc); + + mem_heap_empty(cursor->query_heap); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +update the cursor with new transactions and also reset the cursor +@return DB_SUCCESS or err code */ +ib_err_t +ib_cursor_new_trx( +/*==============*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx) /*!< in: transaction */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + trx_t* trx = (trx_t*) ib_trx; + + row_prebuilt_t* prebuilt = cursor->prebuilt; + + row_update_prebuilt_trx(prebuilt, trx); + + cursor->valid_trx = TRUE; + + trx_assign_read_view(prebuilt->trx); + + ib_qry_proc_free(&cursor->q_proc); + + mem_heap_empty(cursor->query_heap); + + return(err); +} + +/*****************************************************************//** +Commit the transaction in a cursor +@return DB_SUCCESS or err code */ +ib_err_t +ib_cursor_commit_trx( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx) /*!< in: transaction */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; +#ifdef UNIV_DEBUG + row_prebuilt_t* prebuilt = cursor->prebuilt; + + ut_ad(prebuilt->trx == (trx_t*) ib_trx); +#endif /* UNIV_DEBUG */ + ib_trx_commit(ib_trx); + cursor->valid_trx = FALSE; + return(err); +} + +/*****************************************************************//** +Close an InnoDB table and free the cursor. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_close( +/*============*/ + ib_crsr_t ib_crsr) /*!< in,own: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt; + trx_t* trx; + + if (!cursor) { + return(DB_SUCCESS); + } + + prebuilt = cursor->prebuilt; + trx = prebuilt->trx; + + ib_qry_proc_free(&cursor->q_proc); + + /* The transaction could have been detached from the cursor. */ + if (cursor->valid_trx && trx != NULL + && trx->n_mysql_tables_in_use > 0) { + --trx->n_mysql_tables_in_use; + } + + row_prebuilt_free(prebuilt, FALSE); + cursor->prebuilt = NULL; + + mem_heap_free(cursor->query_heap); + mem_heap_free(cursor->heap); + cursor = NULL; + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Close the table, decrement n_ref_count count. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_close_table( +/*==================*/ + ib_crsr_t ib_crsr) /*!< in,own: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (prebuilt && prebuilt->table) { + dict_table_close(prebuilt->table, FALSE, FALSE); + } + + return(DB_SUCCESS); +} +/**********************************************************************//** +Run the insert query and do error handling. 
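The retry loop below reruns row_ins_step() for as long as ib_handle_errors() asks for a retry after a lock wait; any other error ends the loop and is returned to the caller.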
+@return DB_SUCCESS or error code */ +UNIV_INLINE +ib_err_t +ib_insert_row_with_lock_retry( +/*==========================*/ + que_thr_t* thr, /*!< in: insert query graph */ + ins_node_t* node, /*!< in: insert node for the query */ + trx_savept_t* savept) /*!< in: savepoint to rollback to + in case of an error */ +{ + trx_t* trx; + ib_err_t err; + ib_bool_t lock_wait; + + trx = thr_get_trx(thr); + + do { + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + thr->lock_state = QUE_THR_LOCK_ROW; + lock_wait = static_cast<ib_bool_t>( + ib_handle_errors(&err, trx, thr, savept)); + thr->lock_state = QUE_THR_LOCK_NOLOCK; + } else { + lock_wait = FALSE; + } + } while (lock_wait); + + return(err); +} + +/*****************************************************************//** +Write a row. +@return DB_SUCCESS or err code */ +static +ib_err_t +ib_execute_insert_query_graph( +/*==========================*/ + dict_table_t* table, /*!< in: table where to insert */ + que_fork_t* ins_graph, /*!< in: query graph */ + ins_node_t* node) /*!< in: insert node */ +{ + trx_t* trx; + que_thr_t* thr; + trx_savept_t savept; + ib_err_t err = DB_SUCCESS; + + trx = ins_graph->trx; + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(ins_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + err = ib_insert_row_with_lock_retry(thr, node, &savept); + + if (err == DB_SUCCESS) { + que_thr_stop_for_mysql_no_error(thr, trx); + + dict_table_n_rows_inc(table); + + srv_stats.n_rows_inserted.inc(); + } + + trx->op_info = ""; + + return(err); +} + +/*****************************************************************//** +Create an insert query graph node. */ +static +void +ib_insert_query_graph_create( +/*==========================*/ + ib_cursor_t* cursor) /*!< in: Cursor instance */ +{ + ib_qry_proc_t* q_proc = &cursor->q_proc; + ib_qry_node_t* node = &q_proc->node; + trx_t* trx = cursor->prebuilt->trx; + + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + if (node->ins == NULL) { + dtuple_t* row; + ib_qry_grph_t* grph = &q_proc->grph; + mem_heap_t* heap = cursor->query_heap; + dict_table_t* table = cursor->prebuilt->table; + + node->ins = ins_node_create(INS_DIRECT, table, heap); + + node->ins->select = NULL; + node->ins->values_list = NULL; + + row = dtuple_create(heap, dict_table_get_n_cols(table)); + dict_table_copy_types(row, table); + + ins_node_set_new_row(node->ins, row); + + grph->ins = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec(node->ins, trx, + heap))); + + grph->ins->state = QUE_FORK_ACTIVE; + } +} + +/*****************************************************************//** +Insert a row to a table. 
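A minimal caller-side sketch (hedged: the tuple helpers named here, ib_clust_read_tuple_create(), ib_tuple_write_u32() and ib_tuple_delete(), are defined later in this file, and the column layout is hypothetical):

ib_tpl_t tpl = ib_clust_read_tuple_create(crsr);
// ... fill the user columns of tpl, e.g. with ib_tuple_write_u32() ...
ib_err_t err = ib_cursor_insert_row(crsr, tpl);
ib_tuple_delete(tpl);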
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_insert_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor instance */ + const ib_tpl_t ib_tpl) /*!< in: tuple to insert */ +{ + ib_ulint_t i; + ib_qry_node_t* node; + ib_qry_proc_t* q_proc; + ulint n_fields; + dtuple_t* dst_dtuple; + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + const ib_tuple_t* src_tuple = (const ib_tuple_t*) ib_tpl; + + ib_insert_query_graph_create(cursor); + + ut_ad(src_tuple->type == TPL_TYPE_ROW); + + q_proc = &cursor->q_proc; + node = &q_proc->node; + + node->ins->state = INS_NODE_ALLOC_ROW_ID; + dst_dtuple = node->ins->row; + + n_fields = dtuple_get_n_fields(src_tuple->ptr); + ut_ad(n_fields == dtuple_get_n_fields(dst_dtuple)); + + /* Do a shallow copy of the data fields and check for NULL + constraints on columns. */ + for (i = 0; i < n_fields; i++) { + ulint mtype; + dfield_t* src_field; + dfield_t* dst_field; + + src_field = dtuple_get_nth_field(src_tuple->ptr, i); + + mtype = dtype_get_mtype(dfield_get_type(src_field)); + + /* Don't touch the system columns. */ + if (mtype != DATA_SYS) { + ulint prtype; + + prtype = dtype_get_prtype(dfield_get_type(src_field)); + + if ((prtype & DATA_NOT_NULL) + && dfield_is_null(src_field)) { + + err = DB_DATA_MISMATCH; + break; + } + + dst_field = dtuple_get_nth_field(dst_dtuple, i); + ut_ad(mtype + == dtype_get_mtype(dfield_get_type(dst_field))); + + /* Do a shallow copy. */ + dfield_set_data( + dst_field, src_field->data, src_field->len); + + if (dst_field->len != IB_SQL_NULL) { + UNIV_MEM_ASSERT_RW(dst_field->data, + dst_field->len); + } + } + } + + if (err == DB_SUCCESS) { + err = ib_execute_insert_query_graph( + src_tuple->index->table, q_proc->grph.ins, node->ins); + } + + ib_wake_master_thread(); + + return(err); +} + +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. +@return update vector */ +UNIV_INLINE +upd_t* +ib_update_vector_create( +/*====================*/ + ib_cursor_t* cursor) /*!< in: current cursor */ +{ + trx_t* trx = cursor->prebuilt->trx; + mem_heap_t* heap = cursor->query_heap; + dict_table_t* table = cursor->prebuilt->table; + ib_qry_proc_t* q_proc = &cursor->q_proc; + ib_qry_grph_t* grph = &q_proc->grph; + ib_qry_node_t* node = &q_proc->node; + + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + if (node->upd == NULL) { + node->upd = static_cast<upd_node_t*>( + row_create_update_node_for_mysql(table, heap)); + } + + grph->upd = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec(node->upd, trx, heap))); + + grph->upd->state = QUE_FORK_ACTIVE; + + return(node->upd->update); +} + +/**********************************************************************//** +Note that a column has changed. 
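The helper below copies the changed value into the update vector entry; note that field_no is resolved with dict_col_get_clust_pos(), i.e. it is the column's position within the clustered index record, not its table position.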
*/ +static +void +ib_update_col( +/*==========*/ + + ib_cursor_t* cursor, /*!< in: current cursor */ + upd_field_t* upd_field, /*!< in/out: update field */ + ulint col_no, /*!< in: column number */ + dfield_t* dfield) /*!< in: updated dfield */ +{ + ulint data_len; + dict_table_t* table = cursor->prebuilt->table; + dict_index_t* index = dict_table_get_first_index(table); + + data_len = dfield_get_len(dfield); + + if (data_len == UNIV_SQL_NULL) { + dfield_set_null(&upd_field->new_val); + } else { + dfield_copy_data(&upd_field->new_val, dfield); + } + + upd_field->exp = NULL; + + upd_field->orig_len = 0; + + upd_field->field_no = dict_col_get_clust_pos( + &table->cols[col_no], index); +} + +/**********************************************************************//** +Checks which fields have changed in a row and stores the new data +to an update vector. +@return DB_SUCCESS or err code */ +static +ib_err_t +ib_calc_diff( +/*=========*/ + ib_cursor_t* cursor, /*!< in: current cursor */ + upd_t* upd, /*!< in/out: update vector */ + const ib_tuple_t*old_tuple, /*!< in: Old tuple in table */ + const ib_tuple_t*new_tuple) /*!< in: New tuple to update */ +{ + ulint i; + ulint n_changed = 0; + ib_err_t err = DB_SUCCESS; + ulint n_fields = dtuple_get_n_fields(new_tuple->ptr); + + ut_a(old_tuple->type == TPL_TYPE_ROW); + ut_a(new_tuple->type == TPL_TYPE_ROW); + ut_a(old_tuple->index->table == new_tuple->index->table); + + for (i = 0; i < n_fields; ++i) { + ulint mtype; + ulint prtype; + upd_field_t* upd_field; + dfield_t* new_dfield; + dfield_t* old_dfield; + + new_dfield = dtuple_get_nth_field(new_tuple->ptr, i); + old_dfield = dtuple_get_nth_field(old_tuple->ptr, i); + + mtype = dtype_get_mtype(dfield_get_type(old_dfield)); + prtype = dtype_get_prtype(dfield_get_type(old_dfield)); + + /* Skip the system columns */ + if (mtype == DATA_SYS) { + continue; + + } else if ((prtype & DATA_NOT_NULL) + && dfield_is_null(new_dfield)) { + + err = DB_DATA_MISMATCH; + break; + } + + if (dfield_get_len(new_dfield) != dfield_get_len(old_dfield) + || (!dfield_is_null(old_dfield) + && memcmp(dfield_get_data(new_dfield), + dfield_get_data(old_dfield), + dfield_get_len(old_dfield)) != 0)) { + + upd_field = &upd->fields[n_changed]; + + ib_update_col(cursor, upd_field, i, new_dfield); + + ++n_changed; + } + } + + if (err == DB_SUCCESS) { + upd->info_bits = 0; + upd->n_fields = n_changed; + } + + return(err); +} + +/**********************************************************************//** +Run the update query and do error handling. +@return DB_SUCCESS or error code */ +UNIV_INLINE +ib_err_t +ib_update_row_with_lock_retry( +/*==========================*/ + que_thr_t* thr, /*!< in: Update query graph */ + upd_node_t* node, /*!< in: Update node for the query */ + trx_savept_t* savept) /*!< in: savepoint to rollback to + in case of an error */ + +{ + trx_t* trx; + ib_err_t err; + ib_bool_t lock_wait; + + trx = thr_get_trx(thr); + + do { + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + if (err != DB_RECORD_NOT_FOUND) { + thr->lock_state = QUE_THR_LOCK_ROW; + + lock_wait = static_cast<ib_bool_t>( + ib_handle_errors(&err, trx, thr, savept)); + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + } else { + lock_wait = FALSE; + } + } else { + lock_wait = FALSE; + } + } while (lock_wait); + + return(err); +} + +/*********************************************************************//** +Does an update or delete of a row. 
+@return DB_SUCCESS or err code */ +UNIV_INLINE +ib_err_t +ib_execute_update_query_graph( +/*==========================*/ + ib_cursor_t* cursor, /*!< in: Cursor instance */ + btr_pcur_t* pcur) /*!< in: Btree persistent cursor */ +{ + ib_err_t err; + que_thr_t* thr; + upd_node_t* node; + trx_savept_t savept; + trx_t* trx = cursor->prebuilt->trx; + dict_table_t* table = cursor->prebuilt->table; + ib_qry_proc_t* q_proc = &cursor->q_proc; + + /* The transaction must be running. */ + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + node = q_proc->node.upd; + + ut_a(dict_index_is_clust(pcur->btr_cur.index)); + btr_pcur_copy_stored_position(node->pcur, pcur); + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(q_proc->grph.upd); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + que_thr_move_to_run_state_for_mysql(thr, trx); + + err = ib_update_row_with_lock_retry(thr, node, &savept); + + if (err == DB_SUCCESS) { + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (node->is_delete) { + + dict_table_n_rows_dec(table); + + srv_stats.n_rows_deleted.inc(); + } else { + srv_stats.n_rows_updated.inc(); + } + + } else if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; + + return(err); +} + +/*****************************************************************//** +Update a row in a table. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_update_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + const ib_tpl_t ib_old_tpl, /*!< in: Old tuple in table */ + const ib_tpl_t ib_new_tpl) /*!< in: New tuple to update */ +{ + upd_t* upd; + ib_err_t err; + btr_pcur_t* pcur; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + const ib_tuple_t*old_tuple = (const ib_tuple_t*) ib_old_tpl; + const ib_tuple_t*new_tuple = (const ib_tuple_t*) ib_new_tpl; + + if (dict_index_is_clust(prebuilt->index)) { + pcur = &cursor->prebuilt->pcur; + } else if (prebuilt->need_to_access_clustered) { + pcur = &cursor->prebuilt->clust_pcur; + } else { + return(DB_ERROR); + } + + ut_a(old_tuple->type == TPL_TYPE_ROW); + ut_a(new_tuple->type == TPL_TYPE_ROW); + + upd = ib_update_vector_create(cursor); + + err = ib_calc_diff(cursor, upd, old_tuple, new_tuple); + + if (err == DB_SUCCESS) { + /* Note that this is not a delete. */ + cursor->q_proc.node.upd->is_delete = FALSE; + + err = ib_execute_update_query_graph(cursor, pcur); + } + + ib_wake_master_thread(); + + return(err); +} + +/**********************************************************************//** +Build the update query graph to delete a row from an index. 
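+(InnoDB expresses the delete as an update node with is_delete == TRUE;
+the public entry point ib_cursor_delete_row() below restores the
+persistent cursor position and then calls this helper.)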
+@return DB_SUCCESS or err code */
+static
+ib_err_t
+ib_delete_row(
+/*==========*/
+ ib_cursor_t* cursor, /*!< in: current cursor */
+ btr_pcur_t* pcur, /*!< in: Btree persistent cursor */
+ const rec_t* rec) /*!< in: record to delete */
+{
+ ulint i;
+ upd_t* upd;
+ ib_err_t err;
+ ib_tuple_t* tuple;
+ ib_tpl_t ib_tpl;
+ ulint n_cols;
+ upd_field_t* upd_field;
+ ib_bool_t page_format;
+ dict_table_t* table = cursor->prebuilt->table;
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ n_cols = dict_index_get_n_ordering_defined_by_user(index);
+ ib_tpl = ib_key_tuple_new(index, n_cols);
+
+ if (!ib_tpl) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ tuple = (ib_tuple_t*) ib_tpl;
+
+ upd = ib_update_vector_create(cursor);
+
+ page_format = static_cast<ib_bool_t>(
+ dict_table_is_comp(index->table));
+ ib_read_tuple(rec, page_format, tuple, NULL, NULL);
+
+ upd->n_fields = ib_tuple_get_n_cols(ib_tpl);
+
+ for (i = 0; i < upd->n_fields; ++i) {
+ dfield_t* dfield;
+
+ upd_field = &upd->fields[i];
+ dfield = dtuple_get_nth_field(tuple->ptr, i);
+
+ dfield_copy_data(&upd_field->new_val, dfield);
+
+ upd_field->exp = NULL;
+
+ upd_field->orig_len = 0;
+
+ upd->info_bits = 0;
+
+ upd_field->field_no = dict_col_get_clust_pos(
+ &table->cols[i], index);
+ }
+
+ /* Note that this is a delete. */
+ cursor->q_proc.node.upd->is_delete = TRUE;
+
+ err = ib_execute_update_query_graph(cursor, pcur);
+
+ ib_tuple_delete(ib_tpl);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Delete a row in a table.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_delete_row(
+/*=================*/
+ ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */
+{
+ ib_err_t err;
+ btr_pcur_t* pcur;
+ dict_index_t* index;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ index = dict_table_get_first_index(prebuilt->index->table);
+
+ /* Check whether this is a secondary index cursor. */
+ if (index != prebuilt->index) {
+ if (prebuilt->need_to_access_clustered) {
+ pcur = &prebuilt->clust_pcur;
+ } else {
+ return(DB_ERROR);
+ }
+ } else {
+ pcur = &prebuilt->pcur;
+ }
+
+ if (ib_btr_cursor_is_positioned(pcur)) {
+ const rec_t* rec;
+ ib_bool_t page_format;
+ mtr_t mtr;
+ rec_t* copy = NULL;
+ byte ptr[UNIV_PAGE_SIZE_MAX];
+
+ page_format = static_cast<ib_bool_t>(
+ dict_table_is_comp(index->table));
+
+ mtr_start(&mtr);
+
+ if (btr_pcur_restore_position(
+ BTR_SEARCH_LEAF, pcur, &mtr)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ /* Since the mtr will be committed, the rec
+ will not be protected. Make a copy of
+ the rec. */
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED, &heap);
+ ut_ad(rec_offs_size(offsets) < UNIV_PAGE_SIZE_MAX);
+ copy = rec_copy(ptr, rec, offsets);
+
+ /* Free any heap that rec_get_offsets() may
+ have allocated for the offsets. */
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ if (copy && !rec_get_deleted_flag(copy, page_format)) {
+ err = ib_delete_row(cursor, pcur, copy);
+ } else {
+ err = DB_RECORD_NOT_FOUND;
+ }
+ } else {
+ err = DB_RECORD_NOT_FOUND;
+ }
+
+ ib_wake_master_thread();
+
+ return(err);
+}
+
+/*****************************************************************//**
+Read current row. 
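+
+A minimal read sketch (illustrative only; assumes the cursor was
+positioned first, e.g. with ib_cursor_first(), and that NULL row_buf
+and row_len are acceptable when the caller does not reuse a buffer,
+matching the internal ib_read_tuple() calls in this file):
+
+    ib_tpl_t tpl = ib_clust_read_tuple_create(crsr);
+    ib_err_t err = ib_cursor_read_row(crsr, tpl, NULL, NULL);
+
+    if (err == DB_SUCCESS) {
+        const void* col0 = ib_col_get_value(tpl, 0);
+    }
+    ib_tuple_delete(tpl);
+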
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_read_row( +/*===============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl, /*!< out: read cols into this tuple */ + void** row_buf, /*!< in/out: row buffer */ + ib_ulint_t* row_len) /*!< in/out: row buffer len */ +{ + ib_err_t err; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + ut_a(cursor->prebuilt->trx->state != TRX_STATE_NOT_STARTED); + + /* When searching with IB_EXACT_MATCH set, row_search_for_mysql() + will not position the persistent cursor but will copy the record + found into the row cache. It should be the only entry. */ + if (!ib_cursor_is_positioned(ib_crsr) ) { + err = DB_RECORD_NOT_FOUND; + } else { + mtr_t mtr; + btr_pcur_t* pcur; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (prebuilt->need_to_access_clustered + && tuple->type == TPL_TYPE_ROW) { + pcur = &prebuilt->clust_pcur; + } else { + pcur = &prebuilt->pcur; + } + + if (pcur == NULL) { + return(DB_ERROR); + } + + mtr_start(&mtr); + + if (btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr)) { + const rec_t* rec; + ib_bool_t page_format; + + page_format = static_cast<ib_bool_t>( + dict_table_is_comp(tuple->index->table)); + rec = btr_pcur_get_rec(pcur); + + if (prebuilt->innodb_api_rec && + prebuilt->innodb_api_rec != rec) { + rec = prebuilt->innodb_api_rec; + } + + if (!rec_get_deleted_flag(rec, page_format)) { + ib_read_tuple(rec, page_format, tuple, + row_buf, (ulint*) row_len); + err = DB_SUCCESS; + } else{ + err = DB_RECORD_NOT_FOUND; + } + + } else { + err = DB_RECORD_NOT_FOUND; + } + + mtr_commit(&mtr); + } + + return(err); +} + +/*****************************************************************//** +Move cursor to the first record in the table. +@return DB_SUCCESS or err code */ +UNIV_INLINE +ib_err_t +ib_cursor_position( +/*===============*/ + ib_cursor_t* cursor, /*!< in: InnoDB cursor instance */ + ib_srch_mode_t mode) /*!< in: Search mode */ +{ + ib_err_t err; + row_prebuilt_t* prebuilt = cursor->prebuilt; + unsigned char* buf; + + buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE)); + + /* We want to position at one of the ends, row_search_for_mysql() + uses the search_tuple fields to work out what to do. */ + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + err = static_cast<ib_err_t>(row_search_for_mysql( + buf, mode, prebuilt, 0, 0)); + + mem_free(buf); + + return(err); +} + +/*****************************************************************//** +Move cursor to the first record in the table. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_first( +/*============*/ + ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + return(ib_cursor_position(cursor, IB_CUR_G)); +} + +/*****************************************************************//** +Move cursor to the last record in the table. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_last( +/*===========*/ + ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + return(ib_cursor_position(cursor, IB_CUR_L)); +} + +/*****************************************************************//** +Move cursor to the next user record in the table. 
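+
+Illustrative full-scan loop (assumption: once the cursor moves past the
+last user record these calls return a non-DB_SUCCESS code such as
+DB_END_OF_INDEX, which ends the loop):
+
+    ib_err_t err;
+
+    for (err = ib_cursor_first(crsr);
+         err == DB_SUCCESS;
+         err = ib_cursor_next(crsr)) {
+
+        if (ib_cursor_read_row(crsr, tpl, NULL, NULL) != DB_SUCCESS) {
+            break;
+        }
+    }
+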
+@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_next( +/*===========*/ + ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */ +{ + ib_err_t err; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + byte buf[UNIV_PAGE_SIZE_MAX]; + + /* We want to move to the next record */ + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + err = static_cast<ib_err_t>(row_search_for_mysql( + buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT)); + + return(err); +} + +/*****************************************************************//** +Search for key. +@return DB_SUCCESS or err code */ +UNIV_INTERN +ib_err_t +ib_cursor_moveto( +/*=============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl, /*!< in: Key to search for */ + ib_srch_mode_t ib_srch_mode) /*!< in: search mode */ +{ + ulint i; + ulint n_fields; + ib_err_t err = DB_SUCCESS; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + dtuple_t* search_tuple = prebuilt->search_tuple; + unsigned char* buf; + + ut_a(tuple->type == TPL_TYPE_KEY); + + n_fields = dict_index_get_n_ordering_defined_by_user(prebuilt->index); + + dtuple_set_n_fields(search_tuple, n_fields); + dtuple_set_n_fields_cmp(search_tuple, n_fields); + + /* Do a shallow copy */ + for (i = 0; i < n_fields; ++i) { + dfield_copy(dtuple_get_nth_field(search_tuple, i), + dtuple_get_nth_field(tuple->ptr, i)); + } + + ut_a(prebuilt->select_lock_type <= LOCK_NUM); + + prebuilt->innodb_api_rec = NULL; + + buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE)); + + err = static_cast<ib_err_t>(row_search_for_mysql( + buf, ib_srch_mode, prebuilt, cursor->match_mode, 0)); + + mem_free(buf); + + return(err); +} + +/*****************************************************************//** +Set the cursor search mode. */ +UNIV_INTERN +void +ib_cursor_set_match_mode( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: Cursor instance */ + ib_match_mode_t match_mode) /*!< in: ib_cursor_moveto match mode */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + + cursor->match_mode = match_mode; +} + +/*****************************************************************//** +Get the dfield instance for the column in the tuple. +@return dfield instance in tuple */ +UNIV_INLINE +dfield_t* +ib_col_get_dfield( +/*==============*/ + ib_tuple_t* tuple, /*!< in: tuple instance */ + ulint col_no) /*!< in: col no. in tuple */ +{ + dfield_t* dfield; + + dfield = dtuple_get_nth_field(tuple->ptr, col_no); + + return(dfield); +} + +/*****************************************************************//** +Predicate to check whether a column type contains variable length data. +@return DB_SUCCESS or error code */ +UNIV_INLINE +ib_err_t +ib_col_is_capped( +/*==============*/ + const dtype_t* dtype) /*!< in: column type */ +{ + return(static_cast<ib_err_t>( + (dtype_get_mtype(dtype) == DATA_VARCHAR + || dtype_get_mtype(dtype) == DATA_CHAR + || dtype_get_mtype(dtype) == DATA_MYSQL + || dtype_get_mtype(dtype) == DATA_VARMYSQL + || dtype_get_mtype(dtype) == DATA_FIXBINARY + || dtype_get_mtype(dtype) == DATA_BINARY) + && dtype_get_len(dtype) > 0)); +} + +/*****************************************************************//** +Set a column of the tuple. Make a copy using the tuple's heap. 
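+
+Illustrative call (assumptions: "tpl" is a row tuple and column 1 is a
+VARCHAR user column; passing need_cpy == true copies the bytes into the
+tuple heap, so the caller's buffer need not outlive the call):
+
+    const char* name = "example";
+
+    err = ib_col_set_value(tpl, 1, name,
+                           (ib_ulint_t) strlen(name), true);
+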
+@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_col_set_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t col_no, /*!< in: column index in tuple */ + const void* src, /*!< in: data value */ + ib_ulint_t len, /*!< in: data value len */ + ib_bool_t need_cpy) /*!< in: if need memcpy */ +{ + const dtype_t* dtype; + dfield_t* dfield; + void* dst = NULL; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + ulint col_len; + + dfield = ib_col_get_dfield(tuple, col_no); + + /* User wants to set the column to NULL. */ + if (len == IB_SQL_NULL) { + dfield_set_null(dfield); + return(DB_SUCCESS); + } + + dtype = dfield_get_type(dfield); + col_len = dtype_get_len(dtype); + + /* Not allowed to update system columns. */ + if (dtype_get_mtype(dtype) == DATA_SYS) { + return(DB_DATA_MISMATCH); + } + + dst = dfield_get_data(dfield); + + /* Since TEXT/CLOB also map to DATA_VARCHAR we need to make an + exception. Perhaps we need to set the precise type and check + for that. */ + if (ib_col_is_capped(dtype)) { + + len = ut_min(len, static_cast<ib_ulint_t>(col_len)); + + if (dst == NULL || len > dfield_get_len(dfield)) { + dst = mem_heap_alloc(tuple->heap, col_len); + ut_a(dst != NULL); + } + } else if (dst == NULL || len > dfield_get_len(dfield)) { + dst = mem_heap_alloc(tuple->heap, len); + } + + if (dst == NULL) { + return(DB_OUT_OF_MEMORY); + } + + switch (dtype_get_mtype(dtype)) { + case DATA_INT: { + + if (col_len == len) { + ibool usign; + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + mach_write_int_type(static_cast<byte*>(dst), + static_cast<const byte*>(src), + len, usign); + + } else { + return(DB_DATA_MISMATCH); + } + break; + } + + case DATA_FLOAT: + if (len == sizeof(float)) { + mach_float_write(static_cast<byte*>(dst), *(float*)src); + } else { + return(DB_DATA_MISMATCH); + } + break; + + case DATA_DOUBLE: + if (len == sizeof(double)) { + mach_double_write(static_cast<byte*>(dst), + *(double*)src); + } else { + return(DB_DATA_MISMATCH); + } + break; + + case DATA_SYS: + ut_error; + break; + + case DATA_CHAR: { + ulint pad_char = ULINT_UNDEFINED; + + pad_char = dtype_get_pad_char( + dtype_get_mtype(dtype), dtype_get_prtype(dtype)); + + ut_a(pad_char != ULINT_UNDEFINED); + + memset((byte*) dst + len, + static_cast<int>(pad_char), + static_cast<size_t>(col_len - len)); + + memcpy(dst, src, len); + + len = static_cast<ib_ulint_t>(col_len); + break; + } + case DATA_BLOB: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARCHAR: + case DATA_FIXBINARY: + if (need_cpy) { + memcpy(dst, src, len); + } else { + dfield_set_data(dfield, src, len); + dst = dfield_get_data(dfield); + } + break; + + case DATA_MYSQL: + case DATA_VARMYSQL: { + ulint cset; + CHARSET_INFO* cs; + int error = 0; + ulint true_len = len; + + /* For multi byte character sets we need to + calculate the true length of the data. */ + cset = dtype_get_charset_coll( + dtype_get_prtype(dtype)); + cs = all_charsets[cset]; + if (cs) { + uint pos = (uint)(col_len / cs->mbmaxlen); + + if (len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) + cs->cset->well_formed_len( + cs, + (const char*)src, + (const char*)src + len, + pos, + &error); + + if (true_len < len) { + len = static_cast<ib_ulint_t>(true_len); + } + } + } + + /* All invalid bytes in data need be truncated. + If len == 0, means all bytes of the data is invalid. + In this case, the data will be truncated to empty.*/ + memcpy(dst, src, len); + + /* For DATA_MYSQL, need to pad the unused + space with spaces. 
*/ + if (dtype_get_mtype(dtype) == DATA_MYSQL) { + ulint n_chars; + + if (len < col_len) { + ulint pad_len = col_len - len; + + ut_a(cs != NULL); + ut_a(!(pad_len % cs->mbminlen)); + + cs->cset->fill(cs, (char*)dst + len, + pad_len, + 0x20 /* space */); + } + + /* Why we should do below? See function + row_mysql_store_col_in_innobase_format */ + + ut_a(!(dtype_get_len(dtype) + % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) + / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars + && ((char*)dst)[col_len - 1] == 0x20) { + col_len--; + } + + len = static_cast<ib_ulint_t>(col_len); + } + break; + } + + default: + ut_error; + } + + if (dst != dfield_get_data(dfield)) { + dfield_set_data(dfield, dst, len); + } else { + dfield_set_len(dfield, len); + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Get the size of the data available in a column of the tuple. +@return bytes avail or IB_SQL_NULL */ +UNIV_INTERN +ib_ulint_t +ib_col_get_len( +/*===========*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data_len = dfield_get_len(dfield); + + return(static_cast<ib_ulint_t>( + data_len == UNIV_SQL_NULL ? IB_SQL_NULL : data_len)); +} + +/*****************************************************************//** +Copy a column value from the tuple. +@return bytes copied or IB_SQL_NULL */ +UNIV_INLINE +ib_ulint_t +ib_col_copy_value_low( +/*==================*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + void* dst, /*!< out: copied data value */ + ib_ulint_t len) /*!< in: max data value len to copy */ +{ + const void* data; + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data = dfield_get_data(dfield); + data_len = dfield_get_len(dfield); + + if (data_len != UNIV_SQL_NULL) { + + const dtype_t* dtype = dfield_get_type(dfield); + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + case DATA_INT: { + ibool usign; + ullint ret; + + ut_a(data_len == len); + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + ret = mach_read_int_type(static_cast<const byte*>(data), + data_len, usign); + + if (usign) { + if (len == 1) { + *(ib_i8_t*)dst = (ib_i8_t)ret; + } else if (len == 2) { + *(ib_i16_t*)dst = (ib_i16_t)ret; + } else if (len == 4) { + *(ib_i32_t*)dst = (ib_i32_t)ret; + } else { + *(ib_i64_t*)dst = (ib_i64_t)ret; + } + } else { + if (len == 1) { + *(ib_u8_t*)dst = (ib_i8_t)ret; + } else if (len == 2) { + *(ib_u16_t*)dst = (ib_i16_t)ret; + } else if (len == 4) { + *(ib_u32_t*)dst = (ib_i32_t)ret; + } else { + *(ib_u64_t*)dst = (ib_i64_t)ret; + } + } + + break; + } + case DATA_FLOAT: + if (len == data_len) { + float f; + + ut_a(data_len == sizeof(f)); + f = mach_float_read(static_cast<const byte*>( + data)); + memcpy(dst, &f, sizeof(f)); + } else { + data_len = 0; + } + break; + case DATA_DOUBLE: + if (len == data_len) { + double d; + + ut_a(data_len == sizeof(d)); + d = mach_double_read(static_cast<const byte*>( + data)); + memcpy(dst, &d, sizeof(d)); + } else { + data_len = 0; + } + break; + default: + data_len = ut_min(data_len, len); + memcpy(dst, data, data_len); + } + } else { + data_len = IB_SQL_NULL; + } + + return(static_cast<ib_ulint_t>(data_len)); +} + 
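+/* An illustrative NULL-safe copy pattern (sketch only; "tpl" and "i"
+come from the caller's context): check ib_col_get_len() before copying,
+since both it and ib_col_copy_value() report IB_SQL_NULL for NULL
+columns:
+
+    ib_u32_t val;
+
+    if (ib_col_get_len(tpl, i) != IB_SQL_NULL) {
+        ib_col_copy_value(tpl, i, &val, sizeof(val));
+    }
+*/
+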
+/*****************************************************************//** +Copy a column value from the tuple. +@return bytes copied or IB_SQL_NULL */ +UNIV_INTERN +ib_ulint_t +ib_col_copy_value( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + void* dst, /*!< out: copied data value */ + ib_ulint_t len) /*!< in: max data value len to copy */ +{ + return(ib_col_copy_value_low(ib_tpl, i, dst, len)); +} + +/*****************************************************************//** +Get the InnoDB column attribute from the internal column precise type. +@return precise type in api format */ +UNIV_INLINE +ib_col_attr_t +ib_col_get_attr( +/*============*/ + ulint prtype) /*!< in: column definition */ +{ + ib_col_attr_t attr = IB_COL_NONE; + + if (prtype & DATA_UNSIGNED) { + attr = static_cast<ib_col_attr_t>(attr | IB_COL_UNSIGNED); + } + + if (prtype & DATA_NOT_NULL) { + attr = static_cast<ib_col_attr_t>(attr | IB_COL_NOT_NULL); + } + + return(attr); +} + +/*****************************************************************//** +Get a column name from the tuple. +@return name of the column */ +UNIV_INTERN +const char* +ib_col_get_name( +/*============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + const char* name; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_table_t* table = cursor->prebuilt->table; + dict_col_t* col = dict_table_get_nth_col(table, i); + ulint col_no = dict_col_get_no(col); + + name = dict_table_get_col_name(table, col_no); + + return(name); +} + +/*****************************************************************//** +Get an index field name from the cursor. +@return name of the field */ +UNIV_INTERN +const char* +ib_get_idx_field_name( +/*==================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index = cursor->prebuilt->index; + dict_field_t* field; + + if (index) { + field = dict_index_get_nth_field(cursor->prebuilt->index, i); + + if (field) { + return(field->name); + } + } + + return(NULL); +} + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ +UNIV_INLINE +ib_ulint_t +ib_col_get_meta_low( +/*================*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + ib_col_meta_t* ib_col_meta) /*!< out: column meta data */ +{ + ib_u16_t prtype; + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data_len = dfield_get_len(dfield); + + /* We assume 1-1 mapping between the ENUM and internal type codes. */ + ib_col_meta->type = static_cast<ib_col_type_t>( + dtype_get_mtype(dfield_get_type(dfield))); + + ib_col_meta->type_len = static_cast<ib_u32_t>( + dtype_get_len(dfield_get_type(dfield))); + + prtype = (ib_u16_t) dtype_get_prtype(dfield_get_type(dfield)); + + ib_col_meta->attr = ib_col_get_attr(prtype); + ib_col_meta->client_type = prtype & DATA_MYSQL_TYPE_MASK; + + return(static_cast<ib_ulint_t>(data_len)); +} + +/*************************************************************//** +Read a signed int 8 bit column from an InnoDB tuple. 
*/ +UNIV_INLINE +ib_err_t +ib_tuple_check_int( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_bool_t usign, /*!< in: true if unsigned */ + ulint size) /*!< in: size of integer */ +{ + ib_col_meta_t ib_col_meta; + + ib_col_get_meta_low(ib_tpl, i, &ib_col_meta); + + if (ib_col_meta.type != IB_INT) { + return(DB_DATA_MISMATCH); + } else if (ib_col_meta.type_len == IB_SQL_NULL) { + return(DB_UNDERFLOW); + } else if (ib_col_meta.type_len != size) { + return(DB_DATA_MISMATCH); + } else if ((ib_col_meta.attr & IB_COL_UNSIGNED) && !usign) { + return(DB_DATA_MISMATCH); + } + + return(DB_SUCCESS); +} + +/*************************************************************//** +Read a signed int 8 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i8_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 8 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u8_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read a signed int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i16_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u16_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read a signed int 32 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i32_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 32 bit column from an InnoDB tuple. 
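+
+Illustrative call (assumption: the column was declared as an unsigned
+4-byte INT; any type, size or signedness mismatch makes the reader
+return DB_DATA_MISMATCH instead of converting):
+
+    ib_u32_t v = 0;
+
+    err = ib_tuple_read_u32(tpl, i, &v);
+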
+@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u32_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read a signed int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_i64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i64_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, FALSE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*************************************************************//** +Read an unsigned int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ +UNIV_INTERN +ib_err_t +ib_tuple_read_u64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u64_t* ival) /*!< out: integer value */ +{ + ib_err_t err; + + err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival)); + + if (err == DB_SUCCESS) { + ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival)); + } + + return(err); +} + +/*****************************************************************//** +Get a column value pointer from the tuple. +@return NULL or pointer to buffer */ +UNIV_INTERN +const void* +ib_col_get_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i) /*!< in: column index in tuple */ +{ + const void* data; + const dfield_t* dfield; + ulint data_len; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + dfield = ib_col_get_dfield(tuple, i); + + data = dfield_get_data(dfield); + data_len = dfield_get_len(dfield); + + return(data_len != UNIV_SQL_NULL ? data : NULL); +} + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ +UNIV_INTERN +ib_ulint_t +ib_col_get_meta( +/*============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + ib_col_meta_t* ib_col_meta) /*!< out: column meta data */ +{ + return(ib_col_get_meta_low(ib_tpl, i, ib_col_meta)); +} + +/*****************************************************************//** +"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple. +@return new tuple, or NULL */ +UNIV_INTERN +ib_tpl_t +ib_tuple_clear( +/*============*/ + ib_tpl_t ib_tpl) /*!< in,own: tuple (will be freed) */ +{ + const dict_index_t* index; + ulint n_cols; + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + ib_tuple_type_t type = tuple->type; + mem_heap_t* heap = tuple->heap; + + index = tuple->index; + n_cols = dtuple_get_n_fields(tuple->ptr); + + mem_heap_empty(heap); + + if (type == TPL_TYPE_ROW) { + return(ib_row_tuple_new_low(index, n_cols, heap)); + } else { + return(ib_key_tuple_new_low(index, n_cols, heap)); + } +} + +/*****************************************************************//** +Create a new cluster key search tuple and copy the contents of the +secondary index key tuple columns that refer to the cluster index record +to the cluster key. It does a deep copy of the column data. 
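+
+Illustrative flow (assumptions: "sec_crsr" is a cursor on a secondary
+index already positioned with ib_cursor_moveto(), "clust_crsr" is a
+cursor on the same table's clustered index, and IB_CUR_GE is the
+greater-or-equal member of ib_srch_mode_t):
+
+    ib_tpl_t clust_key;
+
+    err = ib_tuple_get_cluster_key(sec_crsr, &clust_key, sec_key);
+
+    if (err == DB_SUCCESS) {
+        err = ib_cursor_moveto(clust_crsr, clust_key, IB_CUR_GE);
+        ib_tuple_delete(clust_key);
+    }
+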
+@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_tuple_get_cluster_key( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: secondary index cursor */ + ib_tpl_t* ib_dst_tpl, /*!< out,own: destination tuple */ + const ib_tpl_t ib_src_tpl) /*!< in: source tuple */ +{ + ulint i; + ulint n_fields; + ib_err_t err = DB_SUCCESS; + ib_tuple_t* dst_tuple = NULL; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + ib_tuple_t* src_tuple = (ib_tuple_t*) ib_src_tpl; + dict_index_t* clust_index; + + clust_index = dict_table_get_first_index(cursor->prebuilt->table); + + /* We need to ensure that the src tuple belongs to the same table + as the open cursor and that it's not a tuple for a cluster index. */ + if (src_tuple->type != TPL_TYPE_KEY) { + return(DB_ERROR); + } else if (src_tuple->index->table != cursor->prebuilt->table) { + return(DB_DATA_MISMATCH); + } else if (src_tuple->index == clust_index) { + return(DB_ERROR); + } + + /* Create the cluster index key search tuple. */ + *ib_dst_tpl = ib_clust_search_tuple_create(ib_crsr); + + if (!*ib_dst_tpl) { + return(DB_OUT_OF_MEMORY); + } + + dst_tuple = (ib_tuple_t*) *ib_dst_tpl; + ut_a(dst_tuple->index == clust_index); + + n_fields = dict_index_get_n_unique(dst_tuple->index); + + /* Do a deep copy of the data fields. */ + for (i = 0; i < n_fields; i++) { + ulint pos; + dfield_t* src_field; + dfield_t* dst_field; + + pos = dict_index_get_nth_field_pos( + src_tuple->index, dst_tuple->index, i); + + ut_a(pos != ULINT_UNDEFINED); + + src_field = dtuple_get_nth_field(src_tuple->ptr, pos); + dst_field = dtuple_get_nth_field(dst_tuple->ptr, i); + + if (!dfield_is_null(src_field)) { + UNIV_MEM_ASSERT_RW(src_field->data, src_field->len); + + dst_field->data = mem_heap_dup( + dst_tuple->heap, + src_field->data, + src_field->len); + + dst_field->len = src_field->len; + } else { + dfield_set_null(dst_field); + } + } + + return(err); +} + +/*****************************************************************//** +Copy the contents of source tuple to destination tuple. The tuples +must be of the same type and belong to the same table/index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_tuple_copy( +/*==========*/ + ib_tpl_t ib_dst_tpl, /*!< in: destination tuple */ + const ib_tpl_t ib_src_tpl) /*!< in: source tuple */ +{ + ulint i; + ulint n_fields; + ib_err_t err = DB_SUCCESS; + const ib_tuple_t*src_tuple = (const ib_tuple_t*) ib_src_tpl; + ib_tuple_t* dst_tuple = (ib_tuple_t*) ib_dst_tpl; + + /* Make sure src and dst are not the same. */ + ut_a(src_tuple != dst_tuple); + + /* Make sure they are the same type and refer to the same index. */ + if (src_tuple->type != dst_tuple->type + || src_tuple->index != dst_tuple->index) { + + return(DB_DATA_MISMATCH); + } + + n_fields = dtuple_get_n_fields(src_tuple->ptr); + ut_ad(n_fields == dtuple_get_n_fields(dst_tuple->ptr)); + + /* Do a deep copy of the data fields. */ + for (i = 0; i < n_fields; ++i) { + dfield_t* src_field; + dfield_t* dst_field; + + src_field = dtuple_get_nth_field(src_tuple->ptr, i); + dst_field = dtuple_get_nth_field(dst_tuple->ptr, i); + + if (!dfield_is_null(src_field)) { + UNIV_MEM_ASSERT_RW(src_field->data, src_field->len); + + dst_field->data = mem_heap_dup( + dst_tuple->heap, + src_field->data, + src_field->len); + + dst_field->len = src_field->len; + } else { + dfield_set_null(dst_field); + } + } + + return(err); +} + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. 
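+
+Illustrative use (assumptions: the cursor's current index is the
+secondary index to search and its first key column is a 4-byte INT;
+IB_CUR_GE is assumed as in the sketch above):
+
+    ib_u32_t v = 42;
+    ib_tpl_t key = ib_sec_search_tuple_create(crsr);
+
+    ib_col_set_value(key, 0, &v, sizeof(v), true);
+    err = ib_cursor_moveto(crsr, key, IB_CUR_GE);
+    ib_tuple_delete(key);
+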
+@return own: Tuple for current index */ +UNIV_INTERN +ib_tpl_t +ib_sec_search_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index = cursor->prebuilt->index; + + n_cols = dict_index_get_n_unique_in_tree(index); + return(ib_key_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return own: Tuple for current index */ +UNIV_INTERN +ib_tpl_t +ib_sec_read_tuple_create( +/*=====================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index = cursor->prebuilt->index; + + n_cols = dict_index_get_n_fields(index); + return(ib_row_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Create an InnoDB tuple used for table key operations. +@return own: Tuple for current table */ +UNIV_INTERN +ib_tpl_t +ib_clust_search_tuple_create( +/*=========================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index; + + index = dict_table_get_first_index(cursor->prebuilt->table); + + n_cols = dict_index_get_n_ordering_defined_by_user(index); + return(ib_key_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Create an InnoDB tuple for table row operations. +@return own: Tuple for current table */ +UNIV_INTERN +ib_tpl_t +ib_clust_read_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr) /*!< in: Cursor instance */ +{ + ulint n_cols; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + dict_index_t* index; + + index = dict_table_get_first_index(cursor->prebuilt->table); + + n_cols = dict_table_get_n_cols(cursor->prebuilt->table); + return(ib_row_tuple_new(index, n_cols)); +} + +/*****************************************************************//** +Return the number of user columns in the tuple definition. +@return number of user columns */ +UNIV_INTERN +ib_ulint_t +ib_tuple_get_n_user_cols( +/*=====================*/ + const ib_tpl_t ib_tpl) /*!< in: Tuple for current table */ +{ + const ib_tuple_t* tuple = (const ib_tuple_t*) ib_tpl; + + if (tuple->type == TPL_TYPE_ROW) { + return(static_cast<ib_ulint_t>( + dict_table_get_n_user_cols(tuple->index->table))); + } + + return(static_cast<ib_ulint_t>( + dict_index_get_n_ordering_defined_by_user(tuple->index))); +} + +/*****************************************************************//** +Return the number of columns in the tuple definition. +@return number of columns */ +UNIV_INTERN +ib_ulint_t +ib_tuple_get_n_cols( +/*================*/ + const ib_tpl_t ib_tpl) /*!< in: Tuple for table/index */ +{ + const ib_tuple_t* tuple = (const ib_tuple_t*) ib_tpl; + + return(static_cast<ib_ulint_t>(dtuple_get_n_fields(tuple->ptr))); +} + +/*****************************************************************//** +Destroy an InnoDB tuple. */ +UNIV_INTERN +void +ib_tuple_delete( +/*============*/ + ib_tpl_t ib_tpl) /*!< in,own: Tuple instance to delete */ +{ + ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl; + + if (!ib_tpl) { + return; + } + + mem_heap_free(tuple->heap); +} + +/*****************************************************************//** +Get a table id. This function will acquire the dictionary mutex. 
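+
+Illustrative call ("test/t1" is a hypothetical name; whether the caller
+must pass the normalized "database/table" form depends on
+ib_table_get_id_low(), which is not shown here):
+
+    ib_id_u64_t id = 0;
+
+    err = ib_table_get_id("test/t1", &id);
+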
+@return DB_SUCCESS if found */
+UNIV_INTERN
+ib_err_t
+ib_table_get_id(
+/*============*/
+ const char* table_name, /*!< in: table to find */
+ ib_id_u64_t* table_id) /*!< out: table id if found */
+{
+ ib_err_t err;
+
+ dict_mutex_enter_for_mysql();
+
+ err = ib_table_get_id_low(table_name, table_id);
+
+ dict_mutex_exit_for_mysql();
+
+ return(err);
+}
+
+/*****************************************************************//**
+Get an index id.
+@return DB_SUCCESS if found */
+UNIV_INTERN
+ib_err_t
+ib_index_get_id(
+/*============*/
+ const char* table_name, /*!< in: find index for this table */
+ const char* index_name, /*!< in: index to find */
+ ib_id_u64_t* index_id) /*!< out: index id if found */
+{
+ dict_table_t* table;
+ char* normalized_name;
+ ib_err_t err = DB_TABLE_NOT_FOUND;
+
+ *index_id = 0;
+
+ normalized_name = static_cast<char*>(
+ mem_alloc(ut_strlen(table_name) + 1));
+ ib_normalize_table_name(normalized_name, table_name);
+
+ table = ib_lookup_table_by_name(normalized_name);
+
+ mem_free(normalized_name);
+ normalized_name = NULL;
+
+ if (table != NULL) {
+ dict_index_t* index;
+
+ index = dict_table_get_index_on_name(table, index_name);
+
+ if (index != NULL) {
+ /* We only support 32 bit table and index ids, because
+ we need to pack the table id into the index id. */
+
+ *index_id = (table->id);
+ *index_id <<= 32;
+ *index_id |= (index->id);
+
+ err = DB_SUCCESS;
+ }
+ }
+
+ return(err);
+}
+
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR '\\'
+#else
+#define SRV_PATH_SEPARATOR '/'
+#endif
+
+
+/*****************************************************************//**
+Check if cursor is positioned.
+@return IB_TRUE if positioned */
+UNIV_INTERN
+ib_bool_t
+ib_cursor_is_positioned(
+/*====================*/
+ const ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */
+{
+ const ib_cursor_t* cursor = (const ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ return(ib_btr_cursor_is_positioned(&prebuilt->pcur));
+}
+
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in exclusive mode.
+@return TRUE if exclusive latch */
+UNIV_INTERN
+ib_bool_t
+ib_schema_lock_is_exclusive(
+/*========================*/
+ const ib_trx_t ib_trx) /*!< in: transaction */
+{
+ const trx_t* trx = (const trx_t*) ib_trx;
+
+ return(trx->dict_operation_lock_mode == RW_X_LATCH);
+}
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in shared mode.
+@return TRUE if shared latch */
+UNIV_INTERN
+ib_bool_t
+ib_schema_lock_is_shared(
+/*=====================*/
+ const ib_trx_t ib_trx) /*!< in: transaction */
+{
+ const trx_t* trx = (const trx_t*) ib_trx;
+
+ return(trx->dict_operation_lock_mode == RW_S_LATCH);
+}
+
+/*****************************************************************//**
+Set a lock on an InnoDB cursor/table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_lock(
+/*===========*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+ trx_t* trx = prebuilt->trx;
+ dict_table_t* table = prebuilt->table;
+
+ return(ib_trx_lock_table_with_retry(
+ trx, table, (enum lock_mode) ib_lck_mode));
+}
+
+/*****************************************************************//**
+Set a lock on an InnoDB table using the table id. 
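+
+Illustrative call (assumptions: the id came from ib_table_get_id() and
+the transaction has been started; per the assertions in the body below,
+only IB_LOCK_IS and IB_LOCK_IX are accepted):
+
+    err = ib_table_lock(ib_trx, table_id, IB_LOCK_IX);
+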
+@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_table_lock( +/*==========*/ + ib_trx_t ib_trx, /*!< in/out: transaction */ + ib_id_u64_t table_id, /*!< in: table id */ + ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */ +{ + ib_err_t err; + que_thr_t* thr; + mem_heap_t* heap; + dict_table_t* table; + ib_qry_proc_t q_proc; + trx_t* trx = (trx_t*) ib_trx; + + ut_a(trx->state != TRX_STATE_NOT_STARTED); + + table = ib_open_table_by_id(table_id, FALSE); + + if (table == NULL) { + return(DB_TABLE_NOT_FOUND); + } + + ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM)); + + heap = mem_heap_create(128); + + q_proc.node.sel = sel_node_create(heap); + + thr = pars_complete_graph_for_exec(q_proc.node.sel, trx, heap); + + q_proc.grph.sel = static_cast<que_fork_t*>(que_node_get_parent(thr)); + q_proc.grph.sel->state = QUE_FORK_ACTIVE; + + trx->op_info = "setting table lock"; + + ut_a(ib_lck_mode == IB_LOCK_IS || ib_lck_mode == IB_LOCK_IX); + err = static_cast<ib_err_t>( + lock_table(0, table, (enum lock_mode) ib_lck_mode, thr)); + + trx->error_state = err; + + mem_heap_free(heap); + + return(err); +} + +/*****************************************************************//** +Unlock an InnoDB table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_cursor_unlock( +/*=============*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + if (prebuilt->trx->mysql_n_tables_locked > 0) { + --prebuilt->trx->mysql_n_tables_locked; + } else { + err = DB_ERROR; + } + + return(err); +} + +/*****************************************************************//** +Set the Lock mode of the cursor. +@return DB_SUCCESS or error code */ +UNIV_INTERN +ib_err_t +ib_cursor_set_lock_mode( +/*====================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */ +{ + ib_err_t err = DB_SUCCESS; + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM)); + + if (ib_lck_mode == IB_LOCK_X) { + err = ib_cursor_lock(ib_crsr, IB_LOCK_IX); + } else if (ib_lck_mode == IB_LOCK_S) { + err = ib_cursor_lock(ib_crsr, IB_LOCK_IS); + } + + if (err == DB_SUCCESS) { + prebuilt->select_lock_type = (enum lock_mode) ib_lck_mode; + ut_a(prebuilt->trx->state != TRX_STATE_NOT_STARTED); + } + + return(err); +} + +/*****************************************************************//** +Set need to access clustered index record. */ +UNIV_INTERN +void +ib_cursor_set_cluster_access( +/*=========================*/ + ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */ +{ + ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + + prebuilt->need_to_access_clustered = TRUE; +} + +/*************************************************************//** +Convert and write an INT column value to an InnoDB tuple. 
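+
+The typed writers below (ib_tuple_write_i8() through
+ib_tuple_write_u64()) are thin wrappers over ib_col_set_value(); an
+illustrative call:
+
+    ib_i32_t v = 42;
+
+    err = ib_tuple_write_i32(tpl, 0, v);
+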
+@return DB_SUCCESS or error */
+UNIV_INLINE
+ib_err_t
+ib_tuple_write_int(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ ulint col_no, /*!< in: column number */
+ const void* value, /*!< in: integer value */
+ ulint value_len) /*!< in: sizeof value type */
+{
+ const dfield_t* dfield;
+ ulint data_len;
+ ulint type_len;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ ut_a(col_no < ib_tuple_get_n_cols(ib_tpl));
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ data_len = dfield_get_len(dfield);
+ type_len = dtype_get_len(dfield_get_type(dfield));
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) != DATA_INT
+ || value_len != data_len) {
+
+ return(DB_DATA_MISMATCH);
+ }
+
+ return(ib_col_set_value(
+ ib_tpl, static_cast<ib_ulint_t>(col_no),
+ value, static_cast<ib_ulint_t>(type_len), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i8(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i8_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i16(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i16_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i32(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i32_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i64(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i64_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u8(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u8_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. 
Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u16(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u16_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u32(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u32_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u64(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u64_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Inform the cursor that it's the start of an SQL statement. */
+UNIV_INTERN
+void
+ib_cursor_stmt_begin(
+/*=================*/
+ ib_crsr_t ib_crsr) /*!< in: cursor */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+
+ cursor->prebuilt->sql_stat_start = TRUE;
+}
+
+/*****************************************************************//**
+Write a double value to a column.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_double(
+/*==================*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ double val) /*!< in: value to write */
+{
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_DOUBLE) {
+ return(ib_col_set_value(ib_tpl, col_no,
+ &val, sizeof(val), true));
+ } else {
+ return(DB_DATA_MISMATCH);
+ }
+}
+
+/*************************************************************//**
+Read a double column value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_double(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t col_no, /*!< in: column number */
+ double* dval) /*!< out: double value */
+{
+ ib_err_t err;
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_DOUBLE) {
+ ib_col_copy_value_low(ib_tpl, col_no, dval, sizeof(*dval));
+ err = DB_SUCCESS;
+ } else {
+ err = DB_DATA_MISMATCH;
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Write a float value to a column. 
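+
+Illustrative round trip (assumption: column 2 is a FLOAT column; a type
+mismatch yields DB_DATA_MISMATCH):
+
+    float in = 1.5f;
+    float out = 0.0f;
+
+    if (ib_tuple_write_float(tpl, 2, in) == DB_SUCCESS) {
+        err = ib_tuple_read_float(tpl, 2, &out);
+    }
+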
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_float(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ float val) /*!< in: value to write */
+{
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_FLOAT) {
+ return(ib_col_set_value(ib_tpl, col_no,
+ &val, sizeof(val), true));
+ } else {
+ return(DB_DATA_MISMATCH);
+ }
+}
+
+/*************************************************************//**
+Read a float value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_float(
+/*================*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t col_no, /*!< in: column number */
+ float* fval) /*!< out: float value */
+{
+ ib_err_t err;
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_FLOAT) {
+ ib_col_copy_value_low(ib_tpl, col_no, fval, sizeof(*fval));
+ err = DB_SUCCESS;
+ } else {
+ err = DB_DATA_MISMATCH;
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Truncate a table. The cursor handle will be closed and set to NULL
+on success.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_truncate(
+/*===============*/
+ ib_crsr_t* ib_crsr, /*!< in/out: cursor for table
+ to truncate */
+ ib_id_u64_t* table_id) /*!< out: new table id */
+{
+ ib_err_t err;
+ ib_cursor_t* cursor = *(ib_cursor_t**) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ *table_id = 0;
+
+ err = ib_cursor_lock(*ib_crsr, IB_LOCK_X);
+
+ if (err == DB_SUCCESS) {
+ trx_t* trx;
+ dict_table_t* table = prebuilt->table;
+
+ /* We are going to free the cursor and the prebuilt. Store
+ the transaction handle locally. */
+ trx = prebuilt->trx;
+ err = ib_cursor_close(*ib_crsr);
+ ut_a(err == DB_SUCCESS);
+
+ *ib_crsr = NULL;
+
+ /* A temporary workaround for the assertion in
+ trx_start_for_ddl_low(): the trx has already been started. */
+ if (trx->state == TRX_STATE_ACTIVE) {
+#ifdef UNIV_DEBUG
+ trx->start_file = 0;
+#endif /* UNIV_DEBUG */
+ trx->dict_operation = TRX_DICT_OP_TABLE;
+ }
+
+ /* This function currently commits the transaction
+ on success. */
+ err = static_cast<ib_err_t>(
+ row_truncate_table_for_mysql(table, trx));
+
+ if (err == DB_SUCCESS) {
+ *table_id = (table->id);
+ }
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Truncate a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_table_truncate(
+/*==============*/
+ const char* table_name, /*!< in: table name */
+ ib_id_u64_t* table_id) /*!< out: new table id */
+{
+ ib_err_t err;
+ dict_table_t* table;
+ ib_err_t trunc_err;
+ ib_trx_t ib_trx = NULL;
+ ib_crsr_t ib_crsr = NULL;
+ ib_ulint_t memcached_sync = 0;
+
+ ib_trx = ib_trx_begin(IB_TRX_SERIALIZABLE, true, false);
+
+ dict_mutex_enter_for_mysql();
+
+ table = dict_table_open_on_name(table_name, TRUE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+
+ if (table != NULL && dict_table_get_first_index(table)) {
+ err = ib_create_cursor_with_index_id(&ib_crsr, table, 0,
+ (trx_t*) ib_trx);
+ } else {
+ err = DB_TABLE_NOT_FOUND;
+ }
+
+ /* Remember the memcached_sync_count and set it to 0, so the
+ truncate can be executed. 
*/ + if (table != NULL && err == DB_SUCCESS) { + memcached_sync = static_cast<ib_ulint_t>( + table->memcached_sync_count); + table->memcached_sync_count = 0; + } + + dict_mutex_exit_for_mysql(); + + if (err == DB_SUCCESS) { + trunc_err = ib_cursor_truncate(&ib_crsr, table_id); + ut_a(err == DB_SUCCESS); + } else { + trunc_err = err; + } + + if (ib_crsr != NULL) { + err = ib_cursor_close(ib_crsr); + ut_a(err == DB_SUCCESS); + } + + if (trunc_err == DB_SUCCESS) { + ut_a(ib_trx_state(ib_trx) == static_cast<ib_trx_state_t>( + TRX_STATE_NOT_STARTED)); + + err = ib_trx_release(ib_trx); + ut_a(err == DB_SUCCESS); + } else { + err = ib_trx_rollback(ib_trx); + ut_a(err == DB_SUCCESS); + } + + /* Set the memcached_sync_count back. */ + if (table != NULL && memcached_sync != 0) { + dict_mutex_enter_for_mysql(); + + table->memcached_sync_count = memcached_sync; + + dict_mutex_exit_for_mysql(); + } + + return(trunc_err); +} + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +UNIV_INTERN +ib_err_t +ib_close_thd( +/*=========*/ + void* thd) /*!< in: handle to the MySQL thread of the user + whose resources should be free'd */ +{ + innobase_close_thd(static_cast<THD*>(thd)); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Return isolation configuration set by "innodb_api_trx_level" +@return trx isolation level*/ +UNIV_INTERN +ib_trx_state_t +ib_cfg_trx_level() +/*==============*/ +{ + return(static_cast<ib_trx_state_t>(ib_trx_level_setting)); +} + +/*****************************************************************//** +Return configure value for background commit interval (in seconds) +@return background commit interval (in seconds) */ +UNIV_INTERN +ib_ulint_t +ib_cfg_bk_commit_interval() +/*=======================*/ +{ + return(static_cast<ib_ulint_t>(ib_bk_commit_interval)); +} + +/*****************************************************************//** +Get generic configure status +@return configure status*/ +UNIV_INTERN +int +ib_cfg_get_cfg() +/*============*/ +{ + int cfg_status; + + cfg_status = (ib_binlog_enabled) ? IB_CFG_BINLOG_ENABLED : 0; + + if (ib_mdl_enabled) { + cfg_status |= IB_CFG_MDL_ENABLED; + } + + if (ib_disable_row_lock) { + cfg_status |= IB_CFG_DISABLE_ROWLOCK; + } + + return(cfg_status); +} + +/*****************************************************************//** +Increase/decrease the memcached sync count of table to sync memcached +DML with SQL DDLs. +@return DB_SUCCESS or error number */ +UNIV_INTERN +ib_err_t +ib_cursor_set_memcached_sync( +/*=========================*/ + ib_crsr_t ib_crsr, /*!< in: cursor */ + ib_bool_t flag) /*!< in: true for increase */ +{ + const ib_cursor_t* cursor = (const ib_cursor_t*) ib_crsr; + row_prebuilt_t* prebuilt = cursor->prebuilt; + dict_table_t* table = prebuilt->table; + ib_err_t err = DB_SUCCESS; + + if (table != NULL) { + /* If memcached_sync_count is -1, means table is + doing DDL, we just return error. 
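+	(DICT_TABLE_IN_DDL is that -1 marker; ib_table_truncate() above
+	likewise zeroes memcached_sync_count while the truncate runs.)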
*/ + if (table->memcached_sync_count == DICT_TABLE_IN_DDL) { + return(DB_ERROR); + } + + if (flag) { +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_increment_lint(&table->memcached_sync_count, 1); +#else + dict_mutex_enter_for_mysql(); + ++table->memcached_sync_count; + dict_mutex_exit_for_mysql(); +#endif + } else { +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_decrement_lint(&table->memcached_sync_count, 1); +#else + dict_mutex_enter_for_mysql(); + --table->memcached_sync_count; + dict_mutex_exit_for_mysql(); +#endif + ut_a(table->memcached_sync_count >= 0); + } + } else { + err = DB_TABLE_NOT_FOUND; + } + + return(err); +} diff --git a/storage/xtradb/api/api0misc.cc b/storage/xtradb/api/api0misc.cc new file mode 100644 index 00000000000..b2370105938 --- /dev/null +++ b/storage/xtradb/api/api0misc.cc @@ -0,0 +1,206 @@ +/***************************************************************************** + +Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file api/api0misc.cc +InnoDB Native API + +2008-08-01 Created by Sunny Bains +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +*******************************************************/ + +#include <errno.h> + +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif /* HAVE_UNISTD_H */ + +#include "api0misc.h" +#include "trx0roll.h" +#include "srv0srv.h" +#include "dict0mem.h" +#include "dict0dict.h" +#include "pars0pars.h" +#include "row0sel.h" +#include "lock0lock.h" +#include "ha_prototypes.h" +#include <m_ctype.h> +#include <mysys_err.h> +#include <mysql/plugin.h> + +/*********************************************************************//** +Sets a lock on a table. 
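+A minimal calling sketch (illustrative only; the transaction and table
+handles are assumed to have been obtained elsewhere):
+@code
+	dberr_t	err = ib_trx_lock_table_with_retry(trx, table, LOCK_X);
+
+	if (err != DB_SUCCESS) {
+		/* the table lock could not be acquired */
+	}
+@endcode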
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+ib_trx_lock_table_with_retry(
+/*=========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode)		/*!< in: LOCK_X or LOCK_S */
+{
+	que_thr_t*	thr;
+	dberr_t		err;
+	mem_heap_t*	heap;
+	sel_node_t*	node;
+
+	heap = mem_heap_create(512);
+
+	trx->op_info = "setting table lock";
+
+	node = sel_node_create(heap);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+	thr->graph->state = QUE_FORK_ACTIVE;
+
+	/* We use the select query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(static_cast<que_fork_t*>(
+			que_node_get_parent(thr)));
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = thr;
+	thr->prev_node = thr->common.parent;
+
+	err = lock_table(0, table, mode, thr);
+
+	trx->error_state = err;
+
+	if (UNIV_LIKELY(err == DB_SUCCESS)) {
+		que_thr_stop_for_mysql_no_error(thr, trx);
+	} else {
+		que_thr_stop_for_mysql(thr);
+
+		if (err != DB_QUE_THR_SUSPENDED) {
+			ibool	was_lock_wait;
+
+			was_lock_wait = ib_handle_errors(&err, trx, thr, NULL);
+
+			if (was_lock_wait) {
+				goto run_again;
+			}
+		} else {
+			que_thr_t*	run_thr;
+			que_node_t*	parent;
+
+			parent = que_node_get_parent(thr);
+			run_thr = que_fork_start_command(
+				static_cast<que_fork_t*>(parent));
+
+			ut_a(run_thr == thr);
+
+			/* There was a lock wait but the thread was not
+			in a ready to run or running state. */
+			trx->error_state = DB_LOCK_WAIT;
+
+			goto run_again;
+		}
+	}
+
+	que_graph_free(thr->graph);
+	trx->op_info = "";
+
+	return(err);
+}
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running
+the query thread */
+UNIV_INTERN
+ibool
+ib_handle_errors(
+/*=============*/
+	dberr_t*	new_err,/*!< out: possible new error encountered in
+				lock wait, or if no new error, the value
+				of trx->error_state at the entry of this
+				function */
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_savept_t*	savept)	/*!< in: savepoint or NULL */
+{
+	dberr_t	err;
+handle_new_error:
+	err = trx->error_state;
+
+	ut_a(err != DB_SUCCESS);
+
+	trx->error_state = DB_SUCCESS;
+
+	switch (err) {
+	case DB_LOCK_WAIT_TIMEOUT:
+		trx_rollback_for_mysql(trx);
+		break;
+	case DB_DUPLICATE_KEY:
+	case DB_FOREIGN_DUPLICATE_KEY:
+	case DB_TOO_BIG_RECORD:
+	case DB_ROW_IS_REFERENCED:
+	case DB_NO_REFERENCED_ROW:
+	case DB_CANNOT_ADD_CONSTRAINT:
+	case DB_TOO_MANY_CONCURRENT_TRXS:
+	case DB_OUT_OF_FILE_SPACE:
+		if (savept) {
+			/* Roll back the latest, possibly incomplete
+			insertion or update */
+
+			trx_rollback_to_savepoint(trx, savept);
+		}
+		break;
+	case DB_LOCK_WAIT:
+		lock_wait_suspend_thread(thr);
+
+		if (trx->error_state != DB_SUCCESS) {
+			que_thr_stop_for_mysql(thr);
+
+			goto handle_new_error;
+		}
+
+		*new_err = err;
+
+		return(TRUE);	/* Operation needs to be retried. 
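+				The caller is expected to jump back to
+				its own run_again label and re-run the
+				query thread, as
+				ib_trx_lock_table_with_retry() does
+				above.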
*/ + + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx_rollback_for_mysql(trx); + break; + + case DB_MUST_GET_MORE_FILE_SPACE: + + exit(1); + + case DB_CORRUPTION: + case DB_FOREIGN_EXCEED_MAX_CASCADE: + break; + default: + ut_error; + } + + if (trx->error_state != DB_SUCCESS) { + *new_err = trx->error_state; + } else { + *new_err = err; + } + + trx->error_state = DB_SUCCESS; + + return(FALSE); +} diff --git a/storage/xtradb/btr/btr0btr.cc b/storage/xtradb/btr/btr0btr.cc new file mode 100644 index 00000000000..7e357f0a1cf --- /dev/null +++ b/storage/xtradb/btr/btr0btr.cc @@ -0,0 +1,5154 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0btr.cc +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#ifdef UNIV_NONINL +#include "btr0btr.ic" +#endif + +#include "fsp0fsp.h" +#include "page0page.h" +#include "page0zip.h" + +#ifndef UNIV_HOTBACKUP +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return TRUE if possible to merge. */ +UNIV_INTERN +ibool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + ulint page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr); /*!< in: mini-transaction */ + +#endif /* UNIV_HOTBACKUP */ + +/**************************************************************//** +Report that an index page is corrupted. */ +UNIV_INTERN +void +btr_corruption_report( +/*==================*/ + const buf_block_t* block, /*!< in: corrupted block */ + const dict_index_t* index) /*!< in: index tree */ +{ + fprintf(stderr, "InnoDB: flag mismatch in space %u page %u" + " index %s of table %s\n", + (unsigned) buf_block_get_space(block), + (unsigned) buf_block_get_page_no(block), + index->name, index->table_name); + if (block->page.zip.data) { + buf_page_print(block->page.zip.data, + buf_block_get_zip_size(block), + BUF_PAGE_PRINT_NO_CRASH); + } + buf_page_print(buf_block_get_frame(block), 0, 0); +} + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_BLOB_DEBUG +# include "srv0srv.h" +# include "ut0rbt.h" + +/** TRUE when messages about index->blobs modification are enabled. 
*/ +static ibool btr_blob_dbg_msg; + +/** Issue a message about an operation on index->blobs. +@param op operation +@param b the entry being subjected to the operation +@param ctx the context of the operation */ +#define btr_blob_dbg_msg_issue(op, b, ctx) \ + fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \ + (b)->ref_page_no, (b)->ref_heap_no, \ + (b)->ref_field_no, (b)->blob_page_no, ctx, \ + (b)->owner, (b)->always_owner, (b)->del) + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("insert", b, ctx); + } + mutex_enter(&index->blobs_mutex); + rbt_insert(index->blobs, b, b); + mutex_exit(&index->blobs_mutex); +} + +/** Remove from index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ +{ + if (btr_blob_dbg_msg) { + btr_blob_dbg_msg_issue("delete", b, ctx); + } + mutex_enter(&index->blobs_mutex); + ut_a(rbt_delete(index->blobs, b)); + mutex_exit(&index->blobs_mutex); +} + +/**************************************************************//** +Comparator for items (btr_blob_dbg_t) in index->blobs. +The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no). +@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */ +static +int +btr_blob_dbg_cmp( +/*=============*/ + const void* a, /*!< in: first btr_blob_dbg_t to compare */ + const void* b) /*!< in: second btr_blob_dbg_t to compare */ +{ + const btr_blob_dbg_t* aa = static_cast<const btr_blob_dbg_t*>(a); + const btr_blob_dbg_t* bb = static_cast<const btr_blob_dbg_t*>(b); + + ut_ad(aa != NULL); + ut_ad(bb != NULL); + + if (aa->ref_page_no != bb->ref_page_no) { + return(aa->ref_page_no < bb->ref_page_no ? -1 : 1); + } + if (aa->ref_heap_no != bb->ref_heap_no) { + return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1); + } + if (aa->ref_field_no != bb->ref_field_no) { + return(aa->ref_field_no < bb->ref_field_no ? -1 : 1); + } + return(0); +} + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. 
*/ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: off-page column number */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_t b; + const page_t* page = page_align(rec); + + ut_a(index->blobs); + + b.blob_page_no = page_no; + b.ref_page_no = page_get_page_no(page); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = field_no; + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner = TRUE; + b.del = FALSE; + ut_a(!rec_get_deleted_flag(rec, page_is_comp(page))); + btr_blob_dbg_rbt_insert(index, &b, ctx); +} + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count = 0; + ulint i; + btr_blob_dbg_t b; + ibool del; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + /* the column has not been stored yet */ + continue; + } + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + b.always_owner = b.owner + = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = del; + + btr_blob_dbg_rbt_insert(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ +{ + const ib_rbt_node_t* node; + + if (!index->blobs) { + return; + } + + /* We intentionally do not acquire index->blobs_mutex here. + This function is to be called from a debugger, and the caller + should make sure that the index->blobs_mutex is held. */ + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + fprintf(stderr, "%u:%u:%u->%u%s%s%s\n", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no, + b->owner ? "" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : ""); + } +} + +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. 
+@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint i; + ulint count = 0; + btr_blob_dbg_t b; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + /* The column has not been stored yet. + The BLOB pointer must be all zero. + There cannot be a BLOB starting at + page 0, because page 0 is reserved for + the tablespace header. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* fall through */ + case FIL_NULL: + /* the column has been freed already */ + continue; + } + + btr_blob_dbg_rbt_delete(index, &b, ctx); + count++; + } + } + + return(count); +} + +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ +{ + const ib_rbt_node_t* node; + ibool success = TRUE; + + if (!index->blobs) { + return(success); + } + + mutex_enter(&index->blobs_mutex); + + for (node = rbt_first(index->blobs); + node != NULL; node = rbt_next(index->blobs, node)) { + const btr_blob_dbg_t* b + = rbt_value(btr_blob_dbg_t, node); + + if (b->ref_page_no != page_no && b->blob_page_no != page_no) { + continue; + } + + fprintf(stderr, + "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n", + b->owner ? "" : "(disowned)", + b->always_owner ? "" : "(has disowned)", + b->del ? "(deleted)" : "", + b->ref_page_no, b->ref_heap_no, b->ref_field_no, + b->blob_page_no); + + if (b->blob_page_no != page_no || b->owner || !b->del) { + success = FALSE; + } + } + + mutex_exit(&index->blobs_mutex); + return(success); +} + +/**************************************************************//** +Count and process all references to off-page columns on a page. 
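+The operation is supplied as a callback; within this file it is either
+btr_blob_dbg_add_rec() or btr_blob_dbg_remove_rec().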
+@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ +{ + ulint count = 0; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_a(!rec || page_align(rec) == page); + + if (!index->blobs || !page_is_leaf(page) + || !dict_index_is_clust(index)) { + return(0); + } + + if (rec == NULL) { + rec = page_get_infimum_rec(page); + } + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + count += op(rec, index, offsets, ctx); + rec = page_rec_get_next_const(rec); + } while (!page_rec_is_supremum(rec)); + + if (heap) { + mem_heap_free(heap); + } + + return(count); +} + +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec)); +} + +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint count; + + count = btr_blob_dbg_op(page, NULL, index, ctx, + btr_blob_dbg_remove_rec); + + /* Check that no references exist. */ + btr_blob_dbg_assert_empty(index, page_get_page_no(page)); + + return(count); +} + +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ +{ + ulint removed; + ulint added; + + ut_a(page_get_page_no(npage) == page_get_page_no(page)); + ut_a(page_get_space_id(npage) == page_get_space_id(page)); + + removed = btr_blob_dbg_remove(npage, index, ctx); + added = btr_blob_dbg_add(page, index, ctx); + ut_a(added == removed); +} + +/**************************************************************//** +Modify the 'deleted' flag of a record. 
*/ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + btr_blob_dbg_t* c; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(dict_index_is_clust(index)); + ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */ + + if (!rec_offs_any_extern(offsets) || !index->blobs) { + + return; + } + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field_ref = rec_get_nth_field( + rec, offsets, i, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + + switch (b.blob_page_no) { + case 0: + ut_a(memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* page number 0 is for the + page allocation bitmap */ + case FIL_NULL: + /* the column has been freed already */ + ut_error; + } + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + ut_a(node); + + c = rbt_value(btr_blob_dbg_t, node); + /* The flag should be modified. */ + c->del = del; + if (btr_blob_dbg_msg) { + b = *c; + mutex_exit(&index->blobs_mutex); + btr_blob_dbg_msg_issue("del_mk", &b, ""); + } else { + mutex_exit(&index->blobs_mutex); + } + } + } +} + +/**************************************************************//** +Change the ownership of an off-page column. */ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ +{ + const ib_rbt_node_t* node; + btr_blob_dbg_t b; + const byte* field_ref; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(rec_offs_nth_extern(offsets, i)); + + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG); + b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); + + ut_a(b.owner == own); + + mutex_enter(&index->blobs_mutex); + node = rbt_lookup(index->blobs, &b); + /* row_ins_clust_index_entry_by_modify() invokes + btr_cur_unmark_extern_fields() also for the newly inserted + references, which are all zero bytes until the columns are stored. + The node lookup must fail if and only if that is the case. */ + ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE) + == !node); + + if (node) { + btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node); + /* Some code sets ownership from TRUE to TRUE. + We do not allow changing ownership from FALSE to FALSE. 
*/
+			ut_a(own || c->owner);
+
+			c->owner = own;
+			if (!own) {
+				c->always_owner = FALSE;
+			}
+		}
+
+	mutex_exit(&index->blobs_mutex);
+}
+#endif /* UNIV_BLOB_DEBUG */
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+A tree latch protects all non-leaf nodes of the tree. Each node of a tree
+also has a latch of its own.
+
+A B-tree operation normally first acquires an S-latch on the tree. It
+searches down the tree and releases the tree latch when it has the
+leaf node latch. To save CPU time we do not acquire any latch on
+non-leaf nodes of the tree during a search; those pages are only
+buffer-fixed.
+
+If an operation needs to restructure the tree, it acquires an X-latch on
+the tree before searching to a leaf node. If it needs, for example, to
+split a leaf,
+(1) InnoDB decides the split point in the leaf,
+(2) allocates a new page,
+(3) inserts the appropriate node pointer into the first non-leaf level,
+(4) releases the tree X-latch,
+(5) and then moves records from the leaf to the newly allocated page.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. In the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
+
+#ifdef UNIV_BTR_DEBUG
+/**************************************************************//**
+Checks a file segment header within a B-tree root page. 
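+When srv_pass_corrupt_table is set, validation failures are reported
+back to the caller instead of firing the ut_a() assertions.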
+@return TRUE if valid */ +static +ibool +btr_root_fseg_validate( +/*===================*/ + const fseg_header_t* seg_header, /*!< in: segment header */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (UNIV_UNLIKELY(srv_pass_corrupt_table != 0)) { + return (mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space) + && (offset >= FIL_PAGE_DATA) + && (offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + } + + ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space); + ut_a(offset >= FIL_PAGE_DATA); + ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + return(TRUE); +} +#endif /* UNIV_BTR_DEBUG */ + +/**************************************************************//** +Gets the root node of a tree and x- or s-latches it. +@return root page, x- or s-latched */ +static +buf_block_t* +btr_root_block_get( +/*===============*/ + const dict_index_t* index, /*!< in: index tree */ + ulint mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + ulint root_page_no; + buf_block_t* block; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + root_page_no = dict_index_get_page(index); + + block = btr_block_get(space, zip_size, root_page_no, mode, index, mtr); + + SRV_CORRUPT_TABLE_CHECK(block, return(0);); + + btr_assert_not_corrupted(block, index); +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + + if (UNIV_UNLIKELY(srv_pass_corrupt_table != 0)) { + if (!btr_root_fseg_validate(FIL_PAGE_DATA + + PAGE_BTR_SEG_LEAF + + root, space)) + return(NULL); + if (!btr_root_fseg_validate(FIL_PAGE_DATA + + PAGE_BTR_SEG_TOP + + root, space)) + return(NULL); + return(block); + } + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } +#endif /* UNIV_BTR_DEBUG */ + + return(block); +} + +/**************************************************************//** +Gets the root node of a tree and x-latches it. +@return root page, x-latched */ +UNIV_INTERN +page_t* +btr_root_get( +/*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + return(buf_block_get_frame(btr_root_block_get(index, RW_X_LATCH, + mtr))); +} + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. +@return tree height (level of the root) */ +UNIV_INTERN +ulint +btr_height_get( +/*===========*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint height; + buf_block_t* root_block; + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK) + || mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + + /* S latches the page */ + root_block = btr_root_block_get(index, RW_S_LATCH, mtr); + + height = btr_page_get_level(buf_block_get_frame(root_block), mtr); + + /* Release the S latch on the root page. 
*/ + mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX); +#ifdef UNIV_SYNC_DEBUG + sync_thread_reset_level(&root_block->lock); +#endif /* UNIV_SYNC_DEBUG */ + + return(height); +} + +/**************************************************************//** +Checks a file segment header within a B-tree root page and updates +the segment header space id. +@return TRUE if valid */ +static +bool +btr_root_fseg_adjust_on_import( +/*===========================*/ + fseg_header_t* seg_header, /*!< in/out: segment header */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + ulint space, /*!< in: tablespace identifier */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (offset < FIL_PAGE_DATA + || offset > UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) { + + return(FALSE); + + } else if (page_zip) { + mach_write_to_4(seg_header + FSEG_HDR_SPACE, space); + page_zip_write_header(page_zip, seg_header + FSEG_HDR_SPACE, + 4, mtr); + } else { + mlog_write_ulint(seg_header + FSEG_HDR_SPACE, + space, MLOG_4BYTES, mtr); + } + + return(TRUE); +} + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +UNIV_INTERN +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ +{ + dberr_t err; + mtr_t mtr; + page_t* page; + buf_block_t* block; + page_zip_des_t* page_zip; + dict_table_t* table = index->table; + ulint space_id = dict_index_get_space(index); + ulint zip_size = dict_table_zip_size(table); + ulint root_page_no = dict_index_get_page(index); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", + return(DB_CORRUPTION);); + + block = btr_block_get( + space_id, zip_size, root_page_no, RW_X_LATCH, index, &mtr); + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + /* Check that this is a B-tree page and both the PREV and NEXT + pointers are FIL_NULL, because the root page does not have any + siblings. */ + if (fil_page_get_type(page) != FIL_PAGE_INDEX + || fil_page_get_prev(page) != FIL_NULL + || fil_page_get_next(page) != FIL_NULL) { + + err = DB_CORRUPTION; + + } else if (dict_index_is_clust(index)) { + bool page_is_compact_format; + + page_is_compact_format = page_is_comp(page) > 0; + + /* Check if the page format and table format agree. */ + if (page_is_compact_format != dict_table_is_comp(table)) { + err = DB_CORRUPTION; + } else { + + /* Check that the table flags and the tablespace + flags match. */ + ulint flags = fil_space_get_flags(table->space); + + if (flags + && flags != dict_tf_to_fsp_flags(table->flags)) { + + err = DB_CORRUPTION; + } else { + err = DB_SUCCESS; + } + } + } else { + err = DB_SUCCESS; + } + + /* Check and adjust the file segment headers, if all OK so far. */ + if (err == DB_SUCCESS + && (!btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + page, page_zip, space_id, &mtr) + || !btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + page, page_zip, space_id, &mtr))) { + + err = DB_CORRUPTION; + } + + mtr_commit(&mtr); + + return(err); +} + +/*************************************************************//** +Gets pointer to the previous user record in the tree. It is assumed that +the caller has appropriate latches on the page and its neighbor. 
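+(When rec is the first user record on its page, the previous page is
+accessed; the mtr must already hold a latch on that page, as the
+mtr_memo_contains() assertions in the body verify.)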
+@return previous user record, NULL if there is none */ +UNIV_INTERN +rec_t* +btr_get_prev_user_rec( +/*==================*/ + rec_t* rec, /*!< in: record on leaf level */ + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if + needed, also to the previous page */ +{ + page_t* page; + page_t* prev_page; + ulint prev_page_no; + + if (!page_rec_is_infimum(rec)) { + + rec_t* prev_rec = page_rec_get_prev(rec); + + if (!page_rec_is_infimum(prev_rec)) { + + return(prev_rec); + } + } + + page = page_align(rec); + prev_page_no = btr_page_get_prev(page, mtr); + + if (prev_page_no != FIL_NULL) { + + ulint space; + ulint zip_size; + buf_block_t* prev_block; + + space = page_get_space_id(page); + zip_size = fil_space_get_zip_size(space); + + prev_block = buf_page_get_with_no_latch(space, zip_size, + prev_page_no, mtr); + prev_page = buf_block_get_frame(prev_block); + /* The caller must already have a latch to the brother */ + ut_ad(mtr_memo_contains(mtr, prev_block, + MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, prev_block, + MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_page) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + return(page_rec_get_prev(page_get_supremum_rec(prev_page))); + } + + return(NULL); +} + +/*************************************************************//** +Gets pointer to the next user record in the tree. It is assumed that the +caller has appropriate latches on the page and its neighbor. +@return next user record, NULL if there is none */ +UNIV_INTERN +rec_t* +btr_get_next_user_rec( +/*==================*/ + rec_t* rec, /*!< in: record on leaf level */ + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if + needed, also to the next page */ +{ + page_t* page; + page_t* next_page; + ulint next_page_no; + + if (!page_rec_is_supremum(rec)) { + + rec_t* next_rec = page_rec_get_next(rec); + + if (!page_rec_is_supremum(next_rec)) { + + return(next_rec); + } + } + + page = page_align(rec); + next_page_no = btr_page_get_next(page, mtr); + + if (next_page_no != FIL_NULL) { + ulint space; + ulint zip_size; + buf_block_t* next_block; + + space = page_get_space_id(page); + zip_size = fil_space_get_zip_size(space); + + next_block = buf_page_get_with_no_latch(space, zip_size, + next_page_no, mtr); + next_page = buf_block_get_frame(next_block); + /* The caller must already have a latch to the brother */ + ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, next_block, + MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + return(page_rec_get_next(page_get_infimum_rec(next_page))); + } + + return(NULL); +} + +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
*/ +UNIV_INTERN +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); + + if (page_zip) { + page_create_zip(block, index, level, 0, mtr); + } else { + page_create(block, mtr, dict_table_is_comp(index->table)); + /* Set the level of the new index page */ + btr_page_set_level(page, NULL, level, mtr); + } + + block->check_index_page_at_flush = TRUE; + + btr_page_set_index_id(page, page_zip, index->id, mtr); +} + +/**************************************************************//** +Allocates a new file page to be used in an ibuf tree. Takes the page from +the free list of the tree, which must contain pages! +@return new allocated block, x-latched */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + fil_addr_t node_addr; + page_t* root; + page_t* new_page; + buf_block_t* new_block; + + root = btr_root_get(index, mtr); + + node_addr = flst_get_first(root + PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST, mtr); + ut_a(node_addr.page != FIL_NULL); + + new_block = buf_page_get(dict_index_get_space(index), + dict_table_zip_size(index->table), + node_addr.page, RW_X_LATCH, mtr); + new_page = buf_block_get_frame(new_block); + buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW); + + flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, + mtr); + ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + mtr)); + + return(new_block); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated +@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded +(init_mtr == mtr, or the page was not previously freed in mtr) +@retval block (not allocated or initialized) otherwise */ +static __attribute__((nonnull, warn_unused_result)) +buf_block_t* +btr_page_alloc_low( +/*===============*/ + dict_index_t* index, /*!< in: index */ + ulint hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mtr or another + mini-transaction in which the + page should be initialized. + If init_mtr!=mtr, but the page + is already X-latched in mtr, do + not initialize the page. 
*/ +{ + fseg_header_t* seg_header; + page_t* root; + + root = btr_root_get(index, mtr); + + if (level == 0) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + } else { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + } + + /* Parameter TRUE below states that the caller has made the + reservation for free extents, and thus we know that a page can + be allocated: */ + + return(fseg_alloc_free_page_general( + seg_header, hint_page_no, file_direction, + TRUE, mtr, init_mtr)); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated +@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded +(init_mtr == mtr, or the page was not previously freed in mtr) +@retval block (not allocated or initialized) otherwise */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + ulint hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ +{ + buf_block_t* new_block; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + new_block = btr_page_alloc_low( + index, hint_page_no, file_direction, level, mtr, init_mtr); + + if (new_block) { + buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + } + + return(new_block); +} + +/**************************************************************//** +Gets the number of pages in a B-tree. +@return number of pages, or ULINT_UNDEFINED if the index is unavailable */ +UNIV_INTERN +ulint +btr_get_size( +/*=========*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + fseg_header_t* seg_header; + page_t* root; + ulint n; + ulint dummy; + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK)); + + if (index->page == FIL_NULL || dict_index_is_online_ddl(index) + || *index->name == TEMP_INDEX_PREFIX) { + return(ULINT_UNDEFINED); + } + + root = btr_root_get(index, mtr); + + SRV_CORRUPT_TABLE_CHECK(root, + { + mtr_commit(mtr); + return(0); + }); + + if (flag == BTR_N_LEAF_PAGES) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fseg_n_reserved_pages(seg_header, &n, mtr); + + } else if (flag == BTR_TOTAL_SIZE) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + n = fseg_n_reserved_pages(seg_header, &dummy, mtr); + + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + n += fseg_n_reserved_pages(seg_header, &dummy, mtr); + } else { + ut_error; + } + + return(n); +} + +/**************************************************************//** +Frees a page used in an ibuf tree. Puts the page to the free list of the +ibuf tree. 
*/ +static +void +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* root; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + root = btr_root_get(index, mtr); + + flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + buf_block_get_frame(block) + + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + + ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + mtr)); +} + +/**************************************************************//** +Frees a file page used in an index tree. Can be used also to (BLOB) +external storage pages, because the page level 0 can be given as an +argument. */ +UNIV_INTERN +void +btr_page_free_low( +/*==============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + ulint level, /*!< in: page level */ + mtr_t* mtr) /*!< in: mtr */ +{ + fseg_header_t* seg_header; + page_t* root; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* The page gets invalid for optimistic searches: increment the frame + modify clock */ + + buf_block_modify_clock_inc(block); + btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block)); + + if (dict_index_is_ibuf(index)) { + + btr_page_free_for_ibuf(index, block, mtr); + + return; + } + + root = btr_root_get(index, mtr); + + if (level == 0) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + } else { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + } + + fseg_free_page(seg_header, + buf_block_get_space(block), + buf_block_get_page_no(block), mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain buffer-fixed until mtr_commit(mtr) or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* TODO: Discard any operations on the page from the redo log + and remove the block from the flush list and the buffer pool. + This would free up buffer pool earlier and reduce writes to + both the tablespace and the redo log. */ +} + +/**************************************************************//** +Frees a file page used in an index tree. NOTE: cannot free field external +storage pages because the page must contain info on its level. */ +UNIV_INTERN +void +btr_page_free( +/*==========*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + ulint level = btr_page_get_level(page, mtr); + + ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX); + btr_page_free_low(index, block, level, mtr); +} + +/**************************************************************//** +Sets the child node file address in a node pointer. 
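+The address is the last field of the node pointer record and is
+stored as a 4-byte page number (REC_NODE_PTR_SIZE).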
*/ +UNIV_INLINE +void +btr_node_ptr_set_child_page_no( +/*===========================*/ + rec_t* rec, /*!< in: node pointer record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint page_no,/*!< in: child node address */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* field; + ulint len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_is_leaf(page_align(rec))); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + /* The child address is in the last field */ + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); + + ut_ad(len == REC_NODE_PTR_SIZE); + + if (page_zip) { + page_zip_write_node_ptr(page_zip, rec, + rec_offs_data_size(offsets), + page_no, mtr); + } else { + mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr); + } +} + +/************************************************************//** +Returns the child page of a node pointer and x-latches it. +@return child page, x-latched */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + const rec_t* node_ptr,/*!< in: node pointer */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_no; + ulint space; + + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + space = page_get_space_id(page_align(node_ptr)); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + + return(btr_block_get(space, dict_table_zip_size(index->table), + page_no, RW_X_LATCH, index, mtr)); +} + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. 
+@return rec_get_offsets() of the node pointer record */ +static +ulint* +btr_page_get_father_node_ptr_func( +/*==============================*/ + ulint* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* tuple; + rec_t* user_rec; + rec_t* node_ptr; + ulint level; + ulint page_no; + dict_index_t* index; + + page_no = buf_block_get_page_no(btr_cur_get_block(cursor)); + index = btr_cur_get_index(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + + ut_ad(dict_index_get_page(index) != page_no); + + level = btr_page_get_level(btr_cur_get_page(cursor), mtr); + + user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level); + + btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, cursor, 0, + file, line, mtr); + + node_ptr = btr_cur_get_rec(cursor); + ut_ad(!page_rec_is_comp(node_ptr) + || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + rec_t* print_rec; + fputs("InnoDB: Dump of the child page:\n", stderr); + buf_page_print(page_align(user_rec), 0, + BUF_PAGE_PRINT_NO_CRASH); + fputs("InnoDB: Dump of the parent page:\n", stderr); + buf_page_print(page_align(node_ptr), 0, + BUF_PAGE_PRINT_NO_CRASH); + + fputs("InnoDB: Corruption of an index tree: table ", stderr); + ut_print_name(stderr, NULL, TRUE, index->table_name); + fputs(", index ", stderr); + ut_print_name(stderr, NULL, FALSE, index->name); + fprintf(stderr, ",\n" + "InnoDB: father ptr page no %lu, child page no %lu\n", + (ulong) + btr_node_ptr_get_child_page_no(node_ptr, offsets), + (ulong) page_no); + print_rec = page_rec_get_next( + page_get_infimum_rec(page_align(user_rec))); + offsets = rec_get_offsets(print_rec, index, + offsets, ULINT_UNDEFINED, &heap); + page_rec_print(print_rec, offsets); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(node_ptr, offsets); + + fputs("InnoDB: You should dump + drop + reimport the table" + " to fix the\n" + "InnoDB: corruption. If the crash happens at " + "the database startup, see\n" + "InnoDB: " REFMAN "forcing-innodb-recovery.html about\n" + "InnoDB: forcing recovery. " + "Then dump + drop + reimport.\n", stderr); + + ut_error; + } + + return(offsets); +} + +#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr) + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. 
+@return rec_get_offsets() of the node pointer record */ +static +ulint* +btr_page_get_father_block( +/*======================*/ + ulint* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + dict_index_t* index, /*!< in: b-tree index */ + buf_block_t* block, /*!< in: child page in the index */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr)); +} + +/************************************************************//** +Seeks to the upper level node pointer to a page. +It is assumed that mtr holds an x-latch on the tree. */ +static +void +btr_page_get_father( +/*================*/ + dict_index_t* index, /*!< in: b-tree index */ + buf_block_t* block, /*!< in: child page in the index */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + mem_heap_t* heap; + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + + heap = mem_heap_create(100); + btr_page_get_father_node_ptr(NULL, heap, cursor, mtr); + mem_heap_free(heap); +} + +/************************************************************//** +Creates the root node for a new index tree. +@return page number of the created root, FIL_NULL if did not succeed */ +UNIV_INTERN +ulint +btr_create( +/*=======*/ + ulint type, /*!< in: type of the index */ + ulint space, /*!< in: space where created */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + index_id_t index_id,/*!< in: index id */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint page_no; + buf_block_t* block; + buf_frame_t* frame; + page_t* page; + page_zip_des_t* page_zip; + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (type & DICT_IBUF) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, 0, + IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr); + + buf_block_dbg_add_level( + ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW); + + ut_ad(buf_block_get_page_no(ibuf_hdr_block) + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + block = fseg_alloc_free_page( + buf_block_get_frame(ibuf_hdr_block) + + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, mtr); + ut_ad(buf_block_get_page_no(block) == IBUF_TREE_ROOT_PAGE_NO); + } else { +#ifdef UNIV_BLOB_DEBUG + if ((type & DICT_CLUSTERED) && !index->blobs) { + mutex_create(PFS_NOT_INSTRUMENTED, + &index->blobs_mutex, SYNC_ANY_LATCH); + index->blobs = rbt_create(sizeof(btr_blob_dbg_t), + btr_blob_dbg_cmp); + } +#endif /* UNIV_BLOB_DEBUG */ + block = fseg_create(space, 0, + PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr); + } + + if (block == NULL) { + + return(FIL_NULL); + } + + page_no = buf_block_get_page_no(block); + frame = buf_block_get_frame(block); + + if (type & DICT_IBUF) { + /* It is an insert buffer tree: initialize the free list */ + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); + + ut_ad(page_no 
== IBUF_TREE_ROOT_PAGE_NO); + + flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + /* It is a non-ibuf tree: create a file segment for leaf + pages */ + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + + if (!fseg_create(space, page_no, + PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) { + /* Not enough space for new segment, free root + segment before return. */ + btr_free_root(space, zip_size, page_no, mtr); + + return(FIL_NULL); + } + + /* The fseg create acquires a second latch on the page, + therefore we must declare it: */ + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + } + + /* Create a new index page on the allocated segment page */ + page_zip = buf_block_get_page_zip(block); + + if (page_zip) { + page = page_create_zip(block, index, 0, 0, mtr); + } else { + page = page_create(block, mtr, + dict_table_is_comp(index->table)); + /* Set the level of the new index page */ + btr_page_set_level(page, NULL, 0, mtr); + } + + block->check_index_page_at_flush = TRUE; + + /* Set the index id of the page */ + btr_page_set_index_id(page, page_zip, index_id, mtr); + + /* Set the next node and previous node fields */ + btr_page_set_next(page, page_zip, FIL_NULL, mtr); + btr_page_set_prev(page, page_zip, FIL_NULL, mtr); + + /* We reset the free bits for the page to allow creation of several + trees in the same mtr, otherwise the latch on a bitmap page would + prevent it because of the latching order */ + + if (!(type & DICT_CLUSTERED)) { + ibuf_reset_free_bits(block); + } + + /* In the following assertion we test that two records of maximum + allowed size fit on the root page: this fact is needed to ensure + correctness of split algorithms */ + + ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE); + + return(page_no); +} + +/************************************************************//** +Frees a B-tree except the root page, which MUST be freed after this +by calling btr_free_root. */ +UNIV_INTERN +void +btr_free_but_not_root( +/*==================*/ + ulint space, /*!< in: space where created */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no) /*!< in: root page number */ +{ + ibool finished; + page_t* root; + mtr_t mtr; + +leaf_loop: + mtr_start(&mtr); + + root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, + NULL, &mtr); + + SRV_CORRUPT_TABLE_CHECK(root, + { + mtr_commit(&mtr); + return; + }); + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); +#endif /* UNIV_BTR_DEBUG */ + + /* NOTE: page hash indexes are dropped when a page is freed inside + fsp0fsp. */ + + finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF, + &mtr); + mtr_commit(&mtr); + + if (!finished) { + + goto leaf_loop; + } +top_loop: + mtr_start(&mtr); + + root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, + NULL, &mtr); + + SRV_CORRUPT_TABLE_CHECK(root, + { + mtr_commit(&mtr); + return; + }); + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); +#endif /* UNIV_BTR_DEBUG */ + + finished = fseg_free_step_not_header( + root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr); + mtr_commit(&mtr); + + if (!finished) { + + goto top_loop; + } +} + +/************************************************************//** +Frees the B-tree root page. Other tree MUST already have been freed. 
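+An illustrative teardown sequence (a sketch only; the mtr is assumed
+to be a local mtr_t, and the dictionary bookkeeping that real callers
+perform is omitted):
+
+	mtr_t	mtr;
+
+	btr_free_but_not_root(space, zip_size, root_page_no);
+	mtr_start(&mtr);
+	btr_free_root(space, zip_size, root_page_no, &mtr);
+	mtr_commit(&mtr);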
*/ +UNIV_INTERN +void +btr_free_root( +/*==========*/ + ulint space, /*!< in: space where created */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no, /*!< in: root page number */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block; + fseg_header_t* header; + + block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, + NULL, mtr); + + SRV_CORRUPT_TABLE_CHECK(block, return;); + + btr_search_drop_page_hash_index(block); + + header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP; +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(header, space)); +#endif /* UNIV_BTR_DEBUG */ + + while (!fseg_free_step(header, mtr)) { + /* Free the entire segment in small steps. */ + } +} +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize_low( +/*====================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block = page_cur_get_block(cursor); +#ifndef UNIV_HOTBACKUP + buf_pool_t* buf_pool = buf_pool_from_bpage(&block->page); +#endif /* !UNIV_HOTBACKUP */ + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + buf_block_t* temp_block; + page_t* temp_page; + ulint log_mode; + ulint data_size1; + ulint data_size2; + ulint max_ins_size1; + ulint max_ins_size2; + bool success = false; + ulint pos; + bool log_compressed; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + btr_assert_not_corrupted(block, index); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + data_size1 = page_get_data_size(page); + max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); + + /* Turn logging off */ + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + +#ifndef UNIV_HOTBACKUP + temp_block = buf_block_alloc(buf_pool); +#else /* !UNIV_HOTBACKUP */ + ut_ad(block == back_block1); + temp_block = back_block2; +#endif /* !UNIV_HOTBACKUP */ + temp_page = temp_block->frame; + + MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS); + + /* Copy the old page to temporary space */ + buf_frame_copy(temp_page, page); + +#ifndef UNIV_HOTBACKUP + if (!recovery) { + btr_search_drop_page_hash_index(block); + } + + block->check_index_page_at_flush = TRUE; +#endif /* !UNIV_HOTBACKUP */ + btr_blob_dbg_remove(page, index, "btr_page_reorganize"); + + /* Save the cursor position. */ + pos = page_rec_get_n_recs_before(page_cur_get_rec(cursor)); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) 
is preserved intact */ + + page_create(block, mtr, dict_table_is_comp(index->table)); + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, temp_block, + page_get_infimum_rec(temp_page), + index, mtr); + + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) { + /* Copy max trx id to recreated page */ + trx_id_t max_trx_id = page_get_max_trx_id(temp_page); + page_set_max_trx_id(block, NULL, max_trx_id, mtr); + /* In crash recovery, dict_index_is_sec_or_ibuf() always + holds, even for clustered indexes. max_trx_id is + unused in clustered index pages. */ + ut_ad(max_trx_id != 0 || recovery); + } + + /* If innodb_log_compressed_pages is ON, page reorganize should log the + compressed page image.*/ + log_compressed = page_zip && page_zip_log_pages; + + if (log_compressed) { + mtr_set_log_mode(mtr, log_mode); + } + + if (page_zip + && !page_zip_compress(page_zip, page, index, z_level, mtr)) { + + /* Restore the old page and exit. */ + btr_blob_dbg_restore(page, temp_page, index, + "btr_page_reorganize_compress_fail"); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the bytes that we skip are identical. */ + ut_a(!memcmp(page, temp_page, PAGE_HEADER)); + ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page, + PAGE_HEADER + PAGE_N_RECS + temp_page, + PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS))); + ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page, + UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page, + FIL_PAGE_DATA_END)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page, + PAGE_N_RECS - PAGE_N_DIR_SLOTS); + memcpy(PAGE_DATA + page, PAGE_DATA + temp_page, + UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + goto func_exit; + } + +#ifndef UNIV_HOTBACKUP + if (!recovery) { + /* Update the record lock bitmaps */ + lock_move_reorganize_page(block, temp_block); + } +#endif /* !UNIV_HOTBACKUP */ + + data_size2 = page_get_data_size(page); + max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1); + + if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) { + buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(temp_page, 0, BUF_PAGE_PRINT_NO_CRASH); + + fprintf(stderr, + "InnoDB: Error: page old data size %lu" + " new data size %lu\n" + "InnoDB: Error: page old max ins size %lu" + " new max ins size %lu\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + (unsigned long) data_size1, (unsigned long) data_size2, + (unsigned long) max_ins_size1, + (unsigned long) max_ins_size2); + ut_ad(0); + } else { + success = true; + } + + /* Restore the cursor position. 
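+pos was saved before the page was recreated: it is the count of
+records preceding the cursor record, so pos == 0 means the cursor
+was on the page infimum and needs no repositioning.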
*/ + if (pos > 0) { + cursor->rec = page_rec_get_nth(page, pos); + } else { + ut_ad(cursor->rec == page_get_infimum_rec(page)); + } + +func_exit: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ +#ifndef UNIV_HOTBACKUP + buf_block_free(temp_block); +#endif /* !UNIV_HOTBACKUP */ + + /* Restore logging mode */ + mtr_set_log_mode(mtr, log_mode); + +#ifndef UNIV_HOTBACKUP + if (success) { + byte type; + byte* log_ptr; + + /* Write the log record */ + if (page_zip) { + ut_ad(page_is_comp(page)); + type = MLOG_ZIP_PAGE_REORGANIZE; + } else if (page_is_comp(page)) { + type = MLOG_COMP_PAGE_REORGANIZE; + } else { + type = MLOG_PAGE_REORGANIZE; + } + + log_ptr = log_compressed + ? NULL + : mlog_open_and_write_index( + mtr, page, index, type, + page_zip ? 1 : 0); + + /* For compressed pages write the compression level. */ + if (log_ptr && page_zip) { + mach_write_to_1(log_ptr, z_level); + mlog_close(mtr, log_ptr + 1); + } + + MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); + } +#endif /* !UNIV_HOTBACKUP */ + + return(success); +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +static __attribute__((nonnull)) +bool +btr_page_reorganize_block( +/*======================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t cur; + page_cur_set_before_first(block, &cur); + + return(btr_page_reorganize_low(recovery, z_level, &cur, index, mtr)); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + return(btr_page_reorganize_low(false, page_zip_level, + cursor, index, mtr)); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of reorganizing a page. 
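+For MLOG_ZIP_PAGE_REORGANIZE records the body carries one extra byte,
+the compression level used when the page was originally compressed;
+MLOG_PAGE_REORGANIZE and MLOG_COMP_PAGE_REORGANIZE bodies are empty.
+Schematically:
+
+	byte 0: z_level (at most 9), present only if "compressed" holds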
+@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_parse_page_reorganize( +/*======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + dict_index_t* index, /*!< in: record descriptor */ + bool compressed,/*!< in: true if compressed page */ + buf_block_t* block, /*!< in: page to be reorganized, or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ulint level; + + ut_ad(ptr && end_ptr); + + /* If dealing with a compressed page the record has the + compression level used during original compression written in + one byte. Otherwise record is empty. */ + if (compressed) { + if (ptr == end_ptr) { + return(NULL); + } + + level = mach_read_from_1(ptr); + + ut_a(level <= 9); + ++ptr; + } else { + level = page_zip_level; + } + + if (block != NULL) { + btr_page_reorganize_block(true, level, block, index, mtr); + } + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Empties an index page. @see btr_page_create(). */ +UNIV_INTERN +void +btr_page_empty( +/*===========*/ + buf_block_t* block, /*!< in: page to be emptied */ + page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block); + btr_blob_dbg_remove(page, index, "btr_page_empty"); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + if (page_zip) { + page_create_zip(block, index, level, 0, mtr); + } else { + page_create(block, mtr, dict_table_is_comp(index->table)); + btr_page_set_level(page, NULL, level, mtr); + } + + block->check_index_page_at_flush = TRUE; +} + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. 
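+In outline, matching the code below: (1) allocate a new page on the
+root level and move all root records to it, (2) empty the root and
+raise its level by one, (3) insert into the root a node pointer to
+the new page, marked with REC_INFO_MIN_REC_FLAG, and (4) split the
+new page, which now holds the old root contents, with
+btr_page_split_and_insert().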
+@return inserted record */ +UNIV_INTERN +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + page_t* root; + page_t* new_page; + ulint new_page_no; + rec_t* rec; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root_block; + buf_block_t* new_block; + + root = btr_cur_get_page(cursor); + root_block = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root_block); + ut_ad(!page_is_empty(root)); + index = btr_cur_get_index(cursor); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index)); +#endif /* UNIV_ZIP_DEBUG */ +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + ulint space = dict_index_get_space(index); + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } + + ut_a(dict_index_get_page(index) == page_get_page_no(root)); +#endif /* UNIV_BTR_DEBUG */ + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX)); + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root, mtr); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + + /* Set the next node and previous node fields of new page */ + btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr); + btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr); + + /* Copy the records from root to the new page one by one. */ + + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(new_block, root_block, + page_get_infimum_rec(root), + index, mtr)) { + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_page_zip, new_page, + root_page_zip, root, index, mtr); + + /* Update the lock table and possible hash index. 
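+The records were copied byte for byte, so the lock table and the
+adaptive hash index must be maintained explicitly here; the normal
+page_copy_rec_list_end() path above takes care of this internally.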
*/ + + lock_move_rec_list_end(new_block, root_block, + page_get_infimum_rec(root)); + + btr_search_move_or_delete_hash_entries(new_block, root_block, + index); + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + lock_update_root_raise(new_block, root_block); + + /* Create a memory heap where the node pointer is stored */ + if (!*heap) { + *heap = mem_heap_create(1000); + } + + rec = page_rec_get_next(page_get_infimum_rec(new_page)); + new_page_no = buf_block_get_page_no(new_block); + + /* Build the node pointer (= node key and page address) for the + child */ + + node_ptr = dict_index_build_node_ptr( + index, rec, new_page_no, *heap, level); + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root_block, root_page_zip, index, level + 1, mtr); + + /* Set the next node and previous node fields, although + they should already have been set. The previous node field + must be FIL_NULL if root_page_zip != NULL, because the + REC_INFO_MIN_REC_FLAG (of the first user record) will be + set if and only if btr_page_get_prev() == FIL_NULL. */ + btr_page_set_next(root, root_page_zip, FIL_NULL, mtr); + btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root_block, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + index, offsets, heap, 0, mtr); + + /* The root page should only contain the node pointer + to new_page at this point. Thus, the data should fit. */ + ut_a(node_ptr_rec); + + /* We play safe and reset the free bits for the new page */ + +#if 0 + fprintf(stderr, "Root raise new page no %lu\n", new_page_no); +#endif + + if (!dict_index_is_clust(index)) { + ibuf_reset_free_bits(new_block); + } + + /* Reposition the cursor to the child node */ + page_cur_search(new_block, index, tuple, + PAGE_CUR_LE, page_cursor); + + /* Split the child and insert tuple */ + return(btr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); +} + +/*************************************************************//** +Decides if the page should be split at the convergence point of inserts +converging to the left. +@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_left( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec) /*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple to be inserted should + be first */ +{ + page_t* page; + rec_t* insert_point; + rec_t* infimum; + + page = btr_cur_get_page(cursor); + insert_point = btr_cur_get_rec(cursor); + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) + == page_rec_get_next(insert_point)) { + + infimum = page_get_infimum_rec(page); + + /* If the convergence is in the middle of a page, include also + the record immediately before the new insert to the upper + page. Otherwise, we could repeatedly move from page to page + lots of records smaller than the convergence point. 
*/ + + if (infimum != insert_point + && page_rec_get_next(infimum) != insert_point) { + + *split_rec = insert_point; + } else { + *split_rec = page_rec_get_next(insert_point); + } + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************//** +Decides if the page should be split at the convergence point of inserts +converging to the right. +@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_right( +/*============================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec) /*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple to be inserted should + be first */ +{ + page_t* page; + rec_t* insert_point; + + page = btr_cur_get_page(cursor); + insert_point = btr_cur_get_rec(cursor); + + /* We use eager heuristics: if the new insert would be right after + the previous insert on the same page, we assume that there is a + pattern of sequential inserts here. */ + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) { + + rec_t* next_rec; + + next_rec = page_rec_get_next(insert_point); + + if (page_rec_is_supremum(next_rec)) { +split_at_new: + /* Split at the new record to insert */ + *split_rec = NULL; + } else { + rec_t* next_next_rec = page_rec_get_next(next_rec); + if (page_rec_is_supremum(next_next_rec)) { + + goto split_at_new; + } + + /* If there are >= 2 user records up from the insert + point, split all but 1 off. We want to keep one because + then sequential inserts can use the adaptive hash + index, as they can do the necessary checks of the right + search position just by looking at the records on this + page. */ + + *split_rec = next_next_rec; + } + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************//** +Calculates a split record such that the tuple will certainly fit on +its half-page when the split is performed. We assume in this function +only that the cursor page has at least one user record. +@return split record, or NULL if tuple will be the first record on +the lower or upper half-page (determined by btr_page_tuple_smaller()) */ +static +rec_t* +btr_page_get_split_rec( +/*===================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert should be made */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + page_t* page; + page_zip_des_t* page_zip; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + ulint total_space; + ulint incl_data; + rec_t* ins_rec; + rec_t* rec; + rec_t* next_rec; + ulint n; + mem_heap_t* heap; + ulint* offsets; + + page = btr_cur_get_page(cursor); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + page_zip = btr_cur_get_page_zip(cursor); + if (page_zip) { + /* Estimate the free space of an empty compressed page. 
*/ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, + page_zip_get_size(page_zip)); + + if (free_space > (ulint) free_space_zip) { + free_space = (ulint) free_space_zip; + } + } + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = page_get_n_recs(page) + 1; + ut_ad(total_n_recs >= 2); + total_space = total_data + page_dir_calc_reserved_space(total_n_recs); + + n = 0; + incl_data = 0; + ins_rec = btr_cur_get_rec(cursor); + rec = page_get_infimum_rec(page); + + heap = NULL; + offsets = NULL; + + /* We start to include records to the left half, and when the + space reserved by them exceeds half of total_space, then if + the included records fit on the left page, they will be put there + if something was left over also for the right page, + otherwise the last included record will be the first on the right + half page */ + + do { + /* Decide the next record to include */ + if (rec == ins_rec) { + rec = NULL; /* NULL denotes that tuple is + now included */ + } else if (rec == NULL) { + rec = page_rec_get_next(ins_rec); + } else { + rec = page_rec_get_next(rec); + } + + if (rec == NULL) { + /* Include tuple */ + incl_data += insert_size; + } else { + offsets = rec_get_offsets(rec, cursor->index, + offsets, ULINT_UNDEFINED, + &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(rec); +} + +/*************************************************************//** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. 
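+The fit test applied below is, in essence,
+
+	total_data + page_dir_calc_reserved_space(total_n_recs)
+		<= free_space
+
+evaluated first for the full page contents plus the tuple, and then
+repeatedly with the records that would move to the other half-page
+subtracted one by one.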
+@return true if fits */ +static __attribute__((nonnull(1,3,4,6), warn_unused_result)) +bool +btr_page_insert_fits( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert + should be made */ + const rec_t* split_rec,/*!< in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + ulint** offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index); out: garbage */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mem_heap_t** heap) /*!< in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec + || !page_is_comp(page) == !rec_offs_comp(*offsets)); + ut_ad(!split_rec + || rec_offs_validate(split_rec, cursor->index, *offsets)); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = page_get_n_recs(page) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (split_rec == NULL) { + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) { + + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = split_rec; + } else { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + } + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); + + total_data -= rec_offs_size(*offsets); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + rec = page_rec_get_next_const(rec); + } + + return(false); +} + +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. 
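+The insert is first attempted optimistically and, on DB_FAIL,
+repeated pessimistically; both attempts add BTR_NO_LOCKING_FLAG,
+BTR_KEEP_SYS_FLAG and BTR_NO_UNDO_LOG_FLAG to the caller's flags,
+since changes to non-leaf pages are neither locked nor undo logged,
+and node pointer records have no system columns to update.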
*/ +UNIV_INTERN +void +btr_insert_on_non_leaf_level_func( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + dberr_t err; + rec_t* rec; + ulint* offsets = NULL; + mem_heap_t* heap = NULL; + + ut_ad(level > 0); + + btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, + &cursor, 0, file, line, mtr); + + ut_ad(cursor.flag == BTR_CUR_BINARY); + + err = btr_cur_optimistic_insert( + flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, &dummy_big_rec, 0, NULL, mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + ut_a(err == DB_SUCCESS); + } + mem_heap_free(heap); +} + +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. */ +static __attribute__((nonnull)) +void +btr_attach_half_pages( +/*==================*/ + ulint flags, /*!< in: undo logging and + locking flags */ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + const rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + ulint prev_page_no; + ulint next_page_no; + ulint level; + page_t* page = buf_block_get_frame(block); + page_t* lower_page; + page_t* upper_page; + ulint lower_page_no; + ulint upper_page_no; + page_zip_des_t* lower_page_zip; + page_zip_des_t* upper_page_zip; + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + ulint* offsets; + + lower_page = buf_block_get_frame(new_block); + lower_page_no = buf_block_get_page_no(new_block); + lower_page_zip = buf_block_get_page_zip(new_block); + upper_page = buf_block_get_frame(block); + upper_page_no = buf_block_get_page_no(block); + upper_page_zip = buf_block_get_page_zip(block); + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(NULL, heap, index, + block, mtr, &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_rec(&cursor), + btr_cur_get_page_zip(&cursor), + offsets, lower_page_no, mtr); + mem_heap_empty(heap); + } else { + lower_page = buf_block_get_frame(block); + lower_page_no = buf_block_get_page_no(block); + lower_page_zip = buf_block_get_page_zip(block); + upper_page = buf_block_get_frame(new_block); + upper_page_no = buf_block_get_page_no(new_block); + upper_page_zip = buf_block_get_page_zip(new_block); + } + + /* Get the level of the split pages 
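+(both halves live on the same level; the assertion just below
+checks that the new page was created at the level of the original)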
*/ + level = btr_page_get_level(buf_block_get_frame(block), mtr); + ut_ad(level + == btr_page_get_level(buf_block_get_frame(new_block), mtr)); + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr(index, split_rec, + upper_page_no, heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. */ + + btr_insert_on_non_leaf_level(flags, index, level + 1, + node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + /* Get the previous and next pages of page */ + + prev_page_no = btr_page_get_prev(page, mtr); + next_page_no = btr_page_get_next(page, mtr); + space = buf_block_get_space(block); + zip_size = buf_block_get_zip_size(block); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block = btr_block_get( + space, zip_size, prev_page_no, RW_X_LATCH, index, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_block->frame) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_block->frame, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(buf_block_get_frame(prev_block), + buf_block_get_page_zip(prev_block), + lower_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block = btr_block_get( + space, zip_size, next_page_no, RW_X_LATCH, index, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_block->frame) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(buf_block_get_frame(next_block), + buf_block_get_page_zip(next_block), + upper_page_no, mtr); + } + + btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr); + btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr); + + btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr); + btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr); +} + +/*************************************************************//** +Determine if a tuple is smaller than any record on the page. +@return TRUE if smaller */ +static __attribute__((nonnull, warn_unused_result)) +bool +btr_page_tuple_smaller( +/*===================*/ + btr_cur_t* cursor, /*!< in: b-tree cursor */ + const dtuple_t* tuple, /*!< in: tuple to consider */ + ulint** offsets,/*!< in/out: temporary storage */ + ulint n_uniq, /*!< in: number of unique fields + in the index page records */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + buf_block_t* block; + const rec_t* first_rec; + page_cur_t pcur; + + /* Read the first user record in the page. */ + block = btr_cur_get_block(cursor); + page_cur_set_before_first(block, &pcur); + page_cur_move_to_next(&pcur); + first_rec = page_cur_get_rec(&pcur); + + *offsets = rec_get_offsets( + first_rec, cursor->index, *offsets, + n_uniq, heap); + + return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0); +} + +/** Insert the tuple into the right sibling page, if the cursor is at the end +of a page. +@param[in] flags undo logging and locking flags +@param[in,out] cursor cursor at which to insert; when the function succeeds, + the cursor is positioned before the insert point. 
+@param[out] offsets offsets on inserted record +@param[in,out] heap memory heap for allocating offsets +@param[in] tuple tuple to insert +@param[in] n_ext number of externally stored columns +@param[in,out] mtr mini-transaction +@return inserted record (first record on the right sibling page); + the cursor will be positioned on the page infimum +@retval NULL if the operation was not performed */ +static +rec_t* +btr_insert_into_right_sibling( + ulint flags, + btr_cur_t* cursor, + ulint** offsets, + mem_heap_t* heap, + const dtuple_t* tuple, + ulint n_ext, + mtr_t* mtr) +{ + buf_block_t* block = btr_cur_get_block(cursor); + page_t* page = buf_block_get_frame(block); + ulint next_page_no = btr_page_get_next(page, mtr); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(heap); + + if (next_page_no == FIL_NULL || !page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(cursor)))) { + + return(NULL); + } + + page_cur_t next_page_cursor; + buf_block_t* next_block; + page_t* next_page; + btr_cur_t next_father_cursor; + rec_t* rec = NULL; + ulint zip_size = buf_block_get_zip_size(block); + ulint max_size; + + next_block = btr_block_get( + buf_block_get_space(block), zip_size, + next_page_no, RW_X_LATCH, cursor->index, mtr); + next_page = buf_block_get_frame(next_block); + + bool is_leaf = page_is_leaf(next_page); + + btr_page_get_father( + cursor->index, next_block, mtr, &next_father_cursor); + + page_cur_search( + next_block, cursor->index, tuple, PAGE_CUR_LE, + &next_page_cursor); + + max_size = page_get_max_insert_size_after_reorganize(next_page, 1); + + /* Extends gap lock for the next page */ + lock_update_split_left(next_block, block); + + rec = page_cur_tuple_insert( + &next_page_cursor, tuple, cursor->index, offsets, &heap, + n_ext, mtr); + + if (rec == NULL) { + if (zip_size && is_leaf + && !dict_index_is_clust(cursor->index)) { + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. */ + ibuf_reset_free_bits(next_block); + } + return(NULL); + } + + ibool compressed; + dberr_t err; + ulint level = btr_page_get_level(next_page, mtr); + + /* adjust cursor position */ + *btr_cur_get_page_cur(cursor) = next_page_cursor; + + ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page)); + ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec); + + /* We have to change the parent node pointer */ + + compressed = btr_cur_pessimistic_delete( + &err, TRUE, &next_father_cursor, + BTR_CREATE_FLAG, RB_NONE, mtr); + + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr); + } + + dtuple_t* node_ptr = dict_index_build_node_ptr( + cursor->index, rec, buf_block_get_page_no(next_block), + heap, level); + + btr_insert_on_non_leaf_level( + flags, cursor->index, level + 1, node_ptr, mtr); + + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + + if (is_leaf && !dict_index_is_clust(cursor->index)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + if (zip_size) { + ibuf_update_free_bits_zip(next_block, mtr); + } else { + ibuf_update_free_bits_if_full( + next_block, max_size, + rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE); + } + } + + return(rec); +} + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. 
It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. + +@return inserted record */ +UNIV_INTERN +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + ulint page_no; + byte direction; + ulint hint_page_no; + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + buf_block_t* left_block; + buf_block_t* right_block; + buf_block_t* insert_block; + page_cur_t* page_cursor; + rec_t* first_rec; + byte* buf = 0; /* remove warning */ + rec_t* move_limit; + ibool insert_will_fit; + ibool insert_left; + ulint n_iterations = 0; + rec_t* rec; + ulint n_uniq; + + if (!*heap) { + *heap = mem_heap_create(1024); + } + n_uniq = dict_index_get_n_unique_in_tree(cursor->index); +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index) + || (flags & BTR_CREATE_FLAG) + || dict_index_is_clust(cursor->index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_is_empty(page)); + + /* try to insert to the next page if possible before split */ + rec = btr_insert_into_right_sibling( + flags, cursor, offsets, *heap, tuple, n_ext, mtr); + + if (rec != NULL) { + return(rec); + } + + page_no = buf_block_get_page_no(block); + + /* 1. Decide the split record; split_rec == NULL means that the + tuple to be inserted should be the first record on the upper + half-page */ + insert_left = FALSE; + + if (n_iterations > 0) { + direction = FSP_UP; + hint_page_no = page_no + 1; + split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); + + if (split_rec == NULL) { + insert_left = btr_page_tuple_smaller( + cursor, tuple, offsets, n_uniq, heap); + } + } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { + direction = FSP_UP; + hint_page_no = page_no + 1; + + } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) { + direction = FSP_DOWN; + hint_page_no = page_no - 1; + ut_ad(split_rec); + } else { + direction = FSP_UP; + hint_page_no = page_no + 1; + + /* If there is only one record in the index page, we + can't split the node in the middle by default. We need + to determine whether the new record will be inserted + to the left or right. 
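+With a single user record R on the page: if the tuple sorts below R,
+split_rec is set to R, so R starts the upper half and the tuple goes
+to the lower half; otherwise split_rec stays NULL, meaning the tuple
+itself will be the first record on the upper half.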
*/ + + if (page_get_n_recs(page) > 1) { + split_rec = page_get_middle_rec(page); + } else if (btr_page_tuple_smaller(cursor, tuple, + offsets, n_uniq, heap)) { + split_rec = page_rec_get_next( + page_get_infimum_rec(page)); + } else { + split_rec = NULL; + } + } + + /* 2. Allocate a new page to the index */ + new_block = btr_page_alloc(cursor->index, hint_page_no, direction, + btr_page_get_level(page, mtr), mtr, mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + btr_page_create(new_block, new_page_zip, cursor->index, + btr_page_get_level(page, mtr), mtr); + + /* 3. Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + *offsets = rec_get_offsets(split_rec, cursor->index, *offsets, + n_uniq, heap); + + insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0; + + if (!insert_left && new_page_zip && n_iterations > 0) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_empty; + } + } else if (insert_left) { + ut_a(n_iterations > 0); + first_rec = page_rec_get_next(page_get_infimum_rec(page)); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } else { +insert_empty: + ut_ad(!split_rec); + ut_ad(!insert_left); + buf = (byte*) mem_alloc(rec_get_converted_size(cursor->index, + tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index, + tuple, n_ext); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } + + /* 4. Do first the modifications in the tree structure */ + + btr_attach_half_pages(flags, cursor->index, block, + first_rec, new_block, direction, mtr); + + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + + if (split_rec) { + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, + offsets, tuple, n_ext, heap); + } else { + if (!insert_left) { + mem_free(buf); + buf = NULL; + } + + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, NULL, + offsets, tuple, n_ext, heap); + } + + if (insert_will_fit && page_is_leaf(page) + && !dict_index_is_online_ddl(cursor->index)) { + + mtr_memo_release(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + } + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_move_rec_list_start(new_block, block, move_limit, + cursor->index, mtr)) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_end(move_limit - page + new_page, + new_block, cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + + /* Update the lock table and possible hash index. 
*/ + + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + + btr_search_move_or_delete_hash_entries( + new_block, block, cursor->index); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index, mtr); + } + + left_block = new_block; + right_block = block; + + lock_update_split_left(right_block, left_block); + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_move_rec_list_end(new_block, block, move_limit, + cursor->index, mtr)) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_end(new_block, block, move_limit); + + btr_search_move_or_delete_hash_entries( + new_block, block, cursor->index); + + /* Delete the records from the source page. */ + + page_delete_rec_list_end(move_limit, block, + cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + } + + left_block = block; + right_block = new_block; + + lock_update_split_right(right_block, left_block); + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip) { + ut_a(page_zip_validate(page_zip, page, cursor->index)); + ut_a(page_zip_validate(new_page_zip, new_page, cursor->index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + + if (insert_left) { + insert_block = left_block; + } else { + insert_block = right_block; + } + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_search(insert_block, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_t* insert_page + = buf_block_get_frame(insert_block); + + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page, + cursor->index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (rec != NULL) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. 
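+Hence the page_zip check below: reorganizing a compressed page once
+more could not make any further room, so such pages go straight to
+the retry path that loops back for another split.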
*/ + + if (page_cur_get_page_zip(page_cursor) + || !btr_page_reorganize(page_cursor, cursor->index, mtr)) { + + goto insert_failed; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + + if (rec == NULL) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits */ + if (!dict_index_is_clust(cursor->index)) { + ibuf_reset_free_bits(new_block); + ibuf_reset_free_bits(block); + } + + /* fprintf(stderr, "Split second round %lu\n", + page_get_page_no(page)); */ + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) { + ibuf_update_free_bits_for_two_pages_low( + buf_block_get_zip_size(left_block), + left_block, right_block, mtr); + } + +#if 0 + fprintf(stderr, "Split and insert done %lu %lu\n", + buf_block_get_page_no(left_block), + buf_block_get_page_no(right_block)); +#endif + MONITOR_INC(MONITOR_INDEX_SPLIT); + + ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index)); + + ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets)); + return(rec); +} + +#ifdef UNIV_SYNC_DEBUG +/*************************************************************//** +Removes a page from the level list of pages. +@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,index,mtr) +#else /* UNIV_SYNC_DEBUG */ +/*************************************************************//** +Removes a page from the level list of pages. +@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,mtr) +#endif /* UNIV_SYNC_DEBUG */ + +/*************************************************************//** +Removes a page from the level list of pages. 
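+This is the usual doubly-linked-list unlink, applied to the
+FIL_PAGE_PREV and FIL_PAGE_NEXT fields of the sibling pages;
+schematically (a sketch, not the actual field access code):
+
+	prev->next = page->next;
+	next->prev = page->prev;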
*/ +static __attribute__((nonnull)) +void +btr_level_list_remove_func( +/*=======================*/ + ulint space, /*!< in: space where removed */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* page, /*!< in/out: page to remove */ +#ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree */ +#endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint prev_page_no; + ulint next_page_no; + + ut_ad(page && mtr); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(space == page_get_space_id(page)); + /* Get the previous and next page numbers of page */ + + prev_page_no = btr_page_get_prev(page, mtr); + next_page_no = btr_page_get_next(page, mtr); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block + = btr_block_get(space, zip_size, prev_page_no, + RW_X_LATCH, index, mtr); + page_t* prev_page + = buf_block_get_frame(prev_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_page) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(prev_page, + buf_block_get_page_zip(prev_block), + next_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block + = btr_block_get(space, zip_size, next_page_no, + RW_X_LATCH, index, mtr); + page_t* next_page + = buf_block_get_frame(next_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(next_page, + buf_block_get_page_zip(next_block), + prev_page_no, mtr); + } +} + +/****************************************************************//** +Writes the redo log record for setting an index record as the predefined +minimum record. */ +UNIV_INLINE +void +btr_set_min_rec_mark_log( +/*=====================*/ + rec_t* rec, /*!< in: record */ + byte type, /*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(rec, type, mtr); + + /* Write rec offset as a 2-byte ulint */ + mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES); +} +#else /* !UNIV_HOTBACKUP */ +# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/****************************************************************//** +Parses the redo log record for setting an index record as the predefined +minimum record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_parse_set_min_rec_mark( +/*=======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint comp, /*!< in: nonzero=compact page format */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + rec_t* rec; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + if (page) { + ut_a(!page_is_comp(page) == !comp); + + rec = page + mach_read_from_2(ptr); + + btr_set_min_rec_mark(rec, mtr); + } + + return(ptr + 2); +} + +/****************************************************************//** +Sets a record as the predefined minimum record. 
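+The mark is the REC_INFO_MIN_REC_FLAG info bit. It is set on the
+first user record of the leftmost page of each non-leaf level, where
+no lower bound exists to compare against; the change is redo logged
+as MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK depending on the
+record format.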
*/ +UNIV_INTERN +void +btr_set_min_rec_mark( +/*=================*/ + rec_t* rec, /*!< in: record */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint info_bits; + + if (page_rec_is_comp(rec)) { + info_bits = rec_get_info_bits(rec, TRUE); + + rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG); + + btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr); + } else { + info_bits = rec_get_info_bits(rec, FALSE); + + rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG); + + btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr); + } +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Deletes on the upper level the node pointer to a page. */ +UNIV_INTERN +void +btr_node_ptr_delete( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page whose node pointer is deleted */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_cur_t cursor; + ibool compressed; + dberr_t err; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + /* Delete node pointer on father page */ + btr_page_get_father(index, block, mtr, &cursor); + + compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, + BTR_CREATE_FLAG, RB_NONE, mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor, FALSE, mtr); + } +} + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. +@return father block */ +static +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* father_block; + page_t* father_page; + ulint page_level; + page_zip_des_t* father_page_zip; + page_t* page = buf_block_get_frame(block); + ulint root_page_no; + buf_block_t* blocks[BTR_MAX_LEVELS]; + ulint n_blocks; /*!< last used index in blocks[] */ + ulint i; + bool lift_father_up; + buf_block_t* block_orig = block; + + ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); + ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + page_level = btr_page_get_level(page, mtr); + root_page_no = dict_index_get_page(index); + + { + btr_cur_t cursor; + ulint* offsets = NULL; + mem_heap_t* heap = mem_heap_create( + sizeof(*offsets) + * (REC_OFFS_HEADER_SIZE + 1 + 1 + index->n_fields)); + buf_block_t* b; + + offsets = btr_page_get_father_block(offsets, heap, index, + block, mtr, &cursor); + father_block = btr_cur_get_block(&cursor); + father_page_zip = buf_block_get_page_zip(father_block); + father_page = buf_block_get_frame(father_block); + + n_blocks = 0; + + /* Store all ancestor pages so we can reset their + levels later on. We have to do all the searches on + the tree now because later on, after we've replaced + the first level, the tree is in an inconsistent state + and can not be searched. */ + for (b = father_block; + buf_block_get_page_no(b) != root_page_no; ) { + ut_a(n_blocks < BTR_MAX_LEVELS); + + offsets = btr_page_get_father_block(offsets, heap, + index, b, + mtr, &cursor); + + blocks[n_blocks++] = b = btr_cur_get_block(&cursor); + } + + lift_father_up = (n_blocks && page_level == 0); + if (lift_father_up) { + /* The father page also should be the only on its level (not + root). 
In that case the father page must be lifted up first:
+	a leaf page may be lifted up directly only into the root page.
+	Freeing a page chooses the file segment based on page_level
+	(==0 for the leaf segment, !=0 for the non-leaf segment), so if
+	a page's level changed from !=0 to ==0 before it was freed, the
+	later free would look in the wrong segment and would not find
+	the page allocation to be freed.*/
+
+ block = father_block;
+ page = buf_block_get_frame(block);
+ page_level = btr_page_get_level(page, mtr);
+
+ ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
+ ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ father_block = blocks[0];
+ father_page_zip = buf_block_get_page_zip(father_block);
+ father_page = buf_block_get_frame(father_block);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Make the father empty */
+ btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+ page_level++;
+
+ /* Copy the records to the father page one by one. */
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(father_block, block,
+ page_get_infimum_rec(page),
+ index, mtr)) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(father_page_zip);
+ ut_a(page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(father_page_zip, father_page,
+ page_zip, page, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(father_block, block,
+ page_get_infimum_rec(page));
+
+ btr_search_move_or_delete_hash_entries(father_block, block,
+ index);
+ }
+
+ btr_blob_dbg_remove(page, index, "btr_lift_page_up");
+ lock_update_copy_and_discard(father_block, block);
+
+ /* Go upward to root page, decrementing levels by one. */
+ for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
+ page_t* page = buf_block_get_frame(blocks[i]);
+ page_zip_des_t* page_zip = buf_block_get_page_zip(blocks[i]);
+
+ ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
+
+ btr_page_set_level(page, page_zip, page_level, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ /* We play it safe and reset the free bits for the father */
+ if (!dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(father_block);
+ }
+ ut_ad(page_validate(father_page, index));
+ ut_ad(btr_check_node_ptr(index, father_block, mtr));
+
+ return(lift_father_up ? block_orig : father_block);
+}
+
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches on the
+brothers, if they exist.
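+In outline, the decision made below is:
+
+	no left and no right sibling   -> btr_lift_page_up()
+	records fit in left sibling    -> merge to the left
+	records fit in right sibling   -> merge to the right
+	otherwise                      -> return FALSE, nothing merged
+
+where "fit" is decided by btr_can_merge_with_page(), which may first
+reorganize the target page to make room.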
+@return TRUE on success */ +UNIV_INTERN +ibool +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index; + ulint space; + ulint zip_size; + ulint left_page_no; + ulint right_page_no; + buf_block_t* merge_block; + page_t* merge_page = NULL; + page_zip_des_t* merge_page_zip; + ibool is_left; + buf_block_t* block; + page_t* page; + btr_cur_t father_cursor; + mem_heap_t* heap; + ulint* offsets; + ulint nth_rec = 0; /* remove bogus warning */ + DBUG_ENTER("btr_compress"); + + block = btr_cur_get_block(cursor); + page = btr_cur_get_page(cursor); + index = btr_cur_get_index(cursor); + + btr_assert_not_corrupted(block, index); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS); + + left_page_no = btr_page_get_prev(page, mtr); + right_page_no = btr_page_get_next(page, mtr); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + heap = mem_heap_create(100); + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &father_cursor); + + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + ut_ad(nth_rec > 0); + } + + if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) { + /* The page is the only one on the level, lift the records + to the father */ + + merge_block = btr_lift_page_up(index, block, mtr); + goto func_exit; + } + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = btr_can_merge_with_page(cursor, left_page_no, + &merge_block, mtr); + + DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;); + + if(!is_left + && !btr_can_merge_with_page(cursor, right_page_no, &merge_block, + mtr)) { + goto err_exit; + } + + merge_page = buf_block_get_frame(merge_block); + +#ifdef UNIV_BTR_DEBUG + if (is_left) { + ut_a(btr_page_get_next(merge_page, mtr) + == buf_block_get_page_no(block)); + } else { + ut_a(btr_page_get_prev(merge_page, mtr) + == buf_block_get_page_no(block)); + } +#endif /* UNIV_BTR_DEBUG */ + + ut_ad(page_validate(merge_page, index)); + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (merge_page_zip) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page, index)); + ut_a(page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* Move records to the merge page */ + if (is_left) { + rec_t* orig_pred = page_copy_rec_list_start( + merge_block, block, page_get_supremum_rec(page), + index, mtr); + + if (!orig_pred) { + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, index, mtr); + + btr_node_ptr_delete(index, block, mtr); + lock_update_merge_left(merge_block, orig_pred, block); + + if (adjust) { + nth_rec += 
page_rec_get_n_recs_before(orig_pred); + } + } else { + rec_t* orig_succ; + ibool compressed; + dberr_t err; + btr_cur_t cursor2; + /* father cursor pointing to node ptr + of the right sibling */ +#ifdef UNIV_BTR_DEBUG + byte fil_page_prev[4]; +#endif /* UNIV_BTR_DEBUG */ + + btr_page_get_father(index, merge_block, mtr, &cursor2); + + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* The function page_zip_compress(), which will be + invoked by page_copy_rec_list_end() below, + requires that FIL_PAGE_PREV be FIL_NULL. + Clear the field, but prepare to restore it. */ +#ifdef UNIV_BTR_DEBUG + memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); +#endif /* UNIV_BTR_DEBUG */ +#if FIL_NULL != 0xffffffff +# error "FIL_NULL != 0xffffffff" +#endif + memset(merge_page + FIL_PAGE_PREV, 0xff, 4); + } + + orig_succ = page_copy_rec_list_end(merge_block, block, + page_get_infimum_rec(page), + cursor->index, mtr); + + if (!orig_succ) { + ut_a(merge_page_zip); +#ifdef UNIV_BTR_DEBUG + if (left_page_no == FIL_NULL) { + /* FIL_PAGE_PREV was restored from + merge_page_zip. */ + ut_a(!memcmp(fil_page_prev, + merge_page + FIL_PAGE_PREV, 4)); + } +#endif /* UNIV_BTR_DEBUG */ + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + +#ifdef UNIV_BTR_DEBUG + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* Restore FIL_PAGE_PREV in order to avoid an assertion + failure in btr_level_list_remove(), which will set + the field again to FIL_NULL. Even though this makes + merge_page and merge_page_zip inconsistent for a + split second, it is harmless, because the pages + are X-latched. */ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } +#endif /* UNIV_BTR_DEBUG */ + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, index, mtr); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + btr_node_ptr_set_child_page_no( + btr_cur_get_rec(&father_cursor), + btr_cur_get_page_zip(&father_cursor), + offsets, right_page_no, mtr); + + compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor2, + BTR_CREATE_FLAG, + RB_NONE, mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor2, FALSE, mtr); + } + + lock_update_merge_right(merge_block, orig_succ, block); + } + + btr_blob_dbg_remove(page, index, "btr_compress"); + + if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. 
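+	For example: if a separate mini-transaction that increments
+	the bits were flushed to the redo log, but the merge itself
+	were lost in a crash, recovery would be left with free bits
+	that exceed the real free space on the page, and a later
+	buffered insert could be directed to a page it cannot fit on.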
*/ + + if (zip_size) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + UNIV_PAGE_SIZE, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page, + index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* Free the file page */ + btr_page_free(index, block, mtr); + + ut_ad(btr_check_node_ptr(index, merge_block, mtr)); +func_exit: + mem_heap_free(heap); + + if (adjust) { + ut_ad(nth_rec > 0); + btr_cur_position( + index, + page_rec_get_nth(merge_block->frame, nth_rec), + merge_block, cursor); + } + + MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL); + + DBUG_RETURN(TRUE); + +err_exit: + /* We play it safe and reset the free bits. */ + if (zip_size + && merge_page + && page_is_leaf(merge_page) + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(merge_block); + } + + mem_heap_free(heap); + DBUG_RETURN(FALSE); +} + +/*************************************************************//** +Discards a page that is the only page on its level. This will empty +the whole B-tree, leaving just an empty root page. This function +should never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. */ +static +void +btr_discard_only_page_on_level( +/*===========================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_level = 0; + trx_id_t max_trx_id; + + /* Save the PAGE_MAX_TRX_ID from the leaf page. */ + max_trx_id = page_get_max_trx_id(buf_block_get_frame(block)); + + while (buf_block_get_page_no(block) != dict_index_get_page(index)) { + btr_cur_t cursor; + buf_block_t* father; + const page_t* page = buf_block_get_frame(block); + + ut_a(page_get_n_recs(page) == 1); + ut_a(page_level == btr_page_get_level(page, mtr)); + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); + ut_a(btr_page_get_next(page, mtr) == FIL_NULL); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + btr_search_drop_page_hash_index(block); + + btr_page_get_father(index, block, mtr, &cursor); + father = btr_cur_get_block(&cursor); + + lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block); + + /* Free the file page */ + btr_page_free(index, block, mtr); + + block = father; + page_level++; + } + + /* block is the root page, which must be empty, except + for the node pointer to the (now discarded) block(s). 
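+	(The loop above guaranteed this: each discarded page carried
+	exactly one record, the node pointer to its child, as asserted
+	by ut_a(page_get_n_recs(page) == 1).)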
*/ + +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + const ulint space = dict_index_get_space(index); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } +#endif /* UNIV_BTR_DEBUG */ + + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + + if (!dict_index_is_clust(index)) { + /* We play it safe and reset the free bits for the root */ + ibuf_reset_free_bits(block); + + ut_a(max_trx_id); + page_set_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, mtr); + } +} + +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +UNIV_INTERN +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + ulint space; + ulint zip_size; + ulint left_page_no; + ulint right_page_no; + buf_block_t* merge_block; + page_t* merge_page; + buf_block_t* block; + page_t* page; + rec_t* node_ptr; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + + ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block)); + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + MONITOR_INC(MONITOR_INDEX_DISCARD); + + /* Decide the page which will inherit the locks */ + + left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr); + right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr); + + if (left_page_no != FIL_NULL) { + merge_block = btr_block_get(space, zip_size, left_page_no, + RW_X_LATCH, index, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else if (right_page_no != FIL_NULL) { + merge_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, index, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else { + btr_discard_only_page_on_level(index, block, mtr); + + return; + } + + page = buf_block_get_frame(block); + ut_a(page_is_comp(merge_page) == page_is_comp(page)); + btr_search_drop_page_hash_index(block); + + if (left_page_no == FIL_NULL && !page_is_leaf(page)) { + + /* We have to mark the leftmost node pointer on the right + side page as the predefined minimum record */ + node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page)); + + ut_ad(page_rec_is_user_rec(node_ptr)); + + /* This will make page_zip_validate() fail on merge_page + until btr_level_list_remove() completes. This is harmless, + because everything will take place within a single + mini-transaction and because writing to the redo log + is an atomic operation (performed by mtr_commit()). 
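+	(The transient inconsistency comes from btr_set_min_rec_mark()
+	below updating only the uncompressed frame of merge_page; the
+	compressed copy is brought back in sync within this same
+	mini-transaction.)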
*/ + btr_set_min_rec_mark(node_ptr, mtr); + } + + btr_node_ptr_delete(index, block, mtr); + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, index, mtr); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block); + ut_a(!merge_page_zip + || page_zip_validate(merge_page_zip, merge_page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + + btr_blob_dbg_remove(page, index, "btr_discard_page"); + + /* Free the file page */ + btr_page_free(index, block, mtr); + + ut_ad(btr_check_node_ptr(index, merge_block, mtr)); +} + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. */ +UNIV_INTERN +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!dict_index_is_univ(index)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/************************************************************//** +Prints recursively index tree pages. */ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + ulint width, /*!< in: print this many entries from start + and end */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + ulint** offsets,/*!< in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n", + (ulong) btr_page_get_level(page, mtr), + (ulong) buf_block_get_page_no(block)); + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets(node_ptr, index, *offsets, + ULINT_UNDEFINED, heap); + btr_print_recursive(index, + btr_node_ptr_get_child(node_ptr, + index, + *offsets, + &mtr2), + width, heap, offsets, &mtr2); + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/**************************************************************//** +Prints directories and other info of all nodes in the tree. 
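+Only compiled in when UNIV_BTR_PRINT is defined. Output goes to
+stderr: btr_print_recursive() prints up to 'width' records from each
+end of every page before descending into the children, and the tree
+is validated afterwards with btr_validate_index().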
*/ +UNIV_INTERN +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, RW_X_LATCH, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (heap) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + btr_validate_index(index, 0); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. +@return TRUE */ +UNIV_INTERN +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + ulint* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + if (dict_index_get_page(index) == buf_block_get_page_no(block)) { + + return(TRUE); + } + + heap = mem_heap_create(256); + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &cursor); + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page, mtr)); + + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets)); +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /*!< in: index page */ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index) /*!< in: index */ +{ + fputs("InnoDB: Record in ", stderr); + dict_index_name_print(stderr, NULL, index); + fprintf(stderr, ", page %lu, at offset %lu\n", + page_get_page_no(page), (ulint) page_offset(rec)); +} + +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. 
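+Three properties are verified: that the compact-format flag of the
+page matches the table, that the field count matches the index
+definition (old-style records only), and that every field length is
+legal. The length rule checked per field is, in outline:
+
+	prefix_len == 0: len == fixed_size whenever the column has one
+	prefix_len > 0:  len <= prefix_len
+
+(SQL NULL values are exempt from both checks.)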
+@return TRUE if ok */ +UNIV_INTERN +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + ulint n; + ulint i; + const page_t* page; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + if (dict_index_is_univ(index)) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + + if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) { + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n", + (ulong) !!page_is_comp(page), + (ulong) dict_table_is_comp(index->table)); + + return(FALSE); + } + + n = dict_index_get_n_fields(index); + + if (!page_is_comp(page) && rec_get_n_fields_old(rec) != n) { + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n", + (ulong) rec_get_n_fields_old(rec), (ulong) n); + + if (dump_on_error) { + buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); + + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + for (i = 0; i < n; i++) { + ulint fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(index, i), page_is_comp(page)); + + rec_get_nth_field_offs(offsets, i, &len); + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + A prefix index of the column is of fixed, but different + length. When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if ((dict_index_get_nth_field(index, i)->prefix_len == 0 + && len != UNIV_SQL_NULL && fixed_size + && len != fixed_size) + || (dict_index_get_nth_field(index, i)->prefix_len > 0 + && len != UNIV_SQL_NULL + && len + > dict_index_get_nth_field(index, i)->prefix_len)) { + + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, + "InnoDB: field %lu len is %lu," + " should be %lu\n", + (ulong) i, (ulong) len, (ulong) fixed_size); + + if (dump_on_error) { + buf_page_print(page, 0, + BUF_PAGE_PRINT_NO_CRASH); + + fputs("InnoDB: corrupt record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + if (heap) { + mem_heap_free(heap); + } + return(FALSE); + } + } + + if (heap) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************//** +Checks the size and number of fields in records based on the definition of +the index. +@return TRUE if ok */ +static +ibool +btr_index_page_validate( +/*====================*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index) /*!< in: index */ +{ + page_cur_t cur; + ibool ret = TRUE; +#ifndef DBUG_OFF + ulint nth = 1; +#endif /* !DBUG_OFF */ + + page_cur_set_before_first(block, &cur); + + /* Directory slot 0 should only contain the infimum record. 
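+	The DBUG_EXECUTE_IF block below asserts exactly that: slot 0
+	owns a single record, and that record is the infimum on which
+	the cursor was just positioned.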
*/ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(page_rec_get_nth_const( + page_cur_get_page(&cur), 0) + == cur.rec); + ut_a(page_dir_slot_get_n_owned( + page_dir_get_nth_slot( + page_cur_get_page(&cur), 0)) + == 1);); + + page_cur_move_to_next(&cur); + + for (;;) { + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + + return(FALSE); + } + + /* Verify that page_rec_get_nth_const() is correctly + retrieving each record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(cur.rec == page_rec_get_nth_const( + page_cur_get_page(&cur), + page_rec_get_n_recs_before( + cur.rec))); + ut_a(nth++ == page_rec_get_n_recs_before( + cur.rec));); + + page_cur_move_to_next(&cur); + } + + return(ret); +} + +/************************************************************//** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block) /*!< in: index page */ +{ + fprintf(stderr, "InnoDB: Error in page %lu of ", + buf_block_get_page_no(block)); + dict_index_name_print(stderr, NULL, index); + if (level) { + fprintf(stderr, ", index tree level %lu", level); + } + putc('\n', stderr); +} + +/************************************************************//** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + const dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block1, /*!< in: first index page */ + const buf_block_t* block2) /*!< in: second index page */ +{ + fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ", + buf_block_get_page_no(block1), + buf_block_get_page_no(block2)); + dict_index_name_print(stderr, NULL, index); + if (level) { + fprintf(stderr, ", index tree level %lu", level); + } + putc('\n', stderr); +} + +/************************************************************//** +Validates index tree level. 
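+The function descends from the root to the requested level and then
+walks the level left to right through the FIL_PAGE_NEXT links,
+checking for each page that it is not marked free in its file
+segment, that it carries the expected index id, that page_validate()
+passes, that records stay in order across sibling boundaries, and
+that the father node pointers agree with the page contents.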
+@return TRUE if ok */ +static +bool +btr_validate_level( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + const trx_t* trx, /*!< in: transaction or NULL */ + ulint level) /*!< in: level number */ +{ + ulint space; + ulint space_flags; + ulint zip_size; + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + ulint right_page_no; + ulint left_page_no; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + bool ret = true; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + fseg_header_t* seg; + ulint* offsets = NULL; + ulint* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + + mtr_start(&mtr); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + block = btr_root_block_get(index, RW_X_LATCH, &mtr); + page = buf_block_get_frame(block); + seg = page + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + fil_space_get_latch(space, &space_flags); + + if (zip_size != dict_tf_get_zip_size(space_flags)) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Flags mismatch: table=%lu, tablespace=%lu", + (ulint) index->table->flags, (ulint) space_flags); + + mtr_commit(&mtr); + + return(false); + } + + while (level != btr_page_get_level(page, &mtr)) { + const rec_t* node_ptr; + + if (fseg_page_is_free(seg, + block->page.space, block->page.offset)) { + + btr_validate_report1(index, level, block); + + ib_logf(IB_LOG_LEVEL_WARN, "page is free"); + + ret = false; + } + + ut_a(space == buf_block_get_space(block)); + ut_a(space == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(!page_is_leaf(page)); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + node_ptr = page_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr); + page = buf_block_get_frame(block); + } + + /* Now we are on the desired level. Loop through the pages on that + level. */ + + if (level == 0) { + /* Leaf pages are managed in their own file segment. */ + seg -= PAGE_BTR_SEG_TOP - PAGE_BTR_SEG_LEAF; + } + +loop: + mem_heap_empty(heap); + offsets = offsets2 = NULL; + mtr_x_lock(dict_index_get_lock(index), &mtr); + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_a(block->page.space == space); + + if (fseg_page_is_free(seg, block->page.space, block->page.offset)) { + + btr_validate_report1(index, level, block); + + ib_logf(IB_LOG_LEVEL_WARN, "Page is marked as free"); + ret = false; + + } else if (btr_page_get_index_id(page) != index->id) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Page index id " IB_ID_FMT " != data dictionary " + "index id " IB_ID_FMT, + btr_page_get_index_id(page), index->id); + + ret = false; + + } else if (!page_validate(page, index)) { + + btr_validate_report1(index, level, block); + ret = false; + + } else if (level == 0 && !btr_index_page_validate(block, index)) { + + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. 
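+		(Delegated to btr_index_page_validate(), which runs
+		btr_index_rec_validate() on every user record of the
+		page.)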
*/ + + ret = false; + } + + ut_a(btr_page_get_level(page, &mtr) == level); + + right_page_no = btr_page_get_next(page, &mtr); + left_page_no = btr_page_get_prev(page, &mtr); + + ut_a(!page_is_empty(page) + || (level == 0 + && page_get_page_no(page) == dict_index_get_page(index))); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + right_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, index, &mtr); + right_page = buf_block_get_frame(right_block); + if (btr_page_get_prev(right_page, &mtr) + != page_get_page_no(page)) { + + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH); + + ret = false; + } + + if (page_is_comp(right_page) != page_is_comp(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: 'compact' flag mismatch\n", stderr); + buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH); + + ret = false; + + goto node_ptr_fails; + } + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + offsets = rec_get_offsets(rec, index, + offsets, ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, + offsets2, ULINT_UNDEFINED, &heap); + if (cmp_rec_rec(rec, right_rec, offsets, offsets2, + index) >= 0) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH); + + fputs("InnoDB: record ", stderr); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + + ret = false; + } + } + + if (level > 0 && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } + + if (buf_block_get_page_no(block) != dict_index_get_page(index)) { + + /* Check father node pointers */ + + rec_t* node_ptr; + + offsets = btr_page_get_father_block(offsets, heap, index, + block, &mtr, &node_cur); + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + btr_cur_position( + index, page_rec_get_prev(page_get_supremum_rec(page)), + block, &node_cur); + offsets = btr_page_get_father_node_ptr(offsets, heap, + &node_cur, &mtr); + + if (node_ptr != btr_cur_get_rec(&node_cur) + || btr_node_ptr_get_child_page_no(node_ptr, offsets) + != buf_block_get_page_no(block)) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + buf_page_print(father_page, 0, BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %lu\n", + (ulong) btr_node_ptr_get_child_page_no( + rec, offsets)); + + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + ret = false; + + goto node_ptr_fails; + } + + if (!page_is_leaf(page)) { + node_ptr_tuple = 
dict_index_build_node_ptr( + index, + page_rec_get_next(page_get_infimum_rec(page)), + 0, heap, btr_page_get_level(page, &mtr)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + const rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + btr_validate_report1(index, level, block); + + buf_page_print(father_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(page, 0, + BUF_PAGE_PRINT_NO_CRASH); + + fputs("InnoDB: Error: node ptrs differ" + " on levels > 0\n" + "InnoDB: node ptr ", stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + ret = false; + + goto node_ptr_fails; + } + } + + if (left_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_next( + page_get_infimum_rec(father_page))); + ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL); + } + + if (right_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_prev( + page_get_supremum_rec(father_page))); + ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL); + } else { + const rec_t* right_node_ptr + = page_rec_get_next(node_ptr); + + offsets = btr_page_get_father_block( + offsets, heap, index, right_block, + &mtr, &right_node_cur); + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { + ret = false; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print( + father_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + right_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + ret = false; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print( + father_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + right_father_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + right_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page, &mtr)) { + + ret = false; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print( + father_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + right_father_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print( + right_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + } + } + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr_commit(&mtr); + + if (trx_is_interrupted(trx)) { + /* On interrupt, return the current status. */ + } else if (right_page_no != FIL_NULL) { + + mtr_start(&mtr); + + block = btr_block_get( + space, zip_size, right_page_no, + RW_X_LATCH, index, &mtr); + + page = buf_block_get_frame(block); + + goto loop; + } + + mem_heap_free(heap); + + return(ret); +} + +/**************************************************************//** +Checks the consistency of an index tree. 
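+The tree is checked one level at a time by btr_validate_level(),
+starting at the root level and working down to the leaf level 0; the
+scan stops at the first failing level or when the given transaction
+is interrupted.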
+@return TRUE if ok */ +UNIV_INTERN +bool +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or NULL */ +{ + /* Full Text index are implemented by auxiliary tables, + not the B-tree */ + if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) { + return(true); + } + + mtr_t mtr; + + mtr_start(&mtr); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + bool ok = true; + page_t* root = btr_root_get(index, &mtr); + + SRV_CORRUPT_TABLE_CHECK(root, + { + mtr_commit(&mtr); + return(FALSE); + }); + + ulint n = btr_page_get_level(root, &mtr); + + for (ulint i = 0; i <= n; ++i) { + + if (!btr_validate_level(index, trx, n - i)) { + ok = false; + break; + } + } + + mtr_commit(&mtr); + + return(ok); +} + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return TRUE if possible to merge. */ +UNIV_INTERN +ibool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + ulint page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + dict_index_t* index; + page_t* page; + ulint space; + ulint zip_size; + ulint n_recs; + ulint data_size; + ulint max_ins_size_reorg; + ulint max_ins_size; + buf_block_t* mblock; + page_t* mpage; + DBUG_ENTER("btr_can_merge_with_page"); + + if (page_no == FIL_NULL) { + goto error; + } + + index = btr_cur_get_index(cursor); + page = btr_cur_get_page(cursor); + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + mblock = btr_block_get(space, zip_size, page_no, RW_X_LATCH, index, + mtr); + mpage = buf_block_get_frame(mblock); + + n_recs = page_get_n_recs(page); + data_size = page_get_data_size(page); + + max_ins_size_reorg = page_get_max_insert_size_after_reorganize( + mpage, n_recs); + + if (data_size > max_ins_size_reorg) { + goto error; + } + + /* If compression padding tells us that merging will result in + too packed up page i.e.: which is likely to cause compression + failure then don't merge the pages. */ + if (zip_size && page_is_leaf(mpage) + && (page_get_data_size(mpage) + data_size + >= dict_index_zip_pad_optimal_page_size(index))) { + + goto error; + } + + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + if (data_size > max_ins_size) { + + /* We have to reorganize mpage */ + + if (!btr_page_reorganize_block( + false, page_zip_level, mblock, index, mtr)) { + + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + ut_ad(page_validate(mpage, index)); + ut_ad(max_ins_size == max_ins_size_reorg); + + if (data_size > max_ins_size) { + + /* Add fault tolerance, though this should + never happen */ + + goto error; + } + } + + *merge_block = mblock; + DBUG_RETURN(TRUE); + +error: + *merge_block = NULL; + DBUG_RETURN(FALSE); +} + +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/btr/btr0cur.cc b/storage/xtradb/btr/btr0cur.cc new file mode 100644 index 00000000000..8675acadb19 --- /dev/null +++ b/storage/xtradb/btr/btr0cur.cc @@ -0,0 +1,5851 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. 
+ +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0cur.cc +The index tree cursor + +All changes that row operations make to a B-tree or the records +there must go through this module! Undo log records are written here +of every modify or insert of a clustered index record. + + NOTE!!! +To make sure we do not run out of disk space during a pessimistic +insert or update, we have to reserve 2 x the height of the index tree +many pages in the tablespace before we start the operation, because +if leaf splitting has been started, it is difficult to undo, except +by crashing the database and doing a roll-forward. + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" + +#ifdef UNIV_NONINL +#include "btr0cur.ic" +#endif + +#include "row0upd.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0log.h" +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "row0log.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "trx0roll.h" /* trx_is_recv() */ +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" + +/** Buffered B-tree operation types, introduced as part of delete buffering. */ +enum btr_op_t { + BTR_NO_OP = 0, /*!< Not buffered */ + BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */ + BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */ + BTR_DELETE_OP, /*!< Purge a delete-marked record */ + BTR_DELMARK_OP /*!< Mark a record for deletion */ +}; + +#ifdef UNIV_DEBUG +/** If the following is set to TRUE, this module prints a lot of +trace information of individual record operations */ +UNIV_INTERN ibool btr_cur_print_record_ops = FALSE; +#endif /* UNIV_DEBUG */ + +/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ +UNIV_INTERN ulint btr_cur_n_non_sea = 0; +/** Number of successful adaptive hash index lookups in +btr_cur_search_to_nth_level(). */ +UNIV_INTERN ulint btr_cur_n_sea = 0; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +UNIV_INTERN ulint btr_cur_n_non_sea_old = 0; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). 
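+Keeping the _old copies allows the monitor output to report
+per-interval deltas (current counter minus saved copy) rather than
+raw totals accumulated since server startup.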
*/ +UNIV_INTERN ulint btr_cur_n_sea_old = 0; + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +UNIV_INTERN uint btr_cur_limit_optimistic_insert_debug = 0; +#endif /* UNIV_DEBUG */ + +/** In the optimistic insert, if the insert does not fit, but this much space +can be released by page reorganize, then it is reorganized */ +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32) + +/** The structure of a BLOB part header */ +/* @{ */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB + part header, in bytes */ + +/** Estimated table level stats from sampled value. +@param value sampled stats +@param index index being sampled +@param sample number of sampled rows +@param ext_size external stored data size +@param not_empty table not empty +@return estimated table wide stats from sampled value */ +#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\ + (((value) * (ib_int64_t) index->stat_n_leaf_pages \ + + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size))) + +/* @} */ +#endif /* !UNIV_HOTBACKUP */ + +/** A BLOB field reference full of zero, for use in assertions and tests. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE] = { + 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, +}; + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr); /*!< in: mtr, or NULL if not logged */ +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height); /*!< in: root node height in tree */ +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. 
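+The typical caller context is the rollback of such an update (note
+the trx_rb_ctx argument): only the externally stored fields named in
+'update' are freed, so fields the update did not touch keep their
+storage.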
*/ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in: record */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/***********************************************************//** +Frees the externally stored fields for a record. */ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************//** +The following function is used to set the deleted bit of a record. */ +UNIV_INLINE +void +btr_rec_set_deleted_flag( +/*=====================*/ + rec_t* rec, /*!< in/out: physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page (or NULL) */ + ulint flag) /*!< in: nonzero if delete marked */ +{ + if (page_rec_is_comp(rec)) { + rec_set_deleted_flag_new(rec, page_zip, flag); + } else { + ut_ad(!page_zip); + rec_set_deleted_flag_old(rec, flag); + } +} + +#ifndef UNIV_HOTBACKUP +/*==================== B-TREE SEARCH =========================*/ + +/********************************************************************//** +Latches the leaf page or pages requested. */ +static +void +btr_cur_latch_leaves( +/*=================*/ + page_t* page, /*!< in: leaf page where the search + converged */ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the leaf */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint mode; + ulint sibling_mode; + ulint left_page_no; + ulint right_page_no; + buf_block_t* get_block; + + ut_ad(page && mtr); + + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + mode = latch_mode == BTR_SEARCH_LEAF ? 
RW_S_LATCH : RW_X_LATCH; + get_block = btr_block_get( + space, zip_size, page_no, mode, cursor->index, mtr); + + SRV_CORRUPT_TABLE_CHECK(get_block, return;); + +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + return; + case BTR_SEARCH_TREE: + case BTR_MODIFY_TREE: + if (UNIV_UNLIKELY(latch_mode == BTR_SEARCH_TREE)) { + mode = RW_S_LATCH; + sibling_mode = RW_NO_LATCH; + } else { + mode = sibling_mode = RW_X_LATCH; + } + /* Fetch and possibly latch also brothers from left to right */ + left_page_no = btr_page_get_prev(page, mtr); + + if (left_page_no != FIL_NULL) { + get_block = btr_block_get( + space, zip_size, left_page_no, + sibling_mode, cursor->index, mtr); + + SRV_CORRUPT_TABLE_CHECK(get_block, return;); + +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(page)); + ut_a(btr_page_get_next(get_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + if (sibling_mode == RW_NO_LATCH) { + /* btr_block_get() called with RW_NO_LATCH will + fix the read block in the buffer. This serves + no purpose for the fake changes prefetching, + thus we unfix the sibling blocks immediately.*/ + mtr_memo_release(mtr, get_block, + MTR_MEMO_BUF_FIX); + } else { + get_block->check_index_page_at_flush = TRUE; + } + } + + get_block = btr_block_get( + space, zip_size, page_no, + mode, cursor->index, mtr); + + SRV_CORRUPT_TABLE_CHECK(get_block, return;); + +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + + right_page_no = btr_page_get_next(page, mtr); + + if (right_page_no != FIL_NULL) { + get_block = btr_block_get( + space, zip_size, right_page_no, + sibling_mode, cursor->index, mtr); + + SRV_CORRUPT_TABLE_CHECK(get_block, return;); + +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(page)); + ut_a(btr_page_get_prev(get_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + if (sibling_mode == RW_NO_LATCH) { + mtr_memo_release(mtr, get_block, + MTR_MEMO_BUF_FIX); + } else { + get_block->check_index_page_at_flush = TRUE; + } + } + + return; + + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH; + /* latch also left brother */ + left_page_no = btr_page_get_prev(page, mtr); + + if (left_page_no != FIL_NULL) { + get_block = btr_block_get( + space, zip_size, + left_page_no, mode, cursor->index, mtr); + cursor->left_block = get_block; + + SRV_CORRUPT_TABLE_CHECK(get_block, return;); + +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(page)); + ut_a(btr_page_get_next(get_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + } + + get_block = btr_block_get( + space, zip_size, page_no, mode, cursor->index, mtr); + + SRV_CORRUPT_TABLE_CHECK(get_block, return;); + +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + get_block->check_index_page_at_flush = TRUE; + return; + } + + ut_error; +} + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! 
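+(Node pointer records on the non-leaf levels carry the child page
+number as an extra trailing field; n_fields_cmp must therefore be
+limited, e.g. with dtuple_set_n_fields_cmp(), so that this field
+never takes part in the comparison.)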
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value.
+
+If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
+the search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ Inserts should always be made using
+ PAGE_CUR_LE to search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ at most one of BTR_INSERT, BTR_DELETE_MARK,
+ BTR_DELETE, or BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+ NOTE that if has_search_latch
+ is != 0, we may not have a latch set
+ on the cursor page; we assume
+ the caller uses its search latch
+ to protect the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+ ulint has_search_latch,/*!< in: info on the latch mode the
+ caller currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+ buf_block_t* block;
+ ulint space;
+ buf_block_t* guess;
+ ulint height;
+ ulint page_no;
+ ulint up_match;
+ ulint up_bytes;
+ ulint low_match;
+ ulint low_bytes;
+ ulint savepoint;
+ ulint rw_latch;
+ ulint page_mode;
+ ulint buf_mode;
+ ulint estimate;
+ ulint zip_size;
+ page_cur_t* page_cursor;
+ btr_op_t btr_op;
+ ulint root_height = 0; /* remove warning */
+
+#ifdef BTR_CUR_ADAPT
+ btr_search_t* info;
+#endif
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+ /* Currently, PAGE_CUR_LE is the only search mode used for searches
+ ending on the upper levels */
+
+ ut_ad(level == 0 || mode == PAGE_CUR_LE);
+ ut_ad(dict_index_check_search_tuple(index, tuple));
+ ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(index->page != FIL_NULL);
+
+ UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
+ UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
+ UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
+ UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
+#ifdef UNIV_DEBUG
+ cursor->up_match = ULINT_UNDEFINED;
+ cursor->low_match = ULINT_UNDEFINED;
+#endif
+
+ ibool s_latch_by_caller;
+
+ s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
+
+ ut_ad(!s_latch_by_caller
+ || mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_S_LOCK));
+
+ /* These flags are mutually exclusive, they are lumped together
+ with the latch mode for historical reasons. It's possible for
+ none of the flags to be set.
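+	In outline, the switch below decodes them as:
+
+	0               -> BTR_NO_OP
+	BTR_INSERT      -> BTR_INSERT_OP, or BTR_INSERT_IGNORE_UNIQUE_OP
+	                   when BTR_IGNORE_SEC_UNIQUE is also set
+	BTR_DELETE      -> BTR_DELETE_OP (requires cursor->purge_node)
+	BTR_DELETE_MARK -> BTR_DELMARK_OP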
*/ + switch (UNIV_EXPECT(latch_mode + & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK), + 0)) { + case 0: + btr_op = BTR_NO_OP; + break; + case BTR_INSERT: + btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE) + ? BTR_INSERT_IGNORE_UNIQUE_OP + : BTR_INSERT_OP; + break; + case BTR_DELETE: + btr_op = BTR_DELETE_OP; + ut_a(cursor->purge_node); + break; + case BTR_DELETE_MARK: + btr_op = BTR_DELMARK_OP; + break; + default: + /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK + should be specified at a time */ + ut_error; + } + + /* Operations on the insert buffer tree cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index)); + /* Operations on the clustered index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index)); + + estimate = latch_mode & BTR_ESTIMATE; + + /* Turn the flags unrelated to the latch mode off. */ + latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!s_latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_MODIFY_LEAF); + + cursor->flag = BTR_CUR_BINARY; + cursor->index = index; + +#ifndef BTR_CUR_ADAPT + guess = NULL; +#else + info = btr_search_get_info(index); + + guess = info->root_guess; + +#ifdef BTR_CUR_HASH_ADAPT + +# ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +# endif + if (rw_lock_get_writer(btr_search_get_latch(cursor->index)) == + RW_LOCK_NOT_LOCKED + && latch_mode <= BTR_MODIFY_LEAF + && info->last_hash_succ + && !estimate +# ifdef PAGE_CUR_LE_OR_EXTENDS + && mode != PAGE_CUR_LE_OR_EXTENDS +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + /* If !has_search_latch, we do a dirty read of + btr_search_enabled below, and btr_search_guess_on_hash() + will have to check it again. */ + && UNIV_LIKELY(btr_search_enabled) + && btr_search_guess_on_hash(index, info, tuple, mode, + latch_mode, cursor, + has_search_latch, mtr)) { + + /* Search using the hash index succeeded */ + + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + btr_cur_n_sea++; + + return; + } +# endif /* BTR_CUR_HASH_ADAPT */ +#endif /* BTR_CUR_ADAPT */ + btr_cur_n_non_sea++; + + /* If the hash search did not succeed, do binary search down the + tree */ + + if (has_search_latch) { + /* Release possible search latch to obey latching order */ + rw_lock_s_unlock(btr_search_get_latch(cursor->index)); + } + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + savepoint = mtr_set_savepoint(mtr); + + switch (latch_mode) { + case BTR_MODIFY_TREE: + mtr_x_lock(dict_index_get_lock(index), mtr); + break; + case BTR_CONT_MODIFY_TREE: + /* Do nothing */ + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + break; + default: + if (!s_latch_by_caller) { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + } + + page_cursor = btr_cur_get_page_cur(cursor); + + space = dict_index_get_space(index); + page_no = dict_index_get_page(index); + + up_match = 0; + up_bytes = 0; + low_match = 0; + low_bytes = 0; + + height = ULINT_UNDEFINED; + + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. 
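+
+	For example, PAGE_CUR_GE becomes PAGE_CUR_L on the internal levels:
+	the cursor then stops on the last node pointer whose key is strictly
+	less than the tuple, so the descent cannot skip a left sibling that
+	may still hold records equal to the tuple. The original mode is
+	restored below once height == 0.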
*/ + + switch (mode) { + case PAGE_CUR_GE: + page_mode = PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode = PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode = mode; + break; + } + + /* Loop and search until we arrive at the desired level */ + +search_loop: + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + + if (height != 0) { + /* We are about to fetch the root or a non-leaf page. */ + } else if (latch_mode <= BTR_MODIFY_LEAF) { + rw_latch = latch_mode; + + if (btr_op != BTR_NO_OP + && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) { + + /* Try to buffer the operation if the leaf + page is not in the buffer pool. */ + + buf_mode = btr_op == BTR_DELETE_OP + ? BUF_GET_IF_IN_POOL_OR_WATCH + : BUF_GET_IF_IN_POOL; + } + } + + zip_size = dict_table_zip_size(index->table); + +retry_page_get: + block = buf_page_get_gen( + space, zip_size, page_no, rw_latch, guess, buf_mode, + file, line, mtr); + + if (block == NULL) { + SRV_CORRUPT_TABLE_CHECK(buf_mode == BUF_GET_IF_IN_POOL || + buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH, + { + page_cursor->block = 0; + page_cursor->rec = 0; + if (estimate) { + + cursor->path_arr->nth_rec = + ULINT_UNDEFINED; + } + + goto func_exit; + }); + + /* This must be a search to perform an insert/delete + mark/ delete; try using the insert/delete buffer */ + + ut_ad(height == 0); + ut_ad(cursor->thr); + + switch (btr_op) { + case BTR_INSERT_OP: + case BTR_INSERT_IGNORE_UNIQUE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_INSERT, tuple, index, + space, zip_size, page_no, + cursor->thr)) { + + cursor->flag = BTR_CUR_INSERT_TO_IBUF; + + goto func_exit; + } + break; + + case BTR_DELMARK_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, + index, space, zip_size, + page_no, cursor->thr)) { + + cursor->flag = BTR_CUR_DEL_MARK_IBUF; + + goto func_exit; + } + + break; + + case BTR_DELETE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); + + if (!row_purge_poss_sec(cursor->purge_node, + index, tuple)) { + + /* The record cannot be purged yet. */ + cursor->flag = BTR_CUR_DELETE_REF; + } else if (ibuf_insert(IBUF_OP_DELETE, tuple, + index, space, zip_size, + page_no, + cursor->thr)) { + + /* The purge was buffered. */ + cursor->flag = BTR_CUR_DELETE_IBUF; + } else { + /* The purge could not be buffered. */ + buf_pool_watch_unset(space, page_no); + break; + } + + buf_pool_watch_unset(space, page_no); + goto func_exit; + + default: + ut_error; + } + + /* Insert to the insert/delete buffer did not succeed, we + must read the page from disk. */ + + buf_mode = BUF_GET; + + goto retry_page_get; + } + + block->check_index_page_at_flush = TRUE; + page = buf_block_get_frame(block); + + SRV_CORRUPT_TABLE_CHECK(page, + { + page_cursor->block = 0; + page_cursor->rec = 0; + + if (estimate) { + + cursor->path_arr->nth_rec = ULINT_UNDEFINED; + } + + goto func_exit; + }); + + if (rw_latch != RW_NO_LATCH) { +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + buf_block_dbg_add_level( + block, dict_index_is_ibuf(index) + ? 
SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + } + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + root_height = height; + cursor->tree_height = root_height + 1; + +#ifdef BTR_CUR_ADAPT + if (block != guess) { + info->root_guess = block; + } +#endif + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + + btr_cur_latch_leaves( + page, space, zip_size, page_no, latch_mode, + cursor, mtr); + } + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + break; + default: + if (!s_latch_by_caller) { + /* Release the tree s-latch */ + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + } + + page_mode = mode; + } + + page_cur_search_with_match( + block, index, tuple, page_mode, &up_match, &up_bytes, + &low_match, &low_bytes, page_cursor); + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + /* If this is the desired level, leave the loop */ + + ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor), + mtr)); + + if (level != height) { + + const rec_t* node_ptr; + ut_ad(height > 0); + + height--; + guess = NULL; + + node_ptr = page_cur_get_rec(page_cursor); + + offsets = rec_get_offsets( + node_ptr, index, offsets, ULINT_UNDEFINED, &heap); + + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + + if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) { + /* We're doing a search on an ibuf tree and we're one + level above the leaf page. */ + + ut_ad(level == 0); + + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + goto retry_page_get; + } + + goto search_loop; + } + + if (level != 0) { + /* x-latch the page */ + buf_block_t* child_block = btr_block_get( + space, zip_size, page_no, RW_X_LATCH, index, mtr); + + page = buf_block_get_frame(child_block); + btr_assert_not_corrupted(child_block, index); + } else { + cursor->low_match = low_match; + cursor->low_bytes = low_bytes; + cursor->up_match = up_match; + cursor->up_bytes = up_bytes; + +#ifdef BTR_CUR_ADAPT + /* We do a dirty read of btr_search_enabled here. We + will properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a + page hash index, while holding btr_search_latch. */ + if (btr_search_enabled) { + btr_search_info_update(index, cursor); + } +#endif + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + } + +func_exit: + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (has_search_latch) { + + rw_lock_s_lock(btr_search_get_latch(cursor->index)); + } +} + +/*****************************************************************//** +Opens a cursor at either end of an index. */ +UNIV_INTERN +void +btr_cur_open_at_index_side_func( +/*============================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_cur_t* cursor, /*!< in/out: cursor */ + ulint level, /*!< in: level to search for + (0=leaf). 
*/ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + ulint page_no; + ulint space; + ulint zip_size; + ulint height; + ulint root_height = 0; /* remove warning */ + rec_t* node_ptr; + ulint estimate; + ulint savepoint; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + estimate = latch_mode & BTR_ESTIMATE; + latch_mode &= ~BTR_ESTIMATE; + + ut_ad(level != ULINT_UNDEFINED); + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + savepoint = mtr_set_savepoint(mtr); + + switch (latch_mode) { + case BTR_CONT_MODIFY_TREE: + break; + case BTR_MODIFY_TREE: + mtr_x_lock(dict_index_get_lock(index), mtr); + break; + case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED: + case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED: + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK)); + break; + default: + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + page_no = dict_index_get_page(index); + + height = ULINT_UNDEFINED; + + for (;;) { + buf_block_t* block; + page_t* page; + block = buf_page_get_gen(space, zip_size, page_no, + RW_NO_LATCH, NULL, BUF_GET, + file, line, mtr); + page = buf_block_get_frame(block); + + SRV_CORRUPT_TABLE_CHECK(page, + { + page_cursor->block = 0; + page_cursor->rec = 0; + + if (estimate) { + + cursor->path_arr->nth_rec = + ULINT_UNDEFINED; + } + /* Can't use break with the macro */ + goto exit_loop; + }); + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + ut_ad(index->id == btr_page_get_index_id(page)); + + block->check_index_page_at_flush = TRUE; + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + root_height = height; + ut_a(height >= level); + } else { + /* TODO: flag the index corrupted if this fails */ + ut_ad(height == btr_page_get_level(page, mtr)); + } + + if (height == level) { + btr_cur_latch_leaves( + page, space, zip_size, page_no, + latch_mode & ~BTR_ALREADY_S_LATCHED, + cursor, mtr); + + if (height == 0) { + /* In versions <= 3.23.52 we had + forgotten to release the tree latch + here. If in an index scan we had to + scan far to find a record visible to + the current transaction, that could + starve others waiting for the tree + latch. 
*/ + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED: + case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED: + break; + default: + /* Release the tree s-latch */ + + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + } + } + + if (from_left) { + page_cur_set_before_first(block, page_cursor); + } else { + page_cur_set_after_last(block, page_cursor); + } + + if (height == level) { + if (estimate) { + btr_cur_add_path_info(cursor, height, + root_height); + } + + break; + } + + ut_ad(height > 0); + + if (from_left) { + page_cur_move_to_next(page_cursor); + } else { + page_cur_move_to_prev(page_cursor); + } + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + +exit_loop: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INTERN +void +btr_cur_open_at_rnd_pos_func( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in/out: B-tree cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + ulint page_no; + ulint space; + ulint zip_size; + ulint height; + rec_t* node_ptr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + switch (latch_mode) { + case BTR_MODIFY_TREE: + mtr_x_lock(dict_index_get_lock(index), mtr); + break; + default: + ut_ad(latch_mode != BTR_CONT_MODIFY_TREE); + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + page_no = dict_index_get_page(index); + + height = ULINT_UNDEFINED; + + for (;;) { + buf_block_t* block; + page_t* page; + + block = buf_page_get_gen(space, zip_size, page_no, + RW_NO_LATCH, NULL, BUF_GET, + file, line, mtr); + page = buf_block_get_frame(block); + + SRV_CORRUPT_TABLE_CHECK(page, + { + page_cursor->block = 0; + page_cursor->rec = 0; + + goto exit_loop; + }); + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + } + + if (height == 0) { + btr_cur_latch_leaves(page, space, zip_size, page_no, + latch_mode, cursor, mtr); + } + + page_cur_open_on_rnd_user_rec(block, page_cursor); + + if (height == 0) { + + break; + } + + ut_ad(height > 0); + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + +exit_loop: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*==================== B-TREE INSERT =========================*/ + +/*************************************************************//** +Inserts a record if there is enough space, or 
if enough space can
+be freed by reorganizing. Differs from btr_cur_optimistic_insert because
+no heuristic is applied to whether it pays to use CPU time for
+reorganizing the page or not.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the inserted record if it succeeds, else NULL */
+static __attribute__((nonnull, warn_unused_result))
+rec_t*
+btr_cur_insert_if_possible(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
+				cursor stays valid */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
+				have been stored to tuple */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	page_cur_t*	page_cursor;
+	rec_t*		rec;
+
+	ut_ad(dtuple_check_typed(tuple));
+
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	/* Now, try the insert */
+	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+				    offsets, heap, n_ext, mtr);
+
+	/* If the record did not fit, reorganize.
+	For compressed pages, page_cur_tuple_insert()
+	attempted this already. */
+	if (!rec && !page_cur_get_page_zip(page_cursor)
+	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+		rec = page_cur_tuple_insert(
+			page_cursor, tuple, cursor->index,
+			offsets, heap, n_ext, mtr);
+	}
+
+	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+	return(rec);
+}
+
+/*************************************************************//**
+For an insert, checks the locks and does the undo logging if desired.
+@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,5,6))) +dberr_t +btr_cur_ins_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if + not zero, the parameters index and thr + should be specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert */ + dtuple_t* entry, /*!< in/out: entry to insert */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool* inherit)/*!< out: TRUE if the inserted new record maybe + should inherit LOCK_GAP type locks from the + successor record */ +{ + dict_index_t* index; + dberr_t err; + rec_t* rec; + roll_ptr_t roll_ptr; + + if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) { + /* skip LOCK, UNDO */ + return(DB_SUCCESS); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + + err = lock_rec_insert_check_and_lock(flags, rec, + btr_cur_get_block(cursor), + index, thr, mtr, inherit); + + if (err != DB_SUCCESS + || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) { + + return(err); + } + + err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP, + thr, index, entry, + NULL, 0, NULL, NULL, + &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + /* Now we can fill in the roll ptr field in entry */ + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + + row_upd_index_entry_sys_field(entry, index, + DATA_ROLL_PTR, roll_ptr); + } + + return(DB_SUCCESS); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Report information about a transaction. */ +static +void +btr_cur_trx_report( +/*===============*/ + trx_id_t trx_id, /*!< in: transaction id */ + const dict_index_t* index, /*!< in: index */ + const char* op) /*!< in: operation */ +{ + fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx_id); + fputs(op, stderr); + dict_index_name_print(stderr, NULL, index); + putc('\n', stderr); +} +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. 
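+
+A typical caller first tries this optimistic path and, if DB_FAIL is
+returned, repositions the cursor with BTR_MODIFY_TREE and retries with
+btr_cur_pessimistic_insert(). A rough sketch (not a verbatim call site;
+see e.g. row0ins.cc for the real pattern):
+
+	err = btr_cur_optimistic_insert(flags, cursor, &offsets, &heap,
+					entry, &rec, &big_rec, n_ext,
+					thr, &mtr);
+	if (err == DB_FAIL) {
+		err = btr_cur_pessimistic_insert(flags, cursor, &offsets,
+						 &heap, entry, &rec,
+						 &big_rec, n_ext, thr, &mtr);
+	}
+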
+@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +UNIV_INTERN +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + rec_t* dummy; + ibool leaf; + ibool reorg; + ibool inherit = TRUE; + ulint zip_size; + ulint rec_size; + dberr_t err; + + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + + SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION);); + + page = buf_block_get_frame(block); + index = cursor->index; + + ut_ad((thr && thr_get_trx(thr)->fake_changes) + || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(dtuple_check_typed(entry)); + + zip_size = buf_block_get_zip_size(block); +#ifdef UNIV_DEBUG_VALGRIND + if (zip_size) { + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr)->id, index, "insert "); + dtuple_print(stderr, entry); + } +#endif /* UNIV_DEBUG */ + + ut_ad((thr && thr_get_trx(thr)->fake_changes) + || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + leaf = page_is_leaf(page); + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), zip_size)) { + + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (zip_size) { + /* Estimate the free space of an empty compressed page. + Subtract one byte for the encoded heap_no in the + modification log. */ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, zip_size); + ulint n_uniq = dict_index_get_n_unique_in_tree(index); + + ut_ad(dict_table_is_comp(index->table)); + + if (free_space_zip == 0) { +too_big: + if (big_rec_vec) { + dtuple_convert_back_big_rec( + index, entry, big_rec_vec); + } + + return(DB_TOO_BIG_RECORD); + } + + /* Subtract one byte for the encoded heap_no in the + modification log. */ + free_space_zip--; + + /* There should be enough room for two node pointer + records on an empty non-leaf page. This prevents + infinite page splits. 
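+
+		Illustrative arithmetic: if free_space_zip were 2000 after
+		the one-byte adjustment above, each node pointer record
+		(REC_NODE_PTR_SIZE plus the unique prefix, minus the header
+		bytes not stored on a compressed page) could take at most
+		1000 bytes, so two of them are guaranteed to fit and every
+		split makes progress.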
*/ + + if (entry->n_fields >= n_uniq + && (REC_NODE_PTR_SIZE + + rec_get_converted_size_comp_prefix( + index, entry->fields, n_uniq, NULL) + /* On a compressed page, there is + a two-byte entry in the dense + page directory for every record. + But there is no record header. */ + - (REC_N_NEW_EXTRA_BYTES - 2) + > free_space_zip / 2)) { + goto too_big; + } + } + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), + goto fail); + + if (leaf && zip_size + && (page_get_data_size(page) + rec_size + >= dict_index_zip_pad_optimal_page_size(index))) { + /* If compression padding tells us that insertion will + result in too packed up page i.e.: which is likely to + cause compression failure then don't do an optimistic + insertion. */ +fail: + err = DB_FAIL; +fail_err: + + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(err); + } + + ulint max_size = page_get_max_insert_size_after_reorganize(page, 1); + + if (page_has_garbage(page)) { + if ((max_size < rec_size + || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT) + && page_get_n_recs(page) > 1 + && page_get_max_insert_size(page, 1) < rec_size) { + + goto fail; + } + } else if (max_size < rec_size) { + goto fail; + } + + /* If there have been many consecutive inserts to the + clustered index leaf page of an uncompressed table, check if + we have to split the page to reserve enough free space for + future updates of records. */ + + if (leaf && !zip_size && dict_index_is_clust(index) + && page_get_n_recs(page) >= 2 + && dict_index_get_space_reserve() + rec_size > max_size + && (btr_page_get_split_rec_to_right(cursor, &dummy) + || btr_page_get_split_rec_to_left(cursor, &dummy))) { + goto fail; + } + + /* Check locks and write to the undo log, if specified */ + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + goto fail_err; + } + + if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) { + /* skip CHANGE, LOG */ + *big_rec = big_rec_vec; + return(err); /* == DB_SUCCESS */ + } + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + + { + const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor); + *rec = page_cur_tuple_insert(page_cursor, entry, index, + offsets, heap, n_ext, mtr); + reorg = page_cursor_rec != page_cur_get_rec(page_cursor); + } + + if (*rec) { + } else if (zip_size) { + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. 
*/ + if (leaf && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(block); + } + + goto fail; + } else { + ut_ad(!reorg); + + /* If the record did not fit, reorganize */ + if (!btr_page_reorganize(page_cursor, index, mtr)) { + ut_ad(0); + goto fail; + } + + ut_ad(page_get_max_insert_size(page, 1) == max_size); + + reorg = TRUE; + + *rec = page_cur_tuple_insert(page_cursor, entry, index, + offsets, heap, n_ext, mtr); + + if (UNIV_UNLIKELY(!*rec)) { + fputs("InnoDB: Error: cannot insert tuple ", stderr); + dtuple_print(stderr, entry); + fputs(" into ", stderr); + dict_index_name_print(stderr, thr_get_trx(thr), index); + fprintf(stderr, "\nInnoDB: max insert size %lu\n", + (ulong) max_size); + ut_error; + } + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) { + btr_search_update_hash_node_on_insert(cursor); + } else { + btr_search_update_hash_on_insert(cursor); + } +#endif + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + + if (leaf && !dict_index_is_clust(index)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (zip_size) { + /* Update the bits in the same mini-transaction. */ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. 
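+
+Unless undo logging is switched off, the function first reserves free
+extents for a possible split: n_extents = tree_height / 16 + 3, so for
+example a cursor positioned in a tree of height 5 reserves 5 / 16 + 3 = 3
+extents before calling btr_page_split_and_insert().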
+@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index; + ulint zip_size = dict_table_zip_size(index->table); + big_rec_t* big_rec_vec = NULL; + dberr_t err; + ibool inherit = FALSE; + ibool success; + ulint n_reserved = 0; + + ut_ad(dtuple_check_typed(entry)); + + *big_rec = NULL; + + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, + dict_index_get_lock(btr_cur_get_index(cursor)), + MTR_MEMO_X_LOCK)); + ut_ad((thr && thr_get_trx(thr)->fake_changes) || mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + + cursor->flag = BTR_CUR_BINARY; + + /* Check locks and write to undo log, if specified */ + + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (err != DB_SUCCESS) { + + return(err); + } + + if (!(flags & BTR_NO_UNDO_LOG_FLAG)) { + + ut_a(cursor->tree_height != ULINT_UNDEFINED); + + /* First reserve enough free space for the file segments + of the index tree, so that the insert will not fail because + of lack of space */ + + ulint n_extents = cursor->tree_height / 16 + 3; + + success = fsp_reserve_free_extents(&n_reserved, index->space, + n_extents, FSP_NORMAL, mtr); + if (!success) { + return(DB_OUT_OF_FILE_SPACE); + } + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + dict_table_is_comp(index->table), + dtuple_get_n_fields(entry), + zip_size)) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. 
*/ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext); + + if (big_rec_vec == NULL) { + + if (n_reserved > 0) { + fil_space_release_free_extents(index->space, + n_reserved); + } + return(DB_TOO_BIG_RECORD); + } + } + + if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) { + /* skip CHANGE, LOG */ + if (n_reserved > 0) { + fil_space_release_free_extents(index->space, + n_reserved); + } + *big_rec = big_rec_vec; + return(DB_SUCCESS); + } + + if (dict_index_get_page(index) + == buf_block_get_page_no(btr_cur_get_block(cursor))) { + + /* The page is the root page */ + *rec = btr_root_raise_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); + } else { + *rec = btr_page_split_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec); + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + /* The cursor might be moved to the other page, + and the max trx id field should be updated after + the cursor was fixed. */ + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + thr_get_trx(thr)->id, mtr); + } + if (!page_rec_is_infimum(btr_cur_get_rec(cursor)) + || btr_page_get_prev( + buf_block_get_frame( + btr_cur_get_block(cursor)), mtr) + == FIL_NULL) { + /* split and inserted need to call + lock_update_insert() always. */ + inherit = TRUE; + } + } + +#ifdef BTR_CUR_ADAPT + btr_search_update_hash_on_insert(cursor); +#endif + if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + + if (n_reserved > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*==================== B-TREE UPDATE =========================*/ + +/*************************************************************//** +For an update, checks the locks and does the undo logging. 
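+
+For a secondary index record this reduces to the lock check via
+lock_sec_rec_modify_check_and_lock(); an undo log record is written only
+when a clustered index record is updated, as the code below shows.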
+@return DB_SUCCESS, DB_WAIT_LOCK, or error number */ +UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,6,7))) +dberr_t +btr_cur_upd_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on record to update */ + const ulint* offsets,/*!< in: rec_get_offsets() on cursor */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + roll_ptr_t* roll_ptr)/*!< out: roll pointer */ +{ + dict_index_t* index; + const rec_t* rec; + dberr_t err; + + ut_ad(thr || (flags & BTR_NO_LOCKING_FLAG)); + + if (UNIV_UNLIKELY(thr && thr_get_trx(thr)->fake_changes)) { + /* skip LOCK, UNDO */ + return(DB_SUCCESS); + } + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!dict_index_is_clust(index)) { + ut_ad(dict_index_is_online_ddl(index) + == !!(flags & BTR_CREATE_FLAG)); + + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr, mtr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + err = lock_clust_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, index, + offsets, thr); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Append the info about the update in the undo log */ + + return(trx_undo_report_row_operation( + flags, TRX_UNDO_MODIFY_OP, thr, + index, NULL, update, + cmpl_info, rec, offsets, roll_ptr)); +} + +/***********************************************************//** +Writes a redo log record of updating a record in-place. */ +UNIV_INTERN +void +btr_cur_update_in_place_log( +/*========================*/ + ulint flags, /*!< in: flags */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr, /*!< in: roll ptr */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + const page_t* page = page_align(rec); + ut_ad(flags < 256); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page) + ? MLOG_COMP_REC_UPDATE_IN_PLACE + : MLOG_REC_UPDATE_IN_PLACE, + 1 + DATA_ROLL_PTR_LEN + 14 + 2 + + MLOG_BUF_MARGIN); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } + + /* For secondary indexes, we could skip writing the dummy system fields + to the redo log but we have to change redo log parsing of + MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or we have to add + new redo log record. For now, just write dummy sys fields to the redo + log if we are updating a secondary index record. 
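+
+	The record body written below is, in outline:
+		flags (1 byte);
+		DB_TRX_ID position, DB_ROLL_PTR and DB_TRX_ID (real values
+		for a clustered index, zero dummies for a secondary index);
+		the page offset of rec (2 bytes);
+		the update vector (row_upd_index_write_log()).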
+ */ + mach_write_to_1(log_ptr, flags); + log_ptr++; + + if (dict_index_is_clust(index)) { + log_ptr = row_upd_write_sys_vals_to_log( + index, trx_id, roll_ptr, log_ptr, mtr); + } else { + /* Dummy system fields for a secondary index */ + /* TRX_ID Position */ + log_ptr += mach_write_compressed(log_ptr, 0); + /* ROLL_PTR */ + trx_write_roll_ptr(log_ptr, 0); + log_ptr += DATA_ROLL_PTR_LEN; + /* TRX_ID */ + log_ptr += mach_ull_write_compressed(log_ptr, 0); + } + + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + row_upd_index_write_log(update, log_ptr, mtr); +} +#endif /* UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of updating a record in-place. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_update_in_place( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index) /*!< in: index corresponding to page */ +{ + ulint flags; + rec_t* rec; + upd_t* update; + ulint pos; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint rec_offset; + mem_heap_t* heap; + ulint* offsets; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + flags = mach_read_from_1(ptr); + ptr++; + + ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + rec_offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(rec_offset <= UNIV_PAGE_SIZE); + + heap = mem_heap_create(256); + + ptr = row_upd_index_parse(ptr, end_ptr, heap, &update); + + if (!ptr || !page) { + + goto func_exit; + } + + ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); + rec = page + rec_offset; + + /* We do not need to reserve btr_search_latch, as the page is only + being recovered, and there cannot be a hash index to it. */ + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets, + pos, trx_id, roll_ptr); + } + + row_upd_rec_in_place(rec, index, offsets, update, page_zip); + +func_exit: + mem_heap_free(heap); + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). 
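+
+A caller that wants to modify a compressed page typically does, in outline
+(cf. btr_cur_update_in_place() below, which uses the
+btr_cur_update_alloc_zip() wrapper):
+
+	if (!btr_cur_update_alloc_zip(page_zip, page_cursor, index,
+				      offsets, rec_offs_size(offsets),
+				      false, mtr, trx)) {
+		return(DB_ZIP_OVERFLOW);
+	}
+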
*/ +UNIV_INTERN +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ + dict_index_t* index, /*!< in: the index corresponding to cursor */ +#ifdef UNIV_DEBUG + ulint* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + trx_t* trx) /*!< in: NULL or transaction */ +{ + const page_t* page = page_cur_get_page(cursor); + + ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(page_zip); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + + if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + return(false); + } + + if (create && page_is_leaf(page) + && (length + page_get_data_size(page) + >= dict_index_zip_pad_optimal_page_size(index))) { + return(false); + } + + if (UNIV_UNLIKELY(trx && trx->fake_changes)) { + /* Don't call page_zip_compress_write_log_no_data as that has + assert which would fail. Assume there won't be a compression + failure. */ + + return(true); + } + + if (!btr_page_reorganize(cursor, index, mtr)) { + goto out_of_space; + } + + rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets); + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the btr_page_reorganize() above did not reduce + the free space available on the page. */ + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + +out_of_space: + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) && page_is_leaf(page)) { + ibuf_reset_free_bits(page_cur_get_block(cursor)); + } + + return(false); +} + +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. 
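+
+For example, overwriting a fixed-length column (a 4-byte INT, say) with a
+new value leaves the record size unchanged and qualifies; an update that
+grows a VARCHAR field does not, and is routed through
+btr_cur_optimistic_update() or btr_cur_pessimistic_update() instead.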
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +UNIV_INTERN +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + dberr_t err; + rec_t* rec; + roll_ptr_t roll_ptr = 0; + ulint was_delete_marked; + ibool is_hashed; + trx_t* trx; + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_get_type(btr_cur_get_page(cursor)) == FIL_PAGE_INDEX); + ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops) { + btr_cur_trx_report(trx_id, index, "update "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + trx = thr_get_trx(thr); + + /* Check that enough space is available on the compressed page. */ + if (page_zip) { + if (!btr_cur_update_alloc_zip( + page_zip, btr_cur_get_page_cur(cursor), + index, offsets, rec_offs_size(offsets), + false, mtr, trx)) { + return(DB_ZIP_OVERFLOW); + } + + rec = btr_cur_get_rec(cursor); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + if (UNIV_UNLIKELY(trx->fake_changes)) { + /* skip CHANGE, LOG */ + return(err); /* == DB_SUCCESS */ + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields(rec, NULL, index, offsets, + thr_get_trx(thr), roll_ptr); + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + + is_hashed = (block->index != NULL); + + if (is_hashed) { + /* TO DO: Can we skip this if none of the fields + index->search_info->curr_n_fields + are being updated? 
*/ + + /* The function row_upd_changes_ord_field_binary works only + if the update vector was built for a clustered index, we must + NOT call it if index is secondary */ + + if (!dict_index_is_clust(index) + || row_upd_changes_ord_field_binary(index, update, thr, + NULL, NULL)) { + + /* Remove possible hash index pointer to this record */ + btr_search_update_hash_on_delete(cursor); + } + + rw_lock_x_lock(btr_search_get_latch(cursor->index)); + } + + row_upd_rec_in_place(rec, index, offsets, update, page_zip); + + if (is_hashed) { + rw_lock_x_unlock(btr_search_get_latch(cursor->index)); + } + + btr_cur_update_in_place_log(flags, rec, index, update, + trx_id, roll_ptr, mtr); + + if (was_delete_marked + && !rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block)))) { + /* The new updated record owns its possible externally + stored fields */ + + btr_cur_unmark_extern_fields(page_zip, + rec, index, offsets, mtr); + } + + ut_ad(err == DB_SUCCESS); + +func_exit: + if (page_zip + && !(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_zip(block, mtr); + } + + return(err); +} + +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. +@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +UNIV_INTERN +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + dberr_t err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + dtuple_t* new_entry; + roll_ptr_t roll_ptr; + ulint i; + ulint n_ext; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(thr_get_trx(thr)->fake_changes + || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* The insert buffer tree should never be updated in place. 
*/ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(btr_page_get_index_id(page) == index->id); + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, *offsets) + || trx_is_recv(thr_get_trx(thr))); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops) { + btr_cur_trx_report(trx_id, index, "update "); + rec_print_new(stderr, rec, *offsets); + } +#endif /* UNIV_DEBUG */ + + if (!row_upd_changes_field_size_or_external(index, *offsets, update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + return(btr_cur_update_in_place( + flags, cursor, *offsets, update, + cmpl_info, thr, trx_id, mtr)); + } + + if (rec_offs_any_extern(*offsets)) { +any_extern: + /* Externally stored fields are treated in pessimistic + update */ + + return(DB_OVERFLOW); + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + page_cursor = btr_cur_get_page_cur(cursor); + + if (!*heap) { + *heap = mem_heap_create( + rec_offs_size(*offsets) + + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets))); + } + + new_entry = row_rec_to_index_entry(rec, index, *offsets, + &n_ext, *heap); + /* We checked above that there are no externally stored fields. */ + ut_a(!n_ext); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + FALSE, *heap); + old_rec_size = rec_offs_size(*offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + if (!btr_cur_update_alloc_zip( + page_zip, page_cursor, index, *offsets, + new_rec_size, true, mtr, thr_get_trx(thr))) { + return(DB_ZIP_OVERFLOW); + } + + rec = page_cur_get_rec(page_cursor); + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT)) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* The page would become too empty */ + err = DB_UNDERFLOW; + goto func_exit; + } + + /* We do not attempt to reorganize if the page is compressed. + This is because the page may fail to compress after reorganization. */ + max_size = page_zip + ? 
page_get_max_insert_size(page, 1) + : (old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1)); + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto func_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + /* skip CHANGE, LOG */ + ut_ad(err == DB_SUCCESS); + return(DB_SUCCESS); + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + + lock_rec_store_on_page_infimum(block, rec); + + btr_search_update_hash_on_delete(cursor); + + page_cur_delete_rec(page_cursor, index, *offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, + roll_ptr); + row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, + trx_id); + } + + /* There are no externally stored columns in new_entry */ + rec = btr_cur_insert_if_possible( + cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr); + ut_a(rec); /* <- We calculated above the insert would fit */ + + /* Restore the old explicit lock state on the record */ + + lock_rec_restore_from_page_infimum(block, rec, block); + + page_cur_move_to_next(page_cursor); + ut_ad(err == DB_SUCCESS); + +func_exit: + if (page_zip + && !(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index) + && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_zip(block, mtr); + } + + return(err); +} + +/*************************************************************//** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. 
*/ +static +void +btr_cur_pess_upd_restore_supremum( +/*==============================*/ + buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: updated record */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + buf_block_t* prev_block; + ulint space; + ulint zip_size; + ulint prev_page_no; + + page = buf_block_get_frame(block); + + if (page_rec_get_next(page_get_infimum_rec(page)) != rec) { + /* Updated record is not the first user record on its page */ + + return; + } + + space = buf_block_get_space(block); + zip_size = buf_block_get_zip_size(block); + prev_page_no = btr_page_get_prev(page, mtr); + + ut_ad(prev_page_no != FIL_NULL); + prev_block = buf_page_get_with_no_latch(space, zip_size, + prev_page_no, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(prev_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + /* We must already have an x-latch on prev_block! */ + ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX)); + + lock_rec_reset_and_inherit_gap_locks(prev_block, block, + PAGE_HEAP_NO_SUPREMUM, + page_rec_get_heap_no(rec)); +} + +/*************************************************************//** +Check if the total length of the modified blob for the row is within 10% +of the total redo log size. This constraint on the blob length is to +avoid overwriting the redo logs beyond the last checkpoint lsn. +@return DB_SUCCESS or DB_TOO_BIG_RECORD. */ +static +dberr_t +btr_check_blob_limit(const big_rec_t* big_rec_vec) +{ + const ib_uint64_t redo_size = srv_n_log_files * srv_log_file_size + * UNIV_PAGE_SIZE; + const ulint redo_10p = redo_size / 10; + ulint total_blob_len = 0; + dberr_t err = DB_SUCCESS; + + /* Calculate the total number of bytes for blob data */ + for (ulint i = 0; i < big_rec_vec->n_fields; i++) { + total_blob_len += big_rec_vec->fields[i].len; + } + + if (total_blob_len > redo_10p) { + ib_logf(IB_LOG_LEVEL_ERROR, "The total blob data" + " length (" ULINTPF ") is greater than" + " 10%% of the total redo log size (" UINT64PF + "). Please increase total redo log size.", + total_blob_len, redo_size); + err = DB_TOO_BIG_RECORD; + } + + return(err); +} + +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. 
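+
+Internally this first retries btr_cur_optimistic_update() with
+BTR_KEEP_IBUF_BITMAP set, and falls through to the delete-and-reinsert
+path (with a possible page split) only when that attempt returns
+DB_OVERFLOW, DB_UNDERFLOW or DB_ZIP_OVERFLOW.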
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging, locking, and rollback
+				flags */
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
+				cursor may become invalid if *big_rec == NULL
+				|| !(flags & BTR_KEEP_POS_FLAG) */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
+	mem_heap_t*	entry_heap,
+				/*!< in/out: memory heap for allocating
+				big_rec and the index tuple */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is also allowed
+				to contain trx id and roll ptr fields, but
+				the values in the update vector have no
+				effect */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
+				committed before latching any further pages */
+{
+	big_rec_t*	big_rec_vec	= NULL;
+	big_rec_t*	dummy_big_rec;
+	dict_index_t*	index;
+	buf_block_t*	block;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	rec_t*		rec;
+	page_cur_t*	page_cursor;
+	dberr_t		err;
+	dberr_t		optim_err;
+	roll_ptr_t	roll_ptr;
+	ibool		was_first;
+	ulint		n_reserved	= 0;
+	ulint		n_ext;
+	trx_t*		trx;
+
+	*offsets = NULL;
+	*big_rec = NULL;
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	page_zip = buf_block_get_page_zip(block);
+	index = cursor->index;
+
+	ut_ad(thr_get_trx(thr)->fake_changes
+	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
+				   MTR_MEMO_X_LOCK));
+	ut_ad(thr_get_trx(thr)->fake_changes
+	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+	/* The insert buffer tree should never be updated in place. */
+	ut_ad(!dict_index_is_ibuf(index));
+	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+	      || dict_index_is_clust(index));
+	ut_ad(thr_get_trx(thr)->id == trx_id
+	      || (flags & ~BTR_KEEP_POS_FLAG)
+	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+
+	err = optim_err = btr_cur_optimistic_update(
+		flags | BTR_KEEP_IBUF_BITMAP,
+		cursor, offsets, offsets_heap, update,
+		cmpl_info, thr, trx_id, mtr);
+
+	switch (err) {
+	case DB_ZIP_OVERFLOW:
+	case DB_UNDERFLOW:
+	case DB_OVERFLOW:
+		break;
+	default:
+	err_exit:
+		/* We suppressed this with BTR_KEEP_IBUF_BITMAP.
+		For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
+		already reset by btr_cur_update_alloc_zip() if the
+		page was recompressed.
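+		That is why the ibuf_update_free_bits_zip() call below
+		is skipped when optim_err == DB_ZIP_OVERFLOW.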
*/
+		if (page_zip
+		    && optim_err != DB_ZIP_OVERFLOW
+		    && !dict_index_is_clust(index)
+		    && page_is_leaf(page)) {
+			ibuf_update_free_bits_zip(block, mtr);
+		}
+
+		return(err);
+	}
+
+	/* Do lock checking and undo logging */
+	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+					update, cmpl_info,
+					thr, mtr, &roll_ptr);
+	if (err != DB_SUCCESS) {
+		goto err_exit;
+	}
+
+	if (optim_err == DB_OVERFLOW) {
+		ulint	reserve_flag;
+		ulint	n_extents;
+
+		/* First reserve enough free space for the file segments
+		of the index tree, so that the update will not fail because
+		of lack of space */
+		if (UNIV_UNLIKELY(cursor->tree_height == ULINT_UNDEFINED)) {
+			/* When the tree height is uninitialized due to fake
+			changes, reserve some hardcoded number of extents. */
+			ut_a(thr_get_trx(thr)->fake_changes);
+			n_extents = 3;
+		} else {
+			n_extents = cursor->tree_height / 16 + 3;
+		}
+
+		if (flags & BTR_NO_UNDO_LOG_FLAG) {
+			reserve_flag = FSP_CLEANING;
+		} else {
+			reserve_flag = FSP_NORMAL;
+		}
+
+		if (!fsp_reserve_free_extents(&n_reserved, index->space,
+					      n_extents, reserve_flag, mtr)) {
+			err = DB_OUT_OF_FILE_SPACE;
+			goto err_exit;
+		}
+	}
+
+	rec = btr_cur_get_rec(cursor);
+
+	*offsets = rec_get_offsets(
+		rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
+
+	dtuple_t*	new_entry = row_rec_to_index_entry(
+		rec, index, *offsets, &n_ext, entry_heap);
+
+	/* The page containing the clustered index record
+	corresponding to new_entry is latched in mtr. If the
+	clustered index record is delete-marked, then its externally
+	stored fields cannot have been purged yet, because then the
+	purge would also have removed the clustered index record
+	itself. Thus the following call is safe. */
+	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+						     FALSE, entry_heap);
+
+	trx = thr_get_trx(thr);
+
+	if (!(flags & BTR_KEEP_SYS_FLAG) && UNIV_LIKELY(!trx->fake_changes)) {
+		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
+					      roll_ptr);
+		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
+					      trx_id);
+	}
+
+	if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(*offsets)) {
+		/* We are in a transaction rollback undoing a row
+		update: we must free possible externally stored fields
+		which got new values in the update, if they are not
+		inherited values. They can be inherited if we have
+		updated the primary key to another value, and then
+		update it back again. */
+
+		ut_ad(big_rec_vec == NULL);
+
+		/* fake_changes should not cause undo, so we should
+		never reach this point. */
+		ut_ad(!(trx->fake_changes));
+
+		btr_rec_free_updated_extern_fields(
+			index, rec, page_zip, *offsets, update,
+			trx_is_recv(thr_get_trx(thr)) ? 
RB_RECOVERY : RB_NORMAL, mtr); + } + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_validate(rec, index, *offsets)); + n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap); + + if (page_zip) { + ut_ad(page_is_comp(page)); + if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + TRUE, + dict_index_get_n_fields(index), + page_zip_get_size(page_zip))) { + + goto make_external; + } + } else if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(page), 0, 0)) { +make_external: + big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext); + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + /* We cannot goto return_after_reservations, + because we may need to update the + IBUF_BITMAP_FREE bits, which was suppressed by + BTR_KEEP_IBUF_BITMAP. */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + if (n_reserved > 0) { + fil_space_release_free_extents( + index->space, n_reserved); + } + + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + } + + if (UNIV_UNLIKELY(trx->fake_changes)) { + /* skip CHANGE, LOG */ + err = DB_SUCCESS; + goto return_after_reservations; + } + + if (big_rec_vec) { + + err = btr_check_blob_limit(big_rec_vec); + + if (err != DB_SUCCESS) { + if (n_reserved > 0) { + fil_space_release_free_extents( + index->space, n_reserved); + } + goto err_exit; + } + } + + /* Store state of explicit locks on rec on the page infimum record, + before deleting rec. The page infimum acts as a dummy carrier of the + locks, taking care also of lock releases, before we can move the locks + back on the actual record. There is a special case: if we are + inserting on the root page and the insert causes a call of + btr_root_raise_and_insert. Therefore we cannot in the lock system + delete the lock structs set on the root page even if the root + page carries just node pointers. */ + + lock_rec_store_on_page_infimum(block, rec); + + btr_search_update_hash_on_delete(cursor); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_delete_rec(page_cursor, index, *offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + rec = btr_cur_insert_if_possible(cursor, new_entry, + offsets, offsets_heap, n_ext, mtr); + + if (rec) { + page_cursor->rec = rec; + + lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), + rec, block); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* The new inserted record owns its possible externally + stored fields */ + btr_cur_unmark_extern_fields( + page_zip, rec, index, *offsets, mtr); + } + + bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + + if (btr_cur_compress_if_useful(cursor, adjust, mtr)) { + if (adjust) { + rec_offs_make_valid( + page_cursor->rec, index, *offsets); + } + } else if (page_zip && + !dict_index_is_clust(index) + && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. 
*/ + ibuf_update_free_bits_zip(block, mtr); + } + + err = DB_SUCCESS; + goto return_after_reservations; + } else { + /* If the page is compressed and it initially + compresses very well, and there is a subsequent insert + of a badly-compressing record, it is possible for + btr_cur_optimistic_update() to return DB_UNDERFLOW and + btr_cur_insert_if_possible() to return FALSE. */ + ut_a(page_zip || optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (!dict_index_is_clust(index) && page_is_leaf(page)) { + ibuf_reset_free_bits(block); + } + } + + if (big_rec_vec) { + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + mtr_x_lock(dict_index_get_lock(index), mtr); + } + + /* Was the record to be updated positioned as the first user + record on its page? */ + was_first = page_cur_is_before_first(page_cursor); + + /* Lock checks and undo logging were already performed by + btr_cur_upd_lock_and_undo(). We do not try + btr_cur_optimistic_insert() because + btr_cur_insert_if_possible() already failed above. */ + + err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG, + cursor, offsets, offsets_heap, + new_entry, &rec, + &dummy_big_rec, n_ext, NULL, mtr); + ut_a(rec); + ut_a(err == DB_SUCCESS); + ut_a(dummy_big_rec == NULL); + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + page_cursor->rec = rec; + + if (dict_index_is_sec_or_ibuf(index)) { + /* Update PAGE_MAX_TRX_ID in the index page header. + It was not updated by btr_cur_pessimistic_insert() + because of BTR_NO_LOCKING_FLAG. */ + buf_block_t* rec_block; + + rec_block = btr_cur_get_block(cursor); + + page_update_max_trx_id(rec_block, + buf_block_get_page_zip(rec_block), + trx_id, mtr); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* The new inserted record owns its possible externally + stored fields */ + buf_block_t* rec_block = btr_cur_get_block(cursor); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); + page = buf_block_get_frame(rec_block); +#endif /* UNIV_ZIP_DEBUG */ + page_zip = buf_block_get_page_zip(rec_block); + + btr_cur_unmark_extern_fields(page_zip, + rec, index, *offsets, mtr); + } + + lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), + rec, block); + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. */ + + if (!was_first) { + btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor), + rec, mtr); + } + +return_after_reservations: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (n_reserved > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + *big_rec = big_rec_vec; + + return(err); +} + +/*==================== B-TREE DELETE MARK AND UNMARK ===============*/ + +/****************************************************************//** +Writes the redo log record for delete marking or unmarking of an index +record. 
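+
+The record body written below has this layout (a reading of the code;
+sizes in bytes):
+
+	flags		1	here always 0
+	value		1	here always 1 (set the delete mark)
+	sys fields	var	field position, trx id and roll ptr,
+				written by row_upd_write_sys_vals_to_log()
+	rec offset	2	page offset of rec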
*/ +UNIV_INLINE +void +btr_cur_del_mark_set_clust_rec_log( +/*===============================*/ + rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index of the record */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? MLOG_COMP_REC_CLUST_DELETE_MARK + : MLOG_REC_CLUST_DELETE_MARK, + 1 + 1 + DATA_ROLL_PTR_LEN + + 14 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } + + *log_ptr++ = 0; + *log_ptr++ = 1; + + log_ptr = row_upd_write_sys_vals_to_log( + index, trx_id, roll_ptr, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + mlog_close(mtr, log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/****************************************************************//** +Parses the redo log record for delete marking or unmarking of a clustered +index record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_clust_rec( +/*=================================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index) /*!< in: index corresponding to page */ +{ + ulint flags; + ulint val; + ulint pos; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint offset; + rec_t* rec; + + ut_ad(!page + || !!page_is_comp(page) == dict_table_is_comp(index->table)); + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + flags = mach_read_from_1(ptr); + ptr++; + val = mach_read_from_1(ptr); + ptr++; + + ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (page) { + rec = page + offset; + + /* We do not need to reserve btr_search_latch, as the page + is only being recovered, and there cannot be a hash index to + it. Besides, these fields are being updated in place + and the adaptive hash index does not depend on them. */ + + btr_rec_set_deleted_flag(rec, page_zip, val); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + row_upd_rec_sys_fields_in_recovery( + rec, page_zip, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + pos, trx_id, roll_ptr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + } + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. 
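+The record is not physically removed here: a delete-marked record is
+removed later by purge, once no active transaction can any longer
+need its history.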
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + roll_ptr_t roll_ptr; + dberr_t err; + page_zip_des_t* page_zip; + trx_t* trx; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(buf_block_get_frame(block) == page_align(rec)); + ut_ad(page_is_leaf(page_align(rec))); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr)->id, index, "del mark "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + ut_ad(dict_index_is_clust(index)); + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + /* skip LOCK, UNDO, CHANGE, LOG */ + return(DB_SUCCESS); + } + + err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block, + rec, index, offsets, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + err = trx_undo_report_row_operation(0, TRX_UNDO_MODIFY_OP, thr, + index, NULL, NULL, 0, rec, offsets, + &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + /* The btr_search_latch is not needed here, because + the adaptive hash index does not depend on the delete-mark + and the delete-mark is being updated in place. */ + + page_zip = buf_block_get_page_zip(block); + + btr_blob_dbg_set_deleted_flag(rec, index, offsets, TRUE); + btr_rec_set_deleted_flag(rec, page_zip, TRUE); + + trx = thr_get_trx(thr); + + if (dict_index_is_online_ddl(index)) { + row_log_table_delete(rec, index, offsets, NULL); + } + + row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr); + + btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, + roll_ptr, mtr); + + return(err); +} + +/****************************************************************//** +Writes the redo log record for a delete mark setting of a secondary +index record. */ +UNIV_INLINE +void +btr_cur_del_mark_set_sec_rec_log( +/*=============================*/ + rec_t* rec, /*!< in: record */ + ibool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + ut_ad(val <= 1); + + log_ptr = mlog_open(mtr, 11 + 1 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr); + mach_write_to_1(log_ptr, val); + log_ptr++; + + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + mlog_close(mtr, log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/****************************************************************//** +Parses the redo log record for delete marking or unmarking of a secondary +index record. 
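+The body parsed here mirrors what btr_cur_del_mark_set_sec_rec_log()
+writes above (a reading of the code):
+
+	value		1 byte	the delete-mark flag to set
+	rec offset	2 bytes	page offset of rec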
+@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_sec_rec( +/*===============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */ +{ + ulint val; + ulint offset; + rec_t* rec; + + if (end_ptr < ptr + 3) { + + return(NULL); + } + + val = mach_read_from_1(ptr); + ptr++; + + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (page) { + rec = page + offset; + + /* We do not need to reserve btr_search_latch, as the page + is only being recovered, and there cannot be a hash index to + it. Besides, the delete-mark flag is being updated in place + and the adaptive hash index does not depend on it. */ + + btr_rec_set_deleted_flag(rec, page_zip, val); + } + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Sets a secondary index record delete mark to TRUE or FALSE. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +dberr_t +btr_cur_del_mark_set_sec_rec( +/*=========================*/ + ulint flags, /*!< in: locking flag */ + btr_cur_t* cursor, /*!< in: cursor */ + ibool val, /*!< in: value to set */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block; + rec_t* rec; + dberr_t err; + + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + /* skip LOCK, CHANGE, LOG */ + return(DB_SUCCESS); + } + + block = btr_cur_get_block(cursor); + rec = btr_cur_get_rec(cursor); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr)->id, cursor->index, + "del mark "); + rec_print(stderr, rec, cursor->index); + } +#endif /* UNIV_DEBUG */ + + err = lock_sec_rec_modify_check_and_lock(flags, + btr_cur_get_block(cursor), + rec, cursor->index, thr, mtr); + if (err != DB_SUCCESS) { + + return(err); + } + + ut_ad(!!page_rec_is_comp(rec) + == dict_table_is_comp(cursor->index->table)); + + /* We do not need to reserve btr_search_latch, as the + delete-mark flag is being updated in place and the adaptive + hash index does not depend on it. */ + btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val); + + btr_cur_del_mark_set_sec_rec_log(rec, val, mtr); + + return(DB_SUCCESS); +} + +/***********************************************************//** +Sets a secondary index record's delete mark to the given value. This +function is only used by the insert buffer merge mechanism. */ +UNIV_INTERN +void +btr_cur_set_deleted_flag_for_ibuf( +/*==============================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip, /*!< in/out: compressed page + corresponding to rec, or NULL + when the tablespace is + uncompressed */ + ibool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + /* We do not need to reserve btr_search_latch, as the page + has just been read to the buffer pool and there cannot be + a hash index to it. Besides, the delete-mark flag is being + updated in place and the adaptive hash index does not depend + on it. */ + + btr_rec_set_deleted_flag(rec, page_zip, val); + + btr_cur_del_mark_set_sec_rec_log(rec, val, mtr); +} + +/*==================== B-TREE RECORD REMOVE =========================*/ + +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. 
It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return TRUE if compression occurred */ +UNIV_INTERN +ibool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr_memo_contains(mtr, + dict_index_get_lock(btr_cur_get_index(cursor)), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + return(btr_cur_compress_recommendation(cursor, mtr) + && btr_compress(cursor, adjust, mtr)); +} + +/*******************************************************//** +Removes the record on which the tree cursor is positioned on a leaf page. +It is assumed that the mtr has an x-latch on the page where the cursor is +positioned, but no latch on the whole tree. +@return TRUE if success, i.e., the page did not become too empty */ +UNIV_INTERN +ibool +btr_cur_optimistic_delete_func( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to + delete; cursor stays valid: if deletion + succeeds, on function exit it points to the + successor of the deleted record */ +#ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +{ + buf_block_t* block; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool no_compress_needed; + rec_offs_init(offsets_); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + /* This is intended only for leaf page deletions */ + + block = btr_cur_get_block(cursor); + + SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION);); + + ut_ad(page_is_leaf(buf_block_get_frame(block))); + ut_ad(!dict_index_is_online_ddl(cursor->index) + || dict_index_is_clust(cursor->index) + || (flags & BTR_CREATE_FLAG)); + + rec = btr_cur_get_rec(cursor); + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + no_compress_needed = !rec_offs_any_extern(offsets) + && btr_cur_can_delete_without_compress( + cursor, rec_offs_size(offsets), mtr); + + if (no_compress_needed) { + + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + lock_update_delete(block, rec); + + btr_search_update_hash_on_delete(cursor); + + if (page_zip) { +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, cursor->index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, cursor->index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* On compressed pages, the IBUF_BITMAP_FREE + space is not affected by deleting (purging) + records, because it is defined as the minimum + of space available *without* reorganize, and + space available in the modification log. 
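+			Hence, unlike in the uncompressed branch below,
+			there is no need to update the bits after the
+			deletion.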
*/ + } else { + const ulint max_ins + = page_get_max_insert_size_after_reorganize( + page, 1); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); + + /* The change buffer does not handle inserts + into non-leaf pages, into clustered indexes, + or into the change buffer. */ + if (page_is_leaf(page) + && !dict_index_is_clust(cursor->index) + && !dict_index_is_ibuf(cursor->index)) { + ibuf_update_free_bits_low(block, max_ins, mtr); + } + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(no_compress_needed); +} + +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred */ +UNIV_INTERN +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + ulint n_reserved = 0; + ibool success; + ibool ret = FALSE; + ulint level; + mem_heap_t* heap; + ulint* offsets; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + ut_a(cursor->tree_height != ULINT_UNDEFINED); + + ulint n_extents = cursor->tree_height / 32 + 1; + + success = fsp_reserve_free_extents(&n_reserved, + index->space, + n_extents, + FSP_CLEANING, mtr); + if (!success) { + *err = DB_OUT_OF_FILE_SPACE; + + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, page_zip, + rb_ctx, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + if (UNIV_UNLIKELY(page_get_n_recs(page) < 2) + && UNIV_UNLIKELY(dict_index_get_page(index) + 
!= buf_block_get_page_no(block))) { + + /* If there is only one record, drop the whole page in + btr_discard_page, if this is not the root page */ + + btr_discard_page(cursor, mtr); + + ret = TRUE; + + goto return_after_reservations; + } + + if (flags == 0) { + lock_update_delete(block, rec); + } + + level = btr_page_get_level(page, mtr); + + if (level > 0 + && UNIV_UNLIKELY(rec == page_rec_get_next( + page_get_infimum_rec(page)))) { + + rec_t* next_rec = page_rec_get_next(rec); + + if (btr_page_get_prev(page, mtr) == FIL_NULL) { + + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + /* This will make page_zip_validate() fail until + page_cur_delete_rec() completes. This is harmless, + because everything will take place within a single + mini-transaction and because writing to the redo log + is an atomic operation (performed by mtr_commit()). */ + btr_set_min_rec_mark(next_rec, mtr); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the father node pointer + so that it is equal to the new leftmost node pointer + on the page */ + + btr_node_ptr_delete(index, block, mtr); + + dtuple_t* node_ptr = dict_index_build_node_ptr( + index, next_rec, buf_block_get_page_no(block), + heap, level); + + btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr, mtr); + } + } + + btr_search_update_hash_on_delete(cursor); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(btr_check_node_ptr(index, block, mtr)); + +return_after_reservations: + *err = DB_SUCCESS; + + mem_heap_free(heap); + + if (ret == FALSE) { + ret = btr_cur_compress_if_useful(cursor, FALSE, mtr); + } + + if (n_reserved > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + return(ret); +} + +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height) /*!< in: root node height in tree */ +{ + btr_path_t* slot; + const rec_t* rec; + const page_t* page; + + ut_a(cursor->path_arr); + + if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) { + /* Do nothing; return empty path */ + + slot = cursor->path_arr; + slot->nth_rec = ULINT_UNDEFINED; + + return; + } + + if (height == 0) { + /* Mark end of slots for path */ + slot = cursor->path_arr + root_height + 1; + slot->nth_rec = ULINT_UNDEFINED; + } + + rec = btr_cur_get_rec(cursor); + + slot = cursor->path_arr + (root_height - height); + + page = page_align(rec); + + slot->nth_rec = page_rec_get_n_recs_before(rec); + slot->n_recs = page_get_n_recs(page); + slot->page_no = page_get_page_no(page); + slot->page_level = btr_page_get_level_low(page); +} + +/*******************************************************************//** +Estimate the number of rows between slot1 and slot2 for any level on a +B-tree. This function starts from slot1->page and reads a few pages to +the right, counting their records. If we reach slot2->page quickly then +we know exactly how many records there are between slot1 and slot2 and +we set is_n_rows_exact to TRUE. 
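+(For instance, if slot1 points to the 3rd of 5 records on its page,
+the next page to the right holds 4 records, and slot2 points to the
+2nd record of the page after that, the walk finds exactly
+(5 - 3) + 4 + (2 - 1) = 7 rows.)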
+If we cannot reach slot2->page quickly
+then we calculate the average number of records in the pages scanned
+so far, assume that all pages that we did not scan up to slot2->page
+contain the same number of records, and multiply that average by
+the number of pages between slot1->page and slot2->page (which is
+n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
+@return number of rows (exact or estimated) */
+static
+ib_int64_t
+btr_estimate_n_rows_in_range_on_level(
+/*==================================*/
+	dict_index_t*	index,			/*!< in: index */
+	btr_path_t*	slot1,			/*!< in: left border */
+	btr_path_t*	slot2,			/*!< in: right border */
+	ib_int64_t	n_rows_on_prev_level,	/*!< in: number of rows
+			on the previous level for the
+			same descend paths; used to
+			determine the number of pages
+			on this level */
+	ibool*		is_n_rows_exact)	/*!< out: TRUE if the returned
+			value is exact i.e. not an
+			estimation */
+{
+	ulint		space;
+	ib_int64_t	n_rows;
+	ulint		n_pages_read;
+	ulint		page_no;
+	ulint		zip_size;
+	ulint		level;
+
+	space = dict_index_get_space(index);
+
+	n_rows = 0;
+	n_pages_read = 0;
+
+	/* Assume by default that we will scan all pages between
+	slot1->page_no and slot2->page_no */
+	*is_n_rows_exact = TRUE;
+
+	/* add records from slot1->page_no which are to the right of
+	the record which serves as a left border of the range, if any */
+	if (slot1->nth_rec < slot1->n_recs) {
+		n_rows += slot1->n_recs - slot1->nth_rec;
+	}
+
+	/* add records from slot2->page_no which are to the left of
+	the record which serves as a right border of the range, if any */
+	if (slot2->nth_rec > 1) {
+		n_rows += slot2->nth_rec - 1;
+	}
+
+	/* count the records in the pages between slot1->page_no and
+	slot2->page_no (non-inclusive), if any */
+
+	zip_size = fil_space_get_zip_size(space);
+
+	/* Do not read more than this number of pages in order not to hurt
+	performance with this code which is just an estimation. If we read
+	this many pages before reaching slot2->page_no then we estimate the
+	average from the pages scanned so far */
+# define N_PAGES_READ_LIMIT	10
+
+	page_no = slot1->page_no;
+	level = slot1->page_level;
+
+	do {
+		mtr_t		mtr;
+		page_t*		page;
+		buf_block_t*	block;
+
+		mtr_start(&mtr);
+
+		/* Fetch the page. Because we are not holding the
+		index->lock, the tree may have changed and we may be
+		attempting to read a page that is no longer part of
+		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
+		silence a debug assertion about this. */
+		block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
+					 NULL, BUF_GET_POSSIBLY_FREED,
+					 __FILE__, __LINE__, &mtr);
+
+		page = buf_block_get_frame(block);
+
+		/* It is possible that the tree has been reorganized in the
+		meantime and this is a different page. If this happens the
+		calculated estimate will be bogus, which is not fatal as
+		this is only an estimate. We are sure that a page with
+		page_no exists because InnoDB never frees pages, only
+		reuses them. */
+		if (fil_page_get_type(page) != FIL_PAGE_INDEX
+		    || btr_page_get_index_id(page) != index->id
+		    || btr_page_get_level_low(page) != level) {
+
+			/* The page got reused for something else */
+			mtr_commit(&mtr);
+			goto inexact;
+		}
+
+		/* It is possible but highly unlikely that the page was
+		originally written by an old version of InnoDB that did
+		not initialize FIL_PAGE_TYPE on other than B-tree pages.
+		For example, this could be an almost-empty BLOB page
+		that happens to contain the magic values in the fields
+		that we checked above.
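+		If such a page slips through these checks, its record
+		count merely skews the estimate, which, as noted above,
+		is not fatal.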
*/ + + n_pages_read++; + + if (page_no != slot1->page_no) { + /* Do not count the records on slot1->page_no, + we already counted them before this loop. */ + n_rows += page_get_n_recs(page); + } + + page_no = btr_page_get_next(page, &mtr); + + mtr_commit(&mtr); + + if (n_pages_read == N_PAGES_READ_LIMIT + || page_no == FIL_NULL) { + /* Either we read too many pages or + we reached the end of the level without passing + through slot2->page_no, the tree must have changed + in the meantime */ + goto inexact; + } + + } while (page_no != slot2->page_no); + + return(n_rows); + +inexact: + + *is_n_rows_exact = FALSE; + + /* We did interrupt before reaching slot2->page */ + + if (n_pages_read > 0) { + /* The number of pages on this level is + n_rows_on_prev_level, multiply it by the + average number of recs per page so far */ + n_rows = n_rows_on_prev_level + * n_rows / n_pages_read; + } else { + /* The tree changed before we could even + start with slot1->page_no */ + n_rows = 10; + } + + return(n_rows); +} + +/*******************************************************************//** +Estimates the number of rows in a given index range. +@return estimated number of rows */ +UNIV_INTERN +ib_int64_t +btr_estimate_n_rows_in_range( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */ + ulint mode1, /*!< in: search mode for range start */ + const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */ + ulint mode2) /*!< in: search mode for range end */ +{ + btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS]; + btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS]; + btr_cur_t cursor; + btr_path_t* slot1; + btr_path_t* slot2; + ibool diverged; + ibool diverged_lot; + ulint divergence_level; + ib_int64_t n_rows; + ibool is_n_rows_exact; + ulint i; + mtr_t mtr; + ib_int64_t table_n_rows; + + table_n_rows = dict_table_get_n_rows(index->table); + + mtr_start(&mtr); + + cursor.path_arr = path1; + + if (dtuple_get_n_fields(tuple1) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple1, mode1, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, + __FILE__, __LINE__, &mtr); + } else { + btr_cur_open_at_index_side(true, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + } + + mtr_commit(&mtr); + + mtr_start(&mtr); + + cursor.path_arr = path2; + + if (dtuple_get_n_fields(tuple2) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple2, mode2, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, + __FILE__, __LINE__, &mtr); + } else { + btr_cur_open_at_index_side(false, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + } + + mtr_commit(&mtr); + + /* We have the path information for the range in path1 and path2 */ + + n_rows = 1; + is_n_rows_exact = TRUE; + diverged = FALSE; /* This becomes true when the path is not + the same any more */ + diverged_lot = FALSE; /* This becomes true when the paths are + not the same or adjacent any more */ + divergence_level = 1000000; /* This is the level where paths diverged + a lot */ + for (i = 0; ; i++) { + ut_ad(i < BTR_PATH_ARRAY_N_SLOTS); + + slot1 = path1 + i; + slot2 = path2 + i; + + if (slot1->nth_rec == ULINT_UNDEFINED + || slot2->nth_rec == ULINT_UNDEFINED) { + + if (i > divergence_level + 1 && !is_n_rows_exact) { + /* In trees whose height is > 1 our algorithm + tends to underestimate: multiply the estimate + by 2: */ + + n_rows = n_rows * 2; + } + + DBUG_EXECUTE_IF("bug14007649", return(n_rows);); + + /* Do not estimate the number of rows in the range + to over 1 / 2 of 
the estimated rows in the whole + table */ + + if (n_rows > table_n_rows / 2 && !is_n_rows_exact) { + + n_rows = table_n_rows / 2; + + /* If there are just 0 or 1 rows in the table, + then we estimate all rows are in the range */ + + if (n_rows == 0) { + n_rows = table_n_rows; + } + } + + return(n_rows); + } + + if (!diverged && slot1->nth_rec != slot2->nth_rec) { + + diverged = TRUE; + + if (slot1->nth_rec < slot2->nth_rec) { + n_rows = slot2->nth_rec - slot1->nth_rec; + + if (n_rows > 1) { + diverged_lot = TRUE; + divergence_level = i; + } + } else { + /* It is possible that + slot1->nth_rec >= slot2->nth_rec + if, for example, we have a single page + tree which contains (inf, 5, 6, supr) + and we select where x > 20 and x < 30; + in this case slot1->nth_rec will point + to the supr record and slot2->nth_rec + will point to 6 */ + n_rows = 0; + } + + } else if (diverged && !diverged_lot) { + + if (slot1->nth_rec < slot1->n_recs + || slot2->nth_rec > 1) { + + diverged_lot = TRUE; + divergence_level = i; + + n_rows = 0; + + if (slot1->nth_rec < slot1->n_recs) { + n_rows += slot1->n_recs + - slot1->nth_rec; + } + + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + } + } else if (diverged_lot) { + + n_rows = btr_estimate_n_rows_in_range_on_level( + index, slot1, slot2, n_rows, + &is_n_rows_exact); + } + } +} + +/*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + ulint n_unique, /*!< in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_uint64_t* n_not_null) /*!< in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + if (rec_offs_nth_sql_null(offsets, i)) { + break; + } + + n_not_null[i]++; + } +} + +/*******************************************************************//** +Estimates the number of different key values in a given index, for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +index->stat_n_sample_sizes[]. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in +array index->stat_n_non_null_key_vals. 
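+
+Roughly, the scaling works like this (an illustration only; the exact
+formula, BTR_TABLE_STATS_FROM_SAMPLE(), also corrects for externally
+stored fields and for empty pages): if the sampled leaf pages show
+n_diff[j] borders between distinct prefixes, then
+
+	stat_n_diff_key_vals[j]
+		~= n_diff[j] * stat_n_leaf_pages / n_sample_pages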
*/ +UNIV_INTERN +void +btr_estimate_number_of_different_key_vals( +/*======================================*/ + dict_index_t* index) /*!< in: index */ +{ + btr_cur_t cursor; + page_t* page; + rec_t* rec; + ulint n_cols; + ulint matched_fields; + ulint matched_bytes; + ib_uint64_t* n_diff; + ib_uint64_t* n_not_null; + ibool stats_null_not_equal; + ullint n_sample_pages; /* number of pages to sample */ + ulint not_empty_flag = 0; + ulint total_external_size = 0; + ulint i; + ulint j; + ullint add_on; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint* offsets_rec = NULL; + ulint* offsets_next_rec = NULL; + + n_cols = dict_index_get_n_unique(index); + + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * n_cols + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); + + n_diff = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof(ib_int64_t)); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; + + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + break; + + default: + ut_error; + } + + /* It makes no sense to test more pages than are contained + in the index, thus we lower the number if it is too high */ + if (srv_stats_transient_sample_pages > index->stat_index_size) { + if (index->stat_index_size > 0) { + n_sample_pages = index->stat_index_size; + } else { + n_sample_pages = 1; + } + } else { + n_sample_pages = srv_stats_transient_sample_pages; + } + + /* We sample some pages in the index to get an estimate */ + + for (i = 0; i < n_sample_pages; i++) { + mtr_start(&mtr); + + btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); + + /* Count the number of different key values for each prefix of + the key on this index page. If the prefix does not determine + the index record uniquely in the B-tree, then we subtract one + because otherwise our algorithm would give a wrong estimate + for an index where there is just one key value. 
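+		(For example, a sampled page whose records all carry
+		the same key value shows no borders at all, and so
+		contributes zero, rather than one, to the count of
+		distinct values.)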
*/ + + page = btr_cur_get_page(&cursor); + + SRV_CORRUPT_TABLE_CHECK(page, goto exit_loop;); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (!page_rec_is_supremum(rec)) { + not_empty_flag = 1; + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + ULINT_UNDEFINED, &heap); + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_rec, n_not_null); + } + } + + while (!page_rec_is_supremum(rec)) { + rec_t* next_rec = page_rec_get_next(rec); + if (page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); + break; + } + + matched_fields = 0; + matched_bytes = 0; + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, + ULINT_UNDEFINED, + &heap); + + cmp_rec_rec_with_match(rec, next_rec, + offsets_rec, offsets_next_rec, + index, stats_null_not_equal, + &matched_fields, + &matched_bytes); + + for (j = matched_fields; j < n_cols; j++) { + /* We add one if this index record has + a different prefix from the previous */ + + n_diff[j]++; + } + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_next_rec, n_not_null); + } + + total_external_size + += btr_rec_get_externally_stored_len( + rec, offsets_rec); + + rec = next_rec; + /* Initialize offsets_rec for the next round + and assign the old offsets_rec buffer to + offsets_next_rec. */ + { + ulint* offsets_tmp = offsets_rec; + offsets_rec = offsets_next_rec; + offsets_next_rec = offsets_tmp; + } + } + + + if (n_cols == dict_index_get_n_unique_in_tree(index)) { + + /* If there is more than one leaf page in the tree, + we add one because we know that the first record + on the page certainly had a different prefix than the + last record on the previous index page in the + alphabetical order. Before this fix, if there was + just one big record on each clustered index page, the + algorithm grossly underestimated the number of rows + in the table. */ + + if (btr_page_get_prev(page, &mtr) != FIL_NULL + || btr_page_get_next(page, &mtr) != FIL_NULL) { + + n_diff[n_cols - 1]++; + } + } + + mtr_commit(&mtr); + } + +exit_loop: + /* If we saw k borders between different key values on + n_sample_pages leaf pages, we can estimate how many + there will be in index->stat_n_leaf_pages */ + + /* We must take into account that our sample actually represents + also the pages used for external storage of fields (those pages are + included in index->stat_n_leaf_pages) */ + + for (j = 0; j < n_cols; j++) { + index->stat_n_diff_key_vals[j] + = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, n_sample_pages, + total_external_size, not_empty_flag); + + /* If the tree is small, smaller than + 10 * n_sample_pages + total_external_size, then + the above estimate is ok. For bigger trees it is common that we + do not see any borders between key values in the few pages + we pick. But still there may be n_sample_pages + different key values, or even more. Let us try to approximate + that: */ + + add_on = index->stat_n_leaf_pages + / (10 * (n_sample_pages + + total_external_size)); + + if (add_on > n_sample_pages) { + add_on = n_sample_pages; + } + + index->stat_n_diff_key_vals[j] += add_on; + + index->stat_n_sample_sizes[j] = n_sample_pages; + + /* Update the stat_n_non_null_key_vals[] with our + sampled result. 
stat_n_non_null_key_vals[] is created + and initialized to zero in dict_index_add_to_cache(), + along with stat_n_diff_key_vals[] array */ + if (n_not_null != NULL) { + index->stat_n_non_null_key_vals[j] = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, n_sample_pages, + total_external_size, not_empty_flag); + } + } + + mem_heap_free(heap); +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/***********************************************************//** +Gets the offset of the pointer to the externally stored part of a field. +@return offset of the pointer to the externally stored part */ +static +ulint +btr_rec_get_field_ref_offs( +/*=======================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: index of the external field */ +{ + ulint field_ref_offs; + ulint local_len; + + ut_a(rec_offs_nth_extern(offsets, n)); + field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); + ut_a(local_len != UNIV_SQL_NULL); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); +} + +/** Gets a pointer to the externally stored part of a field. +@param rec record +@param offsets rec_get_offsets(rec) +@param n index of the externally stored field +@return pointer to the externally stored part */ +#define btr_rec_get_field_ref(rec, offsets, n) \ + ((rec) + btr_rec_get_field_ref_offs(offsets, n)) + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ + +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const ulint* offsets) +{ + ulint n_fields; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + ulint extern_len = mach_read_from_4( + btr_rec_get_field_ref(rec, offsets, i) + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align(extern_len, + UNIV_PAGE_SIZE); + } + } + + return(total_extern_len / UNIV_PAGE_SIZE); +} + +/*******************************************************************//** +Sets the ownership bit of an externally stored field in a record. 
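+The bit is BTR_EXTERN_OWNER_FLAG in the byte at offset BTR_EXTERN_LEN
+of the field reference that ends the locally stored prefix of the
+column; note that in the code below ownership corresponds to the flag
+being cleared, and disownment to the flag being set.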
*/ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: clustered index record */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint i, /*!< in: field number */ + ibool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG); + } else { +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + byte_val = byte_val | BTR_EXTERN_OWNER_FLAG; + } + + if (page_zip) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr); + } else if (mtr != NULL) { + + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val, + MLOG_1BYTE, mtr); + } else { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + } + + btr_blob_dbg_owner(rec, index, offsets, i, val); +} + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. */ +UNIV_INTERN +void +btr_cur_disown_inherited_fields( +/*============================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(mtr); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i) + && !upd_get_field_by_field_no(update, i)) { + btr_cur_set_ownership_of_extern_field( + page_zip, rec, index, offsets, i, FALSE, mtr); + } + } +} + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. 
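+(Compare btr_cur_disown_inherited_fields() above, which flips the
+ownership bit in the opposite direction for the non-updated columns.)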
*/ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ulint n; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); + + if (!rec_offs_any_extern(offsets)) { + + return; + } + + for (i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + btr_cur_set_ownership_of_extern_field( + page_zip, rec, index, offsets, i, TRUE, mtr); + } + } +} + +/*******************************************************************//** +Flags the data tuple fields that are marked as extern storage in the +update vector. We use this function to remember which fields we must +mark as extern storage in a record inserted for an update. +@return number of flagged external columns */ +UNIV_INTERN +ulint +btr_push_update_extern_fields( +/*==========================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const upd_t* update, /*!< in: update vector */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint n_pushed = 0; + ulint n; + const upd_field_t* uf; + + ut_ad(tuple); + ut_ad(update); + + uf = update->fields; + n = upd_get_n_fields(update); + + for (; n--; uf++) { + if (dfield_is_ext(&uf->new_val)) { + dfield_t* field + = dtuple_get_nth_field(tuple, uf->field_no); + + if (!dfield_is_ext(field)) { + dfield_set_ext(field); + n_pushed++; + } + + switch (uf->orig_len) { + byte* data; + ulint len; + byte* buf; + case 0: + break; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. In the undo log, + InnoDB writes a longer prefix of externally + stored columns, so that column prefixes + in secondary indexes can be reconstructed. */ + dfield_set_data(field, (byte*) dfield_get_data(field) + + dfield_get_len(field) + - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(field); + break; + default: + /* Reconstruct the original locally + stored part of the column. The data + will have to be copied. */ + ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE); + + data = (byte*) dfield_get_data(field); + len = dfield_get_len(field); + + buf = (byte*) mem_heap_alloc(heap, + uf->orig_len); + /* Copy the locally stored prefix. */ + memcpy(buf, data, + uf->orig_len + - BTR_EXTERN_FIELD_REF_SIZE); + /* Copy the BLOB pointer. */ + memcpy(buf + uf->orig_len + - BTR_EXTERN_FIELD_REF_SIZE, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + dfield_set_data(field, buf, uf->orig_len); + dfield_set_ext(field); + } + } + } + + return(n_pushed); +} + +/*******************************************************************//** +Returns the length of a BLOB part stored on the header page. +@return part length */ +static +ulint +btr_blob_get_part_len( +/*==================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*******************************************************************//** +Returns the page number where the next BLOB part is stored. 
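+
+Together with btr_blob_get_part_len() this supports walking a BLOB
+page chain; an illustrative sketch (not code from this file):
+
+	while (page_no != FIL_NULL) {
+		/* fetch the page; blob_header points into it */
+		part_len = btr_blob_get_part_len(blob_header);
+		/* consume part_len bytes of BLOB data */
+		page_no = btr_blob_get_next_page_no(blob_header);
+	}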
+@return page number or FIL_NULL if no more pages */ +static +ulint +btr_blob_get_next_page_no( +/*======================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/*******************************************************************//** +Deallocate a buffer block that was reserved for a BLOB part. */ +static +void +btr_blob_free( +/*==========*/ + buf_block_t* block, /*!< in: buffer block */ + ibool all, /*!< in: TRUE=remove also the compressed page + if there is one */ + mtr_t* mtr) /*!< in: mini-transaction to commit */ +{ + buf_pool_t* buf_pool = buf_pool_from_block(block); + ulint space = buf_block_get_space(block); + ulint page_no = buf_block_get_page_no(block); + bool freed = false; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + mtr_commit(mtr); + + mutex_enter(&buf_pool->LRU_list_mutex); + mutex_enter(&block->mutex); + + /* Only free the block if it is still allocated to + the same file page. */ + + if (buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE + && buf_block_get_space(block) == space + && buf_block_get_page_no(block) == page_no) { + + freed = buf_LRU_free_page(&block->page, all); + + if (!freed && all && block->page.zip.data + /* Now, buf_LRU_free_page() may release mutexes + temporarily */ + && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE + && buf_block_get_space(block) == space + && buf_block_get_page_no(block) == page_no) { + + /* Attempt to deallocate the uncompressed page + if the whole block cannot be deallocated. */ + freed = buf_LRU_free_page(&block->page, false); + } + } + + if (!freed) { + mutex_exit(&buf_pool->LRU_list_mutex); + } + + mutex_exit(&block->mutex); +} + +/*******************************************************************//** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from the leaf node +file segment of the index tree. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree + MUST be X-latched */ + buf_block_t* rec_block, /*!< in/out: block containing rec */ + rec_t* rec, /*!< in/out: record */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index); + the "external storage" flags in offsets + will not correspond to rec when + this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in: mtr containing the + latches to the clustered index */ + enum blob_op op) /*!< 
in: operation code */ +{ + ulint rec_page_no; + byte* field_ref; + ulint extern_len; + ulint store_len; + ulint page_no; + ulint space_id; + ulint zip_size; + ulint prev_page_no; + ulint hint_page_no; + ulint i; + mtr_t mtr; + mtr_t* alloc_mtr; + mem_heap_t* heap = NULL; + page_zip_des_t* page_zip; + z_stream c_stream; + buf_block_t** freed_pages = NULL; + ulint n_freed_pages = 0; + dberr_t error = DB_SUCCESS; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(btr_mtr); + ut_ad(mtr_memo_contains(btr_mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); + ut_a(dict_index_is_clust(index)); + + page_zip = buf_block_get_page_zip(rec_block); + ut_a(dict_table_zip_size(index->table) + == buf_block_get_zip_size(rec_block)); + + space_id = buf_block_get_space(rec_block); + zip_size = buf_block_get_zip_size(rec_block); + rec_page_no = buf_block_get_page_no(rec_block); + ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); + + error = btr_check_blob_limit(big_rec_vec); + + if (error != DB_SUCCESS) { + ut_ad(op == BTR_STORE_INSERT); + return(error); + } + + if (page_zip) { + int err; + + /* Zlib deflate needs 128 kilobytes for the default + window size, plus 512 << memLevel, plus a few + kilobytes for small objects. We use reduced memLevel + to limit the memory consumption, and preallocate the + heap, hoping to avoid memory fragmentation. */ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, page_zip_level, + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + + if (btr_blob_op_is_update(op)) { + /* Avoid reusing pages that have been previously freed + in btr_mtr. */ + if (btr_mtr->n_freed_pages) { + if (heap == NULL) { + heap = mem_heap_create( + btr_mtr->n_freed_pages + * sizeof *freed_pages); + } + + freed_pages = static_cast<buf_block_t**>( + mem_heap_alloc( + heap, + btr_mtr->n_freed_pages + * sizeof *freed_pages)); + n_freed_pages = 0; + } + + /* Because btr_mtr will be committed after mtr, it is + possible that the tablespace has been extended when + the B-tree record was updated or inserted, or it will + be extended while allocating pages for big_rec. + + TODO: In mtr (not btr_mtr), write a redo log record + about extending the tablespace to its current size, + and remember the current size. Whenever the tablespace + grows as pages are allocated, write further redo log + records to mtr. (Currently tablespace extension is not + covered by the redo log. If it were, the record would + only be written to btr_mtr, which is committed after + mtr.) */ + alloc_mtr = btr_mtr; + } else { + /* Use the local mtr for allocations. */ + alloc_mtr = &mtr; + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must either be zero or they must be pointers to inherited + columns, owned by this record or an earlier record version. */ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + /* Either this must be an update in place, + or the BLOB must be inherited, or the BLOB pointer + must be zero (will be written in this function). 
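For reference, a sketch of the 20-byte field reference whose state is being
asserted here, with the byte offsets defined in btr0cur.h:
@code
    // layout of a BTR_EXTERN_FIELD_REF_SIZE (20-byte) BLOB pointer
    enum {
        EXTERN_SPACE_ID = 0,    // 4 bytes: tablespace of the BLOB
        EXTERN_PAGE_NO  = 4,    // 4 bytes: first BLOB page
        EXTERN_OFFSET   = 8,    // 4 bytes: offset on that page
        EXTERN_LEN      = 12    // 8 bytes: data length; the most
                                // significant byte carries the
                                // OWNER (128) and INHERITED (64)
                                // flag bits tested above
    };
@endcode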
*/ + ut_a(op == BTR_STORE_UPDATE + || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + || !memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /* We have to create a file segment to the tablespace + for each field and put the pointer to the field in rec */ + + for (i = 0; i < big_rec_vec->n_fields; i++) { + field_ref = btr_rec_get_field_ref( + rec, offsets, big_rec_vec->fields[i].field_no); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* A zero BLOB pointer should have been initially inserted. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + extern_len = big_rec_vec->fields[i].len; + UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data, + extern_len); + + ut_a(extern_len > 0); + + prev_page_no = FIL_NULL; + + if (page_zip) { + int err = deflateReset(&c_stream); + ut_a(err == Z_OK); + + c_stream.next_in = (Bytef*) + big_rec_vec->fields[i].data; + c_stream.avail_in = static_cast<uInt>(extern_len); + } + + for (;;) { + buf_block_t* block; + page_t* page; + + mtr_start(&mtr); + + if (prev_page_no == FIL_NULL) { + hint_page_no = 1 + rec_page_no; + } else { + hint_page_no = prev_page_no + 1; + } + +alloc_another: + block = btr_page_alloc(index, hint_page_no, + FSP_NO_DIR, 0, alloc_mtr, &mtr); + if (UNIV_UNLIKELY(block == NULL)) { + mtr_commit(&mtr); + error = DB_OUT_OF_FILE_SPACE; + goto func_exit; + } + + if (rw_lock_get_x_lock_count(&block->lock) > 1) { + /* This page must have been freed in + btr_mtr previously. Put it aside, and + allocate another page for the BLOB data. */ + ut_ad(alloc_mtr == btr_mtr); + ut_ad(btr_blob_op_is_update(op)); + ut_ad(n_freed_pages < btr_mtr->n_freed_pages); + freed_pages[n_freed_pages++] = block; + goto alloc_another; + } + + page_no = buf_block_get_page_no(block); + page = buf_block_get_frame(block); + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block; + page_t* prev_page; + + prev_block = buf_page_get(space_id, zip_size, + prev_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(prev_block, + SYNC_EXTERN_STORAGE); + prev_page = buf_block_get_frame(prev_block); + + if (page_zip) { + mlog_write_ulint( + prev_page + FIL_PAGE_NEXT, + page_no, MLOG_4BYTES, &mtr); + memcpy(buf_block_get_page_zip( + prev_block) + ->data + FIL_PAGE_NEXT, + prev_page + FIL_PAGE_NEXT, 4); + } else { + mlog_write_ulint( + prev_page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO, + page_no, MLOG_4BYTES, &mtr); + } + + } else if (dict_index_is_online_ddl(index)) { + row_log_table_blob_alloc(index, page_no); + } + + if (page_zip) { + int err; + page_zip_des_t* blob_page_zip; + + /* Write FIL_PAGE_TYPE to the redo log + separately, before logging any other + changes to the page, so that the debug + assertions in + recv_parse_or_apply_log_rec_body() can + be made simpler. Before InnoDB Plugin + 1.0.4, the initialization of + FIL_PAGE_TYPE was logged as part of + the mlog_log_string() below. */ + + mlog_write_ulint(page + FIL_PAGE_TYPE, + prev_page_no == FIL_NULL + ? 
FIL_PAGE_TYPE_ZBLOB + : FIL_PAGE_TYPE_ZBLOB2, + MLOG_2BYTES, &mtr); + + c_stream.next_out = page + + FIL_PAGE_DATA; + c_stream.avail_out + = static_cast<uInt>(page_zip_get_size(page_zip)) + - FIL_PAGE_DATA; + + err = deflate(&c_stream, Z_FINISH); + ut_a(err == Z_OK || err == Z_STREAM_END); + ut_a(err == Z_STREAM_END + || c_stream.avail_out == 0); + + /* Write the "next BLOB page" pointer */ + mlog_write_ulint(page + FIL_PAGE_NEXT, + FIL_NULL, MLOG_4BYTES, &mtr); + /* Initialize the unused "prev page" pointer */ + mlog_write_ulint(page + FIL_PAGE_PREV, + FIL_NULL, MLOG_4BYTES, &mtr); + /* Write a back pointer to the record + into the otherwise unused area. This + information could be useful in + debugging. Later, we might want to + implement the possibility to relocate + BLOB pages. Then, we would need to be + able to adjust the BLOB pointer in the + record. We do not store the heap + number of the record, because it can + change in page_zip_reorganize() or + btr_page_reorganize(). However, also + the page number of the record may + change when B-tree nodes are split or + merged. */ + mlog_write_ulint(page + + FIL_PAGE_FILE_FLUSH_LSN, + space_id, + MLOG_4BYTES, &mtr); + mlog_write_ulint(page + + FIL_PAGE_FILE_FLUSH_LSN + 4, + rec_page_no, + MLOG_4BYTES, &mtr); + + /* Zero out the unused part of the page. */ + memset(page + page_zip_get_size(page_zip) + - c_stream.avail_out, + 0, c_stream.avail_out); + mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN, + page_zip_get_size(page_zip) + - FIL_PAGE_FILE_FLUSH_LSN, + &mtr); + /* Copy the page to compressed storage, + because it will be flushed to disk + from there. */ + blob_page_zip = buf_block_get_page_zip(block); + ut_ad(blob_page_zip); + ut_ad(page_zip_get_size(blob_page_zip) + == page_zip_get_size(page_zip)); + memcpy(blob_page_zip->data, page, + page_zip_get_size(page_zip)); + + if (err == Z_OK && prev_page_no != FIL_NULL) { + + goto next_zip_page; + } + + if (alloc_mtr == &mtr) { + rec_block = buf_page_get( + space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + rec_block, + SYNC_NO_ORDER_CHECK); + } + + if (err == Z_STREAM_END) { + mach_write_to_4(field_ref + + BTR_EXTERN_LEN, 0); + mach_write_to_4(field_ref + + BTR_EXTERN_LEN + 4, + c_stream.total_in); + } else { + memset(field_ref + BTR_EXTERN_LEN, + 0, 8); + } + + if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + + mach_write_to_4(field_ref + + BTR_EXTERN_SPACE_ID, + space_id); + + mach_write_to_4(field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mach_write_to_4(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_NEXT); + } + + page_zip_write_blob_ptr( + page_zip, rec, index, offsets, + big_rec_vec->fields[i].field_no, + alloc_mtr); + +next_zip_page: + prev_page_no = page_no; + + /* Commit mtr and release the + uncompressed page frame to save memory. 
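Stepping back, the loop we are inside is plain zlib deflation into fixed-size
output buffers, one BLOB page per iteration. A minimal standalone sketch of
that pattern using only the documented zlib API (the 1024-byte page size,
level 6 in place of page_zip_level, and deflate_into_pages() itself are
illustrative):
@code
    #include <string.h>
    #include <zlib.h>

    // Compress src into a chain of fixed-size "pages". Returns
    // Z_STREAM_END on success, like the err checked above.
    int deflate_into_pages(const unsigned char* src, size_t len,
                           unsigned char pages[][1024], int n_pages)
    {
        z_stream    c;
        int         err;
        int         i = 0;

        memset(&c, 0, sizeof(c));
        // windowBits 15, memLevel 7: the parameters used above
        err = deflateInit2(&c, 6, Z_DEFLATED, 15, 7,
                           Z_DEFAULT_STRATEGY);
        if (err != Z_OK) {
            return(err);
        }

        c.next_in = (Bytef*) src;
        c.avail_in = (uInt) len;

        do {
            // each iteration fills one "page", as the
            // surrounding loop fills one BLOB page
            c.next_out = pages[i];
            c.avail_out = (uInt) sizeof(pages[0]);
            err = deflate(&c, Z_FINISH);
        } while (err == Z_OK && ++i < n_pages);

        deflateEnd(&c);
        return(err);
    }
@endcode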
*/ + btr_blob_free(block, FALSE, &mtr); + + if (err == Z_STREAM_END) { + break; + } + } else { + mlog_write_ulint(page + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_BLOB, + MLOG_2BYTES, &mtr); + + if (extern_len > (UNIV_PAGE_SIZE + - FIL_PAGE_DATA + - BTR_BLOB_HDR_SIZE + - FIL_PAGE_DATA_END)) { + store_len = UNIV_PAGE_SIZE + - FIL_PAGE_DATA + - BTR_BLOB_HDR_SIZE + - FIL_PAGE_DATA_END; + } else { + store_len = extern_len; + } + + mlog_write_string(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_SIZE, + (const byte*) + big_rec_vec->fields[i].data + + big_rec_vec->fields[i].len + - extern_len, + store_len, &mtr); + mlog_write_ulint(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN, + store_len, MLOG_4BYTES, &mtr); + mlog_write_ulint(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO, + FIL_NULL, MLOG_4BYTES, &mtr); + + extern_len -= store_len; + + if (alloc_mtr == &mtr) { + rec_block = buf_page_get( + space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + rec_block, + SYNC_NO_ORDER_CHECK); + } + + mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0, + MLOG_4BYTES, alloc_mtr); + mlog_write_ulint(field_ref + + BTR_EXTERN_LEN + 4, + big_rec_vec->fields[i].len + - extern_len, + MLOG_4BYTES, alloc_mtr); + + if (prev_page_no == FIL_NULL) { + btr_blob_dbg_add_blob( + rec, big_rec_vec->fields[i] + .field_no, page_no, index, + "store"); + + mlog_write_ulint(field_ref + + BTR_EXTERN_SPACE_ID, + space_id, MLOG_4BYTES, + alloc_mtr); + + mlog_write_ulint(field_ref + + BTR_EXTERN_PAGE_NO, + page_no, MLOG_4BYTES, + alloc_mtr); + + mlog_write_ulint(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_DATA, + MLOG_4BYTES, + alloc_mtr); + } + + prev_page_no = page_no; + + mtr_commit(&mtr); + + if (extern_len == 0) { + break; + } + } + } + + DBUG_EXECUTE_IF("btr_store_big_rec_extern", + error = DB_OUT_OF_FILE_SPACE; + goto func_exit;); + } + +func_exit: + if (page_zip) { + deflateEnd(&c_stream); + } + + if (n_freed_pages) { + ulint i; + + ut_ad(alloc_mtr == btr_mtr); + ut_ad(btr_blob_op_is_update(op)); + + for (i = 0; i < n_freed_pages; i++) { + btr_page_free_low(index, freed_pages[i], 0, alloc_mtr); + } + } + + if (heap != NULL) { + mem_heap_free(heap); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must be valid. */ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + /* The pointer must not be zero if the operation + succeeded. */ + ut_a(0 != memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE) + || error != DB_SUCCESS); + /* The column must not be disowned by this record. */ + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + return(error); +} + +/*******************************************************************//** +Check the FIL_PAGE_TYPE on an uncompressed BLOB page. 
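For illustration, fil_page_get_type() used below is a 2-byte big-endian read
at the FIL_PAGE_TYPE offset of the page frame. A standalone sketch, assuming
the constants of fil0fil.h (FIL_PAGE_TYPE == 24, FIL_PAGE_TYPE_BLOB == 10):
@code
    #include <stdint.h>

    enum {
        PAGE_TYPE_OFS  = 24,    // FIL_PAGE_TYPE
        PAGE_TYPE_BLOB = 10     // uncompressed BLOB page
    };

    static int is_blob_page(const unsigned char* page)
    {
        uint16_t type = (uint16_t) (page[PAGE_TYPE_OFS] << 8
                                    | page[PAGE_TYPE_OFS + 1]);
        return(type == PAGE_TYPE_BLOB);
    }
@endcode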
*/ +static +void +btr_check_blob_fil_page_type( +/*=========================*/ + ulint space_id, /*!< in: space id */ + ulint page_no, /*!< in: page number */ + const page_t* page, /*!< in: page */ + ibool read) /*!< in: TRUE=read, FALSE=purge */ +{ + ulint type = fil_page_get_type(page); + + ut_a(space_id == page_get_space_id(page)); + ut_a(page_no == page_get_page_no(page)); + + if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) { + ulint flags = fil_space_get_flags(space_id); + +#ifndef UNIV_DEBUG /* Improve debug test coverage */ + if (dict_tf_get_format(flags) == UNIV_FORMAT_A) { + /* Old versions of InnoDB did not initialize + FIL_PAGE_TYPE on BLOB pages. Do not print + anything about the type mismatch when reading + a BLOB page that is in Antelope format. */ + return; + } +#endif /* !UNIV_DEBUG */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: FIL_PAGE_TYPE=%lu" + " on BLOB %s space %lu page %lu flags %lx\n", + (ulong) type, read ? "read" : "purge", + (ulong) space_id, (ulong) page_no, (ulong) flags); + ut_error; + } +} + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data is owned by the externally stored field; +in a rollback we may have the additional condition that the field must +not be inherited. */ +UNIV_INTERN +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + page_zip_des_t* page_zip, /*!< in: compressed page corresponding + to rec, or NULL if rec == NULL */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* local_mtr __attribute__((unused))) /*!< in: mtr + containing the latch to data and an + X-latch to the index tree */ +{ + page_t* page; + const ulint space_id = mach_read_from_4( + field_ref + BTR_EXTERN_SPACE_ID); + const ulint start_page = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ulint rec_zip_size = dict_table_zip_size(index->table); + ulint ext_zip_size; + ulint page_no; + ulint next_page_no; + mtr_t mtr; + + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains_page(local_mtr, field_ref, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); + + if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* In the rollback, we may encounter a clustered index + record with some unwritten off-page columns. There is + nothing to free then. */ + ut_a(rb_ctx != RB_NONE); + return; + } + + ut_ad(space_id == index->space); + + if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) { + ext_zip_size = fil_space_get_zip_size(space_id); + /* This must be an undo log record in the system tablespace, + that is, in row_purge_upd_exist_or_extern(). 
+ Currently, externally stored records are stored in the + same tablespace as the referring records. */ + ut_ad(!page_get_space_id(page_align(field_ref))); + ut_ad(!rec); + ut_ad(!page_zip); + } else { + ext_zip_size = rec_zip_size; + } + + if (!rec) { + /* This is a call from row_purge_upd_exist_or_extern(). */ + ut_ad(!page_zip); + rec_zip_size = 0; + } + +#ifdef UNIV_BLOB_DEBUG + if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) + && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) { + /* This off-page column will be freed. + Check that no references remain. */ + + btr_blob_dbg_t b; + + b.blob_page_no = start_page; + + if (rec) { + /* Remove the reference from the record to the + BLOB. If the BLOB were not freed, the + reference would be removed when the record is + removed. Freeing the BLOB will overwrite the + BTR_EXTERN_PAGE_NO in the field_ref of the + record with FIL_NULL, which would make the + btr_blob_dbg information inconsistent with the + record. */ + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + btr_blob_dbg_rbt_delete(index, &b, "free"); + } + + btr_blob_dbg_assert_empty(index, b.blob_page_no); + } +#endif /* UNIV_BLOB_DEBUG */ + + for (;;) { +#ifdef UNIV_SYNC_DEBUG + buf_block_t* rec_block; +#endif /* UNIV_SYNC_DEBUG */ + buf_block_t* ext_block; + + mtr_start(&mtr); + +#ifdef UNIV_SYNC_DEBUG + rec_block = +#endif /* UNIV_SYNC_DEBUG */ + buf_page_get(page_get_space_id(page_align(field_ref)), + rec_zip_size, + page_get_page_no(page_align(field_ref)), + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK); + page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); + + if (/* There is no external storage data */ + page_no == FIL_NULL + /* This field does not own the externally stored field */ + || (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_OWNER_FLAG) + /* Rollback and inherited field */ + || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY) + && (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_INHERITED_FLAG))) { + + /* Do not free */ + mtr_commit(&mtr); + + return; + } + + if (page_no == start_page && dict_index_is_online_ddl(index)) { + row_log_table_blob_free(index, start_page); + } + + ext_block = buf_page_get(space_id, ext_zip_size, page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE); + page = buf_block_get_frame(ext_block); + + if (ext_zip_size) { + /* Note that page_zip will be NULL + in row_purge_upd_exist_or_extern(). 
*/ + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + default: + ut_error; + } + next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT); + + btr_page_free_low(index, ext_block, 0, &mtr); + + if (page_zip != NULL) { + mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO, + next_page_no); + mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4, + 0); + page_zip_write_blob_ptr(page_zip, rec, index, + offsets, i, &mtr); + } else { + mlog_write_ulint(field_ref + + BTR_EXTERN_PAGE_NO, + next_page_no, + MLOG_4BYTES, &mtr); + mlog_write_ulint(field_ref + + BTR_EXTERN_LEN + 4, 0, + MLOG_4BYTES, &mtr); + } + } else { + ut_a(!page_zip); + btr_check_blob_fil_page_type(space_id, page_no, page, + FALSE); + + next_page_no = mach_read_from_4( + page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO); + + /* We must supply the page level (= 0) as an argument + because we did not store it on the page (we save the + space overhead from an index page header). */ + + btr_page_free_low(index, ext_block, 0, &mtr); + + mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO, + next_page_no, + MLOG_4BYTES, &mtr); + /* Zero out the BLOB length. If the server + crashes during the execution of this function, + trx_rollback_or_clean_all_recovered() could + dereference the half-deleted BLOB, fetching a + wrong prefix for the BLOB. */ + mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4, + 0, + MLOG_4BYTES, &mtr); + } + + /* Commit mtr and release the BLOB block to save memory. */ + btr_blob_free(ext_block, TRUE, &mtr); + } +} + +/***********************************************************//** +Frees the externally stored fields for a record. */ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in/out: record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)); + /* Free possible externally stored fields in the record */ + + ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_free_externally_stored_field( + index, btr_rec_get_field_ref(rec, offsets, i), + rec, offsets, page_zip, i, rb_ctx, mtr); + } + } +} + +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. 
*/ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)); + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const upd_field_t* ufield = upd_get_nth_field(update, i); + + if (rec_offs_nth_extern(offsets, ufield->field_no)) { + ulint len; + byte* data = rec_get_nth_field( + rec, offsets, ufield->field_no, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, page_zip, + ufield->field_no, rb_ctx, mtr); + } + } +} + +/*******************************************************************//** +Copies the prefix of an uncompressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. +@return number of bytes written to buf */ +static +ulint +btr_copy_blob_prefix( +/*=================*/ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ + ulint space_id,/*!< in: space id of the BLOB pages */ + ulint page_no,/*!< in: page number of the first BLOB page */ + ulint offset) /*!< in: offset on the first BLOB page */ +{ + ulint copied_len = 0; + + for (;;) { + mtr_t mtr; + buf_block_t* block; + const page_t* page; + const byte* blob_header; + ulint part_len; + ulint copy_len; + + mtr_start(&mtr); + + block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + page = buf_block_get_frame(block); + + btr_check_blob_fil_page_type(space_id, page_no, page, TRUE); + + blob_header = page + offset; + part_len = btr_blob_get_part_len(blob_header); + copy_len = ut_min(part_len, len - copied_len); + + memcpy(buf + copied_len, + blob_header + BTR_BLOB_HDR_SIZE, copy_len); + copied_len += copy_len; + + page_no = btr_blob_get_next_page_no(blob_header); + + mtr_commit(&mtr); + + if (page_no == FIL_NULL || copy_len != part_len) { + UNIV_MEM_ASSERT_RW(buf, copied_len); + return(copied_len); + } + + /* On other BLOB pages except the first the BLOB header + always is at the page data start: */ + + offset = FIL_PAGE_DATA; + + ut_ad(copied_len <= len); + } +} + +/*******************************************************************//** +Copies the prefix of a compressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. 
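The uncompressed variant above is, structurally, a walk over a singly linked
list of pages. A standalone sketch of that traversal against an in-memory
model (model_page_t, get_page() and MODEL_FIL_NULL are illustrative
stand-ins, not InnoDB names):
@code
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define MODEL_FIL_NULL 0xFFFFFFFFU

    typedef struct {
        uint32_t        part_len;   // data bytes on this page
        uint32_t        next_page;  // or MODEL_FIL_NULL at the end
        const uint8_t*  data;
    } model_page_t;

    // assumed to map a page number to its parsed contents
    extern const model_page_t* get_page(uint32_t page_no);

    size_t copy_prefix(uint8_t* buf, size_t len, uint32_t page_no)
    {
        size_t  copied = 0;

        while (page_no != MODEL_FIL_NULL && copied < len) {
            const model_page_t* p = get_page(page_no);
            size_t              n = p->part_len;

            if (n > len - copied) {
                n = len - copied;
            }

            memcpy(buf + copied, p->data, n);
            copied += n;

            if (n != p->part_len) {
                break;  // buf is full
            }

            page_no = p->next_page;
        }

        return(copied);
    }
@endcode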
+@return number of bytes written to buf */ +static +ulint +btr_copy_zblob_prefix( +/*==================*/ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ + ulint zip_size,/*!< in: compressed BLOB page size */ + ulint space_id,/*!< in: space id of the BLOB pages */ + ulint page_no,/*!< in: page number of the first BLOB page */ + ulint offset) /*!< in: offset on the first BLOB page */ +{ + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + mem_heap_t* heap; + int err; + z_stream d_stream; + + d_stream.next_out = buf; + d_stream.avail_out = static_cast<uInt>(len); + d_stream.next_in = Z_NULL; + d_stream.avail_in = 0; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); + + ut_ad(ut_is_2pow(zip_size)); + ut_ad(zip_size >= UNIV_ZIP_SIZE_MIN); + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + ut_ad(space_id); + + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + + for (;;) { + buf_page_t* bpage; + ulint next_page_no; + + /* There is no latch on bpage directly. Instead, + bpage is protected by the B-tree page latch that + is being held on the clustered index record, or, + in row_merge_copy_blobs(), by an exclusive table lock. */ + bpage = buf_page_get_zip(space_id, zip_size, page_no); + + if (UNIV_UNLIKELY(!bpage)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot load" + " compressed BLOB" + " page %lu space %lu\n", + (ulong) page_no, (ulong) space_id); + goto func_exit; + } + + if (UNIV_UNLIKELY + (fil_page_get_type(bpage->zip.data) != page_type)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Unexpected type %lu of" + " compressed BLOB" + " page %lu space %lu\n", + (ulong) fil_page_get_type(bpage->zip.data), + (ulong) page_no, (ulong) space_id); + ut_ad(0); + goto end_of_blob; + } + + next_page_no = mach_read_from_4(bpage->zip.data + offset); + + if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) { + /* When the BLOB begins at page header, + the compressed data payload does not + immediately follow the next page pointer. 
*/ + offset = FIL_PAGE_DATA; + } else { + offset += 4; + } + + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = static_cast<uInt>(zip_size - offset); + + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + if (!d_stream.avail_out) { + goto end_of_blob; + } + break; + case Z_STREAM_END: + if (next_page_no == FIL_NULL) { + goto end_of_blob; + } + /* fall through */ + default: +inflate_error: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: inflate() of" + " compressed BLOB" + " page %lu space %lu returned %d (%s)\n", + (ulong) page_no, (ulong) space_id, + err, d_stream.msg); + case Z_BUF_ERROR: + goto end_of_blob; + } + + if (next_page_no == FIL_NULL) { + if (!d_stream.avail_in) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unexpected end of" + " compressed BLOB" + " page %lu space %lu\n", + (ulong) page_no, + (ulong) space_id); + } else { + err = inflate(&d_stream, Z_FINISH); + switch (err) { + case Z_STREAM_END: + case Z_BUF_ERROR: + break; + default: + goto inflate_error; + } + } + +end_of_blob: + buf_page_release_zip(bpage); + goto func_exit; + } + + buf_page_release_zip(bpage); + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + page_no = next_page_no; + offset = FIL_PAGE_NEXT; + page_type = FIL_PAGE_TYPE_ZBLOB2; + } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); + return(d_stream.total_out); +} + +/*******************************************************************//** +Copies the prefix of an externally stored field of a record. The +clustered index record that points to this BLOB must be protected by a +lock or a page latch. +@return number of bytes written to buf */ +static +ulint +btr_copy_externally_stored_field_prefix_low( +/*========================================*/ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint space_id,/*!< in: space id of the first BLOB page */ + ulint page_no,/*!< in: page number of the first BLOB page */ + ulint offset) /*!< in: offset on the first BLOB page */ +{ + if (UNIV_UNLIKELY(len == 0)) { + return(0); + } + + if (zip_size) { + return(btr_copy_zblob_prefix(buf, len, zip_size, + space_id, page_no, offset)); + } else { + return(btr_copy_blob_prefix(buf, len, space_id, + page_no, offset)); + } +} + +/*******************************************************************//** +Copies the prefix of an externally stored field of a record. The +clustered index record must be protected by a lock or a page latch. 
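The compressed path above feeds zlib one page-sized chunk at a time until the
output buffer is full or the stream ends. A minimal standalone sketch of that
streaming pattern using only the documented zlib API (next_chunk() is an
illustrative stand-in for following the page chain):
@code
    #include <string.h>
    #include <zlib.h>

    // assumed to yield successive compressed chunks; returns 0
    // when the chain is exhausted
    extern int next_chunk(const unsigned char** p, unsigned* n);

    // Returns the number of bytes written to buf.
    unsigned long inflate_chunks(unsigned char* buf, unsigned len)
    {
        z_stream                d;
        int                     err = Z_OK;
        const unsigned char*    p;
        unsigned                n;

        memset(&d, 0, sizeof(d));
        d.next_out = buf;
        d.avail_out = len;

        if (inflateInit(&d) != Z_OK) {
            return(0);
        }

        while (err == Z_OK && d.avail_out > 0
               && next_chunk(&p, &n)) {
            d.next_in = (Bytef*) p;
            d.avail_in = n;
            err = inflate(&d, Z_NO_FLUSH);
        }

        inflateEnd(&d);
        return(d.total_out);
    }
@endcode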
+@return the length of the copied field, or 0 if the column was being +or has been deleted */ +UNIV_INTERN +ulint +btr_copy_externally_stored_field_prefix( +/*====================================*/ + byte* buf, /*!< out: the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint local_len)/*!< in: length of data, in bytes */ +{ + ulint space_id; + ulint page_no; + ulint offset; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. */ + + return(0); + } + + space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + len - local_len, + zip_size, + space_id, page_no, + offset)); +} + +/*******************************************************************//** +Copies an externally stored field of a record to mem heap. The +clustered index record must be protected by a lock or a page latch. +@return the whole field copied to heap */ +UNIV_INTERN +byte* +btr_copy_externally_stored_field( +/*=============================*/ + ulint* len, /*!< out: length of the whole field */ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint local_len,/*!< in: length of data */ + mem_heap_t* heap) /*!< in: mem heap */ +{ + ulint space_id; + ulint page_no; + ulint offset; + ulint extern_len; + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4); + + buf = (byte*) mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + space_id, + page_no, offset); + + return(buf); +} + +/*******************************************************************//** +Copies an externally stored field of a record to mem heap. 
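The variant below first checks whether the trailing 20-byte pointer is still
all zero, which means the BLOB was never written. A standalone sketch of that
test (field_ref_zero is a real zero-filled buffer in InnoDB; here it is
modelled with a local array):
@code
    #include <string.h>

    enum { FIELD_REF_SIZE = 20 };   // BTR_EXTERN_FIELD_REF_SIZE

    // returns 1 if the column's BLOB pointer was never written
    int blob_ref_is_unset(const unsigned char* data, size_t local_len)
    {
        static const unsigned char zero[FIELD_REF_SIZE];

        return(!memcmp(data + local_len - FIELD_REF_SIZE,
                       zero, FIELD_REF_SIZE));
    }
@endcode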
+@return the field copied to heap, or NULL if the field is incomplete */ +UNIV_INTERN +byte* +btr_rec_copy_externally_stored_field( +/*=================================*/ + const rec_t* rec, /*!< in: record in a clustered index; + must be protected by a lock or a page latch */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint no, /*!< in: field number */ + ulint* len, /*!< out: length of the field */ + mem_heap_t* heap) /*!< in: mem heap */ +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + recv_recovery_rollback_active() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return(NULL); + } + + return(btr_copy_externally_stored_field(len, data, + zip_size, local_len, heap)); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/btr/btr0pcur.cc b/storage/xtradb/btr/btr0pcur.cc new file mode 100644 index 00000000000..28a60de6ba2 --- /dev/null +++ b/storage/xtradb/btr/btr0pcur.cc @@ -0,0 +1,610 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0pcur.cc +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#include "btr0pcur.h" + +#ifdef UNIV_NONINL +#include "btr0pcur.ic" +#endif + +#include "ut0byte.h" +#include "rem0cmp.h" +#include "trx0trx.h" +#include "srv0srv.h" +/**************************************************************//** +Allocates memory for a persistent cursor object and initializes the cursor. 
+@return own: persistent cursor */ +UNIV_INTERN +btr_pcur_t* +btr_pcur_create_for_mysql(void) +/*============================*/ +{ + btr_pcur_t* pcur; + + pcur = (btr_pcur_t*) mem_alloc(sizeof(btr_pcur_t)); + + pcur->btr_cur.index = NULL; + btr_pcur_init(pcur); + pcur->btr_cur.tree_height = ULINT_UNDEFINED; + + return(pcur); +} + +/**************************************************************//** +Resets a persistent cursor object, freeing ::old_rec_buf if it is +allocated and resetting the other members to their initial values. */ +UNIV_INTERN +void +btr_pcur_reset( +/*===========*/ + btr_pcur_t* cursor) /*!< in, out: persistent cursor */ +{ + if (cursor->old_rec_buf != NULL) { + + mem_free(cursor->old_rec_buf); + + cursor->old_rec_buf = NULL; + } + + cursor->btr_cur.index = NULL; + cursor->btr_cur.page_cur.rec = NULL; + cursor->old_rec = NULL; + cursor->old_n_fields = 0; + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->latch_mode = BTR_NO_LATCHES; + cursor->pos_state = BTR_PCUR_NOT_POSITIONED; +} + +/**************************************************************//** +Frees the memory for a persistent cursor object. */ +UNIV_INTERN +void +btr_pcur_free_for_mysql( +/*====================*/ + btr_pcur_t* cursor) /*!< in, own: persistent cursor */ +{ + btr_pcur_reset(cursor); + mem_free(cursor); +} + +/**************************************************************//** +The position of the cursor is stored by taking an initial segment of the +record the cursor is positioned on, before, or after, and copying it to the +cursor data structure, or just setting a flag if the cursor is before the +first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the +page where the cursor is positioned must not be empty if the index tree is +not totally empty! 
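The stored position is later revalidated through the block's modify clock
(see btr_pcur_restore_position_func() below). As a standalone sketch of that
optimistic scheme, with illustrative names in place of the buf_block_t
machinery:
@code
    #include <stdint.h>

    typedef struct {
        uint64_t        modify_clock;   // bumped on each page change
    } model_block_t;

    typedef struct {
        model_block_t*  block;
        uint64_t        saved_clock;
    } saved_pos_t;

    void save_pos(saved_pos_t* pos, model_block_t* block)
    {
        pos->block = block;
        pos->saved_clock = block->modify_clock;
    }

    // nonzero only if the page is provably unchanged; otherwise the
    // caller must fall back to a full search from the saved record
    int restore_optimistically(const saved_pos_t* pos)
    {
        return(pos->block->modify_clock == pos->saved_clock);
    }
@endcode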
*/ +UNIV_INTERN +void +btr_pcur_store_position( +/*====================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + buf_block_t* block; + rec_t* rec; + dict_index_t* index; + page_t* page; + ulint offs; + + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + block = btr_pcur_get_block(cursor); + + SRV_CORRUPT_TABLE_CHECK(block, return;); + + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + page_cursor = btr_pcur_get_page_cur(cursor); + + rec = page_cur_get_rec(page_cursor); + page = page_align(rec); + offs = page_offset(rec); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + if (page_is_empty(page)) { + /* It must be an empty index tree; NOTE that in this case + we do not store the modify_clock, but always do a search + if we restore the cursor position */ + + ut_a(btr_page_get_next(page, mtr) == FIL_NULL); + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); + ut_ad(page_is_leaf(page)); + ut_ad(page_get_page_no(page) == index->page); + + cursor->old_stored = BTR_PCUR_OLD_STORED; + + if (page_rec_is_supremum_low(offs)) { + + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + } else { + cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE; + } + + return; + } + + if (page_rec_is_supremum_low(offs)) { + + rec = page_rec_get_prev(rec); + + cursor->rel_pos = BTR_PCUR_AFTER; + + } else if (page_rec_is_infimum_low(offs)) { + + rec = page_rec_get_next(rec); + + cursor->rel_pos = BTR_PCUR_BEFORE; + } else { + cursor->rel_pos = BTR_PCUR_ON; + } + + cursor->old_stored = BTR_PCUR_OLD_STORED; + cursor->old_rec = dict_index_copy_rec_order_prefix( + index, rec, &cursor->old_n_fields, + &cursor->old_rec_buf, &cursor->buf_size); + + cursor->block_when_stored = block; + cursor->modify_clock = buf_block_get_modify_clock(block); +} + +/**************************************************************//** +Copies the stored position of a pcur to another pcur. */ +UNIV_INTERN +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is + copied */ +{ + if (pcur_receive->old_rec_buf) { + mem_free(pcur_receive->old_rec_buf); + } + + ut_memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t)); + + if (pcur_donate->old_rec_buf) { + + pcur_receive->old_rec_buf = (byte*) + mem_alloc(pcur_donate->buf_size); + + ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf, + pcur_donate->buf_size); + pcur_receive->old_rec = pcur_receive->old_rec_buf + + (pcur_donate->old_rec - pcur_donate->old_rec_buf); + } + + pcur_receive->old_n_fields = pcur_donate->old_n_fields; +} + +/**************************************************************//** +Restores the stored position of a persistent cursor bufferfixing the page and +obtaining the specified latches. If the cursor position was saved when the +(1) cursor was positioned on a user record: this function restores the position +to the last record LESS OR EQUAL to the stored record; +(2) cursor was positioned on a page infimum record: restores the position to +the last record LESS than the user record which was the successor of the page +infimum; +(3) cursor was positioned on the page supremum: restores to the first record +GREATER than the user record which was the predecessor of the supremum. 
+(4) cursor was positioned before the first or after the last in an empty tree: +restores to before first or after the last in the tree. +@return TRUE if the cursor position was stored when it was on a user +record and it can be restored on a user record whose ordering fields +are identical to the ones of the original user record */ +UNIV_INTERN +ibool +btr_pcur_restore_position_func( +/*===========================*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in: detached persistent cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + dtuple_t* tuple; + ulint mode; + ulint old_mode; + mem_heap_t* heap; + + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + if (UNIV_UNLIKELY + (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) { + + /* In these cases we do not try an optimistic restoration, + but always do a search */ + + btr_cur_open_at_index_side( + cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE, + index, latch_mode, + btr_pcur_get_btr_cur(cursor), 0, mtr); + + cursor->latch_mode = latch_mode; + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->block_when_stored = btr_pcur_get_block(cursor); + + return(FALSE); + } + + ut_a(cursor->old_rec); + ut_a(cursor->old_n_fields); + + if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF) + || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) { + /* Try optimistic restoration. */ + + if (buf_page_optimistic_get(latch_mode, + cursor->block_when_stored, + cursor->modify_clock, + file, line, mtr)) { + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->latch_mode = latch_mode; + + buf_block_dbg_add_level( + btr_pcur_get_block(cursor), + dict_index_is_ibuf(index) + ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + + if (cursor->rel_pos == BTR_PCUR_ON) { +#ifdef UNIV_DEBUG + const rec_t* rec; + const ulint* offsets1; + const ulint* offsets2; + rec = btr_pcur_get_rec(cursor); + + heap = mem_heap_create(256); + offsets1 = rec_get_offsets( + cursor->old_rec, index, NULL, + cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, NULL, + cursor->old_n_fields, &heap); + + ut_ad(!cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, + index)); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ + return(TRUE); + } + /* This is the same record as stored, + may need to be adjusted for BTR_PCUR_BEFORE/AFTER, + depending on search mode and direction. 
*/ + if (btr_pcur_is_on_user_rec(cursor)) { + cursor->pos_state + = BTR_PCUR_IS_POSITIONED_OPTIMISTIC; + } + return(FALSE); + } + } + + /* If optimistic restoration did not succeed, open the cursor anew */ + + heap = mem_heap_create(256); + + tuple = dict_index_build_data_tuple(index, cursor->old_rec, + cursor->old_n_fields, heap); + + /* Save the old search mode of the cursor */ + old_mode = cursor->search_mode; + + switch (cursor->rel_pos) { + case BTR_PCUR_ON: + mode = PAGE_CUR_LE; + break; + case BTR_PCUR_AFTER: + mode = PAGE_CUR_G; + break; + case BTR_PCUR_BEFORE: + mode = PAGE_CUR_L; + break; + default: + ut_error; + mode = 0; + } + + btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode, + cursor, 0, file, line, mtr); + + /* Restore the old search mode */ + cursor->search_mode = old_mode; + + switch (cursor->rel_pos) { + case BTR_PCUR_ON: + if (btr_pcur_is_on_user_rec(cursor) + && !cmp_dtuple_rec( + tuple, btr_pcur_get_rec(cursor), + rec_get_offsets(btr_pcur_get_rec(cursor), + index, NULL, + ULINT_UNDEFINED, &heap))) { + + /* We have to store the NEW value for + the modify clock, since the cursor can + now be on a different page! But we can + retain the value of old_rec */ + + cursor->block_when_stored = + btr_pcur_get_block(cursor); + cursor->modify_clock = + buf_block_get_modify_clock( + cursor->block_when_stored); + cursor->old_stored = BTR_PCUR_OLD_STORED; + + mem_heap_free(heap); + + return(TRUE); + } +#ifdef UNIV_DEBUG + /* fall through */ + case BTR_PCUR_BEFORE: + case BTR_PCUR_AFTER: + break; + default: + ut_error; +#endif /* UNIV_DEBUG */ + } + + mem_heap_free(heap); + + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. */ + + btr_pcur_store_position(cursor, mtr); + + return(FALSE); +} + +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. Releases the +latch on the current page, and bufferunfixes it. Note that there must not be +modifications on the current page, as then the x-latch can be released only in +mtr_commit. 
*/ +UNIV_INTERN +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint next_page_no; + ulint space; + ulint zip_size; + page_t* page; + buf_block_t* next_block; + page_t* next_page; + + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_after_last_on_page(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + page = btr_pcur_get_page(cursor); + next_page_no = btr_page_get_next(page, mtr); + space = buf_block_get_space(btr_pcur_get_block(cursor)); + zip_size = buf_block_get_zip_size(btr_pcur_get_block(cursor)); + + ut_ad(next_page_no != FIL_NULL); + + next_block = btr_block_get(space, zip_size, next_page_no, + cursor->latch_mode, + btr_pcur_get_btr_cur(cursor)->index, mtr); + next_page = buf_block_get_frame(next_block); + + SRV_CORRUPT_TABLE_CHECK(next_page, + { + btr_leaf_page_release(btr_pcur_get_block(cursor), + cursor->latch_mode, mtr); + btr_pcur_get_page_cur(cursor)->block = 0; + btr_pcur_get_page_cur(cursor)->rec = 0; + + return; + }); + +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == buf_block_get_page_no(btr_pcur_get_block(cursor))); +#endif /* UNIV_BTR_DEBUG */ + next_block->check_index_page_at_flush = TRUE; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + cursor->latch_mode, mtr); + + page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor)); + + page_check_dir(next_page); +} + +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record of the page. +Commits mtr. Note that to prevent a possible deadlock, the operation +first stores the position of the cursor, commits mtr, acquires the necessary +latches and restores the cursor position again before returning. The +alphabetical position of the cursor is guaranteed to be sensible on +return, but it may happen that the cursor is not positioned on the last +record of any page, because the structure of the tree may have changed +during the time when the cursor had no latches. 
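The deadlock avoidance described here follows the usual rule that page
latches are only taken left to right. As a standalone analogy with plain
pthread mutexes (the names and the two-mutex model are illustrative, not the
real latching API):
@code
    #include <pthread.h>

    pthread_mutex_t prev_page = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t curr_page = PTHREAD_MUTEX_INITIALIZER;

    // A thread holding curr_page that wants prev_page must release
    // first and reacquire in the shared order, exactly as the cursor
    // stores its position, commits the mtr, and then restores.
    void step_backward(void)
    {
        pthread_mutex_unlock(&curr_page);
        pthread_mutex_lock(&prev_page);
        pthread_mutex_lock(&curr_page);
    }
@endcode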
*/ +UNIV_INTERN +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first + record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint prev_page_no; + page_t* page; + buf_block_t* prev_block; + ulint latch_mode; + ulint latch_mode2; + + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_before_first_on_page(cursor)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr)); + + latch_mode = cursor->latch_mode; + + if (latch_mode == BTR_SEARCH_LEAF) { + + latch_mode2 = BTR_SEARCH_PREV; + + } else if (latch_mode == BTR_MODIFY_LEAF) { + + latch_mode2 = BTR_MODIFY_PREV; + } else { + latch_mode2 = 0; /* To eliminate compiler warning */ + ut_error; + } + + btr_pcur_store_position(cursor, mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + btr_pcur_restore_position(latch_mode2, cursor, mtr); + + page = btr_pcur_get_page(cursor); + + prev_page_no = btr_page_get_prev(page, mtr); + + if (prev_page_no == FIL_NULL) { + } else if (btr_pcur_is_before_first_on_page(cursor)) { + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + latch_mode, mtr); + + page_cur_set_after_last(prev_block, + btr_pcur_get_page_cur(cursor)); + } else { + + /* The repositioned cursor did not end on an infimum record on + a page. Cursor repositioning acquired a latch also on the + previous page, but we do not need the latch: release it. */ + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(prev_block, latch_mode, mtr); + } + + cursor->latch_mode = latch_mode; + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +UNIV_INTERN +ibool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + if (btr_pcur_is_before_first_on_page(cursor)) { + + if (btr_pcur_is_before_first_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_backward_from_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_prev_on_page(cursor); + + return(TRUE); +} + +/**************************************************************//** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. */ +UNIV_INTERN +void +btr_pcur_open_on_user_rec_func( +/*===========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ... 
*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor, + file, line, mtr); + + if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) { + + if (btr_pcur_is_after_last_on_page(cursor)) { + + btr_pcur_move_to_next_user_rec(cursor, mtr); + } + } else { + ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L)); + + /* Not implemented yet */ + + ut_error; + } +} diff --git a/storage/xtradb/btr/btr0sea.cc b/storage/xtradb/btr/btr0sea.cc new file mode 100644 index 00000000000..ac5e9aec67b --- /dev/null +++ b/storage/xtradb/btr/btr0sea.cc @@ -0,0 +1,2097 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file btr/btr0sea.cc +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "btr0sea.h" +#ifdef UNIV_NONINL +#include "btr0sea.ic" +#endif + +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "ha0ha.h" +#include "srv0srv.h" +/** Flag: has the search system been enabled? +Protected by btr_search_latch. */ +UNIV_INTERN char btr_search_enabled = TRUE; + +/** Number of adaptive hash index partitions */ +UNIV_INTERN ulint btr_search_index_num; + +/** A dummy variable to fool the compiler */ +UNIV_INTERN ulint btr_search_this_is_zero = 0; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +UNIV_INTERN ulint btr_search_n_succ = 0; +/** Number of failed adaptive hash index lookups */ +UNIV_INTERN ulint btr_search_n_hash_fail = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/** padding to prevent other memory update +hotspots from residing on the same memory +cache line as btr_search_latch */ +UNIV_INTERN byte btr_sea_pad1[64]; + +/** Array of latches protecting individual AHI partitions. The latches +protect: (1) positions of records on those pages where a hash index from the +corresponding AHI partition has been built. 
+NOTE: They do not protect values of non-ordering fields within a record from
+being updated in-place! We can use fact (1) to perform unique searches on
+indexes. */
+
+UNIV_INTERN prio_rw_lock_t* btr_search_latch_arr;
+
+/** padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte btr_sea_pad2[64];
+
+/** The adaptive hash index */
+UNIV_INTERN btr_search_sys_t* btr_search_sys;
+
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register btr_search_sys with performance schema */
+UNIV_INTERN mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+/** If the number of records on the page divided by this parameter
+would have been successfully accessed using a hash index, the index
+is then built on the page, assuming the global limit has been reached */
+#define BTR_SEARCH_PAGE_BUILD_LIMIT 16
+
+/** The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+#define BTR_SEARCH_BUILD_LIMIT 100
+
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+ dict_index_t* index, /*!< in: index for which to build, or NULL if
+ not known */
+ buf_block_t* block, /*!< in: index page, s- or x-latched */
+ ulint n_fields,/*!< in: hash this many full fields */
+ ulint n_bytes,/*!< in: hash this many bytes from the next
+ field */
+ ibool left_side);/*!< in: hash for searches from left side? */
+
+/*****************************************************************//**
+This function should be called before reserving any btr search mutex, if
+the intended operation might add nodes to the search system hash table.
+Because of the latching order, once we have reserved the btr search system
+latch, we cannot allocate a free frame from the buffer pool. Checks that
+there is a free buffer frame allocated for the hash table heap in the btr
+search system. If not, allocates a free frame for the heap. This check makes
+it probable that, when we have reserved the btr search system latch and need
+to allocate a new node for the hash table, the allocation will succeed.
+However, the check does not guarantee success. */
+static
+void
+btr_search_check_free_space_in_heap(
+/*================================*/
+ dict_index_t* index)
+{
+ hash_table_t* table;
+ mem_heap_t* heap;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ table = btr_search_get_hash_table(index);
+
+ heap = table->heap;
+
+ /* Note that we peek the value of heap->free_block without reserving
+ the latch: this is ok, because we will not guarantee that there will
+ be enough free space in the hash table.
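+ If two threads run this check concurrently, both may allocate a
+ block; the loser of the race below simply hands its block back to
+ the buffer pool, so peeking without the latch is harmless.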
*/ + + if (heap->free_block == NULL) { + buf_block_t* block = buf_block_alloc(NULL); + + rw_lock_x_lock(btr_search_get_latch(index)); + + if (heap->free_block == NULL) { + heap->free_block = block; + } else { + buf_block_free(block); + } + + rw_lock_x_unlock(btr_search_get_latch(index)); + } +} + +/*****************************************************************//** +Creates and initializes the adaptive search system at a database start. */ +UNIV_INTERN +void +btr_search_sys_create( +/*==================*/ + ulint hash_size) /*!< in: hash index hash table size */ +{ + ulint i; + + /* PS bug lp:1018264 - Multiple hash index partitions causes overly + large hash index: When multiple adaptive hash index partitions are + specified, _each_ partition was being created with hash_size which + should be 1/64 of the total size of all buffer pools which is + incorrect and can cause overly high memory usage. hash_size + should be representing the _total_ size of all partitions, not the + individual size of each partition. */ + hash_size /= btr_search_index_num; + + /* We allocate the search latch from dynamic memory: + see above at the global variable definition */ + + /* btr_search_index_num is constrained to machine word size for + historical reasons. This limitation can be easily removed later. */ + + btr_search_latch_arr = (prio_rw_lock_t *) + mem_alloc(sizeof(prio_rw_lock_t) * btr_search_index_num); + + btr_search_sys = (btr_search_sys_t*) + mem_alloc(sizeof(btr_search_sys_t)); + + btr_search_sys->hash_tables = (hash_table_t **) + mem_alloc(sizeof(hash_table_t *) * btr_search_index_num); + + for (i = 0; i < btr_search_index_num; i++) { + + rw_lock_create(btr_search_latch_key, + &btr_search_latch_arr[i], SYNC_SEARCH_SYS); + + btr_search_sys->hash_tables[i] + = ha_create(hash_size, 0, MEM_HEAP_FOR_BTR_SEARCH, 0); + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + btr_search_sys->hash_tables[i]->adaptive = TRUE; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + } +} + +/*****************************************************************//** +Frees the adaptive search system at a database shutdown. */ +UNIV_INTERN +void +btr_search_sys_free(void) +/*=====================*/ +{ + ulint i; + + for (i = 0; i < btr_search_index_num; i++) { + + rw_lock_free(&btr_search_latch_arr[i]); + + mem_heap_free(btr_search_sys->hash_tables[i]->heap); + + hash_table_free(btr_search_sys->hash_tables[i]); + + } + + mem_free(btr_search_latch_arr); + btr_search_latch_arr = NULL; + + mem_free(btr_search_sys->hash_tables); + + mem_free(btr_search_sys); + btr_search_sys = NULL; +} + +/********************************************************************//** +Set index->ref_count = 0 on all indexes of a table. */ +static +void +btr_search_disable_ref_count( +/*=========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + dict_index_t* index; + + ut_ad(mutex_own(&dict_sys->mutex)); + + for (index = dict_table_get_first_index(table); index; + index = dict_table_get_next_index(index)) { + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(btr_search_get_latch(index), + RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + index->search_info->ref_count = 0; + } +} + +/********************************************************************//** +Disable the adaptive hash search system and empty the index. 
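+In outline: take all partition X-latches and clear btr_search_enabled,
+reset search_info->ref_count on every index in the dictionary cache,
+detach all buffer blocks (block->index = NULL), and finally empty the
+hash tables and their heaps.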
*/ +UNIV_INTERN +void +btr_search_disable(void) +/*====================*/ +{ + dict_table_t* table; + ulint i; + + mutex_enter(&dict_sys->mutex); + btr_search_x_lock_all(); + + btr_search_enabled = FALSE; + + /* Clear the index->search_info->ref_count of every index in + the data dictionary cache. */ + for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + mutex_exit(&dict_sys->mutex); + + /* Set all block->index = NULL. */ + buf_pool_clear_hash_index(); + + /* Clear the adaptive hash index. */ + for (i = 0; i < btr_search_index_num; i++) { + hash_table_clear(btr_search_sys->hash_tables[i]); + mem_heap_empty(btr_search_sys->hash_tables[i]->heap); + } + + btr_search_x_unlock_all(); +} + +/********************************************************************//** +Enable the adaptive hash search system. */ +UNIV_INTERN +void +btr_search_enable(void) +/*====================*/ +{ + btr_search_x_lock_all(); + + btr_search_enabled = TRUE; + + btr_search_x_unlock_all(); +} + +/*****************************************************************//** +Creates and initializes a search info struct. +@return own: search info struct */ +UNIV_INTERN +btr_search_t* +btr_search_info_create( +/*===================*/ + mem_heap_t* heap) /*!< in: heap where created */ +{ + btr_search_t* info; + + info = (btr_search_t*) mem_heap_alloc(heap, sizeof(btr_search_t)); + +#ifdef UNIV_DEBUG + info->magic_n = BTR_SEARCH_MAGIC_N; +#endif /* UNIV_DEBUG */ + + info->ref_count = 0; + info->root_guess = NULL; + + info->hash_analysis = 0; + info->n_hash_potential = 0; + + info->last_hash_succ = FALSE; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_succ = 0; + info->n_hash_fail = 0; + info->n_patt_succ = 0; + info->n_searches = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + + /* Set some sensible values */ + info->n_fields = 1; + info->n_bytes = 0; + + info->left_side = TRUE; + + return(info); +} + +/*****************************************************************//** +Returns the value of ref_count. The value is protected by +the latch of the AHI partition corresponding to this index. +@return ref_count value. */ +UNIV_INTERN +ulint +btr_search_info_get_ref_count( +/*==========================*/ + btr_search_t* info, /*!< in: search info. */ + dict_index_t* index) /*!< in: index */ +{ + ulint ret; + + ut_ad(info); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(btr_search_get_latch(index)); + ret = info->ref_count; + rw_lock_s_unlock(btr_search_get_latch(index)); + + return(ret); +} + +/*********************************************************************//** +Updates the search info of an index about hash successes. NOTE that info +is NOT protected by any semaphore, to save CPU time! Do not assume its fields +are consistent. 
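+
+As an illustration of the recommended prefix (n_fields, n_bytes): on an
+index over columns (a, b), the pair n_fields == 1, n_bytes == 2 means
+"hash all of column a plus the first two bytes of column b" (example
+values only).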
*/ +static +void +btr_search_info_update_hash( +/*========================*/ + btr_search_t* info, /*!< in/out: search info */ + btr_cur_t* cursor) /*!< in: cursor which was just positioned */ +{ + dict_index_t* index = cursor->index; + ulint n_unique; + int cmp; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_ibuf(index)) { + /* So many deletes are performed on an insert buffer tree + that we do not consider a hash index useful on it: */ + + return; + } + + n_unique = dict_index_get_n_unique_in_tree(index); + + if (info->n_hash_potential == 0) { + + goto set_new_recomm; + } + + /* Test if the search would have succeeded using the recommended + hash prefix */ + + if (info->n_fields >= n_unique && cursor->up_match >= n_unique) { +increment_potential: + info->n_hash_potential++; + + return; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->low_match, cursor->low_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto set_new_recomm; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->up_match, cursor->up_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto increment_potential; + } + +set_new_recomm: + /* We have to set a new recommendation; skip the hash analysis + for a while to avoid unnecessary CPU time usage when there is no + chance for success */ + + info->hash_analysis = 0; + + cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes, + cursor->low_match, cursor->low_bytes); + if (cmp == 0) { + info->n_hash_potential = 0; + + /* For extra safety, we set some sensible values here */ + + info->n_fields = 1; + info->n_bytes = 0; + + info->left_side = TRUE; + + } else if (cmp > 0) { + info->n_hash_potential = 1; + + if (cursor->up_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match < cursor->up_match) { + + info->n_fields = cursor->low_match + 1; + info->n_bytes = 0; + } else { + info->n_fields = cursor->low_match; + info->n_bytes = cursor->low_bytes + 1; + } + + info->left_side = TRUE; + } else { + info->n_hash_potential = 1; + + if (cursor->low_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match > cursor->up_match) { + + info->n_fields = cursor->up_match + 1; + info->n_bytes = 0; + } else { + info->n_fields = cursor->up_match; + info->n_bytes = cursor->up_bytes + 1; + } + + info->left_side = FALSE; + } +} + +/*********************************************************************//** +Updates the block search info on hash successes. NOTE that info and +block->n_hash_helps, n_fields, n_bytes, side are NOT protected by any +semaphore, to save CPU time! Do not assume the fields are consistent. 
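+
+Worked example of the thresholds defined above: with
+BTR_SEARCH_PAGE_BUILD_LIMIT == 16 and BTR_SEARCH_BUILD_LIMIT == 100, a
+page holding 160 user records is recommended for hashing only after it
+has seen block->n_hash_helps > 160 / 16 == 10 suitable accesses and the
+index-wide info->n_hash_potential has reached 100.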
+@return TRUE if building a (new) hash index on the block is recommended */ +static +ibool +btr_search_update_block_hash_info( +/*==============================*/ + btr_search_t* info, /*!< in: search info */ + buf_block_t* block, /*!< in: buffer block */ + btr_cur_t* cursor __attribute__((unused))) + /*!< in: cursor */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), + RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), + RW_LOCK_EX)); + ut_ad(rw_lock_own(&block->lock, RW_LOCK_SHARED) + || rw_lock_own(&block->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(cursor); + + info->last_hash_succ = FALSE; + + ut_a(buf_block_state_valid(block)); + ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); + + if ((block->n_hash_helps > 0) + && (info->n_hash_potential > 0) + && (block->n_fields == info->n_fields) + && (block->n_bytes == info->n_bytes) + && (block->left_side == info->left_side)) { + + if ((block->index) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + + /* The search would presumably have succeeded using + the hash index */ + + info->last_hash_succ = TRUE; + } + + block->n_hash_helps++; + } else { + block->n_hash_helps = 1; + block->n_fields = info->n_fields; + block->n_bytes = info->n_bytes; + block->left_side = info->left_side; + } + +#ifdef UNIV_DEBUG + if (cursor->index->table->does_not_fit_in_memory) { + block->n_hash_helps = 0; + } +#endif /* UNIV_DEBUG */ + + if ((block->n_hash_helps > page_get_n_recs(block->frame) + / BTR_SEARCH_PAGE_BUILD_LIMIT) + && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) { + + if ((!block->index) + || (block->n_hash_helps + > 2 * page_get_n_recs(block->frame)) + || (block->n_fields != block->curr_n_fields) + || (block->n_bytes != block->curr_n_bytes) + || (block->left_side != block->curr_left_side)) { + + /* Build a new hash index on the page */ + + return(TRUE); + } + } + + return(FALSE); +} + +/*********************************************************************//** +Updates a hash node reference when it has been unsuccessfully used in a +search which could have succeeded with the used hash parameters. This can +happen because when building a hash index for a page, we do not check +what happens at page boundaries, and therefore there can be misleading +hash nodes. Also, collisions in the fold value can lead to misleading +references. This function lazily fixes these imperfections in the hash +index. 
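+
+For example, if records r1 and r2 on different pages fold to the same
+value, a search that the hash index guides to r1 when r2 was wanted ends
+with cursor->flag == BTR_CUR_HASH_FAIL; this function then re-points the
+hash node for that fold value at the record the B-tree search found.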
*/ +static +void +btr_search_update_hash_ref( +/*=======================*/ + btr_search_t* info, /*!< in: search info */ + buf_block_t* block, /*!< in: buffer block where cursor positioned */ + btr_cur_t* cursor) /*!< in: cursor */ +{ + dict_index_t* index; + ulint fold; + const rec_t* rec; + + ut_ad(cursor->flag == BTR_CUR_HASH_FAIL); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(btr_search_get_latch(cursor->index), + RW_LOCK_EX)); + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(page_align(btr_cur_get_rec(cursor)) + == buf_block_get_frame(block)); + + index = block->index; + + if (!index) { + + return; + } + + ut_a(index == cursor->index); + ut_a(!dict_index_is_ibuf(index)); + + if ((info->n_hash_potential > 0) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_user_rec(rec)) { + + return; + } + + fold = rec_fold(rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(btr_search_get_latch(cursor->index), + RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ha_insert_for_fold(btr_search_get_hash_table(cursor->index), + fold, block, rec); + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } +} + +/*********************************************************************//** +Updates the search info. */ +UNIV_INTERN +void +btr_search_info_update_slow( +/*========================*/ + btr_search_t* info, /*!< in/out: search info */ + btr_cur_t* cursor) /*!< in: cursor which was just positioned */ +{ + buf_block_t* block; + ibool build_index; + ulint* params; + ulint* params2; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), + RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(btr_search_get_latch(cursor->index), + RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + block = btr_cur_get_block(cursor); + + SRV_CORRUPT_TABLE_CHECK(block, return;); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + build_index = btr_search_update_block_hash_info(info, block, cursor); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(cursor->index); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + rw_lock_x_lock(btr_search_get_latch(cursor->index)); + + btr_search_update_hash_ref(info, block, cursor); + + rw_lock_x_unlock(btr_search_get_latch(cursor->index)); + } + + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. We have + to check inside the function call that they make sense. We + also malloc an array and store the values there to make sure + the compiler does not let the function call parameters change + inside the called function. 
It might be that the compiler + would optimize the call just to pass pointers to block. */ + + params = (ulint*) mem_alloc(3 * sizeof(ulint)); + params[0] = block->n_fields; + params[1] = block->n_bytes; + params[2] = block->left_side; + + /* Make sure the compiler cannot deduce the values and do + optimizations */ + + params2 = params + btr_search_this_is_zero; + + btr_search_build_page_hash_index(cursor->index, + block, + params2[0], + params2[1], + params2[2]); + mem_free(params); + } +} + +/******************************************************************//** +Checks if a guessed position for a tree cursor is right. Note that if +mode is PAGE_CUR_LE, which is used in inserts, and the function returns +TRUE, then cursor->up_match and cursor->low_match both have sensible values. +@return TRUE if success */ +static +ibool +btr_search_check_guess( +/*===================*/ + btr_cur_t* cursor, /*!< in: guessed cursor position */ + ibool can_only_compare_to_cursor_rec, + /*!< in: if we do not have a latch on the page + of cursor, but only a latch on + btr_search_latch, then ONLY the columns + of the record UNDER the cursor are + protected, not the next or previous record + in the chain: we cannot look at the next or + previous record to check our guess! */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, + or PAGE_CUR_GE */ + mtr_t* mtr) /*!< in: mtr */ +{ + rec_t* rec; + ulint n_unique; + ulint match; + ulint bytes; + int cmp; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool success = FALSE; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique_in_tree(cursor->index); + + rec = btr_cur_get_rec(cursor); + + ut_ad(page_rec_is_user_rec(rec)); + + match = 0; + bytes = 0; + + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, + offsets, &match, &bytes); + + if (mode == PAGE_CUR_GE) { + if (cmp == 1) { + goto exit_func; + } + + cursor->up_match = match; + + if (match >= n_unique) { + success = TRUE; + goto exit_func; + } + } else if (mode == PAGE_CUR_LE) { + if (cmp == -1) { + goto exit_func; + } + + cursor->low_match = match; + + } else if (mode == PAGE_CUR_G) { + if (cmp != -1) { + goto exit_func; + } + } else if (mode == PAGE_CUR_L) { + if (cmp != 1) { + goto exit_func; + } + } + + if (can_only_compare_to_cursor_rec) { + /* Since we could not determine if our guess is right just by + looking at the record under the cursor, return FALSE */ + goto exit_func; + } + + match = 0; + bytes = 0; + + if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { + rec_t* prev_rec; + + ut_ad(!page_rec_is_infimum(rec)); + + prev_rec = page_rec_get_prev(rec); + + if (page_rec_is_infimum(prev_rec)) { + success = btr_page_get_prev(page_align(prev_rec), mtr) + == FIL_NULL; + + goto exit_func; + } + + offsets = rec_get_offsets(prev_rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec, + offsets, &match, &bytes); + if (mode == PAGE_CUR_GE) { + success = cmp == 1; + } else { + success = cmp != -1; + } + + goto exit_func; + } else { + rec_t* next_rec; + + ut_ad(!page_rec_is_supremum(rec)); + + next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + if (btr_page_get_next(page_align(next_rec), mtr) + == FIL_NULL) { + + cursor->up_match = 0; + success = TRUE; + } + + goto exit_func; + } + + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + 
n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, + offsets, &match, &bytes); + if (mode == PAGE_CUR_LE) { + success = cmp == -1; + cursor->up_match = match; + } else { + success = cmp != 1; + } + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +/******************************************************************//** +Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +btr_search_guess_on_hash( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + btr_search_t* info, /*!< in: index search info */ + const dtuple_t* tuple, /*!< in: logical record */ + ulint mode, /*!< in: PAGE_CUR_L, ... */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ...; + NOTE that only if has_search_latch + is 0, we will have a latch set on + the cursor page, otherwise we assume + the caller uses his search latch + to protect the record! */ + btr_cur_t* cursor, /*!< out: tree cursor */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, RW_X_LATCH, or 0 */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_pool_t* buf_pool; + buf_block_t* block; + const rec_t* rec; + ulint fold; + index_id_t index_id; +#ifdef notdefined + btr_cur_t cursor2; + btr_pcur_t pcur; +#endif + ut_ad(index && info && tuple && cursor && mtr); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad((latch_mode == BTR_SEARCH_LEAF) + || (latch_mode == BTR_MODIFY_LEAF)); + + /* Note that, for efficiency, the struct info may not be protected by + any latch here! 
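+ Reading stale values is tolerable here: a wrong n_fields or n_bytes
+ can only make the guess fail (the position is re-validated by
+ btr_search_check_guess() below); it can never yield a wrong record.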
*/ + + if (UNIV_UNLIKELY(info->n_hash_potential == 0)) { + + return(FALSE); + } + + cursor->n_fields = info->n_fields; + cursor->n_bytes = info->n_bytes; + + if (UNIV_UNLIKELY(dtuple_get_n_fields(tuple) + < cursor->n_fields + (cursor->n_bytes > 0))) { + + return(FALSE); + } + + index_id = index->id; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_succ++; +#endif + fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id); + + cursor->fold = fold; + cursor->flag = BTR_CUR_HASH; + + if (UNIV_LIKELY(!has_search_latch)) { + rw_lock_s_lock(btr_search_get_latch(index)); + + if (UNIV_UNLIKELY(!btr_search_enabled)) { + goto failure_unlock; + } + } + + ut_ad(rw_lock_get_writer(btr_search_get_latch(index)) != RW_LOCK_EX); + ut_ad(rw_lock_get_reader_count(btr_search_get_latch(index)) > 0); + + rec = (rec_t*) ha_search_and_get_data( + btr_search_get_hash_table(index), fold); + + if (UNIV_UNLIKELY(!rec)) { + goto failure_unlock; + } + + block = buf_block_align(rec); + + if (UNIV_LIKELY(!has_search_latch)) { + + if (UNIV_UNLIKELY( + !buf_page_get_known_nowait(latch_mode, block, + BUF_MAKE_YOUNG, + __FILE__, __LINE__, + mtr))) { + goto failure_unlock; + } + + rw_lock_s_unlock(btr_search_get_latch(index)); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + } + + if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) { + ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH); + + if (UNIV_LIKELY(!has_search_latch)) { + + btr_leaf_page_release(block, latch_mode, mtr); + } + + goto failure; + } + + ut_ad(page_rec_is_user_rec(rec)); + + btr_cur_position(index, (rec_t*) rec, block, cursor); + + /* Check the validity of the guess within the page */ + + /* If we only have the latch on btr_search_latch, not on the + page, it only protects the columns of the record the cursor + is positioned on. We cannot look at the next of the previous + record to determine if our guess for the cursor position is + right. */ + if (UNIV_UNLIKELY(index_id != btr_page_get_index_id(block->frame)) + || !btr_search_check_guess(cursor, + has_search_latch, + tuple, mode, mtr)) { + if (UNIV_LIKELY(!has_search_latch)) { + btr_leaf_page_release(block, latch_mode, mtr); + } + + goto failure; + } + + if (UNIV_LIKELY(info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5)) { + + info->n_hash_potential++; + } + +#ifdef notdefined + /* These lines of code can be used in a debug version to check + the correctness of the searched cursor position: */ + + info->last_hash_succ = FALSE; + + /* Currently, does not work if the following fails: */ + ut_ad(!has_search_latch); + + btr_leaf_page_release(block, latch_mode, mtr); + + btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, + &cursor2, 0, mtr); + if (mode == PAGE_CUR_GE + && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) { + + /* If mode is PAGE_CUR_GE, then the binary search + in the index tree may actually take us to the supremum + of the previous page */ + + info->last_hash_succ = FALSE; + + btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode, + &pcur, mtr); + ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor)); + } else { + ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor)); + } + + /* NOTE that it is theoretically possible that the above assertions + fail if the page of the cursor gets removed from the buffer pool + meanwhile! Thus it might not be a bug. 
*/ +#endif + info->last_hash_succ = TRUE; + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_succ++; +#endif + if (UNIV_LIKELY(!has_search_latch) + && buf_page_peek_if_too_old(&block->page)) { + + buf_page_make_young(&block->page); + } + + /* Increment the page get statistics though we did not really + fix the page: for user info only */ + buf_pool = buf_pool_from_bpage(&block->page); + buf_pool->stat.n_page_gets++; + + return(TRUE); + + /*-------------------------------------------*/ +failure_unlock: + if (UNIV_LIKELY(!has_search_latch)) { + rw_lock_s_unlock(btr_search_get_latch(index)); + } +failure: + cursor->flag = BTR_CUR_HASH_FAIL; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_fail++; + + if (info->n_hash_succ > 0) { + info->n_hash_succ--; + } +#endif + info->last_hash_succ = FALSE; + + return(FALSE); +} + +/********************************************************************//** +Drops a page hash index. */ +UNIV_INTERN +void +btr_search_drop_page_hash_index( +/*============================*/ + buf_block_t* block) /*!< in: block containing index page, + s- or x-latched, or an index page + for which we know that + block->buf_fix_count == 0 or it is an + index page which has already been + removed from the buf_pool->page_hash + i.e.: it is in state + BUF_BLOCK_REMOVE_HASH */ +{ + hash_table_t* table; + ulint n_fields; + ulint n_bytes; + const page_t* page; + const rec_t* rec; + ulint fold; + ulint prev_fold; + index_id_t index_id; + ulint n_cached; + ulint n_recs; + ulint* folds; + ulint i; + mem_heap_t* heap; + const dict_index_t* index; + ulint* offsets; + btr_search_t* info; + +retry: + /* Do a dirty check on block->index, return if the block is not in the + adaptive hash index. This is to avoid acquiring an AHI latch for + performance considerations. */ + + index = block->index; + if (!index) { + + return; + } + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + rw_lock_s_lock(btr_search_get_latch(index)); + + if (UNIV_UNLIKELY(index != block->index)) { + + rw_lock_s_unlock(btr_search_get_latch(index)); + + goto retry; + } + + ut_a(!dict_index_is_ibuf(index)); +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + /* The index is being created (bulk loaded). */ + case ONLINE_INDEX_COMPLETE: + /* The index has been published. */ + case ONLINE_INDEX_ABORTED: + /* Either the index creation was aborted due to an + error observed by InnoDB (in which case there should + not be any adaptive hash index entries), or it was + completed and then flagged aborted in + rollback_inplace_alter_table(). */ + break; + case ONLINE_INDEX_ABORTED_DROPPED: + /* The index should have been dropped from the tablespace + already, and the adaptive hash index entries should have + been dropped as well. */ + ut_error; + } +#endif /* UNIV_DEBUG */ + + table = btr_search_get_hash_table(index); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX) + || block->page.buf_fix_count == 0 + || buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH); +#endif /* UNIV_SYNC_DEBUG */ + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + + /* NOTE: The fields of block must not be accessed after + releasing btr_search_latch, as the index page might only + be s-latched! 
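+
+ The drop therefore works in two phases: first, with the latch
+ released, walk the page and cache the distinct fold values in a
+ plain array; then re-acquire the latch in X mode and delete all
+ hash nodes for those folds, retrying from scratch if the page was
+ meanwhile re-hashed with different parameters.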
*/ + + rw_lock_s_unlock(btr_search_get_latch(index)); + + ut_a(n_fields + n_bytes > 0); + + page = block->frame; + n_recs = page_get_n_recs(page); + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + folds = (ulint*) mem_alloc(n_recs * sizeof(ulint)); + + n_cached = 0; + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + + index_id = btr_page_get_index_id(page); + + ut_a(index_id == index->id); + + prev_fold = 0; + + heap = NULL; + offsets = NULL; + + while (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0)); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + + folds[n_cached] = fold; + n_cached++; +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + prev_fold = fold; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + rw_lock_x_lock(btr_search_get_latch(index)); + + if (UNIV_UNLIKELY(!block->index)) { + /* Someone else has meanwhile dropped the hash index */ + + goto cleanup; + } + + ut_a(block->index == index); + + if (UNIV_UNLIKELY(block->curr_n_fields != n_fields) + || UNIV_UNLIKELY(block->curr_n_bytes != n_bytes)) { + + /* Someone else has meanwhile built a new hash index on the + page, with different parameters */ + + rw_lock_x_unlock(btr_search_get_latch(index)); + + mem_free(folds); + goto retry; + } + + for (i = 0; i < n_cached; i++) { + + ha_remove_all_nodes_to_page(table, folds[i], page); + } + + info = btr_search_get_info(block->index); + ut_a(info->ref_count > 0); + info->ref_count--; + + block->index = NULL; + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached); + +cleanup: +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (UNIV_UNLIKELY(block->n_pointers)) { + /* Corruption */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Corruption of adaptive hash index." + " After dropping\n" + "InnoDB: the hash index to a page of %s," + " still %lu hash nodes remain.\n", + index->name, (ulong) block->n_pointers); + rw_lock_x_unlock(btr_search_get_latch(index)); + + ut_ad(btr_search_validate()); + } else { + rw_lock_x_unlock(btr_search_get_latch(index)); + } +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + rw_lock_x_unlock(btr_search_get_latch(index)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + mem_free(folds); +} + +/********************************************************************//** +Drops a possible page hash index when a page is evicted from the buffer pool +or freed in a file segment. */ +UNIV_INTERN +void +btr_search_drop_page_hash_when_freed( +/*=================================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no) /*!< in: page number */ +{ + buf_block_t* block; + mtr_t mtr; + + mtr_start(&mtr); + + /* If the caller has a latch on the page, then the caller must + have a x-latch on the page and it must have already dropped + the hash index for the page. Because of the x-latch that we + are possibly holding, we cannot s-latch the page, but must + (recursively) x-latch it, even though we are only reading. 
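+
+ A hypothetical caller freeing a whole range of pages would simply
+ invoke this function once per page, e.g.:
+
+   for (ulint n = first_page_no; n <= last_page_no; n++) {
+           btr_search_drop_page_hash_when_freed(space, zip_size, n);
+   }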
*/ + + block = buf_page_get_gen(space, zip_size, page_no, RW_X_LATCH, NULL, + BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, + &mtr); + + if (block && block->index) { + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + + btr_search_drop_page_hash_index(block); + } + + mtr_commit(&mtr); +} + +/********************************************************************//** +Builds a hash index on a page with the given parameters. If the page already +has a hash index with different parameters, the old hash index is removed. +If index is non-NULL, this function checks if n_fields and n_bytes are +sensible values, and does not build a hash index if not. */ +static +void +btr_search_build_page_hash_index( +/*=============================*/ + dict_index_t* index, /*!< in: index for which to build */ + buf_block_t* block, /*!< in: index page, s- or x-latched */ + ulint n_fields,/*!< in: hash this many full fields */ + ulint n_bytes,/*!< in: hash this many bytes from the next + field */ + ibool left_side)/*!< in: hash for searches from left side? */ +{ + hash_table_t* table; + page_t* page; + rec_t* rec; + rec_t* next_rec; + ulint fold; + ulint next_fold; + ulint n_cached; + ulint n_recs; + ulint* folds; + rec_t** recs; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index); + ut_a(!dict_index_is_ibuf(index)); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX)); + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(btr_search_get_latch(index)); + + if (!btr_search_enabled) { + rw_lock_s_unlock(btr_search_get_latch(index)); + return; + } + + table = btr_search_get_hash_table(index); + page = buf_block_get_frame(block); + + if (block->index && ((block->curr_n_fields != n_fields) + || (block->curr_n_bytes != n_bytes) + || (block->curr_left_side != left_side))) { + + rw_lock_s_unlock(btr_search_get_latch(index)); + + btr_search_drop_page_hash_index(block); + } else { + rw_lock_s_unlock(btr_search_get_latch(index)); + } + + n_recs = page_get_n_recs(page); + + if (n_recs == 0) { + + return; + } + + /* Check that the values for hash index build are sensible */ + + if (n_fields + n_bytes == 0) { + + return; + } + + if (dict_index_get_n_unique_in_tree(index) < n_fields + || (dict_index_get_n_unique_in_tree(index) == n_fields + && n_bytes > 0)) { + return; + } + + /* Calculate and cache fold values and corresponding records into + an array for fast insertion to the hash index */ + + folds = (ulint*) mem_alloc(n_recs * sizeof(ulint)); + recs = (rec_t**) mem_alloc(n_recs * sizeof(rec_t*)); + + n_cached = 0; + + ut_a(index->id == btr_page_get_index_id(page)); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + + if (!page_rec_is_supremum(rec)) { + ut_a(n_fields <= rec_offs_n_fields(offsets)); + + if (n_bytes > 0) { + ut_a(n_fields < rec_offs_n_fields(offsets)); + } + } + + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + + if (left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + for (;;) { + next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + break; + } + + offsets = rec_get_offsets(next_rec, index, offsets, + n_fields + (n_bytes 
> 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + + if (fold != next_fold) { + /* Insert an entry into the hash index */ + + if (left_side) { + + folds[n_cached] = next_fold; + recs[n_cached] = next_rec; + n_cached++; + } else { + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + } + + rec = next_rec; + fold = next_fold; + } + + btr_search_check_free_space_in_heap(index); + + rw_lock_x_lock(btr_search_get_latch(index)); + + if (UNIV_UNLIKELY(!btr_search_enabled)) { + goto exit_func; + } + + if (block->index && ((block->curr_n_fields != n_fields) + || (block->curr_n_bytes != n_bytes) + || (block->curr_left_side != left_side))) { + goto exit_func; + } + + /* This counter is decremented every time we drop page + hash index entries and is incremented here. Since we can + rebuild hash index for a page that is already hashed, we + have to take care not to increment the counter in that + case. */ + if (!block->index) { + index->search_info->ref_count++; + } + + block->n_hash_helps = 0; + + block->curr_n_fields = n_fields; + block->curr_n_bytes = n_bytes; + block->curr_left_side = left_side; + block->index = index; + + for (i = 0; i < n_cached; i++) { + + ha_insert_for_fold(table, folds[i], block, recs[i]); + } + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached); +exit_func: + rw_lock_x_unlock(btr_search_get_latch(index)); + + mem_free(folds); + mem_free(recs); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/********************************************************************//** +Moves or deletes hash entries for moved records. If new_page is already hashed, +then the hash index for page, if any, is dropped. If new_page is not hashed, +and page is hashed, then a new hash index is built to new_page with the same +parameters as page (this often happens when a page is split). 
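+In tabular form (this mirrors the function body):
+
+ new_block hashed  block hashed  action
+ yes               either        drop the hash index of block
+ no                yes           build an index on new_block with
+                                 block's current parameters
+ no                no            nothing to do
+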
*/ +UNIV_INTERN +void +btr_search_move_or_delete_hash_entries( +/*===================================*/ + buf_block_t* new_block, /*!< in: records are copied + to this page */ + buf_block_t* block, /*!< in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index) /*!< in: record descriptor */ +{ + ulint n_fields; + ulint n_bytes; + ibool left_side; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); + ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(btr_search_get_latch(index)); + + ut_a(!new_block->index || new_block->index == index); + ut_a(!block->index || block->index == index); + ut_a(!(new_block->index || block->index) + || !dict_index_is_ibuf(index)); + + if (new_block->index) { + + rw_lock_s_unlock(btr_search_get_latch(index)); + + btr_search_drop_page_hash_index(block); + + return; + } + + if (block->index) { + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + left_side = block->curr_left_side; + + new_block->n_fields = block->curr_n_fields; + new_block->n_bytes = block->curr_n_bytes; + new_block->left_side = left_side; + + rw_lock_s_unlock(btr_search_get_latch(index)); + + ut_a(n_fields + n_bytes > 0); + + btr_search_build_page_hash_index(index, new_block, n_fields, + n_bytes, left_side); + ut_ad(n_fields == block->curr_n_fields); + ut_ad(n_bytes == block->curr_n_bytes); + ut_ad(left_side == block->curr_left_side); + return; + } + + rw_lock_s_unlock(btr_search_get_latch(index)); +} + +/********************************************************************//** +Updates the page hash index when a single record is deleted from a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_delete( +/*=============================*/ + btr_cur_t* cursor) /*!< in: cursor which was positioned on the + record to delete using btr_cur_search_..., + the record is not yet deleted */ +{ + hash_table_t* table; + buf_block_t* block; + const rec_t* rec; + ulint fold; + dict_index_t* index; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t* heap = NULL; + rec_offs_init(offsets_); + + block = btr_cur_get_block(cursor); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = block->index; + + if (!index) { + + return; + } + + ut_a(index == cursor->index); + ut_a(block->curr_n_fields + block->curr_n_bytes > 0); + ut_a(!dict_index_is_ibuf(index)); + + table = btr_search_get_hash_table(cursor->index); + + rec = btr_cur_get_rec(cursor); + + fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + rw_lock_x_lock(btr_search_get_latch(cursor->index)); + + if (block->index) { + ut_a(block->index == index); + + if (ha_search_and_delete_if_found(table, fold, rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED); + } else { + MONITOR_INC( + MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND); + } + } + + rw_lock_x_unlock(btr_search_get_latch(cursor->index)); +} + +/********************************************************************//** +Updates the page hash index when a single record is inserted on a page. 
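+This variant only re-points an existing hash node: if the cursor was
+positioned by a hash search (BTR_CUR_HASH) with the page's current
+parameters and the page is hashed for right-side searches, the node for
+cursor->fold is updated to reference the newly inserted record; in all
+other cases the work is delegated to btr_search_update_hash_on_insert().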
*/ +UNIV_INTERN +void +btr_search_update_hash_node_on_insert( +/*==================================*/ + btr_cur_t* cursor) /*!< in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +{ + hash_table_t* table; + buf_block_t* block; + dict_index_t* index; + rec_t* rec; + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = block->index; + + if (!index) { + + return; + } + + ut_a(cursor->index == index); + ut_a(!dict_index_is_ibuf(index)); + + rw_lock_x_lock(btr_search_get_latch(cursor->index)); + + if (!block->index) { + + goto func_exit; + } + + ut_a(block->index == index); + + if ((cursor->flag == BTR_CUR_HASH) + && (cursor->n_fields == block->curr_n_fields) + && (cursor->n_bytes == block->curr_n_bytes) + && !block->curr_left_side) { + + table = btr_search_get_hash_table(cursor->index); + + if (ha_search_and_update_if_found( + table, cursor->fold, rec, block, + page_rec_get_next(rec))) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED); + } + +func_exit: + rw_lock_x_unlock(btr_search_get_latch(cursor->index)); + } else { + rw_lock_x_unlock(btr_search_get_latch(cursor->index)); + + btr_search_update_hash_on_insert(cursor); + } +} + +/********************************************************************//** +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_insert( +/*=============================*/ + btr_cur_t* cursor) /*!< in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +{ + hash_table_t* table; + buf_block_t* block; + dict_index_t* index; + const rec_t* rec; + const rec_t* ins_rec; + const rec_t* next_rec; + ulint fold; + ulint ins_fold; + ulint next_fold = 0; /* remove warning (??? bug ???) 
*/ + ulint n_fields; + ulint n_bytes; + ibool left_side; + ibool locked = FALSE; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + block = btr_cur_get_block(cursor); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = block->index; + + if (!index) { + + return; + } + + btr_search_check_free_space_in_heap(cursor->index); + + table = btr_search_get_hash_table(cursor->index); + + rec = btr_cur_get_rec(cursor); + + ut_a(index == cursor->index); + ut_a(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + left_side = block->curr_left_side; + + ins_rec = page_rec_get_next_const(rec); + next_rec = page_rec_get_next_const(ins_rec); + + offsets = rec_get_offsets(ins_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id); + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets(next_rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + } + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + } else { + if (left_side) { + + rw_lock_x_lock(btr_search_get_latch(index)); + + locked = TRUE; + + if (!btr_search_enabled) { + goto function_exit; + } + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + + goto check_next_rec; + } + + if (fold != ins_fold) { + + if (!locked) { + + rw_lock_x_lock(btr_search_get_latch(index)); + + locked = TRUE; + + if (!btr_search_enabled) { + goto function_exit; + } + } + + if (!left_side) { + ha_insert_for_fold(table, fold, block, rec); + } else { + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + } + +check_next_rec: + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + if (!locked) { + rw_lock_x_lock(btr_search_get_latch(index)); + + locked = TRUE; + + if (!btr_search_enabled) { + goto function_exit; + } + } + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + + goto function_exit; + } + + if (ins_fold != next_fold) { + + if (!locked) { + + rw_lock_x_lock(btr_search_get_latch(index)); + + locked = TRUE; + + if (!btr_search_enabled) { + goto function_exit; + } + } + + if (!left_side) { + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + /* + fputs("Hash insert for ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " fold %lu\n", ins_fold); + */ + } else { + ha_insert_for_fold(table, next_fold, block, next_rec); + } + } + +function_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (locked) { + rw_lock_x_unlock(btr_search_get_latch(index)); + } +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/********************************************************************//** +Validates one hash table in the search system. +@return TRUE if ok */ +static +ibool +btr_search_validate_one_table( +/*==========================*/ + ulint t) +{ + ha_node_t* node; + ulint n_page_dumps = 0; + ibool ok = TRUE; + ulint i; + ulint cell_count; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + /* How many cells to check before temporarily releasing + btr_search_latch. 
*/ + ulint chunk_size = 10000; + + rec_offs_init(offsets_); + + cell_count = hash_get_n_cells(btr_search_sys->hash_tables[t]); + + for (i = 0; i < cell_count; i++) { + /* We release btr_search_latch every once in a while to + give other queries a chance to run. */ + if ((i != 0) && ((i % chunk_size) == 0)) { + btr_search_x_unlock_all(); + os_thread_yield(); + btr_search_x_lock_all(); + } + + node = (ha_node_t*) + hash_get_nth_cell(btr_search_sys->hash_tables[t], + i)->node; + + for (; node != NULL; node = node->next) { + buf_block_t* block + = buf_block_align((byte*) node->data); + const buf_block_t* hash_block; + buf_pool_t* buf_pool; + index_id_t page_index_id; + + buf_pool = buf_pool_from_bpage((buf_page_t *) block); + /* Prevent BUF_BLOCK_FILE_PAGE -> BUF_BLOCK_REMOVE_HASH + transition until we lock the block mutex */ + mutex_enter(&buf_pool->LRU_list_mutex); + + if (UNIV_LIKELY(buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE)) { + + /* The space and offset are only valid + for file blocks. It is possible that + the block is being freed + (BUF_BLOCK_REMOVE_HASH, see the + assertion and the comment below) */ + hash_block = buf_block_hash_get( + buf_pool, + buf_block_get_space(block), + buf_block_get_page_no(block)); + } else { + hash_block = NULL; + } + + if (hash_block) { + ut_a(hash_block == block); + } else { + /* When a block is being freed, + buf_LRU_search_and_free_block() first + removes the block from + buf_pool->page_hash by calling + buf_LRU_block_remove_hashed_page(). + After that, it invokes + btr_search_drop_page_hash_index() to + remove the block from + btr_search_sys->hash_index. */ + + ut_a(buf_block_get_state(block) + == BUF_BLOCK_REMOVE_HASH); + } + + mutex_enter(&block->mutex); + mutex_exit(&buf_pool->LRU_list_mutex); + + ut_a(!dict_index_is_ibuf(block->index)); + + page_index_id = btr_page_get_index_id(block->frame); + + offsets = rec_get_offsets(node->data, + block->index, offsets, + block->curr_n_fields + + (block->curr_n_bytes > 0), + &heap); + + if (!block->index || node->fold + != rec_fold(node->data, + offsets, + block->curr_n_fields, + block->curr_n_bytes, + page_index_id)) { + const page_t* page = block->frame; + + ok = FALSE; + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error in an adaptive hash" + " index pointer to page %lu\n" + "InnoDB: ptr mem address %p" + " index id %llu," + " node fold %lu, rec fold %lu\n", + (ulong) page_get_page_no(page), + node->data, + (ullint) page_index_id, + (ulong) node->fold, + (ulong) rec_fold(node->data, + offsets, + block->curr_n_fields, + block->curr_n_bytes, + page_index_id)); + + fputs("InnoDB: Record ", stderr); + rec_print_new(stderr, node->data, offsets); + fprintf(stderr, "\nInnoDB: on that page." + " Page mem address %p, is hashed %p," + " n fields %lu, n bytes %lu\n" + "InnoDB: side %lu\n", + (void*) page, (void*) block->index, + (ulong) block->curr_n_fields, + (ulong) block->curr_n_bytes, + (ulong) block->curr_left_side); + + if (n_page_dumps < 20) { + buf_page_print( + page, 0, + BUF_PAGE_PRINT_NO_CRASH); + n_page_dumps++; + } + } + + mutex_exit(&block->mutex); + } + } + + for (i = 0; i < cell_count; i += chunk_size) { + ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1); + + /* We release btr_search_latch every once in a while to + give other queries a chance to run. 
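+ With chunk_size == 10000 this means, for example, that validating a
+ hash table of two million cells yields about 200 such yield points.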
*/ + if (i != 0) { + btr_search_x_unlock_all(); + os_thread_yield(); + btr_search_x_lock_all(); + } + + if (!ha_validate(btr_search_sys->hash_tables[t], i, + end_index)) { + ok = FALSE; + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(ok); +} + +/********************************************************************//** +Validates the search system. +@return TRUE if ok */ +UNIV_INTERN +ibool +btr_search_validate(void) +/*=====================*/ +{ + ulint i; + ibool ok = TRUE; + + btr_search_x_lock_all(); + + for (i = 0; i < btr_search_index_num; i++) { + + if (!btr_search_validate_one_table(i)) + ok = FALSE; + } + + btr_search_x_unlock_all(); + + return(ok); +} + + +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ diff --git a/storage/xtradb/buf/buf0buddy.cc b/storage/xtradb/buf/buf0buddy.cc new file mode 100644 index 00000000000..8f6be0cf2af --- /dev/null +++ b/storage/xtradb/buf/buf0buddy.cc @@ -0,0 +1,742 @@ +/***************************************************************************** + +Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buddy.cc +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#define THIS_MODULE +#include "buf0buddy.h" +#ifdef UNIV_NONINL +# include "buf0buddy.ic" +#endif +#undef THIS_MODULE +#include "buf0buf.h" +#include "buf0lru.h" +#include "buf0flu.h" +#include "page0zip.h" +#include "srv0start.h" + +/** When freeing a buf we attempt to coalesce by looking at its buddy +and deciding whether it is free or not. To ascertain if the buddy is +free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET +within the buddy. The question is how we can be sure that it is +safe to look at BUF_BUDDY_STAMP_OFFSET. +The answer lies in following invariants: +* All blocks allocated by buddy allocator are used for compressed +page frame. +* A compressed table always have space_id < SRV_LOG_SPACE_FIRST_ID +* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in +a frame. + -- The above is true because we look at these fields when the + corresponding buddy block is free which implies that: + * The block we are looking at must have an address aligned at + the same size that its free buddy has. For example, if we have + a free block of 8K then its buddy's address must be aligned at + 8K as well. + * It is possible that the block we are looking at may have been + further divided into smaller sized blocks but its starting + address must still remain the start of a page frame i.e.: it + cannot be middle of a block. 
For example, if we have a free
+ block of size 8K then its buddy may be divided into blocks
+ of, say, 1K, 1K, 2K, 4K, but the buddy's address will still be
+ the starting address of the first 1K compressed page.
+ * What is important to note is that for any given block, the
+ buddy's address cannot be in the middle of a larger block i.e.:
+ in the above example, our 8K block cannot have a buddy whose
+ address is aligned on 8K but which is part of a larger 16K block.
+*/
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written. */
+#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at offset BUF_BUDDY_STAMP_OFFSET */
+#define BUF_BUDDY_STAMP_FREE (SRV_LOG_SPACE_FIRST_ID)
+
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE (0XFFFFFFFF)
+
+#if BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE
+# error "BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE"
+#endif
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+ BUF_BUDDY_STATE_FREE, /*!< The buddy is completely free */
+ BUF_BUDDY_STATE_USED, /*!< The buddy is currently in use */
+ BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+ are in use */
+};
+
+#ifdef UNIV_DEBUG_VALGRIND
+/**********************************************************************//**
+Invalidate a memory area that we will not access while the page is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of zip_free[] */
+{
+ const size_t size = BUF_BUDDY_LOW << i;
+ ut_ad(i <= BUF_BUDDY_SIZES);
+
+ UNIV_MEM_ASSERT_W(buf, size);
+ UNIV_MEM_INVALID(buf, size);
+}
+#else /* UNIV_DEBUG_VALGRIND */
+# define buf_buddy_mem_invalid(buf, i) ut_ad((i) <= BUF_BUDDY_SIZES)
+#endif /* UNIV_DEBUG_VALGRIND */
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return whether the buddy is free */
+UNIV_INLINE __attribute__((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+ const buf_buddy_free_t* buf) /*!< in: block to check */
+{
+ return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+ buf_buddy_free_t* buf, /*!< in/out: block to stamp */
+ ulint i) /*!< in: block size */
+{
+ ut_d(memset(buf, static_cast<int>(i), BUF_BUDDY_LOW << i));
+ buf_buddy_mem_invalid(buf, i);
+ mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+ BUF_BUDDY_STAMP_FREE);
+ buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in/out] buf block to stamp
+@param[in] i block size */
+#define buf_buddy_stamp_nonfree(buf, i) do { \
+ buf_buddy_mem_invalid(buf, i); \
+ memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4); \
+} while (0)
+#if BUF_BUDDY_STAMP_NONFREE != 0xffffffff
+# error "BUF_BUDDY_STAMP_NONFREE != 0xffffffff"
+#endif
+
+/**********************************************************************//**
+Get the offset of the buddy of a compressed page frame.
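+For example, with size == 4096 (0x1000): a block starting at ...0x5000 has
+the 0x1000 address bit set, so its buddy starts 4096 bytes lower, at
+...0x4000; conversely, the block at ...0x4000 has that bit clear and its
+buddy is at ...0x5000. The computation is equivalent to XOR-ing the
+aligned address with size.
+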
+@return the buddy relative of page */ +UNIV_INLINE +void* +buf_buddy_get( +/*==========*/ + byte* page, /*!< in: compressed page */ + ulint size) /*!< in: page size in bytes */ +{ + ut_ad(ut_is_2pow(size)); + ut_ad(size >= BUF_BUDDY_LOW); + ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN); + ut_ad(size < BUF_BUDDY_HIGH); + ut_ad(BUF_BUDDY_HIGH == UNIV_PAGE_SIZE); + ut_ad(!ut_align_offset(page, size)); + + if (((ulint) page) & size) { + return(page - size); + } else { + return(page + size); + } +} + +/** Validate a given zip_free list. */ +struct CheckZipFree { + ulint i; + CheckZipFree(ulint i) : i (i) {} + + void operator()(const buf_buddy_free_t* elem) const + { + ut_a(buf_buddy_stamp_is_free(elem)); + ut_a(elem->stamp.size <= i); + } +}; + +#define BUF_BUDDY_LIST_VALIDATE(bp, i) \ + UT_LIST_VALIDATE(list, buf_buddy_free_t, \ + bp->zip_free[i], CheckZipFree(i)) + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Debug function to validate that a buffer is indeed free i.e.: in the +zip_free[]. +@return true if free */ +UNIV_INLINE +bool +buf_buddy_check_free( +/*=================*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + const buf_buddy_free_t* buf, /*!< in: block to check */ + ulint i) /*!< in: index of buf_pool->zip_free[] */ +{ + const ulint size = BUF_BUDDY_LOW << i; + + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); + ut_ad(!ut_align_offset(buf, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + buf_buddy_free_t* itr; + + for (itr = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + itr && itr != buf; + itr = UT_LIST_GET_NEXT(list, itr)) { + } + + return(itr == buf); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Checks if a buf is free i.e.: in the zip_free[]. +@retval BUF_BUDDY_STATE_FREE if fully free +@retval BUF_BUDDY_STATE_USED if currently in use +@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */ +static __attribute__((warn_unused_result)) +buf_buddy_state_t +buf_buddy_is_free( +/*==============*/ + buf_buddy_free_t* buf, /*!< in: block to check */ + ulint i) /*!< in: index of + buf_pool->zip_free[] */ +{ +#ifdef UNIV_DEBUG + const ulint size = BUF_BUDDY_LOW << i; + ut_ad(!ut_align_offset(buf, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); +#endif /* UNIV_DEBUG */ + + /* We assume that all memory from buf_buddy_alloc() + is used for compressed page frames. */ + + /* We look inside the allocated objects returned by + buf_buddy_alloc() and assume that each block is a compressed + page that contains one of the following in space_id. + * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or + * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but + not initialized yet or + * A valid space_id of a compressed tablespace + + The call below attempts to read from free memory. The memory + is "owned" by the buddy allocator (and it has been allocated + from the buffer pool), so there is nothing wrong about this. */ + if (!buf_buddy_stamp_is_free(buf)) { + return(BUF_BUDDY_STATE_USED); + } + + /* A block may be free but a fragment of it may still be in use. + To guard against that we write the free block size in terms of + zip_free index at start of stamped block. Note that we can + safely rely on this value only if the buf is free. */ + ut_ad(buf->stamp.size <= i); + return(buf->stamp.size == i + ? 
BUF_BUDDY_STATE_FREE + : BUF_BUDDY_STATE_PARTIALLY_USED); +} + +/**********************************************************************//** +Add a block to the head of the appropriate buddy free list. */ +UNIV_INLINE +void +buf_buddy_add_to_free( +/*==================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_buddy_free_t* buf, /*!< in,own: block to be freed */ + ulint i) /*!< in: index of + buf_pool->zip_free[] */ +{ + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); + ut_ad(buf_pool->zip_free[i].start != buf); + + buf_buddy_stamp_free(buf, i); + UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], buf); + ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i)); +} + +/**********************************************************************//** +Remove a block from the appropriate buddy free list. */ +UNIV_INLINE +void +buf_buddy_remove_from_free( +/*=======================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_buddy_free_t* buf, /*!< in,own: block to be freed */ + ulint i) /*!< in: index of + buf_pool->zip_free[] */ +{ + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); + ut_ad(buf_buddy_check_free(buf_pool, buf, i)); + + UT_LIST_REMOVE(list, buf_pool->zip_free[i], buf); + buf_buddy_stamp_nonfree(buf, i); +} + +/**********************************************************************//** +Try to allocate a block from buf_pool->zip_free[]. +@return allocated block, or NULL if buf_pool->zip_free[] was empty */ +static +buf_buddy_free_t* +buf_buddy_alloc_zip( +/*================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint i) /*!< in: index of buf_pool->zip_free[] */ +{ + buf_buddy_free_t* buf; + + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); + ut_a(i < BUF_BUDDY_SIZES); + ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i)); + + buf = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + + if (buf) { + buf_buddy_remove_from_free(buf_pool, buf, i); + } else if (i + 1 < BUF_BUDDY_SIZES) { + /* Attempt to split. */ + buf = buf_buddy_alloc_zip(buf_pool, i + 1); + + if (buf) { + buf_buddy_free_t* buddy = + reinterpret_cast<buf_buddy_free_t*>( + buf->stamp.bytes + + (BUF_BUDDY_LOW << i)); + + ut_ad(!buf_pool_contains_zip(buf_pool, buddy)); + buf_buddy_add_to_free(buf_pool, buddy, i); + } + } + + if (buf) { + /* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */ + UNIV_MEM_TRASH(buf, ~i, BUF_BUDDY_STAMP_OFFSET); + UNIV_MEM_TRASH(BUF_BUDDY_STAMP_OFFSET + 4 + + buf->stamp.bytes, ~i, + (BUF_BUDDY_LOW << i) + - (BUF_BUDDY_STAMP_OFFSET + 4)); + ut_ad(mach_read_from_4(buf->stamp.bytes + + BUF_BUDDY_STAMP_OFFSET) + == BUF_BUDDY_STAMP_NONFREE); + } + + return(buf); +} + +/**********************************************************************//** +Deallocate a buffer frame of UNIV_PAGE_SIZE. 
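+The frame must have been assigned to the buddy allocator with
+buf_buddy_block_register(): it is looked up in buf_pool->zip_hash,
+removed from there, and then returned to the pool of non-file pages
+with buf_LRU_block_free_non_file_page().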
*/ +static +void +buf_buddy_block_free( +/*=================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* buf) /*!< in: buffer frame to deallocate */ +{ + const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); + buf_page_t* bpage; + buf_block_t* block; + + ut_ad(!mutex_own(&buf_pool->zip_mutex)); + ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE)); + + mutex_enter(&buf_pool->zip_hash_mutex); + + HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage, + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY + && bpage->in_zip_hash && !bpage->in_page_hash), + ((buf_block_t*) bpage)->frame == buf); + ut_a(bpage); + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY); + ut_ad(!bpage->in_page_hash); + ut_ad(bpage->in_zip_hash); + ut_d(bpage->in_zip_hash = FALSE); + HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage); + + mutex_exit(&buf_pool->zip_hash_mutex); + + ut_d(memset(buf, 0, UNIV_PAGE_SIZE)); + UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE); + + block = (buf_block_t*) bpage; + mutex_enter(&block->mutex); + buf_LRU_block_free_non_file_page(block); + mutex_exit(&block->mutex); + + ut_ad(buf_pool->buddy_n_frames > 0); + ut_d(buf_pool->buddy_n_frames--); +} + +/**********************************************************************//** +Allocate a buffer block to the buddy allocator. */ +static +void +buf_buddy_block_register( +/*=====================*/ + buf_block_t* block) /*!< in: buffer frame to allocate */ +{ + buf_pool_t* buf_pool = buf_pool_from_block(block); + const ulint fold = BUF_POOL_ZIP_FOLD(block); + ut_ad(!mutex_own(&buf_pool->zip_mutex)); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); + + buf_block_set_state(block, BUF_BLOCK_MEMORY); + + ut_a(block->frame); + ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE)); + + ut_ad(!block->page.in_page_hash); + ut_ad(!block->page.in_zip_hash); + ut_d(block->page.in_zip_hash = TRUE); + + mutex_enter(&buf_pool->zip_hash_mutex); + HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page); + mutex_exit(&buf_pool->zip_hash_mutex); + + ut_d(buf_pool->buddy_n_frames++); +} + +/**********************************************************************//** +Allocate a block from a bigger object. +@return allocated block */ +static +void* +buf_buddy_alloc_from( +/*=================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* buf, /*!< in: a block that is free to use */ + ulint i, /*!< in: index of + buf_pool->zip_free[] */ + ulint j) /*!< in: size of buf as an index + of buf_pool->zip_free[] */ +{ + ulint offs = BUF_BUDDY_LOW << j; + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); + ut_ad(j <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + ut_ad(j >= i); + ut_ad(!ut_align_offset(buf, offs)); + + /* Add the unused parts of the block to the free lists. */ + while (j > i) { + buf_buddy_free_t* zip_buf; + + offs >>= 1; + j--; + + zip_buf = reinterpret_cast<buf_buddy_free_t*>( + reinterpret_cast<byte*>(buf) + offs); + buf_buddy_add_to_free(buf_pool, zip_buf, j); + } + + buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i); + return(buf); +} + +/**********************************************************************//** +Allocate a block. The thread calling this function must hold +buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any +block->mutex. The buf_pool->LRU_list_mutex may be released and reacquired. 
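+A minimal calling sketch (illustrative only; zip_size stands for
+the compressed page size wanted by the caller):
+
+	ibool	lru = FALSE;
+	void*	buf;
+
+	mutex_enter(&buf_pool->LRU_list_mutex);
+	buf = buf_buddy_alloc_low(buf_pool,
+				  buf_buddy_get_slot(zip_size), &lru);
+	mutex_exit(&buf_pool->LRU_list_mutex);
+
+If lru was set to TRUE, the storage came from the LRU list and
+buf_pool->LRU_list_mutex was temporarily released inside the call.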
+@return allocated block, never NULL */ +UNIV_INTERN +void* +buf_buddy_alloc_low( +/*================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint i, /*!< in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + ibool* lru) /*!< in: pointer to a variable that + will be assigned TRUE if storage was + allocated from the LRU list and + buf_pool->LRU_list_mutex was + temporarily released */ +{ + buf_block_t* block; + + ut_ad(lru); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(!mutex_own(&buf_pool->zip_mutex)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + if (i < BUF_BUDDY_SIZES) { + /* Try to allocate from the buddy system. */ + mutex_enter(&buf_pool->zip_free_mutex); + block = (buf_block_t*) buf_buddy_alloc_zip(buf_pool, i); + + if (block) { + goto func_exit; + } + mutex_exit(&buf_pool->zip_free_mutex); + } + + /* Try allocating from the buf_pool->free list. */ + block = buf_LRU_get_free_only(buf_pool); + + if (block) { + + goto alloc_big; + } + + /* Try replacing an uncompressed page in the buffer pool. */ + mutex_exit(&buf_pool->LRU_list_mutex); + block = buf_LRU_get_free_block(buf_pool); + *lru = TRUE; + mutex_enter(&buf_pool->LRU_list_mutex); + +alloc_big: + buf_buddy_block_register(block); + + mutex_enter(&buf_pool->zip_free_mutex); + block = (buf_block_t*) buf_buddy_alloc_from( + buf_pool, block->frame, i, BUF_BUDDY_SIZES); + +func_exit: + buf_pool->buddy_stat[i].used++; + mutex_exit(&buf_pool->zip_free_mutex); + + return(block); +} + +/**********************************************************************//** +Try to relocate a block. The caller must hold zip_free_mutex, and this +function will release and lock it again. +@return true if relocated */ +static +bool +buf_buddy_relocate( +/*===============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* src, /*!< in: block to relocate */ + void* dst, /*!< in: free block to relocate to */ + ulint i) /*!< in: index of + buf_pool->zip_free[] */ +{ + buf_page_t* bpage; + const ulint size = BUF_BUDDY_LOW << i; + ulint space; + ulint offset; + + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); + ut_ad(!mutex_own(&buf_pool->zip_mutex)); + ut_ad(!ut_align_offset(src, size)); + ut_ad(!ut_align_offset(dst, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + UNIV_MEM_ASSERT_W(dst, size); + + space = mach_read_from_4((const byte*) src + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + offset = mach_read_from_4((const byte*) src + + FIL_PAGE_OFFSET); + + /* Suppress Valgrind warnings about conditional jump + on uninitialized value. */ + UNIV_MEM_VALID(&space, sizeof space); + UNIV_MEM_VALID(&offset, sizeof offset); + + ut_ad(space != BUF_BUDDY_STAMP_FREE); + + mutex_exit(&buf_pool->zip_free_mutex); + + ulint fold = buf_page_address_fold(space, offset); + prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); + + rw_lock_x_lock(hash_lock); + + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + + if (!bpage || bpage->zip.data != src) { + /* The block has probably been freshly + allocated by buf_LRU_get_free_block() but not + added to buf_pool->page_hash yet. Obviously, + it cannot be relocated. */ + + rw_lock_x_unlock(hash_lock); + + mutex_enter(&buf_pool->zip_free_mutex); + return(false); + } + + if (page_zip_get_size(&bpage->zip) != size) { + /* The block is of different size. We would + have to relocate all blocks covered by src. + For the sake of simplicity, give up. 
*/ + ut_ad(page_zip_get_size(&bpage->zip) < size); + + rw_lock_x_unlock(hash_lock); + + mutex_enter(&buf_pool->zip_free_mutex); + return(false); + } + + /* The block must have been allocated, but it may + contain uninitialized data. */ + UNIV_MEM_ASSERT_W(src, size); + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + mutex_enter(&buf_pool->zip_free_mutex); + + if (buf_page_can_relocate(bpage)) { + /* Relocate the compressed page. */ + ullint usec = ut_time_us(NULL); + + ut_a(bpage->zip.data == src); + + /* Note: This is potentially expensive, we need a better + solution here. We go with correctness for now. */ + ::memcpy(dst, src, size); + + bpage->zip.data = reinterpret_cast<page_zip_t*>(dst); + + rw_lock_x_unlock(hash_lock); + + mutex_exit(block_mutex); + + buf_buddy_mem_invalid( + reinterpret_cast<buf_buddy_free_t*>(src), i); + + buf_buddy_stat_t* buddy_stat = &buf_pool->buddy_stat[i]; + + ++buddy_stat->relocated; + + buddy_stat->relocated_usec += ut_time_us(NULL) - usec; + + return(true); + } + + rw_lock_x_unlock(hash_lock); + + mutex_exit(block_mutex); + + return(false); +} + +/**********************************************************************//** +Deallocate a block. */ +UNIV_INTERN +void +buf_buddy_free_low( +/*===============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* buf, /*!< in: block to be freed, must not be + pointed to by the buffer pool */ + ulint i) /*!< in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ +{ + buf_buddy_free_t* buddy; + + ut_ad(!mutex_own(&buf_pool->zip_mutex)); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + mutex_enter(&buf_pool->zip_free_mutex); + + ut_ad(buf_pool->buddy_stat[i].used > 0); + buf_pool->buddy_stat[i].used--; +recombine: + UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i); + + if (i == BUF_BUDDY_SIZES) { + mutex_exit(&buf_pool->zip_free_mutex); + buf_buddy_block_free(buf_pool, buf); + return; + } + + ut_ad(i < BUF_BUDDY_SIZES); + ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); + ut_ad(!buf_pool_contains_zip(buf_pool, buf)); + + /* Do not recombine blocks if there are few free blocks. + We may waste up to 15360*max_len bytes to free blocks + (1024 + 2048 + 4096 + 8192 = 15360) */ + if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) { + goto func_exit; + } + + /* Try to combine adjacent blocks. */ + buddy = reinterpret_cast<buf_buddy_free_t*>( + buf_buddy_get(reinterpret_cast<byte*>(buf), + BUF_BUDDY_LOW << i)); + + switch (buf_buddy_is_free(buddy, i)) { + case BUF_BUDDY_STATE_FREE: + /* The buddy is free: recombine */ + buf_buddy_remove_from_free(buf_pool, buddy, i); +buddy_is_free: + ut_ad(!buf_pool_contains_zip(buf_pool, buddy)); + i++; + buf = ut_align_down(buf, BUF_BUDDY_LOW << i); + + goto recombine; + + case BUF_BUDDY_STATE_USED: + ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i)); + + /* The buddy is not free. Is there a free block of + this size? */ + if (buf_buddy_free_t* zip_buf = + UT_LIST_GET_FIRST(buf_pool->zip_free[i])) { + + /* Remove the block from the free list, because + a successful buf_buddy_relocate() will overwrite + zip_free->list. */ + buf_buddy_remove_from_free(buf_pool, zip_buf, i); + + /* Try to relocate the buddy of buf to the free + block. */ + if (buf_buddy_relocate(buf_pool, buddy, zip_buf, i)) { + + goto buddy_is_free; + } + + buf_buddy_add_to_free(buf_pool, zip_buf, i); + } + + break; + case BUF_BUDDY_STATE_PARTIALLY_USED: + /* Some sub-blocks in the buddy are still in use. + Relocation will fail. 
No need to try. */ + break; + } + +func_exit: + /* Free the block to the buddy list. */ + buf_buddy_add_to_free(buf_pool, + reinterpret_cast<buf_buddy_free_t*>(buf), + i); + mutex_exit(&buf_pool->zip_free_mutex); +} diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc new file mode 100644 index 00000000000..77cac63b629 --- /dev/null +++ b/storage/xtradb/buf/buf0buf.cc @@ -0,0 +1,5652 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buf.cc +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0buf.h" + +#ifdef UNIV_NONINL +#include "buf0buf.ic" +#endif + +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#ifndef UNIV_HOTBACKUP +#include "buf0buddy.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "log0log.h" +#endif /* !UNIV_HOTBACKUP */ +#include "srv0srv.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "page0zip.h" +#include "srv0mon.h" +#include "buf0checksum.h" +#include "trx0trx.h" +#include "srv0start.h" + +/* prototypes for new functions added to ha_innodb.cc */ +trx_t* innobase_get_trx(); + +static inline +void +_increment_page_get_statistics(buf_block_t* block, trx_t* trx) +{ + ulint block_hash; + ulint block_hash_byte; + byte block_hash_offset; + + ut_ad(block); + ut_ad(trx && trx->take_stats); + + if (!trx->distinct_page_access_hash) { + trx->distinct_page_access_hash + = static_cast<byte *>(mem_alloc(DPAH_SIZE)); + memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); + } + + block_hash = ut_hash_ulint((block->page.space << 20) + block->page.space + + block->page.offset, DPAH_SIZE << 3); + block_hash_byte = block_hash >> 3; + block_hash_offset = (byte) block_hash & 0x07; + if (block_hash_byte >= DPAH_SIZE) + fprintf(stderr, "!!! block_hash_byte = %lu block_hash_offset = %d !!!\n", block_hash_byte, block_hash_offset); + if (block_hash_offset > 7) + fprintf(stderr, "!!! 
block_hash_byte = %lu block_hash_offset = %d !!!\n", block_hash_byte, block_hash_offset); + if ((trx->distinct_page_access_hash[block_hash_byte] & ((byte) 0x01 << block_hash_offset)) == 0) + trx->distinct_page_access++; + trx->distinct_page_access_hash[block_hash_byte] |= (byte) 0x01 << block_hash_offset; + return; +} + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + +Performance improvement: +------------------------ +Thread scheduling in NT may be so slow that the OS wait mechanism should +not be used even in waiting for disk reads to complete. +Rather, we should put waiting query threads to the queue of +waiting jobs, and let the OS thread do something useful while the i/o +is processed. In this way we could remove most OS thread switches in +an i/o-intensive benchmark like TPC-C. + +A possibility is to put a user space thread library between the database +and NT. User space thread libraries might be very fast. + +SQL Server 7.0 can be configured to use 'fibers' which are lightweight +threads in NT. These should be studied. + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. The control info which does not need to be stored +in the file along with the file page, resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains several mutexes which protect all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +We intend to make the buffer buf_pool size on-line reconfigurable, +that is, the buf_pool size can be changed without closing the database. +Then the database administarator may adjust it to be bigger +at night, for example. The control block array must +contain enough control blocks for the maximum buffer buf_pool size +which is used in the particular database. +If the buf_pool size is cut, we exploit the virtual memory mechanism of +the OS, and just refrain from using frames at high addresses. Then the OS +can swap them to disk. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. 
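+To illustrate the power-of-2 speedup (a sketch, not code from this
+file): with n_cells = 2^k the cell number can be computed with a
+bit mask instead of an integer division, i.e.
+
+	cell = hash_value & (n_cells - 1)
+
+rather than cell = hash_value % n_cells.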
+ + Lists of blocks + --------------- + +There are several lists of control blocks. + +The free list (buf_pool->free) contains blocks which are currently not +used. + +The common LRU list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool. This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the +end of the LRU list, we make sure that most of the buf_pool stays +in the main memory, undisturbed. + +The unzip_LRU list contains a subset of the common LRU list. The +blocks on the unzip_LRU list hold a compressed file page and the +corresponding uncompressed page frame. A block is in unzip_LRU if and +only if the predicate buf_page_belongs_to_unzip_LRU(&block->page) +holds. The blocks in unzip_LRU will be in same order as they are in +the common LRU list. That is, each manipulation of the common LRU +list will result in the same manipulation of the unzip_LRU list. + +The chain of modified blocks (buf_pool->flush_list) contains the blocks +holding file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. +The access to this list is protected by buf_pool->flush_list_mutex. + +The chain of unmodified compressed blocks (buf_pool->zip_clean) +contains the control blocks (buf_page_t) of those compressed pages +that are not in buf_pool->flush_list and for which no uncompressed +page has been allocated in the buffer pool. The control blocks for +uncompressed pages are accessible via buf_block_t objects that are +reachable via buf_pool->chunks[]. + +The chains of free memory blocks (buf_pool->zip_free[]) are used by +the buddy allocator (buf0buddy.cc) to keep track of currently unused +memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These +blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type +BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer +pool. The buddy allocator is solely used for allocating control +blocks for compressed pages (buf_page_t) and compressed page frames. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix field is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and resets the io_fix field +when the io operation completes. + +A thread may request the above operation using the function +buf_page_get(). It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. 
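+In outline, the sequence described above is (simplified):
+
+	block = take a block from the free list, or find a
+		replaceable one at the end of the LRU list
+	set the io_fix field of the block; x-lock the frame
+	queue the read i/o for the page
+	... the io-handler thread completes the read, resets
+	    io_fix and releases the x-lock ...
+	waiting threads are then granted their requested locks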
+ + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. +*/ + +#ifndef UNIV_HOTBACKUP +/** Value in microseconds */ +static const int WAIT_FOR_READ = 100; +/** Number of attemtps made to read in a page in the buffer pool */ +static const ulint BUF_PAGE_READ_MAX_RETRIES = 100; + +/** The buffer pools of the database */ +UNIV_INTERN buf_pool_t* buf_pool_ptr; + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +static ulint buf_dbg_counter = 0; /*!< This is used to insert validation + operations in execution in the + debug version */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_DEBUG +/** If this is set TRUE, the program prints info whenever +read-ahead or flush occurs */ +UNIV_INTERN ibool buf_debug_prints = FALSE; +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_PFS_RWLOCK +/* Keys to register buffer block related rwlocks and mutexes with +performance schema */ +UNIV_INTERN mysql_pfs_key_t buf_block_lock_key; +# ifdef UNIV_SYNC_DEBUG +UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key; +# endif /* UNIV_SYNC_DEBUG */ +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t buffer_block_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_zip_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_flush_state_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_LRU_list_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_free_list_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_zip_free_mutex_key; +UNIV_INTERN mysql_pfs_key_t buf_pool_zip_hash_mutex_key; +UNIV_INTERN mysql_pfs_key_t flush_list_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK +# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK + +/* Buffer block mutexes and rwlocks can be registered +in one group rather than individually. If PFS_GROUP_BUFFER_SYNC +is defined, register buffer block mutex and rwlock +in one group after their initialization. 
*/ +# define PFS_GROUP_BUFFER_SYNC + +/* This define caps the number of mutexes/rwlocks can +be registered with performance schema. Developers can +modify this define if necessary. Please note, this would +be effective only if PFS_GROUP_BUFFER_SYNC is defined. */ +# define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER ULINT_MAX + +# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */ +#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */ + +/** Macro to determine whether the read of write counter is used depending +on the io_type */ +#define MONITOR_RW_COUNTER(io_type, counter) \ + ((io_type == BUF_IO_READ) \ + ? (counter##_READ) \ + : (counter##_WRITTEN)) + +/********************************************************************//** +Gets the smallest oldest_modification lsn for any page in the pool. Returns +zero if all modified pages have been flushed to disk. +@return oldest modification in pool, zero if none */ +UNIV_INTERN +lsn_t +buf_pool_get_oldest_modification(void) +/*==================================*/ +{ + ulint i; + buf_page_t* bpage; + lsn_t lsn = 0; + lsn_t oldest_lsn = 0; + + /* When we traverse all the flush lists we don't want another + thread to add a dirty page to any flush list. */ + if (srv_buf_pool_instances > 1) + log_flush_order_mutex_enter(); + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_flush_list_mutex_enter(buf_pool); + + bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + + if (bpage != NULL) { + ut_ad(bpage->in_flush_list); + lsn = bpage->oldest_modification; + } + + buf_flush_list_mutex_exit(buf_pool); + + if (!oldest_lsn || oldest_lsn > lsn) { + oldest_lsn = lsn; + } + } + + if (srv_buf_pool_instances > 1) + log_flush_order_mutex_exit(); + + /* The returned answer may be out of date: the flush_list can + change after the mutex has been released. */ + + return(oldest_lsn); +} + +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_list_len( +/*===================*/ + ulint* LRU_len, /*!< out: length of all LRU lists */ + ulint* free_len, /*!< out: length of all free lists */ + ulint* flush_list_len) /*!< out: length of all flush lists */ +{ + ulint i; + + *LRU_len = 0; + *free_len = 0; + *flush_list_len = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + *LRU_len += UT_LIST_GET_LEN(buf_pool->LRU); + *free_len += UT_LIST_GET_LEN(buf_pool->free); + *flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list); + } +} + +/********************************************************************//** +Get total list size in bytes from all buffer pools. 
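+The counters are read without acquiring any buffer pool mutex, so
+the result is only approximate; it is meant for statistics.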
*/ +UNIV_INTERN +void +buf_get_total_list_size_in_bytes( +/*=============================*/ + buf_pools_list_size_t* buf_pools_list_size) /*!< out: list sizes + in all buffer pools */ +{ + ut_ad(buf_pools_list_size); + memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size)); + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + /* We don't need mutex protection since this is + for statistics purpose */ + buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes; + buf_pools_list_size->unzip_LRU_bytes += + UT_LIST_GET_LEN(buf_pool->unzip_LRU) * UNIV_PAGE_SIZE; + buf_pools_list_size->flush_list_bytes += + buf_pool->stat.flush_list_bytes; + } +} + +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_stat( +/*===============*/ + buf_pool_stat_t* tot_stat) /*!< out: buffer pool stats */ +{ + ulint i; + + memset(tot_stat, 0, sizeof(*tot_stat)); + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_stat_t*buf_stat; + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_stat = &buf_pool->stat; + tot_stat->n_page_gets += buf_stat->n_page_gets; + tot_stat->n_pages_read += buf_stat->n_pages_read; + tot_stat->n_pages_written += buf_stat->n_pages_written; + tot_stat->n_pages_created += buf_stat->n_pages_created; + tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd; + tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read; + tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted; + tot_stat->n_pages_made_young += buf_stat->n_pages_made_young; + + tot_stat->n_pages_not_made_young += + buf_stat->n_pages_not_made_young; + } +} + +/********************************************************************//** +Allocates a buffer block. +@return own: the allocated block, in state BUF_BLOCK_MEMORY */ +UNIV_INTERN +buf_block_t* +buf_block_alloc( +/*============*/ + buf_pool_t* buf_pool) /*!< in/out: buffer pool instance, + or NULL for round-robin selection + of the buffer pool */ +{ + buf_block_t* block; + ulint index; + static ulint buf_pool_index; + + if (buf_pool == NULL) { + /* We are allocating memory from any buffer pool, ensure + we spread the grace on all buffer pool instances. */ + index = buf_pool_index++ % srv_buf_pool_instances; + buf_pool = buf_pool_from_array(index); + } + + block = buf_LRU_get_free_block(buf_pool); + + buf_block_set_state(block, BUF_BLOCK_MEMORY); + + return(block); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Checks if a page is all zeroes. +@return TRUE if the page is all zeroes */ +bool +buf_page_is_zeroes( +/*===============*/ + const byte* read_buf, /*!< in: a database page */ + const ulint zip_size) /*!< in: size of compressed page; + 0 for uncompressed pages */ +{ + const ulint page_size = zip_size ? zip_size : UNIV_PAGE_SIZE; + + for (ulint i = 0; i < page_size; i++) { + if (read_buf[i] != 0) { + return(false); + } + } + return(true); +} + +/********************************************************************//** +Checks if a page is corrupt. 
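+In outline: for uncompressed pages the LSN stored at the start of
+the page is first compared with the low 4 bytes of the LSN stored
+at the end of the page; compressed pages are delegated to
+page_zip_verify_checksum(); otherwise the two checksum fields are
+validated according to srv_checksum_algorithm, with a page of all
+zeroes counting as not corrupted.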
+@return TRUE if corrupted */ +UNIV_INTERN +ibool +buf_page_is_corrupted( +/*==================*/ + bool check_lsn, /*!< in: true if we need to check + and complain about the LSN */ + const byte* read_buf, /*!< in: a database page */ + ulint zip_size) /*!< in: size of compressed page; + 0 for uncompressed pages */ +{ + ulint checksum_field1; + ulint checksum_field2; + ibool crc32_inited = FALSE; + ib_uint32_t crc32 = ULINT32_UNDEFINED; + + if (!zip_size + && memcmp(read_buf + FIL_PAGE_LSN + 4, + read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + + /* Stored log sequence numbers at the start and the end + of page do not match */ + + return(TRUE); + } + +#ifndef UNIV_HOTBACKUP + if (check_lsn && recv_lsn_checks_on) { + lsn_t current_lsn; + + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + + if (log_peek_lsn(¤t_lsn) + && current_lsn + < mach_read_from_8(read_buf + FIL_PAGE_LSN)) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: page %lu log sequence number" + " " LSN_PF "\n" + "InnoDB: is in the future! Current system " + "log sequence number " LSN_PF ".\n" + "InnoDB: Your database may be corrupt or " + "you may have copied the InnoDB\n" + "InnoDB: tablespace but not the InnoDB " + "log files. See\n" + "InnoDB: " REFMAN + "forcing-innodb-recovery.html\n" + "InnoDB: for more information.\n", + (ulong) mach_read_from_4( + read_buf + FIL_PAGE_OFFSET), + (lsn_t) mach_read_from_8( + read_buf + FIL_PAGE_LSN), + current_lsn); + } + } +#endif + + /* Check whether the checksum fields have correct values */ + + if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) { + return(FALSE); + } + + if (zip_size) { + return(!page_zip_verify_checksum(read_buf, zip_size)); + } + + checksum_field1 = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + + checksum_field2 = mach_read_from_4( + read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); + +#if FIL_PAGE_LSN % 8 +#error "FIL_PAGE_LSN must be 64 bit aligned" +#endif + + /* declare empty pages non-corrupted */ + if (checksum_field1 == 0 && checksum_field2 == 0 + && *reinterpret_cast<const ib_uint64_t*>(read_buf + + FIL_PAGE_LSN) == 0) { + /* make sure that the page is really empty */ + for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) { + if (read_buf[i] != 0) { + return(TRUE); + } + } + + return(FALSE); + } + + switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + + crc32 = buf_calc_page_crc32(read_buf); + + return(checksum_field1 != crc32 || checksum_field2 != crc32); + + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + + return(checksum_field1 + != buf_calc_page_new_checksum(read_buf) + || checksum_field2 + != buf_calc_page_old_checksum(read_buf)); + + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + + return(checksum_field1 != BUF_NO_CHECKSUM_MAGIC + || checksum_field2 != BUF_NO_CHECKSUM_MAGIC); + + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_INNODB: + /* There are 3 valid formulas for + checksum_field2 (old checksum field): + + 1. Very old versions of InnoDB only stored 8 byte lsn to the + start and the end of the page. + + 2. InnoDB versions before MySQL 5.6.3 store the old formula + checksum (buf_calc_page_old_checksum()). + + 3. InnoDB versions 5.6.3 and newer with + innodb_checksum_algorithm=strict_crc32|crc32 store CRC32. 
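+In other words, under the non-strict settings the old field is
+accepted (summarizing the code below) if it equals any of:
+
+	the 4 bytes stored at FIL_PAGE_LSN	(formula 1)
+	buf_calc_page_old_checksum(read_buf)	(formula 2)
+	buf_calc_page_crc32(read_buf)		(formula 3)
+	BUF_NO_CHECKSUM_MAGIC			(checksums disabled)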
*/ + + /* since innodb_checksum_algorithm is not strict_* allow + any of the algos to match for the old field */ + + if (checksum_field2 + != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) { + + /* The checksum does not match any of the + fast to check. First check the selected algorithm + for writing checksums because we assume that the + chance of it matching is higher. */ + + if (srv_checksum_algorithm + == SRV_CHECKSUM_ALGORITHM_CRC32) { + + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = TRUE; + + if (checksum_field2 != crc32 + && checksum_field2 + != buf_calc_page_old_checksum(read_buf)) { + + return(TRUE); + } + } else { + ut_ad(srv_checksum_algorithm + == SRV_CHECKSUM_ALGORITHM_INNODB); + + if (checksum_field2 + != buf_calc_page_old_checksum(read_buf)) { + + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = TRUE; + + if (checksum_field2 != crc32) { + return(TRUE); + } + } + } + } + + /* old field is fine, check the new field */ + + /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id + (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */ + + if (checksum_field1 != 0 + && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) { + + /* The checksum does not match any of the + fast to check. First check the selected algorithm + for writing checksums because we assume that the + chance of it matching is higher. */ + + if (srv_checksum_algorithm + == SRV_CHECKSUM_ALGORITHM_CRC32) { + + if (!crc32_inited) { + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = TRUE; + } + + if (checksum_field1 != crc32 + && checksum_field1 + != buf_calc_page_new_checksum(read_buf)) { + + return(TRUE); + } + } else { + ut_ad(srv_checksum_algorithm + == SRV_CHECKSUM_ALGORITHM_INNODB); + + if (checksum_field1 + != buf_calc_page_new_checksum(read_buf)) { + + if (!crc32_inited) { + crc32 = buf_calc_page_crc32( + read_buf); + crc32_inited = TRUE; + } + + if (checksum_field1 != crc32) { + return(TRUE); + } + } + } + } + + /* If CRC32 is stored in at least one of the fields, then the + other field must also be CRC32 */ + if (crc32_inited + && ((checksum_field1 == crc32 + && checksum_field2 != crc32) + || (checksum_field1 != crc32 + && checksum_field2 == crc32))) { + + return(TRUE); + } + + break; + case SRV_CHECKSUM_ALGORITHM_NONE: + /* should have returned FALSE earlier */ + ut_error; + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } + + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); ); + + return(FALSE); +} + +/********************************************************************//** +Prints a page to stderr. */ +UNIV_INTERN +void +buf_page_print( +/*===========*/ + const byte* read_buf, /*!< in: a database page */ + ulint zip_size, /*!< in: compressed page size, or + 0 for uncompressed pages */ + ulint flags) /*!< in: 0 or + BUF_PAGE_PRINT_NO_CRASH or + BUF_PAGE_PRINT_NO_FULL */ + +{ +#ifndef UNIV_HOTBACKUP + dict_index_t* index; +#endif /* !UNIV_HOTBACKUP */ + ulint size = zip_size; + + if (!read_buf) { + fprintf(stderr, + " InnoDB: Not dumping page as (in memory) pointer " + "is NULL\n"); + return; + } + + if (!size) { + size = UNIV_PAGE_SIZE; + } + + if (!(flags & BUF_PAGE_PRINT_NO_FULL)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Page dump in ascii and hex (%lu bytes):\n", + (ulong) size); + ut_print_buf(stderr, read_buf, size); + fputs("\nInnoDB: End of page dump\n", stderr); + } + + if (zip_size) { + /* Print compressed page. 
*/ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Compressed page type (" ULINTPF "); " + "stored checksum in field1 " ULINTPF "; " + "calculated checksums for field1: " + "%s " ULINTPF ", " + "%s " ULINTPF ", " + "%s " ULINTPF "; " + "page LSN " LSN_PF "; " + "page number (if stored to page already) " ULINTPF "; " + "space id (if stored to page already) " ULINTPF "\n", + fil_page_get_type(read_buf), + mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + buf_checksum_algorithm_name( + SRV_CHECKSUM_ALGORITHM_CRC32), + page_zip_calc_checksum(read_buf, zip_size, + SRV_CHECKSUM_ALGORITHM_CRC32), + buf_checksum_algorithm_name( + SRV_CHECKSUM_ALGORITHM_INNODB), + page_zip_calc_checksum(read_buf, zip_size, + SRV_CHECKSUM_ALGORITHM_INNODB), + buf_checksum_algorithm_name( + SRV_CHECKSUM_ALGORITHM_NONE), + page_zip_calc_checksum(read_buf, zip_size, + SRV_CHECKSUM_ALGORITHM_NONE), + mach_read_from_8(read_buf + FIL_PAGE_LSN), + mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + mach_read_from_4(read_buf + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + } else { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: uncompressed page, " + "stored checksum in field1 " ULINTPF ", " + "calculated checksums for field1: " + "%s " UINT32PF ", " + "%s " ULINTPF ", " + "%s " ULINTPF ", " + + "stored checksum in field2 " ULINTPF ", " + "calculated checksums for field2: " + "%s " UINT32PF ", " + "%s " ULINTPF ", " + "%s " ULINTPF ", " + + "page LSN " ULINTPF " " ULINTPF ", " + "low 4 bytes of LSN at page end " ULINTPF ", " + "page number (if stored to page already) " ULINTPF ", " + "space id (if created with >= MySQL-4.1.1 " + "and stored already) %lu\n", + mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32), + buf_calc_page_crc32(read_buf), + buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB), + buf_calc_page_new_checksum(read_buf), + buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE), + BUF_NO_CHECKSUM_MAGIC, + + mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM), + buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32), + buf_calc_page_crc32(read_buf), + buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB), + buf_calc_page_old_checksum(read_buf), + buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE), + BUF_NO_CHECKSUM_MAGIC, + + mach_read_from_4(read_buf + FIL_PAGE_LSN), + mach_read_from_4(read_buf + FIL_PAGE_LSN + 4), + mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + mach_read_from_4(read_buf + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + } + +#ifndef UNIV_HOTBACKUP + if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_INSERT) { + fprintf(stderr, + "InnoDB: Page may be an insert undo log page\n"); + } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_UPDATE) { + fprintf(stderr, + "InnoDB: Page may be an update undo log page\n"); + } +#endif /* !UNIV_HOTBACKUP */ + + switch (fil_page_get_type(read_buf)) { + index_id_t index_id; + case FIL_PAGE_INDEX: + index_id = btr_page_get_index_id(read_buf); + fprintf(stderr, + "InnoDB: Page may be an index page where" + " index id is %llu\n", + (ullint) index_id); +#ifndef UNIV_HOTBACKUP + index = dict_index_find_on_id_low(index_id); + if (index) { + fputs("InnoDB: (", stderr); + dict_index_name_print(stderr, NULL, index); + fputs(")\n", stderr); + } +#endif /* !UNIV_HOTBACKUP */ + break; + case 
FIL_PAGE_INODE: + fputs("InnoDB: Page may be an 'inode' page\n", stderr); + break; + case FIL_PAGE_IBUF_FREE_LIST: + fputs("InnoDB: Page may be an insert buffer free list page\n", + stderr); + break; + case FIL_PAGE_TYPE_ALLOCATED: + fputs("InnoDB: Page may be a freshly allocated page\n", + stderr); + break; + case FIL_PAGE_IBUF_BITMAP: + fputs("InnoDB: Page may be an insert buffer bitmap page\n", + stderr); + break; + case FIL_PAGE_TYPE_SYS: + fputs("InnoDB: Page may be a system page\n", + stderr); + break; + case FIL_PAGE_TYPE_TRX_SYS: + fputs("InnoDB: Page may be a transaction system page\n", + stderr); + break; + case FIL_PAGE_TYPE_FSP_HDR: + fputs("InnoDB: Page may be a file space header page\n", + stderr); + break; + case FIL_PAGE_TYPE_XDES: + fputs("InnoDB: Page may be an extent descriptor page\n", + stderr); + break; + case FIL_PAGE_TYPE_BLOB: + fputs("InnoDB: Page may be a BLOB page\n", + stderr); + break; + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + fputs("InnoDB: Page may be a compressed BLOB page\n", + stderr); + break; + } + + ut_ad(flags & BUF_PAGE_PRINT_NO_CRASH); +} + +#ifndef UNIV_HOTBACKUP + +# ifdef PFS_GROUP_BUFFER_SYNC +/********************************************************************//** +This function registers mutexes and rwlocks in buffer blocks with +performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is +defined to be a value less than chunk->size, then only mutexes +and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER +blocks are registered. */ +static +void +pfs_register_buffer_block( +/*======================*/ + buf_chunk_t* chunk) /*!< in/out: chunk of buffers */ +{ + ulint i; + ulint num_to_register; + buf_block_t* block; + + block = chunk->blocks; + + num_to_register = ut_min(chunk->size, + PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER); + + for (i = 0; i < num_to_register; i++) { + ib_mutex_t* mutex; + rw_lock_t* rwlock; + +# ifdef UNIV_PFS_MUTEX + mutex = &block->mutex; + ut_a(!mutex->pfs_psi); + mutex->pfs_psi = (PSI_server) + ? PSI_server->init_mutex(buffer_block_mutex_key, mutex) + : NULL; +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK + rwlock = &block->lock; + ut_a(!rwlock->pfs_psi); + rwlock->pfs_psi = (PSI_server) + ? PSI_server->init_rwlock(buf_block_lock_key, rwlock) + : NULL; + +# ifdef UNIV_SYNC_DEBUG + rwlock = &block->debug_latch; + ut_a(!rwlock->pfs_psi); + rwlock->pfs_psi = (PSI_server) + ? PSI_server->init_rwlock(buf_block_debug_latch_key, + rwlock) + : NULL; +# endif /* UNIV_SYNC_DEBUG */ + +# endif /* UNIV_PFS_RWLOCK */ + block++; + } +} +# endif /* PFS_GROUP_BUFFER_SYNC */ + +/********************************************************************//** +Initializes a buffer control block when the buf_pool is created. 
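+It sets up the frame pointer and the initial block state, resets
+the debug flags, and creates the per-block mutex and rw-locks
+(optionally registering them with performance schema).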
*/ +static +void +buf_block_init( +/*===========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_block_t* block, /*!< in: pointer to control block */ + byte* frame) /*!< in: pointer to buffer frame */ +{ + UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE); + + block->frame = frame; + + block->page.buf_pool_index = buf_pool_index(buf_pool); + block->page.state = BUF_BLOCK_NOT_USED; + block->page.buf_fix_count = 0; + block->page.io_fix = BUF_IO_NONE; + + block->modify_clock = 0; + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + block->page.file_page_was_freed = FALSE; +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ + + block->check_index_page_at_flush = FALSE; + block->index = NULL; + +#ifdef UNIV_DEBUG + block->page.in_page_hash = FALSE; + block->page.in_zip_hash = FALSE; + block->page.in_flush_list = FALSE; + block->page.in_free_list = FALSE; + block->page.in_LRU_list = FALSE; + block->in_unzip_LRU_list = FALSE; +#endif /* UNIV_DEBUG */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers = 0; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + page_zip_des_init(&block->page.zip); + +#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC + /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration + of buffer block mutex/rwlock with performance schema. If + PFS_GROUP_BUFFER_SYNC is defined, skip the registration + since buffer block mutex/rwlock will be registered later in + pfs_register_buffer_block() */ + + mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK); + rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING); + +# ifdef UNIV_SYNC_DEBUG + rw_lock_create(PFS_NOT_INSTRUMENTED, + &block->debug_latch, SYNC_NO_ORDER_CHECK); +# endif /* UNIV_SYNC_DEBUG */ + +#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ + mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK); + rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING); + +# ifdef UNIV_SYNC_DEBUG + rw_lock_create(buf_block_debug_latch_key, + &block->debug_latch, SYNC_NO_ORDER_CHECK); +# endif /* UNIV_SYNC_DEBUG */ +#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ + + ut_ad(rw_lock_validate(&(block->lock))); +} + +/********************************************************************//** +Allocates a chunk of buffer frames. +@return chunk, or NULL on failure */ +static +buf_chunk_t* +buf_chunk_init( +/*===========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_chunk_t* chunk, /*!< out: chunk of buffers */ + ulint mem_size, /*!< in: requested size in bytes */ + ibool populate) /*!< in: virtual page preallocation */ +{ + buf_block_t* block; + byte* frame; + ulint i; + ulint size_target; + + /* Round down to a multiple of page size, + although it already should be. */ + mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE); + size_target = (mem_size / UNIV_PAGE_SIZE) - 1; + /* Reserve space for the block descriptors. */ + mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block) + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); + + chunk->mem_size = mem_size; + chunk->mem = os_mem_alloc_large(&chunk->mem_size, populate); + + if (UNIV_UNLIKELY(chunk->mem == NULL)) { + + return(NULL); + } + + /* Allocate the block descriptors from + the start of the memory block. */ + chunk->blocks = (buf_block_t*) chunk->mem; + + /* Align a pointer to the first frame. Note that when + os_large_page_size is smaller than UNIV_PAGE_SIZE, + we may allocate one fewer block than requested. 
When + it is bigger, we may allocate more blocks than requested. */ + + frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE); + chunk->size = chunk->mem_size / UNIV_PAGE_SIZE + - (frame != chunk->mem); + + /* Subtract the space needed for block descriptors. */ + { + ulint size = chunk->size; + + while (frame < (byte*) (chunk->blocks + size)) { + frame += UNIV_PAGE_SIZE; + size--; + } + + chunk->size = size; + } + + if (chunk->size > size_target) { + chunk->size = size_target; + } + + /* Init block structs and assign frames for them. Then we + assign the frames to the first blocks (we already mapped the + memory above). */ + + block = chunk->blocks; + + for (i = chunk->size; i--; ) { + + buf_block_init(buf_pool, block, frame); + UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE); + + /* Add the block to the free list */ + UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page)); + + ut_d(block->page.in_free_list = TRUE); + ut_ad(buf_pool_from_block(block) == buf_pool); + + block++; + frame += UNIV_PAGE_SIZE; + } + +#ifdef PFS_GROUP_BUFFER_SYNC + pfs_register_buffer_block(chunk); +#endif + return(chunk); +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Finds a block in the given buffer chunk that points to a +given compressed page. +@return buffer block pointing to the compressed page, or NULL */ +static +buf_block_t* +buf_chunk_contains_zip( +/*===================*/ + buf_chunk_t* chunk, /*!< in: chunk being checked */ + const void* data) /*!< in: pointer to compressed page */ +{ + buf_block_t* block; + ulint i; + + block = chunk->blocks; + + for (i = chunk->size; i--; block++) { + if (block->page.zip.data == data) { + + return(block); + } + } + + return(NULL); +} + +/*********************************************************************//** +Finds a block in the buffer pool that points to a +given compressed page. +@return buffer block pointing to the compressed page, or NULL */ +UNIV_INTERN +buf_block_t* +buf_pool_contains_zip( +/*==================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const void* data) /*!< in: pointer to compressed page */ +{ + ulint n; + buf_chunk_t* chunk = buf_pool->chunks; + + ut_ad(buf_pool); + for (n = buf_pool->n_chunks; n--; chunk++) { + + buf_block_t* block = buf_chunk_contains_zip(chunk, data); + + if (block) { + return(block); + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Checks that all file pages in the buffer chunk are in a replaceable state. +@return address of a non-free block, or NULL if all freed */ +static +const buf_block_t* +buf_chunk_not_freed( +/*================*/ + buf_chunk_t* chunk) /*!< in: chunk being checked */ +{ + buf_block_t* block; + ulint i; + + block = chunk->blocks; + + for (i = chunk->size; i--; block++) { + ibool ready; + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + /* The uncompressed buffer pool should never + contain compressed block descriptors. */ + ut_error; + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + /* Skip blocks that are not being used for + file pages. 
*/ + break; + case BUF_BLOCK_FILE_PAGE: + mutex_enter(&block->mutex); + ready = buf_flush_ready_for_replace(&block->page); + mutex_exit(&block->mutex); + + if (UNIV_UNLIKELY(block->page.is_corrupt)) { + /* corrupt page may remain, it can be + skipped */ + break; + } + + if (!ready) { + + return(block); + } + + break; + } + } + + return(NULL); +} + +/********************************************************************//** +Set buffer pool size variables after resizing it */ +static +void +buf_pool_set_sizes(void) +/*====================*/ +{ + ulint i; + ulint curr_size = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + curr_size += buf_pool->curr_pool_size; + } + + srv_buf_pool_curr_size = curr_size; + srv_buf_pool_old_size = srv_buf_pool_size; +} + +/********************************************************************//** +Initialize a buffer pool instance. +@return DB_SUCCESS if all goes well. */ +UNIV_INTERN +ulint +buf_pool_init_instance( +/*===================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint buf_pool_size, /*!< in: size in bytes */ + ibool populate, /*!< in: virtual page preallocation */ + ulint instance_no) /*!< in: id of the instance */ +{ + ulint i; + buf_chunk_t* chunk; + + /* 1. Initialize general fields + ------------------------------- */ + mutex_create(buf_pool_LRU_list_mutex_key, + &buf_pool->LRU_list_mutex, SYNC_BUF_LRU_LIST); + mutex_create(buf_pool_free_list_mutex_key, + &buf_pool->free_list_mutex, SYNC_BUF_FREE_LIST); + mutex_create(buf_pool_zip_free_mutex_key, + &buf_pool->zip_free_mutex, SYNC_BUF_ZIP_FREE); + mutex_create(buf_pool_zip_hash_mutex_key, + &buf_pool->zip_hash_mutex, SYNC_BUF_ZIP_HASH); + mutex_create(buf_pool_zip_mutex_key, + &buf_pool->zip_mutex, SYNC_BUF_BLOCK); + mutex_create(buf_pool_flush_state_mutex_key, + &buf_pool->flush_state_mutex, SYNC_BUF_FLUSH_STATE); + + if (buf_pool_size > 0) { + buf_pool->n_chunks = 1; + + buf_pool->chunks = chunk = + (buf_chunk_t*) mem_zalloc(sizeof *chunk); + + UT_LIST_INIT(buf_pool->free); + + if (!buf_chunk_init(buf_pool, chunk, buf_pool_size, populate)) { + mem_free(chunk); + mem_free(buf_pool); + + return(DB_ERROR); + } + + buf_pool->instance_no = instance_no; + buf_pool->old_pool_size = buf_pool_size; + buf_pool->curr_size = chunk->size; + buf_pool->read_ahead_area + = ut_min(64, ut_2_power_up(buf_pool->curr_size / 32)); + buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE; + + /* Number of locks protecting page_hash must be a + power of two */ + srv_n_page_hash_locks = static_cast<ulong>( + ut_2_power_up(srv_n_page_hash_locks)); + ut_a(srv_n_page_hash_locks != 0); + ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS); + + buf_pool->page_hash = ha_create(2 * buf_pool->curr_size, + srv_n_page_hash_locks, + MEM_HEAP_FOR_PAGE_HASH, + SYNC_BUF_PAGE_HASH); + + buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size); + + buf_pool->last_printout_time = ut_time(); + } + /* 2. Initialize flushing fields + -------------------------------- */ + + mutex_create(flush_list_mutex_key, &buf_pool->flush_list_mutex, + SYNC_BUF_FLUSH_LIST); + + for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) { + buf_pool->no_flush[i] = os_event_create(); + } + + buf_pool->watch = (buf_page_t*) mem_zalloc( + sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE); + + /* All fields are initialized by mem_zalloc(). 
*/
+
+        buf_pool->try_LRU_scan = TRUE;
+
+        return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Free one buffer pool instance */
+static
+void
+buf_pool_free_instance(
+/*===================*/
+        buf_pool_t* buf_pool) /* in,own: buffer pool instance
+                              to free */
+{
+        buf_chunk_t* chunk;
+        buf_chunk_t* chunks;
+        buf_page_t* bpage;
+
+        bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+        while (bpage != NULL) {
+                buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+                enum buf_page_state state = buf_page_get_state(bpage);
+
+                ut_ad(buf_page_in_file(bpage));
+                ut_ad(bpage->in_LRU_list);
+
+                if (state != BUF_BLOCK_FILE_PAGE) {
+                        /* We must not have any dirty block except
+                        when doing a fast shutdown. */
+                        ut_ad(state == BUF_BLOCK_ZIP_PAGE
+                              || srv_fast_shutdown == 2);
+                        buf_page_free_descriptor(bpage);
+                }
+
+                bpage = prev_bpage;
+        }
+
+        mem_free(buf_pool->watch);
+        buf_pool->watch = NULL;
+
+        chunks = buf_pool->chunks;
+        chunk = chunks + buf_pool->n_chunks;
+
+        while (--chunk >= chunks) {
+                os_mem_free_large(chunk->mem, chunk->mem_size);
+        }
+
+        mem_free(buf_pool->chunks);
+        ha_clear(buf_pool->page_hash);
+        hash_table_free(buf_pool->page_hash);
+        hash_table_free(buf_pool->zip_hash);
+}
+
+/********************************************************************//**
+Creates the buffer pool.
+@return DB_SUCCESS on success, DB_ERROR if not enough memory or error */
+UNIV_INTERN
+dberr_t
+buf_pool_init(
+/*==========*/
+        ulint total_size, /*!< in: size of the total pool in bytes */
+        ibool populate, /*!< in: virtual page preallocation */
+        ulint n_instances) /*!< in: number of instances */
+{
+        ulint i;
+        const ulint size = total_size / n_instances;
+
+        ut_ad(n_instances > 0);
+        ut_ad(n_instances <= MAX_BUFFER_POOLS);
+        ut_ad(n_instances == srv_buf_pool_instances);
+
+        buf_pool_ptr = (buf_pool_t*) mem_zalloc(
+                n_instances * sizeof *buf_pool_ptr);
+
+        for (i = 0; i < n_instances; i++) {
+                buf_pool_t* ptr = &buf_pool_ptr[i];
+
+                if (buf_pool_init_instance(ptr, size, populate, i) != DB_SUCCESS) {
+
+                        /* Free all the instances created so far. */
+                        buf_pool_free(i);
+
+                        return(DB_ERROR);
+                }
+        }
+
+        buf_pool_set_sizes();
+        buf_LRU_old_ratio_update(100 * 3 / 8, FALSE);
+
+        btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
+
+        return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(
+/*==========*/
+        ulint n_instances) /*!< in: number of instances to free */
+{
+        ulint i;
+
+        for (i = 0; i < n_instances; i++) {
+                buf_pool_free_instance(buf_pool_from_array(i));
+        }
+
+        mem_free(buf_pool_ptr);
+        buf_pool_ptr = NULL;
+}
+
+/********************************************************************//**
+Clears the adaptive hash index on all pages in the buffer pool.
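The sizing in buf_pool_init() above is simple integer arithmetic: the requested total is split evenly across instances, and the adaptive hash index is then sized from the current pool size. A sketch of both computations (function names are illustrative):

#include <stddef.h>

// With an 8 GiB pool and 8 instances, each instance manages 1 GiB.
static size_t per_instance_bytes(size_t total_size, size_t n_instances)
{
        return total_size / n_instances;
}

// Cell count handed to btr_search_sys_create() above: one cell per
// 64 pointer-sized slots of buffer pool memory.
static size_t ahi_cells(size_t pool_bytes)
{
        return pool_bytes / sizeof(void*) / 64;
}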
*/ +UNIV_INTERN +void +buf_pool_clear_hash_index(void) +/*===========================*/ +{ + ulint p; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(btr_search_own_all(RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!btr_search_enabled); + + for (p = 0; p < srv_buf_pool_instances; p++) { + buf_pool_t* buf_pool = buf_pool_from_array(p); + buf_chunk_t* chunks = buf_pool->chunks; + buf_chunk_t* chunk = chunks + buf_pool->n_chunks; + + while (--chunk >= chunks) { + buf_block_t* block = chunk->blocks; + ulint i = chunk->size; + + for (; i--; block++) { + dict_index_t* index = block->index; + + /* We can set block->index = NULL + when we have an x-latch on btr_search_latch; + see the comment in buf0buf.h */ + + if (!index) { + /* Not hashed */ + continue; + } + + block->index = NULL; +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers = 0; +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + } + } + } +} + +/********************************************************************//** +Relocate a buffer control block. Relocates the block on the LRU list +and in buf_pool->page_hash. Does not relocate bpage->list. +The caller must take care of relocating bpage->list. */ +UNIV_INTERN +void +buf_relocate( +/*=========*/ + buf_page_t* bpage, /*!< in/out: control block being relocated; + buf_page_get_state(bpage) must be + BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ + buf_page_t* dpage) /*!< in/out: destination control block */ +{ + buf_page_t* b; + ulint fold; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + fold = buf_page_address_fold(bpage->space, bpage->offset); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage)); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + ut_a(bpage->buf_fix_count == 0); + ut_ad(bpage->in_LRU_list); + ut_ad(!bpage->in_zip_hash); + ut_ad(bpage->in_page_hash); + ut_ad(bpage == buf_page_hash_get_low(buf_pool, + bpage->space, + bpage->offset, + fold)); + + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); +#ifdef UNIV_DEBUG + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_FILE_PAGE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_ZIP_PAGE: + break; + } +#endif /* UNIV_DEBUG */ + + memcpy(dpage, bpage, sizeof *dpage); + + ut_d(bpage->in_LRU_list = FALSE); + ut_d(bpage->in_page_hash = FALSE); + + /* relocate buf_pool->LRU */ + b = UT_LIST_GET_PREV(LRU, bpage); + UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage); + + if (b) { + UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage); + } else { + UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage); + } + + if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) { + buf_pool->LRU_old = dpage; +#ifdef UNIV_LRU_DEBUG + /* buf_pool->LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool->LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); + } else { + /* Check that the "old" flag is consistent in + the block and its neighbours. 
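buf_relocate() above follows a classic pattern: copy the control block wholesale, then splice the copy into the old block's position in the intrusive list. A self-contained sketch of that splice on a plain doubly-linked list:

typedef struct node {
        struct node* prev;
        struct node* next;
} node_t;

// Replace "oldn" by "newn" in place: struct copy (the analogue of
// memcpy(dpage, bpage, sizeof *dpage)), then repoint the neighbours.
static void list_relocate(node_t** head, node_t* oldn, node_t* newn)
{
        *newn = *oldn;
        if (newn->prev != NULL) {
                newn->prev->next = newn;
        } else {
                *head = newn;           // oldn was the list head
        }
        if (newn->next != NULL) {
                newn->next->prev = newn;
        }
}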
*/ + buf_page_set_old(dpage, buf_page_is_old(dpage)); +#endif /* UNIV_LRU_DEBUG */ + } + + ut_d(UT_LIST_VALIDATE( + LRU, buf_page_t, buf_pool->LRU, CheckInLRUList())); + + /* relocate buf_pool->page_hash */ + HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage); +} + +/********************************************************************//** +Determine if a block is a sentinel for a buffer pool watch. +@return TRUE if a sentinel for a buffer pool watch, FALSE if not */ +UNIV_INTERN +ibool +buf_pool_watch_is_sentinel( +/*=======================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + const buf_page_t* bpage) /*!< in: block */ +{ + /* We must also own the appropriate hash lock. */ + ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage)); + ut_ad(buf_page_in_file(bpage)); + + if (bpage < &buf_pool->watch[0] + || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) { + + ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE + || bpage->zip.data != NULL); + + return(FALSE); + } + + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); + ut_ad(!bpage->in_zip_hash); + ut_ad(bpage->in_page_hash); + ut_ad(bpage->zip.data == NULL); + ut_ad(bpage->buf_fix_count > 0); + return(TRUE); +} + +/****************************************************************//** +Add watch for the given page to be read in. Caller must have +appropriate hash_lock for the bpage and hold the LRU list mutex to avoid a race +condition with buf_LRU_free_page inserting the same page into the page hash. +This function may release the hash_lock and reacquire it. +@return NULL if watch set, block if the page is in the buffer pool */ +UNIV_INTERN +buf_page_t* +buf_pool_watch_set( +/*===============*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + ulint fold) /*!< in: buf_page_address_fold(space, offset) */ +{ + buf_page_t* bpage; + ulint i; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + prio_rw_lock_t* hash_lock; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + hash_lock = buf_page_hash_lock_get(buf_pool, fold); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + + if (bpage != NULL) { +page_found: + if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) { + /* The page was loaded meanwhile. */ + return(bpage); + } + + /* Add to an existing watch. */ +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32(&bpage->buf_fix_count, 1); +#else + ++bpage->buf_fix_count; +#endif /* PAGE_ATOMIC_REF_COUNT */ + return(NULL); + } + + /* From this point this function becomes fairly heavy in terms + of latching. We acquire all the hash_locks. They are needed + because we don't want to read any stale information in + buf_pool->watch[]. However, it is not in the critical code path + as this function will be called only by the purge thread. */ + + + /* To obey latching order first release the hash_lock. */ + rw_lock_x_unlock(hash_lock); + + hash_lock_x_all(buf_pool->page_hash); + + /* We have to recheck that the page + was not loaded or a watch set by some other + purge thread. This is because of the small + time window between when we release the + hash_lock to acquire all the hash locks above. 
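Note how buf_pool_watch_is_sentinel() above decides sentinel-ness purely by address: a descriptor is a watch sentinel exactly when it lies inside the buf_pool->watch[] array. Reduced to its essentials, with a stand-in descriptor type:

#include <stddef.h>

typedef struct { int state; } desc_t;   // stand-in for buf_page_t

// True iff "p" points into watch[0 .. n_watch-1].
static int is_watch_sentinel(const desc_t* p,
                             const desc_t* watch, size_t n_watch)
{
        return p >= watch && p < watch + n_watch;
}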
*/ + + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + if (UNIV_LIKELY_NULL(bpage)) { + hash_unlock_x_all_but(buf_pool->page_hash, hash_lock); + goto page_found; + } + + for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) { + bpage = &buf_pool->watch[i]; + + ut_ad(bpage->access_time == 0); + ut_ad(bpage->newest_modification == 0); + ut_ad(bpage->oldest_modification == 0); + ut_ad(bpage->zip.data == NULL); + ut_ad(!bpage->in_zip_hash); + + switch (bpage->state) { + case BUF_BLOCK_POOL_WATCH: + ut_ad(!bpage->in_page_hash); + ut_ad(bpage->buf_fix_count == 0); + + bpage->state = BUF_BLOCK_ZIP_PAGE; + bpage->space = static_cast<ib_uint32_t>(space); + bpage->offset = static_cast<ib_uint32_t>(offset); + bpage->buf_fix_count = 1; + bpage->buf_pool_index = buf_pool_index(buf_pool); + + ut_d(bpage->in_page_hash = TRUE); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, + fold, bpage); + + /* Once the sentinel is in the page_hash we can + safely release all locks except just the + relevant hash_lock */ + hash_unlock_x_all_but(buf_pool->page_hash, + hash_lock); + + return(NULL); + case BUF_BLOCK_ZIP_PAGE: + ut_ad(bpage->in_page_hash); + ut_ad(bpage->buf_fix_count > 0); + break; + default: + ut_error; + } + } + + /* Allocation failed. Either the maximum number of purge + threads should never exceed BUF_POOL_WATCH_SIZE, or this code + should be modified to return a special non-NULL value and the + caller should purge the record directly. */ + ut_error; + + /* Fix compiler warning */ + return(NULL); +} + +/****************************************************************//** +Remove the sentinel block for the watch before replacing it with a real block. +buf_page_watch_clear() or buf_page_watch_occurred() will notice that +the block has been replaced with the real block. +@return reference count, to be added to the replacement block */ +static +void +buf_pool_watch_remove( +/*==================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint fold, /*!< in: buf_page_address_fold( + space, offset) */ + buf_page_t* watch) /*!< in/out: sentinel for watch */ +{ +#ifdef UNIV_SYNC_DEBUG + /* We must also own the appropriate hash_bucket mutex. */ + prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); + ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(buf_page_get_state(watch) == BUF_BLOCK_ZIP_PAGE); + + HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch); + ut_d(watch->in_page_hash = FALSE); + watch->buf_fix_count = 0; + watch->state = BUF_BLOCK_POOL_WATCH; +} + +/****************************************************************//** +Stop watching if the page has been read in. +buf_pool_watch_set(space,offset) must have returned NULL before. */ +UNIV_INTERN +void +buf_pool_watch_unset( +/*=================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: page number */ +{ + buf_page_t* bpage; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + ulint fold = buf_page_address_fold(space, offset); + prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); + + rw_lock_x_lock(hash_lock); + + /* The page must exist because buf_pool_watch_set() increments + buf_fix_count. 
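The BUF_POOL_WATCH_SIZE loop above is, in effect, a fixed-size slot allocator: scan for a free sentinel, stamp it with the page id, and pin it with a reference count of one. A compact model of that step (all names and types are illustrative):

#include <stddef.h>

enum { SLOT_FREE, SLOT_IN_USE };

typedef struct {
        int state;
        unsigned space;
        unsigned offset;
        unsigned fix_count;
} watch_slot_t;

static watch_slot_t* watch_claim(watch_slot_t* slots, size_t n,
                                 unsigned space, unsigned offset)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (slots[i].state == SLOT_FREE) {
                        slots[i].state = SLOT_IN_USE;
                        slots[i].space = space;
                        slots[i].offset = offset;
                        slots[i].fix_count = 1; // pinned, like buf_fix_count = 1
                        return &slots[i];
                }
        }
        return NULL;    // the original hits ut_error here: must not happen
}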
*/ + + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + + if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) { + buf_block_unfix(reinterpret_cast<buf_block_t*>(bpage)); + } else { + + ut_ad(bpage->buf_fix_count > 0); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_decrement_uint32(&bpage->buf_fix_count, 1); +#else + --bpage->buf_fix_count; +#endif /* PAGE_ATOMIC_REF_COUNT */ + + if (bpage->buf_fix_count == 0) { + buf_pool_watch_remove(buf_pool, fold, bpage); + } + } + + rw_lock_x_unlock(hash_lock); +} + +/****************************************************************//** +Check if the page has been read in. +This may only be called after buf_pool_watch_set(space,offset) +has returned NULL and before invoking buf_pool_watch_unset(space,offset). +@return FALSE if the given page was not read in, TRUE if it was */ +UNIV_INTERN +ibool +buf_pool_watch_occurred( +/*====================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: page number */ +{ + ibool ret; + buf_page_t* bpage; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + ulint fold = buf_page_address_fold(space, offset); + prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, + fold); + + rw_lock_s_lock(hash_lock); + + /* The page must exist because buf_pool_watch_set() + increments buf_fix_count. */ + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + + ret = !buf_pool_watch_is_sentinel(buf_pool, bpage); + rw_lock_s_unlock(hash_lock); + + return(ret); +} + +/********************************************************************//** +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from slipping out of +the buffer pool. */ +UNIV_INTERN +void +buf_page_make_young( +/*================*/ + buf_page_t* bpage) /*!< in: buffer block of a file page */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + mutex_enter(&buf_pool->LRU_list_mutex); + + ut_a(buf_page_in_file(bpage)); + + buf_LRU_make_block_young(bpage); + + mutex_exit(&buf_pool->LRU_list_mutex); +} + +/********************************************************************//** +Moves a page to the start of the buffer pool LRU list if it is too old. +This high-level function can be used to prevent an important page from +slipping out of the buffer pool. */ +static +void +buf_page_make_young_if_needed( +/*==========================*/ + buf_page_t* bpage) /*!< in/out: buffer block of a + file page */ +{ + ut_a(buf_page_in_file(bpage)); + + if (buf_page_peek_if_too_old(bpage)) { + buf_page_make_young(bpage); + } +} + +/********************************************************************//** +Resets the check_index_page_at_flush field of a page if found in the buffer +pool. */ +UNIV_INTERN +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: page number */ +{ + buf_block_t* block; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + + block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset); + + if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) { + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page)); + block->check_index_page_at_flush = FALSE; + } +} + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG +/********************************************************************//** +Sets file_page_was_freed TRUE if the page is found in the buffer pool. 
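buf_pool_watch_unset() above is the release half of the same protocol: drop one reference, and only when the count reaches zero does the sentinel actually leave the page hash. In sketch form, continuing the watch_slot_t example from the previous sketch:

// Drop one watch reference; recycle the sentinel when the last
// reference is gone (the analogue of buf_pool_watch_remove()).
static void watch_release(unsigned* fix_count, int* state)
{
        if (--*fix_count == 0) {
                *state = 0;     // SLOT_FREE
        }
}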
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+        ulint space, /*!< in: space id */
+        ulint offset) /*!< in: page number */
+{
+        buf_page_t* bpage;
+        buf_pool_t* buf_pool = buf_pool_get(space, offset);
+        prio_rw_lock_t* hash_lock;
+
+        bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+                                           &hash_lock);
+
+        if (bpage) {
+                ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+                ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+                mutex_enter(block_mutex);
+                rw_lock_s_unlock(hash_lock);
+                /* bpage->file_page_was_freed may already be TRUE
+                when this code is invoked from dict_drop_index_tree() */
+                bpage->file_page_was_freed = TRUE;
+                mutex_exit(block_mutex);
+        }
+
+        return(bpage);
+}
+
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+        ulint space, /*!< in: space id */
+        ulint offset) /*!< in: page number */
+{
+        buf_page_t* bpage;
+        buf_pool_t* buf_pool = buf_pool_get(space, offset);
+        prio_rw_lock_t* hash_lock;
+
+        bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+                                           &hash_lock);
+        if (bpage) {
+                ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+                ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+                mutex_enter(block_mutex);
+                rw_lock_s_unlock(hash_lock);
+                bpage->file_page_was_freed = FALSE;
+                mutex_exit(block_mutex);
+        }
+
+        return(bpage);
+}
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+/********************************************************************//**
+Attempts to discard the uncompressed frame of a compressed page. The
+caller should not be holding any mutexes when this function is called. */
+static
+void
+buf_block_try_discard_uncompressed(
+/*===============================*/
+        ulint space, /*!< in: space id */
+        ulint offset) /*!< in: page number */
+{
+        buf_page_t* bpage;
+        buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+        /* Since we need to acquire buf_pool->LRU_list_mutex to discard
+        the uncompressed frame, and since the page_hash mutex resides
+        below buf_pool->LRU_list_mutex in the latching order, we must
+        first release the page_hash mutex. This means that the block in
+        question can move out of page_hash, so we must check again
+        whether it is still there. */
+
+        mutex_enter(&buf_pool->LRU_list_mutex);
+
+        bpage = buf_page_hash_get(buf_pool, space, offset);
+
+        if (bpage) {
+
+                ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+                mutex_enter(block_mutex);
+
+                if (buf_LRU_free_page(bpage, false)) {
+
+                        /* On success buf_LRU_free_page() has already
+                        released buf_pool->LRU_list_mutex for us. */
+                        mutex_exit(block_mutex);
+                        return;
+                }
+                mutex_exit(block_mutex);
+        }
+
+        mutex_exit(&buf_pool->LRU_list_mutex);
+}
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch.
Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. +@return pointer to the block */ +UNIV_INTERN +buf_page_t* +buf_page_get_zip( +/*=============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size */ + ulint offset) /*!< in: page number */ +{ + buf_page_t* bpage; + ib_mutex_t* block_mutex; + prio_rw_lock_t* hash_lock; + ibool discard_attempted = FALSE; + ibool must_read; + trx_t* trx = NULL; + ulint sec; + ulint ms; + ib_uint64_t start_time; + ib_uint64_t finish_time; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + + if (UNIV_UNLIKELY(innobase_get_slow_log())) { + trx = innobase_get_trx(); + } + buf_pool->stat.n_page_gets++; + + for (;;) { +lookup: + + /* The following call will also grab the page_hash + mutex if the page is found. */ + bpage = buf_page_hash_get_s_locked(buf_pool, space, + offset, &hash_lock); + if (bpage) { + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); + break; + } + + /* Page not in buf_pool: needs to be read from file */ + + ut_ad(!hash_lock); + buf_read_page(space, zip_size, offset, trx); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + } + + ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage)); + + if (!bpage->zip.data) { + /* There is no compressed page. */ +err_exit: + rw_lock_s_unlock(hash_lock); + return(NULL); + } + + if (UNIV_UNLIKELY(bpage->is_corrupt && srv_pass_corrupt_table <= 1)) { + + rw_lock_s_unlock(hash_lock); + + return(NULL); + } + + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + block_mutex = &buf_pool->zip_mutex; + mutex_enter(block_mutex); +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32(&bpage->buf_fix_count, 1); +#else + ++bpage->buf_fix_count; +#endif /* PAGE_ATOMIC_REF_COUNT */ + goto got_block; + case BUF_BLOCK_FILE_PAGE: + /* Discard the uncompressed page frame if possible. 
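The lookup loop at the top of buf_page_get_zip() above is the basic cache discipline: probe the page hash, and on a miss issue a read and probe again, because the page may arrive or be evicted at any point in between. A toy single-threaded model (a direct-mapped array stands in for the page hash, and page numbers are assumed nonzero):

enum { CACHE_SLOTS = 8 };

static int cache[CACHE_SLOTS];          // 0 = empty, else the page number

static int* hash_lookup(int page_no)
{
        int* slot = &cache[page_no % CACHE_SLOTS];

        return (*slot == page_no) ? slot : NULL;
}

static void read_from_file(int page_no) // stand-in for buf_read_page()
{
        cache[page_no % CACHE_SLOTS] = page_no;
}

static int* get_page(int page_no)
{
        int* p;

        while ((p = hash_lookup(page_no)) == NULL) {
                read_from_file(page_no);        // then re-probe, as above
        }
        return p;
}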
*/ + if (!discard_attempted) { + rw_lock_s_unlock(hash_lock); + buf_block_try_discard_uncompressed(space, offset); + discard_attempted = TRUE; + goto lookup; + } + + block_mutex = &((buf_block_t*) bpage)->mutex; + + mutex_enter(block_mutex); + + buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__); + goto got_block; + } + + ut_error; + goto err_exit; + +got_block: + must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; + + rw_lock_s_unlock(hash_lock); +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + ut_a(!bpage->file_page_was_freed); +#endif /* defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG */ + + buf_page_set_accessed(bpage); + + mutex_exit(block_mutex); + + buf_page_make_young_if_needed(bpage); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(bpage->buf_fix_count > 0); + ut_a(buf_page_in_file(bpage)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + if (must_read) { + /* Let us wait until the read operation + completes */ + + if (UNIV_UNLIKELY(trx && trx->take_stats)) + { + ut_usectime(&sec, &ms); + start_time = (ib_uint64_t)sec * 1000000 + ms; + } else { + start_time = 0; + } + for (;;) { + enum buf_io_fix io_fix; + + mutex_enter(block_mutex); + io_fix = buf_page_get_io_fix(bpage); + mutex_exit(block_mutex); + + if (io_fix == BUF_IO_READ) { + + os_thread_sleep(WAIT_FOR_READ); + } else { + break; + } + } + if (UNIV_UNLIKELY(start_time != 0)) + { + ut_usectime(&sec, &ms); + finish_time = (ib_uint64_t)sec * 1000000 + ms; + trx->io_reads_wait_timer += (ulint)(finish_time - start_time); + } + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_page_get_space(bpage), + buf_page_get_page_no(bpage)) == 0); +#endif + return(bpage); +} + +/********************************************************************//** +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_block_init_low( +/*===============*/ + buf_block_t* block) /*!< in: block to init */ +{ + block->check_index_page_at_flush = FALSE; + block->index = NULL; + + block->n_hash_helps = 0; + block->n_fields = 1; + block->n_bytes = 0; + block->left_side = TRUE; +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Decompress a block. 
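The slow-log accounting in the wait loop above samples a microsecond clock before and after the wait and adds the difference to trx->io_reads_wait_timer; judging by the expression sec * 1000000 + ms, the "ms" output of ut_usectime() actually carries microseconds. The same sampling with POSIX gettimeofday():

#include <sys/time.h>

// Microsecond wall-clock sample, the value the original builds as
// (ib_uint64_t)sec * 1000000 + ms.
static unsigned long long now_usec(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        return (unsigned long long)tv.tv_sec * 1000000ULL
                + (unsigned long long)tv.tv_usec;
}

// usage: start = now_usec(); ... wait ...; timer += now_usec() - start;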
+@return TRUE if successful */ +UNIV_INTERN +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check) /*!< in: TRUE=verify the page checksum */ +{ + const byte* frame = block->page.zip.data; + ulint size = page_zip_get_size(&block->page.zip); + + ut_ad(buf_block_get_zip_size(block)); + ut_a(buf_block_get_space(block) != 0); + + if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: compressed page checksum mismatch" + " (space %u page %u): stored: %lu, crc32: %lu " + "innodb: %lu, none: %lu\n", + block->page.space, block->page.offset, + mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM), + page_zip_calc_checksum(frame, size, + SRV_CHECKSUM_ALGORITHM_CRC32), + page_zip_calc_checksum(frame, size, + SRV_CHECKSUM_ALGORITHM_INNODB), + page_zip_calc_checksum(frame, size, + SRV_CHECKSUM_ALGORITHM_NONE)); + return(FALSE); + } + + switch (fil_page_get_type(frame)) { + case FIL_PAGE_INDEX: + if (page_zip_decompress(&block->page.zip, + block->frame, TRUE)) { + return(TRUE); + } + + fprintf(stderr, + "InnoDB: unable to decompress space %lu page %lu\n", + (ulong) block->page.space, + (ulong) block->page.offset); + return(FALSE); + + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + /* Copy to uncompressed storage. */ + memcpy(block->frame, frame, + buf_block_get_zip_size(block)); + return(TRUE); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unknown compressed page" + " type %lu\n", + fil_page_get_type(frame)); + return(FALSE); +} + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Gets the block to whose frame the pointer is pointing to if found +in this buffer pool instance. +@return pointer to block */ +UNIV_INTERN +buf_block_t* +buf_block_align_instance( +/*=====================*/ + buf_pool_t* buf_pool, /*!< in: buffer in which the block + resides */ + const byte* ptr) /*!< in: pointer to a frame */ +{ + buf_chunk_t* chunk; + ulint i; + + /* TODO: protect buf_pool->chunks with a mutex (it will + currently remain constant after buf_pool_init()) */ + for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) { + ulint offs; + + if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) { + + continue; + } + /* else */ + + offs = ptr - chunk->blocks->frame; + + offs >>= UNIV_PAGE_SIZE_SHIFT; + + if (UNIV_LIKELY(offs < chunk->size)) { + buf_block_t* block = &chunk->blocks[offs]; + + /* The function buf_chunk_init() invokes + buf_block_init() so that block[n].frame == + block->frame + n * UNIV_PAGE_SIZE. Check it. */ + ut_ad(block->frame == page_align(ptr)); +#ifdef UNIV_DEBUG + /* A thread that updates these fields must + hold one of the buf_pool mutexes, depending on the + page state, and block->mutex. Acquire + only the latter. */ + mutex_enter(&block->mutex); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + /* These types should only be used in + the compressed buffer pool, whose + memory is allocated from + buf_pool->chunks, in UNIV_PAGE_SIZE + blocks flagged as BUF_BLOCK_MEMORY. */ + ut_error; + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + /* Some data structures contain + "guess" pointers to file pages. 
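The descriptor lookup in buf_block_align_instance() above is one subtraction and one shift: the page index of the frame inside its chunk. In isolation, with PAGE_SHIFT standing in for UNIV_PAGE_SIZE_SHIFT:

#include <stddef.h>

enum { PAGE_SHIFT = 14 };       // 16 KiB pages

// Index of the frame containing "ptr" relative to the chunk's first
// frame; the original then takes &chunk->blocks[frame_index(...)].
static size_t frame_index(const unsigned char* ptr,
                          const unsigned char* first_frame)
{
        return (size_t)(ptr - first_frame) >> PAGE_SHIFT;
}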
The + file pages may have been freed and + reused. Do not complain. */ + break; + case BUF_BLOCK_REMOVE_HASH: + /* buf_LRU_block_remove_hashed_page() + will overwrite the FIL_PAGE_OFFSET and + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with + 0xff and set the state to + BUF_BLOCK_REMOVE_HASH. */ + ut_ad(page_get_space_id(page_align(ptr)) + == 0xffffffff); + ut_ad(page_get_page_no(page_align(ptr)) + == 0xffffffff); + break; + case BUF_BLOCK_FILE_PAGE: + ut_ad(block->page.space + == page_get_space_id(page_align(ptr))); + ut_ad(block->page.offset + == page_get_page_no(page_align(ptr))); + break; + } + + mutex_exit(&block->mutex); +#endif /* UNIV_DEBUG */ + + return(block); + } + } + + return(NULL); +} + +/*******************************************************************//** +Gets the block to whose frame the pointer is pointing to. +@return pointer to block, never NULL */ +UNIV_INTERN +buf_block_t* +buf_block_align( +/*============*/ + const byte* ptr) /*!< in: pointer to a frame */ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_block_t* block; + + block = buf_block_align_instance( + buf_pool_from_array(i), ptr); + if (block) { + return(block); + } + } + + /* The block should always be found. */ + ut_error; + return(NULL); +} + +/********************************************************************//** +Find out if a pointer belongs to a buf_block_t. It can be a pointer to +the buf_block_t itself or a member of it. This functions checks one of +the buffer pool instances. +@return TRUE if ptr belongs to a buf_block_t struct */ +static +ibool +buf_pointer_is_block_field_instance( +/*================================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const void* ptr) /*!< in: pointer not dereferenced */ +{ + const buf_chunk_t* chunk = buf_pool->chunks; + const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks; + + /* TODO: protect buf_pool->chunks with a mutex (it will + currently remain constant after buf_pool_init()) */ + while (chunk < echunk) { + if (ptr >= (void*) chunk->blocks + && ptr < (void*) (chunk->blocks + chunk->size)) { + + return(TRUE); + } + + chunk++; + } + + return(FALSE); +} + +/********************************************************************//** +Find out if a pointer belongs to a buf_block_t. It can be a pointer to +the buf_block_t itself or a member of it +@return TRUE if ptr belongs to a buf_block_t struct */ +UNIV_INTERN +ibool +buf_pointer_is_block_field( +/*=======================*/ + const void* ptr) /*!< in: pointer not dereferenced */ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + ibool found; + + found = buf_pointer_is_block_field_instance( + buf_pool_from_array(i), ptr); + if (found) { + return(TRUE); + } + } + + return(FALSE); +} + +/********************************************************************//** +Find out if a buffer block was created by buf_chunk_init(). +@return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */ +static +ibool +buf_block_is_uncompressed( +/*======================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const buf_block_t* block) /*!< in: pointer to block, + not dereferenced */ +{ + if ((((ulint) block) % sizeof *block) != 0) { + /* The pointer should be aligned. */ + return(FALSE); + } + + return(buf_pointer_is_block_field_instance(buf_pool, (void*) block)); +} + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/********************************************************************//** +Return true if probe is enabled. 
+@return true if probe enabled. */
+static
+bool
+buf_debug_execute_is_force_flush()
+/*==============================*/
+{
+        DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
+
+        /* This is used during quiesce testing; we want to ensure
+        maximum buffering by the change buffer. */
+
+        if (srv_ibuf_disable_background_merge) {
+                return(true);
+        }
+
+        return(false);
+}
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/**
+Wait for the block to be read in.
+@param block The block to check
+@param trx Transaction to account the I/Os to */
+static
+void
+buf_wait_for_read(buf_block_t* block, trx_t* trx)
+{
+        /* Note: For the PAGE_ATOMIC_REF_COUNT case:
+
+        We are using the block->lock to check for IO state (and a dirty read).
+        We set the IO_READ state under the protection of the hash_lock
+        (and block->mutex). This is safe because another thread can only
+        access the block (and check for IO state) after the block has been
+        added to the page hashtable. */
+
+        if (buf_block_get_io_fix_unlocked(block) == BUF_IO_READ) {
+
+                ib_uint64_t start_time;
+                ulint sec;
+                ulint ms;
+
+                /* Wait until the read operation completes */
+
+                ib_mutex_t* mutex = buf_page_get_mutex(&block->page);
+
+                if (UNIV_UNLIKELY(trx && trx->take_stats))
+                {
+                        ut_usectime(&sec, &ms);
+                        start_time = (ib_uint64_t)sec * 1000000 + ms;
+                } else {
+                        start_time = 0;
+                }
+
+                for (;;) {
+                        buf_io_fix io_fix;
+
+                        mutex_enter(mutex);
+
+                        io_fix = buf_block_get_io_fix(block);
+
+                        mutex_exit(mutex);
+
+                        if (io_fix == BUF_IO_READ) {
+                                /* Wait by temporarily taking
+                                an s-latch */
+                                rw_lock_s_lock(&block->lock);
+                                rw_lock_s_unlock(&block->lock);
+                        } else {
+                                break;
+                        }
+                }
+
+                if (UNIV_UNLIKELY(start_time != 0))
+                {
+                        ut_usectime(&sec, &ms);
+                        ib_uint64_t finish_time
+                                = (ib_uint64_t)sec * 1000000 + ms;
+                        trx->io_reads_wait_timer
+                                += (ulint)(finish_time - start_time);
+                }
+
+        }
+}
+
+/********************************************************************//**
+This is the general function used to get access to a database page.
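The "wait by temporarily taking an s-latch" trick in buf_wait_for_read() above works because the I/O path holds the frame's rw-lock in exclusive mode for the whole read, so a momentary shared acquisition blocks exactly until the read finishes. The same idea with POSIX primitives:

#include <pthread.h>

// Park until an in-flight read completes: the reading thread holds
// frame_lock in write (X) mode for the duration of the I/O.
static void wait_for_read_sketch(pthread_rwlock_t* frame_lock)
{
        pthread_rwlock_rdlock(frame_lock);
        pthread_rwlock_unlock(frame_lock);
}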
+@return pointer to the block or NULL */ +UNIV_INTERN +buf_block_t* +buf_page_get_gen( +/*=============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint offset, /*!< in: page number */ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ + buf_block_t* guess, /*!< in: guessed block or NULL */ + ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or + BUF_GET_IF_IN_POOL_OR_WATCH */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + buf_block_t* block; + ulint fold; + unsigned access_time; + ulint fix_type; + prio_rw_lock_t* hash_lock; + ulint retries = 0; + trx_t* trx = NULL; + buf_block_t* fix_block; + ib_mutex_t* fix_mutex = NULL; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_NO_LATCH)); +#ifdef UNIV_DEBUG + switch (mode) { + case BUF_GET_NO_LATCH: + ut_ad(rw_latch == RW_NO_LATCH); + break; + case BUF_GET: + case BUF_GET_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: + case BUF_GET_IF_IN_POOL_OR_WATCH: + case BUF_GET_POSSIBLY_FREED: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + ut_ad(zip_size == fil_space_get_zip_size(space)); + ut_ad(ut_is_2pow(zip_size)); +#ifndef UNIV_LOG_DEBUG + ut_ad(!ibuf_inside(mtr) + || ibuf_page_low(space, zip_size, offset, + FALSE, file, line, NULL)); +#endif + if (UNIV_UNLIKELY(innobase_get_slow_log())) { + trx = innobase_get_trx(); + } + buf_pool->stat.n_page_gets++; + fold = buf_page_address_fold(space, offset); + hash_lock = buf_page_hash_lock_get(buf_pool, fold); +loop: + block = guess; + + rw_lock_s_lock(hash_lock); + + if (block != NULL) { + + /* If the guess is a compressed page descriptor that + has been allocated by buf_page_alloc_descriptor(), + it may have been freed by buf_relocate(). */ + + if (!buf_block_is_uncompressed(buf_pool, block) + || offset != block->page.offset + || space != block->page.space + || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + + /* Our guess was bogus or things have changed + since. */ + block = guess = NULL; + } else { + ut_ad(!block->page.in_zip_hash); + } + } + + if (block == NULL) { + block = (buf_block_t*) buf_page_hash_get_low( + buf_pool, space, offset, fold); + } + + if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) { + rw_lock_s_unlock(hash_lock); + block = NULL; + } + + if (block == NULL) { + /* Page not in buf_pool: needs to be read from file */ + + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(hash_lock); + block = (buf_block_t*) buf_pool_watch_set( + space, offset, fold); + mutex_exit(&buf_pool->LRU_list_mutex); + + if (UNIV_LIKELY_NULL(block)) { + /* We can release hash_lock after we + increment the fix count to make + sure that no state change takes place. 
*/ + fix_block = block; + buf_block_fix(fix_block); + + /* Now safe to release page_hash mutex */ + rw_lock_x_unlock(hash_lock); + goto got_block; + } + + rw_lock_x_unlock(hash_lock); + } + + if (mode == BUF_GET_IF_IN_POOL + || mode == BUF_PEEK_IF_IN_POOL + || mode == BUF_GET_IF_IN_POOL_OR_WATCH) { +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)); + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + return(NULL); + } + + if (buf_read_page(space, zip_size, offset, trx)) { + buf_read_ahead_random(space, zip_size, offset, + ibuf_inside(mtr), trx); + + retries = 0; + } else if (retries < BUF_PAGE_READ_MAX_RETRIES) { + ++retries; + DBUG_EXECUTE_IF( + "innodb_page_corruption_retries", + retries = BUF_PAGE_READ_MAX_RETRIES; + ); + } else { + fprintf(stderr, "InnoDB: Error: Unable" + " to read tablespace %lu page no" + " %lu into the buffer pool after" + " %lu attempts\n" + "InnoDB: The most probable cause" + " of this error may be that the" + " table has been corrupted.\n" + "InnoDB: You can try to fix this" + " problem by using" + " innodb_force_recovery.\n" + "InnoDB: Please see reference manual" + " for more details.\n" + "InnoDB: Aborting...\n", + space, offset, + BUF_PAGE_READ_MAX_RETRIES); + + ut_error; + } + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + goto loop; + } else { + fix_block = block; + } + + buf_block_fix(fix_block); + + /* Now safe to release page_hash mutex */ + rw_lock_s_unlock(hash_lock); + +got_block: + + fix_mutex = buf_page_get_mutex(&fix_block->page); + + ut_ad(page_zip_get_size(&block->page.zip) == zip_size); + + if (mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL) { + + bool must_read; + + { + buf_page_t* fix_page = &fix_block->page; + + mutex_enter(fix_mutex); + + buf_io_fix io_fix = buf_page_get_io_fix(fix_page); + + must_read = (io_fix == BUF_IO_READ); + + mutex_exit(fix_mutex); + } + + if (must_read) { + /* The page is being read to buffer pool, + but we cannot wait around for the read to + complete. */ + buf_block_unfix(fix_block); + + return(NULL); + } + } + + if (UNIV_UNLIKELY(fix_block->page.is_corrupt && + srv_pass_corrupt_table <= 1)) { + + buf_block_unfix(fix_block); + + return(NULL); + } + + switch(buf_block_get_state(fix_block)) { + buf_page_t* bpage; + + case BUF_BLOCK_FILE_PAGE: + ut_ad(fix_mutex != &buf_pool->zip_mutex); + break; + + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + if (mode == BUF_PEEK_IF_IN_POOL) { + /* This mode is only used for dropping an + adaptive hash index. There cannot be an + adaptive hash index for a compressed-only + page, so do not bother decompressing the page. */ + buf_block_unfix(fix_block); + + return(NULL); + } + + bpage = &block->page; + ut_ad(fix_mutex == &buf_pool->zip_mutex); + + /* Note: We have already buffer fixed this block. */ + if (bpage->buf_fix_count > 1 + || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) { + + /* This condition often occurs when the buffer + is not buffer-fixed, but I/O-fixed by + buf_page_init_for_read(). */ + + buf_block_unfix(fix_block); + + /* The block is buffer-fixed or I/O-fixed. + Try again later. */ + os_thread_sleep(WAIT_FOR_READ); + + goto loop; + } + + /* Buffer-fix the block so that it cannot be evicted + or relocated while we are attempting to allocate an + uncompressed page. */ + + /* Allocate an uncompressed page. 
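The retry logic above caps failed reads at BUF_PAGE_READ_MAX_RETRIES before concluding the page is irrecoverably corrupted and aborting. Stripped of the diagnostics, the control flow is a bounded retry loop (read_page is a hypothetical stand-in for buf_read_page()):

// Returns 1 once a read succeeds, 0 after the retry budget is spent
// (the original raises ut_error at that point).
static int read_with_retries(int (*read_page)(void), int max_retries)
{
        int retries;

        for (retries = 0; retries < max_retries; retries++) {
                if (read_page()) {
                        return 1;       // caller restarts its lookup loop
                }
        }
        return 0;
}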
*/ + + block = buf_LRU_get_free_block(buf_pool); + + mutex_enter(&buf_pool->LRU_list_mutex); + + rw_lock_x_lock(hash_lock); + + /* Buffer-fixing prevents the page_hash from changing. */ + ut_ad(bpage == buf_page_hash_get_low( + buf_pool, space, offset, fold)); + + buf_block_mutex_enter(block); + + mutex_enter(&buf_pool->zip_mutex); + + ut_ad(fix_block->page.buf_fix_count > 0); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_decrement_uint32(&fix_block->page.buf_fix_count, 1); +#else + --fix_block->page.buf_fix_count; +#endif /* PAGE_ATOMIC_REF_COUNT */ + + fix_block = block; + + if (bpage->buf_fix_count > 0 + || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + + mutex_exit(&buf_pool->zip_mutex); + /* The block was buffer-fixed or I/O-fixed while + buf_pool->mutex was not held by this thread. + Free the block that was allocated and retry. + This should be extremely unlikely, for example, + if buf_page_get_zip() was invoked. */ + + buf_LRU_block_free_non_file_page(block); + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(hash_lock); + buf_block_mutex_exit(block); + + /* Try again */ + goto loop; + } + + /* Move the compressed page from bpage to block, + and uncompress it. */ + + /* Note: this is the uncompressed block and it is not + accessible by other threads yet because it is not in + any list or hash table */ + buf_relocate(bpage, &block->page); + + buf_block_init_low(block); + + /* Set after relocate(). */ + block->page.buf_fix_count = 1; + + block->lock_hash_val = lock_rec_hash(space, offset); + + UNIV_MEM_DESC(&block->page.zip.data, + page_zip_get_size(&block->page.zip)); + + if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) { +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + UT_LIST_REMOVE(list, buf_pool->zip_clean, + &block->page); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_ad(!block->page.in_flush_list); + } else { + /* Relocate buf_pool->flush_list. */ + buf_flush_relocate_on_flush_list(bpage, &block->page); + } + + /* Buffer-fix, I/O-fix, and X-latch the block + for the duration of the decompression. + Also add the block to the unzip_LRU list. */ + block->page.state = BUF_BLOCK_FILE_PAGE; + + /* Insert at the front of unzip_LRU list */ + buf_unzip_LRU_add_block(block, FALSE); + + mutex_exit(&buf_pool->LRU_list_mutex); + + buf_block_set_io_fix(block, BUF_IO_READ); + rw_lock_x_lock_inline(&block->lock, 0, file, line); + + UNIV_MEM_INVALID(bpage, sizeof *bpage); + + rw_lock_x_unlock(hash_lock); + + os_atomic_increment_ulint(&buf_pool->n_pend_unzip, 1); + + mutex_exit(&buf_pool->zip_mutex); + + access_time = buf_page_is_accessed(&block->page); + + buf_block_mutex_exit(block); + + buf_page_free_descriptor(bpage); + + /* Decompress the page while not holding + any buf_pool or block->mutex. */ + + /* Page checksum verification is already done when + the page is read from disk. Hence page checksum + verification is not necessary when decompressing the page. */ + { + bool success = buf_zip_decompress(block, FALSE); + ut_a(success); + } + + if (!recv_no_ibuf_operations) { + if (access_time) { +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(space, offset) == 0); +#endif /* UNIV_IBUF_COUNT_DEBUG */ + } else { + ibuf_merge_or_delete_for_page( + block, space, offset, zip_size, TRUE); + } + } + + /* Unfix and unlatch the block. 
*/ + buf_block_mutex_enter(fix_block); + + buf_block_set_io_fix(fix_block, BUF_IO_NONE); + + buf_block_mutex_exit(fix_block); + + os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1); + + rw_lock_x_unlock(&block->lock); + + break; + + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + + ut_ad(block == fix_block); + ut_ad(fix_block->page.buf_fix_count > 0); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)); + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + + if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH) + && (ibuf_debug || buf_debug_execute_is_force_flush())) { + + /* Try to evict the block from the buffer pool, to use the + insert buffer (change buffer) as much as possible. */ + + mutex_enter(&buf_pool->LRU_list_mutex); + + buf_block_unfix(fix_block); + + /* Now we are only holding the buf_pool->LRU_list_mutex, + not block->mutex or hash_lock. Blocks cannot be + relocated or enter or exit the buf_pool while we + are holding the buf_pool->LRU_list_mutex. */ + + fix_mutex = buf_page_get_mutex(&fix_block->page); + mutex_enter(fix_mutex); + + if (buf_LRU_free_page(&fix_block->page, true)) { + + mutex_exit(fix_mutex); + + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(hash_lock); + + /* Set the watch, as it would have + been set if the page were not in the + buffer pool in the first place. */ + block = (buf_block_t*) buf_pool_watch_set( + space, offset, fold); + mutex_exit(&buf_pool->LRU_list_mutex); + } else { + rw_lock_x_lock(hash_lock); + block = (buf_block_t*) buf_page_hash_get_low( + buf_pool, space, offset, fold); + } + + rw_lock_x_unlock(hash_lock); + + if (block != NULL) { + /* Either the page has been read in or + a watch was set on that in the window + where we released the buf_pool::mutex + and before we acquire the hash_lock + above. Try again. */ + guess = block; + goto loop; + } + + fprintf(stderr, + "innodb_change_buffering_debug evict %u %u\n", + (unsigned) space, (unsigned) offset); + return(NULL); + } + + if (buf_flush_page_try(buf_pool, fix_block)) { + fprintf(stderr, + "innodb_change_buffering_debug flush %u %u\n", + (unsigned) space, (unsigned) offset); + guess = fix_block; + goto loop; + } + + mutex_exit(&buf_pool->LRU_list_mutex); + + buf_block_mutex_exit(fix_block); + + buf_block_fix(fix_block); + + /* Failed to evict the page; change it directly */ + } +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + + ut_ad(fix_block->page.buf_fix_count > 0); + +#ifdef UNIV_SYNC_DEBUG + /* We have already buffer fixed the page, and we are committed to + returning this page to the caller. Register for debugging. */ + { + ibool ret; + ret = rw_lock_s_lock_nowait(&fix_block->debug_latch, file, line); + ut_a(ret); + } +#endif /* UNIV_SYNC_DEBUG */ + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + ut_a(mode == BUF_GET_POSSIBLY_FREED + || !fix_block->page.file_page_was_freed); +#endif + /* Check if this is the first access to the page */ + access_time = buf_page_is_accessed(&fix_block->page); + + /* This is a heuristic and we don't care about ordering issues. 
*/ + if (access_time == 0) { + buf_block_mutex_enter(fix_block); + + buf_page_set_accessed(&fix_block->page); + + buf_block_mutex_exit(fix_block); + } + + if (mode != BUF_PEEK_IF_IN_POOL) { + buf_page_make_young_if_needed(&fix_block->page); + } + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(fix_block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#ifdef PAGE_ATOMIC_REF_COUNT + /* We have to wait here because the IO_READ state was set + under the protection of the hash_lock and the block->mutex + but not the block->lock. */ + buf_wait_for_read(fix_block, trx); +#endif /* PAGE_ATOMIC_REF_COUNT */ + + switch (rw_latch) { + case RW_NO_LATCH: + +#ifndef PAGE_ATOMIC_REF_COUNT + buf_wait_for_read(fix_block, trx); +#endif /* !PAGE_ATOMIC_REF_COUNT */ + + fix_type = MTR_MEMO_BUF_FIX; + break; + + case RW_S_LATCH: + rw_lock_s_lock_inline(&fix_block->lock, 0, file, line); + + fix_type = MTR_MEMO_PAGE_S_FIX; + break; + + default: + ut_ad(rw_latch == RW_X_LATCH); + rw_lock_x_lock_inline(&fix_block->lock, 0, file, line); + + fix_type = MTR_MEMO_PAGE_X_FIX; + break; + } + + mtr_memo_push(mtr, fix_block, fix_type); + + if (mode != BUF_PEEK_IF_IN_POOL && !access_time) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear( + space, zip_size, offset, ibuf_inside(mtr), trx); + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(fix_block), + buf_block_get_page_no(fix_block)) == 0); +#endif +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)); + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (UNIV_UNLIKELY(trx && trx->take_stats)) { + _increment_page_get_statistics(block, trx); + } + + return(fix_block); +} + +/********************************************************************//** +This is the general function used to get optimistic access to a database +page. 
+@return TRUE if success */ +UNIV_INTERN +ibool +buf_page_optimistic_get( +/*====================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: guessed buffer block */ + ib_uint64_t modify_clock,/*!< in: modify clock value */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + buf_pool_t* buf_pool; + unsigned access_time; + ibool success; + ulint fix_type; + trx_t* trx = NULL; + + ut_ad(block); + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + mutex_enter(&block->mutex); + + if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) { + + mutex_exit(&block->mutex); + + return(FALSE); + } + + buf_block_buf_fix_inc(block, file, line); + + access_time = buf_page_is_accessed(&block->page); + + buf_page_set_accessed(&block->page); + + mutex_exit(&block->mutex); + + buf_page_make_young_if_needed(&block->page); + + ut_ad(!ibuf_inside(mtr) + || ibuf_page(buf_block_get_space(block), + buf_block_get_zip_size(block), + buf_block_get_page_no(block), NULL)); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait_inline(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (UNIV_UNLIKELY(!success)) { + buf_block_buf_fix_dec(block); + + return(FALSE); + } + + if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) { + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&(block->lock)); + } else { + rw_lock_x_unlock(&(block->lock)); + } + + buf_block_buf_fix_dec(block); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + mutex_enter(&block->mutex); + ut_a(!block->page.file_page_was_freed); + mutex_exit(&block->mutex); +#endif + if (UNIV_UNLIKELY(innobase_get_slow_log())) { + trx = innobase_get_trx(); + } + + if (!access_time) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(buf_block_get_space(block), + buf_block_get_zip_size(block), + buf_block_get_page_no(block), + ibuf_inside(mtr), trx); + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0); +#endif + buf_pool = buf_pool_from_block(block); + buf_pool->stat.n_page_gets++; + + if (UNIV_UNLIKELY(trx && trx->take_stats)) { + _increment_page_get_statistics(block, trx); + } + return(TRUE); +} + +/********************************************************************//** +This is used to get access to a known database page, when no waiting can be +done. For example, if a search in an adaptive hash index leads us to this +frame. 
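buf_page_optimistic_get() above validates a guessed block with a version counter: the caller remembers modify_clock, re-latches without waiting, and bails out if the clock has moved. The core of that protocol with POSIX locks and a plain counter (all names are illustrative):

#include <pthread.h>

typedef struct {
        pthread_rwlock_t lock;
        unsigned long long modify_clock;        // bumped on every change
} guarded_t;

// Returns 1 with the S-latch held if the guess is still valid, else 0
// with no latch held (caller falls back to the normal hash lookup).
static int optimistic_get(guarded_t* g, unsigned long long seen_clock)
{
        if (pthread_rwlock_tryrdlock(&g->lock) != 0) {
                return 0;               // would have to wait: give up
        }
        if (g->modify_clock != seen_clock) {
                pthread_rwlock_unlock(&g->lock);
                return 0;               // stale guess
        }
        return 1;
}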
+@return TRUE if success */ +UNIV_INTERN +ibool +buf_page_get_known_nowait( +/*======================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: the known page */ + ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + buf_pool_t* buf_pool; + ibool success; + ulint fix_type; + trx_t* trx = NULL; + + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + mutex_enter(&block->mutex); + + if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) { + /* Another thread is just freeing the block from the LRU list + of the buffer pool: do not try to access this page; this + attempt to access the page can only come through the hash + index because when the buffer block state is ..._REMOVE_HASH, + we have already removed it from the page address hash table + of the buffer pool. */ + + mutex_exit(&block->mutex); + + return(FALSE); + } + + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + buf_block_buf_fix_inc(block, file, line); + + buf_page_set_accessed(&block->page); + + mutex_exit(&block->mutex); + + buf_pool = buf_pool_from_block(block); + + if (mode == BUF_MAKE_YOUNG) { + buf_page_make_young_if_needed(&block->page); + } + + ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait_inline(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + buf_block_buf_fix_dec(block); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + if (mode != BUF_KEEP_OLD) { + /* If mode == BUF_KEEP_OLD, we are executing an I/O + completion routine. Avoid a bogus assertion failure + when ibuf_merge_or_delete_for_page() is processing a + page that was just freed due to DROP INDEX, or + deleting a record from SYS_INDEXES. This check will be + skipped in recv_recover_page() as well. */ + + mutex_enter(&block->mutex); + ut_a(!block->page.file_page_was_freed); + mutex_exit(&block->mutex); + } +#endif + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a((mode == BUF_KEEP_OLD) + || (ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0)); +#endif + buf_pool->stat.n_page_gets++; + + if (UNIV_UNLIKELY(innobase_get_slow_log())) { + + trx = innobase_get_trx(); + if (trx != NULL && trx->take_stats) { + + _increment_page_get_statistics(block, trx); + } + } + + return(TRUE); +} + +/*******************************************************************//** +Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the lock_sys_t::mutex. 
+@return pointer to a page or NULL */ +UNIV_INTERN +const buf_block_t* +buf_page_try_get_func( +/*==================*/ + ulint space_id,/*!< in: tablespace id */ + ulint page_no,/*!< in: page number */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + buf_block_t* block; + ibool success; + ulint fix_type; + buf_pool_t* buf_pool = buf_pool_get(space_id, page_no); + prio_rw_lock_t* hash_lock; + + ut_ad(mtr); + ut_ad(mtr->state == MTR_ACTIVE); + + block = buf_block_hash_get_s_locked(buf_pool, space_id, + page_no, &hash_lock); + + if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + if (block) { + rw_lock_s_unlock(hash_lock); + } + return(NULL); + } + + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page)); + + mutex_enter(&block->mutex); + rw_lock_s_unlock(hash_lock); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_a(buf_block_get_space(block) == space_id); + ut_a(buf_block_get_page_no(block) == page_no); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + buf_block_buf_fix_inc(block, file, line); + mutex_exit(&block->mutex); + + fix_type = MTR_MEMO_PAGE_S_FIX; + success = rw_lock_s_lock_nowait(&block->lock, file, line); + + if (!success) { + /* Let us try to get an X-latch. If the current thread + is holding an X-latch on the page, we cannot get an + S-latch. */ + + fix_type = MTR_MEMO_PAGE_X_FIX; + success = rw_lock_x_lock_func_nowait_inline(&block->lock, + file, line); + } + + if (!success) { + buf_block_buf_fix_dec(block); + + return(NULL); + } + + mtr_memo_push(mtr, block, fix_type); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + mutex_enter(&block->mutex); + ut_a(!block->page.file_page_was_freed); + mutex_exit(&block->mutex); +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + buf_pool->stat.n_page_gets++; + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0); +#endif + + return(block); +} + +/********************************************************************//** +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_page_init_low( +/*==============*/ + buf_page_t* bpage) /*!< in: block to init */ +{ + bpage->flush_type = BUF_FLUSH_LRU; + bpage->io_fix = BUF_IO_NONE; + bpage->buf_fix_count = 0; + bpage->freed_page_clock = 0; + bpage->access_time = 0; + bpage->newest_modification = 0; + bpage->oldest_modification = 0; + HASH_INVALIDATE(bpage, hash); + bpage->is_corrupt = FALSE; +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + bpage->file_page_was_freed = FALSE; +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ +} + +/********************************************************************//** +Inits a page to the buffer buf_pool. 
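buf_page_try_get_func() above tries a shared latch first and falls back to an exclusive attempt, since a thread that already holds the X-latch can re-acquire it (InnoDB rw-locks are recursive for the X holder) but can never get S. POSIX rwlocks are not recursive, so this sketch shows only the try-S-then-try-X shape:

#include <pthread.h>

// Try S first, then X; *got_x reports which mode succeeded.
// Returns 0, holding nothing, if neither latch is free.
static int latch_nowait(pthread_rwlock_t* l, int* got_x)
{
        if (pthread_rwlock_tryrdlock(l) == 0) {
                *got_x = 0;
                return 1;
        }
        if (pthread_rwlock_trywrlock(l) == 0) {
                *got_x = 1;
                return 1;
        }
        return 0;
}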
*/ +static __attribute__((nonnull)) +void +buf_page_init( +/*==========*/ + buf_pool_t* buf_pool,/*!< in/out: buffer pool */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space + in units of a page */ + ulint fold, /*!< in: buf_page_address_fold(space,offset) */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + buf_block_t* block) /*!< in/out: block to init */ +{ + buf_page_t* hash_page; + + ut_ad(buf_pool == buf_pool_get(space, offset)); + + ut_ad(mutex_own(&(block->mutex))); + ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold), + RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Set the state of the block */ + buf_block_set_file_page(block, space, offset); + +#ifdef UNIV_DEBUG_VALGRIND + if (!space) { + /* Silence valid Valgrind warnings about uninitialized + data being written to data files. There are some unused + bytes on some pages that InnoDB does not initialize. */ + UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + buf_block_init_low(block); + + block->lock_hash_val = lock_rec_hash(space, offset); + + buf_page_init_low(&block->page); + + /* Insert into the hash table of file pages */ + + hash_page = buf_page_hash_get_low(buf_pool, space, offset, fold); + + if (hash_page == NULL) { + /* Block not found in the hash table */ + } else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) { + + mutex_enter(&buf_pool->zip_mutex); + + ib_uint32_t buf_fix_count = hash_page->buf_fix_count; + + ut_a(buf_fix_count > 0); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32( + &block->page.buf_fix_count, buf_fix_count); +#else + block->page.buf_fix_count += ulint(buf_fix_count); +#endif /* PAGE_ATOMIC_REF_COUNT */ + + buf_pool_watch_remove(buf_pool, fold, hash_page); + + mutex_exit(&buf_pool->zip_mutex); + + } else { + fprintf(stderr, + "InnoDB: Error: page %lu %lu already found" + " in the hash table: %p, %p\n", + (ulong) space, + (ulong) offset, + (const void*) hash_page, (const void*) block); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + mutex_exit(&block->mutex); + buf_print(); + buf_LRU_print(); + buf_validate(); + buf_LRU_validate(); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_error; + } + + ut_ad(!block->page.in_zip_hash); + ut_ad(!block->page.in_page_hash); + ut_d(block->page.in_page_hash = TRUE); + + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, &block->page); + + if (zip_size) { + page_zip_set_size(&block->page.zip, zip_size); + } +} + +/********************************************************************//** +Function which inits a page for read to the buffer buf_pool. If the page is +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. +@return pointer to the block or NULL */ +UNIV_INTERN +buf_page_t* +buf_page_init_for_read( +/*===================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... 
*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + ibool unzip, /*!< in: TRUE=request uncompressed page */ + ib_int64_t tablespace_version, + /*!< in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset) /*!< in: page number */ +{ + buf_block_t* block; + buf_page_t* bpage = NULL; + buf_page_t* watch_page; + prio_rw_lock_t* hash_lock; + mtr_t mtr; + ulint fold; + ibool lru; + void* data; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + + ut_ad(buf_pool); + + *err = DB_SUCCESS; + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + /* It is a read-ahead within an ibuf routine */ + + ut_ad(!ibuf_bitmap_page(zip_size, offset)); + + ibuf_mtr_start(&mtr); + + if (!recv_no_ibuf_operations + && !ibuf_page(space, zip_size, offset, &mtr)) { + + ibuf_mtr_commit(&mtr); + + return(NULL); + } + } else { + ut_ad(mode == BUF_READ_ANY_PAGE); + } + + if (zip_size && !unzip && !recv_recovery_is_on()) { + block = NULL; + } else { + block = buf_LRU_get_free_block(buf_pool); + ut_ad(block); + ut_ad(buf_pool_from_block(block) == buf_pool); + } + + fold = buf_page_address_fold(space, offset); + hash_lock = buf_page_hash_lock_get(buf_pool, fold); + + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(hash_lock); + + watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold); + if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) { + /* The page is already in the buffer pool. */ + watch_page = NULL; +err_exit: + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(hash_lock); + if (block) { + mutex_enter(&block->mutex); + buf_LRU_block_free_non_file_page(block); + mutex_exit(&block->mutex); + } + + bpage = NULL; + goto func_exit; + } + + if (fil_tablespace_deleted_or_being_deleted_in_mem( + space, tablespace_version)) { + /* The page belongs to a space which has been + deleted or is being deleted. */ + *err = DB_TABLESPACE_DELETED; + + goto err_exit; + } + + if (block) { + bpage = &block->page; + + mutex_enter(&block->mutex); + + ut_ad(buf_pool_from_bpage(bpage) == buf_pool); + + buf_page_init(buf_pool, space, offset, fold, zip_size, block); + +#ifdef PAGE_ATOMIC_REF_COUNT + /* Note: We set the io state without the protection of + the block->lock. This is because other threads cannot + access this block unless it is in the hash table. */ + + buf_page_set_io_fix(bpage, BUF_IO_READ); +#endif /* PAGE_ATOMIC_REF_COUNT */ + + /* The block must be put to the LRU list, to the old blocks */ + buf_LRU_add_block(bpage, TRUE/* to old blocks */); + mutex_exit(&buf_pool->LRU_list_mutex); + + /* We set a pass-type x-lock on the frame because then + the same thread which called for the read operation + (and is running now at this point of code) can wait + for the read to complete by waiting for the x-lock on + the frame; if the x-lock were recursive, the same + thread would illegally get the x-lock before the page + read is completed. The x-lock is cleared by the + io-handler thread. */ + + rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); + +#ifndef PAGE_ATOMIC_REF_COUNT + buf_page_set_io_fix(bpage, BUF_IO_READ); +#endif /* !PAGE_ATOMIC_REF_COUNT */ + + rw_lock_x_unlock(hash_lock); + + if (zip_size) { + /* buf_pool->LRU_list_mutex may be released and + reacquired by buf_buddy_alloc(). Thus, we + must release block->mutex in order not to + break the latching order in the reacquisition + of buf_pool->LRU_list_mutex. 
We also must defer this + operation until after the block descriptor has + been added to buf_pool->LRU and + buf_pool->page_hash. */ + mutex_exit(&block->mutex); + mutex_enter(&buf_pool->LRU_list_mutex); + data = buf_buddy_alloc(buf_pool, zip_size, &lru); + mutex_enter(&block->mutex); + block->page.zip.data = (page_zip_t*) data; + + /* To maintain the invariant + block->in_unzip_LRU_list + == buf_page_belongs_to_unzip_LRU(&block->page) + we have to add this block to unzip_LRU + after block->page.zip.data is set. */ + ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); + buf_unzip_LRU_add_block(block, TRUE); + mutex_exit(&buf_pool->LRU_list_mutex); + } + + mutex_exit(&block->mutex); + } else { + rw_lock_x_unlock(hash_lock); + + /* The compressed page must be allocated before the + control block (bpage), in order to avoid the + invocation of buf_buddy_relocate_block() on + uninitialized data. */ + data = buf_buddy_alloc(buf_pool, zip_size, &lru); + + rw_lock_x_lock(hash_lock); + + /* We must check the page_hash again, as it may have been + modified. */ + + watch_page = buf_page_hash_get_low( + buf_pool, space, offset, fold); + + if (UNIV_UNLIKELY(watch_page + && !buf_pool_watch_is_sentinel(buf_pool, + watch_page))) { + + /* The block was added by some other thread. */ + mutex_exit(&buf_pool->LRU_list_mutex); + rw_lock_x_unlock(hash_lock); + watch_page = NULL; + buf_buddy_free(buf_pool, data, zip_size); + + bpage = NULL; + goto func_exit; + } + + bpage = buf_page_alloc_descriptor(); + + /* Initialize the buf_pool pointer. */ + bpage->buf_pool_index = buf_pool_index(buf_pool); + + page_zip_des_init(&bpage->zip); + page_zip_set_size(&bpage->zip, zip_size); + bpage->zip.data = (page_zip_t*) data; + + mutex_enter(&buf_pool->zip_mutex); + UNIV_MEM_DESC(bpage->zip.data, + page_zip_get_size(&bpage->zip)); + + buf_page_init_low(bpage); + + bpage->state = BUF_BLOCK_ZIP_PAGE; + bpage->space = static_cast<ib_uint32_t>(space); + bpage->offset = static_cast<ib_uint32_t>(offset); + +#ifdef UNIV_DEBUG + bpage->in_page_hash = FALSE; + bpage->in_zip_hash = FALSE; + bpage->in_flush_list = FALSE; + bpage->in_free_list = FALSE; + bpage->in_LRU_list = FALSE; +#endif /* UNIV_DEBUG */ + + ut_d(bpage->in_page_hash = TRUE); + + if (watch_page != NULL) { + + /* Preserve the reference count. */ + ib_uint32_t buf_fix_count; + + buf_fix_count = watch_page->buf_fix_count; + + ut_a(buf_fix_count > 0); + + ut_ad(buf_own_zip_mutex_for_page(bpage)); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32( + &bpage->buf_fix_count, buf_fix_count); +#else + bpage->buf_fix_count += buf_fix_count; +#endif /* PAGE_ATOMIC_REF_COUNT */ + + ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page)); + buf_pool_watch_remove(buf_pool, fold, watch_page); + } + + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, + bpage); + + rw_lock_x_unlock(hash_lock); + + /* The block must be put to the LRU list, to the old blocks. 
+ The zip_size is already set into the page zip */
+ buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+ mutex_exit(&buf_pool->zip_mutex);
+ }
+
+ os_atomic_increment_ulint(&buf_pool->n_pend_reads, 1);
+func_exit:
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+
+ ibuf_mtr_commit(&mtr);
+ }
+
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(!bpage || buf_page_in_file(bpage));
+ return(bpage);
+}
+
+/********************************************************************//**
+Initializes a page to the buffer buf_pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform a state transition on a block: NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@return pointer to the block, page buffer-fixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space in units of
+ a page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ buf_frame_t* frame;
+ buf_block_t* block;
+ ulint fold;
+ buf_block_t* free_block = NULL;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ prio_rw_lock_t* hash_lock;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(space || !zip_size);
+
+ free_block = buf_LRU_get_free_block(buf_pool);
+
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ rw_lock_x_lock(hash_lock);
+
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+
+ if (block
+ && buf_page_in_file(&block->page)
+ && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(space, offset) == 0);
+#endif
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+ /* Page can be found in buf_pool */
+ rw_lock_x_unlock(hash_lock);
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ buf_block_free(free_block);
+
+ return(buf_page_get_with_no_latch(space, zip_size, offset, mtr));
+ }
+
+ /* If we get here, the page was not in buf_pool: init it there */
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Creating space %lu page %lu to buffer\n",
+ (ulong) space, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ block = free_block;
+
+ mutex_enter(&block->mutex);
+
+ buf_page_init(buf_pool, space, offset, fold, zip_size, block);
+
+ rw_lock_x_unlock(hash_lock);
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(&block->page, FALSE);
+
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ buf_pool->stat.n_pages_created++;
+
+ if (zip_size) {
+ void* data;
+ ibool lru;
+
+ /* Prevent race conditions during buf_buddy_alloc(),
+ which may release and reacquire buf_pool->LRU_list_mutex,
+ by IO-fixing and X-latching the block. */
+
+ buf_page_set_io_fix(&block->page, BUF_IO_READ);
+ rw_lock_x_lock(&block->lock);
+
+ mutex_exit(&block->mutex);
+ /* buf_pool->LRU_list_mutex may be released and reacquired by
+ buf_buddy_alloc(). 
Thus, we must release block->mutex + in order not to break the latching order in + the reacquisition of buf_pool->LRU_list_mutex. We also must + defer this operation until after the block descriptor + has been added to buf_pool->LRU and buf_pool->page_hash. */ + data = buf_buddy_alloc(buf_pool, zip_size, &lru); + mutex_enter(&block->mutex); + block->page.zip.data = (page_zip_t*) data; + + /* To maintain the invariant + block->in_unzip_LRU_list + == buf_page_belongs_to_unzip_LRU(&block->page) + we have to add this block to unzip_LRU after + block->page.zip.data is set. */ + ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); + buf_unzip_LRU_add_block(block, FALSE); + + buf_page_set_io_fix(&block->page, BUF_IO_NONE); + rw_lock_x_unlock(&block->lock); + } + + mutex_exit(&buf_pool->LRU_list_mutex); + + mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); + + buf_page_set_accessed(&block->page); + + mutex_exit(&block->mutex); + + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + + ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE); + + frame = block->frame; + + memset(frame + FIL_PAGE_PREV, 0xff, 4); + memset(frame + FIL_PAGE_NEXT, 0xff, 4); + mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); + + /* Reset to zero the file flush lsn field in the page; if the first + page of an ibdata file is 'created' in this function into the buffer + pool then we lose the original contents of the file flush lsn stamp. + Then InnoDB could in a crash recovery print a big, false, corruption + warning if the stamp contains an lsn bigger than the ib_logfile lsn. */ + + memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0); +#endif + return(block); +} + +/********************************************************************//** +Monitor the buffer page read/write activity, and increment corresponding +counter value if MONITOR_MODULE_BUF_PAGE (module_buf_page) module is +enabled. */ +static +void +buf_page_monitor( +/*=============*/ + const buf_page_t* bpage, /*!< in: pointer to the block */ + enum buf_io_fix io_type)/*!< in: io_fix types */ +{ + const byte* frame; + monitor_id_t counter; + + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + /* If the counter module is not turned on, just return */ + if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) { + return; + } + + ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); + + frame = bpage->zip.data + ? 
bpage->zip.data + : ((buf_block_t*) bpage)->frame; + + switch (fil_page_get_type(frame)) { + ulint level; + + case FIL_PAGE_INDEX: + level = btr_page_get_level_low(frame); + + /* Check if it is an index page for insert buffer */ + if (btr_page_get_index_id(frame) + == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + io_type, MONITOR_INDEX_IBUF_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + io_type, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE); + } + } else { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + io_type, MONITOR_INDEX_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + io_type, MONITOR_INDEX_NON_LEAF_PAGE); + } + } + break; + + case FIL_PAGE_UNDO_LOG: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE); + break; + + case FIL_PAGE_INODE: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE); + break; + + case FIL_PAGE_IBUF_FREE_LIST: + counter = MONITOR_RW_COUNTER(io_type, + MONITOR_IBUF_FREELIST_PAGE); + break; + + case FIL_PAGE_IBUF_BITMAP: + counter = MONITOR_RW_COUNTER(io_type, + MONITOR_IBUF_BITMAP_PAGE); + break; + + case FIL_PAGE_TYPE_SYS: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_TRX_SYS: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_FSP_HDR: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE); + break; + + case FIL_PAGE_TYPE_XDES: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE); + break; + + case FIL_PAGE_TYPE_BLOB: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB2: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE); + break; + + default: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE); + } + + MONITOR_INC_NOCHECK(counter); +} + +/********************************************************************//** +Mark a table with the specified space pointed by bpage->space corrupted. +Also remove the bpage from LRU list. 
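+The page must be I/O-fixed for reading when this is called; on return
+the pending-read counter of the owning buffer pool has been decremented.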
+@return TRUE if successful */
+static
+ibool
+buf_mark_space_corrupt(
+/*===================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ibool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+ ulint space = bpage->space;
+ ibool ret = TRUE;
+ const ulint fold = buf_page_address_fold(bpage->space,
+ bpage->offset);
+ prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ /* First unfix and release lock on the bpage */
+ mutex_enter(&buf_pool->LRU_list_mutex);
+ rw_lock_x_lock(hash_lock);
+ mutex_enter(buf_page_get_mutex(bpage));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
+ ut_ad(bpage->buf_fix_count == 0);
+
+ /* Set BUF_IO_NONE before we remove the block from LRU list */
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+ if (uncompressed) {
+ rw_lock_x_unlock_gen(
+ &((buf_block_t*) bpage)->lock,
+ BUF_IO_READ);
+ }
+
+ /* Find the table with specified space id, and mark it corrupted */
+ if (dict_set_corrupted_by_space(space)) {
+ buf_LRU_free_one_page(bpage);
+ } else {
+ mutex_exit(buf_page_get_mutex(bpage));
+ ret = FALSE;
+ }
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool.
+@return true if successful */
+UNIV_INTERN
+bool
+buf_page_io_complete(
+/*=================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ enum buf_io_fix io_type;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ibool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+ bool have_LRU_mutex = false;
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* We do not need to protect io_fix here with a mutex in order to
+ read it, because this is the only function where we can change the
+ value from BUF_IO_READ or BUF_IO_WRITE to some other value, and our
+ code ensures that this is the only thread that handles the i/o for
+ this block. */
+
+ io_type = buf_page_get_io_fix_unlocked(bpage);
+ ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ if (io_type == BUF_IO_READ) {
+ ulint read_page_no;
+ ulint read_space_id;
+ byte* frame;
+
+ if (buf_page_get_zip_size(bpage)) {
+ frame = bpage->zip.data;
+ os_atomic_increment_ulint(&buf_pool->n_pend_unzip, 1);
+ if (uncompressed
+ && !buf_zip_decompress((buf_block_t*) bpage,
+ FALSE)) {
+
+ os_atomic_decrement_ulint(
+ &buf_pool->n_pend_unzip, 1);
+ goto corrupt;
+ }
+ os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1);
+ } else {
+ ut_a(uncompressed);
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+ /* If this page is not uninitialized and not in the
+ doublewrite buffer, then the page number and space id
+ should be the same as in the block. */
+ read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
+ read_space_id = mach_read_from_4(
+ frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if (bpage->space == TRX_SYS_SPACE
+ && buf_dblwr_page_inside(bpage->offset)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: reading page %lu\n"
+ "InnoDB: which is in the"
+ " doublewrite buffer!\n",
+ (ulong) bpage->offset);
+ } else if (!read_space_id && !read_page_no) {
+ /* This is likely an uninitialized page. 
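+ A page in a freshly extended data file is
+ zero-filled, so reading zero for both fields
+ is expected and is not reported as corruption.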
*/ + } else if ((bpage->space + && bpage->space != read_space_id) + || bpage->offset != read_page_no) { + /* We did not compare space_id to read_space_id + if bpage->space == 0, because the field on the + page may contain garbage in MySQL < 4.1.1, + which only supported bpage->space == 0. */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: space id and page n:o" + " stored in the page\n" + "InnoDB: read in are %lu:%lu," + " should be %lu:%lu!\n", + (ulong) read_space_id, (ulong) read_page_no, + (ulong) bpage->space, + (ulong) bpage->offset); + } + + if (UNIV_LIKELY(!bpage->is_corrupt || + !srv_pass_corrupt_table)) { + /* From version 3.23.38 up we store the page checksum + to the 4 first bytes of the page end lsn field */ + + if (buf_page_is_corrupted(true, frame, + buf_page_get_zip_size(bpage))) { + + /* Not a real corruption if it was triggered by + error injection */ + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", + if (bpage->space > TRX_SYS_SPACE + && buf_mark_space_corrupt(bpage)) { + ib_logf(IB_LOG_LEVEL_INFO, + "Simulated page corruption"); + return(true); + } + goto page_not_corrupt; + ;); +corrupt: + fprintf(stderr, + "InnoDB: Database page corruption on disk" + " or a failed\n" + "InnoDB: file read of page %lu.\n" + "InnoDB: You may have to recover" + " from a backup.\n", + (ulong) bpage->offset); + buf_page_print(frame, buf_page_get_zip_size(bpage), + BUF_PAGE_PRINT_NO_CRASH); + fprintf(stderr, + "InnoDB: Database page corruption on disk" + " or a failed\n" + "InnoDB: file read of page %lu.\n" + "InnoDB: You may have to recover" + " from a backup.\n", + (ulong) bpage->offset); + fputs("InnoDB: It is also possible that" + " your operating\n" + "InnoDB: system has corrupted its" + " own file cache\n" + "InnoDB: and rebooting your computer" + " removes the\n" + "InnoDB: error.\n" + "InnoDB: If the corrupt page is an index page\n" + "InnoDB: you can also try to" + " fix the corruption\n" + "InnoDB: by dumping, dropping," + " and reimporting\n" + "InnoDB: the corrupt table." + " You can use CHECK\n" + "InnoDB: TABLE to scan your" + " table for corruption.\n" + "InnoDB: See also " + REFMAN "forcing-innodb-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); + + if (srv_pass_corrupt_table && bpage->space != 0 + && bpage->space < SRV_LOG_SPACE_FIRST_ID) { + trx_t* trx; + + fprintf(stderr, + "InnoDB: space %u will be treated as corrupt.\n", + bpage->space); + fil_space_set_corrupt(bpage->space); + + trx = innobase_get_trx(); + if (trx && trx->dict_operation_lock_mode == RW_X_LATCH) { + dict_table_set_corrupt_by_space(bpage->space, FALSE); + } else { + dict_table_set_corrupt_by_space(bpage->space, TRUE); + } + bpage->is_corrupt = TRUE; + } else + if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) { + /* If page space id is larger than TRX_SYS_SPACE + (0), we will attempt to mark the corresponding + table as corrupted instead of crashing server */ + if (bpage->space > TRX_SYS_SPACE + && buf_mark_space_corrupt(bpage)) { + return(false); + } else { + fputs("InnoDB: Ending processing" + " because of" + " a corrupt database page.\n", + stderr); + + ut_error; + } + } + } + } /**/ + + DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", + page_not_corrupt: bpage = bpage; ); + + if (recv_recovery_is_on()) { + /* Pages must be uncompressed for crash recovery. 
*/ + ut_a(uncompressed); + recv_recover_page(TRUE, (buf_block_t*) bpage); + } + + if (uncompressed && !recv_no_ibuf_operations) { + + buf_block_t* block; + ibool update_ibuf_bitmap; + + if (UNIV_UNLIKELY(bpage->is_corrupt && + srv_pass_corrupt_table)) { + + block = NULL; + update_ibuf_bitmap = FALSE; + + } else { + + block = (buf_block_t *) bpage; + update_ibuf_bitmap = TRUE; + } + + ibuf_merge_or_delete_for_page( + block, bpage->space, + bpage->offset, buf_page_get_zip_size(bpage), + update_ibuf_bitmap); + } + } + + if (io_type == BUF_IO_WRITE + && ( +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + /* to keep consistency at buf_LRU_insert_zip_clean() */ + buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY || +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU)) { + + have_LRU_mutex = true; /* optimistic */ + } +retry_mutex: + if (have_LRU_mutex) { + mutex_enter(&buf_pool->LRU_list_mutex); + } + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); + + if (UNIV_UNLIKELY(io_type == BUF_IO_WRITE + && ( +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY + || +#endif + buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) + && !have_LRU_mutex)) { + + mutex_exit(block_mutex); + have_LRU_mutex = true; + goto retry_mutex; + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + if (io_type == BUF_IO_WRITE || uncompressed) { + /* For BUF_IO_READ of compressed-only blocks, the + buffered operations will be merged by buf_page_get_gen() + after the block has been uncompressed. */ + ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0); + } +#endif + /* Because this thread which does the unlocking is not the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread + id. */ + + switch (io_type) { + case BUF_IO_READ: + + buf_page_set_io_fix(bpage, BUF_IO_NONE); + + /* NOTE that the call to ibuf may have moved the ownership of + the x-latch to this OS thread: do not let this confuse you in + debugging! */ + + ut_ad(buf_pool->n_pend_reads > 0); + os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1); + os_atomic_increment_ulint(&buf_pool->stat.n_pages_read, 1); + + ut_ad(!have_LRU_mutex); + + if (uncompressed) { + rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, + BUF_IO_READ); + } + + break; + + case BUF_IO_WRITE: + /* Write means a flush operation: call the completion + routine in the flush system */ + + buf_flush_write_complete(bpage); + + os_atomic_increment_ulint(&buf_pool->stat.n_pages_written, 1); + + if (have_LRU_mutex) { + mutex_exit(&buf_pool->LRU_list_mutex); + } + + if (uncompressed) { + rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock, + BUF_IO_WRITE); + } + + break; + + default: + ut_error; + } + + buf_page_monitor(bpage, io_type); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, "Has %s page space %lu page no %lu\n", + io_type == BUF_IO_READ ? "read" : "written", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif /* UNIV_DEBUG */ + + mutex_exit(block_mutex); + + return(true); +} + +/*********************************************************************//** +Asserts that all file pages in the buffer are in a replaceable state. 
+@return TRUE */
+static
+ibool
+buf_all_freed_instance(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ulint i;
+ buf_chunk_t* chunk;
+
+ ut_ad(buf_pool);
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ const buf_block_t* block = buf_chunk_not_freed(chunk);
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ if (UNIV_LIKELY_NULL(block)) {
+ fprintf(stderr,
+ "Page %lu %lu still fixed or dirty\n",
+ (ulong) block->page.space,
+ (ulong) block->page.offset);
+ ut_error;
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Invalidates file pages in one buffer pool instance */
+static
+void
+buf_pool_invalidate_instance(
+/*=========================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ulint i;
+
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+
+ mutex_enter(&buf_pool->flush_state_mutex);
+
+ for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+
+ /* As this function is called during startup and
+ during the redo application phase of recovery, InnoDB
+ is single threaded (apart from IO helper threads) at
+ this stage. No new write batch can be in the
+ initialization stage at this point. */
+ ut_ad(buf_pool->init_flush[i] == FALSE);
+
+ /* However, it is possible that a write batch that has
+ been posted earlier is still not complete. For buffer
+ pool invalidation to proceed we must ensure there is NO
+ write activity happening. */
+ if (buf_pool->n_flush[i] > 0) {
+ buf_flush_t type = static_cast<buf_flush_t>(i);
+
+ mutex_exit(&buf_pool->flush_state_mutex);
+ buf_flush_wait_batch_end(buf_pool, type);
+ mutex_enter(&buf_pool->flush_state_mutex);
+ }
+ }
+ mutex_exit(&buf_pool->flush_state_mutex);
+
+ ut_ad(buf_all_freed_instance(buf_pool));
+
+ while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
+ }
+
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
+ ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
+
+ buf_pool->freed_page_clock = 0;
+ buf_pool->LRU_old = NULL;
+ buf_pool->LRU_old_len = 0;
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
+ buf_refresh_io_stats(buf_pool);
+}
+
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. 
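+This simply invokes buf_pool_invalidate_instance() on every buffer pool
+instance in turn.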
*/ +UNIV_INTERN +void +buf_pool_invalidate(void) +/*=====================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_invalidate_instance(buf_pool_from_array(i)); + } +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/*********************************************************************//** +Validates data in one buffer pool instance +@return TRUE */ +static +ibool +buf_pool_validate_instance( +/*=======================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + buf_page_t* b; + buf_chunk_t* chunk; + ulint i; + ulint n_lru_flush = 0; + ulint n_page_flush = 0; + ulint n_list_flush = 0; + ulint n_lru = 0; + ulint n_flush = 0; + ulint n_free = 0; + ulint n_zip = 0; + ulint fold = 0; + ulint space = 0; + ulint offset = 0; + + ut_ad(buf_pool); + + mutex_enter(&buf_pool->LRU_list_mutex); + hash_lock_x_all(buf_pool->page_hash); + mutex_enter(&buf_pool->zip_mutex); + mutex_enter(&buf_pool->free_list_mutex); + mutex_enter(&buf_pool->flush_state_mutex); + + chunk = buf_pool->chunks; + + /* Check the uncompressed blocks. */ + + for (i = buf_pool->n_chunks; i--; chunk++) { + + ulint j; + buf_block_t* block = chunk->blocks; + + for (j = chunk->size; j--; block++) { + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + /* These should only occur on + zip_clean, zip_free[], or flush_list. */ + ut_error; + break; + + case BUF_BLOCK_FILE_PAGE: + + space = buf_block_get_space(block); + offset = buf_block_get_page_no(block); + fold = buf_page_address_fold(space, offset); + ut_a(buf_page_hash_get_low(buf_pool, + space, + offset, + fold) + == &block->page); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(buf_page_get_io_fix_unlocked(&block->page) + == BUF_IO_READ + || !ibuf_count_get(buf_block_get_space( + block), + buf_block_get_page_no( + block))); +#endif + switch (buf_page_get_io_fix_unlocked( + &block->page)) { + case BUF_IO_NONE: + break; + + case BUF_IO_WRITE: + switch (buf_page_get_flush_type( + &block->page)) { + case BUF_FLUSH_LRU: + case BUF_FLUSH_SINGLE_PAGE: + case BUF_FLUSH_LIST: + break; + default: + ut_error; + } + + break; + + case BUF_IO_READ: + + ut_a(rw_lock_is_locked(&block->lock, + RW_LOCK_EX)); + break; + + case BUF_IO_PIN: + break; + } + + n_lru++; + break; + + case BUF_BLOCK_NOT_USED: + n_free++; + break; + + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + /* do nothing */ + break; + } + } + } + + /* Check clean compressed-only blocks. */ + + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); + switch (buf_page_get_io_fix(b)) { + case BUF_IO_NONE: + case BUF_IO_PIN: + /* All clean blocks should be I/O-unfixed. */ + break; + case BUF_IO_READ: + /* In buf_LRU_free_page(), we temporarily set + b->io_fix = BUF_IO_READ for a newly allocated + control block in order to prevent + buf_page_get_gen() from decompressing the block. */ + break; + default: + ut_error; + break; + } + + /* It is OK to read oldest_modification here because + we have acquired buf_pool->zip_mutex above which acts + as the 'block->mutex' for these bpages. */ + ut_a(!b->oldest_modification); + fold = buf_page_address_fold(b->space, b->offset); + ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset, + fold) == b); + n_lru++; + n_zip++; + } + + /* Check dirty blocks. 
*/ + + buf_flush_list_mutex_enter(buf_pool); + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_flush_list); + ut_a(b->oldest_modification); + n_flush++; + + switch (buf_page_get_state(b)) { + case BUF_BLOCK_ZIP_DIRTY: + n_lru++; + n_zip++; + /* fallthrough */ + case BUF_BLOCK_FILE_PAGE: + switch (buf_page_get_io_fix_unlocked(b)) { + case BUF_IO_NONE: + case BUF_IO_READ: + case BUF_IO_PIN: + break; + case BUF_IO_WRITE: + switch (buf_page_get_flush_type(b)) { + case BUF_FLUSH_LRU: + n_lru_flush++; + break; + case BUF_FLUSH_SINGLE_PAGE: + n_page_flush++; + break; + case BUF_FLUSH_LIST: + n_list_flush++; + break; + default: + ut_error; + } + break; + default: + ut_error; + } + break; + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + fold = buf_page_address_fold(b->space, b->offset); + ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset, + fold) == b); + } + + ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); + + hash_unlock_x_all(buf_pool->page_hash); + buf_flush_list_mutex_exit(buf_pool); + + mutex_exit(&buf_pool->zip_mutex); + + if (n_lru + n_free > buf_pool->curr_size + n_zip) { + fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n", + (ulong) n_lru, (ulong) n_free, + (ulong) buf_pool->curr_size, (ulong) n_zip); + ut_error; + } + + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + + mutex_exit(&buf_pool->LRU_list_mutex); + + if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { + fprintf(stderr, "Free list len %lu, free blocks %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) n_free); + ut_error; + } + + mutex_exit(&buf_pool->free_list_mutex); + + ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush); + + mutex_exit(&buf_pool->flush_state_mutex); + + ut_a(buf_LRU_validate()); + ut_a(buf_flush_validate(buf_pool)); + + return(TRUE); +} + +/*********************************************************************//** +Validates the buffer buf_pool data structure. +@return TRUE */ +UNIV_INTERN +ibool +buf_validate(void) +/*==============*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_pool_validate_instance(buf_pool); + } + return(TRUE); +} + +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/*********************************************************************//** +Prints info of the buffer buf_pool data structure for one instance. 
*/ +static +void +buf_print_instance( +/*===============*/ + buf_pool_t* buf_pool) +{ + index_id_t* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + index_id_t id; + ulint n_found; + buf_chunk_t* chunk; + dict_index_t* index; + + ut_ad(buf_pool); + + size = buf_pool->curr_size; + + index_ids = static_cast<index_id_t*>( + mem_alloc(size * sizeof *index_ids)); + + counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size)); + + /* Dirty reads below */ + + fprintf(stderr, + "buf_pool size %lu\n" + "database pages %lu\n" + "free pages %lu\n" + "modified database pages %lu\n" + "n pending decompressions %lu\n" + "n pending reads %lu\n" + "n pending flush LRU %lu list %lu single page %lu\n" + "pages made young %lu, not young %lu\n" + "pages read %lu, created %lu, written %lu\n", + (ulong) size, + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), + (ulong) buf_pool->n_pend_unzip, + (ulong) buf_pool->n_pend_reads, + (ulong) buf_pool->n_flush[BUF_FLUSH_LRU], + (ulong) buf_pool->n_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE], + (ulong) buf_pool->stat.n_pages_made_young, + (ulong) buf_pool->stat.n_pages_not_made_young, + (ulong) buf_pool->stat.n_pages_read, + (ulong) buf_pool->stat.n_pages_created, + (ulong) buf_pool->stat.n_pages_written); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + mutex_enter(&buf_pool->LRU_list_mutex); + + chunk = buf_pool->chunks; + + for (i = buf_pool->n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + ulint n_blocks = chunk->size; + + for (; n_blocks--; block++) { + const buf_frame_t* frame = block->frame; + + if (fil_page_get_type(frame) == FIL_PAGE_INDEX) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (index_ids[j] == id) { + counts[j]++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + } + + mutex_exit(&buf_pool->LRU_list_mutex); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + fprintf(stderr, + "Block count for index %llu in buffer is about %lu", + (ullint) index_ids[i], + (ulong) counts[i]); + + if (index) { + putc(' ', stderr); + dict_index_name_print(stderr, NULL, index); + } + + putc('\n', stderr); + } + + mem_free(index_ids); + mem_free(counts); + + ut_a(buf_pool_validate_instance(buf_pool)); +} + +/*********************************************************************//** +Prints info of the buffer buf_pool data structure. */ +UNIV_INTERN +void +buf_print(void) +/*===========*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_print_instance(buf_pool); + } +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the number of latched pages in the buffer pool. 
+@return number of latched pages */ +UNIV_INTERN +ulint +buf_get_latched_pages_number_instance( +/*==================================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + buf_page_t* b; + ulint i; + buf_chunk_t* chunk; + ulint fixed_pages_number = 0; + + /* The LRU list mutex is enough to protect the required fields below */ + mutex_enter(&buf_pool->LRU_list_mutex); + + chunk = buf_pool->chunks; + + for (i = buf_pool->n_chunks; i--; chunk++) { + buf_block_t* block; + ulint j; + + block = chunk->blocks; + + for (j = chunk->size; j--; block++) { + if (buf_block_get_state(block) + != BUF_BLOCK_FILE_PAGE) { + + continue; + } + + if (block->page.buf_fix_count != 0 + || buf_page_get_io_fix_unlocked(&block->page) + != BUF_IO_NONE) { + fixed_pages_number++; + } + + } + } + + mutex_exit(&buf_pool->LRU_list_mutex); + + mutex_enter(&buf_pool->zip_mutex); + + /* Traverse the lists of clean and dirty compressed-only blocks. */ + + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); + ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE); + + if (b->buf_fix_count != 0 + || buf_page_get_io_fix(b) != BUF_IO_NONE) { + fixed_pages_number++; + } + } + + buf_flush_list_mutex_enter(buf_pool); + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_flush_list); + + switch (buf_page_get_state(b)) { + case BUF_BLOCK_ZIP_DIRTY: + if (b->buf_fix_count != 0 + || buf_page_get_io_fix(b) != BUF_IO_NONE) { + fixed_pages_number++; + } + break; + case BUF_BLOCK_FILE_PAGE: + /* uncompressed page */ + case BUF_BLOCK_REMOVE_HASH: + /* We hold flush list but not LRU list mutex here. + Thus encountering BUF_BLOCK_REMOVE_HASH pages is + possible. */ + break; + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + ut_error; + break; + } + } + + buf_flush_list_mutex_exit(buf_pool); + mutex_exit(&buf_pool->zip_mutex); + + return(fixed_pages_number); +} + +/*********************************************************************//** +Returns the number of latched pages in all the buffer pools. +@return number of latched pages */ +UNIV_INTERN +ulint +buf_get_latched_pages_number(void) +/*==============================*/ +{ + ulint i; + ulint total_latched_pages = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + total_latched_pages += buf_get_latched_pages_number_instance( + buf_pool); + } + + return(total_latched_pages); +} + +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Returns the number of pending buf pool read ios. +@return number of pending read I/O operations */ +UNIV_INTERN +ulint +buf_get_n_pending_read_ios(void) +/*============================*/ +{ + ulint i; + ulint pend_ios = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + pend_ios += buf_pool_from_array(i)->n_pend_reads; + } + + return(pend_ios); +} + +/*********************************************************************//** +Returns the ratio in percents of modified pages in the buffer pool / +database pages in the buffer pool. 
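+The value computed below is
+
+ ratio = (100 * flush_list_len) / (1 + lru_len + free_len)
+
+using integer arithmetic; for example, 200 dirty pages against 1000 pages
+on the LRU and free lists give (100 * 200) / (1 + 1000) = 19. The "1 +"
+term only guards against division by zero.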
+@return modified page percentage ratio */ +UNIV_INTERN +ulint +buf_get_modified_ratio_pct(void) +/*============================*/ +{ + ulint ratio; + ulint lru_len = 0; + ulint free_len = 0; + ulint flush_list_len = 0; + + buf_get_total_list_len(&lru_len, &free_len, &flush_list_len); + + ratio = (100 * flush_list_len) / (1 + lru_len + free_len); + + /* 1 + is there to avoid division by zero */ + + return(ratio); +} + +/*******************************************************************//** +Aggregates a pool stats information with the total buffer pool stats */ +static +void +buf_stats_aggregate_pool_info( +/*==========================*/ + buf_pool_info_t* total_info, /*!< in/out: the buffer pool + info to store aggregated + result */ + const buf_pool_info_t* pool_info) /*!< in: individual buffer pool + stats info */ +{ + ut_a(total_info && pool_info); + + /* Nothing to copy if total_info is the same as pool_info */ + if (total_info == pool_info) { + return; + } + + total_info->pool_size += pool_info->pool_size; + total_info->pool_size_bytes += pool_info->pool_size_bytes; + total_info->lru_len += pool_info->lru_len; + total_info->old_lru_len += pool_info->old_lru_len; + total_info->free_list_len += pool_info->free_list_len; + total_info->flush_list_len += pool_info->flush_list_len; + total_info->n_pend_unzip += pool_info->n_pend_unzip; + total_info->n_pend_reads += pool_info->n_pend_reads; + total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru; + total_info->n_pending_flush_list += pool_info->n_pending_flush_list; + total_info->n_pages_made_young += pool_info->n_pages_made_young; + total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young; + total_info->n_pages_read += pool_info->n_pages_read; + total_info->n_pages_created += pool_info->n_pages_created; + total_info->n_pages_written += pool_info->n_pages_written; + total_info->n_page_gets += pool_info->n_page_gets; + total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd; + total_info->n_ra_pages_read += pool_info->n_ra_pages_read; + total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted; + total_info->page_made_young_rate += pool_info->page_made_young_rate; + total_info->page_not_made_young_rate += + pool_info->page_not_made_young_rate; + total_info->pages_read_rate += pool_info->pages_read_rate; + total_info->pages_created_rate += pool_info->pages_created_rate; + total_info->pages_written_rate += pool_info->pages_written_rate; + total_info->n_page_get_delta += pool_info->n_page_get_delta; + total_info->page_read_delta += pool_info->page_read_delta; + total_info->young_making_delta += pool_info->young_making_delta; + total_info->not_young_making_delta += pool_info->not_young_making_delta; + total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate; + total_info->pages_readahead_rate += pool_info->pages_readahead_rate; + total_info->pages_evicted_rate += pool_info->pages_evicted_rate; + total_info->unzip_lru_len += pool_info->unzip_lru_len; + total_info->io_sum += pool_info->io_sum; + total_info->io_cur += pool_info->io_cur; + total_info->unzip_sum += pool_info->unzip_sum; + total_info->unzip_cur += pool_info->unzip_cur; +} +/*******************************************************************//** +Collect buffer pool stats information for a buffer pool. 
Also +record aggregated stats if there are more than one buffer pool +in the server */ +UNIV_INTERN +void +buf_stats_get_pool_info( +/*====================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool */ + ulint pool_id, /*!< in: buffer pool ID */ + buf_pool_info_t* all_pool_info) /*!< in/out: buffer pool info + to fill */ +{ + buf_pool_info_t* pool_info; + time_t current_time; + double time_elapsed; + + /* Find appropriate pool_info to store stats for this buffer pool */ + pool_info = &all_pool_info[pool_id]; + + pool_info->pool_unique_id = pool_id; + + pool_info->pool_size = buf_pool->curr_size; + + pool_info->pool_size_bytes = buf_pool->curr_pool_size; + + pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU); + + pool_info->old_lru_len = buf_pool->LRU_old_len; + + pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free); + + pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list); + + pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU); + + pool_info->n_pend_reads = buf_pool->n_pend_reads; + + mutex_enter(&buf_pool->flush_state_mutex); + + pool_info->n_pending_flush_lru = + (buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->init_flush[BUF_FLUSH_LRU]); + + pool_info->n_pending_flush_list = + (buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->init_flush[BUF_FLUSH_LIST]); + + pool_info->n_pending_flush_single_page = + (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] + + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]); + + mutex_exit(&buf_pool->flush_state_mutex); + + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, + buf_pool->last_printout_time); + + pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young; + + pool_info->n_pages_not_made_young = + buf_pool->stat.n_pages_not_made_young; + + pool_info->n_pages_read = buf_pool->stat.n_pages_read; + + pool_info->n_pages_created = buf_pool->stat.n_pages_created; + + pool_info->n_pages_written = buf_pool->stat.n_pages_written; + + pool_info->n_page_gets = buf_pool->stat.n_page_gets; + + pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd; + pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read; + + pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted; + + pool_info->page_made_young_rate = + (buf_pool->stat.n_pages_made_young + - buf_pool->old_stat.n_pages_made_young) / time_elapsed; + + pool_info->page_not_made_young_rate = + (buf_pool->stat.n_pages_not_made_young + - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed; + + pool_info->pages_read_rate = + (buf_pool->stat.n_pages_read + - buf_pool->old_stat.n_pages_read) / time_elapsed; + + pool_info->pages_created_rate = + (buf_pool->stat.n_pages_created + - buf_pool->old_stat.n_pages_created) / time_elapsed; + + pool_info->pages_written_rate = + (buf_pool->stat.n_pages_written + - buf_pool->old_stat.n_pages_written) / time_elapsed; + + pool_info->n_page_get_delta = buf_pool->stat.n_page_gets + - buf_pool->old_stat.n_page_gets; + + if (pool_info->n_page_get_delta) { + pool_info->page_read_delta = buf_pool->stat.n_pages_read + - buf_pool->old_stat.n_pages_read; + + pool_info->young_making_delta = + buf_pool->stat.n_pages_made_young + - buf_pool->old_stat.n_pages_made_young; + + pool_info->not_young_making_delta = + buf_pool->stat.n_pages_not_made_young + - buf_pool->old_stat.n_pages_not_made_young; + } + pool_info->pages_readahead_rnd_rate = + (buf_pool->stat.n_ra_pages_read_rnd + - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed; + + + pool_info->pages_readahead_rate = + (buf_pool->stat.n_ra_pages_read + 
- buf_pool->old_stat.n_ra_pages_read) / time_elapsed; + + pool_info->pages_evicted_rate = + (buf_pool->stat.n_ra_pages_evicted + - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed; + + pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); + + pool_info->io_sum = buf_LRU_stat_sum.io; + + pool_info->io_cur = buf_LRU_stat_cur.io; + + pool_info->unzip_sum = buf_LRU_stat_sum.unzip; + + pool_info->unzip_cur = buf_LRU_stat_cur.unzip; + + buf_refresh_io_stats(buf_pool); +} + +/*********************************************************************//** +Prints info of the buffer i/o. */ +UNIV_INTERN +void +buf_print_io_instance( +/*==================*/ + buf_pool_info_t*pool_info, /*!< in: buffer pool info */ + FILE* file) /*!< in/out: buffer where to print */ +{ + ut_ad(pool_info); + + fprintf(file, + "Buffer pool size %lu\n" + "Buffer pool size, bytes " ULINTPF "\n" + "Free buffers %lu\n" + "Database pages %lu\n" + "Old database pages %lu\n" + "Modified db pages %lu\n" + "Pending reads %lu\n" + "Pending writes: LRU %lu, flush list %lu, single page %lu\n", + pool_info->pool_size, + pool_info->pool_size_bytes, + pool_info->free_list_len, + pool_info->lru_len, + pool_info->old_lru_len, + pool_info->flush_list_len, + pool_info->n_pend_reads, + pool_info->n_pending_flush_lru, + pool_info->n_pending_flush_list, + pool_info->n_pending_flush_single_page); + + fprintf(file, + "Pages made young %lu, not young %lu\n" + "%.2f youngs/s, %.2f non-youngs/s\n" + "Pages read %lu, created %lu, written %lu\n" + "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", + pool_info->n_pages_made_young, + pool_info->n_pages_not_made_young, + pool_info->page_made_young_rate, + pool_info->page_not_made_young_rate, + pool_info->n_pages_read, + pool_info->n_pages_created, + pool_info->n_pages_written, + pool_info->pages_read_rate, + pool_info->pages_created_rate, + pool_info->pages_written_rate); + + if (pool_info->n_page_get_delta) { + fprintf(file, + "Buffer pool hit rate %lu / 1000," + " young-making rate %lu / 1000 not %lu / 1000\n", + (ulong) (1000 - (1000 * pool_info->page_read_delta + / pool_info->n_page_get_delta)), + (ulong) (1000 * pool_info->young_making_delta + / pool_info->n_page_get_delta), + (ulong) (1000 * pool_info->not_young_making_delta + / pool_info->n_page_get_delta)); + } else { + fputs("No buffer pool page gets since the last printout\n", + file); + } + + /* Statistics about read ahead algorithm */ + fprintf(file, "Pages read ahead %.2f/s," + " evicted without access %.2f/s," + " Random read ahead %.2f/s\n", + + pool_info->pages_readahead_rate, + pool_info->pages_evicted_rate, + pool_info->pages_readahead_rnd_rate); + + /* Print some values to help us with visualizing what is + happening with LRU eviction. */ + fprintf(file, + "LRU len: %lu, unzip_LRU len: %lu\n" + "I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n", + pool_info->lru_len, pool_info->unzip_lru_len, + pool_info->io_sum, pool_info->io_cur, + pool_info->unzip_sum, pool_info->unzip_cur); +} + +/*********************************************************************//** +Prints info of the buffer i/o. 
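+With multiple buffer pool instances the aggregated totals are printed
+first, followed by a section per individual instance.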
*/ +UNIV_INTERN +void +buf_print_io( +/*=========*/ + FILE* file) /*!< in/out: buffer where to print */ +{ + ulint i; + buf_pool_info_t* pool_info; + buf_pool_info_t* pool_info_total; + + /* If srv_buf_pool_instances is greater than 1, allocate + one extra buf_pool_info_t, the last one stores + aggregated/total values from all pools */ + if (srv_buf_pool_instances > 1) { + pool_info = (buf_pool_info_t*) mem_zalloc(( + srv_buf_pool_instances + 1) * sizeof *pool_info); + + pool_info_total = &pool_info[srv_buf_pool_instances]; + } else { + ut_a(srv_buf_pool_instances == 1); + + pool_info_total = pool_info = + static_cast<buf_pool_info_t*>( + mem_zalloc(sizeof *pool_info)); + } + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + /* Fetch individual buffer pool info and calculate + aggregated stats along the way */ + buf_stats_get_pool_info(buf_pool, i, pool_info); + + /* If we have more than one buffer pool, store + the aggregated stats */ + if (srv_buf_pool_instances > 1) { + buf_stats_aggregate_pool_info(pool_info_total, + &pool_info[i]); + } + } + + /* Print the aggreate buffer pool info */ + buf_print_io_instance(pool_info_total, file); + + /* If there are more than one buffer pool, print each individual pool + info */ + if (srv_buf_pool_instances > 1) { + fputs("----------------------\n" + "INDIVIDUAL BUFFER POOL INFO\n" + "----------------------\n", file); + + for (i = 0; i < srv_buf_pool_instances; i++) { + fprintf(file, "---BUFFER POOL %lu\n", i); + buf_print_io_instance(&pool_info[i], file); + } + } + + mem_free(pool_info); +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +buf_refresh_io_stats( +/*=================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + buf_pool->last_printout_time = ut_time(); + buf_pool->old_stat = buf_pool->stat; +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +buf_refresh_io_stats_all(void) +/*==========================*/ +{ + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_refresh_io_stats(buf_pool); + } +} + +/**********************************************************************//** +Check if all pages in all buffer pools are in a replacable state. +@return FALSE if not */ +UNIV_INTERN +ibool +buf_all_freed(void) +/*===============*/ +{ + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + if (!buf_all_freed_instance(buf_pool)) { + return(FALSE); + } + } + + return(TRUE); +} + +/*********************************************************************//** +Checks that there currently are no pending i/o-operations for the buffer +pool. 
+@return number of pending i/o */ +UNIV_INTERN +ulint +buf_pool_check_no_pending_io(void) +/*==============================*/ +{ + ulint i; + ulint pending_io = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + pending_io += buf_pool->n_pend_reads; + + mutex_enter(&buf_pool->flush_state_mutex); + + pending_io += buf_pool->n_flush[BUF_FLUSH_LRU]; + pending_io += buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]; + pending_io += buf_pool->n_flush[BUF_FLUSH_LIST]; + + mutex_exit(&buf_pool->flush_state_mutex); + } + + return(pending_io); +} + +#if 0 +Code currently not used +/*********************************************************************//** +Gets the current length of the free list of buffer blocks. +@return length of the free list */ +UNIV_INTERN +ulint +buf_get_free_list_len(void) +/*=======================*/ +{ + ulint len; + + mutex_enter(&buf_pool->free_list_mutex); + + len = UT_LIST_GET_LEN(buf_pool->free); + + mutex_exit(&buf_pool->free_list_mutex); + + return(len); +} +#endif + +#else /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Inits a page to the buffer buf_pool, for use in mysqlbackup --restore. */ +UNIV_INTERN +void +buf_page_init_for_backup_restore( +/*=============================*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space + in units of a page */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + buf_block_t* block) /*!< in: block to init */ +{ + block->page.state = BUF_BLOCK_FILE_PAGE; + block->page.space = space; + block->page.offset = offset; + + page_zip_des_init(&block->page.zip); + + /* We assume that block->page.data has been allocated + with zip_size == UNIV_PAGE_SIZE. */ + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + ut_ad(ut_is_2pow(zip_size)); + page_zip_set_size(&block->page.zip, zip_size); + if (zip_size) { + block->page.zip.data = block->frame + UNIV_PAGE_SIZE; + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/buf/buf0checksum.cc b/storage/xtradb/buf/buf0checksum.cc new file mode 100644 index 00000000000..ec79bbe6be9 --- /dev/null +++ b/storage/xtradb/buf/buf0checksum.cc @@ -0,0 +1,155 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0checksum.cc +Buffer pool checksum functions, also linked from /extra/innochecksum.cc + +Created Aug 11, 2011 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "fil0fil.h" /* FIL_* */ +#include "ut0crc32.h" /* ut_crc32() */ +#include "ut0rnd.h" /* ut_fold_binary() */ + +#ifndef UNIV_INNOCHECKSUM + +#include "srv0srv.h" /* SRV_CHECKSUM_* */ +#include "buf0types.h" + +/** the macro MYSQL_SYSVAR_ENUM() requires "long unsigned int" and if we +use srv_checksum_algorithm_t here then we get a compiler error: +ha_innodb.cc:12251: error: cannot convert 'srv_checksum_algorithm_t*' to + 'long unsigned int*' in initialization */ +UNIV_INTERN ulong srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB; + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************************//** +Calculates a page CRC32 which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. +@return checksum */ +UNIV_INTERN +ib_uint32_t +buf_calc_page_crc32( +/*================*/ + const byte* page) /*!< in: buffer page */ +{ + ib_uint32_t checksum; + + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool + to the first pages of data files, we have to skip them in the page + checksum calculation. + We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the + checksum is stored, and also the last 8 bytes of page because + there we store the old formula checksum. */ + + checksum = ut_crc32(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET) + ^ ut_crc32(page + FIL_PAGE_DATA, + UNIV_PAGE_SIZE - FIL_PAGE_DATA + - FIL_PAGE_END_LSN_OLD_CHKSUM); + + return(checksum); +} + +/********************************************************************//** +Calculates a page checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. +@return checksum */ +UNIV_INTERN +ulint +buf_calc_page_new_checksum( +/*=======================*/ + const byte* page) /*!< in: buffer page */ +{ + ulint checksum; + + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool + to the first pages of data files, we have to skip them in the page + checksum calculation. + We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the + checksum is stored, and also the last 8 bytes of page because + there we store the old formula checksum. */ + + checksum = ut_fold_binary(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET) + + ut_fold_binary(page + FIL_PAGE_DATA, + UNIV_PAGE_SIZE - FIL_PAGE_DATA + - FIL_PAGE_END_LSN_OLD_CHKSUM); + checksum = checksum & 0xFFFFFFFFUL; + + return(checksum); +} + +/********************************************************************//** +In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only +looked at the first few bytes of the page. This calculates that old +checksum. 
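+
+A minimal sketch of the old formula's coverage, using the constants
+from the functions above (the concrete offset, conventionally 26, is
+an assumption from fil0fil.h): it folds only the page head, bytes
+[0, FIL_PAGE_FILE_FLUSH_LSN), and that range includes
+FIL_PAGE_SPACE_OR_CHKSUM at bytes [0, 4) where the new-formula
+checksum is stored:
+
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN)
+ & 0xFFFFFFFFUL;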
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+ srv_checksum_algorithm_t algo) /*!< in: algorithm */
+{
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ return("crc32");
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ return("innodb");
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return("none");
+ }
+
+ ut_error;
+ return(NULL);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc
new file mode 100644
index 00000000000..f4d1c637e3e
--- /dev/null
+++ b/storage/xtradb/buf/buf0dblwr.cc
@@ -0,0 +1,1172 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+
+#ifndef UNIV_HOTBACKUP
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t buf_dblwr_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** The doublewrite buffer */
+UNIV_INTERN buf_dblwr_t* buf_dblwr = NULL;
+
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool buf_dblwr_being_created = FALSE;
+
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
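+
+In sketch form, the test below is a membership check against the two
+page-number ranges occupied by the buffer (the block start pages are
+read from the TRX_SYS header at startup):
+
+ [block1, block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ [block2, block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)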
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+ ulint page_no) /*!< in: page number */
+{
+ if (buf_dblwr == NULL) {
+
+ return(FALSE);
+ }
+
+ if (page_no >= buf_dblwr->block1
+ && page_no < buf_dblwr->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ if (page_no >= buf_dblwr->block2
+ && page_no < buf_dblwr->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
+doublewrite buffer within it.
+@return pointer to the doublewrite buffer within the filespace header
+page. */
+UNIV_INLINE
+byte*
+buf_dblwr_get(
+/*==========*/
+ mtr_t* mtr) /*!< in/out: MTR to hold the page latch */
+{
+ buf_block_t* block;
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
+}
+
+/********************************************************************//**
+Flush a batch of writes to the datafiles that have already been
+written to the dblwr buffer on disk. */
+UNIV_INLINE
+void
+buf_dblwr_sync_datafiles()
+/*======================*/
+{
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system */
+ os_aio_simulated_wake_handler_threads();
+
+ /* Wait that all async writes to tablespaces have been posted to
+ the OS */
+ os_aio_wait_until_no_pending_writes();
+
+ /* Now we flush the data to disk (for example, with fsync) */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+}
+
+/****************************************************************//**
+Creates or initializes the doublewrite buffer at a database start. */
+static
+void
+buf_dblwr_init(
+/*===========*/
+ byte* doublewrite) /*!< in: pointer to the doublewrite buf
+ header on trx sys page */
+{
+ ulint buf_size;
+
+ buf_dblwr = static_cast<buf_dblwr_t*>(
+ mem_zalloc(sizeof(buf_dblwr_t)));
+
+ /* There are two blocks of same size in the doublewrite
+ buffer. */
+ buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+
+ /* There must be at least one buffer for single page writes
+ and one buffer for batch writes. */
+ ut_a(srv_doublewrite_batch_size > 0
+ && srv_doublewrite_batch_size < buf_size);
+
+ mutex_create(buf_dblwr_mutex_key,
+ &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
+
+ buf_dblwr->b_event = os_event_create();
+ buf_dblwr->s_event = os_event_create();
+ buf_dblwr->first_free = 0;
+ buf_dblwr->s_reserved = 0;
+ buf_dblwr->b_reserved = 0;
+
+ buf_dblwr->block1 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+ buf_dblwr->block2 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+
+ buf_dblwr->in_use = static_cast<bool*>(
+ mem_zalloc(buf_size * sizeof(bool)));
+
+ buf_dblwr->write_buf_unaligned = static_cast<byte*>(
+ ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
+
+ buf_dblwr->write_buf = static_cast<byte*>(
+ ut_align(buf_dblwr->write_buf_unaligned,
+ UNIV_PAGE_SIZE));
+
+ buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
+ mem_zalloc(buf_size * sizeof(void*)));
+}
+
+/****************************************************************//**
+Creates the doublewrite buffer for a new InnoDB installation. The header
+of the doublewrite buffer is placed on the trx system header page.
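+
+As a sketch, the header that buf_dblwr_init() later reads back sits at
+byte offset TRX_SYS_DOUBLEWRITE in that page and carries (field names
+as used below; the exact offsets are assumptions from trx0sys.h):
+
+ TRX_SYS_DOUBLEWRITE_FSEG file segment header of the buffer
+ TRX_SYS_DOUBLEWRITE_MAGIC magic number marking a valid header
+ TRX_SYS_DOUBLEWRITE_BLOCK1 first page number of block 1
+ TRX_SYS_DOUBLEWRITE_BLOCK2 first page number of block 2
+ (MAGIC/BLOCK1/BLOCK2 repeated at offset TRX_SYS_DOUBLEWRITE_REPEAT)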
*/ +UNIV_INTERN +void +buf_dblwr_create(void) +/*==================*/ +{ + buf_block_t* block2; + buf_block_t* new_block; + byte* doublewrite; + byte* fseg_header; + ulint page_no; + ulint prev_page_no; + ulint i; + mtr_t mtr; + + if (buf_dblwr) { + /* Already inited */ + + return; + } + +start_again: + mtr_start(&mtr); + buf_dblwr_being_created = TRUE; + + doublewrite = buf_dblwr_get(&mtr); + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has already been created: + just read in some numbers */ + + buf_dblwr_init(doublewrite); + + mtr_commit(&mtr); + buf_dblwr_being_created = FALSE; + return; + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Doublewrite buffer not found: creating new"); + + if (buf_pool_get_curr_size() + < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2 + 100) + * UNIV_PAGE_SIZE)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create doublewrite buffer: you must " + "increase your buffer pool size. Cannot continue " + "operation."); + + exit(EXIT_FAILURE); + } + + block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + + buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); + + if (block2 == NULL) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create doublewrite buffer: you must " + "increase your tablespace size. " + "Cannot continue operation."); + + /* We exit without committing the mtr to prevent + its modifications to the database getting to disk */ + + exit(EXIT_FAILURE); + } + + fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG; + prev_page_no = 0; + + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2; i++) { + new_block = fseg_alloc_free_page( + fseg_header, prev_page_no + 1, FSP_UP, &mtr); + if (new_block == NULL) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create doublewrite buffer: you must " + "increase your tablespace size. " + "Cannot continue operation."); + + exit(EXIT_FAILURE); + } + + /* We read the allocated pages to the buffer pool; + when they are written to disk in a flush, the space + id and page number fields are also written to the + pages. When we at database startup read pages + from the doublewrite buffer, we know that if the + space id and page number in them are the same as + the page position in the tablespace, then the page + has not been written to in doublewrite. */ + + ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); + page_no = buf_block_get_page_no(new_block); + + if (i == FSP_EXTENT_SIZE / 2) { + ut_a(page_no == FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + + } else if (i == FSP_EXTENT_SIZE / 2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + ut_a(page_no == 2 * FSP_EXTENT_SIZE); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + + } else if (i > FSP_EXTENT_SIZE / 2) { + ut_a(page_no == prev_page_no + 1); + } + + if (((i + 1) & 15) == 0) { + /* rw_locks can only be recursively x-locked + 2048 times. (on 32 bit platforms, + (lint) 0 - (X_LOCK_DECR * 2049) + is no longer a negative number, and thus + lock_word becomes like a shared lock). 
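+ (Worked out under the assumption that X_LOCK_DECR is
+ 0x00100000, as defined in sync0rw.h: after 2048 locks,
+ (lint) 0 - (X_LOCK_DECR * 2048) = -0x80000000 is still
+ negative on a 32-bit lint; one more lock gives
+ 0 - (X_LOCK_DECR * 2049), which wraps to 0x7FF00000 > 0
+ and therefore reads as a shared-locked word.)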
+ For 4k page size this loop will + lock the fseg header too many times. Since + this code is not done while any other threads + are active, restart the MTR occasionally. */ + mtr_commit(&mtr); + mtr_start(&mtr); + doublewrite = buf_dblwr_get(&mtr); + fseg_header = doublewrite + + TRX_SYS_DOUBLEWRITE_FSEG; + } + + prev_page_no = page_no; + } + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(LSN_MAX, TRUE); + + /* Remove doublewrite pages from LRU */ + buf_pool_invalidate(); + + ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created"); + + goto start_again; +} + +/****************************************************************//** +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function loads the pages from double write buffer into memory. */ +void +buf_dblwr_init_or_load_pages( +/*=========================*/ + os_file_t file, + char* path, + bool load_corrupt_pages) +{ + byte* buf; + byte* read_buf; + byte* unaligned_read_buf; + ulint block1; + ulint block2; + byte* page; + ibool reset_space_ids = FALSE; + byte* doublewrite; + ulint space_id; + ulint i; + ulint block_bytes = 0; + recv_dblwr_t& recv_dblwr = recv_sys->dblwr; + + /* We do the file i/o past the buffer pool */ + + unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + + read_buf = static_cast<byte*>( + ut_align(unaligned_read_buf, UNIV_PAGE_SIZE)); + + /* Read the trx sys header to check if we are using the doublewrite + buffer */ + off_t trx_sys_page = TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE; + os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE); + + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has been created */ + + buf_dblwr_init(doublewrite); + + block1 = buf_dblwr->block1; + block2 = buf_dblwr->block2; + + buf = buf_dblwr->write_buf; + } else { + goto leave_func; + } + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + + /* We are upgrading from a version < 4.1.x to a version where + multiple tablespaces are supported. We must reset the space id + field in the pages in the doublewrite buffer because starting + from this version the space id is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. 
*/ + + reset_space_ids = TRUE; + + ib_logf(IB_LOG_LEVEL_INFO, + "Resetting space id's in the doublewrite buffer"); + } + + /* Read the pages from the doublewrite buffer to memory */ + + block_bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; + + os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes); + os_file_read(file, buf + block_bytes, block2 * UNIV_PAGE_SIZE, + block_bytes); + + /* Check if any of these pages is half-written in data files, in the + intended position */ + + page = buf; + + for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { + + ulint source_page_no; + + if (reset_space_ids) { + + space_id = 0; + mach_write_to_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + /* We do not need to calculate new checksums for the + pages because the field .._SPACE_ID does not affect + them. Write the page back to where we read it from. */ + + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + source_page_no = block1 + i; + } else { + source_page_no = block2 + + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + os_file_write(path, file, page, + source_page_no * UNIV_PAGE_SIZE, + UNIV_PAGE_SIZE); + + } else if (load_corrupt_pages) { + + recv_dblwr.add(page); + } + + page += UNIV_PAGE_SIZE; + } + + if (reset_space_ids) { + os_file_flush(file); + } + +leave_func: + ut_free(unaligned_read_buf); +} + +/****************************************************************//** +Process the double write buffer pages. */ +void +buf_dblwr_process() +/*===============*/ +{ + ulint space_id; + ulint page_no; + ulint page_no_dblwr = 0; + byte* page; + byte* read_buf; + byte* unaligned_read_buf; + recv_dblwr_t& recv_dblwr = recv_sys->dblwr; + + unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + + read_buf = static_cast<byte*>( + ut_align(unaligned_read_buf, UNIV_PAGE_SIZE)); + + for (std::list<byte*>::iterator i = recv_dblwr.pages.begin(); + i != recv_dblwr.pages.end(); ++i, ++page_no_dblwr ) { + + page = *i; + page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + space_id = mach_read_from_4(page + FIL_PAGE_SPACE_ID); + + if (!fil_tablespace_exists_in_mem(space_id)) { + /* Maybe we have dropped the single-table tablespace + and this page once belonged to it: do nothing */ + + } else if (!fil_check_adress_in_tablespace(space_id, + page_no)) { + ib_logf(IB_LOG_LEVEL_WARN, + "A page in the doublewrite buffer is not " + "within space bounds; space id %lu " + "page number %lu, page %lu in " + "doublewrite buf.", + (ulong) space_id, (ulong) page_no, + page_no_dblwr); + } else { + ulint zip_size = fil_space_get_zip_size(space_id); + + /* Read in the actual page from the file */ + fil_io(OS_FILE_READ, true, space_id, zip_size, + page_no, 0, + zip_size ? 
zip_size : UNIV_PAGE_SIZE, + read_buf, NULL); + + /* Check if the page is corrupt */ + + if (buf_page_is_corrupted(true, read_buf, zip_size)) { + + fprintf(stderr, + "InnoDB: Warning: database page" + " corruption or a failed\n" + "InnoDB: file read of" + " space %lu page %lu.\n" + "InnoDB: Trying to recover it from" + " the doublewrite buffer.\n", + (ulong) space_id, (ulong) page_no); + + if (buf_page_is_corrupted(true, + page, zip_size)) { + fprintf(stderr, + "InnoDB: Dump of the page:\n"); + buf_page_print( + read_buf, zip_size, + BUF_PAGE_PRINT_NO_CRASH); + fprintf(stderr, + "InnoDB: Dump of" + " corresponding page" + " in doublewrite buffer:\n"); + buf_page_print( + page, zip_size, + BUF_PAGE_PRINT_NO_CRASH); + + fprintf(stderr, + "InnoDB: Also the page in the" + " doublewrite buffer" + " is corrupt.\n" + "InnoDB: Cannot continue" + " operation.\n" + "InnoDB: You can try to" + " recover the database" + " with the my.cnf\n" + "InnoDB: option:\n" + "InnoDB:" + " innodb_force_recovery=6\n"); + ut_error; + } + + /* Write the good page from the + doublewrite buffer to the intended + position */ + + fil_io(OS_FILE_WRITE, true, space_id, + zip_size, page_no, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + page, NULL); + + ib_logf(IB_LOG_LEVEL_INFO, + "Recovered the page from" + " the doublewrite buffer."); + + } else if (buf_page_is_zeroes(read_buf, zip_size)) { + + if (!buf_page_is_zeroes(page, zip_size) + && !buf_page_is_corrupted(true, page, + zip_size)) { + + /* Database page contained only + zeroes, while a valid copy is + available in dblwr buffer. */ + + fil_io(OS_FILE_WRITE, true, space_id, + zip_size, page_no, 0, + zip_size ? zip_size + : UNIV_PAGE_SIZE, + page, NULL); + } + } + } + } + + fil_flush_file_spaces(FIL_TABLESPACE); + ut_free(unaligned_read_buf); +} + +/****************************************************************//** +Frees doublewrite buffer. */ +UNIV_INTERN +void +buf_dblwr_free(void) +/*================*/ +{ + /* Free the double write data structures. */ + ut_a(buf_dblwr != NULL); + ut_ad(buf_dblwr->s_reserved == 0); + ut_ad(buf_dblwr->b_reserved == 0); + + os_event_free(buf_dblwr->b_event); + os_event_free(buf_dblwr->s_event); + ut_free(buf_dblwr->write_buf_unaligned); + buf_dblwr->write_buf_unaligned = NULL; + + mem_free(buf_dblwr->buf_block_arr); + buf_dblwr->buf_block_arr = NULL; + + mem_free(buf_dblwr->in_use); + buf_dblwr->in_use = NULL; + + mutex_free(&buf_dblwr->mutex); + mem_free(buf_dblwr); + buf_dblwr = NULL; +} + +/********************************************************************//** +Updates the doublewrite buffer when an IO request is completed. */ +UNIV_INTERN +void +buf_dblwr_update( +/*=============*/ + const buf_page_t* bpage, /*!< in: buffer block descriptor */ + buf_flush_t flush_type)/*!< in: flush type */ +{ + if (!srv_use_doublewrite_buf || buf_dblwr == NULL) { + return; + } + + switch (flush_type) { + case BUF_FLUSH_LIST: + case BUF_FLUSH_LRU: + mutex_enter(&buf_dblwr->mutex); + + ut_ad(buf_dblwr->batch_running); + ut_ad(buf_dblwr->b_reserved > 0); + ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); + + buf_dblwr->b_reserved--; + + if (buf_dblwr->b_reserved == 0) { + mutex_exit(&buf_dblwr->mutex); + /* This will finish the batch. Sync data files + to the disk. 
*/
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ mutex_enter(&buf_dblwr->mutex);
+
+ /* We can now reuse the doublewrite memory buffer: */
+ buf_dblwr->first_free = 0;
+ buf_dblwr->batch_running = false;
+ os_event_set(buf_dblwr->b_event);
+ }
+
+ mutex_exit(&buf_dblwr->mutex);
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ {
+ const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ ulint i;
+ mutex_enter(&buf_dblwr->mutex);
+ for (i = srv_doublewrite_batch_size; i < size; ++i) {
+ if (buf_dblwr->buf_block_arr[i] == bpage) {
+ buf_dblwr->s_reserved--;
+ buf_dblwr->buf_block_arr[i] = NULL;
+ buf_dblwr->in_use[i] = false;
+ break;
+ }
+ }
+
+ /* The block we are looking for must exist as a
+ reserved block. */
+ ut_a(i < size);
+ }
+ os_event_set(buf_dblwr->s_event);
+ mutex_exit(&buf_dblwr->mutex);
+ break;
+ case BUF_FLUSH_N_TYPES:
+ ut_error;
+ }
+}
+
+/********************************************************************//**
+Check the LSN values on the page. */
+static
+void
+buf_dblwr_check_page_lsn(
+/*=====================*/
+ const page_t* page) /*!< in: page to check */
+{
+ if (memcmp(page + (FIL_PAGE_LSN + 4),
+ page + (UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ 4)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: The page to be written"
+ " seems corrupt!\n"
+ "InnoDB: The low 4 bytes of LSN fields do not match "
+ "(" ULINTPF " != " ULINTPF ")!"
+ " Noticed in the buffer pool.\n",
+ mach_read_from_4(
+ page + FIL_PAGE_LSN + 4),
+ mach_read_from_4(
+ page + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
+ }
+}
+
+/********************************************************************//**
+Asserts when a corrupt block is found while writing out data to the
+disk. */
+static
+void
+buf_dblwr_assert_on_corrupt_block(
+/*==============================*/
+ const buf_block_t* block) /*!< in: block to check */
+{
+ buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Apparent corruption of an"
+ " index page n:o %lu in space %lu\n"
+ "InnoDB: to be written to data file."
+ " We intentionally crash server\n"
+ "InnoDB: to prevent corrupt data"
+ " from ending up in data\n"
+ "InnoDB: files.\n",
+ (ulong) buf_block_get_page_no(block),
+ (ulong) buf_block_get_space(block));
+
+ ut_error;
+}
+
+/********************************************************************//**
+Check the LSN values on the page with which this block is associated.
+Also validate the page if the option is set. */
+static
+void
+buf_dblwr_check_block(
+/*==================*/
+ const buf_block_t* block) /*!< in: block to check */
+{
+ if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+ || block->page.zip.data) {
+ /* No simple validate for compressed pages exists. */
+ return;
+ }
+
+ buf_dblwr_check_page_lsn(block->frame);
+
+ if (!block->check_index_page_at_flush) {
+ return;
+ }
+
+ if (page_is_comp(block->frame)) {
+ if (!page_simple_validate_new(block->frame)) {
+ buf_dblwr_assert_on_corrupt_block(block);
+ }
+ } else if (!page_simple_validate_old(block->frame)) {
+
+ buf_dblwr_assert_on_corrupt_block(block);
+ }
+}
+
+/********************************************************************//**
+Writes a page that has already been written to the doublewrite buffer
+to the datafile. It is the job of the caller to sync the datafile.
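+
+A minimal sketch of the caller's side of that contract (this mirrors
+the batch loop later in this file; fil_flush_file_spaces() is the
+sync step):
+
+ for (ulint i = 0; i < first_free; i++) {
+ buf_dblwr_write_block_to_datafile(
+ buf_dblwr->buf_block_arr[i], false);
+ }
+ /* ... later, when the whole batch has completed ... */
+ fil_flush_file_spaces(FIL_TABLESPACE);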
*/
+static
+void
+buf_dblwr_write_block_to_datafile(
+/*==============================*/
+ const buf_page_t* bpage, /*!< in: page to write */
+ bool sync) /*!< in: true if sync IO
+ is requested */
+{
+ ut_a(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ const ulint flags = sync
+ ? OS_FILE_WRITE
+ : OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
+
+ if (bpage->zip.data) {
+ fil_io(flags, sync, buf_page_get_space(bpage),
+ buf_page_get_zip_size(bpage),
+ buf_page_get_page_no(bpage), 0,
+ buf_page_get_zip_size(bpage),
+ (void*) bpage->zip.data,
+ (void*) bpage);
+
+ return;
+ }
+
+
+ const buf_block_t* block = (buf_block_t*) bpage;
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ buf_dblwr_check_page_lsn(block->frame);
+
+ fil_io(flags, sync, buf_block_get_space(block), 0,
+ buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
+ (void*) block->frame, (void*) block);
+
+}
+
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void)
+/*=================================*/
+{
+ byte* write_buf;
+ ulint first_free;
+ ulint len;
+
+ if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+ /* Sync the writes to the disk. */
+ buf_dblwr_sync_datafiles();
+ return;
+ }
+
+try_again:
+ mutex_enter(&buf_dblwr->mutex);
+
+ /* Write first to doublewrite buffer blocks. We use synchronous
+ aio and thus know that file write has been completed when the
+ control returns. */
+
+ if (buf_dblwr->first_free == 0) {
+
+ mutex_exit(&buf_dblwr->mutex);
+
+ return;
+ }
+
+ if (buf_dblwr->batch_running) {
+ /* Another thread is running the batch right now. Wait
+ for it to finish. */
+ ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
+ mutex_exit(&buf_dblwr->mutex);
+
+ os_event_wait_low(buf_dblwr->b_event, sig_count);
+ goto try_again;
+ }
+
+ ut_a(!buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+
+ /* Disallow anyone else to post to doublewrite buffer or to
+ start another batch of flushing. */
+ buf_dblwr->batch_running = true;
+ first_free = buf_dblwr->first_free;
+
+ /* Now safe to release the mutex. Note that though no other
+ thread is allowed to post to the doublewrite batch flushing,
+ any threads working on single page flushes are allowed
+ to proceed. */
+ mutex_exit(&buf_dblwr->mutex);
+
+ write_buf = buf_dblwr->write_buf;
+
+ for (ulint len2 = 0, i = 0;
+ i < buf_dblwr->first_free;
+ len2 += UNIV_PAGE_SIZE, i++) {
+
+ const buf_block_t* block;
+
+ block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
+
+ if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+ || block->page.zip.data) {
+ /* No simple validate for compressed
+ pages exists. */
+ continue;
+ }
+
+ /* Check that the actual page in the buffer pool is
+ not corrupt and the LSN values are sane. */
+ buf_dblwr_check_block(block);
+
+ /* Check that the page as written to the doublewrite
+ buffer has sane LSN values. */
+ buf_dblwr_check_page_lsn(write_buf + len2);
+ }
+
+ /* Write out the first block of the doublewrite buffer */
+ len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+ buf_dblwr->first_free) * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block1, 0, len,
+ (void*) write_buf, NULL);
+
+ if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ /* No unwritten pages in the second block. */
+ goto flush;
+ }
+
+ /* Write out the second block of the doublewrite buffer. */
+ len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ * UNIV_PAGE_SIZE;
+
+ write_buf = buf_dblwr->write_buf
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block2, 0, len,
+ (void*) write_buf, NULL);
+
+flush:
+ /* increment the doublewrite flushed pages counter */
+ srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
+ srv_stats.dblwr_writes.inc();
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+ blocks. Next do the writes to the intended positions. */
+
+ /* Up to this point first_free and buf_dblwr->first_free are
+ the same because we have set the buf_dblwr->batch_running flag,
+ disallowing any other thread to post any request, but we
+ can't safely access buf_dblwr->first_free in the loop below.
+ This is so because it is possible that after we are done with
+ the last iteration and before we terminate the loop, the batch
+ gets finished in the IO helper thread and another thread posts
+ a new batch setting buf_dblwr->first_free to a higher value.
+ If this happens and we are using buf_dblwr->first_free in the
+ loop termination condition then we'll end up dispatching
+ the same block twice from two different threads. */
+ ut_ad(first_free == buf_dblwr->first_free);
+ for (ulint i = 0; i < first_free; i++) {
+ buf_dblwr_write_block_to_datafile(
+ buf_dblwr->buf_block_arr[i], false);
+ }
+
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system. We don't flush the files
+ at this point. We leave it to the IO helper thread to flush
+ datafiles when the whole batch has been processed. */
+ os_aio_simulated_wake_handler_threads();
+}
+
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+ buf_page_t* bpage) /*!< in: buffer block to write */
+{
+ ulint zip_size;
+
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));
+
+try_again:
+ mutex_enter(&buf_dblwr->mutex);
+
+ ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);
+
+ if (buf_dblwr->batch_running) {
+
+ /* This is not nearly as bad as it looks. There is only
+ one page_cleaner thread, which does background flushing
+ in batches, therefore it is unlikely to be a contention
+ point. The only exception is when a user thread is
+ forced to do a flush batch because of a sync
+ checkpoint. */
+ ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
+ mutex_exit(&buf_dblwr->mutex);
+
+ os_event_wait_low(buf_dblwr->b_event, sig_count);
+ goto try_again;
+ }
+
+ if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+ mutex_exit(&(buf_dblwr->mutex));
+
+ buf_dblwr_flush_buffered_writes();
+
+ goto try_again;
+ }
+
+ zip_size = buf_page_get_zip_size(bpage);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
+ /* Copy the compressed page and clear the rest. */
+ memcpy(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+ bpage->zip.data, zip_size);
+ memset(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free
+ + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+ } else {
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+
+ memcpy(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+ ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+ }
+
+ buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
+
+ buf_dblwr->first_free++;
+ buf_dblwr->b_reserved++;
+
+ ut_ad(!buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+ ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);
+
+ if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+ mutex_exit(&(buf_dblwr->mutex));
+
+ buf_dblwr_flush_buffered_writes();
+
+ return;
+ }
+
+ mutex_exit(&(buf_dblwr->mutex));
+}
+
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, syncs it, then writes
+the page to the datafile and syncs the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+ buf_page_t* bpage, /*!< in: buffer block to write */
+ bool sync) /*!< in: true if sync IO requested */
+{
+ ulint n_slots;
+ ulint size;
+ ulint zip_size;
+ ulint offset;
+ ulint i;
+
+ ut_a(buf_page_in_file(bpage));
+ ut_a(srv_use_doublewrite_buf);
+ ut_a(buf_dblwr != NULL);
+
+ /* The slots reserved for single page flushes run from
+ srv_doublewrite_batch_size to the end of the buffer. */
+ size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ ut_a(size > srv_doublewrite_batch_size);
+ n_slots = size - srv_doublewrite_batch_size;
+
+ if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+
+ /* Check that the actual page in the buffer pool is
+ not corrupt and the LSN values are sane. */
+ buf_dblwr_check_block((buf_block_t*) bpage);
+
+ /* Check that the page as written to the doublewrite
+ buffer has sane LSN values. */
+ if (!bpage->zip.data) {
+ buf_dblwr_check_page_lsn(
+ ((buf_block_t*) bpage)->frame);
+ }
+ }
+
+retry:
+ mutex_enter(&buf_dblwr->mutex);
+ if (buf_dblwr->s_reserved == n_slots) {
+
+ /* All slots are reserved. */
+ ib_int64_t sig_count =
+ os_event_reset(buf_dblwr->s_event);
+ mutex_exit(&buf_dblwr->mutex);
+ os_event_wait_low(buf_dblwr->s_event, sig_count);
+
+ goto retry;
+ }
+
+ for (i = srv_doublewrite_batch_size; i < size; ++i) {
+
+ if (!buf_dblwr->in_use[i]) {
+ break;
+ }
+ }
+
+ /* We are guaranteed to find a slot. */
+ ut_a(i < size);
+ buf_dblwr->in_use[i] = true;
+ buf_dblwr->s_reserved++;
+ buf_dblwr->buf_block_arr[i] = bpage;
+
+ /* increment the doublewrite flushed pages counter */
+ srv_stats.dblwr_pages_written.inc();
+ srv_stats.dblwr_writes.inc();
+
+ mutex_exit(&buf_dblwr->mutex);
+
+ /* Let's see if we are going to write in the first or second
+ block of the doublewrite buffer. */
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ offset = buf_dblwr->block1 + i;
+ } else {
+ offset = buf_dblwr->block2 + i
+ - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ /* We deal with compressed and uncompressed pages a little
+ differently here. In case of uncompressed pages we can
+ directly write the block to the allocated slot in the
+ doublewrite buffer in the system tablespace and then after
+ syncing the system table space we can proceed to write the page
+ in the datafile.
+ In case of compressed page we first do a memcpy of the block
+ to the in-memory buffer of doublewrite before proceeding to
+ write it. This is so because we want to pad the remaining
+ bytes in the doublewrite page with zeros. */
+
+ zip_size = buf_page_get_zip_size(bpage);
+ if (zip_size) {
+ memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
+ bpage->zip.data, zip_size);
+ memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
+ + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ offset, 0, UNIV_PAGE_SIZE,
+ (void*) (buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * i), NULL);
+ } else {
+ /* It is a regular page. Write it directly to the
+ doublewrite buffer */
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ offset, 0, UNIV_PAGE_SIZE,
+ (void*) ((buf_block_t*) bpage)->frame,
+ NULL);
+ }
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the write has been flushed to disk now
+ and during recovery we will find it in the doublewrite buffer
+ blocks. Next do the write to the intended position. */
+ buf_dblwr_write_block_to_datafile(bpage, sync);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0dump.cc b/storage/xtradb/buf/buf0dump.cc
new file mode 100644
index 00000000000..090e8cac63b
--- /dev/null
+++ b/storage/xtradb/buf/buf0dump.cc
@@ -0,0 +1,621 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
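+
+As a sketch, the dump file written here holds one page per line,
+"space_id,page_no" in decimal, and in memory each entry is packed into
+one 64-bit value so that a plain integer sort orders the dump by
+(space_id, page_no):
+
+ dump = ((ib_uint64_t) space_id << 32) | page_no;
+ space_id = (ulint) (dump >> 32);
+ page_no = (ulint) (dump & 0xFFFFFFFFUL);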
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+
+#include <stdarg.h> /* va_* */
+#include <string.h> /* strerror() */
+
+#include "buf0buf.h" /* srv_buf_pool_instances */
+#include "buf0dump.h"
+#include "db0err.h"
+#include "dict0dict.h" /* dict_operation_lock */
+#include "os0file.h" /* OS_FILE_MAX_PATH */
+#include "os0sync.h" /* os_event* */
+#include "os0thread.h" /* os_thread_* */
+#include "srv0srv.h" /* srv_fast_shutdown, srv_buf_dump* */
+#include "srv0start.h" /* srv_shutdown_state */
+#include "sync0rw.h" /* rw_lock_s_lock() */
+#include "ut0byte.h" /* ut_ull_create() */
+#include "ut0sort.h" /* UT_SORT_FUNCTION_BODY */
+
+enum status_severity {
+ STATUS_INFO,
+ STATUS_NOTICE,
+ STATUS_ERR
+};
+
+#define SHUTTING_DOWN() (UNIV_UNLIKELY(srv_shutdown_state \
+ != SRV_SHUTDOWN_NONE))
+
+/* Flags that tell the buffer pool dump/load thread which action it
+should take after being woken up. */
+static ibool buf_dump_should_start = FALSE;
+static ibool buf_load_should_start = FALSE;
+
+static ibool buf_load_abort_flag = FALSE;
+
+/* Used to temporarily store dump info in order to avoid IO while holding
+buffer pool LRU list mutex during dump and also to sort the contents of the
+dump before reading the pages from disk during load.
+We store the space id in the high 32 bits and page no in low 32 bits. */
+typedef ib_uint64_t buf_dump_t;
+
+/* Aux macros to create buf_dump_t and to extract space and page from it */
+#define BUF_DUMP_CREATE(space, page) ut_ull_create(space, page)
+#define BUF_DUMP_SPACE(a) ((ulint) ((a) >> 32))
+#define BUF_DUMP_PAGE(a) ((ulint) ((a) & 0xFFFFFFFFUL))
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because all of MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_dump_start()
+/*============*/
+{
+ buf_dump_should_start = TRUE;
+ os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because all of MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_load_start()
+/*============*/
+{
+ buf_load_should_start = TRUE;
+ os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...)
/*!< in: extra parameters according + to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + ut_vsnprintf( + export_vars.innodb_buffer_pool_dump_status, + sizeof(export_vars.innodb_buffer_pool_dump_status), + fmt, ap); + + if (severity == STATUS_NOTICE || severity == STATUS_ERR) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: %s\n", + export_vars.innodb_buffer_pool_dump_status); + } + + va_end(ap); +} + +/*****************************************************************//** +Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). The value of this variable can be +retrieved by: +SELECT variable_value FROM information_schema.global_status WHERE +variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS'; +or by: +SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */ +static __attribute__((nonnull, format(printf, 2, 3))) +void +buf_load_status( +/*============*/ + enum status_severity severity,/*!< in: status severity */ + const char* fmt, /*!< in: format */ + ...) /*!< in: extra parameters according to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + ut_vsnprintf( + export_vars.innodb_buffer_pool_load_status, + sizeof(export_vars.innodb_buffer_pool_load_status), + fmt, ap); + + if (severity == STATUS_NOTICE || severity == STATUS_ERR) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: %s\n", + export_vars.innodb_buffer_pool_load_status); + } + + va_end(ap); +} + +/*****************************************************************//** +Perform a buffer pool dump into the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status(). 
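+
+A minimal sketch of the write-then-rename discipline followed below,
+so that a reader of the dump file never observes a half-written dump
+(error handling elided):
+
+ f = fopen(tmp_filename, "w"); /* "<full_filename>.incomplete" */
+ /* ... write all entries, then fclose(f) ... */
+ unlink(full_filename); /* ENOENT is tolerated */
+ rename(tmp_filename, full_filename);
+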
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_dump(
+/*=====*/
+ ibool obey_shutdown) /*!< in: quit if we are in a shutting down
+ state */
+{
+#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown)
+
+ char full_filename[OS_FILE_MAX_PATH];
+ char tmp_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ ulint i;
+ int ret;
+
+ ut_snprintf(full_filename, sizeof(full_filename),
+ "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+ srv_buf_dump_filename);
+
+ ut_snprintf(tmp_filename, sizeof(tmp_filename),
+ "%s.incomplete", full_filename);
+
+ buf_dump_status(STATUS_NOTICE, "Dumping buffer pool(s) to %s",
+ full_filename);
+
+ f = fopen(tmp_filename, "w");
+ if (f == NULL) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot open '%s' for writing: %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ /* walk through each buffer pool */
+ for (i = 0; i < srv_buf_pool_instances && !SHOULD_QUIT(); i++) {
+ buf_pool_t* buf_pool;
+ const buf_page_t* bpage;
+ buf_dump_t* dump;
+ ulint n_pages;
+ ulint j;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* obtain buf_pool LRU list mutex before allocating, since
+ UT_LIST_GET_LEN(buf_pool->LRU) could change */
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ n_pages = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ /* skip empty buffer pools */
+ if (n_pages == 0) {
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ continue;
+ }
+
+ dump = static_cast<buf_dump_t*>(
+ ut_malloc(n_pages * sizeof(*dump)));
+
+ if (dump == NULL) {
+ mutex_exit(&buf_pool->LRU_list_mutex);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (n_pages * sizeof(*dump)),
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), j = 0;
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage), j++) {
+
+ ut_a(buf_page_in_file(bpage));
+
+ dump[j] = BUF_DUMP_CREATE(buf_page_get_space(bpage),
+ buf_page_get_page_no(bpage));
+ }
+
+ ut_a(j == n_pages);
+
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+ ret = fprintf(f, ULINTPF "," ULINTPF "\n",
+ BUF_DUMP_SPACE(dump[j]),
+ BUF_DUMP_PAGE(dump[j]));
+ if (ret < 0) {
+ ut_free(dump);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot write to '%s': %s",
+ tmp_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ if (j % 128 == 0) {
+ buf_dump_status(
+ STATUS_INFO,
+ "Dumping buffer pool "
+ ULINTPF "/" ULINTPF ", "
+ "page " ULINTPF "/" ULINTPF,
+ i + 1, srv_buf_pool_instances,
+ j + 1, n_pages);
+ }
+ }
+
+ ut_free(dump);
+ }
+
+ ret = fclose(f);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot close '%s': %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ ret = unlink(full_filename);
+ if (ret != 0 && errno != ENOENT) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot delete '%s': %s",
+ full_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ ret = rename(tmp_filename, full_filename);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot rename '%s' to '%s': %s",
+ tmp_filename, full_filename,
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ /* success */
+
+ ut_sprintf_timestamp(now);
+
+ buf_dump_status(STATUS_NOTICE,
+ "Buffer pool(s) dump completed at %s", now);
+}
+
+/*****************************************************************//**
+Compare two buffer pool dump entries, used to sort the dump on
+space_no,page_no before loading in order to increase the chance for
+sequential IO.
+@return -1/0/1 if entry 1 is smaller/equal/bigger than entry 2 */
+static
+lint
+buf_dump_cmp(
+/*=========*/
+ const buf_dump_t d1, /*!< in: buffer pool dump entry 1 */
+ const buf_dump_t d2) /*!< in: buffer pool dump entry 2 */
+{
+ if (d1 < d2) {
+ return(-1);
+ } else if (d1 == d2) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/*****************************************************************//**
+Sort a buffer pool dump on space_no, page_no. */
+static
+void
+buf_dump_sort(
+/*==========*/
+ buf_dump_t* dump, /*!< in/out: buffer pool dump to sort */
+ buf_dump_t* tmp, /*!< in/out: temp storage */
+ ulint low, /*!< in: lowest index (inclusive) */
+ ulint high) /*!< in: highest index (non-inclusive) */
+{
+ UT_SORT_FUNCTION_BODY(buf_dump_sort, dump, tmp, low, high,
+ buf_dump_cmp);
+}
+
+/*****************************************************************//**
+Perform a buffer pool load from the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+ char full_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ buf_dump_t* dump;
+ buf_dump_t* dump_tmp;
+ ulint dump_n;
+ ulint total_buffer_pools_pages;
+ ulint i;
+ ulint space_id;
+ ulint page_no;
+ int fscanf_ret;
+
+ /* Ignore any leftovers from before */
+ buf_load_abort_flag = FALSE;
+
+ ut_snprintf(full_filename, sizeof(full_filename),
+ "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+ srv_buf_dump_filename);
+
+ buf_load_status(STATUS_NOTICE,
+ "Loading buffer pool(s) from %s", full_filename);
+
+ f = fopen(full_filename, "r");
+ if (f == NULL) {
+ buf_load_status(STATUS_ERR,
+ "Cannot open '%s' for reading: %s",
+ full_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ /* First scan the file to estimate how many entries are in it.
+ This file is tiny (approx 500KB per 1GB buffer pool), reading it
+ twice is fine. */
+ dump_n = 0;
+ while (fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no) == 2
+ && !SHUTTING_DOWN()) {
+ dump_n++;
+ }
+
+ if (!SHUTTING_DOWN() && !feof(f)) {
+ /* fscanf() returned != 2 */
+ const char* what;
+ if (ferror(f)) {
+ what = "reading";
+ } else {
+ what = "parsing";
+ }
+ fclose(f);
+ buf_load_status(STATUS_ERR, "Error %s '%s', "
+ "unable to load buffer pool (stage 1)",
+ what, full_filename);
+ return;
+ }
+
+ /* If the dump is larger than the buffer pool(s), then we ignore the
+ extra trailing entries. This could happen if a dump is made, then
+ the buffer pool is shrunk, and then the load is attempted. */
+ total_buffer_pools_pages = buf_pool_get_n_pages()
+ * srv_buf_pool_instances;
+ if (dump_n > total_buffer_pools_pages) {
+ dump_n = total_buffer_pools_pages;
+ }
+
+ dump = static_cast<buf_dump_t*>(ut_malloc(dump_n * sizeof(*dump)));
+
+ if (dump == NULL) {
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (dump_n * sizeof(*dump)),
+ strerror(errno));
+ return;
+ }
+
+ dump_tmp = static_cast<buf_dump_t*>(
+ ut_malloc(dump_n * sizeof(*dump_tmp)));
+
+ if (dump_tmp == NULL) {
+ ut_free(dump);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (dump_n * sizeof(*dump_tmp)),
+ strerror(errno));
+ return;
+ }
+
+ rewind(f);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+ fscanf_ret = fscanf(f, ULINTPF "," ULINTPF,
+ &space_id, &page_no);
+
+ if (fscanf_ret != 2) {
+ if (feof(f)) {
+ break;
+ }
+ /* else */
+
+ ut_free(dump);
+ ut_free(dump_tmp);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s', unable "
+ "to load buffer pool (stage 2)",
+ full_filename);
+ return;
+ }
+
+ if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+ ut_free(dump);
+ ut_free(dump_tmp);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s': bogus "
+ "space,page " ULINTPF "," ULINTPF
+ " at line " ULINTPF ", "
+ "unable to load buffer pool",
+ full_filename,
+ space_id, page_no,
+ i);
+ return;
+ }
+
+ dump[i] = BUF_DUMP_CREATE(space_id, page_no);
+ }
+
+ /* Set dump_n to the actual number of initialized elements,
+ i could be smaller than dump_n here if the file got truncated after
+ we read it the first time. */
+ dump_n = i;
+
+ fclose(f);
+
+ if (dump_n == 0) {
+ ut_free(dump);
+ ut_sprintf_timestamp(now);
+ buf_load_status(STATUS_NOTICE,
+ "Buffer pool(s) load completed at %s "
+ "(%s was empty)", now, full_filename);
+ return;
+ }
+
+ if (!SHUTTING_DOWN()) {
+ buf_dump_sort(dump, dump_tmp, 0, dump_n);
+ }
+
+ ut_free(dump_tmp);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+ buf_read_page_async(BUF_DUMP_SPACE(dump[i]),
+ BUF_DUMP_PAGE(dump[i]));
+
+ if (i % 64 == 63) {
+ os_aio_simulated_wake_handler_threads();
+ }
+
+ if (i % 128 == 0) {
+ buf_load_status(STATUS_INFO,
+ "Loaded " ULINTPF "/" ULINTPF " pages",
+ i + 1, dump_n);
+ }
+
+ if (buf_load_abort_flag) {
+ buf_load_abort_flag = FALSE;
+ ut_free(dump);
+ buf_load_status(
+ STATUS_NOTICE,
+ "Buffer pool(s) load aborted on request");
+ return;
+ }
+ }
+
+ ut_free(dump);
+
+ ut_sprintf_timestamp(now);
+
+ buf_load_status(STATUS_NOTICE,
+ "Buffer pool(s) load completed at %s", now);
+}
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because all of MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort()
+/*============*/
+{
+ buf_load_abort_flag = TRUE;
+}
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and when woken up either performs a dump or a load, then sleeps
+again.
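+
+A minimal sketch of the wait loop below (the buf_dump()/buf_load()
+calls and the shutdown-time dump are elided):
+
+ while (!SHUTTING_DOWN()) {
+ os_event_wait(srv_buf_dump_event);
+ if (buf_dump_should_start) { /* dump */ }
+ if (buf_load_should_start) { /* load */ }
+ os_event_reset(srv_buf_dump_event);
+ }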
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ ut_ad(!srv_read_only_mode);
+
+ srv_buf_dump_thread_active = TRUE;
+
+ buf_dump_status(STATUS_INFO, "not started");
+ buf_load_status(STATUS_INFO, "not started");
+
+ if (srv_buffer_pool_load_at_startup) {
+ buf_load();
+ }
+
+ while (!SHUTTING_DOWN()) {
+
+ os_event_wait(srv_buf_dump_event);
+
+ if (buf_dump_should_start) {
+ buf_dump_should_start = FALSE;
+ buf_dump(TRUE /* quit on shutdown */);
+ }
+
+ if (buf_load_should_start) {
+ buf_load_should_start = FALSE;
+ buf_load();
+ }
+
+ os_event_reset(srv_buf_dump_event);
+ }
+
+ if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+ buf_dump(FALSE /* ignore shutdown flag,
+ keep going even if we are in a shutdown state */);
+ }
+
+ srv_buf_dump_thread_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc
new file mode 100644
index 00000000000..6db3c3e571c
--- /dev/null
+++ b/storage/xtradb/buf/buf0flu.cc
@@ -0,0 +1,3041 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "trx0sys.h"
+#include "srv0mon.h"
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/** Number of pages flushed through non flush_list flushes. */
+// static ulint buf_lru_flush_page_count = 0;
+
+/** Flag indicating if the page_cleaner is in active state. This flag
+is set to TRUE by the page_cleaner thread when it is spawned and is set
+back to FALSE at shutdown by the page_cleaner as well. Therefore no
+need to protect it by a mutex.
It is only ever read by the thread +doing the shutdown */ +UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE; + +/** Flag indicating if the lru_manager is in active state. */ +UNIV_INTERN bool buf_lru_manager_is_active = false; + +#ifdef UNIV_PFS_THREAD +UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key; +UNIV_INTERN mysql_pfs_key_t buf_lru_manager_thread_key; +#endif /* UNIV_PFS_THREAD */ + +/** If LRU list of a buf_pool is less than this size then LRU eviction +should not happen. This is because when we do LRU flushing we also put +the blocks on free list. If LRU list is very small then we can end up +in thrashing. */ +#define BUF_LRU_MIN_LEN 256 + +/* @} */ + +/** Handled page counters for a single flush */ +struct flush_counters_t { + ulint flushed; /*!< number of dirty pages flushed */ + ulint evicted; /*!< number of clean pages evicted, including + evicted uncompressed page images */ + ulint unzip_LRU_evicted;/*!< number of uncompressed page images + evicted */ +}; + +/******************************************************************//** +Increases flush_list size in bytes with zip_size for compressed page, +UNIV_PAGE_SIZE for uncompressed page in inline function */ +static inline +void +incr_flush_list_size_in_bytes( +/*==========================*/ + buf_block_t* block, /*!< in: control block */ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + ut_ad(buf_flush_list_mutex_own(buf_pool)); + ulint zip_size = page_zip_get_size(&block->page.zip); + buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE; + ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size); +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/******************************************************************//** +Validates the flush list. +@return TRUE if ok */ +static +ibool +buf_flush_validate_low( +/*===================*/ + buf_pool_t* buf_pool); /*!< in: Buffer pool instance */ + +/******************************************************************//** +Validates the flush list some of the time. +@return TRUE if ok or the check was skipped */ +static +ibool +buf_flush_validate_skip( +/*====================*/ + buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ +{ +/** Try buf_flush_validate_low() every this many times */ +# define BUF_FLUSH_VALIDATE_SKIP 23 + + /** The buf_flush_validate_low() call skip counter. + Use a signed type because of the race condition below. */ + static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly buf_flush_validate_low() + check in debug builds. */ + if (--buf_flush_validate_count > 0) { + return(TRUE); + } + + buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; + return(buf_flush_validate_low(buf_pool)); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/*******************************************************************//** +Sets hazard pointer during flush_list iteration. 
*/ +UNIV_INLINE +void +buf_flush_set_hp( +/*=============*/ + buf_pool_t* buf_pool,/*!< in/out: buffer pool instance */ + const buf_page_t* bpage) /*!< in: buffer control block */ +{ + ut_ad(buf_flush_list_mutex_own(buf_pool)); + ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL); + ut_ad(!bpage || buf_page_in_file(bpage) + || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); + ut_ad(!bpage || bpage->in_flush_list); + ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool); + + buf_pool->flush_list_hp = bpage; +} + +/*******************************************************************//** +Checks if the given block is the hazard pointer. +@return true if bpage is the hazard pointer */ +UNIV_INLINE +bool +buf_flush_is_hp( +/*============*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + const buf_page_t* bpage) /*!< in: buffer control block */ +{ + ut_ad(buf_flush_list_mutex_own(buf_pool)); + + return(buf_pool->flush_list_hp == bpage); +} + +/*******************************************************************//** +Whenever we move a block in the flush_list (either to remove it or to +relocate it) we check the hazard pointer set by some other thread +doing the flush list scan. If the hazard pointer is the same as the +one we are about to move then we set it to NULL to force a rescan +in the thread doing the batch. */ +UNIV_INLINE +void +buf_flush_update_hp( +/*================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_page_t* bpage) /*!< in: buffer control block */ +{ + ut_ad(buf_flush_list_mutex_own(buf_pool)); + + if (buf_flush_is_hp(buf_pool, bpage)) { + buf_flush_set_hp(buf_pool, NULL); + MONITOR_INC(MONITOR_FLUSH_HP_RESCAN); + } +} + +/******************************************************************//** +Inserts a block into the flush_rbt and returns a pointer to its +predecessor or NULL if there is no predecessor. The ordering is maintained +on the basis of the <oldest_modification, space, offset> key. +@return pointer to the predecessor or NULL if no predecessor. */ +static +buf_page_t* +buf_flush_insert_in_flush_rbt( +/*==========================*/ + buf_page_t* bpage) /*!< in: bpage to be inserted. */ +{ + const ib_rbt_node_t* c_node; + const ib_rbt_node_t* p_node; + buf_page_t* prev = NULL; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(buf_flush_list_mutex_own(buf_pool)); + + /* Insert this buffer into the rbt. */ + c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage); + ut_a(c_node != NULL); + + /* Get the predecessor. */ + p_node = rbt_prev(buf_pool->flush_rbt, c_node); + + if (p_node != NULL) { + buf_page_t** value; + value = rbt_value(buf_page_t*, p_node); + prev = *value; + ut_a(prev != NULL); + } + + return(prev); +} + +/*********************************************************//** +Deletes a bpage from the flush_rbt. */ +static +void +buf_flush_delete_from_flush_rbt( +/*============================*/ + buf_page_t* bpage) /*!< in: bpage to be removed. */ +{ +#ifdef UNIV_DEBUG + ibool ret = FALSE; +#endif /* UNIV_DEBUG */ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(buf_flush_list_mutex_own(buf_pool)); + +#ifdef UNIV_DEBUG + ret = +#endif /* UNIV_DEBUG */ + rbt_delete(buf_pool->flush_rbt, &bpage); + + ut_ad(ret); +} + +/*****************************************************************//** +Compare two modified blocks in the buffer pool. The key for comparison +is: +key = <oldest_modification, space, offset> +This comparison is used to maintain ordering of blocks in the +buf_pool->flush_rbt.
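To make the key ordering concrete, here is a minimal standalone sketch of an equivalent comparator; this is editorial, not from this commit, and std::tie does the lexicographic comparison.

	#include <cstdint>
	#include <tuple>

	struct page_key {
		std::uint64_t oldest_modification;  // LSN of first modification
		std::uint32_t space;                // tablespace id (tie-breaker)
		std::uint32_t offset;               // page number (tie-breaker)
	};

	// True if a orders before b: earlier modification first; space and
	// offset only make the key unique among equal LSNs.
	inline bool flush_order_less(const page_key& a, const page_key& b)
	{
		return std::tie(a.oldest_modification, a.space, a.offset)
		     < std::tie(b.oldest_modification, b.space, b.offset);
	}
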
+Note that for the purpose of flush_rbt, we only need to order blocks +on the oldest_modification. The other two fields are used to uniquely +identify the blocks. +@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */ +static +int +buf_flush_block_cmp( +/*================*/ + const void* p1, /*!< in: block1 */ + const void* p2) /*!< in: block2 */ +{ + int ret; + const buf_page_t* b1 = *(const buf_page_t**) p1; + const buf_page_t* b2 = *(const buf_page_t**) p2; +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(b1); +#endif /* UNIV_DEBUG */ + + ut_ad(b1 != NULL); + ut_ad(b2 != NULL); + + ut_ad(buf_flush_list_mutex_own(buf_pool)); + + ut_ad(b1->in_flush_list); + ut_ad(b2->in_flush_list); + + if (b2->oldest_modification > b1->oldest_modification) { + return(1); + } else if (b2->oldest_modification < b1->oldest_modification) { + return(-1); + } + + /* If oldest_modification is same then decide on the space. */ + ret = (int)(b2->space - b1->space); + + /* Or else decide ordering on the offset field. */ + return(ret ? ret : (int)(b2->offset - b1->offset)); +} + +/********************************************************************//** +Initialize the red-black tree to speed up insertions into the flush_list +during recovery process. Should be called at the start of recovery +process before any page has been read/written. */ +UNIV_INTERN +void +buf_flush_init_flush_rbt(void) +/*==========================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_flush_list_mutex_enter(buf_pool); + + /* Create red black tree for speedy insertions in flush list. */ + buf_pool->flush_rbt = rbt_create( + sizeof(buf_page_t*), buf_flush_block_cmp); + + buf_flush_list_mutex_exit(buf_pool); + } +} + +/********************************************************************//** +Frees up the red-black tree. */ +UNIV_INTERN +void +buf_flush_free_flush_rbt(void) +/*==========================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_flush_list_mutex_enter(buf_pool); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_low(buf_pool)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + rbt_free(buf_pool->flush_rbt); + buf_pool->flush_rbt = NULL; + + buf_flush_list_mutex_exit(buf_pool); + } +} + +/********************************************************************//** +Inserts a modified block into the flush list. */ +UNIV_INTERN +void +buf_flush_insert_into_flush_list( +/*=============================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + lsn_t lsn) /*!< in: oldest modification */ +{ + ut_ad(log_flush_order_mutex_own()); + ut_ad(mutex_own(&block->mutex)); + + buf_flush_list_mutex_enter(buf_pool); + + ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) + || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification + <= lsn)); + + /* If we are in the recovery then we need to update the flush + red-black tree as well. 
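Why a balanced tree pays off here can be sketched with standard containers; in this editorial sketch std::set stands in for ib_rbt_t and the helper name is hypothetical. Finding the insertion predecessor is O(log n) in the tree, versus O(n) when walking the list itself.

	#include <cstdint>
	#include <iterator>
	#include <optional>
	#include <set>

	// Insert `key` and report the entry it should be linked after in
	// the flush list, if any. O(log n) per insert.
	std::optional<std::uint64_t> insert_with_predecessor(
		std::set<std::uint64_t>& tree, std::uint64_t key)
	{
		auto it = tree.insert(key).first;  // position of the new entry
		if (it == tree.begin()) {
			return std::nullopt;       // smallest LSN: goes first
		}
		return *std::prev(it);             // the in-order predecessor
	}
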
*/ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_list_mutex_exit(buf_pool); + buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn); + return; + } + + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(!block->page.in_flush_list); + + ut_d(block->page.in_flush_list = TRUE); + block->page.oldest_modification = lsn; + UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + incr_flush_list_size_in_bytes(block, buf_pool); + +#ifdef UNIV_DEBUG_VALGRIND + { + ulint zip_size = buf_block_get_zip_size(block); + + if (zip_size) { + UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size); + } else { + UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE); + } + } +#endif /* UNIV_DEBUG_VALGRIND */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_skip(buf_pool)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + buf_flush_list_mutex_exit(buf_pool); +} + +/********************************************************************//** +Inserts a modified block into the flush list in the right sorted position. +This function is used by recovery, because there the modifications do not +necessarily come in the order of lsn's. */ +UNIV_INTERN +void +buf_flush_insert_sorted_into_flush_list( +/*====================================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + lsn_t lsn) /*!< in: oldest modification */ +{ + buf_page_t* prev_b; + buf_page_t* b; + + ut_ad(log_flush_order_mutex_own()); + ut_ad(mutex_own(&block->mutex)); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + buf_flush_list_mutex_enter(buf_pool); + + /* The field in_LRU_list is protected by buf_pool->LRU_list_mutex, + which we are not holding. However, while a block is in the flush + list, it is dirty and cannot be discarded, not from the + page_hash or from the LRU list. At most, the uncompressed + page frame of a compressed block may be discarded or created + (copying the block->page to or from a buf_page_t that is + dynamically allocated from buf_buddy_alloc()). Because those + transitions hold block->mutex and the flush list mutex (via + buf_flush_relocate_on_flush_list()), there is no possibility + of a race condition in the assertions below. */ + ut_ad(block->page.in_LRU_list); + ut_ad(block->page.in_page_hash); + /* buf_buddy_block_register() will take a block in the + BUF_BLOCK_MEMORY state, not a file page. */ + ut_ad(!block->page.in_zip_hash); + + ut_ad(!block->page.in_flush_list); + ut_d(block->page.in_flush_list = TRUE); + block->page.oldest_modification = lsn; + +#ifdef UNIV_DEBUG_VALGRIND + { + ulint zip_size = buf_block_get_zip_size(block); + + if (zip_size) { + UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size); + } else { + UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE); + } + } +#endif /* UNIV_DEBUG_VALGRIND */ + + prev_b = NULL; + + /* For the most part when this function is called the flush_rbt + should not be NULL. In a very rare boundary case it is possible + that the flush_rbt has already been freed by the recovery thread + before the last page was hooked up in the flush_list by the + io-handler thread. In that case we'll just do a simple + linear search in the else block. 
*/ + if (buf_pool->flush_rbt) { + + prev_b = buf_flush_insert_in_flush_rbt(&block->page); + + } else { + + b = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (b && b->oldest_modification + > block->page.oldest_modification) { + ut_ad(b->in_flush_list); + prev_b = b; + b = UT_LIST_GET_NEXT(list, b); + } + } + + if (prev_b == NULL) { + UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + } else { + UT_LIST_INSERT_AFTER(list, buf_pool->flush_list, + prev_b, &block->page); + } + + incr_flush_list_size_in_bytes(block, buf_pool); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_low(buf_pool)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + buf_flush_list_mutex_exit(buf_pool); +} + +/********************************************************************//** +Returns TRUE if the file page block is immediately suitable for replacement, +i.e., the transition FILE_PAGE => NOT_USED allowed. +@return TRUE if can replace immediately */ +UNIV_INTERN +ibool +buf_flush_ready_for_replace( +/*========================*/ + buf_page_t* bpage) /*!< in: buffer control block, must be + buf_page_in_file(bpage) and in the LRU list */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); +#endif /* UNIV_DEBUG */ + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(bpage->in_LRU_list); + + if (buf_page_in_file(bpage)) { + + return(bpage->oldest_modification == 0 + && bpage->buf_fix_count == 0 + && buf_page_get_io_fix(bpage) == BUF_IO_NONE); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: buffer block state %lu" + " in the LRU list!\n", + (ulong) buf_page_get_state(bpage)); + ut_print_buf(stderr, bpage, sizeof(buf_page_t)); + putc('\n', stderr); + + return(FALSE); +} + +/********************************************************************//** +Returns true if the block is modified and ready for flushing. +@return true if can flush immediately */ +UNIV_INTERN +bool +buf_flush_ready_for_flush( +/*======================*/ + buf_page_t* bpage, /*!< in: buffer control block, must be + buf_page_in_file(bpage) */ + buf_flush_t flush_type)/*!< in: type of flush */ +{ + ut_ad(flush_type < BUF_FLUSH_N_TYPES); + ut_ad(mutex_own(buf_page_get_mutex(bpage)) + || flush_type == BUF_FLUSH_LIST); + ut_a(buf_page_in_file(bpage) + || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH +#ifdef UNIV_DEBUG + && !mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex) +#endif + )); + + if (bpage->oldest_modification == 0 + || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) { + return(false); + } + + ut_ad(bpage->in_flush_list); + + switch (flush_type) { + case BUF_FLUSH_LIST: + return(buf_page_get_state(bpage) != BUF_BLOCK_REMOVE_HASH); + case BUF_FLUSH_LRU: + case BUF_FLUSH_SINGLE_PAGE: + return(true); + + case BUF_FLUSH_N_TYPES: + break; + } + + ut_error; + return(false); +} + +/********************************************************************//** +Remove a block from the flush list of modified blocks. 
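Before the removal code, the two readiness predicates above can be restated compactly; this is an editorial sketch over hypothetical plain-data stand-ins for the buf_page_t fields, not the real types.

	#include <cstdint>

	enum class io_fix { none, read, write };

	struct page_state {
		std::uint64_t oldest_modification;  // 0 means the page is clean
		std::uint32_t buf_fix_count;        // threads currently pinning it
		io_fix        fix;                  // pending I/O, if any
	};

	// Replaceable: clean, unpinned, no I/O -- nothing references the page.
	inline bool ready_for_replace(const page_state& p)
	{
		return p.oldest_modification == 0 && p.buf_fix_count == 0
		    && p.fix == io_fix::none;
	}

	// Flushable: dirty and no other I/O already in flight on the page.
	inline bool ready_for_flush(const page_state& p)
	{
		return p.oldest_modification != 0 && p.fix == io_fix::none;
	}
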
*/ +UNIV_INTERN +void +buf_flush_remove( +/*=============*/ + buf_page_t* bpage) /*!< in: pointer to the block in question */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ulint zip_size; + + ut_ad(mutex_own(buf_page_get_mutex(bpage))); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY + || mutex_own(&buf_pool->LRU_list_mutex)); +#endif + ut_ad(bpage->in_flush_list); + + buf_flush_list_mutex_enter(buf_pool); + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + /* Clean compressed pages should not be on the flush list */ + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + return; + case BUF_BLOCK_ZIP_DIRTY: + buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); + UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + buf_LRU_insert_zip_clean(bpage); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + break; + case BUF_BLOCK_FILE_PAGE: + UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + break; + } + + /* If the flush_rbt is active then delete from there as well. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_delete_from_flush_rbt(bpage); + } + + /* Must be done after we have removed it from the flush_rbt + because we assert on in_flush_list in comparison function. */ + ut_d(bpage->in_flush_list = FALSE); + + zip_size = page_zip_get_size(&bpage->zip); + buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE; + + bpage->oldest_modification = 0; + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_skip(buf_pool)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + buf_flush_update_hp(buf_pool, bpage); + buf_flush_list_mutex_exit(buf_pool); +} + +/*******************************************************************//** +Relocates a buffer control block on the flush_list. +Note that it is assumed that the contents of bpage have already been +copied to dpage. +IMPORTANT: When this function is called bpage and dpage are not +exact copies of each other. For example, they both will have different +::state. Also the ::list pointers in dpage may be stale. We need to +use the current list node (bpage) to do the list manipulation because +the list pointers could have changed between the time that we copied +the contents of bpage to the dpage and the flush list manipulation +below. */ +UNIV_INTERN +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage) /*!< in/out: destination block */ +{ + buf_page_t* prev; + buf_page_t* prev_b = NULL; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + /* Must reside in the same buffer pool. */ + ut_ad(buf_pool == buf_pool_from_bpage(dpage)); + + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + buf_flush_list_mutex_enter(buf_pool); + + ut_ad(bpage->in_flush_list); + ut_ad(dpage->in_flush_list); + + /* If recovery is active we must swap the control blocks in + the flush_rbt as well. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_flush_delete_from_flush_rbt(bpage); + prev_b = buf_flush_insert_in_flush_rbt(dpage); + } + + /* Must be done after we have removed it from the flush_rbt + because we assert on in_flush_list in comparison function. 
*/ + ut_d(bpage->in_flush_list = FALSE); + + prev = UT_LIST_GET_PREV(list, bpage); + UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + + if (prev) { + ut_ad(prev->in_flush_list); + UT_LIST_INSERT_AFTER( + list, + buf_pool->flush_list, + prev, dpage); + } else { + UT_LIST_ADD_FIRST( + list, + buf_pool->flush_list, + dpage); + } + + /* Just an extra check. Previous in flush_list + should be the same control block as in flush_rbt. */ + ut_a(!buf_pool->flush_rbt || prev_b == prev); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_flush_validate_low(buf_pool)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + buf_flush_update_hp(buf_pool, bpage); + buf_flush_list_mutex_exit(buf_pool); +} + +/********************************************************************//** +Updates the flush system data structures when a write is completed. */ +UNIV_INTERN +void +buf_flush_write_complete( +/*=====================*/ + buf_page_t* bpage) /*!< in: pointer to the block in question */ +{ + buf_flush_t flush_type = buf_page_get_flush_type(bpage); + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + mutex_enter(&buf_pool->flush_state_mutex); + + buf_flush_remove(bpage); + + buf_page_set_io_fix(bpage, BUF_IO_NONE); + + buf_pool->n_flush[flush_type]--; + + /* fprintf(stderr, "n pending flush %lu\n", + buf_pool->n_flush[flush_type]); */ + + if (buf_pool->n_flush[flush_type] == 0 + && buf_pool->init_flush[flush_type] == FALSE) { + + /* The running flush batch has ended */ + + os_event_set(buf_pool->no_flush[flush_type]); + } + + buf_dblwr_update(bpage, flush_type); + + mutex_exit(&buf_pool->flush_state_mutex); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Calculate the checksum of a page from compressed table and update the page. */ +UNIV_INTERN +void +buf_flush_update_zip_checksum( +/*==========================*/ + buf_frame_t* page, /*!< in/out: Page to update */ + ulint zip_size, /*!< in: Compressed page size */ + lsn_t lsn) /*!< in: Lsn to stamp on the page */ +{ + ut_a(zip_size > 0); + + ib_uint32_t checksum = static_cast<ib_uint32_t>( + page_zip_calc_checksum( + page, zip_size, + static_cast<srv_checksum_algorithm_t>( + srv_checksum_algorithm))); + + mach_write_to_8(page + FIL_PAGE_LSN, lsn); + memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); +} + +/********************************************************************//** +Initializes a page for writing to the tablespace. */ +UNIV_INTERN +void +buf_flush_init_for_writing( +/*=======================*/ + byte* page, /*!< in/out: page */ + void* page_zip_, /*!< in/out: compressed page, or NULL */ + lsn_t newest_lsn) /*!< in: newest modification lsn + to the page */ +{ + ib_uint32_t checksum = 0 /* silence bogus gcc warning */; + + ut_ad(page); + + if (page_zip_) { + page_zip_des_t* page_zip; + ulint zip_size; + + page_zip = static_cast<page_zip_des_t*>(page_zip_); + zip_size = page_zip_get_size(page_zip); + + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + + switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. 
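For orientation, the stamping performed by buf_flush_update_zip_checksum() can be pictured in isolation. In this editorial sketch, store_be() and stamp_page() are hypothetical helpers in the spirit of mach_write_to_8() and mach_write_to_4(); the offsets mirror the usual FIL_PAGE_LSN and FIL_PAGE_SPACE_OR_CHKSUM positions but are symbolic here.

	#include <cstddef>
	#include <cstdint>

	constexpr std::size_t PAGE_LSN_OFFSET      = 16;  // cf. FIL_PAGE_LSN
	constexpr std::size_t PAGE_CHECKSUM_OFFSET = 0;   // cf. FIL_PAGE_SPACE_OR_CHKSUM

	// Store the low n_bytes of v at p, most significant byte first.
	inline void store_be(std::uint8_t* p, std::uint64_t v, int n_bytes)
	{
		for (int i = n_bytes - 1; i >= 0; --i) {
			p[i] = static_cast<std::uint8_t>(v);
			v >>= 8;
		}
	}

	// Stamp the newest LSN and the page checksum into a page image.
	void stamp_page(std::uint8_t* page, std::uint64_t lsn,
			std::uint32_t checksum)
	{
		store_be(page + PAGE_LSN_OFFSET, lsn, 8);
		store_be(page + PAGE_CHECKSUM_OFFSET, checksum, 4);
	}
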
*/ + memcpy(page_zip->data, page, zip_size); + /* fall through */ + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + case FIL_PAGE_INDEX: + + buf_flush_update_zip_checksum( + page_zip->data, zip_size, newest_lsn); + + return; + } + + ut_print_timestamp(stderr); + fputs(" InnoDB: ERROR: The compressed page to be written" + " seems corrupt:", stderr); + ut_print_buf(stderr, page, zip_size); + fputs("\nInnoDB: Possibly older version of the page:", stderr); + ut_print_buf(stderr, page_zip->data, zip_size); + putc('\n', stderr); + ut_error; + } + + /* Write the newest modification lsn to the page header and trailer */ + mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); + + mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + newest_lsn); + + /* Store the new formula checksum */ + + switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + checksum = buf_calc_page_crc32(page); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); + break; + case SRV_CHECKSUM_ALGORITHM_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + checksum = (ib_uint32_t) buf_calc_page_new_checksum(page); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); + checksum = (ib_uint32_t) buf_calc_page_old_checksum(page); + break; + case SRV_CHECKSUM_ALGORITHM_NONE: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + checksum = BUF_NO_CHECKSUM_MAGIC; + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); + break; + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } + + /* With the InnoDB checksum, we overwrite the first 4 bytes of + the end lsn field to store the old formula checksum. Since it + depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to + be calculated after storing the new formula checksum. + + In other cases we write the same value to both fields. + If CRC32 is used then it is faster to use that checksum + (calculated above) instead of calculating another one. + We can afford to store something other than + buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in + this field because the file will not be readable by old + versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */ + + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + checksum); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Does an asynchronous write of a buffer page. NOTE: in simulated aio and +also when the doublewrite buffer is used, we must call +buf_dblwr_flush_buffered_writes after we have posted a batch of +writes! */ +static +void +buf_flush_write_block_low( +/*======================*/ + buf_page_t* bpage, /*!< in: buffer block to write */ + buf_flush_t flush_type, /*!< in: type of flush */ + bool sync) /*!< in: true if sync IO request */ +{ + ulint zip_size = buf_page_get_zip_size(bpage); + page_t* frame = NULL; + +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); +#endif + +#ifdef UNIV_LOG_DEBUG + static ibool univ_log_debug_warned; +#endif /* UNIV_LOG_DEBUG */ + + ut_ad(buf_page_in_file(bpage)); + + /* We are not holding block_mutex here. + Nevertheless, it is safe to access bpage, because it is + io_fixed and oldest_modification != 0. Thus, it cannot be + relocated in the buffer pool or removed from flush_list or + LRU_list. 
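The checksum placement rules of buf_flush_init_for_writing() above can be restated over a plain enum. This is an editorial sketch with hypothetical names, and the magic constant is only a stand-in for BUF_NO_CHECKSUM_MAGIC: with CRC32 the same value lands in both header and trailer, with the InnoDB algorithm the header gets the new formula and the trailer the old one, and with none both get the magic.

	#include <cstdint>

	enum class checksum_algo { crc32, innodb, none };

	struct page_checksums {
		std::uint32_t header;   // FIL_PAGE_SPACE_OR_CHKSUM
		std::uint32_t trailer;  // first 4 bytes of the end-LSN field
	};

	constexpr std::uint32_t NO_CHECKSUM_MAGIC = 0xDEADBEEFUL;  // stand-in

	page_checksums pick_checksums(checksum_algo a,
				      std::uint32_t crc32_of_page,
				      std::uint32_t new_formula,
				      std::uint32_t old_formula)
	{
		switch (a) {
		case checksum_algo::crc32:
			return {crc32_of_page, crc32_of_page};
		case checksum_algo::innodb:
			return {new_formula, old_formula};
		case checksum_algo::none:
			return {NO_CHECKSUM_MAGIC, NO_CHECKSUM_MAGIC};
		}
		return {0, 0};  // unreachable; silences -Wreturn-type
	}
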
*/ + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + ut_ad(!mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE); + ut_ad(bpage->oldest_modification != 0); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0); +#endif + ut_ad(bpage->newest_modification != 0); + +#ifdef UNIV_LOG_DEBUG + if (!univ_log_debug_warned) { + univ_log_debug_warned = TRUE; + fputs("Warning: cannot force log to disk if" + " UNIV_LOG_DEBUG is defined!\n" + "Crash recovery will not work!\n", + stderr); + } +#else + /* Force the log to the disk before writing the modified block */ + log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE); +#endif + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */ + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + case BUF_BLOCK_ZIP_DIRTY: + frame = bpage->zip.data; + + ut_a(page_zip_verify_checksum(frame, zip_size)); + + mach_write_to_8(frame + FIL_PAGE_LSN, + bpage->newest_modification); + memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8); + break; + case BUF_BLOCK_FILE_PAGE: + frame = bpage->zip.data; + if (!frame) { + frame = ((buf_block_t*) bpage)->frame; + } + + buf_flush_init_for_writing(((buf_block_t*) bpage)->frame, + bpage->zip.data + ? &bpage->zip : NULL, + bpage->newest_modification); + break; + } + + if (!srv_use_doublewrite_buf || !buf_dblwr) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + sync, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage, sync); + } else { + ut_ad(!sync); + buf_dblwr_add_to_batch(bpage); + } + + /* When doing single page flushing the IO is done synchronously + and we flush the changes to disk only for the tablespace we + are working on. */ + if (sync) { + ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE); + fil_flush(buf_page_get_space(bpage)); + buf_page_io_complete(bpage); + } + + /* Increment the counter of I/O operations used + for selecting LRU policy. */ + buf_LRU_stat_inc_io(); +} + +/********************************************************************//** +Writes a flushable page asynchronously from the buffer pool to a file. +NOTE: in simulated aio we must call +os_aio_simulated_wake_handler_threads after we have posted a batch of +writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this +function, and it will be released by this function if it returns true. +LRU_list_mutex must be held iff performing a single page flush and will be +released by the function if it returns true. +@return TRUE if the page was flushed */ +UNIV_INTERN +bool +buf_flush_page( +/*===========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_page_t* bpage, /*!< in: buffer control block */ + buf_flush_t flush_type, /*!< in: type of flush */ + bool sync) /*!< in: true if sync IO request */ +{ + ut_ad(flush_type < BUF_FLUSH_N_TYPES); + /* Hold the LRU list mutex iff called for a single page LRU + flush. A single page LRU flush is already non-performant, and holding + the LRU list mutex allows us to avoid having to store the previous LRU + list page or to restart the LRU scan in + buf_flush_single_page_from_LRU(). 
*/ + ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE || + !mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(flush_type != BUF_FLUSH_SINGLE_PAGE || + mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(buf_page_in_file(bpage)); + ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE); + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + ut_ad(mutex_own(block_mutex)); + + ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); + + bool is_uncompressed; + + is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex)); + + ibool flush; + rw_lock_t* rw_lock; + bool no_fix_count = bpage->buf_fix_count == 0; + + if (!is_uncompressed) { + flush = TRUE; + rw_lock = NULL; + + } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)) { + /* This is a heuristic, to avoid expensive S attempts. */ + flush = FALSE; + } else { + + rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock; + + if (flush_type != BUF_FLUSH_LIST) { + flush = rw_lock_s_lock_gen_nowait( + rw_lock, BUF_IO_WRITE); + } else { + /* Will S lock later */ + flush = TRUE; + } + } + + if (flush) { + + /* We are committed to flushing by the time we get here */ + + mutex_enter(&buf_pool->flush_state_mutex); + + buf_page_set_io_fix(bpage, BUF_IO_WRITE); + + buf_page_set_flush_type(bpage, flush_type); + + if (buf_pool->n_flush[flush_type] == 0) { + + os_event_reset(buf_pool->no_flush[flush_type]); + } + + ++buf_pool->n_flush[flush_type]; + + mutex_exit(&buf_pool->flush_state_mutex); + + mutex_exit(block_mutex); + + if (flush_type == BUF_FLUSH_SINGLE_PAGE) + mutex_exit(&buf_pool->LRU_list_mutex); + + if (flush_type == BUF_FLUSH_LIST + && is_uncompressed + && !rw_lock_s_lock_gen_nowait(rw_lock, BUF_IO_WRITE)) { + /* avoiding deadlock possibility involves doublewrite + buffer, should flush it, because it might hold the + another block->lock. */ + buf_dblwr_flush_buffered_writes(); + + rw_lock_s_lock_gen(rw_lock, BUF_IO_WRITE); + } + + /* Even though bpage is not protected by any mutex at this + point, it is safe to access bpage, because it is io_fixed and + oldest_modification != 0. Thus, it cannot be relocated in the + buffer pool or removed from flush_list or LRU_list. */ + + buf_flush_write_block_low(bpage, flush_type, sync); + } + + return(flush); +} + +# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/********************************************************************//** +Writes a flushable page asynchronously from the buffer pool to a file. +NOTE: block and LRU list mutexes must be held upon entering this function, and +they will be released by this function after flushing. This is loosely based on +buf_flush_batch() and buf_flush_page(). +@return TRUE if the page was flushed and the mutexes released */ +UNIV_INTERN +ibool +buf_flush_page_try( +/*===============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_block_t* block) /*!< in/out: buffer control block */ +{ + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) { + return(FALSE); + } + + /* The following call will release the LRU list and + block mutex if successful. */ + return(buf_flush_page( + buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE, true)); +} +# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +/***********************************************************//** +Check the page is in buffer pool and can be flushed. 
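The try-then-flush-doublewrite latching pattern used in buf_flush_page() above can be sketched with standard primitives; in this editorial sketch std::shared_mutex stands in for the block rw-lock and flush_doublewrite_buffer() is a hypothetical stub.

	#include <shared_mutex>

	void flush_doublewrite_buffer() {}  // hypothetical stub

	void acquire_s_latch_for_flush(std::shared_mutex& page_latch)
	{
		if (page_latch.try_lock_shared()) {
			return;             // fast path: no contention
		}
		// The current holder may be stuck behind buffered doublewrite
		// writes; flush those first, then it is safe to block.
		flush_doublewrite_buffer();
		page_latch.lock_shared();
	}
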
+@return true if the page can be flushed. */ +static +bool +buf_flush_check_neighbor( +/*=====================*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page offset */ + buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST */ +{ + buf_page_t* bpage; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + bool ret; + prio_rw_lock_t* hash_lock; + ib_mutex_t* block_mutex; + + ut_ad(flush_type == BUF_FLUSH_LRU + || flush_type == BUF_FLUSH_LIST); + + /* We only want to flush pages from this buffer pool. */ + bpage = buf_page_hash_get_s_locked(buf_pool, space, offset, + &hash_lock); + + if (!bpage) { + + return(false); + } + + block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + rw_lock_s_unlock(hash_lock); + + ut_a(buf_page_in_file(bpage)); + + /* We avoid flushing 'non-old' blocks in an LRU flush, + because the flushed blocks are soon freed */ + + ret = false; + if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) { + + if (buf_flush_ready_for_flush(bpage, flush_type)) { + ret = true; + } + } + + mutex_exit(block_mutex); + + return(ret); +} + +/***********************************************************//** +Flushes to disk all flushable pages within the flush area. +@return number of pages flushed */ +static +ulint +buf_flush_try_neighbors( +/*====================*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page offset */ + buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST */ + ulint n_flushed, /*!< in: number of pages + flushed so far in this batch */ + ulint n_to_flush) /*!< in: maximum number of pages + we are allowed to flush */ +{ + ulint i; + ulint low; + ulint high; + buf_pool_t* buf_pool = buf_pool_get(space, offset); + + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN + || srv_flush_neighbors == 0) { + /* If there is little space or neighbor flushing is + not enabled then just flush the victim. */ + low = offset; + high = offset + 1; + } else { + /* When flushed, dirty blocks are searched in + neighborhoods of this size, and flushed along with the + original page. */ + + ulint buf_flush_area; + + buf_flush_area = ut_min( + BUF_READ_AHEAD_AREA(buf_pool), + buf_pool->curr_size / 16); + + low = (offset / buf_flush_area) * buf_flush_area; + high = (offset / buf_flush_area + 1) * buf_flush_area; + + if (srv_flush_neighbors == 1) { + /* adjust 'low' and 'high' to limit + for contiguous dirty area */ + if (offset > low) { + for (i = offset - 1; + i >= low + && buf_flush_check_neighbor( + space, i, flush_type); + i--) { + /* do nothing */ + } + low = i + 1; + } + + for (i = offset + 1; + i < high + && buf_flush_check_neighbor( + space, i, flush_type); + i++) { + /* do nothing */ + } + high = i; + } + } + + /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */ + + if (high > fil_space_get_size(space)) { + high = fil_space_get_size(space); + } + + ulint count = 0; + + for (i = low; i < high; i++) { + + prio_rw_lock_t* hash_lock; + ib_mutex_t* block_mutex; + + if ((count + n_flushed) >= n_to_flush) { + + /* We have already flushed enough pages and + should call it a day. There is, however, one + exception. If the page whose neighbors we + are flushing has not been flushed yet then + we'll try to flush the victim that we + selected originally. 
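The window arithmetic above is worth a worked example; this editorial sketch uses arbitrary numbers.

	#include <algorithm>
	#include <cstdio>

	int main()
	{
		unsigned area       = 64;   // flush window size, power of two
		unsigned offset     = 200;  // the victim page number
		unsigned space_size = 230;  // pages in the tablespace

		unsigned low  = (offset / area) * area;                // 192
		unsigned high = std::min((offset / area + 1) * area,
					 space_size);                  // 230

		std::printf("flush window: [%u, %u)\n", low, high);
		return 0;
	}
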
*/ + if (i <= offset) { + i = offset; + } else { + break; + } + } + + buf_pool = buf_pool_get(space, i); + + /* We only want to flush pages from this buffer pool. */ + buf_page_t* bpage = buf_page_hash_get_s_locked(buf_pool, + space, i, &hash_lock); + + if (bpage == NULL) { + + continue; + } + + block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + rw_lock_s_unlock(hash_lock); + + ut_a(buf_page_in_file(bpage)); + + /* We avoid flushing 'non-old' blocks in an LRU flush, + because the flushed blocks are soon freed */ + + if (flush_type != BUF_FLUSH_LRU + || i == offset + || buf_page_is_old(bpage)) { + + if (buf_flush_ready_for_flush(bpage, flush_type) + && (i == offset || bpage->buf_fix_count == 0) + && buf_flush_page( + buf_pool, bpage, flush_type, false)) { + + ++count; + + continue; + } + } + + mutex_exit(block_mutex); + } + + if (count > 0) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, + (count - 1)); + } + + return(count); +} + +/********************************************************************//** +Check if the block is modified and ready for flushing. If the block +is ready to flush then flush the page and try to flush its neighbors. + +@return TRUE if, depending on the flush type, either the LRU or the flush +list mutex was released during this function. This does not guarantee that +any pages were actually written. +The number of pages written is added to *count. */ +static +ibool +buf_flush_page_and_try_neighbors( +/*=============================*/ + buf_page_t* bpage, /*!< in: buffer control block, + must be + buf_page_in_file(bpage) */ + buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ + ulint n_to_flush, /*!< in: number of pages to + flush */ + ulint* count) /*!< in/out: number of pages + flushed */ +{ + ibool flushed; + ib_mutex_t* block_mutex = NULL; +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); +#endif /* UNIV_DEBUG */ + + ut_ad((flush_type == BUF_FLUSH_LRU + && mutex_own(&buf_pool->LRU_list_mutex)) + || (flush_type == BUF_FLUSH_LIST + && buf_flush_list_mutex_own(buf_pool))); + + if (flush_type == BUF_FLUSH_LRU) { + block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); + } + + ut_a(buf_page_in_file(bpage) + || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH +#ifdef UNIV_DEBUG + && !mutex_own(&buf_pool->LRU_list_mutex) +#endif + )); + + if (buf_flush_ready_for_flush(bpage, flush_type)) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_bpage(bpage); + + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&buf_pool->LRU_list_mutex); + } + + /* These fields are protected by the buf_page_get_mutex() + mutex. */ + /* Read the fields directly in order to avoid asserting on + BUF_BLOCK_REMOVE_HASH pages.
*/ + ulint space = bpage->space; + ulint offset = bpage->offset; + + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(block_mutex); + } else { + buf_flush_list_mutex_exit(buf_pool); + } + + /* Try to flush also all the neighbors */ + *count += buf_flush_try_neighbors( + space, offset, flush_type, *count, n_to_flush); + + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&buf_pool->LRU_list_mutex); + } else { + buf_flush_list_mutex_enter(buf_pool); + } + flushed = TRUE; + + } else if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(block_mutex); + flushed = FALSE; + } else { + flushed = FALSE; + } + + ut_ad((flush_type == BUF_FLUSH_LRU + && mutex_own(&buf_pool->LRU_list_mutex)) + || (flush_type == BUF_FLUSH_LIST + && buf_flush_list_mutex_own(buf_pool))); + + return(flushed); +} + +/*******************************************************************//** +This utility moves the uncompressed frames of pages to the free list. +Note that this function does not actually flush any data to disk. It +just detaches the uncompressed frames from the compressed pages at the +tail of the unzip_LRU and puts those freed frames in the free list. +Note that it is a best effort attempt and it is not guaranteed that +after a call to this function there will be 'max' blocks in the free +list. +@return number of blocks moved to the free list. */ +static +ulint +buf_free_from_unzip_LRU_list_batch( +/*===============================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint max) /*!< in: desired number of + blocks in the free_list */ +{ + buf_block_t* block; + ulint scanned = 0; + ulint count = 0; + ulint free_len = UT_LIST_GET_LEN(buf_pool->free); + ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); + while (block != NULL && count < max + && free_len < srv_LRU_scan_depth + && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) { + + ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page); + + ++scanned; + + mutex_enter(block_mutex); + + if (buf_LRU_free_page(&block->page, false)) { + + mutex_exit(block_mutex); + /* Block was freed. LRU list mutex potentially + released and reacquired */ + ++count; + mutex_enter(&buf_pool->LRU_list_mutex); + block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); + + } else { + + mutex_exit(block_mutex); + block = UT_LIST_GET_PREV(unzip_LRU, block); + } + + free_len = UT_LIST_GET_LEN(buf_pool->free); + lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); + } + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); + } + + return(count); +} + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list. +The calling thread is not allowed to own any latches on pages! +It attempts to make 'max' blocks available in the free list. Note that +it is a best effort attempt and it is not guaranteed that after a call +to this function there will be 'max' blocks in the free list. +@return number of blocks for which the write request was queued. 
*/ +__attribute__((nonnull)) +static +void +buf_flush_LRU_list_batch( +/*=====================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint max, /*!< in: desired number of + blocks in the free_list */ + bool limited_scan, /*!< in: if true, allow to scan only up + to srv_LRU_scan_depth pages in total */ + flush_counters_t* n) /*!< out: flushed/evicted page + counts */ +{ + buf_page_t* bpage; + ulint scanned = 0; + ulint lru_position = 0; + ulint max_lru_position; + ulint max_scanned_pages; + ulint free_len = UT_LIST_GET_LEN(buf_pool->free); + ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU); + + n->flushed = 0; + n->evicted = 0; + n->unzip_LRU_evicted = 0; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + max_scanned_pages = limited_scan ? srv_LRU_scan_depth : lru_len * max; + max_lru_position = ut_min(srv_LRU_scan_depth, lru_len); + + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + while (bpage != NULL + && (srv_cleaner_eviction_factor ? n->evicted : n->flushed) < max + && free_len < srv_LRU_scan_depth + && lru_len > BUF_LRU_MIN_LEN + && lru_position < max_lru_position + && scanned < max_scanned_pages) { + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + ibool evict; + ulint failed_acquire; + + ++scanned; + ++lru_position; + + failed_acquire = mutex_enter_nowait(block_mutex); + + evict = UNIV_LIKELY(!failed_acquire) + && buf_flush_ready_for_replace(bpage); + + if (UNIV_LIKELY(!failed_acquire) && !evict) { + + mutex_exit(block_mutex); + } + + /* If the block is ready to be replaced we try to + free it i.e.: put it on the free list. + Otherwise we try to flush the block and its + neighbors. In this case we'll put it on the + free list in the next pass. We do this extra work + of putting blocks to the free list instead of + just flushing them because after every flush + we have to restart the scan from the tail of + the LRU list and if we don't clear the tail + of the flushed pages then the scan becomes + O(n*n). */ + if (evict) { + + if (buf_LRU_free_page(bpage, true)) { + + mutex_exit(block_mutex); + n->evicted++; + lru_position = 0; + mutex_enter(&buf_pool->LRU_list_mutex); + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + } else { + + bpage = UT_LIST_GET_PREV(LRU, bpage); + mutex_exit(block_mutex); + } + } else if (UNIV_LIKELY(!failed_acquire)) { + + ulint space; + ulint offset; + buf_page_t* prev_bpage; + + prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + + /* Save the previous bpage */ + + if (prev_bpage != NULL) { + space = prev_bpage->space; + offset = prev_bpage->offset; + } else { + space = ULINT_UNDEFINED; + offset = ULINT_UNDEFINED; + } + + if (buf_flush_page_and_try_neighbors( + bpage, + BUF_FLUSH_LRU, max, &n->flushed)) { + + /* LRU list mutex was released. + reposition the iterator. Note: the + prev block could have been repositioned + too but that should be rare. */ + + if (prev_bpage != NULL) { + + ut_ad(space != ULINT_UNDEFINED); + ut_ad(offset != ULINT_UNDEFINED); + + prev_bpage = buf_page_hash_get( + buf_pool, space, offset); + } + } + + bpage = prev_bpage; + } + + free_len = UT_LIST_GET_LEN(buf_pool->free); + lru_len = UT_LIST_GET_LEN(buf_pool->LRU); + } + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + /* We keep track of all flushes happening as part of LRU + flush. When estimating the desired rate at which flush_list + should be flushed, we factor in this value. 
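The evict-or-flush decision described in the long comment above reduces to a small classification; this is an editorial sketch over hypothetical stand-in fields.

	enum class lru_action { evict, flush, skip };

	struct lru_page {
		bool clean;   // oldest_modification == 0
		bool pinned;  // buf_fix_count != 0 or I/O already in progress
	};

	// Clean, unpinned pages go straight to the free list; dirty
	// flushable pages are queued for write and picked up clean later.
	inline lru_action classify(const lru_page& p)
	{
		if (p.pinned) {
			return lru_action::skip;
		}
		return p.clean ? lru_action::evict : lru_action::flush;
	}
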
*/ + buf_pool->stat.buf_lru_flush_page_count += n->flushed; + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); + } +} + +/*******************************************************************//** +Flush and move pages from LRU or unzip_LRU list to the free list. +Whether LRU or unzip_LRU is used depends on the state of the system. +@return number of blocks for which either the write request was queued +or in case of unzip_LRU the number of blocks actually moved to the +free list */ +__attribute__((nonnull)) +static +void +buf_do_LRU_batch( +/*=============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint max, /*!< in: desired number of + blocks in the free_list */ + bool limited_scan, /*!< in: if true, allow to scan only up + to srv_LRU_scan_depth pages in total */ + flush_counters_t* n) /*!< out: flushed/evicted page + counts */ +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { + n->unzip_LRU_evicted + = buf_free_from_unzip_LRU_list_batch(buf_pool, max); + } else { + n->unzip_LRU_evicted = 0; + } + + if (max > n->unzip_LRU_evicted) { + buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, + limited_scan, n); + } else { + n->evicted = 0; + n->flushed = 0; + } + + n->evicted += n->unzip_LRU_evicted; +} + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the flush_list. +the calling thread is not allowed to own any latches on pages! +@return number of blocks for which the write request was queued; +ULINT_UNDEFINED if there was a flush of the same type already +running */ +static +ulint +buf_do_flush_list_batch( +/*====================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint min_n, /*!< in: wished minimum mumber + of blocks flushed (it is not + guaranteed that the actual + number is that big, though) */ + lsn_t lsn_limit) /*!< all blocks whose + oldest_modification is smaller + than this should be flushed (if + their number does not exceed + min_n) */ +{ + ulint count = 0; + ulint scanned = 0; + + /* Start from the end of the list looking for a suitable + block to be flushed. */ + buf_flush_list_mutex_enter(buf_pool); + ulint len = UT_LIST_GET_LEN(buf_pool->flush_list); + + /* In order not to degenerate this scan to O(n*n) we attempt + to preserve pointer of previous block in the flush list. To do + so we declare it a hazard pointer. Any thread working on the + flush list must check the hazard pointer and if it is removing + the same block then it must reset it. */ + for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + count < min_n && bpage != NULL && len > 0 + && bpage->oldest_modification < lsn_limit; + ++scanned) { + + buf_page_t* prev; + + ut_a(bpage->oldest_modification > 0); + ut_ad(bpage->in_flush_list); + + prev = UT_LIST_GET_PREV(list, bpage); + buf_flush_set_hp(buf_pool, prev); + +#ifdef UNIV_DEBUG + bool flushed = +#endif /* UNIV_DEBUG */ + buf_flush_page_and_try_neighbors( + bpage, BUF_FLUSH_LIST, min_n, &count); + + ut_ad(flushed || buf_flush_is_hp(buf_pool, prev)); + + if (!buf_flush_is_hp(buf_pool, prev)) { + /* The hazard pointer was reset by some other + thread. Restart the scan. 
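The protocol can be sketched end to end; this editorial sketch runs over std::list and is single-threaded purely to show the control flow, with the contract that whoever erases the published node must clear the pointer, as buf_flush_update_hp() does.

	#include <iterator>
	#include <list>

	template <typename Visit>
	void scan_with_hazard(std::list<int>& flush_list, const int*& hazard,
			      Visit visit)
	{
		auto it = flush_list.rbegin();
		while (it != flush_list.rend()) {
			auto next = std::next(it);  // node to visit next
			bool published = (next != flush_list.rend());
			hazard = published ? &*next : nullptr;
			visit(*it);   // may erase nodes and clear `hazard`
			if (published && hazard == nullptr) {
				// Hazard was reset: restart from the tail.
				it = flush_list.rbegin();
			} else {
				it = next;        // safe: next was not erased
				hazard = nullptr; // done with this publication
			}
		}
	}
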
*/ + ut_ad(buf_flush_is_hp(buf_pool, NULL)); + bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + len = UT_LIST_GET_LEN(buf_pool->flush_list); + } else { + bpage = prev; + --len; + buf_flush_set_hp(buf_pool, NULL); + } + + ut_ad(!bpage || bpage->in_flush_list); + } + + buf_flush_list_mutex_exit(buf_pool); + + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + scanned); + + return(count); +} + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list or flush_list. +NOTE 1: in the case of an LRU flush the calling thread may own latches to +pages: to avoid deadlocks, this function must be written so that it cannot +end up waiting for these latches! NOTE 2: in the case of a flush list flush, +the calling thread is not allowed to own any latches on pages! +@return number of blocks for which the write request was queued */ +__attribute__((nonnull)) +static +void +buf_flush_batch( +/*============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST; if BUF_FLUSH_LIST, + then the caller must not own any + latches on pages */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST + all blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + bool limited_lru_scan,/*!< in: for LRU flushes, if true, + allow to scan only up to + srv_LRU_scan_depth pages in total */ + flush_counters_t* n) /*!< out: flushed/evicted page + counts */ +{ + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); +#ifdef UNIV_SYNC_DEBUG + ut_ad((flush_type != BUF_FLUSH_LIST) + || sync_thread_levels_empty_except_dict()); +#endif /* UNIV_SYNC_DEBUG */ + + /* Note: The buffer pool mutexes are released and reacquired within + the flush functions. */ + switch (flush_type) { + case BUF_FLUSH_LRU: + mutex_enter(&buf_pool->LRU_list_mutex); + buf_do_LRU_batch(buf_pool, min_n, limited_lru_scan, n); + mutex_exit(&buf_pool->LRU_list_mutex); + break; + case BUF_FLUSH_LIST: + ut_ad(!limited_lru_scan); + n->flushed = buf_do_flush_list_batch(buf_pool, min_n, + lsn_limit); + n->evicted = 0; + break; + default: + ut_error; + } + +#ifdef UNIV_DEBUG + if (buf_debug_prints && n->flushed > 0) { + fprintf(stderr, flush_type == BUF_FLUSH_LRU + ? "Flushed %lu pages in LRU flush\n" + : "Flushed %lu pages in flush list flush\n", + (ulong) n->flushed); + } +#endif /* UNIV_DEBUG */ +} + +/******************************************************************//** +Gather the aggregated stats for both flush list and LRU list flushing */ +static +void +buf_flush_common( +/*=============*/ + buf_flush_t flush_type, /*!< in: type of flush */ + ulint page_count) /*!< in: number of pages flushed */ +{ + if (page_count) { + buf_dblwr_flush_buffered_writes(); + } + + ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + +#ifdef UNIV_DEBUG + if (buf_debug_prints && page_count > 0) { + fprintf(stderr, flush_type == BUF_FLUSH_LRU + ? 
"Flushed %lu pages in LRU flush\n" + : "Flushed %lu pages in flush list flush\n", + (ulong) page_count); + } +#endif /* UNIV_DEBUG */ + + srv_stats.buf_pool_flushed.add(page_count); +} + +/******************************************************************//** +Start a buffer flush batch for LRU or flush list */ +static +ibool +buf_flush_start( +/*============*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +{ + mutex_enter(&buf_pool->flush_state_mutex); + + if (buf_pool->n_flush[flush_type] > 0 + || buf_pool->init_flush[flush_type] == TRUE) { + + /* There is already a flush batch of the same type running */ + + mutex_exit(&buf_pool->flush_state_mutex); + + return(FALSE); + } + + buf_pool->init_flush[flush_type] = TRUE; + + mutex_exit(&buf_pool->flush_state_mutex); + + return(TRUE); +} + +/******************************************************************//** +End a buffer flush batch for LRU or flush list */ +static +void +buf_flush_end( +/*==========*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +{ + mutex_enter(&buf_pool->flush_state_mutex); + + buf_pool->init_flush[flush_type] = FALSE; + + buf_pool->try_LRU_scan = TRUE; + + if (buf_pool->n_flush[flush_type] == 0) { + + /* The running flush batch has ended */ + + os_event_set(buf_pool->no_flush[flush_type]); + } + + mutex_exit(&buf_pool->flush_state_mutex); +} + +/******************************************************************//** +Waits until a flush batch of the given type ends */ +UNIV_INTERN +void +buf_flush_wait_batch_end( +/*=====================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t type) /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +{ + ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); + + if (buf_pool == NULL) { + ulint i; + + for (i = 0; i < srv_buf_pool_instances; ++i) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + thd_wait_begin(NULL, THD_WAIT_DISKIO); + os_event_wait(buf_pool->no_flush[type]); + thd_wait_end(NULL); + } + } else { + thd_wait_begin(NULL, THD_WAIT_DISKIO); + os_event_wait(buf_pool->no_flush[type]); + thd_wait_end(NULL); + } +} + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list and also +puts replaceable clean pages from the end of the LRU list to the free +list. +NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully. false if another batch +of same type was already running. 
*/ +__attribute__((nonnull)) +static +bool +buf_flush_LRU( +/*==========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint min_n, /*!< in: wished minimum number of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + bool limited_scan, /*!< in: if true, allow to scan + only up to srv_LRU_scan_depth + pages in total */ + flush_counters_t *n) /*!< out: flushed/evicted page + counts */ +{ + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + n->flushed = 0; + n->evicted = 0; + n->unzip_LRU_evicted = 0; + return(false); + } + + buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, limited_scan, n); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, n->flushed); + + return(true); +} + +/*******************************************************************//** +This utility flushes dirty blocks from the end of the flush list of +all buffer pool instances. +NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully for each buffer pool +instance. false if another batch of the same type was already running in +at least one of the buffer pool instances */ +UNIV_INTERN +bool +buf_flush_list( +/*===========*/ + ulint min_n, /*!< in: wished minimum number of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST + all blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to the caller. Ignored if NULL */ + +{ + ulint i; + + ulint requested_pages[MAX_BUFFER_POOLS]; + bool active_instance[MAX_BUFFER_POOLS]; + ulint remaining_instances = srv_buf_pool_instances; + bool timeout = false; + ulint flush_start_time = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + requested_pages[i] = 0; + active_instance[i] = true; + } + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here.
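The even split mentioned here is a ceiling division; as a compile-time check of the arithmetic used on the next line (editorial sketch):

	constexpr unsigned long per_instance(unsigned long min_n,
					     unsigned long pools)
	{
		return (min_n + pools - 1) / pools;  // round up, never undershoot
	}

	static_assert(per_instance(1000, 8) == 125, "divides evenly");
	static_assert(per_instance(1001, 8) == 126, "remainder rounds up");
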
*/ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + if (lsn_limit != LSN_MAX) { + flush_start_time = ut_time_ms(); + } + } + + /* Flush to lsn_limit in all buffer pool instances */ + while (remaining_instances && !timeout) { + + ulint flush_common_batch = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + + if (flush_start_time + && (ut_time_ms() - flush_start_time + >= srv_cleaner_max_flush_time)) { + + timeout = true; + break; + } + + if (active_instance[i]) { + + buf_pool_t* buf_pool; + ulint chunk_size; + flush_counters_t n; + + chunk_size = ut_min( + srv_cleaner_flush_chunk_size, + min_n - requested_pages[i]); + + buf_pool = buf_pool_from_array(i); + + if (!buf_flush_start(buf_pool, + BUF_FLUSH_LIST)) { + + continue; + } + + buf_flush_batch(buf_pool, BUF_FLUSH_LIST, + chunk_size, lsn_limit, false, + &n); + + buf_flush_end(buf_pool, BUF_FLUSH_LIST); + + flush_common_batch += n.flushed; + + if (n_processed) { + *n_processed += n.flushed; + } + + requested_pages[i] += chunk_size; + + if (requested_pages[i] >= min_n + || !n.flushed) { + + active_instance[i] = false; + remaining_instances--; + } + + if (n.flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + n.flushed); + } + } + } + + buf_flush_common(BUF_FLUSH_LIST, flush_common_batch); + } + + /* If we haven't flushed all the instances due to timeout or a repeat + failure to start a flush, return failure */ + for (i = 0; i < srv_buf_pool_instances; i++) { + if (active_instance[i]) { + return(false); + } + } + + return(true); +} + +/******************************************************************//** +This function picks up a single dirty page from the tail of the LRU +list, flushes it, removes it from page_hash and LRU list and puts +it on the free list. It is called from user threads when they are +unable to find a replaceable page at the tail of the LRU list i.e.: +when the background LRU flushing in the page_cleaner thread is not +fast enough to keep pace with the workload. +@return TRUE if success. */ +UNIV_INTERN +ibool +buf_flush_single_page_from_LRU( +/*===========================*/ + buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */ +{ + ulint scanned; + buf_page_t* bpage; + ibool flushed = FALSE; + + mutex_enter(&buf_pool->LRU_list_mutex); + + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1; + bpage != NULL; + bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) { + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) { + + /* The following call will release the LRU list + and block mutex. */ + + flushed = buf_flush_page(buf_pool, bpage, + BUF_FLUSH_SINGLE_PAGE, true); + + if (flushed) { + /* buf_flush_page() will release the + block mutex */ + break; + } + } + + mutex_exit(block_mutex); + } + + if (!flushed) + mutex_exit(&buf_pool->LRU_list_mutex); + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_SINGLE_FLUSH_SCANNED, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, + scanned); + + if (bpage == NULL) { + /* Can't find a single flushable page. */ + return(FALSE); + } + + + ibool freed = FALSE; + + /* At this point the page has been written to the disk. + As we are not holding LRU list or buf_page_get_mutex() mutex therefore + we cannot use the bpage safely. 
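+	(buf_flush_page() released both the LRU list mutex and the
+	block mutex on our behalf when it completed the synchronous
+	write.)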
It may have been plucked out + of the LRU list by some other thread or it may even have + relocated in case of a compressed page. We need to start + the scan of LRU list again to remove the block from the LRU + list and put it on the free list. */ + mutex_enter(&buf_pool->LRU_list_mutex); + + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); + bpage != NULL; + bpage = UT_LIST_GET_PREV(LRU, bpage)) { + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + ibool ready = buf_flush_ready_for_replace(bpage); + + if (ready) { + bool evict_zip; + + evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);; + + freed = buf_LRU_free_page(bpage, evict_zip); + + mutex_exit(block_mutex); + + break; + } + + mutex_exit(block_mutex); + + } + + if (!freed) + mutex_exit(&buf_pool->LRU_list_mutex); + + return(freed); +} + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_flush_LRU_tail(void) +/*====================*/ +{ + ulint total_flushed = 0; + ulint start_time = ut_time_ms(); + ulint scan_depth[MAX_BUFFER_POOLS]; + ulint requested_pages[MAX_BUFFER_POOLS]; + bool active_instance[MAX_BUFFER_POOLS]; + bool limited_scan[MAX_BUFFER_POOLS]; + ulint previous_evicted[MAX_BUFFER_POOLS]; + ulint remaining_instances = srv_buf_pool_instances; + ulint lru_chunk_size = srv_cleaner_lru_chunk_size; + ulint free_list_lwm = srv_LRU_scan_depth / 100 + * srv_cleaner_free_list_lwm; + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + + const buf_pool_t* buf_pool = buf_pool_from_array(i); + + scan_depth[i] = ut_min(srv_LRU_scan_depth, + UT_LIST_GET_LEN(buf_pool->LRU)); + requested_pages[i] = 0; + active_instance[i] = true; + limited_scan[i] = true; + previous_evicted[i] = 0; + } + + while (remaining_instances) { + + if (ut_time_ms() - start_time >= srv_cleaner_max_lru_time) { + + break; + } + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + + if (!active_instance[i]) { + continue; + } + + ulint free_len = free_list_lwm; + buf_pool_t* buf_pool = buf_pool_from_array(i); + + do { + flush_counters_t n; + + ut_ad(requested_pages[i] <= scan_depth[i]); + + /* Currently page_cleaner is the only thread + that can trigger an LRU flush. It is possible + that a batch triggered during last iteration is + still running, */ + if (buf_flush_LRU(buf_pool, lru_chunk_size, + limited_scan[i], &n)) { + + /* Allowed only one batch per + buffer pool instance. */ + buf_flush_wait_batch_end( + buf_pool, BUF_FLUSH_LRU); + } + + total_flushed += n.flushed; + + /* When we evict less pages than we did on a + previous try we relax the LRU scan limit in + order to attempt to evict more */ + limited_scan[i] + = (previous_evicted[i] > n.evicted); + previous_evicted[i] = n.evicted; + + requested_pages[i] += lru_chunk_size; + + /* If we failed to flush or evict this + instance, do not bother anymore. But take into + account that we might have zero flushed pages + because the flushing request was fully + satisfied by unzip_LRU evictions. */ + if (requested_pages[i] >= scan_depth[i] + || !(srv_cleaner_eviction_factor + ? 
n.evicted + : (n.flushed + n.unzip_LRU_evicted))) { + + active_instance[i] = false; + remaining_instances--; + } else { + + free_len = UT_LIST_GET_LEN( + buf_pool->free); + } + } while (active_instance[i] + && free_len <= free_list_lwm); + } + } + + if (total_flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + total_flushed); + } + + return(total_flushed); +} + +/*********************************************************************//** +Wait for any possible LRU flushes that are in progress to end. */ +UNIV_INTERN +void +buf_flush_wait_LRU_batch_end(void) +/*==============================*/ +{ + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + mutex_enter(&buf_pool->flush_state_mutex); + + if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0 + || buf_pool->init_flush[BUF_FLUSH_LRU]) { + + mutex_exit(&buf_pool->flush_state_mutex); + buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); + } else { + mutex_exit(&buf_pool->flush_state_mutex); + } + } +} + +/*********************************************************************//** +Flush a batch of dirty pages from the flush list +@return number of pages flushed, 0 if no page is flushed or if another +flush_list type batch is running */ +static +ulint +page_cleaner_do_flush_batch( +/*========================*/ + ulint n_to_flush, /*!< in: number of pages that + we should attempt to flush. */ + lsn_t lsn_limit) /*!< in: LSN up to which flushing + must happen */ +{ + ulint n_flushed; + + buf_flush_list(n_to_flush, lsn_limit, &n_flushed); + + return(n_flushed); +} + +/*********************************************************************//** +Calculates if flushing is required based on number of dirty pages in +the buffer pool. +@return percent of io_capacity to flush to manage dirty page ratio */ +static +ulint +af_get_pct_for_dirty() +/*==================*/ +{ + ulint dirty_pct = buf_get_modified_ratio_pct(); + + if (dirty_pct > 0 && srv_max_buf_pool_modified_pct == 0) { + return(100); + } + + ut_a(srv_max_dirty_pages_pct_lwm + <= srv_max_buf_pool_modified_pct); + + if (srv_max_dirty_pages_pct_lwm == 0) { + /* The user has not set the option to preflush dirty + pages as we approach the high water mark. */ + if (dirty_pct > srv_max_buf_pool_modified_pct) { + /* We have crossed the high water mark of dirty + pages In this case we start flushing at 100% of + innodb_io_capacity. */ + return(100); + } + } else if (dirty_pct > srv_max_dirty_pages_pct_lwm) { + /* We should start flushing pages gradually. */ + return((dirty_pct * 100) + / (srv_max_buf_pool_modified_pct + 1)); + } + + return(0); +} + +/*********************************************************************//** +Calculates if flushing is required based on redo generation rate. +@return percent of io_capacity to flush to manage redo space */ +static +ulint +af_get_pct_for_lsn( +/*===============*/ + lsn_t age) /*!< in: current age of LSN. */ +{ + lsn_t max_async_age; + lsn_t lsn_age_factor; + lsn_t af_lwm = (srv_adaptive_flushing_lwm + * log_get_capacity()) / 100; + + if (age < af_lwm) { + /* No adaptive flushing. */ + return(0); + } + + max_async_age = log_get_max_modified_age_async(); + + if (age < max_async_age && !srv_adaptive_flushing) { + /* We have still not reached the max_async point and + the user has disabled adaptive flushing. 
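+	Redo-age-based flushing then stays off and the page cleaner is
+	driven by the dirty-page ratio alone; only once the age reaches
+	max_async_age is the formula below applied even though adaptive
+	flushing is disabled.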
*/ + return(0); + } + + /* If we are here then we know that either: + 1) User has enabled adaptive flushing + 2) User may have disabled adaptive flushing but we have reached + max_async_age. */ + lsn_age_factor = (age * 100) / max_async_age; + + ut_ad(srv_max_io_capacity >= srv_io_capacity); + switch ((srv_cleaner_lsn_age_factor_t)srv_cleaner_lsn_age_factor) { + case SRV_CLEANER_LSN_AGE_FACTOR_LEGACY: + return(static_cast<ulint>( + ((srv_max_io_capacity / srv_io_capacity) + * (lsn_age_factor + * sqrt((double)lsn_age_factor))) + / 7.5)); + case SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT: + return(static_cast<ulint>( + ((srv_max_io_capacity / srv_io_capacity) + * (lsn_age_factor * lsn_age_factor + * sqrt((double)lsn_age_factor))) + / 700.5)); + default: + ut_error; + } +} + +/*********************************************************************//** +This function is called approximately once every second by the +page_cleaner thread. Based on various factors it decides if there is a +need to do flushing. If flushing is needed it is performed and the +number of pages flushed is returned. +@return number of pages flushed */ +static +ulint +page_cleaner_flush_pages_if_needed(void) +/*====================================*/ +{ + static lsn_t lsn_avg_rate = 0; + static lsn_t prev_lsn = 0; + static lsn_t last_lsn = 0; + static ulint sum_pages = 0; + static ulint last_pages = 0; + static ulint prev_pages = 0; + static ulint avg_page_rate = 0; + static ulint n_iterations = 0; + lsn_t oldest_lsn; + lsn_t cur_lsn; + lsn_t age; + lsn_t lsn_rate; + ulint n_pages = 0; + ulint pct_for_dirty = 0; + ulint pct_for_lsn = 0; + ulint pct_total = 0; + int age_factor = 0; + + cur_lsn = log_get_lsn(); + + if (prev_lsn == 0) { + /* First time around. */ + prev_lsn = cur_lsn; + return(0); + } + + if (prev_lsn == cur_lsn) { + return(0); + } + + /* We update our variables every srv_flushing_avg_loops + iterations to smooth out transition in workload. */ + if (++n_iterations >= srv_flushing_avg_loops) { + + avg_page_rate = ((sum_pages / srv_flushing_avg_loops) + + avg_page_rate) / 2; + + /* How much LSN we have generated since last call. */ + lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops; + + lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; + + prev_lsn = cur_lsn; + + n_iterations = 0; + + sum_pages = 0; + } + + oldest_lsn = buf_pool_get_oldest_modification(); + + ut_ad(oldest_lsn <= log_get_lsn()); + + age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0; + + pct_for_dirty = af_get_pct_for_dirty(); + pct_for_lsn = af_get_pct_for_lsn(age); + + pct_total = ut_max(pct_for_dirty, pct_for_lsn); + + /* Cap the maximum IO capacity that we are going to use by + max_io_capacity. 
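+	Worked example (illustrative numbers): with innodb_io_capacity
+	= 200 and pct_total = 50, PCT_IO(50) yields 100 pages; if
+	avg_page_rate is 60 and the age is still below the async
+	limit, the batch becomes (100 + 60) / 2 = 80 pages, finally
+	capped at srv_max_io_capacity.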
*/ + n_pages = PCT_IO(pct_total); + if (age < log_get_max_modified_age_async()) + n_pages = (n_pages + avg_page_rate) / 2; + + if (n_pages > srv_max_io_capacity) { + n_pages = srv_max_io_capacity; + } + + if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) { + age_factor = static_cast<int>(prev_pages / last_pages); + } + + MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages); + + prev_pages = n_pages; + n_pages = page_cleaner_do_flush_batch( + n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1)); + + last_lsn= cur_lsn; + last_pages= n_pages + 1; + + MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate); + MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate); + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty); + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); + + if (n_pages) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_pages); + + sum_pages += n_pages; + } + + return(n_pages); +} + +/*********************************************************************//** +Puts the page_cleaner thread to sleep if it has finished work in less +than a second */ +static +void +page_cleaner_sleep_if_needed( +/*=========================*/ + ulint next_loop_time) /*!< in: time when next loop iteration + should start */ +{ + ulint cur_time = ut_time_ms(); + + if (next_loop_time > cur_time) { + /* Get sleep interval in micro seconds. We use + ut_min() to avoid long sleep in case of + wrap around. */ + os_thread_sleep(ut_min(1000000, + (next_loop_time - cur_time) + * 1000)); + } +} + +/*********************************************************************//** +Returns the aggregate free list length over all buffer pool instances. +@return total free list length. */ +__attribute__((warn_unused_result)) +static +ulint +buf_get_total_free_list_length(void) +/*================================*/ +{ + ulint result = 0; + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + + result += UT_LIST_GET_LEN(buf_pool_from_array(i)->free); + } + + return result; +} + +/*********************************************************************//** +Adjust the desired page cleaner thread sleep time for LRU flushes. */ +__attribute__((nonnull)) +static +void +page_cleaner_adapt_lru_sleep_time( +/*==============================*/ + ulint* lru_sleep_time) /*!< in/out: desired page cleaner thread sleep + time for LRU flushes */ +{ + ulint free_len = buf_get_total_free_list_length(); + ulint max_free_len = srv_LRU_scan_depth * srv_buf_pool_instances; + + if (free_len < max_free_len / 100) { + + /* Free lists filled less than 1%, no sleep */ + *lru_sleep_time = 0; + } else if (free_len > max_free_len / 5) { + + /* Free lists filled more than 20%, sleep a bit more */ + *lru_sleep_time += 50; + if (*lru_sleep_time > srv_cleaner_max_lru_time) + *lru_sleep_time = srv_cleaner_max_lru_time; + } else if (free_len < max_free_len / 20 && *lru_sleep_time >= 50) { + + /* Free lists filled less than 5%, sleep a bit less */ + *lru_sleep_time -= 50; + } else { + + /* Free lists filled between 5% and 20%, no change */ + } +} + +/*********************************************************************//** +Get the desired page cleaner thread sleep time for flush list flushes. 
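+The only input is the checkpoint age: inside the sync preflush zone
+the page cleaner must not sleep at all, otherwise it sleeps for the
+full srv_cleaner_max_flush_time interval.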
+@return desired sleep time */ +__attribute__((warn_unused_result)) +static +ulint +page_cleaner_adapt_flush_sleep_time(void) +/*=====================================*/ +{ + lsn_t age = log_get_lsn() - log_sys->last_checkpoint_lsn; + + if (age > log_sys->max_modified_age_sync) { + + /* No sleep if in sync preflush zone */ + return(0); + } + + /* In all other cases flush list factors do not influence the page + cleaner sleep time */ + return(srv_cleaner_max_flush_time); +} + +/******************************************************************//** +page_cleaner thread tasked with flushing dirty pages from the buffer +pool flush lists. As of now we'll have only one instance of this thread. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(buf_flush_page_cleaner_thread)( +/*==========================================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + ulint next_loop_time = ut_time_ms() + 1000; + ulint n_flushed = 0; + ulint last_activity = srv_get_activity_count(); + ulint last_activity_time = ut_time_ms(); + + ut_ad(!srv_read_only_mode); + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(buf_page_cleaner_thread_key); +#endif /* UNIV_PFS_THREAD */ + + srv_cleaner_tid = os_thread_get_tid(); + + os_thread_set_priority(srv_cleaner_tid, srv_sched_priority_cleaner); + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + buf_page_cleaner_is_active = TRUE; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + + ulint page_cleaner_sleep_time; + ibool server_active; + + srv_current_thread_priority = srv_cleaner_thread_priority; + + /* The page_cleaner skips sleep if the server is + idle and there are no pending IOs in the buffer pool + and there is work to do. */ + if (srv_check_activity(last_activity) + || buf_get_n_pending_read_ios() + || n_flushed == 0) { + page_cleaner_sleep_if_needed(next_loop_time); + } + + page_cleaner_sleep_time + = page_cleaner_adapt_flush_sleep_time(); + + next_loop_time = ut_time_ms() + page_cleaner_sleep_time; + + server_active = srv_check_activity(last_activity); + if (server_active + || ut_time_ms() - last_activity_time < 1000) { + + if (server_active) { + + last_activity = srv_get_activity_count(); + last_activity_time = ut_time_ms(); + } + + /* Flush pages from flush_list if required */ + n_flushed += page_cleaner_flush_pages_if_needed(); + } else { + n_flushed = page_cleaner_do_flush_batch( + PCT_IO(100), + LSN_MAX); + + if (n_flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); + } + } + } + + ut_ad(srv_shutdown_state > 0); + if (srv_fast_shutdown == 2) { + /* In very fast shutdown we simulate a crash of + buffer pool. We are not required to do any flushing */ + goto thread_exit; + } + + /* In case of normal and slow shutdown the page_cleaner thread + must wait for all other activity in the server to die down. + Note that we can start flushing the buffer pool as soon as the + server enters shutdown phase but we must stay alive long enough + to ensure that any work done by the master or purge threads is + also flushed. + During shutdown we pass through two stages. In the first stage, + when SRV_SHUTDOWN_CLEANUP is set other threads like the master + and the purge threads may be working as well. 
We start flushing + the buffer pool but can't be sure that no new pages are being + dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */ + + do { + n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX); + + /* We sleep only if there are no pages to flush */ + if (n_flushed == 0) { + os_thread_sleep(100000); + } + } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP); + + /* At this point all threads including the master and the purge + thread must have been suspended. */ + ut_a(srv_get_active_thread_type() == SRV_NONE); + ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE); + + /* We can now make a final sweep on flushing the buffer pool + and exit after we have cleaned the whole buffer pool. + It is important that we wait for any running batch that has + been triggered by us to finish. Otherwise we can end up + considering end of that batch as a finish of our final + sweep and we'll come out of the loop leaving behind dirty pages + in the flush_list */ + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + buf_flush_wait_LRU_batch_end(); + + bool success; + + do { + + success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed); + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + } while (!success || n_flushed > 0); + + /* Some sanity checks */ + ut_a(srv_get_active_thread_type() == SRV_NONE); + ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE); + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool = buf_pool_from_array(i); + ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0); + } + + /* We have lived our life. Time to die. */ + +thread_exit: + buf_page_cleaner_is_active = FALSE; + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + +/******************************************************************//** +lru_manager thread tasked with performing LRU flushes and evictions to refill +the buffer pool free lists. As of now we'll have only one instance of this +thread. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(buf_flush_lru_manager_thread)( +/*==========================================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + ulint next_loop_time = ut_time_ms() + 1000; + ulint lru_sleep_time = srv_cleaner_max_lru_time; + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(buf_lru_manager_thread_key); +#endif /* UNIV_PFS_THREAD */ + + srv_lru_manager_tid = os_thread_get_tid(); + + os_thread_set_priority(srv_lru_manager_tid, + srv_sched_priority_cleaner); + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "InnoDB: lru_manager thread running, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + buf_lru_manager_is_active = true; + + /* On server shutdown, the LRU manager thread runs through cleanup + phase to provide free pages for the master and purge threads. 
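+	Hence the loop below keeps running in both the
+	SRV_SHUTDOWN_NONE and the SRV_SHUTDOWN_CLEANUP states, and
+	exits only once the flush phase begins.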
*/ + while (srv_shutdown_state == SRV_SHUTDOWN_NONE + || srv_shutdown_state == SRV_SHUTDOWN_CLEANUP) { + + ulint n_flushed_lru; + + srv_current_thread_priority = srv_cleaner_thread_priority; + + page_cleaner_sleep_if_needed(next_loop_time); + + page_cleaner_adapt_lru_sleep_time(&lru_sleep_time); + + next_loop_time = ut_time_ms() + lru_sleep_time; + + n_flushed_lru = buf_flush_LRU_tail(); + + if (n_flushed_lru) { + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed_lru); + } + } + + buf_lru_manager_is_active = false; + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + +/** Functor to validate the flush list. */ +struct Check { + void operator()(const buf_page_t* elem) + { + ut_a(elem->in_flush_list); + } +}; + +/******************************************************************//** +Validates the flush list. +@return TRUE if ok */ +static +ibool +buf_flush_validate_low( +/*===================*/ + buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ +{ + buf_page_t* bpage; + const ib_rbt_node_t* rnode = NULL; + + ut_ad(buf_flush_list_mutex_own(buf_pool)); + + UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check()); + + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + /* If we are in recovery mode i.e.: flush_rbt != NULL + then each block in the flush_list must also be present + in the flush_rbt. */ + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + rnode = rbt_first(buf_pool->flush_rbt); + } + + while (bpage != NULL) { + const lsn_t om = bpage->oldest_modification; + + ut_ad(buf_pool_from_bpage(bpage) == buf_pool); + + ut_ad(bpage->in_flush_list); + + /* A page in buf_pool->flush_list can be in + BUF_BLOCK_REMOVE_HASH state. This happens when a page + is in the middle of being relocated. In that case the + original descriptor can have this state and still be + in the flush list waiting to acquire the + buf_pool->flush_list_mutex to complete the relocation. */ + ut_a(buf_page_in_file(bpage) + || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); + ut_a(om > 0); + + if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) { + buf_page_t** prpage; + + ut_a(rnode); + prpage = rbt_value(buf_page_t*, rnode); + + ut_a(*prpage); + ut_a(*prpage == bpage); + rnode = rbt_next(buf_pool->flush_rbt, rnode); + } + + bpage = UT_LIST_GET_NEXT(list, bpage); + + ut_a(!bpage || om >= bpage->oldest_modification); + } + + /* By this time we must have exhausted the traversal of + flush_rbt (if active) as well. */ + ut_a(rnode == NULL); + + return(TRUE); +} + +/******************************************************************//** +Validates the flush list. +@return TRUE if ok */ +UNIV_INTERN +ibool +buf_flush_validate( +/*===============*/ + buf_pool_t* buf_pool) /*!< buffer pool instance */ +{ + ibool ret; + + buf_flush_list_mutex_enter(buf_pool); + + ret = buf_flush_validate_low(buf_pool); + + buf_flush_list_mutex_exit(buf_pool); + + return(ret); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush +list in a particular buffer pool. 
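+A debug-only usage sketch (illustrative):
+
+	ut_ad(buf_pool_get_dirty_pages_count(buf_pool, space_id) == 0);
+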
+@return number of dirty pages present in a single buffer pool */ +UNIV_INTERN +ulint +buf_pool_get_dirty_pages_count( +/*===========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool */ + ulint id) /*!< in: space id to check */ + +{ + ulint count = 0; + + buf_flush_list_mutex_enter(buf_pool); + + buf_page_t* bpage; + + for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + bpage != 0; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_ad(buf_page_in_file(bpage) + || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); + ut_ad(bpage->in_flush_list); + ut_ad(bpage->oldest_modification > 0); + + if (bpage->space == id) { + ++count; + } + } + + buf_flush_list_mutex_exit(buf_pool); + + return(count); +} + +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush list. +@return number of dirty pages present in all the buffer pools */ +UNIV_INTERN +ulint +buf_flush_get_dirty_pages_count( +/*============================*/ + ulint id) /*!< in: space id to check */ + +{ + ulint count = 0; + + for (ulint i = 0; i < srv_buf_pool_instances; ++i) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + count += buf_pool_get_dirty_pages_count(buf_pool, id); + } + + return(count); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/xtradb/buf/buf0lru.cc b/storage/xtradb/buf/buf0lru.cc new file mode 100644 index 00000000000..af816d36e23 --- /dev/null +++ b/storage/xtradb/buf/buf0lru.cc @@ -0,0 +1,2920 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0lru.cc +The database buffer replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0lru.h" + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_NONINL +#include "buf0lru.ic" +#endif + +#include "ut0byte.h" +#include "ut0lst.h" +#include "ut0rnd.h" +#include "sync0sync.h" +#include "sync0rw.h" +#include "hash0hash.h" +#include "os0sync.h" +#include "fil0fil.h" +#include "btr0btr.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "buf0dblwr.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "os0file.h" +#include "page0zip.h" +#include "log0recv.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "srv0mon.h" +#include "lock0lock.h" + +#include "ha_prototypes.h" + +/** The number of blocks from the LRU_old pointer onward, including +the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV +of the whole LRU list length, except that the tolerance defined below +is allowed. 
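+For example (illustrative numbers): with an LRU list of 1000 blocks
+and LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV = 3/8, the old sublist is
+kept at about 375 blocks, give or take the tolerance of 20 blocks
+defined below.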
Note that the tolerance must be small enough such that for +even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not +allowed to point to either end of the LRU list. */ + +#define BUF_LRU_OLD_TOLERANCE 20 + +/** The minimum amount of non-old blocks when the LRU_old list exists +(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks). +@see buf_LRU_old_adjust_len */ +#define BUF_LRU_NON_OLD_MIN_LEN 5 +#if BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN +# error "BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN" +#endif + +/** When dropping the search hash index entries before deleting an ibd +file, we build a local array of pages belonging to that tablespace +in the buffer pool. Following is the size of that array. +We also release buf_pool->LRU_list_mutex after scanning this many pages of the +flush_list when dropping a table. This is to ensure that other threads +are not blocked for extended period of time when using very large +buffer pools. */ +#define BUF_LRU_DROP_SEARCH_SIZE 1024 + +/** If we switch on the InnoDB monitor because there are too few available +frames in the buffer pool, we set this to TRUE */ +static ibool buf_lru_switched_on_innodb_mon = FALSE; + +/******************************************************************//** +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics, +buf_LRU_evict_from_unzip_LRU() decides if we want to evict from +unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the +uncompressed frame (meaning we can evict dirty blocks as well). From +the regular LRU, we will evict the entire block (i.e.: both the +uncompressed and compressed data), which must be clean. */ + +/* @{ */ + +/** Number of intervals for which we keep the history of these stats. +Each interval is 1 second, defined by the rate at which +srv_error_monitor_thread() calls buf_LRU_stat_update(). */ +#define BUF_LRU_STAT_N_INTERVAL 50 + +/** Co-efficient with which we multiply I/O operations to equate them +with page_zip_decompress() operations. */ +#define BUF_LRU_IO_TO_UNZIP_FACTOR 50 + +/** Sampled values buf_LRU_stat_cur. +Not protected by any mutex. Updated by buf_LRU_stat_update(). */ +static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL]; + +/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */ +static ulint buf_LRU_stat_arr_ind; + +/** Current operation counters. Not protected by any mutex. Cleared +by buf_LRU_stat_update(). */ +UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Not Protected by any mutex. */ +UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_sum; + +/* @} */ + +/** @name Heuristics for detecting index scan @{ */ +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +UNIV_INTERN uint buf_LRU_old_threshold_ms; +/* @} */ + +/******************************************************************//** +Takes a block out of the LRU list and page hash table. +If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), +the object will be freed. + +The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex +and the appropriate hash_lock. This function will release the +buf_page_get_mutex() and the hash_lock. + +If a compressed page is freed other compressed pages may be relocated. +@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. 
The +caller needs to free the page to the free list +@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In +this case the block is already returned to the buddy allocator. */ +static __attribute__((nonnull, warn_unused_result)) +bool +buf_LRU_block_remove_hashed( +/*========================*/ + buf_page_t* bpage, /*!< in: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ + bool zip); /*!< in: true if should remove also the + compressed page of an uncompressed page */ +/******************************************************************//** +Puts a file page whose has no hash index to the free list. */ +static +void +buf_LRU_block_free_hashed_page( +/*===========================*/ + buf_block_t* block); /*!< in: block, must contain a file page and + be in a state where it can be freed */ + +/******************************************************************//** +Increases LRU size in bytes with zip_size for compressed page, +UNIV_PAGE_SIZE for uncompressed page in inline function */ +static inline +void +incr_LRU_size_in_bytes( +/*===================*/ + buf_page_t* bpage, /*!< in: control block */ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ulint zip_size = page_zip_get_size(&bpage->zip); + buf_pool->stat.LRU_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE; + ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size); +} + +/******************************************************************//** +Determines if the unzip_LRU list should be used for evicting a victim +instead of the general LRU list. +@return TRUE if should use unzip_LRU */ +UNIV_INTERN +ibool +buf_LRU_evict_from_unzip_LRU( +/*=========================*/ + buf_pool_t* buf_pool) +{ + ulint io_avg; + ulint unzip_avg; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + /* If the unzip_LRU list is empty, we can only use the LRU. */ + if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) { + return(FALSE); + } + + /* If unzip_LRU is at most 10% of the size of the LRU list, + then use the LRU. This slack allows us to keep hot + decompressed pages in the buffer pool. */ + if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) + <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) { + return(FALSE); + } + + /* If eviction hasn't started yet, we assume by default + that a workload is disk bound. */ + if (buf_pool->freed_page_clock == 0) { + return(TRUE); + } + + /* Calculate the average over past intervals, and add the values + of the current interval. */ + io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.io; + unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.unzip; + + /* Decide based on our formula. If the load is I/O bound + (unzip_avg is smaller than the weighted io_avg), evict an + uncompressed frame from unzip_LRU. Otherwise we assume that + the load is CPU bound and evict from the regular LRU. */ + return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR); +} + +/******************************************************************//** +Attempts to drop page hash index on a batch of pages belonging to a +particular space id. 
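+
+Callers batch the page numbers into a local array first, e.g.
+(sketch; arr and n are hypothetical names):
+
+	ulint	arr[BUF_LRU_DROP_SEARCH_SIZE];
+	ulint	n = 0;
+
+	arr[n++] = bpage->offset;
+	buf_LRU_drop_page_hash_batch(space_id, zip_size, arr, n);
+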
*/ +static +void +buf_LRU_drop_page_hash_batch( +/*=========================*/ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + const ulint* arr, /*!< in: array of page_no */ + ulint count) /*!< in: number of entries in array */ +{ + ulint i; + + ut_ad(arr != NULL); + ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE); + + for (i = 0; i < count; ++i) { + btr_search_drop_page_hash_when_freed(space_id, zip_size, + arr[i]); + } +} + +/******************************************************************//** +When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page +hash index entries belonging to that table. This function tries to +do that in batch. Note that this is a 'best effort' attempt and does +not guarantee that ALL hash entries will be removed. */ +static +void +buf_LRU_drop_page_hash_for_tablespace( +/*==================================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint id) /*!< in: space id */ +{ + buf_page_t* bpage; + ulint* page_arr; + ulint num_entries; + ulint zip_size; + + zip_size = fil_space_get_zip_size(id); + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + /* Somehow, the tablespace does not exist. Nothing to drop. */ + ut_ad(0); + return; + } + + page_arr = static_cast<ulint*>(ut_malloc( + sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE)); + + mutex_enter(&buf_pool->LRU_list_mutex); + num_entries = 0; + +scan_again: + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + + while (bpage != NULL) { + buf_page_t* prev_bpage; + ibool is_fixed; + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + + ut_a(buf_page_in_file(bpage)); + + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE + || bpage->space != id + || bpage->io_fix != BUF_IO_NONE) { + /* Compressed pages are never hashed. + Skip blocks of other tablespaces. + Skip I/O-fixed blocks (to be dealt with later). */ +next_page: + bpage = prev_bpage; + continue; + } + + mutex_enter(block_mutex); + is_fixed = bpage->buf_fix_count > 0 + || !((buf_block_t*) bpage)->index; + mutex_exit(block_mutex); + + if (is_fixed) { + goto next_page; + } + + /* Store the page number so that we can drop the hash + index in a batch later. */ + page_arr[num_entries] = bpage->offset; + ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE); + ++num_entries; + + if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) { + goto next_page; + } + + /* Array full. We release the buf_pool->LRU_list_mutex to obey + the latching order. */ + mutex_exit(&buf_pool->LRU_list_mutex); + + buf_LRU_drop_page_hash_batch( + id, zip_size, page_arr, num_entries); + + num_entries = 0; + + mutex_enter(&buf_pool->LRU_list_mutex); + + /* Note that we released the buf_pool->LRU_list_mutex above + after reading the prev_bpage during processing of a + page_hash_batch (i.e.: when the array was full). + Because prev_bpage could belong to a compressed-only + block, it may have been relocated, and thus the + pointer cannot be trusted. Because bpage is of type + buf_block_t, it is safe to dereference. + + bpage can change in the LRU list. This is OK because + this function is a 'best effort' to drop as many + search hash entries as possible and it does not + guarantee that ALL such entries will be dropped. */ + + /* If, however, bpage has been removed from LRU list + to the free list then we should restart the scan. 
*/ + + if (bpage + && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + goto scan_again; + } + } + + mutex_exit(&buf_pool->LRU_list_mutex); + + /* Drop any remaining batch of search hashed pages. */ + buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); + ut_free(page_arr); +} + +/******************************************************************//** +While flushing (or removing dirty) pages from a tablespace we don't +want to hog the CPU and resources. Release the buffer pool and block +mutex and try to force a context switch. Then reacquire the same mutexes. +The current page is "fixed" before the release of the mutexes and then +"unfixed" again once we have reacquired the mutexes. */ +static __attribute__((nonnull)) +void +buf_flush_yield( +/*============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_page_t* bpage) /*!< in/out: current page */ +{ + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(block_mutex)); + ut_ad(buf_page_in_file(bpage)); + + /* "Fix" the block so that the position cannot be + changed after we release the buffer pool and + block mutexes. */ + buf_page_set_sticky(bpage); + + /* Now it is safe to release the LRU list mutex */ + mutex_exit(&buf_pool->LRU_list_mutex); + + mutex_exit(block_mutex); + /* Try and force a context switch. */ + os_thread_yield(); + + mutex_enter(&buf_pool->LRU_list_mutex); + + mutex_enter(block_mutex); + /* "Unfix" the block now that we have both the + buffer pool and block mutex again. */ + buf_page_unset_sticky(bpage); + mutex_exit(block_mutex); +} + +/******************************************************************//** +If we have hogged the resources for too long then release the buffer +pool and flush list mutex and do a thread yield. Set the current page +to "sticky" so that it is not relocated during the yield. +@return true if yielded */ +static __attribute__((nonnull(1), warn_unused_result)) +bool +buf_flush_try_yield( +/*================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_page_t* bpage, /*!< in/out: bpage to remove */ + ulint processed, /*!< in: number of pages processed */ + bool* must_restart) /*!< in/out: if true, we have to + restart the flush list scan */ +{ + /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the + loop we release buf_pool->LRU_list_mutex to let other threads + do their job but only if the block is not IO fixed. This + ensures that the block stays in its position in the + flush_list. */ + + if (bpage != NULL + && processed >= BUF_LRU_DROP_SEARCH_SIZE + && buf_page_get_io_fix_unlocked(bpage) == BUF_IO_NONE) { + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + buf_flush_list_mutex_exit(buf_pool); + + /* We don't have to worry about bpage becoming a dangling + pointer by a compressed page flush list relocation because + buf_page_get_gen() won't be called for pages from this + tablespace. */ + + mutex_enter(block_mutex); + /* Recheck the I/O fix and the flush list presence now that we + hold the right mutex */ + if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE + || bpage->oldest_modification == 0)) { + + mutex_exit(block_mutex); + + *must_restart = true; + + buf_flush_list_mutex_enter(buf_pool); + + return false; + } + + *must_restart = false; + + /* Release the LRU list and buf_page_get_mutex() mutex + to give the other threads a go. 
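+	buf_flush_yield() first marks the page "sticky", so the block
+	cannot be relocated while the mutexes are dropped.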
*/ + + buf_flush_yield(buf_pool, bpage); + + buf_flush_list_mutex_enter(buf_pool); + + /* Should not have been removed from the flush + list during the yield. However, this check is + not sufficient to catch a remove -> add. */ + + ut_ad(bpage->in_flush_list); + + return(true); + } + + return(false); +} + +/******************************************************************//** +Removes a single page from a given tablespace inside a specific +buffer pool instance. +@return true if page was removed. */ +static __attribute__((nonnull, warn_unused_result)) +bool +buf_flush_or_remove_page( +/*=====================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_page_t* bpage, /*!< in/out: bpage to remove */ + bool flush, /*!< in: flush to disk if true but + don't remove else remove without + flushing to disk */ + bool* must_restart) /*!< in/out: if true, must restart the + flush list scan */ +{ + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(buf_flush_list_mutex_own(buf_pool)); + + /* It is safe to check bpage->space and bpage->io_fix while holding + buf_pool->LRU_list_mutex only. */ + + if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage) + != BUF_IO_NONE)) { + + /* We cannot remove this page during this scan + yet; maybe the system is currently reading it + in, or flushing the modifications to the file */ + return(false); + } + + buf_flush_list_mutex_exit(buf_pool); + + /* We don't have to worry about bpage becoming a dangling + pointer by a compressed page flush list relocation because + buf_page_get_gen() won't be called for pages from this + tablespace. */ + bool processed; + + mutex_enter(block_mutex); + + /* Recheck the page I/O fix and the flush list presence now + that we hold the right mutex. */ + if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE + || bpage->oldest_modification == 0)) { + + /* The page became I/O-fixed or is not on the flush + list anymore, this invalidates any flush-list-page + pointers we have. */ + + mutex_exit(block_mutex); + + *must_restart = true; + processed = false; + + } else if (!flush) { + + buf_flush_remove(bpage); + + mutex_exit(block_mutex); + + processed = true; + + } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) { + + if (buf_flush_page( + buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false)) { + + /* Wake possible simulated aio thread to actually + post the writes to the operating system */ + os_aio_simulated_wake_handler_threads(); + + mutex_enter(&buf_pool->LRU_list_mutex); + + processed = true; + + } else { + mutex_exit(block_mutex); + + processed = false; + } + + } else { + mutex_exit(block_mutex); + + processed = false; + } + + buf_flush_list_mutex_enter(buf_pool); + + ut_ad(!mutex_own(block_mutex)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + return(processed); +} + +/******************************************************************//** +Remove all dirty pages belonging to a given tablespace inside a specific +buffer pool instance when we are deleting the data file(s) of that +tablespace. The pages still remain a part of LRU and are evicted from +the list as they age towards the tail of the LRU. 
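+Callers are expected to retry on DB_FAIL: buf_flush_dirty_pages()
+below loops with a short sleep until the scan reports DB_SUCCESS or
+the transaction is interrupted.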
+@retval DB_SUCCESS if all freed +@retval DB_FAIL if not all freed +@retval DB_INTERRUPTED if the transaction was interrupted */ +static __attribute__((nonnull(1), warn_unused_result)) +dberr_t +buf_flush_or_remove_pages( +/*======================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id, /*!< in: target space id for which + to remove or flush pages */ + bool flush, /*!< in: flush to disk if true but + don't remove else remove without + flushing to disk */ + const trx_t* trx) /*!< to check if the operation must + be interrupted, can be 0 */ +{ + buf_page_t* prev; + buf_page_t* bpage; + ulint processed = 0; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + buf_flush_list_mutex_enter(buf_pool); + +rescan: + bool must_restart = false; + bool all_freed = true; + + for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + bpage != NULL; + bpage = prev) { + + ut_a(buf_page_in_file(bpage)); + + /* Save the previous link because once we free the + page we can't rely on the links. */ + + prev = UT_LIST_GET_PREV(list, bpage); + + if (buf_page_get_space(bpage) != id) { + + /* Skip this block, as it does not belong to + the target space. */ + + } else if (!buf_flush_or_remove_page(buf_pool, bpage, flush, + &must_restart)) { + + /* Remove was unsuccessful, we have to try again + by scanning the entire list from the end. + This also means that we never released the + flush list mutex. Therefore we can trust the prev + pointer. + buf_flush_or_remove_page() released the + flush list mutex but not the LRU list mutex. + Therefore it is possible that a new page was + added to the flush list. For example, in case + where we are at the head of the flush list and + prev == NULL. That is OK because we have the + tablespace quiesced and no new pages for this + space-id should enter flush_list. This is + because the only callers of this function are + DROP TABLE and FLUSH TABLE FOR EXPORT. + We know that we'll have to do at least one more + scan but we don't break out of loop here and + try to do as much work as we can in this + iteration. */ + + all_freed = false; + } else if (flush) { + + /* The processing was successful. And during the + processing we have released all the buf_pool mutexes + when calling buf_page_flush(). We cannot trust + prev pointer. */ + goto rescan; + } else if (UNIV_UNLIKELY(must_restart)) { + + ut_ad(!all_freed); + break; + } + + ++processed; + + /* Yield if we have hogged the CPU and mutexes for too long. */ + if (buf_flush_try_yield(buf_pool, prev, processed, + &must_restart)) { + + ut_ad(!must_restart); + /* Reset the batch size counter if we had to yield. */ + + processed = 0; + } + +#ifdef DBUG_OFF + if (flush) { + DBUG_EXECUTE_IF("ib_export_flush_crash", + static ulint n_pages; + if (++n_pages == 4) {DBUG_SUICIDE();}); + } +#endif /* DBUG_OFF */ + + /* The check for trx is interrupted is expensive, we want + to check every N iterations. */ + if (!processed && trx && trx_is_interrupted(trx)) { + buf_flush_list_mutex_exit(buf_pool); + return(DB_INTERRUPTED); + } + } + + buf_flush_list_mutex_exit(buf_pool); + + return(all_freed ? DB_SUCCESS : DB_FAIL); +} + +/******************************************************************//** +Remove or flush all the dirty pages that belong to a given tablespace +inside a specific buffer pool instance. The pages will remain in the LRU +list and will be evicted from the LRU list as they age and move towards +the tail of the LRU list. 
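+
+For instance, the DROP TABLE path in buf_LRU_remove_pages() below
+calls this with flush == false and trx == NULL, while the
+flush-to-disk path passes flush == true together with the requesting
+transaction so that the wait can be interrupted.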
*/ +static __attribute__((nonnull(1))) +void +buf_flush_dirty_pages( +/*==================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id, /*!< in: space id */ + bool flush, /*!< in: flush to disk if true otherwise + remove the pages without flushing */ + const trx_t* trx) /*!< to check if the operation must + be interrupted */ +{ + dberr_t err; + + do { + mutex_enter(&buf_pool->LRU_list_mutex); + + err = buf_flush_or_remove_pages(buf_pool, id, flush, trx); + + mutex_exit(&buf_pool->LRU_list_mutex); + + ut_ad(buf_flush_validate(buf_pool)); + + if (err == DB_FAIL) { + os_thread_sleep(2000); + } + + /* DB_FAIL is a soft error, it means that the task wasn't + completed, needs to be retried. */ + + ut_ad(buf_flush_validate(buf_pool)); + + } while (err == DB_FAIL); + + ut_ad(err == DB_INTERRUPTED + || buf_pool_get_dirty_pages_count(buf_pool, id) == 0); +} + +/******************************************************************//** +Remove all pages that belong to a given tablespace inside a specific +buffer pool instance when we are DISCARDing the tablespace. */ +static __attribute__((nonnull)) +void +buf_LRU_remove_all_pages( +/*=====================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id) /*!< in: space id */ +{ + buf_page_t* bpage; + ibool all_freed; + +scan_again: + mutex_enter(&buf_pool->LRU_list_mutex); + + all_freed = TRUE; + + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); + bpage != NULL; + /* No op */) { + + prio_rw_lock_t* hash_lock; + buf_page_t* prev_bpage; + ib_mutex_t* block_mutex = NULL; + + ut_a(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + + /* It is safe to check bpage->space and bpage->io_fix while + holding buf_pool->LRU_list_mutex only and later recheck + while holding the buf_page_get_mutex() mutex. */ + + if (buf_page_get_space(bpage) != id) { + /* Skip this block, as it does not belong to + the space that is being invalidated. */ + goto next_page; + } else if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage) + != BUF_IO_NONE)) { + /* We cannot remove this page during this scan + yet; maybe the system is currently reading it + in, or flushing the modifications to the file */ + + all_freed = FALSE; + goto next_page; + } else { + ulint fold = buf_page_address_fold( + bpage->space, bpage->offset); + + hash_lock = buf_page_hash_lock_get(buf_pool, fold); + + rw_lock_x_lock(hash_lock); + + block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); + + if (UNIV_UNLIKELY( + buf_page_get_space(bpage) != id + || bpage->buf_fix_count > 0 + || (buf_page_get_io_fix(bpage) + != BUF_IO_NONE))) { + + mutex_exit(block_mutex); + + rw_lock_x_unlock(hash_lock); + + /* We cannot remove this page during + this scan yet; maybe the system is + currently reading it in, or flushing + the modifications to the file */ + + all_freed = FALSE; + + goto next_page; + } + } + + ut_ad(mutex_own(block_mutex)); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Dropping space %lu page %lu\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + /* Do nothing, because the adaptive hash index + covers uncompressed pages only. 
*/ + } else if (((buf_block_t*) bpage)->index) { + ulint page_no; + ulint zip_size; + + mutex_exit(&buf_pool->LRU_list_mutex); + + zip_size = buf_page_get_zip_size(bpage); + page_no = buf_page_get_page_no(bpage); + + mutex_exit(block_mutex); + + rw_lock_x_unlock(hash_lock); + + /* Note that the following call will acquire + and release block->lock X-latch. */ + + btr_search_drop_page_hash_when_freed( + id, zip_size, page_no); + + goto scan_again; + } + + if (bpage->oldest_modification != 0) { + + buf_flush_remove(bpage); + } + + ut_ad(!bpage->in_flush_list); + + /* Remove from the LRU list. */ + + if (buf_LRU_block_remove_hashed(bpage, true)) { + + mutex_enter(block_mutex); + buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + mutex_exit(block_mutex); + } else { + ut_ad(block_mutex == &buf_pool->zip_mutex); + } + + ut_ad(!mutex_own(block_mutex)); + +#ifdef UNIV_SYNC_DEBUG + /* buf_LRU_block_remove_hashed() releases the hash_lock */ + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)); + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + +next_page: + bpage = prev_bpage; + } + + mutex_exit(&buf_pool->LRU_list_mutex); + + if (!all_freed) { + os_thread_sleep(20000); + + goto scan_again; + } +} + +/******************************************************************//** +Remove pages belonging to a given tablespace inside a specific +buffer pool instance when we are deleting the data file(s) of that +tablespace. The pages still remain a part of LRU and are evicted from +the list as they age towards the tail of the LRU only if buf_remove +is BUF_REMOVE_FLUSH_NO_WRITE. */ +static __attribute__((nonnull(1))) +void +buf_LRU_remove_pages( +/*=================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove, /*!< in: remove or flush strategy */ + const trx_t* trx) /*!< to check if the operation must + be interrupted */ +{ + switch (buf_remove) { + case BUF_REMOVE_ALL_NO_WRITE: + buf_LRU_remove_all_pages(buf_pool, id); + break; + + case BUF_REMOVE_FLUSH_NO_WRITE: + ut_a(trx == 0); + buf_flush_dirty_pages(buf_pool, id, false, NULL); + break; + + case BUF_REMOVE_FLUSH_WRITE: + ut_a(trx != 0); + buf_flush_dirty_pages(buf_pool, id, true, trx); + /* Ensure that all asynchronous IO is completed. */ + os_aio_wait_until_no_pending_writes(); + fil_flush(id); + break; + } +} + +/******************************************************************//** +Flushes all dirty pages or removes all pages belonging +to a given tablespace. A PROBLEM: if readahead is being started, what +guarantees that it will not try to read in pages after this operation +has completed? */ +UNIV_INTERN +void +buf_LRU_flush_or_remove_pages( +/*==========================*/ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove, /*!< in: remove or flush strategy */ + const trx_t* trx) /*!< to check if the operation must + be interrupted */ +{ + ulint i; + + /* Before we attempt to drop pages one by one we first + attempt to drop page hash index entries in batches to make + it more efficient. The batching attempt is a best effort + attempt and does not guarantee that all pages hash entries + will be dropped. We get rid of remaining page hash entries + one by one below. 
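+	Only the BUF_REMOVE_ALL_NO_WRITE strategy needs this batched
+	drop; the two flush-based strategies skip it, as the switch
+	below shows.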
*/ + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + switch (buf_remove) { + case BUF_REMOVE_ALL_NO_WRITE: + buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); + break; + + case BUF_REMOVE_FLUSH_NO_WRITE: + /* It is a DROP TABLE for a single table + tablespace. No AHI entries exist because + we already dealt with them when freeing up + extents. */ + case BUF_REMOVE_FLUSH_WRITE: + /* We allow read-only queries against the + table, there is no need to drop the AHI entries. */ + break; + } + + buf_LRU_remove_pages(buf_pool, id, buf_remove, trx); + } +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/********************************************************************//** +Insert a compressed block into buf_pool->zip_clean in the LRU order. */ +UNIV_INTERN +void +buf_LRU_insert_zip_clean( +/*=====================*/ + buf_page_t* bpage) /*!< in: pointer to the block in question */ +{ + buf_page_t* b; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(&buf_pool->zip_mutex)); + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); + + /* Find the first successor of bpage in the LRU list + that is in the zip_clean list. */ + b = bpage; + do { + b = UT_LIST_GET_NEXT(LRU, b); + } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE); + + /* Insert bpage before b, i.e., after the predecessor of b. */ + if (b) { + b = UT_LIST_GET_PREV(list, b); + } + + if (b) { + UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage); + } else { + UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage); + } +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/******************************************************************//** +Try to free an uncompressed page of a compressed block from the unzip +LRU list. The compressed page is preserved, and it need not be clean. +@return TRUE if freed */ +UNIV_INLINE +ibool +buf_LRU_free_from_unzip_LRU_list( +/*=============================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ibool scan_all) /*!< in: scan whole LRU list + if TRUE, otherwise scan only + srv_LRU_scan_depth / 2 blocks. */ +{ + buf_block_t* block; + ibool freed; + ulint scanned; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) { + return(FALSE); + } + + for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU), + scanned = 1, freed = FALSE; + block != NULL && !freed + && (scan_all || scanned < srv_LRU_scan_depth); + ++scanned) { + + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, + block); + + mutex_enter(&block->mutex); + + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + + freed = buf_LRU_free_page(&block->page, false); + + mutex_exit(&block->mutex); + + block = prev_block; + } + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + scanned); + return(freed); +} + +/******************************************************************//** +Try to free a clean page from the common LRU list. +@return TRUE if freed */ +UNIV_INLINE +ibool +buf_LRU_free_from_common_LRU_list( +/*==============================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ibool scan_all) /*!< in: scan whole LRU list + if TRUE, otherwise scan only + srv_LRU_scan_depth / 2 blocks. 
*/ +{ + buf_page_t* bpage; + ibool freed; + ulint scanned; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), + scanned = 1, freed = FALSE; + bpage != NULL && !freed + && (scan_all || scanned < srv_LRU_scan_depth); + ++scanned) { + + unsigned accessed; + buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, + bpage); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + accessed = buf_page_is_accessed(bpage); + + mutex_enter(block_mutex); + + freed = buf_LRU_free_page(bpage, true); + + mutex_exit(block_mutex); + + if (freed && !accessed) { + /* Keep track of pages that are evicted without + ever being accessed. This gives us a measure of + the effectiveness of readahead */ + ++buf_pool->stat.n_ra_pages_evicted; + } + + bpage = prev_bpage; + } + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + scanned); + + return(freed); +} + +/******************************************************************//** +Try to free a replaceable block. +@return TRUE if found and freed */ +UNIV_INTERN +ibool +buf_LRU_scan_and_free_block( +/*========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ibool scan_all) /*!< in: scan whole LRU list + if TRUE, otherwise scan only + 'old' blocks. */ +{ + ibool freed = FALSE; + bool use_unzip_list = UT_LIST_GET_LEN(buf_pool->unzip_LRU) > 0; + + mutex_enter(&buf_pool->LRU_list_mutex); + + if (use_unzip_list) { + freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all); + } + + if (!freed) { + freed = buf_LRU_free_from_common_LRU_list(buf_pool, scan_all); + } + + if (!freed) { + mutex_exit(&buf_pool->LRU_list_mutex); + } + + return(freed); +} + +/******************************************************************//** +Returns TRUE if less than 25 % of the buffer pool in any instance is +available. This can be used in heuristics to prevent huge transactions +eating up the whole buffer pool for their locks. +@return TRUE if less than 25 % of buffer pool left */ +UNIV_INTERN +ibool +buf_LRU_buf_pool_running_out(void) +/*==============================*/ +{ + ulint i; + ibool ret = FALSE; + + for (i = 0; i < srv_buf_pool_instances && !ret; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + if (!recv_recovery_on + && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) + < buf_pool->curr_size / 4) { + + ret = TRUE; + } + } + + return(ret); +} + +/******************************************************************//** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, returns NULL. 
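buf_LRU_buf_pool_running_out() above reduces to one comparison per instance; leaving the recovery check aside, the 25 % rule is simply free + LRU < curr_size / 4. A hypothetical standalone restatement:

#include <stdbool.h>

/* TRUE when less than a quarter of the pool remains usable for data
pages, e.g. free_len=100, lru_len=50, curr_size=1000 gives
150 < 250, so TRUE. */
static bool
pool_running_out(unsigned long free_len, unsigned long lru_len,
                 unsigned long curr_size)
{
        return(free_len + lru_len < curr_size / 4);
}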
+@return a free control block, or NULL if the buf_block->free list is empty */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_only( +/*==================*/ + buf_pool_t* buf_pool) +{ + buf_block_t* block; + + mutex_enter_last(&buf_pool->free_list_mutex); + + block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free); + + if (block) { + + ut_ad(block->page.in_free_list); + ut_d(block->page.in_free_list = FALSE); + ut_ad(!block->page.in_flush_list); + ut_ad(!block->page.in_LRU_list); + ut_a(!buf_page_in_file(&block->page)); + UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); + + mutex_exit(&buf_pool->free_list_mutex); + + mutex_enter(&block->mutex); + + UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); + + ut_ad(buf_pool_from_block(block) == buf_pool); + + mutex_exit(&block->mutex); + return(block); + } + + mutex_exit(&buf_pool->free_list_mutex); + + return(NULL); +} + +/******************************************************************//** +Checks how much of buf_pool is occupied by non-data objects like +AHI, lock heaps etc. Depending on the size of non-data objects this +function will either assert or issue a warning and switch on the +status monitor. */ +static +void +buf_LRU_check_size_of_non_data_objects( +/*===================================*/ + const buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: ERROR: over 95 percent of the buffer pool" + " is occupied by\n" + "InnoDB: lock heaps or the adaptive hash index!" + " Check that your\n" + "InnoDB: transactions do not set too many row locks.\n" + "InnoDB: Your buffer pool size is %lu MB." + " Maybe you should make\n" + "InnoDB: the buffer pool bigger?\n" + "InnoDB: We intentionally generate a seg fault" + " to print a stack trace\n" + "InnoDB: on Linux!\n", + (ulong) (buf_pool->curr_size + / (1024 * 1024 / UNIV_PAGE_SIZE))); + + ut_error; + + } else if (!recv_recovery_on + && (UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU)) + < buf_pool->curr_size / 3) { + + if (!buf_lru_switched_on_innodb_mon) { + + /* Over 67 % of the buffer pool is occupied by lock + heaps or the adaptive hash index. This may be a memory + leak! */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: WARNING: over 67 percent of" + " the buffer pool is occupied by\n" + "InnoDB: lock heaps or the adaptive" + " hash index! Check that your\n" + "InnoDB: transactions do not set too many" + " row locks.\n" + "InnoDB: Your buffer pool size is %lu MB." + " Maybe you should make\n" + "InnoDB: the buffer pool bigger?\n" + "InnoDB: Starting the InnoDB Monitor to print" + " diagnostics, including\n" + "InnoDB: lock heap and hash index sizes.\n", + (ulong) (buf_pool->curr_size + / (1024 * 1024 / UNIV_PAGE_SIZE))); + + buf_lru_switched_on_innodb_mon = TRUE; + srv_print_innodb_monitor = TRUE; + os_event_set(lock_sys->timeout_event); + } + } else if (buf_lru_switched_on_innodb_mon) { + + /* Switch off the InnoDB Monitor; this is a simple way + to stop the monitor if the situation becomes less urgent, + but may also surprise users if the user also switched on the + monitor! 
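buf_LRU_check_size_of_non_data_objects() below encodes two occupancy thresholds: a fatal error when non-data objects consume over 95 % of the pool (free + LRU shorter than curr_size / 20) and a warning with monitor activation at over 67 % (shorter than curr_size / 3). The same classification as a small standalone helper, with the recovery check and monitor plumbing elided:

enum pool_pressure { POOL_OK, POOL_WARN_67, POOL_FATAL_95 };

static enum pool_pressure
classify_non_data_pressure(unsigned long free_len, unsigned long lru_len,
                           unsigned long curr_size)
{
        unsigned long   data = free_len + lru_len; /* left for data pages */

        if (data < curr_size / 20) {
                return(POOL_FATAL_95);  /* over 95 % consumed */
        }
        if (data < curr_size / 3) {
                return(POOL_WARN_67);   /* over 67 % consumed */
        }
        return(POOL_OK);
}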
*/ + + buf_lru_switched_on_innodb_mon = FALSE; + srv_print_innodb_monitor = FALSE; + } +} + +/** The maximum allowed backoff sleep time duration, microseconds */ +#define MAX_FREE_LIST_BACKOFF_SLEEP 10000 + +/** The sleep reduction factor for high-priority waiter backoff sleeps */ +#define FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER 100 + +/** The sleep reduction factor for low-priority waiter backoff sleeps */ +#define FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER 1 + +/******************************************************************//** +Returns a free block from the buf_pool. The block is taken off the +free list. If free list is empty, blocks are moved from the end of the +LRU list to the free list. +This function is called from a user thread when it needs a clean +block to read in a page. Note that we only ever get a block from +the free list. Even when we flush a page or find a page in LRU scan +we put it to free list to be used. +* iteration 0: + * get a block from free list, success:done + * if there is an LRU flush batch in progress: + * wait for batch to end: retry free list + * if buf_pool->try_LRU_scan is set + * scan LRU up to srv_LRU_scan_depth to find a clean block + * the above will put the block on free list + * success:retry the free list + * flush one dirty page from tail of LRU to disk + * the above will put the block on free list + * success: retry the free list +* iteration 1: + * same as iteration 0 except: + * scan whole LRU list + * scan LRU list even if buf_pool->try_LRU_scan is not set +* iteration > 1: + * same as iteration 1 but sleep 100ms +@return the free control block, in state BUF_BLOCK_READY_FOR_USE */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_block( +/*===================*/ + buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */ +{ + buf_block_t* block = NULL; + ibool freed = FALSE; + ulint n_iterations = 0; + ulint flush_failures = 0; + ibool mon_value_was = FALSE; + ibool started_monitor = FALSE; + + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + + MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH); +loop: + buf_LRU_check_size_of_non_data_objects(buf_pool); + + /* If there is a block in the free list, take it */ + block = buf_LRU_get_free_only(buf_pool); + + if (block) { + + ut_ad(buf_pool_from_block(block) == buf_pool); + memset(&block->page.zip, 0, sizeof block->page.zip); + + if (started_monitor) { + srv_print_innodb_monitor = + static_cast<my_bool>(mon_value_was); + } + + return(block); + } + + if (srv_empty_free_list_algorithm == SRV_EMPTY_FREE_LIST_BACKOFF + && buf_lru_manager_is_active + && (srv_shutdown_state == SRV_SHUTDOWN_NONE + || srv_shutdown_state == SRV_SHUTDOWN_CLEANUP)) { + + /* Backoff to minimize the free list mutex contention while the + free list is empty */ + ulint priority = srv_current_thread_priority; + + if (n_iterations < 3) { + + os_thread_yield(); + if (!priority) { + os_thread_yield(); + } + } else { + + ulint i, b; + + if (n_iterations < 6) { + i = n_iterations - 3; + } else if (n_iterations < 8) { + i = 4; + } else if (n_iterations < 11) { + i = 5; + } else { + i = n_iterations - 5; + } + b = 1 << i; + if (b > MAX_FREE_LIST_BACKOFF_SLEEP) { + b = MAX_FREE_LIST_BACKOFF_SLEEP; + } + os_thread_sleep(b / (priority + ? FREE_LIST_BACKOFF_HIGH_PRIO_DIVIDER + : FREE_LIST_BACKOFF_LOW_PRIO_DIVIDER)); + } + + /* In case of backoff, do not ever attempt single page flushes + and wait for the cleaner to free some pages instead. 
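The backoff branch in buf_LRU_get_free_block() below maps the retry count to a sleep time: pure yields for the first three iterations, then an exponentially growing sleep derived from the exponent table (n < 6 gives n - 3, n < 8 gives 4, n < 11 gives 5, otherwise n - 5), capped at MAX_FREE_LIST_BACKOFF_SLEEP and divided by 100 for high-priority threads. A standalone model of that schedule, printable for inspection (note that, as written, the table skips exponent 3, so the sleep jumps from 4 to 16 microseconds):

#include <stdio.h>

#define MAX_BACKOFF_USEC        10000   /* MAX_FREE_LIST_BACKOFF_SLEEP */
#define HIGH_PRIO_DIVIDER       100
#define LOW_PRIO_DIVIDER        1

/* Sleep, in microseconds, for a given retry count and priority;
0 means the thread should yield instead of sleeping. */
static unsigned long
backoff_sleep_usec(unsigned long n_iterations, int high_priority)
{
        unsigned long   i;
        unsigned long   b;

        if (n_iterations < 3) {
                return(0);              /* os_thread_yield() territory */
        }

        if (n_iterations < 6) {
                i = n_iterations - 3;
        } else if (n_iterations < 8) {
                i = 4;
        } else if (n_iterations < 11) {
                i = 5;
        } else {
                i = n_iterations - 5;
        }

        if (i >= 14) {
                b = MAX_BACKOFF_USEC;   /* 1 << 14 already exceeds the cap */
        } else {
                b = 1UL << i;
                if (b > MAX_BACKOFF_USEC) {
                        b = MAX_BACKOFF_USEC;
                }
        }

        return(b / (high_priority
                    ? HIGH_PRIO_DIVIDER : LOW_PRIO_DIVIDER));
}

int main(void)
{
        for (unsigned long n = 0; n < 20; n++) {
                printf("iter %2lu -> %lu usec\n",
                       n, backoff_sleep_usec(n, 0));
        }
        return(0);
}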
*/ + + n_iterations++; + + goto loop; + } else { + + /* The LRU manager is not running or Oracle MySQL 5.6 algorithm + was requested, will perform a single page flush */ + ut_ad((srv_empty_free_list_algorithm + == SRV_EMPTY_FREE_LIST_LEGACY) + || !buf_lru_manager_is_active + || (srv_shutdown_state != SRV_SHUTDOWN_NONE + && srv_shutdown_state != SRV_SHUTDOWN_CLEANUP)); + } + + mutex_enter(&buf_pool->flush_state_mutex); + + if (buf_pool->init_flush[BUF_FLUSH_LRU] + && srv_use_doublewrite_buf + && buf_dblwr != NULL) { + + mutex_exit(&buf_pool->flush_state_mutex); + + /* If there is an LRU flush happening in the background + then we wait for it to end instead of trying a single + page flush. If, however, we are not using doublewrite + buffer then it is better to do our own single page + flush instead of waiting for LRU flush to end. */ + buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); + goto loop; + } + + mutex_exit(&buf_pool->flush_state_mutex); + + freed = FALSE; + if (buf_pool->try_LRU_scan || n_iterations > 0) { + + /* If no block was in the free list, search from the + end of the LRU list and try to free a block there. + If we are doing for the first time we'll scan only + tail of the LRU list otherwise we scan the whole LRU + list. */ + freed = buf_LRU_scan_and_free_block(buf_pool, + n_iterations > 0); + + if (!freed && n_iterations == 0) { + /* Tell other threads that there is no point + in scanning the LRU list. This flag is set to + TRUE again when we flush a batch from this + buffer pool. */ + buf_pool->try_LRU_scan = FALSE; + } + } + + if (freed) { + goto loop; + + } + + if (n_iterations > 20) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: difficult to find free blocks in\n" + "InnoDB: the buffer pool (%lu search iterations)!\n" + "InnoDB: %lu failed attempts to flush a page!" + " Consider\n" + "InnoDB: increasing the buffer pool size.\n" + "InnoDB: It is also possible that" + " in your Unix version\n" + "InnoDB: fsync is very slow, or" + " completely frozen inside\n" + "InnoDB: the OS kernel. Then upgrading to" + " a newer version\n" + "InnoDB: of your operating system may help." + " Look at the\n" + "InnoDB: number of fsyncs in diagnostic info below.\n" + "InnoDB: Pending flushes (fsync) log: %lu;" + " buffer pool: %lu\n" + "InnoDB: %lu OS file reads, %lu OS file writes," + " %lu OS fsyncs\n" + "InnoDB: Starting InnoDB Monitor to print further\n" + "InnoDB: diagnostics to the standard output.\n", + (ulong) n_iterations, + (ulong) flush_failures, + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); + + mon_value_was = srv_print_innodb_monitor; + started_monitor = TRUE; + srv_print_innodb_monitor = TRUE; + os_event_set(lock_sys->timeout_event); + } + + /* If we have scanned the whole LRU and still are unable to + find a free block then we should sleep here to let the + page_cleaner do an LRU batch for us. + TODO: It'd be better if we can signal the page_cleaner. Perhaps + we should use timed wait for page_cleaner. */ + if (n_iterations > 1) { + + os_thread_sleep(100000); + } + + /* No free block was found: try to flush the LRU list. + This call will flush one page from the LRU and put it on the + free list. That means that the free block is up for grabs for + all user threads. 
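The try_LRU_scan handling above is a shared, advisory hint: the first thread whose initial tail scan comes up empty clears it, so concurrent threads skip scans that are known to be futile until a flush batch sets the flag again. A minimal model of that hint (a plain bool here; the real flag is protected by the buffer pool's synchronization):

#include <stdbool.h>

struct pool_hint { bool try_LRU_scan; };

/* Returns true if a block was freed; clears the hint on a failed
first-pass scan, exactly as described above. scan() is a stand-in
for buf_LRU_scan_and_free_block(). */
static bool
try_free_block(struct pool_hint* pool, unsigned long n_iterations,
               bool (*scan)(bool scan_all))
{
        bool    freed = false;

        if (pool->try_LRU_scan || n_iterations > 0) {
                freed = scan(n_iterations > 0); /* whole list on retries */

                if (!freed && n_iterations == 0) {
                        /* Tell other threads the scan is pointless
                        until the next flush batch re-sets the hint. */
                        pool->try_LRU_scan = false;
                }
        }

        return(freed);
}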
+ TODO: A more elegant way would have been to return the freed + up block to the caller here but the code that deals with + removing the block from page_hash and LRU_list is fairly + involved (particularly in case of compressed pages). We + can do that in a separate patch sometime in future. */ + if (!buf_flush_single_page_from_LRU(buf_pool)) { + MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); + ++flush_failures; + } + + srv_stats.buf_pool_wait_free.add(n_iterations, 1); + + n_iterations++; + + goto loop; +} + +/*******************************************************************//** +Moves the LRU_old pointer so that the length of the old blocks list +is inside the allowed limits. */ +UNIV_INLINE +void +buf_LRU_old_adjust_len( +/*===================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + ulint old_len; + ulint new_len; + + ut_a(buf_pool->LRU_old); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); + ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); +#if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5) +# error "BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)" +#endif +#ifdef UNIV_LRU_DEBUG + /* buf_pool->LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool->LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + + old_len = buf_pool->LRU_old_len; + new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) + * buf_pool->LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool->LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + for (;;) { + buf_page_t* LRU_old = buf_pool->LRU_old; + + ut_a(LRU_old); + ut_ad(LRU_old->in_LRU_list); +#ifdef UNIV_LRU_DEBUG + ut_a(LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + + /* Update the LRU_old pointer if necessary */ + + if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) { + + buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV( + LRU, LRU_old); +#ifdef UNIV_LRU_DEBUG + ut_a(!LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + old_len = ++buf_pool->LRU_old_len; + buf_page_set_old(LRU_old, TRUE); + + } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { + + buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old); + old_len = --buf_pool->LRU_old_len; + buf_page_set_old(LRU_old, FALSE); + } else { + return; + } + } +} + +/*******************************************************************//** +Initializes the old blocks pointer in the LRU list. This function should be +called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */ +static +void +buf_LRU_old_init( +/*=============*/ + buf_pool_t* buf_pool) +{ + buf_page_t* bpage; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); + + /* We first initialize all blocks in the LRU list as old and then use + the adjust function to move the LRU_old pointer to the right + position */ + + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); bpage != NULL; + bpage = UT_LIST_GET_PREV(LRU, bpage)) { + ut_ad(bpage->in_LRU_list); + ut_ad(buf_page_in_file(bpage)); + /* This loop temporarily violates the + assertions of buf_page_set_old(). 
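In buf_LRU_old_adjust_len() above, the target length of the old sublist is the smaller of a ratio-based bound and a bound that reserves room for young blocks, and the LRU_old pointer only moves while the deviation exceeds BUF_LRU_OLD_TOLERANCE, which gives the loop hysteresis. The arithmetic in isolation, with illustrative constants standing in for the real definitions in the InnoDB headers, and assuming the list is already at least BUF_LRU_OLD_MIN_LEN long so the subtraction cannot wrap:

#define OLD_RATIO_DIV           1024    /* stand-in constants */
#define OLD_TOLERANCE           20
#define NON_OLD_MIN_LEN         5

static unsigned long
old_sublist_target_len(unsigned long lru_len, unsigned long old_ratio)
{
        unsigned long   by_ratio = lru_len * old_ratio / OLD_RATIO_DIV;
        unsigned long   by_floor = lru_len
                - (OLD_TOLERANCE + NON_OLD_MIN_LEN);

        return(by_ratio < by_floor ? by_ratio : by_floor);
}

/* The adjust loop nudges LRU_old one block at a time while the
deviation exceeds the tolerance, and stops inside the dead band. */
static int
needs_adjust(unsigned long old_len, unsigned long target)
{
        return(old_len + OLD_TOLERANCE < target
               || old_len > target + OLD_TOLERANCE);
}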
*/ + bpage->old = TRUE; + } + + buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU); + buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU); + + buf_LRU_old_adjust_len(buf_pool); +} + +/******************************************************************//** +Remove a block from the unzip_LRU list if it belonged to the list. */ +static +void +buf_unzip_LRU_remove_block_if_needed( +/*=================================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(buf_pool); + ut_ad(bpage); + ut_ad(buf_page_in_file(bpage)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + if (buf_page_belongs_to_unzip_LRU(bpage)) { + buf_block_t* block = (buf_block_t*) bpage; + + ut_ad(block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = FALSE); + + UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block); + } +} + +/******************************************************************//** +Removes a block from the LRU list. */ +UNIV_INLINE +void +buf_LRU_remove_block( +/*=================*/ + buf_page_t* bpage) /*!< in: control block */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ulint zip_size; + + ut_ad(buf_pool); + ut_ad(bpage); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + ut_a(buf_page_in_file(bpage)); + + ut_ad(bpage->in_LRU_list); + + /* If the LRU_old pointer is defined and points to just this block, + move it backward one step */ + + if (UNIV_UNLIKELY(bpage == buf_pool->LRU_old)) { + + /* Below: the previous block is guaranteed to exist, + because the LRU_old pointer is only allowed to differ + by BUF_LRU_OLD_TOLERANCE from strict + buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU + list length. */ + buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + + ut_a(prev_bpage); +#ifdef UNIV_LRU_DEBUG + ut_a(!prev_bpage->old); +#endif /* UNIV_LRU_DEBUG */ + buf_pool->LRU_old = prev_bpage; + buf_page_set_old(prev_bpage, TRUE); + + buf_pool->LRU_old_len++; + } + + /* Remove the block from the LRU list */ + UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage); + ut_d(bpage->in_LRU_list = FALSE); + + zip_size = page_zip_get_size(&bpage->zip); + buf_pool->stat.LRU_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE; + + buf_unzip_LRU_remove_block_if_needed(bpage); + + /* If the LRU list is so short that LRU_old is not defined, + clear the "old" flags and return */ + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { + + for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU); bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + /* This loop temporarily violates the + assertions of buf_page_set_old(). */ + bpage->old = FALSE; + } + + buf_pool->LRU_old = NULL; + buf_pool->LRU_old_len = 0; + + return; + } + + ut_ad(buf_pool->LRU_old); + + /* Update the LRU_old_len field if necessary */ + if (buf_page_is_old(bpage)) { + + buf_pool->LRU_old_len--; + } + + /* Adjust the length of the old block list if necessary */ + buf_LRU_old_adjust_len(buf_pool); +} + +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. 
*/ +UNIV_INTERN +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old) /*!< in: TRUE if should be put to the end + of the list, else put to the start */ +{ + buf_pool_t* buf_pool = buf_pool_from_block(block); + + ut_ad(buf_pool); + ut_ad(block); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); + + ut_ad(!block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = TRUE); + + if (old) { + UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block); + } else { + UT_LIST_ADD_FIRST(unzip_LRU, buf_pool->unzip_LRU, block); + } +} + +/******************************************************************//** +Adds a block to the LRU list end. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ +UNIV_INLINE +void +buf_LRU_add_block_to_end_low( +/*=========================*/ + buf_page_t* bpage) /*!< in: control block */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(buf_pool); + ut_ad(bpage); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + ut_a(buf_page_in_file(bpage)); + + ut_ad(!bpage->in_LRU_list); + UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage); + ut_d(bpage->in_LRU_list = TRUE); + + incr_LRU_size_in_bytes(bpage, buf_pool); + + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool->LRU_old); + + /* Adjust the length of the old block list if necessary */ + + buf_page_set_old(bpage, TRUE); + buf_pool->LRU_old_len++; + buf_LRU_old_adjust_len(buf_pool); + + } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(buf_pool); + } else { + buf_page_set_old(bpage, buf_pool->LRU_old != NULL); + } + + /* If this is a zipped block with decompressed frame as well + then put it on the unzip_LRU list */ + if (buf_page_belongs_to_unzip_LRU(bpage)) { + buf_unzip_LRU_add_block((buf_block_t*) bpage, TRUE); + } +} + +/******************************************************************//** +Adds a block to the LRU list. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ +UNIV_INLINE +void +buf_LRU_add_block_low( +/*==================*/ + buf_page_t* bpage, /*!< in: control block */ + ibool old) /*!< in: TRUE if should be put to the old blocks + in the LRU list, else put to the start; if the + LRU list is very short, the block is added to + the start, regardless of this parameter */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + ut_a(buf_page_in_file(bpage)); + ut_ad(!bpage->in_LRU_list); + + if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) { + + UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage); + + bpage->freed_page_clock = buf_pool->freed_page_clock; + } else { +#ifdef UNIV_LRU_DEBUG + /* buf_pool->LRU_old must be the first item in the LRU list + whose "old" flag is set. 
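buf_LRU_add_block_low() below implements midpoint insertion: a page enters at the head only if it is declared young or the list is still too short for a midpoint to exist; otherwise it enters just behind LRU_old, so pages pulled in by scans and read-ahead must be touched again before they can displace the hot head of the list. The decision reduced to a sketch (min_len models BUF_LRU_OLD_MIN_LEN):

enum lru_insert_pos { AT_HEAD, AFTER_MIDPOINT };

static enum lru_insert_pos
lru_insert_position(int old, unsigned long lru_len, unsigned long min_len)
{
        if (!old || lru_len < min_len) {
                return(AT_HEAD);        /* young, or list too short */
        }

        return(AFTER_MIDPOINT);         /* behind buf_pool->LRU_old */
}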
*/ + ut_a(buf_pool->LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old, + bpage); + buf_pool->LRU_old_len++; + } + + ut_d(bpage->in_LRU_list = TRUE); + + incr_LRU_size_in_bytes(bpage, buf_pool); + + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool->LRU_old); + + /* Adjust the length of the old block list if necessary */ + + buf_page_set_old(bpage, old); + buf_LRU_old_adjust_len(buf_pool); + + } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(buf_pool); + } else { + buf_page_set_old(bpage, buf_pool->LRU_old != NULL); + } + + /* If this is a zipped block with decompressed frame as well + then put it on the unzip_LRU list */ + if (buf_page_belongs_to_unzip_LRU(bpage)) { + buf_unzip_LRU_add_block((buf_block_t*) bpage, old); + } +} + +/******************************************************************//** +Adds a block to the LRU list. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ +UNIV_INTERN +void +buf_LRU_add_block( +/*==============*/ + buf_page_t* bpage, /*!< in: control block */ + ibool old) /*!< in: TRUE if should be put to the old + blocks in the LRU list, else put to the start; + if the LRU list is very short, the block is + added to the start, regardless of this + parameter */ +{ + buf_LRU_add_block_low(bpage, old); +} + +/******************************************************************//** +Moves a block to the start of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_young( +/*=====================*/ + buf_page_t* bpage) /*!< in: control block */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + if (bpage->old) { + buf_pool->stat.n_pages_made_young++; + } + + buf_LRU_remove_block(bpage); + buf_LRU_add_block_low(bpage, FALSE); +} + +/******************************************************************//** +Moves a block to the end of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_old( +/*===================*/ + buf_page_t* bpage) /*!< in: control block */ +{ + buf_LRU_remove_block(bpage); + buf_LRU_add_block_to_end_low(bpage); +} + +/******************************************************************//** +Try to free a block. If bpage is a descriptor of a compressed-only +page, the descriptor object will be freed as well. + +NOTE: If this function returns true, it will release the LRU list mutex, +and temporarily release and relock the buf_page_get_mutex() mutex. +Furthermore, the page frame will no longer be accessible via bpage. If this +function returns false, the buf_page_get_mutex() might be temporarily released +and relocked too. + +The caller must hold the LRU list and buf_page_get_mutex() mutexes. + +@return true if freed, false otherwise. 
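A condensed restatement of this locking contract, mirroring how buf_LRU_scan_and_free_block() and buf_LRU_free_from_common_LRU_list() above actually drive the call; this is the file's existing API, not new code:

mutex_enter(&buf_pool->LRU_list_mutex);
mutex_enter(block_mutex);

freed = buf_LRU_free_page(bpage, true);

mutex_exit(block_mutex);        /* held again on either outcome */

if (!freed) {
        /* only on false is the LRU list mutex still owned here */
        mutex_exit(&buf_pool->LRU_list_mutex);
}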
*/ +UNIV_INTERN +bool +buf_LRU_free_page( +/*===============*/ + buf_page_t* bpage, /*!< in: block to be freed */ + bool zip) /*!< in: true if should remove also the + compressed page of an uncompressed page */ +{ + buf_page_t* b = NULL; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + const ulint fold = buf_page_address_fold(bpage->space, + bpage->offset); + prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(block_mutex)); + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + if (!buf_page_can_relocate(bpage)) { + + /* Do not free buffer fixed or I/O-fixed blocks. */ + return(false); + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0); +#endif /* UNIV_IBUF_COUNT_DEBUG */ + + if (zip || !bpage->zip.data) { + /* This would completely free the block. */ + /* Do not completely free dirty blocks. */ + + if (bpage->oldest_modification) { + return(false); + } + } else if (bpage->oldest_modification > 0 + && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); + + return(false); + + } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { + b = buf_page_alloc_descriptor(); + ut_a(b); + } + + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + ut_ad(!bpage->in_flush_list == !bpage->oldest_modification); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, "Putting space %lu page %lu to free list\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif /* UNIV_DEBUG */ + + mutex_exit(block_mutex); + + rw_lock_x_lock(hash_lock); + mutex_enter(block_mutex); + + if (UNIV_UNLIKELY(!buf_page_can_relocate(bpage) + || ((zip || !bpage->zip.data) + && bpage->oldest_modification))) { + +not_freed: + rw_lock_x_unlock(hash_lock); + if (b) { + buf_page_free_descriptor(b); + } + + return(false); + } else if (UNIV_UNLIKELY(bpage->oldest_modification + && (buf_page_get_state(bpage) + != BUF_BLOCK_FILE_PAGE))) { + + ut_ad(buf_page_get_state(bpage) + == BUF_BLOCK_ZIP_DIRTY); + goto not_freed; + } + + if (b) { + memcpy(b, bpage, sizeof *b); + } + + if (!buf_LRU_block_remove_hashed(bpage, zip)) { + + mutex_exit(&buf_pool->LRU_list_mutex); + + if (b) { + buf_page_free_descriptor(b); + } + + mutex_enter(block_mutex); + + return(true); + } + +#ifdef UNIV_SYNC_DEBUG + /* buf_LRU_block_remove_hashed() releases the hash_lock */ + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX) + && !rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL + then it was a compressed page with an uncompressed frame and + we are interested in freeing only the uncompressed frame. + Therefore we have to reinsert the compressed page descriptor + into the LRU and page_hash (and possibly flush_list). + if b == NULL then it was a regular page that has been freed */ + + if (b) { + buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); + + rw_lock_x_lock(hash_lock); + mutex_enter(block_mutex); + + ut_a(!buf_page_hash_get_low( + buf_pool, b->space, b->offset, fold)); + + b->state = b->oldest_modification + ? 
BUF_BLOCK_ZIP_DIRTY + : BUF_BLOCK_ZIP_PAGE; + UNIV_MEM_DESC(b->zip.data, + page_zip_get_size(&b->zip)); + + /* The fields in_page_hash and in_LRU_list of + the to-be-freed block descriptor should have + been cleared in + buf_LRU_block_remove_hashed(), which + invokes buf_LRU_remove_block(). */ + ut_ad(!bpage->in_page_hash); + ut_ad(!bpage->in_LRU_list); + /* bpage->state was BUF_BLOCK_FILE_PAGE because + b != NULL. The type cast below is thus valid. */ + ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list); + + /* The fields of bpage were copied to b before + buf_LRU_block_remove_hashed() was invoked. */ + ut_ad(!b->in_zip_hash); + ut_ad(b->in_page_hash); + ut_ad(b->in_LRU_list); + + HASH_INSERT(buf_page_t, hash, + buf_pool->page_hash, fold, b); + + /* Insert b where bpage was in the LRU list. */ + if (UNIV_LIKELY(prev_b != NULL)) { + ulint lru_len; + + ut_ad(prev_b->in_LRU_list); + ut_ad(buf_page_in_file(prev_b)); + UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, + prev_b, b); + + incr_LRU_size_in_bytes(b, buf_pool); + + if (buf_page_is_old(b)) { + buf_pool->LRU_old_len++; + if (UNIV_UNLIKELY + (buf_pool->LRU_old + == UT_LIST_GET_NEXT(LRU, b))) { + + buf_pool->LRU_old = b; + } + } + + lru_len = UT_LIST_GET_LEN(buf_pool->LRU); + + if (lru_len > BUF_LRU_OLD_MIN_LEN) { + ut_ad(buf_pool->LRU_old); + /* Adjust the length of the + old block list if necessary */ + buf_LRU_old_adjust_len(buf_pool); + } else if (lru_len == BUF_LRU_OLD_MIN_LEN) { + /* The LRU list is now long + enough for LRU_old to become + defined: init it */ + buf_LRU_old_init(buf_pool); + } +#ifdef UNIV_LRU_DEBUG + /* Check that the "old" flag is consistent + in the block and its neighbours. */ + buf_page_set_old(b, buf_page_is_old(b)); +#endif /* UNIV_LRU_DEBUG */ + } else { + ut_d(b->in_LRU_list = FALSE); + buf_LRU_add_block_low(b, buf_page_is_old(b)); + } + + mutex_enter(&buf_pool->zip_mutex); + rw_lock_x_unlock(hash_lock); + if (b->state == BUF_BLOCK_ZIP_PAGE) { +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + buf_LRU_insert_zip_clean(b); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + } else { + /* Relocate on buf_pool->flush_list. */ + buf_flush_relocate_on_flush_list(bpage, b); + } + + bpage->zip.data = NULL; + page_zip_set_size(&bpage->zip, 0); + + /* Prevent buf_page_get_gen() from + decompressing the block while we release block_mutex. */ + buf_page_set_sticky(b); + mutex_exit(&buf_pool->zip_mutex); + mutex_exit(block_mutex); + + } + + mutex_exit(&buf_pool->LRU_list_mutex); + + /* Remove possible adaptive hash index on the page. + The page was declared uninitialized by + buf_LRU_block_remove_hashed(). We need to flag + the contents of the page valid (which it still is) in + order to avoid bogus Valgrind warnings.*/ + + UNIV_MEM_VALID(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + btr_search_drop_page_hash_index((buf_block_t*) bpage); + UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + + if (b) { + ib_uint32_t checksum; + /* Compute and stamp the compressed page + checksum while not holding any mutex. The + block is already half-freed + (BUF_BLOCK_REMOVE_HASH) and removed from + buf_pool->page_hash, thus inaccessible by any + other thread. 
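The stamping just below writes the recomputed checksum big-endian into the first four bytes of the compressed frame: FIL_PAGE_SPACE_OR_CHKSUM is byte offset 0 of the page header, and mach_write_to_4() is a big-endian store. Its moral equivalent as a self-contained sketch:

#include <stdint.h>

#define PAGE_SPACE_OR_CHKSUM    0       /* header offset of the field */

/* Big-endian 4-byte store, like mach_write_to_4(). */
static void
stamp_zip_checksum(unsigned char* zip_data, uint32_t checksum)
{
        unsigned char*  p = zip_data + PAGE_SPACE_OR_CHKSUM;

        p[0] = (unsigned char) (checksum >> 24); /* most significant first */
        p[1] = (unsigned char) (checksum >> 16);
        p[2] = (unsigned char) (checksum >> 8);
        p[3] = (unsigned char) (checksum);
}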
*/ + + checksum = static_cast<ib_uint32_t>( + page_zip_calc_checksum( + b->zip.data, + page_zip_get_size(&b->zip), + static_cast<srv_checksum_algorithm_t>( + srv_checksum_algorithm))); + + mach_write_to_4(b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM, + checksum); + } + + mutex_enter(block_mutex); + + if (b) { + mutex_enter(&buf_pool->zip_mutex); + buf_page_unset_sticky(b); + mutex_exit(&buf_pool->zip_mutex); + } + + buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + ut_ad(mutex_own(block_mutex)); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + return(true); +} + +/******************************************************************//** +Puts a block back to the free list. */ +UNIV_INTERN +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block) /*!< in: block, must not contain a file page */ +{ + void* data; + buf_pool_t* buf_pool = buf_pool_from_block(block); + + ut_ad(block); + ut_ad(mutex_own(&block->mutex)); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_READY_FOR_USE: + break; + default: + ut_error; + } + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->n_pointers == 0); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + ut_ad(!block->page.in_free_list); + ut_ad(!block->page.in_flush_list); + ut_ad(!block->page.in_LRU_list); + + UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); +#ifdef UNIV_DEBUG + /* Wipe contents of page to reveal possible stale pointers to it */ + memset(block->frame, '\0', UNIV_PAGE_SIZE); +#else + /* Wipe page_no and space_id */ + memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4); + memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4); +#endif + data = block->page.zip.data; + + if (data) { + block->page.zip.data = NULL; + mutex_exit(&block->mutex); + + buf_buddy_free( + buf_pool, data, page_zip_get_size(&block->page.zip)); + + mutex_enter(&block->mutex); + page_zip_set_size(&block->page.zip, 0); + } + + mutex_enter_first(&buf_pool->free_list_mutex); + buf_block_set_state(block, BUF_BLOCK_NOT_USED); + UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page)); + ut_d(block->page.in_free_list = TRUE); + mutex_exit(&buf_pool->free_list_mutex); + + UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE); +} + +/******************************************************************//** +Takes a block out of the LRU list and page hash table. +If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), +the object will be freed. + +The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex +and the appropriate hash_lock. This function will release the +buf_page_get_mutex() and the hash_lock. + +If a compressed page is freed other compressed pages may be relocated. +@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The +caller needs to free the page to the free list +@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In +this case the block is already returned to the buddy allocator. 
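The two return values of buf_LRU_block_remove_hashed() translate directly into the caller pattern used elsewhere in this file (see buf_LRU_free_one_page() below); nothing here is new code, it just restates the contract:

if (buf_LRU_block_remove_hashed(bpage, true)) {
        /* BUF_BLOCK_FILE_PAGE was unhashed: the frame is ours
        to put back on the free list. */
        mutex_enter(block_mutex);
        buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
        mutex_exit(block_mutex);
}
/* else: BUF_BLOCK_ZIP_PAGE; the descriptor and compressed frame
have already been returned to the buddy allocator. */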
*/ +static +bool +buf_LRU_block_remove_hashed( +/*========================*/ + buf_page_t* bpage, /*!< in: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ + bool zip) /*!< in: true if should remove also the + compressed page of an uncompressed page */ +{ + ulint fold; + const buf_page_t* hashed_bpage; + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + prio_rw_lock_t* hash_lock; + + ut_ad(bpage); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + fold = buf_page_address_fold(bpage->space, bpage->offset); + hash_lock = buf_page_hash_lock_get(buf_pool, fold); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + ut_a(bpage->buf_fix_count == 0); + + buf_LRU_remove_block(bpage); + + buf_pool->freed_page_clock += 1; + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_FILE_PAGE: + UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t)); + UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + buf_block_modify_clock_inc((buf_block_t*) bpage); + if (bpage->zip.data) { + const page_t* page = ((buf_block_t*) bpage)->frame; + const ulint zip_size + = page_zip_get_size(&bpage->zip); + + ut_a(!zip || bpage->oldest_modification == 0); + + switch (UNIV_EXPECT(fil_page_get_type(page), + FIL_PAGE_INDEX)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + if (!zip) { + /* InnoDB writes the data to the + uncompressed page frame. Copy it + to the compressed page, which will + be preserved. 
*/ + memcpy(bpage->zip.data, page, + zip_size); + } + break; + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + case FIL_PAGE_INDEX: +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate( + &bpage->zip, page, + ((buf_block_t*) bpage)->index)); +#endif /* UNIV_ZIP_DEBUG */ + break; + default: + ut_print_timestamp(stderr); + fputs(" InnoDB: ERROR: The compressed page" + " to be evicted seems corrupt:", stderr); + ut_print_buf(stderr, page, zip_size); + fputs("\nInnoDB: Possibly older version" + " of the page:", stderr); + ut_print_buf(stderr, bpage->zip.data, + zip_size); + putc('\n', stderr); + ut_error; + } + + break; + } + /* fall through */ + case BUF_BLOCK_ZIP_PAGE: + ut_a(bpage->oldest_modification == 0); + UNIV_MEM_ASSERT_W(bpage->zip.data, + page_zip_get_size(&bpage->zip)); + break; + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + + hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->space, + bpage->offset, fold); + + if (UNIV_UNLIKELY(bpage != hashed_bpage)) { + fprintf(stderr, + "InnoDB: Error: page %lu %lu not found" + " in the hash table\n", + (ulong) bpage->space, + (ulong) bpage->offset); + if (hashed_bpage) { + fprintf(stderr, + "InnoDB: In hash table we find block" + " %p of %lu %lu which is not %p\n", + (const void*) hashed_bpage, + (ulong) hashed_bpage->space, + (ulong) hashed_bpage->offset, + (const void*) bpage); + } + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + mutex_exit(buf_page_get_mutex(bpage)); + rw_lock_x_unlock(hash_lock); + mutex_exit(&buf_pool->LRU_list_mutex); + buf_print(); + buf_LRU_print(); + buf_validate(); + buf_LRU_validate(); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_error; + } + + ut_ad(!bpage->in_zip_hash); + ut_ad(bpage->in_page_hash); + ut_d(bpage->in_page_hash = FALSE); + HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage); + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_PAGE: + ut_ad(!bpage->in_free_list); + ut_ad(!bpage->in_flush_list); + ut_ad(!bpage->in_LRU_list); + ut_a(bpage->zip.data); + ut_a(buf_page_get_zip_size(bpage)); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + mutex_exit(&buf_pool->zip_mutex); + rw_lock_x_unlock(hash_lock); + + buf_buddy_free( + buf_pool, bpage->zip.data, + page_zip_get_size(&bpage->zip)); + + buf_page_free_descriptor(bpage); + return(false); + + case BUF_BLOCK_FILE_PAGE: + memset(((buf_block_t*) bpage)->frame + + FIL_PAGE_OFFSET, 0xff, 4); + memset(((buf_block_t*) bpage)->frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH); + + if (buf_pool->flush_rbt == NULL) { + bpage->space = ULINT32_UNDEFINED; + bpage->offset = ULINT32_UNDEFINED; + } + + /* Question: If we release bpage and hash mutex here + then what protects us against: + 1) Some other thread buffer fixing this page + 2) Some other thread trying to read this page and + not finding it in buffer pool attempting to read it + from the disk. + Answer: + 1) Cannot happen because the page is no longer in the + page_hash. Only possibility is when while invalidating + a tablespace we buffer fix the prev_page in LRU to + avoid relocation during the scan. But that is not + possible because we are holding LRU list mutex. 
+ + 2) Not possible because in buf_page_init_for_read() + we do a look up of page_hash while holding LRU list + mutex and since we are holding LRU list mutex here + and by the time we'll release it in the caller we'd + have inserted the compressed only descriptor in the + page_hash. */ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + rw_lock_x_unlock(hash_lock); + mutex_exit(&((buf_block_t*) bpage)->mutex); + + if (zip && bpage->zip.data) { + /* Free the compressed page. */ + void* data = bpage->zip.data; + bpage->zip.data = NULL; + + ut_ad(!bpage->in_free_list); + ut_ad(!bpage->in_flush_list); + ut_ad(!bpage->in_LRU_list); + + buf_buddy_free( + buf_pool, data, + page_zip_get_size(&bpage->zip)); + + page_zip_set_size(&bpage->zip, 0); + } + + return(true); + + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + ut_error; + return(false); +} + +/******************************************************************//** +Puts a file page whose has no hash index to the free list. */ +static +void +buf_LRU_block_free_hashed_page( +/*===========================*/ + buf_block_t* block) /*!< in: block, must contain a file page and + be in a state where it can be freed */ +{ + ut_ad(mutex_own(&block->mutex)); + + buf_block_set_state(block, BUF_BLOCK_MEMORY); + + buf_LRU_block_free_non_file_page(block); +} + +/******************************************************************//** +Remove one page from LRU list and put it to free list */ +UNIV_INTERN +void +buf_LRU_free_one_page( +/*==================*/ + buf_page_t* bpage) /*!< in/out: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ +{ +#if defined(UNIV_DEBUG) || defined(UNIV_SYNC_DEBUG) + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); +#endif +#ifdef UNIV_SYNC_DEBUG + const ulint fold = buf_page_address_fold(bpage->space, + bpage->offset); + prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); +#endif + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(block_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX)); +#endif + + if (buf_LRU_block_remove_hashed(bpage, true)) { + mutex_enter(block_mutex); + buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + mutex_exit(block_mutex); + } + + /* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX) + && !rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!mutex_own(block_mutex)); +} + +/**********************************************************************//** +Updates buf_pool->LRU_old_ratio for one buffer pool instance. +@return updated old_pct */ +static +uint +buf_LRU_old_ratio_update_instance( +/*==============================*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. 
*/ + ibool adjust) /*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_pool->LRU_old_ratio + during the initialization of InnoDB */ +{ + uint ratio; + + ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100; + if (ratio < BUF_LRU_OLD_RATIO_MIN) { + ratio = BUF_LRU_OLD_RATIO_MIN; + } else if (ratio > BUF_LRU_OLD_RATIO_MAX) { + ratio = BUF_LRU_OLD_RATIO_MAX; + } + + if (adjust) { + mutex_enter(&buf_pool->LRU_list_mutex); + + if (ratio != buf_pool->LRU_old_ratio) { + buf_pool->LRU_old_ratio = ratio; + + if (UT_LIST_GET_LEN(buf_pool->LRU) + >= BUF_LRU_OLD_MIN_LEN) { + + buf_LRU_old_adjust_len(buf_pool); + } + } + + mutex_exit(&buf_pool->LRU_list_mutex); + } else { + buf_pool->LRU_old_ratio = ratio; + } + /* the reverse of + ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */ + return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5)); +} + +/**********************************************************************//** +Updates buf_pool->LRU_old_ratio. +@return updated old_pct */ +UNIV_INTERN +ulint +buf_LRU_old_ratio_update( +/*=====================*/ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. */ + ibool adjust) /*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_pool->LRU_old_ratio + during the initialization of InnoDB */ +{ + ulint i; + ulint new_ratio = 0; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + new_ratio = buf_LRU_old_ratio_update_instance( + buf_pool, old_pct, adjust); + } + + return(new_ratio); +} + +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +UNIV_INTERN +void +buf_LRU_stat_update(void) +/*=====================*/ +{ + ulint i; + buf_LRU_stat_t* item; + buf_pool_t* buf_pool; + ibool evict_started = FALSE; + buf_LRU_stat_t cur_stat; + + /* If we haven't started eviction yet then don't update stats. */ + for (i = 0; i < srv_buf_pool_instances; i++) { + + buf_pool = buf_pool_from_array(i); + + if (buf_pool->freed_page_clock != 0) { + evict_started = TRUE; + break; + } + } + + if (!evict_started) { + goto func_exit; + } + + /* Update the index. */ + item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; + buf_LRU_stat_arr_ind++; + buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL; + + /* Add the current value and subtract the obsolete entry. + Since buf_LRU_stat_cur is not protected by any mutex, + it can be changing between adding to buf_LRU_stat_sum + and copying to item. Assign it to local variables to make + sure the same value assign to the buf_LRU_stat_sum + and item */ + cur_stat = buf_LRU_stat_cur; + + buf_LRU_stat_sum.io += cur_stat.io - item->io; + buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip; + + /* Put current entry in the array. */ + memcpy(item, &cur_stat, sizeof *item); + +func_exit: + /* Clear the current entry. */ + memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Validates the LRU list for one buffer pool instance. 
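buf_LRU_stat_update() above maintains a sliding-window sum over the last BUF_LRU_STAT_N_INTERVAL intervals with a circular array: the newest snapshot is added and the sample it overwrites is subtracted, so the sum never has to be recomputed from scratch. The mechanism in isolation (the window size here is a stand-in):

#define N_INTERVAL      50      /* stand-in for BUF_LRU_STAT_N_INTERVAL */

struct lru_stat { unsigned long io; unsigned long unzip; };

static struct lru_stat  arr[N_INTERVAL];        /* ring buffer */
static unsigned long    arr_ind;                /* next slot to overwrite */
static struct lru_stat  sum;                    /* running window sum */

/* Fold one interval's counters into the window; the input is copied
first so the same snapshot feeds both the sum and the array, as the
comment in buf_LRU_stat_update() explains. */
static void
stat_update(const struct lru_stat* cur_unstable)
{
        struct lru_stat         cur = *cur_unstable;    /* snapshot */
        struct lru_stat*        item = &arr[arr_ind];

        arr_ind = (arr_ind + 1) % N_INTERVAL;

        sum.io += cur.io - item->io;
        sum.unzip += cur.unzip - item->unzip;

        *item = cur;
}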
*/ +static +void +buf_LRU_validate_instance( +/*======================*/ + buf_pool_t* buf_pool) +{ + buf_page_t* bpage; + buf_block_t* block; + ulint old_len; + ulint new_len; + + ut_ad(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { + + ut_a(buf_pool->LRU_old); + old_len = buf_pool->LRU_old_len; + new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) + * buf_pool->LRU_old_ratio + / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool->LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); + ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); + } + + UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()); + + old_len = 0; + + for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + case BUF_BLOCK_FILE_PAGE: + ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list + == buf_page_belongs_to_unzip_LRU(bpage)); + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + break; + } + + if (buf_page_is_old(bpage)) { + const buf_page_t* prev + = UT_LIST_GET_PREV(LRU, bpage); + const buf_page_t* next + = UT_LIST_GET_NEXT(LRU, bpage); + + if (!old_len++) { + ut_a(buf_pool->LRU_old == bpage); + } else { + ut_a(!prev || buf_page_is_old(prev)); + } + + ut_a(!next || buf_page_is_old(next)); + } + } + + ut_a(buf_pool->LRU_old_len == old_len); + + mutex_exit(&buf_pool->LRU_list_mutex); + + mutex_enter(&buf_pool->free_list_mutex); + + UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free, CheckInFreeList()); + + for (bpage = UT_LIST_GET_FIRST(buf_pool->free); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED); + } + + mutex_exit(&buf_pool->free_list_mutex); + + mutex_enter(&buf_pool->LRU_list_mutex); + + UT_LIST_VALIDATE( + unzip_LRU, buf_block_t, buf_pool->unzip_LRU, + CheckUnzipLRUAndLRUList()); + + for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU); + block; + block = UT_LIST_GET_NEXT(unzip_LRU, block)) { + + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); + } + + mutex_exit(&buf_pool->LRU_list_mutex); +} + +/**********************************************************************//** +Validates the LRU list. +@return TRUE */ +UNIV_INTERN +ibool +buf_LRU_validate(void) +/*==================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_LRU_validate_instance(buf_pool); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Prints the LRU list for one buffer pool instance. 
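The validation loop above enforces that the old flags form a single contiguous run at the tail of the LRU list, with buf_pool->LRU_old as the first old block. Over a plain array walked from head to tail, the invariant is just "no young entry after an old one":

/* Returns 1 if the old flags form one run ending at the tail,
i.e. the list looks like [young ... young old ... old]. */
static int
old_flags_are_suffix(const int* old, unsigned long len)
{
        unsigned long   i;
        int             seen_old = 0;

        for (i = 0; i < len; i++) {     /* head to tail */
                if (old[i]) {
                        seen_old = 1;
                } else if (seen_old) {
                        return(0);      /* young after old: broken */
                }
        }

        return(1);
}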
*/ +UNIV_INTERN +void +buf_LRU_print_instance( +/*===================*/ + buf_pool_t* buf_pool) +{ + const buf_page_t* bpage; + + ut_ad(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + + bpage = UT_LIST_GET_FIRST(buf_pool->LRU); + + while (bpage != NULL) { + + mutex_enter(buf_page_get_mutex(bpage)); + fprintf(stderr, "BLOCK space %lu page %lu ", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + + if (buf_page_is_old(bpage)) { + fputs("old ", stderr); + } + + if (bpage->buf_fix_count) { + fprintf(stderr, "buffix count %lu ", + (ulong) bpage->buf_fix_count); + } + + if (buf_page_get_io_fix(bpage)) { + fprintf(stderr, "io_fix %lu ", + (ulong) buf_page_get_io_fix(bpage)); + } + + if (bpage->oldest_modification) { + fputs("modif. ", stderr); + } + + switch (buf_page_get_state(bpage)) { + const byte* frame; + case BUF_BLOCK_FILE_PAGE: + frame = buf_block_get_frame((buf_block_t*) bpage); + fprintf(stderr, "\ntype %lu" + " index id %llu\n", + (ulong) fil_page_get_type(frame), + (ullint) btr_page_get_index_id(frame)); + break; + case BUF_BLOCK_ZIP_PAGE: + frame = bpage->zip.data; + fprintf(stderr, "\ntype %lu size %lu" + " index id %llu\n", + (ulong) fil_page_get_type(frame), + (ulong) buf_page_get_zip_size(bpage), + (ullint) btr_page_get_index_id(frame)); + break; + + default: + fprintf(stderr, "\n!state %lu!\n", + (ulong) buf_page_get_state(bpage)); + break; + } + + mutex_exit(buf_page_get_mutex(bpage)); + bpage = UT_LIST_GET_NEXT(LRU, bpage); + } + + mutex_exit(&buf_pool->LRU_list_mutex); +} + +/**********************************************************************//** +Prints the LRU list. */ +UNIV_INTERN +void +buf_LRU_print(void) +/*===============*/ +{ + ulint i; + buf_pool_t* buf_pool; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool = buf_pool_from_array(i); + buf_LRU_print_instance(buf_pool); + } +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/buf/buf0rea.cc b/storage/xtradb/buf/buf0rea.cc new file mode 100644 index 00000000000..c28df72df92 --- /dev/null +++ b/storage/xtradb/buf/buf0rea.cc @@ -0,0 +1,1026 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0rea.cc +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0rea.h" + +#include "fil0fil.h" +#include "mtr0mtr.h" + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "buf0dblwr.h" +#include "ibuf0ibuf.h" +#include "log0recv.h" +#include "trx0sys.h" +#include "os0file.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" + +/** There must be at least this many pages in buf_pool in the area to start +a random read-ahead */ +#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b) \ + (5 + BUF_READ_AHEAD_AREA(b) / 8) + +/** If there are buf_pool->curr_size per the number below pending reads, then +read-ahead is not done: this is to prevent flooding the buffer pool with +i/o-fixed buffer blocks */ +#define BUF_READ_AHEAD_PEND_LIMIT 2 + +/********************************************************************//** +Unfixes the pages, unlatches the page, +removes it from page_hash and removes it from LRU. */ +static +void +buf_read_page_handle_error( +/*=======================*/ + buf_page_t* bpage) /*!< in: pointer to the block */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + const bool uncompressed = (buf_page_get_state(bpage) + == BUF_BLOCK_FILE_PAGE); + const ulint fold = buf_page_address_fold(bpage->space, + bpage->offset); + prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold); + + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(hash_lock); + mutex_enter(buf_page_get_mutex(bpage)); + + /* First unfix and release lock on the bpage */ + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ); + ut_ad(bpage->buf_fix_count == 0); + + /* Set BUF_IO_NONE before we remove the block from LRU list */ + buf_page_set_io_fix(bpage, BUF_IO_NONE); + + if (uncompressed) { + rw_lock_x_unlock_gen( + &((buf_block_t*) bpage)->lock, + BUF_IO_READ); + } + + /* remove the block from LRU list */ + buf_LRU_free_one_page(bpage); + + mutex_exit(&buf_pool->LRU_list_mutex); + + ut_ad(buf_pool->n_pend_reads > 0); + os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1); +} + +/********************************************************************//** +Low-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there, in which case does nothing. +Sets the io_fix flag and sets an exclusive lock on the buffer frame. The +flag is cleared and the x-lock released by an i/o-handler thread. +@return 1 if a read request was queued, 0 if the page already resided +in buf_pool, or if the page is in the doublewrite buffer blocks in +which case it is never read into the pool, or if the tablespace does +not exist or is being dropped +@return 1 if read request is issued. 
0 if it is not */ +UNIV_INTERN +ulint +buf_read_page_low( +/*==============*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are + trying to read from a non-existent tablespace, or a + tablespace which is just now being dropped */ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ..., + ORed to OS_AIO_SIMULATED_WAKE_LATER (see below + at read-ahead functions) */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + ibool unzip, /*!< in: TRUE=request uncompressed page */ + ib_int64_t tablespace_version, /*!< in: if the space memory object has + this timestamp different from what we are giving here, + treat the tablespace as dropped; this is a timestamp we + use to stop dangling page reads from a tablespace + which we have DISCARDed + IMPORTed back */ + ulint offset, /*!< in: page number */ + trx_t* trx) +{ + buf_page_t* bpage; + ulint wake_later; + ibool ignore_nonexistent_pages; + + *err = DB_SUCCESS; + + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; + mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; + + ignore_nonexistent_pages = mode & BUF_READ_IGNORE_NONEXISTENT_PAGES; + mode &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES; + + if (space == TRX_SYS_SPACE && buf_dblwr_page_inside(offset)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: trying to read" + " doublewrite buffer page %lu\n", + (ulong) offset); + + return(0); + } + + if (ibuf_bitmap_page(zip_size, offset) + || trx_sys_hdr_page(space, offset)) { + + /* Trx sys header is so low in the latching order that we play + safe and do not leave the i/o-completion to an asynchronous + i/o-thread. Ibuf bitmap pages must always be read with + syncronous i/o, to make sure they do not get involved in + thread deadlocks. */ + + sync = true; + } + + /* The following call will also check if the tablespace does not exist + or is being dropped; if we succeed in initing the page in the buffer + pool for read, then DISCARD cannot proceed until the read has + completed */ + bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip, + tablespace_version, offset); + if (bpage == NULL) { + /* bugfix: http://bugs.mysql.com/bug.php?id=43948 */ + if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) { + /* hashed log recs must be treated here */ + recv_addr_t* recv_addr; + + mutex_enter(&(recv_sys->mutex)); + + if (recv_sys->apply_log_recs == FALSE) { + mutex_exit(&(recv_sys->mutex)); + goto not_to_recover; + } + + /* recv_get_fil_addr_struct() */ + recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash, + hash_calc_hash(ut_fold_ulint_pair(space, offset), + recv_sys->addr_hash)); + while (recv_addr) { + if ((recv_addr->space == space) + && (recv_addr->page_no == offset)) { + break; + } + recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr); + } + + if ((recv_addr == NULL) + || (recv_addr->state == RECV_BEING_PROCESSED) + || (recv_addr->state == RECV_PROCESSED)) { + mutex_exit(&(recv_sys->mutex)); + goto not_to_recover; + } + + fprintf(stderr, " (cannot find space: %lu)", space); + recv_addr->state = RECV_PROCESSED; + + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + + mutex_exit(&(recv_sys->mutex)); + } +not_to_recover: + + return(0); + } + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Posting read request for page %lu, sync %s\n", + (ulong) offset, sync ? 
"true" : "false"); + } +#endif + + ut_ad(buf_page_in_file(bpage)); + ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex)); + + if (sync) { + thd_wait_begin(NULL, THD_WAIT_DISKIO); + } + + if (zip_size) { + *err = _fil_io(OS_FILE_READ | wake_later + | ignore_nonexistent_pages, + sync, space, zip_size, offset, 0, zip_size, + bpage->zip.data, bpage, trx); + } else { + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + + *err = _fil_io(OS_FILE_READ | wake_later + | ignore_nonexistent_pages, + sync, space, 0, offset, 0, UNIV_PAGE_SIZE, + ((buf_block_t*) bpage)->frame, bpage, trx); + } + + if (sync) { + thd_wait_end(NULL); + } + + if (*err != DB_SUCCESS) { + if (ignore_nonexistent_pages || *err == DB_TABLESPACE_DELETED) { + buf_read_page_handle_error(bpage); + return(0); + } + SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS, + bpage->is_corrupt = TRUE;); + } + + if (sync) { + /* The i/o is already completed when we arrive from + fil_read */ + if (!buf_page_io_complete(bpage)) { + return(0); + } + } + + return(1); +} + +/********************************************************************//** +Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! +@return number of page read requests issued */ +UNIV_INTERN +ulint +buf_read_ahead_random( +/*==================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes, + or 0 */ + ulint offset, /*!< in: page number of a page which + the current thread wants to access */ + ibool inside_ibuf, /*!< in: TRUE if we are inside ibuf + routine */ + trx_t* trx) +{ + buf_pool_t* buf_pool = buf_pool_get(space, offset); + ib_int64_t tablespace_version; + ulint recent_blocks = 0; + ulint ibuf_mode; + ulint count; + ulint low, high; + dberr_t err; + ulint i; + const ulint buf_read_ahead_random_area + = BUF_READ_AHEAD_AREA(buf_pool); + + if (!srv_random_read_ahead) { + /* Disabled by user */ + return(0); + } + + if (srv_startup_is_before_trx_rollback_phase) { + /* No read-ahead to avoid thread deadlocks */ + return(0); + } + + if (ibuf_bitmap_page(zip_size, offset) + || trx_sys_hdr_page(space, offset)) { + + /* If it is an ibuf bitmap page or trx sys hdr, we do + no read-ahead, as that could break the ibuf page access + order */ + + return(0); + } + + /* Remember the tablespace version before we ask te tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! 
*/
+
+	tablespace_version = fil_space_get_version(space);
+
+	low  = (offset / buf_read_ahead_random_area)
+		* buf_read_ahead_random_area;
+	high = (offset / buf_read_ahead_random_area + 1)
+		* buf_read_ahead_random_area;
+	if (high > fil_space_get_size(space)) {
+
+		high = fil_space_get_size(space);
+	}
+
+	if (buf_pool->n_pend_reads
+	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+
+		return(0);
+	}
+
+	/* Count how many blocks in the area have been recently accessed,
+	that is, reside near the start of the LRU list. */
+
+	for (i = low; i < high; i++) {
+
+		prio_rw_lock_t*	hash_lock;
+
+		const buf_page_t*	bpage =
+			buf_page_hash_get_s_locked(buf_pool, space, i,
+						   &hash_lock);
+
+		if (bpage
+		    && buf_page_is_accessed(bpage)
+		    && buf_page_peek_if_young(bpage)) {
+
+			recent_blocks++;
+
+			if (recent_blocks
+			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {
+
+				rw_lock_s_unlock(hash_lock);
+				goto read_ahead;
+			}
+		}
+
+		if (bpage) {
+			rw_lock_s_unlock(hash_lock);
+		}
+	}
+
+	/* Do nothing */
+	return(0);
+
+read_ahead:
+	/* Read all the suitable blocks within the area */
+
+	if (inside_ibuf) {
+		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
+	} else {
+		ibuf_mode = BUF_READ_ANY_PAGE;
+	}
+
+	count = 0;
+
+	for (i = low; i < high; i++) {
+		/* It is only sensible to do read-ahead in the non-sync aio
+		mode: hence false as the sync parameter */
+
+		if (!ibuf_bitmap_page(zip_size, i)) {
+			count += buf_read_page_low(
+				&err, false,
+				ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
+				space, zip_size, FALSE,
+				tablespace_version, i, trx);
+			if (err == DB_TABLESPACE_DELETED) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Warning: in random"
+					" readahead trying to access\n"
+					"InnoDB: tablespace %lu page %lu,\n"
+					"InnoDB: but the tablespace does not"
+					" exist or is just being dropped.\n",
+					(ulong) space, (ulong) i);
+			}
+		}
+	}
+
+	/* In simulated aio we wake the aio handler threads only after
+	queuing all aio requests, in native aio the following call does
+	nothing: */
+
+	os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints && (count > 0)) {
+		fprintf(stderr,
+			"Random read-ahead space %lu offset %lu pages %lu\n",
+			(ulong) space, (ulong) offset,
+			(ulong) count);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Read ahead is considered one I/O operation for the purpose of
+	LRU policy decision. */
+	buf_LRU_stat_inc_io();
+
+	buf_pool->stat.n_ra_pages_read_rnd += count;
+	srv_stats.buf_pool_reads.add(count);
+	return(count);
+}
+
+/********************************************************************//**
+High-level function which reads a page synchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame; the flag is cleared and the x-lock
+released once the read completes. 
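+For example (an illustrative sketch only; the space and page numbers are
+hypothetical), a caller that needs page 42 of uncompressed tablespace 5
+resident in the buffer pool could call:
+
+	if (buf_read_page(5, 0, 42, NULL)) {
+		/* a read was issued and has completed */
+	}
+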
+@return TRUE if page has been read in, FALSE in case of failure or if no
+read was issued (e.g. the page was already in the buffer pool) */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	offset,	/*!< in: page number */
+	trx_t*	trx)
+{
+	ib_int64_t	tablespace_version;
+	ulint		count;
+	dberr_t		err;
+
+	tablespace_version = fil_space_get_version(space);
+
+	/* We do the i/o in the synchronous aio mode to save thread
+	switches: hence true */
+
+	count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
+				  zip_size, FALSE,
+				  tablespace_version, offset, trx);
+	srv_stats.buf_pool_reads.add(count);
+	if (err == DB_TABLESPACE_DELETED) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: trying to access"
+			" tablespace %lu page no. %lu,\n"
+			"InnoDB: but the tablespace does not exist"
+			" or is just being dropped.\n",
+			(ulong) space, (ulong) offset);
+	}
+
+	/* Increment number of I/O operations used for LRU policy. */
+	buf_LRU_stat_inc_io();
+
+	return(count > 0);
+}
+
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure or if no
+read was issued */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	ulint		zip_size;
+	ib_int64_t	tablespace_version;
+	ulint		count;
+	dberr_t		err;
+
+	zip_size = fil_space_get_zip_size(space);
+
+	if (zip_size == ULINT_UNDEFINED) {
+		return(FALSE);
+	}
+
+	tablespace_version = fil_space_get_version(space);
+
+	count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE
+				  | OS_AIO_SIMULATED_WAKE_LATER
+				  | BUF_READ_IGNORE_NONEXISTENT_PAGES,
+				  space, zip_size, FALSE,
+				  tablespace_version, offset, NULL);
+	srv_stats.buf_pool_reads.add(count);
+
+	/* We do not increment number of I/O operations used for LRU policy
+	here (buf_LRU_stat_inc_io()). We use that statistic in heuristics to
+	decide about evicting uncompressed versions of compressed pages from
+	the buffer pool. Since this function is called from buffer pool load,
+	these IOs are deliberate and not part of the normal workload, so we
+	can ignore them in our heuristics. */
+
+	return(count > 0);
+}
+
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+buffer-fixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if it is not, read-ahead is
+not applied. 
An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes, or 0 */
+	ulint	offset,		/*!< in: page number; see NOTE 3 above */
+	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf routine */
+	trx_t*	trx)
+{
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	ib_int64_t	tablespace_version;
+	buf_page_t*	bpage;
+	buf_frame_t*	frame;
+	buf_page_t*	pred_bpage	= NULL;
+	unsigned	pred_bpage_is_accessed = 0;
+	ulint		pred_offset;
+	ulint		succ_offset;
+	ulint		count;
+	int		asc_or_desc;
+	ulint		new_offset;
+	ulint		fail_count;
+	ulint		ibuf_mode;
+	ulint		low, high;
+	dberr_t		err;
+	ulint		i;
+	const ulint	buf_read_ahead_linear_area
+		= BUF_READ_AHEAD_AREA(buf_pool);
+	ulint		threshold;
+
+	/* check if readahead is disabled */
+	if (!srv_read_ahead_threshold) {
+		return(0);
+	}
+
+	if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
+		/* No read-ahead to avoid thread deadlocks */
+		return(0);
+	}
+
+	low  = (offset / buf_read_ahead_linear_area)
+		* buf_read_ahead_linear_area;
+	high = (offset / buf_read_ahead_linear_area + 1)
+		* buf_read_ahead_linear_area;
+
+	if ((offset != low) && (offset != high - 1)) {
+		/* This is not a border page of the area: return */
+
+		return(0);
+	}
+
+	if (ibuf_bitmap_page(zip_size, offset)
+	    || trx_sys_hdr_page(space, offset)) {
+
+		/* If it is an ibuf bitmap page or trx sys hdr, we do
+		no read-ahead, as that could break the ibuf page access
+		order */
+
+		return(0);
+	}
+
+	/* Remember the tablespace version before we ask the tablespace size
+	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+	do not try to read outside the bounds of the tablespace! */
+
+	tablespace_version = fil_space_get_version(space);
+
+	if (high > fil_space_get_size(space)) {
+		/* The area is not whole, return */
+
+		return(0);
+	}
+
+	if (buf_pool->n_pend_reads
+	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+
+		return(0);
+	}
+
+	/* Check that almost all pages in the area have been accessed; if
+	offset == low, the accesses must be in a descending order, otherwise,
+	in an ascending order. */
+
+	asc_or_desc = 1;
+
+	if (offset == low) {
+		asc_or_desc = -1;
+	}
+
+	/* How many out of order accessed pages can we ignore
+	when working out the access pattern for linear readahead */
+	threshold = ut_min((64 - srv_read_ahead_threshold),
+			   BUF_READ_AHEAD_AREA(buf_pool));
+
+	fail_count = 0;
+
+	prio_rw_lock_t*	hash_lock;
+
+	for (i = low; i < high; i++) {
+
+		bpage = buf_page_hash_get_s_locked(buf_pool, space, i,
+						   &hash_lock);
+
+		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
+			/* Not accessed */
+			fail_count++;
+
+		} else if (pred_bpage) {
+			/* Note that buf_page_is_accessed() returns
+			the time of the first access. If some blocks
+			of the extent existed in the buffer pool at
+			the time of a linear access pattern, the first
+			access times may be nonmonotonic, even though
+			the latest access times were linear. 
The
+			threshold (srv_read_ahead_threshold) should help
+			a little against this. */
+			int res = ut_ulint_cmp(
+				buf_page_is_accessed(bpage),
+				pred_bpage_is_accessed);
+			/* Accesses not in the right order */
+			if (res != 0 && res != asc_or_desc) {
+				fail_count++;
+			}
+		}
+
+		if (fail_count > threshold) {
+			/* Too many failures: return */
+			if (bpage) {
+				rw_lock_s_unlock(hash_lock);
+			}
+			return(0);
+		}
+
+		if (bpage) {
+			if (buf_page_is_accessed(bpage)) {
+				pred_bpage = bpage;
+				pred_bpage_is_accessed
+					= buf_page_is_accessed(bpage);
+			}
+
+			rw_lock_s_unlock(hash_lock);
+		}
+	}
+
+	/* If we got this far, we know that enough pages in the area have
+	been accessed in the right order: linear read-ahead can be sensible */
+
+	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset, &hash_lock);
+
+	if (bpage == NULL) {
+
+		return(0);
+	}
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_PAGE:
+		frame = bpage->zip.data;
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		frame = ((buf_block_t*) bpage)->frame;
+		break;
+	default:
+		ut_error;
+		break;
+	}
+
+	/* Read the natural predecessor and successor page addresses from
+	the page; NOTE that because the calling thread may have an x-latch
+	on the page, we do not acquire an s-latch on the page, this is to
+	prevent deadlocks. Even if we read values which are nonsense, the
+	algorithm will work. */
+
+	pred_offset = fil_page_get_prev(frame);
+	succ_offset = fil_page_get_next(frame);
+
+	rw_lock_s_unlock(hash_lock);
+
+	if ((offset == low) && (succ_offset == offset + 1)) {
+
+		/* This is ok, we can continue */
+		new_offset = pred_offset;
+
+	} else if ((offset == high - 1) && (pred_offset == offset - 1)) {
+
+		/* This is ok, we can continue */
+		new_offset = succ_offset;
+	} else {
+		/* Successor or predecessor not in the right order */
+
+		return(0);
+	}
+
+	low  = (new_offset / buf_read_ahead_linear_area)
+		* buf_read_ahead_linear_area;
+	high = (new_offset / buf_read_ahead_linear_area + 1)
+		* buf_read_ahead_linear_area;
+
+	if ((new_offset != low) && (new_offset != high - 1)) {
+		/* This is not a border page of the area: return */
+
+		return(0);
+	}
+
+	if (high > fil_space_get_size(space)) {
+		/* The area is not whole, return */
+
+		return(0);
+	}
+
+	/* If we got this far, read-ahead can be sensible: do it */
+
+	ibuf_mode = inside_ibuf
+		? 
BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER + : BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER; + + count = 0; + + /* Since Windows XP seems to schedule the i/o handler thread + very eagerly, and consequently it does not wait for the + full read batch to be posted, we use special heuristics here */ + + os_aio_simulated_put_read_threads_to_sleep(); + + for (i = low; i < high; i++) { + /* It is only sensible to do read-ahead in the non-sync + aio mode: hence FALSE as the first parameter */ + + if (!ibuf_bitmap_page(zip_size, i)) { + count += buf_read_page_low( + &err, false, + ibuf_mode, + space, zip_size, FALSE, tablespace_version, i, trx); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: in" + " linear readahead trying to access\n" + "InnoDB: tablespace %lu page %lu,\n" + "InnoDB: but the tablespace does not" + " exist or is just being dropped.\n", + (ulong) space, (ulong) i); + } + } + } + + /* In simulated aio we wake the aio handler threads only after + queuing all aio requests, in native aio the following call does + nothing: */ + + os_aio_simulated_wake_handler_threads(); + +#ifdef UNIV_DEBUG + if (buf_debug_prints && (count > 0)) { + fprintf(stderr, + "LINEAR read-ahead space %lu offset %lu pages %lu\n", + (ulong) space, (ulong) offset, (ulong) count); + } +#endif /* UNIV_DEBUG */ + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + buf_pool->stat.n_ra_pages_read += count; + return(count); +} + +/********************************************************************//** +Issues read requests for pages which the ibuf module wants to read in, in +order to contract the insert buffer tree. Technically, this function is like +a read-ahead function. 
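+For example (an illustrative sketch only; the ids and page numbers are
+hypothetical), requesting a single page without waiting for it:
+
+	ulint		space_ids[1] = {10};
+	ib_int64_t	versions[1]  = {fil_space_get_version(10)};
+	ulint		page_nos[1]  = {100};
+
+	buf_read_ibuf_merge_pages(false, space_ids, versions, page_nos, 1);
+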
*/ +UNIV_INTERN +void +buf_read_ibuf_merge_pages( +/*======================*/ + bool sync, /*!< in: true if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + const ulint* space_ids, /*!< in: array of space ids */ + const ib_int64_t* space_versions,/*!< in: the spaces must have + this version number + (timestamp), otherwise we + discard the read; we use this + to cancel reads if DISCARD + + IMPORT may have changed the + tablespace size */ + const ulint* page_nos, /*!< in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored) /*!< in: number of elements + in the arrays */ +{ + ulint i; + +#ifdef UNIV_IBUF_DEBUG + ut_a(n_stored < UNIV_PAGE_SIZE); +#endif + + for (i = 0; i < n_stored; i++) { + dberr_t err; + buf_pool_t* buf_pool; + ulint zip_size = fil_space_get_zip_size(space_ids[i]); + + buf_pool = buf_pool_get(space_ids[i], page_nos[i]); + + while (buf_pool->n_pend_reads + > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + os_thread_sleep(500000); + } + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + + goto tablespace_deleted; + } + + buf_read_page_low(&err, sync && (i + 1 == n_stored), + BUF_READ_ANY_PAGE, space_ids[i], + zip_size, TRUE, space_versions[i], + page_nos[i], NULL); + + if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) { +tablespace_deleted: + /* We have deleted or are deleting the single-table + tablespace: remove the entries for that page */ + + ibuf_merge_or_delete_for_page(NULL, space_ids[i], + page_nos[i], + zip_size, FALSE); + } + } + + os_aio_simulated_wake_handler_threads(); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Ibuf merge read-ahead space %lu pages %lu\n", + (ulong) space_ids[0], (ulong) n_stored); + } +#endif /* UNIV_DEBUG */ +} + +/********************************************************************//** +Issues read requests for pages which recovery wants to read in. 
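+For example (illustrative only; the page numbers are hypothetical), recovery
+might queue three pages and wait until the last one is read in:
+
+	ulint	page_nos[3] = {5, 6, 7};	/* highest page number last */
+
+	buf_read_recv_pages(TRUE, space, 0, page_nos, 3);
+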
*/
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+	ibool		sync,		/*!< in: TRUE if the caller
+					wants this function to wait
+					for the highest address page
+					to get read in, before this
+					function returns */
+	ulint		space,		/*!< in: space id */
+	ulint		zip_size,	/*!< in: compressed page size in
+					bytes, or 0 */
+	const ulint*	page_nos,	/*!< in: array of page numbers
+					to read, with the highest page
+					number the last in the
+					array */
+	ulint		n_stored)	/*!< in: number of page numbers
+					in the array */
+{
+	ib_int64_t	tablespace_version;
+	ulint		count;
+	dberr_t		err;
+	ulint		i;
+
+	zip_size = fil_space_get_zip_size(space);
+
+	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+		/* It is a single-table tablespace and the .ibd file is
+		missing: do nothing */
+
+		/* the log records should be handled here too, for the
+		same reason as in http://bugs.mysql.com/bug.php?id=43948 */
+
+		if (recv_recovery_is_on()) {
+			recv_addr_t*	recv_addr;
+
+			mutex_enter(&(recv_sys->mutex));
+
+			if (recv_sys->apply_log_recs == FALSE) {
+				mutex_exit(&(recv_sys->mutex));
+				goto not_to_recover;
+			}
+
+			for (i = 0; i < n_stored; i++) {
+				/* recv_get_fil_addr_struct() */
+				recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
+						hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
+							recv_sys->addr_hash));
+				while (recv_addr) {
+					if ((recv_addr->space == space)
+						&& (recv_addr->page_no == page_nos[i])) {
+						break;
+					}
+					recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
+				}
+
+				if ((recv_addr == NULL)
+				    || (recv_addr->state == RECV_BEING_PROCESSED)
+				    || (recv_addr->state == RECV_PROCESSED)) {
+					continue;
+				}
+
+				recv_addr->state = RECV_PROCESSED;
+
+				ut_a(recv_sys->n_addrs);
+				recv_sys->n_addrs--;
+			}
+
+			mutex_exit(&(recv_sys->mutex));
+
+			fprintf(stderr, " (cannot find space: %lu)", space);
+		}
+not_to_recover:
+
+		return;
+	}
+
+	tablespace_version = fil_space_get_version(space);
+
+	for (i = 0; i < n_stored; i++) {
+		buf_pool_t*	buf_pool;
+
+		count = 0;
+
+		os_aio_print_debug = FALSE;
+		buf_pool = buf_pool_get(space, page_nos[i]);
+		while (buf_pool->n_pend_reads
+		       >= recv_n_pool_free_frames / 2) {
+
+			os_aio_simulated_wake_handler_threads();
+			os_thread_sleep(10000);
+
+			count++;
+
+			if (count > 1000) {
+				fprintf(stderr,
+					"InnoDB: Error: InnoDB has waited for"
+					" 10 seconds for pending\n"
+					"InnoDB: reads to the buffer pool to"
+					" be finished.\n"
+					"InnoDB: Number of pending reads %lu,"
+					" pending pread calls %lu\n",
+					(ulong) buf_pool->n_pend_reads,
+					(ulong) os_file_n_pending_preads);
+
+				os_aio_print_debug = TRUE;
+			}
+		}
+
+		os_aio_print_debug = FALSE;
+
+		if ((i + 1 == n_stored) && sync) {
+			buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
+					  zip_size, TRUE, tablespace_version,
+					  page_nos[i], NULL);
+		} else {
+			buf_read_page_low(&err, false, BUF_READ_ANY_PAGE
+					  | OS_AIO_SIMULATED_WAKE_LATER,
+					  space, zip_size, TRUE,
+					  tablespace_version, page_nos[i], NULL);
+		}
+	}
+
+	os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr,
+			"Recovery applies read-ahead pages %lu\n",
+			(ulong) n_stored);
+	}
+#endif /* UNIV_DEBUG */
+}
diff --git a/storage/xtradb/compile-innodb b/storage/xtradb/compile-innodb
new file mode 100755
index 00000000000..fa791282b28
--- /dev/null
+++ b/storage/xtradb/compile-innodb
@@ -0,0 +1,25 @@
+#!/bin/sh
+#
+# Copyright (c) 2006, 2013, Oracle and/or its affiliates. All rights reserved. 
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
+# Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+# we assume this script is in storage/xtradb/
+
+MYSQL_ROOT="$(dirname "${0}")/../.."
+
+cd "${MYSQL_ROOT}" || exit 1
+
+cmake . -DWITH_INNOBASE_STORAGE_ENGINE:BOOL=ON
+make -j$(nproc)
diff --git a/storage/xtradb/data/data0data.cc b/storage/xtradb/data/data0data.cc
new file mode 100644
index 00000000000..179de79b69f
--- /dev/null
+++ b/storage/xtradb/data/data0data.cc
@@ -0,0 +1,750 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file data/data0data.cc
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "data0data.h"
+
+#ifdef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+
+#include <ctype.h>
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+UNIV_INTERN byte	data_error;
+
+# ifndef UNIV_DEBUG_VALGRIND
+/** this is used to fool the compiler in dtuple_validate */
+UNIV_INTERN ulint	data_dummy;
+# endif /* !UNIV_DEBUG_VALGRIND */
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields. 
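+For example (an illustrative sketch only), a caller ordering two typed
+tuples t1 and t2:
+
+	if (dtuple_coll_cmp(t1, t2) < 0) {
+		/* t1 sorts before t2 */
+	}
+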
+@return 1, 0 , -1 if tuple1 is greater, equal, less, respectively, +than tuple2 */ +UNIV_INTERN +int +dtuple_coll_cmp( +/*============*/ + const dtuple_t* tuple1, /*!< in: tuple 1 */ + const dtuple_t* tuple2) /*!< in: tuple 2 */ +{ + ulint n_fields; + ulint i; + + ut_ad(tuple1 && tuple2); + ut_ad(tuple1->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(tuple2->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(tuple1)); + ut_ad(dtuple_check_typed(tuple2)); + + n_fields = dtuple_get_n_fields(tuple1); + + if (n_fields != dtuple_get_n_fields(tuple2)) { + + return(n_fields < dtuple_get_n_fields(tuple2) ? -1 : 1); + } + + for (i = 0; i < n_fields; i++) { + int cmp; + const dfield_t* field1 = dtuple_get_nth_field(tuple1, i); + const dfield_t* field2 = dtuple_get_nth_field(tuple2, i); + + cmp = cmp_dfield_dfield(field1, field2); + + if (cmp) { + return(cmp); + } + } + + return(0); +} + +/*********************************************************************//** +Sets number of fields used in a tuple. Normally this is set in +dtuple_create, but if you want later to set it smaller, you can use this. */ +UNIV_INTERN +void +dtuple_set_n_fields( +/*================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields) /*!< in: number of fields */ +{ + ut_ad(tuple); + + tuple->n_fields = n_fields; + tuple->n_fields_cmp = n_fields; +} + +/**********************************************************//** +Checks that a data field is typed. +@return TRUE if ok */ +static +ibool +dfield_check_typed_no_assert( +/*=========================*/ + const dfield_t* field) /*!< in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MYSQL + || dfield_get_type(field)->mtype < DATA_VARCHAR) { + + fprintf(stderr, + "InnoDB: Error: data field type %lu, len %lu\n", + (ulong) dfield_get_type(field)->mtype, + (ulong) dfield_get_len(field)); + return(FALSE); + } + + return(TRUE); +} + +/**********************************************************//** +Checks that a data tuple is typed. +@return TRUE if ok */ +UNIV_INTERN +ibool +dtuple_check_typed_no_assert( +/*=========================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + const dfield_t* field; + ulint i; + + if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) { + fprintf(stderr, + "InnoDB: Error: index entry has %lu fields\n", + (ulong) dtuple_get_n_fields(tuple)); +dump: + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, tuple); + putc('\n', stderr); + + return(FALSE); + } + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + if (!dfield_check_typed_no_assert(field)) { + goto dump; + } + } + + return(TRUE); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/**********************************************************//** +Checks that a data field is typed. Asserts an error if not. +@return TRUE if ok */ +UNIV_INTERN +ibool +dfield_check_typed( +/*===============*/ + const dfield_t* field) /*!< in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MYSQL + || dfield_get_type(field)->mtype < DATA_VARCHAR) { + + fprintf(stderr, + "InnoDB: Error: data field type %lu, len %lu\n", + (ulong) dfield_get_type(field)->mtype, + (ulong) dfield_get_len(field)); + + ut_error; + } + + return(TRUE); +} + +/**********************************************************//** +Checks that a data tuple is typed. Asserts an error if not. 
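+For example (illustrative), it is typically invoked from debug assertions:
+
+	ut_ad(dtuple_check_typed(tuple));
+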
+@return TRUE if ok */ +UNIV_INTERN +ibool +dtuple_check_typed( +/*===============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + const dfield_t* field; + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + ut_a(dfield_check_typed(field)); + } + + return(TRUE); +} + +/**********************************************************//** +Validates the consistency of a tuple which must be complete, i.e, +all fields must have been set. +@return TRUE if ok */ +UNIV_INTERN +ibool +dtuple_validate( +/*============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = dtuple_get_n_fields(tuple); + + /* We dereference all the data of each field to test + for memory traps */ + + for (i = 0; i < n_fields; i++) { + + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (!dfield_is_null(field)) { + + const byte* data; + + data = static_cast<const byte*>(dfield_get_data(field)); +#ifndef UNIV_DEBUG_VALGRIND + ulint j; + + for (j = 0; j < len; j++) { + + data_dummy += *data; /* fool the compiler not + to optimize out this + code */ + data++; + } +#endif /* !UNIV_DEBUG_VALGRIND */ + + UNIV_MEM_ASSERT_RW(data, len); + } + } + + ut_a(dtuple_check_typed(tuple)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Pretty prints a dfield value according to its data type. */ +UNIV_INTERN +void +dfield_print( +/*=========*/ + const dfield_t* dfield) /*!< in: dfield */ +{ + const byte* data; + ulint len; + ulint i; + + len = dfield_get_len(dfield); + data = static_cast<const byte*>(dfield_get_data(dfield)); + + if (dfield_is_null(dfield)) { + fputs("NULL", stderr); + + return; + } + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + case DATA_CHAR: + case DATA_VARCHAR: + for (i = 0; i < len; i++) { + int c = *data++; + putc(isprint(c) ? c : ' ', stderr); + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + break; + case DATA_INT: + ut_a(len == 4); /* only works for 32-bit integers */ + fprintf(stderr, "%d", (int) mach_read_from_4(data)); + break; + default: + ut_error; + } +} + +/*************************************************************//** +Pretty prints a dfield value according to its data type. Also the hex string +is printed if a string contains non-printable characters. 
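+For example (illustrative, derived from the code below), a 3-byte
+DATA_VARCHAR value {0x61, 0x01, 0x62} ("a", unprintable 0x01, "b") would
+print as:
+
+	a\x01b Hex: 610162
+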
*/ +UNIV_INTERN +void +dfield_print_also_hex( +/*==================*/ + const dfield_t* dfield) /*!< in: dfield */ +{ + const byte* data; + ulint len; + ulint prtype; + ulint i; + ibool print_also_hex; + + len = dfield_get_len(dfield); + data = static_cast<const byte*>(dfield_get_data(dfield)); + + if (dfield_is_null(dfield)) { + fputs("NULL", stderr); + + return; + } + + prtype = dtype_get_prtype(dfield_get_type(dfield)); + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + ib_id_t id; + case DATA_INT: + switch (len) { + ulint val; + case 1: + val = mach_read_from_1(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x80; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 2: + val = mach_read_from_2(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x8000; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 3: + val = mach_read_from_3(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x800000; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 4: + val = mach_read_from_4(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x80000000; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 6: + id = mach_read_from_6(data); + fprintf(stderr, "%llu", (ullint) id); + break; + + case 7: + id = mach_read_from_7(data); + fprintf(stderr, "%llu", (ullint) id); + break; + case 8: + id = mach_read_from_8(data); + fprintf(stderr, "%llu", (ullint) id); + break; + default: + goto print_hex; + } + break; + + case DATA_SYS: + switch (prtype & DATA_SYS_PRTYPE_MASK) { + case DATA_TRX_ID: + id = mach_read_from_6(data); + + fprintf(stderr, "trx_id " TRX_ID_FMT, id); + break; + + case DATA_ROLL_PTR: + id = mach_read_from_7(data); + + fprintf(stderr, "roll_ptr " TRX_ID_FMT, id); + break; + + case DATA_ROW_ID: + id = mach_read_from_6(data); + + fprintf(stderr, "row_id " TRX_ID_FMT, id); + break; + + default: + id = mach_ull_read_compressed(data); + + fprintf(stderr, "mix_id " TRX_ID_FMT, id); + } + break; + + case DATA_CHAR: + case DATA_VARCHAR: + print_also_hex = FALSE; + + for (i = 0; i < len; i++) { + int c = *data++; + + if (!isprint(c)) { + print_also_hex = TRUE; + + fprintf(stderr, "\\x%02x", (unsigned char) c); + } else { + putc(c, stderr); + } + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + + if (!print_also_hex) { + break; + } + + data = static_cast<byte*>(dfield_get_data(dfield)); + /* fall through */ + + case DATA_BINARY: + default: +print_hex: + fputs(" Hex: ",stderr); + + for (i = 0; i < len; i++) { + fprintf(stderr, "%02lx", (ulint) *data++); + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + } +} + +/*************************************************************//** +Print a dfield value using ut_print_buf. */ +static +void +dfield_print_raw( +/*=============*/ + FILE* f, /*!< in: output stream */ + const dfield_t* dfield) /*!< in: dfield */ +{ + ulint len = dfield_get_len(dfield); + if (!dfield_is_null(dfield)) { + ulint print_len = ut_min(len, 1000); + ut_print_buf(f, dfield_get_data(dfield), print_len); + if (len != print_len) { + fprintf(f, "(total %lu bytes%s)", + (ulong) len, + dfield_is_ext(dfield) ? ", external" : ""); + } + } else { + fputs(" SQL NULL", f); + } +} + +/**********************************************************//** +The following function prints the contents of a tuple. 
*/ +UNIV_INTERN +void +dtuple_print( +/*=========*/ + FILE* f, /*!< in: output stream */ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_fields; + ulint i; + + n_fields = dtuple_get_n_fields(tuple); + + fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields); + + for (i = 0; i < n_fields; i++) { + fprintf(f, " %lu:", (ulong) i); + + dfield_print_raw(f, dtuple_get_nth_field(tuple, i)); + + putc(';', f); + putc('\n', f); + } + + ut_ad(dtuple_validate(tuple)); +} + +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. +@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +UNIV_INTERN +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ +{ + mem_heap_t* heap; + big_rec_t* vector; + dfield_t* dfield; + dict_field_t* ifield; + ulint size; + ulint n_fields; + ulint local_len; + ulint local_prefix_len; + + if (!dict_index_is_clust(index)) { + return(NULL); + } + + if (dict_table_get_format(index->table) < UNIV_FORMAT_B) { + /* up to MySQL 5.1: store a 768-byte prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE + + DICT_ANTELOPE_MAX_INDEX_COL_LEN; + } else { + /* new-format table: do not store any BLOB prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE; + } + + ut_a(dtuple_check_typed_no_assert(entry)); + + size = rec_get_converted_size(index, entry, *n_ext); + + if (UNIV_UNLIKELY(size > 1000000000)) { + fprintf(stderr, + "InnoDB: Warning: tuple size very big: %lu\n", + (ulong) size); + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, entry); + putc('\n', stderr); + } + + heap = mem_heap_create(size + dtuple_get_n_fields(entry) + * sizeof(big_rec_field_t) + 1000); + + vector = static_cast<big_rec_t*>( + mem_heap_alloc(heap, sizeof(big_rec_t))); + + vector->heap = heap; + + vector->fields = static_cast<big_rec_field_t*>( + mem_heap_alloc( + heap, + dtuple_get_n_fields(entry) * sizeof(big_rec_field_t))); + + /* Decide which fields to shorten: the algorithm is to look for + a variable-length field that yields the biggest savings when + stored externally */ + + n_fields = 0; + + while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, + *n_ext), + dict_table_is_comp(index->table), + dict_index_get_n_fields(index), + dict_table_zip_size(index->table))) { + ulint i; + ulint longest = 0; + ulint longest_i = ULINT_MAX; + byte* data; + big_rec_field_t* b; + + for (i = dict_index_get_n_unique_in_tree(index); + i < dtuple_get_n_fields(entry); i++) { + ulint savings; + + dfield = dtuple_get_nth_field(entry, i); + ifield = dict_index_get_nth_field(index, i); + + /* Skip fixed-length, NULL, externally stored, + or short columns */ + + if (ifield->fixed_len + || dfield_is_null(dfield) + || dfield_is_ext(dfield) + || dfield_get_len(dfield) <= local_len + || dfield_get_len(dfield) + <= BTR_EXTERN_FIELD_REF_SIZE * 2) { + goto skip_field; + } + + savings = dfield_get_len(dfield) - local_len; + + /* Check that there would be savings */ + if (longest >= savings) { + goto skip_field; 
+ } + + /* In DYNAMIC and COMPRESSED format, store + locally any non-BLOB columns whose maximum + length does not exceed 256 bytes. This is + because there is no room for the "external + storage" flag when the maximum length is 255 + bytes or less. This restriction trivially + holds in REDUNDANT and COMPACT format, because + there we always store locally columns whose + length is up to local_len == 788 bytes. + @see rec_init_offsets_comp_ordinary */ + if (ifield->col->mtype != DATA_BLOB + && ifield->col->len < 256) { + goto skip_field; + } + + longest_i = i; + longest = savings; + +skip_field: + continue; + } + + if (!longest) { + /* Cannot shorten more */ + + mem_heap_free(heap); + + return(NULL); + } + + /* Move data from field longest_i to big rec vector. + + We store the first bytes locally to the record. Then + we can calculate all ordering fields in all indexes + from locally stored data. */ + + dfield = dtuple_get_nth_field(entry, longest_i); + ifield = dict_index_get_nth_field(index, longest_i); + local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE; + + b = &vector->fields[n_fields]; + b->field_no = longest_i; + b->len = dfield_get_len(dfield) - local_prefix_len; + b->data = (char*) dfield_get_data(dfield) + local_prefix_len; + + /* Allocate the locally stored part of the column. */ + data = static_cast<byte*>(mem_heap_alloc(heap, local_len)); + + /* Copy the local prefix. */ + memcpy(data, dfield_get_data(dfield), local_prefix_len); + /* Clear the extern field reference (BLOB pointer). */ + memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE); +#if 0 + /* The following would fail the Valgrind checks in + page_cur_insert_rec_low() and page_cur_insert_rec_zip(). + The BLOB pointers in the record will be initialized after + the record and the BLOBs have been written. */ + UNIV_MEM_ALLOC(data + local_prefix_len, + BTR_EXTERN_FIELD_REF_SIZE); +#endif + + dfield_set_data(dfield, data, local_len); + dfield_set_ext(dfield); + + n_fields++; + (*n_ext)++; + ut_ad(n_fields < dtuple_get_n_fields(entry)); + } + + vector->n_fields = n_fields; + return(vector); +} + +/**************************************************************//** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. 
*/ +UNIV_INTERN +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index __attribute__((unused)), /*!< in: index */ + dtuple_t* entry, /*!< in: entry whose data was put to vector */ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ +{ + big_rec_field_t* b = vector->fields; + const big_rec_field_t* const end = b + vector->n_fields; + + for (; b < end; b++) { + dfield_t* dfield; + ulint local_len; + + dfield = dtuple_get_nth_field(entry, b->field_no); + local_len = dfield_get_len(dfield); + + ut_ad(dfield_is_ext(dfield)); + ut_ad(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + /* Only in REDUNDANT and COMPACT format, we store + up to DICT_ANTELOPE_MAX_INDEX_COL_LEN (768) bytes + locally */ + ut_ad(local_len <= DICT_ANTELOPE_MAX_INDEX_COL_LEN); + + dfield_set_data(dfield, + (char*) b->data - local_len, + b->len + local_len); + } + + mem_heap_free(vector->heap); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/data/data0type.cc b/storage/xtradb/data/data0type.cc new file mode 100644 index 00000000000..0b9e08544a5 --- /dev/null +++ b/storage/xtradb/data/data0type.cc @@ -0,0 +1,298 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file data/data0type.cc +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" + +#ifdef UNIV_NONINL +#include "data0type.ic" +#endif + +#ifndef UNIV_HOTBACKUP +# include "ha_prototypes.h" + +/* At the database startup we store the default-charset collation number of +this MySQL installation to this global variable. If we have < 4.1.2 format +column definitions, or records in the insert buffer, we use this +charset-collation code for them. */ + +UNIV_INTERN ulint data_mysql_default_charset_coll; + +/*********************************************************************//** +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. 
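+For example (illustrative): with a fixed-width charset (mbminlen ==
+mbmaxlen), requesting a 6-byte prefix of a 4-byte value simply returns
+min(6, 4) = 4; variable-width charsets are delegated to
+innobase_get_at_most_n_mbchars().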
+@return length of the prefix, in bytes */ +UNIV_INTERN +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of + a multi-byte character */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str) /*!< in: the string whose prefix + length is being determined */ +{ + ulint mbminlen = DATA_MBMINLEN(mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN(mbminmaxlen); + + ut_a(data_len != UNIV_SQL_NULL); + ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen)); + + if (mbminlen != mbmaxlen) { + ut_a(!(prefix_len % mbmaxlen)); + return(innobase_get_at_most_n_mbchars( + dtype_get_charset_coll(prtype), + prefix_len, data_len, str)); + } + + if (prefix_len < data_len) { + + return(prefix_len); + + } + + return(data_len); +} +#endif /* UNIV_HOTBACKUP */ + +/*********************************************************************//** +Checks if a data main type is a string type. Also a BLOB is considered a +string type. +@return TRUE if string type */ +UNIV_INTERN +ibool +dtype_is_string_type( +/*=================*/ + ulint mtype) /*!< in: InnoDB main data type code: DATA_CHAR, ... */ +{ + if (mtype <= DATA_BLOB + || mtype == DATA_MYSQL + || mtype == DATA_VARMYSQL) { + + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************************//** +Checks if a type is a binary string type. Note that for tables created with +< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For +those DATA_BLOB columns this function currently returns FALSE. +@return TRUE if binary string type */ +UNIV_INTERN +ibool +dtype_is_binary_string_type( +/*========================*/ + ulint mtype, /*!< in: main data type */ + ulint prtype) /*!< in: precise type */ +{ + if ((mtype == DATA_FIXBINARY) + || (mtype == DATA_BINARY) + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE))) { + + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************************//** +Checks if a type is a non-binary string type. That is, dtype_is_string_type is +TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created +with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. +For those DATA_BLOB columns this function currently returns TRUE. +@return TRUE if non-binary string type */ +UNIV_INTERN +ibool +dtype_is_non_binary_string_type( +/*============================*/ + ulint mtype, /*!< in: main data type */ + ulint prtype) /*!< in: precise type */ +{ + if (dtype_is_string_type(mtype) == TRUE + && dtype_is_binary_string_type(mtype, prtype) == FALSE) { + + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************************//** +Forms a precise type from the < 4.1.2 format precise type plus the +charset-collation code. +@return precise type, including the charset-collation code */ +UNIV_INTERN +ulint +dtype_form_prtype( +/*==============*/ + ulint old_prtype, /*!< in: the MySQL type code and the flags + DATA_BINARY_TYPE etc. */ + ulint charset_coll) /*!< in: MySQL charset-collation code */ +{ + ut_a(old_prtype < 256 * 256); + ut_a(charset_coll <= MAX_CHAR_COLL_NUM); + + return(old_prtype + (charset_coll << 16)); +} + +/*********************************************************************//** +Validates a data type structure. 
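+For example (illustrative), a debug-build caller would typically assert it:
+
+	ut_a(dtype_validate(type));
+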
+@return TRUE if ok */ +UNIV_INTERN +ibool +dtype_validate( +/*===========*/ + const dtype_t* type) /*!< in: type struct to validate */ +{ + ut_a(type); + ut_a(type->mtype >= DATA_VARCHAR); + ut_a(type->mtype <= DATA_MYSQL); + + if (type->mtype == DATA_SYS) { + ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS); + } + +#ifndef UNIV_HOTBACKUP + ut_a(dtype_get_mbminlen(type) <= dtype_get_mbmaxlen(type)); +#endif /* !UNIV_HOTBACKUP */ + + return(TRUE); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Prints a data type structure. */ +UNIV_INTERN +void +dtype_print( +/*========*/ + const dtype_t* type) /*!< in: type */ +{ + ulint mtype; + ulint prtype; + ulint len; + + ut_a(type); + + mtype = type->mtype; + prtype = type->prtype; + + switch (mtype) { + case DATA_VARCHAR: + fputs("DATA_VARCHAR", stderr); + break; + + case DATA_CHAR: + fputs("DATA_CHAR", stderr); + break; + + case DATA_BINARY: + fputs("DATA_BINARY", stderr); + break; + + case DATA_FIXBINARY: + fputs("DATA_FIXBINARY", stderr); + break; + + case DATA_BLOB: + fputs("DATA_BLOB", stderr); + break; + + case DATA_INT: + fputs("DATA_INT", stderr); + break; + + case DATA_MYSQL: + fputs("DATA_MYSQL", stderr); + break; + + case DATA_SYS: + fputs("DATA_SYS", stderr); + break; + + case DATA_FLOAT: + fputs("DATA_FLOAT", stderr); + break; + + case DATA_DOUBLE: + fputs("DATA_DOUBLE", stderr); + break; + + case DATA_DECIMAL: + fputs("DATA_DECIMAL", stderr); + break; + + case DATA_VARMYSQL: + fputs("DATA_VARMYSQL", stderr); + break; + + default: + fprintf(stderr, "type %lu", (ulong) mtype); + break; + } + + len = type->len; + + if ((type->mtype == DATA_SYS) + || (type->mtype == DATA_VARCHAR) + || (type->mtype == DATA_CHAR)) { + putc(' ', stderr); + if (prtype == DATA_ROW_ID) { + fputs("DATA_ROW_ID", stderr); + len = DATA_ROW_ID_LEN; + } else if (prtype == DATA_ROLL_PTR) { + fputs("DATA_ROLL_PTR", stderr); + len = DATA_ROLL_PTR_LEN; + } else if (prtype == DATA_TRX_ID) { + fputs("DATA_TRX_ID", stderr); + len = DATA_TRX_ID_LEN; + } else if (prtype == DATA_ENGLISH) { + fputs("DATA_ENGLISH", stderr); + } else { + fprintf(stderr, "prtype %lu", (ulong) prtype); + } + } else { + if (prtype & DATA_UNSIGNED) { + fputs(" DATA_UNSIGNED", stderr); + } + + if (prtype & DATA_BINARY_TYPE) { + fputs(" DATA_BINARY_TYPE", stderr); + } + + if (prtype & DATA_NOT_NULL) { + fputs(" DATA_NOT_NULL", stderr); + } + } + + fprintf(stderr, " len %lu", (ulong) len); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/dict/dict0boot.cc b/storage/xtradb/dict/dict0boot.cc new file mode 100644 index 00000000000..b57a8873bd5 --- /dev/null +++ b/storage/xtradb/dict/dict0boot.cc @@ -0,0 +1,522 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0boot.cc +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0boot.h" + +#ifdef UNIV_NONINL +#include "dict0boot.ic" +#endif + +#include "dict0crea.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "dict0load.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "os0file.h" + +/**********************************************************************//** +Gets a pointer to the dictionary header and x-latches its page. +@return pointer to the dictionary header, page x-latched */ +UNIV_INTERN +dict_hdr_t* +dict_hdr_get( +/*=========*/ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + dict_hdr_t* header; + + block = buf_page_get(DICT_HDR_SPACE, 0, DICT_HDR_PAGE_NO, + RW_X_LATCH, mtr); + header = DICT_HDR + buf_block_get_frame(block); + + buf_block_dbg_add_level(block, SYNC_DICT_HEADER); + + return(header); +} + +/**********************************************************************//** +Returns a new table, index, or space id. */ +UNIV_INTERN +void +dict_hdr_get_new_id( +/*================*/ + table_id_t* table_id, /*!< out: table id + (not assigned if NULL) */ + index_id_t* index_id, /*!< out: index id + (not assigned if NULL) */ + ulint* space_id) /*!< out: space id + (not assigned if NULL) */ +{ + dict_hdr_t* dict_hdr; + ib_id_t id; + mtr_t mtr; + + mtr_start(&mtr); + + dict_hdr = dict_hdr_get(&mtr); + + if (table_id) { + id = mach_read_from_8(dict_hdr + DICT_HDR_TABLE_ID); + id++; + mlog_write_ull(dict_hdr + DICT_HDR_TABLE_ID, id, &mtr); + *table_id = id; + } + + if (index_id) { + id = mach_read_from_8(dict_hdr + DICT_HDR_INDEX_ID); + id++; + mlog_write_ull(dict_hdr + DICT_HDR_INDEX_ID, id, &mtr); + *index_id = id; + } + + if (space_id) { + *space_id = mtr_read_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID, + MLOG_4BYTES, &mtr); + if (fil_assign_new_space_id(space_id)) { + mlog_write_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID, + *space_id, MLOG_4BYTES, &mtr); + } + } + + mtr_commit(&mtr); +} + +/**********************************************************************//** +Writes the current value of the row id counter to the dictionary header file +page. */ +UNIV_INTERN +void +dict_hdr_flush_row_id(void) +/*=======================*/ +{ + dict_hdr_t* dict_hdr; + row_id_t id; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + id = dict_sys->row_id; + + mtr_start(&mtr); + + dict_hdr = dict_hdr_get(&mtr); + + mlog_write_ull(dict_hdr + DICT_HDR_ROW_ID, id, &mtr); + + mtr_commit(&mtr); +} + +/*****************************************************************//** +Creates the file page for the dictionary header. This function is +called only at the database creation. 
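+As an overview (restating the code below): it allocates the header page in
+its own file segment and then creates five B-tree roots (for SYS_TABLES,
+SYS_TABLE_IDS, SYS_COLUMNS, SYS_INDEXES and SYS_FIELDS); if any btr_create()
+call returns FIL_NULL, the function fails.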
+@return TRUE on success */
+static
+ibool
+dict_hdr_create(
+/*============*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+	dict_hdr_t*	dict_header;
+	ulint		root_page_no;
+
+	ut_ad(mtr);
+
+	/* Create the dictionary header file block in a new, allocated file
+	segment in the system tablespace */
+	block = fseg_create(DICT_HDR_SPACE, 0,
+			    DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);
+
+	ut_a(DICT_HDR_PAGE_NO == buf_block_get_page_no(block));
+
+	dict_header = dict_hdr_get(mtr);
+
+	/* Start counting row, table, index, and tree ids from
+	DICT_HDR_FIRST_ID */
+	mlog_write_ull(dict_header + DICT_HDR_ROW_ID,
+		       DICT_HDR_FIRST_ID, mtr);
+
+	mlog_write_ull(dict_header + DICT_HDR_TABLE_ID,
+		       DICT_HDR_FIRST_ID, mtr);
+
+	mlog_write_ull(dict_header + DICT_HDR_INDEX_ID,
+		       DICT_HDR_FIRST_ID, mtr);
+
+	mlog_write_ulint(dict_header + DICT_HDR_MAX_SPACE_ID,
+			 0, MLOG_4BYTES, mtr);
+
+	/* Obsolete, but we must initialize it anyway. */
+	mlog_write_ulint(dict_header + DICT_HDR_MIX_ID_LOW,
+			 DICT_HDR_FIRST_ID, MLOG_4BYTES, mtr);
+
+	/* Create the B-tree roots for the clustered indexes of the basic
+	system tables */
+
+	/*--------------------------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_TABLES_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*--------------------------*/
+	root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, 0,
+				  DICT_TABLE_IDS_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*--------------------------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_COLUMNS_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*--------------------------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_INDEXES_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*--------------------------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_FIELDS_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*--------------------------*/
+
+	return(TRUE);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+dict_boot(void)
+/*===========*/
+{
+	dict_table_t*	table;
+	dict_index_t*	index;
+	dict_hdr_t*	dict_hdr;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+	dberr_t		error;
+
+	/* Be sure these constants do not ever change. 
To avoid bloat,
+ only check the *NUM_FIELDS* in each table */
+
+ ut_ad(DICT_NUM_COLS__SYS_TABLES == 8);
+ ut_ad(DICT_NUM_FIELDS__SYS_TABLES == 10);
+ ut_ad(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2);
+ ut_ad(DICT_NUM_COLS__SYS_COLUMNS == 7);
+ ut_ad(DICT_NUM_FIELDS__SYS_COLUMNS == 9);
+ ut_ad(DICT_NUM_COLS__SYS_INDEXES == 7);
+ ut_ad(DICT_NUM_FIELDS__SYS_INDEXES == 9);
+ ut_ad(DICT_NUM_COLS__SYS_FIELDS == 3);
+ ut_ad(DICT_NUM_FIELDS__SYS_FIELDS == 5);
+ ut_ad(DICT_NUM_COLS__SYS_FOREIGN == 4);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN == 6);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2);
+ ut_ad(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6);
+
+ mtr_start(&mtr);
+
+ /* Create the hash tables etc. */
+ dict_init();
+
+ heap = mem_heap_create(450);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ /* Get the dictionary header */
+ dict_hdr = dict_hdr_get(&mtr);
+
+ /* Because we write a new row id to the disk-based data structure
+ (the dictionary header) only when the id is divisible by
+ DICT_HDR_ROW_ID_WRITE_MARGIN, recovery will not restore the
+ latest value of the row id counter. Therefore we advance the
+ counter at database startup to avoid overlapping values. Note
+ that the first time a user asks for a new row id after startup,
+ the counter is divisible by ..._MARGIN, so the new value is
+ immediately written to the disk-based header. */
+
+ dict_sys->row_id = DICT_HDR_ROW_ID_WRITE_MARGIN
+ + ut_uint64_align_up(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID),
+ DICT_HDR_ROW_ID_WRITE_MARGIN);
+
+ /* Insert into the dictionary cache the descriptions of the basic
+ system tables */
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0, 0,
+ false);
+
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
+ /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
+ dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
+ /* The low order bit of TYPE is always set to 1. If the format
+ is UNIV_FORMAT_B or higher, this field matches table->flags. */
+ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
+ /* MIX_LEN may contain additional table flags when
+ ROW_FORMAT!=REDUNDANT. Currently, these flags include
+ DICT_TF2_TEMPORARY.
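Unused bits must be zero; dict_create_sys_tables_tuple() asserts this before writing the value.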
*/ + dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + + table->id = DICT_TABLES_ID; + + dict_table_add_to_cache(table, FALSE, heap); + dict_sys->sys_tables = table; + mem_heap_empty(heap); + + index = dict_mem_index_create("SYS_TABLES", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 1); + + dict_mem_index_add_field(index, "NAME", 0); + + index->id = DICT_TABLES_ID; + btr_search_index_init(index); + + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_TABLES, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + index = dict_mem_index_create("SYS_TABLES", "ID_IND", + DICT_HDR_SPACE, DICT_UNIQUE, 1); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_TABLE_IDS_ID; + btr_search_index_init(index); + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_TABLE_IDS, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0, 0, + false); + + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4); + + table->id = DICT_COLUMNS_ID; + + dict_table_add_to_cache(table, FALSE, heap); + dict_sys->sys_columns = table; + mem_heap_empty(heap); + + index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_COLUMNS_ID; + btr_search_index_init(index); + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_COLUMNS, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0, 0, + false); + + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4); + + table->id = DICT_INDEXES_ID; + + dict_table_add_to_cache(table, FALSE, heap); + dict_sys->sys_indexes = table; + mem_heap_empty(heap); + + index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_INDEXES_ID; + btr_search_index_init(index); + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_INDEXES, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0, 0, + false); + + 
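/* SYS_FIELDS maps (INDEX_ID, POS) to a column name; when an index
+ contains column prefixes, POS also packs the field number into its
+ two high bytes and the prefix length into its two low bytes, see
+ dict_create_sys_fields_tuple() in dict0crea.cc. */
+ 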
dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0); + + table->id = DICT_FIELDS_ID; + + dict_table_add_to_cache(table, FALSE, heap); + dict_sys->sys_fields = table; + mem_heap_free(heap); + + index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "INDEX_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_FIELDS_ID; + btr_search_index_init(index); + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_FIELDS, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + mtr_commit(&mtr); + + /*-------------------------*/ + + /* Initialize the insert buffer table and index for each tablespace */ + + ibuf_init_at_db_start(); + + dberr_t err = DB_SUCCESS; + + if (srv_read_only_mode && !ibuf_is_empty()) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Change buffer must be empty when --innodb-read-only " + "is set!"); + + err = DB_ERROR; + } else { + /* Load definitions of other indexes on system tables */ + + dict_load_sys_table(dict_sys->sys_tables); + dict_load_sys_table(dict_sys->sys_columns); + dict_load_sys_table(dict_sys->sys_indexes); + dict_load_sys_table(dict_sys->sys_fields); + } + + mutex_exit(&(dict_sys->mutex)); + + return(err); +} + +/*****************************************************************//** +Inserts the basic system table data into themselves in the database +creation. */ +static +void +dict_insert_initial_data(void) +/*==========================*/ +{ + /* Does nothing yet */ +} + +/*****************************************************************//** +Creates and initializes the data dictionary at the server bootstrap. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +dict_create(void) +/*=============*/ +{ + mtr_t mtr; + + mtr_start(&mtr); + + dict_hdr_create(&mtr); + + mtr_commit(&mtr); + + dberr_t err = dict_boot(); + + if (err == DB_SUCCESS) { + dict_insert_initial_data(); + } + + return(err); +} diff --git a/storage/xtradb/dict/dict0crea.cc b/storage/xtradb/dict/dict0crea.cc new file mode 100644 index 00000000000..30523ff2af4 --- /dev/null +++ b/storage/xtradb/dict/dict0crea.cc @@ -0,0 +1,1845 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0crea.cc +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0crea.h" + +#ifdef UNIV_NONINL +#include "dict0crea.ic" +#endif + +#include "btr0pcur.h" +#include "btr0btr.h" +#include "page0page.h" +#include "mach0data.h" +#include "dict0boot.h" +#include "dict0dict.h" +#include "que0que.h" +#include "row0ins.h" +#include "row0mysql.h" +#include "pars0pars.h" +#include "trx0roll.h" +#include "usr0sess.h" +#include "ut0vec.h" +#include "dict0priv.h" +#include "fts0priv.h" +#include "ha_prototypes.h" + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_TABLES system table. +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_tables_tuple( +/*=========================*/ + const dict_table_t* table, /*!< in: table */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dict_table_t* sys_tables; + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + ulint type; + + ut_ad(table); + ut_ad(heap); + + sys_tables = dict_sys->sys_tables; + + entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_tables); + + /* 0: NAME -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__NAME); + + dfield_set_data(dfield, table->name, ut_strlen(table->name)); + + /* 1: DB_TRX_ID added later */ + /* 2: DB_ROLL_PTR added later */ + /* 3: ID -------------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 4: N_COLS ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__N_COLS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, table->n_def + | ((table->flags & DICT_TF_COMPACT) << 31)); + dfield_set_data(dfield, ptr, 4); + + /* 5: TYPE (table flags) -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + /* Validate the table flags and convert them to what is saved in + SYS_TABLES.TYPE. Table flag values 0 and 1 are both written to + SYS_TABLES.TYPE as 1. */ + type = dict_tf_to_sys_tables_type(table->flags); + mach_write_to_4(ptr, type); + + dfield_set_data(dfield, ptr, 4); + + /* 6: MIX_ID (obsolete) ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_ID); + + ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8)); + + dfield_set_data(dfield, ptr, 8); + + /* 7: MIX_LEN (additional flags) --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + /* Be sure all non-used bits are zero. 
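The assertion below enforces that only bits in DICT_TF2_BIT_MASK are set.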
*/ + ut_a(!(table->flags2 & ~DICT_TF2_BIT_MASK)); + mach_write_to_4(ptr, table->flags2); + + dfield_set_data(dfield, ptr, 4); + + /* 8: CLUSTER_NAME ---------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__CLUSTER_ID); + dfield_set_null(dfield); /* not supported */ + + /* 9: SPACE ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, table->space); + + dfield_set_data(dfield, ptr, 4); + /*----------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_COLUMNS system table. +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_columns_tuple( +/*==========================*/ + const dict_table_t* table, /*!< in: table */ + ulint i, /*!< in: column number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dict_table_t* sys_columns; + dtuple_t* entry; + const dict_col_t* column; + dfield_t* dfield; + byte* ptr; + const char* col_name; + + ut_ad(table); + ut_ad(heap); + + column = dict_table_get_nth_col(table, i); + + sys_columns = dict_sys->sys_columns; + + entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_columns); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, i); + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME); + + col_name = dict_table_get_col_name(table, i); + dfield_set_data(dfield, col_name, ut_strlen(col_name)); + + /* 5: MTYPE --------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->mtype); + + dfield_set_data(dfield, ptr, 4); + + /* 6: PRTYPE -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->prtype); + + dfield_set_data(dfield, ptr, 4); + + /* 7: LEN ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->len); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PREC ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, 0/* unused */); + + dfield_set_data(dfield, ptr, 4); + /*---------------------------------*/ + + return(entry); +} + +/***************************************************************//** +Builds a table definition to insert. 
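+If the table resides outside the system tablespace, this also assigns it
+a new space id and creates the initial 4-page .ibd file.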
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +dict_build_table_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + tab_node_t* node) /*!< in: table create node */ +{ + dict_table_t* table; + dtuple_t* row; + dberr_t error; + const char* path; + mtr_t mtr; + ulint space = 0; + bool use_tablespace; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = node->table; + use_tablespace = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE); + + dict_hdr_get_new_id(&table->id, NULL, NULL); + + thr_get_trx(thr)->table_id = table->id; + + /* Always set this bit for all new created tables */ + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + DICT_TF2_FLAG_UNSET(table, + DICT_TF2_FTS_AUX_HEX_NAME);); + + if (use_tablespace) { + /* This table will not use the system tablespace. + Get a new space id. */ + dict_hdr_get_new_id(NULL, NULL, &space); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_out_of_space_ids", + space = ULINT_UNDEFINED; + ); + + if (UNIV_UNLIKELY(space == ULINT_UNDEFINED)) { + return(DB_ERROR); + } + + /* We create a new single-table tablespace for the table. + We initially let it be 4 pages: + - page 0 is the fsp header and an extent descriptor page, + - page 1 is an ibuf bitmap page, + - page 2 is the first inode page, + - page 3 will contain the root of the clustered index of the + table we create here. */ + + path = table->data_dir_path ? table->data_dir_path + : table->dir_path_of_temp_table; + + ut_ad(dict_table_get_format(table) <= UNIV_FORMAT_MAX); + ut_ad(!dict_table_zip_size(table) + || dict_table_get_format(table) >= UNIV_FORMAT_B); + + error = fil_create_new_single_table_tablespace( + space, table->name, path, + dict_tf_to_fsp_flags(table->flags), + table->flags2, + FIL_IBD_FILE_INITIAL_SIZE); + + table->space = (unsigned int) space; + + if (error != DB_SUCCESS) { + + return(error); + } + + mtr_start(&mtr); + + fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr); + + mtr_commit(&mtr); + } else { + /* Create in the system tablespace: disallow Barracuda + features by keeping only the first bit which says whether + the row format is redundant or compact */ + table->flags &= DICT_TF_COMPACT; + } + + row = dict_create_sys_tables_tuple(table, node->heap); + + ins_node_set_new_row(node->tab_def, row); + + return(DB_SUCCESS); +} + +/***************************************************************//** +Builds a column definition to insert. */ +static +void +dict_build_col_def_step( +/*====================*/ + tab_node_t* node) /*!< in: table create node */ +{ + dtuple_t* row; + + row = dict_create_sys_columns_tuple(node->table, node->col_no, + node->heap); + ins_node_set_new_row(node->col_def, row); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_INDEXES system table. 
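+PAGE_NO is written as FIL_NULL here; the real root page number is filled
+in by dict_create_index_tree_step() once the B-tree exists.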
+@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_indexes_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dict_table_t* sys_indexes; + dict_table_t* table; + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(index); + ut_ad(heap); + + sys_indexes = dict_sys->sys_indexes; + + table = dict_table_get_low(index->table_name); + + entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_indexes); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: ID ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__NAME); + + dfield_set_data(dfield, index->name, ut_strlen(index->name)); + + /* 5: N_FIELDS ----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__N_FIELDS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->n_fields); + + dfield_set_data(dfield, ptr, 4); + + /* 6: TYPE --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->type); + + dfield_set_data(dfield, ptr, 4); + + /* 7: SPACE --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->space); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PAGE_NO --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__PAGE_NO); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, FIL_NULL); + + dfield_set_data(dfield, ptr, 4); + + /*--------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_FIELDS system table. 
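+If any field of the index has a column prefix, POS packs the field number
+into its two high bytes and the prefix length into its two low bytes; for
+example, field number 1 with a 255-byte prefix is stored as
+(1 << 16) + 255 = 0x100FF.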
+@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_fields_tuple( +/*=========================*/ + const dict_index_t* index, /*!< in: index */ + ulint fld_no, /*!< in: field number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dict_table_t* sys_fields; + dtuple_t* entry; + dict_field_t* field; + dfield_t* dfield; + byte* ptr; + ibool index_contains_column_prefix_field = FALSE; + ulint j; + + ut_ad(index); + ut_ad(heap); + + for (j = 0; j < index->n_fields; j++) { + if (dict_index_get_nth_field(index, j)->prefix_len > 0) { + index_contains_column_prefix_field = TRUE; + break; + } + } + + field = dict_index_get_nth_field(index, fld_no); + + sys_fields = dict_sys->sys_fields; + + entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_fields); + + /* 0: INDEX_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/ + + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + if (index_contains_column_prefix_field) { + /* If there are column prefix fields in the index, then + we store the number of the field to the 2 HIGH bytes + and the prefix length to the 2 low bytes, */ + + mach_write_to_4(ptr, (fld_no << 16) + field->prefix_len); + } else { + /* Else we store the number of the field to the 2 LOW bytes. + This is to keep the storage format compatible with + InnoDB versions < 4.0.14. */ + + mach_write_to_4(ptr, fld_no); + } + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: COL_NAME -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME); + + dfield_set_data(dfield, field->name, + ut_strlen(field->name)); + /*---------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Creates the tuple with which the index entry is searched for writing the index +tree root page number, if such a tree is created. +@return the tuple for search */ +static +dtuple_t* +dict_create_search_tuple( +/*=====================*/ + const dtuple_t* tuple, /*!< in: the tuple inserted in the SYS_INDEXES + table */ + mem_heap_t* heap) /*!< in: memory heap from which the memory for + the built tuple is allocated */ +{ + dtuple_t* search_tuple; + const dfield_t* field1; + dfield_t* field2; + + ut_ad(tuple && heap); + + search_tuple = dtuple_create(heap, 2); + + field1 = dtuple_get_nth_field(tuple, 0); + field2 = dtuple_get_nth_field(search_tuple, 0); + + dfield_copy(field2, field1); + + field1 = dtuple_get_nth_field(tuple, 1); + field2 = dtuple_get_nth_field(search_tuple, 1); + + dfield_copy(field2, field1); + + ut_ad(dtuple_validate(search_tuple)); + + return(search_tuple); +} + +/***************************************************************//** +Builds an index definition row to insert. 
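+The row is also kept in node->ind_row, so that the root page number can
+later be written back to the SYS_INDEXES record.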
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +dict_build_index_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + ind_node_t* node) /*!< in: index create node */ +{ + dict_table_t* table; + dict_index_t* index; + dtuple_t* row; + trx_t* trx; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = thr_get_trx(thr); + + index = node->index; + + table = dict_table_get_low(index->table_name); + + if (table == NULL) { + return(DB_TABLE_NOT_FOUND); + } + + if (!trx->table_id) { + /* Record only the first table id. */ + trx->table_id = table->id; + } + + node->table = table; + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + dict_hdr_get_new_id(NULL, &index->id, NULL); + + /* Inherit the space id from the table; we store all indexes of a + table in the same tablespace */ + + index->space = table->space; + node->page_no = FIL_NULL; + row = dict_create_sys_indexes_tuple(index, node->heap); + node->ind_row = row; + + ins_node_set_new_row(node->ind_def, row); + + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; + + return(DB_SUCCESS); +} + +/***************************************************************//** +Builds a field definition row to insert. */ +static +void +dict_build_field_def_step( +/*======================*/ + ind_node_t* node) /*!< in: index create node */ +{ + dict_index_t* index; + dtuple_t* row; + + index = node->index; + + row = dict_create_sys_fields_tuple(index, node->field_no, node->heap); + + ins_node_set_new_row(node->field_def, row); +} + +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. 
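+FTS indexes get no B-tree at all, and for a missing or discarded
+tablespace the root page number is recorded as FIL_NULL.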
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +dict_create_index_tree_step( +/*========================*/ + ind_node_t* node) /*!< in: index create node */ +{ + dict_index_t* index; + dict_table_t* sys_indexes; + dtuple_t* search_tuple; + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + index = node->index; + + sys_indexes = dict_sys->sys_indexes; + + if (index->type == DICT_FTS) { + /* FTS index does not need an index tree */ + return(DB_SUCCESS); + } + + /* Run a mini-transaction in which the index tree is allocated for + the index and its root address is written to the index entry in + sys_indexes */ + + mtr_start(&mtr); + + search_tuple = dict_create_search_tuple(node->ind_row, node->heap); + + btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes), + search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF, + &pcur, &mtr); + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + + dberr_t err = DB_SUCCESS; + ulint zip_size = dict_table_zip_size(index->table); + + if (node->index->table->ibd_file_missing + || dict_table_is_discarded(node->index->table)) { + + node->page_no = FIL_NULL; + } else { + node->page_no = btr_create( + index->type, index->space, zip_size, + index->id, index, &mtr); + + if (node->page_no == FIL_NULL) { + err = DB_OUT_OF_FILE_SPACE; + } + + DBUG_EXECUTE_IF("ib_import_create_index_failure_1", + node->page_no = FIL_NULL; + err = DB_OUT_OF_FILE_SPACE; ); + } + + page_rec_write_field( + btr_pcur_get_rec(&pcur), DICT_FLD__SYS_INDEXES__PAGE_NO, + node->page_no, &mtr); + + btr_pcur_close(&pcur); + + mtr_commit(&mtr); + + return(err); +} + +/*******************************************************************//** +Drops the index tree associated with a row in SYS_INDEXES table. */ +UNIV_INTERN +void +dict_drop_index_tree( +/*=================*/ + rec_t* rec, /*!< in/out: record in the clustered index + of SYS_INDEXES table */ + mtr_t* mtr) /*!< in: mtr having the latch on the record page */ +{ + ulint root_page_no; + ulint space; + ulint zip_size; + const byte* ptr; + ulint len; + + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); + ptr = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + + ut_ad(len == 4); + + root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (root_page_no == FIL_NULL) { + /* The tree has already been freed */ + + return; + } + + ptr = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__SPACE, &len); + + ut_ad(len == 4); + + space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + zip_size = fil_space_get_zip_size(space); + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + return; + } + + /* We free all the pages but the root page first; this operation + may span several mini-transactions */ + + btr_free_but_not_root(space, zip_size, root_page_no); + + /* Then we free the root page in the same mini-transaction where + we write FIL_NULL to the appropriate field in the SYS_INDEXES + record: this mini-transaction marks the B-tree totally freed */ + + /* printf("Dropping index tree in space %lu root page %lu\n", space, + root_page_no); */ + btr_free_root(space, zip_size, root_page_no, mtr); + + page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, + FIL_NULL, mtr); +} + +/*******************************************************************//** +Truncates the index tree associated with a row in SYS_INDEXES table. 
+@return new root page number, or FIL_NULL on failure */ +UNIV_INTERN +ulint +dict_truncate_index_tree( +/*=====================*/ + dict_table_t* table, /*!< in: the table the index belongs to */ + ulint space, /*!< in: 0=truncate, + nonzero=create the index tree in the + given tablespace */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to + record in the clustered index of + SYS_INDEXES table. The cursor may be + repositioned in this call. */ + mtr_t* mtr) /*!< in: mtr having the latch + on the record page. The mtr may be + committed and restarted in this call. */ +{ + ulint root_page_no; + ibool drop = !space; + ulint zip_size; + ulint type; + index_id_t index_id; + rec_t* rec; + const byte* ptr; + ulint len; + dict_index_t* index; + bool has_been_dropped = false; + + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); + rec = btr_pcur_get_rec(pcur); + ptr = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + + ut_ad(len == 4); + + root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (drop && root_page_no == FIL_NULL) { + has_been_dropped = true; + drop = FALSE; + } + + ptr = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__SPACE, &len); + + ut_ad(len == 4); + + if (drop) { + space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + } + + zip_size = fil_space_get_zip_size(space); + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Trying to TRUNCATE" + " a missing .ibd file of table %s!\n", table->name); + return(FIL_NULL); + } + + ptr = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + ut_ad(len == 4); + type = mach_read_from_4(ptr); + + ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__ID, &len); + ut_ad(len == 8); + index_id = mach_read_from_8(ptr); + + if (!drop) { + + goto create; + } + + /* We free all the pages but the root page first; this operation + may span several mini-transactions */ + + btr_free_but_not_root(space, zip_size, root_page_no); + + /* Then we free the root page in the same mini-transaction where + we create the b-tree and write its new root page number to the + appropriate field in the SYS_INDEXES record: this mini-transaction + marks the B-tree totally truncated */ + + btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, NULL, mtr); + + btr_free_root(space, zip_size, root_page_no, mtr); +create: + /* We will temporarily write FIL_NULL to the PAGE_NO field + in SYS_INDEXES, so that the database will not get into an + inconsistent state in case it crashes between the mtr_commit() + below and the following mtr_commit() call. */ + page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, + FIL_NULL, mtr); + + /* We will need to commit the mini-transaction in order to avoid + deadlocks in the btr_create() call, because otherwise we would + be freeing and allocating pages in the same mini-transaction. */ + btr_pcur_store_position(pcur, mtr); + mtr_commit(mtr); + + mtr_start(mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + + /* Find the index corresponding to this SYS_INDEXES record. 
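btr_create() needs the in-memory index object, so we look it up by id
+ in the table's index list.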
*/ + for (index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + if (index->id == index_id) { + if (index->type & DICT_FTS) { + return(FIL_NULL); + } else { + if (has_been_dropped) { + fprintf(stderr, " InnoDB: Trying to" + " TRUNCATE a missing index of" + " table %s!\n", + index->table->name); + } + + root_page_no = btr_create(type, space, zip_size, + index_id, index, mtr); + index->page = (unsigned int) root_page_no; + return(root_page_no); + } + } + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Index %llu of table %s is missing\n" + "InnoDB: from the data dictionary during TRUNCATE!\n", + (ullint) index_id, + table->name); + + return(FIL_NULL); +} + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +UNIV_INTERN +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as a memory data + structure */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit) /*!< in: true if the commit node should be + added to the query graph */ +{ + tab_node_t* node; + + node = static_cast<tab_node_t*>( + mem_heap_alloc(heap, sizeof(tab_node_t))); + + node->common.type = QUE_NODE_CREATE_TABLE; + + node->table = table; + + node->state = TABLE_BUILD_TABLE_DEF; + node->heap = mem_heap_create(256); + + node->tab_def = ins_node_create(INS_DIRECT, dict_sys->sys_tables, + heap); + node->tab_def->common.parent = node; + + node->col_def = ins_node_create(INS_DIRECT, dict_sys->sys_columns, + heap); + node->col_def->common.parent = node; + + if (commit) { + node->commit_node = trx_commit_node_create(heap); + node->commit_node->common.parent = node; + } else { + node->commit_node = 0; + } + + return(node); +} + +/*********************************************************************//** +Creates an index create graph. +@return own: index create node */ +UNIV_INTERN +ind_node_t* +ind_create_graph_create( +/*====================*/ + dict_index_t* index, /*!< in: index to create, built as a memory data + structure */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit) /*!< in: true if the commit node should be + added to the query graph */ +{ + ind_node_t* node; + + node = static_cast<ind_node_t*>( + mem_heap_alloc(heap, sizeof(ind_node_t))); + + node->common.type = QUE_NODE_CREATE_INDEX; + + node->index = index; + + node->state = INDEX_BUILD_INDEX_DEF; + node->page_no = FIL_NULL; + node->heap = mem_heap_create(256); + + node->ind_def = ins_node_create(INS_DIRECT, + dict_sys->sys_indexes, heap); + node->ind_def->common.parent = node; + + node->field_def = ins_node_create(INS_DIRECT, + dict_sys->sys_fields, heap); + node->field_def->common.parent = node; + + if (commit) { + node->commit_node = trx_commit_node_create(heap); + node->commit_node->common.parent = node; + } else { + node->commit_node = 0; + } + + return(node); +} + +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. 
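+The create node steps through the states TABLE_BUILD_TABLE_DEF,
+TABLE_BUILD_COL_DEF (once per column), TABLE_COMMIT_WORK and
+TABLE_ADD_TO_CACHE.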
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + tab_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = thr_get_trx(thr); + + node = static_cast<tab_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = TABLE_BUILD_TABLE_DEF; + } + + if (node->state == TABLE_BUILD_TABLE_DEF) { + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = dict_build_table_def_step(thr, node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = TABLE_BUILD_COL_DEF; + node->col_no = 0; + + thr->run_node = node->tab_def; + + return(thr); + } + + if (node->state == TABLE_BUILD_COL_DEF) { + + if (node->col_no < (node->table)->n_def) { + + dict_build_col_def_step(node); + + node->col_no++; + + thr->run_node = node->col_def; + + return(thr); + } else { + node->state = TABLE_COMMIT_WORK; + } + } + + if (node->state == TABLE_COMMIT_WORK) { + + /* Table was correctly defined: do NOT commit the transaction + (CREATE TABLE does NOT do an implicit commit of the current + transaction) */ + + node->state = TABLE_ADD_TO_CACHE; + + /* thr->run_node = node->commit_node; + + return(thr); */ + } + + if (node->state == TABLE_ADD_TO_CACHE) { + + dict_table_add_to_cache(node->table, TRUE, node->heap); + + err = DB_SUCCESS; + } + +function_exit: + trx->error_state = err; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + + return(NULL); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. 
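+The create node steps through the states INDEX_BUILD_INDEX_DEF,
+INDEX_BUILD_FIELD_DEF (once per field), INDEX_ADD_TO_CACHE,
+INDEX_CREATE_INDEX_TREE and INDEX_COMMIT_WORK.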
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ind_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = thr_get_trx(thr); + + node = static_cast<ind_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = INDEX_BUILD_INDEX_DEF; + } + + if (node->state == INDEX_BUILD_INDEX_DEF) { + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + err = dict_build_index_def_step(thr, node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = INDEX_BUILD_FIELD_DEF; + node->field_no = 0; + + thr->run_node = node->ind_def; + + return(thr); + } + + if (node->state == INDEX_BUILD_FIELD_DEF) { + + if (node->field_no < (node->index)->n_fields) { + + dict_build_field_def_step(node); + + node->field_no++; + + thr->run_node = node->field_def; + + return(thr); + } else { + node->state = INDEX_ADD_TO_CACHE; + } + } + + if (node->state == INDEX_ADD_TO_CACHE) { + + index_id_t index_id = node->index->id; + + err = dict_index_add_to_cache( + node->table, node->index, FIL_NULL, + trx_is_strict(trx) + || dict_table_get_format(node->table) + >= UNIV_FORMAT_B); + + node->index = dict_index_get_if_in_cache_low(index_id); + ut_a(!node->index == (err != DB_SUCCESS)); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = INDEX_CREATE_INDEX_TREE; + } + + if (node->state == INDEX_CREATE_INDEX_TREE) { + + err = dict_create_index_tree_step(node); + + DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail", + err = DB_OUT_OF_MEMORY;); + + if (err != DB_SUCCESS) { + /* If this is a FTS index, we will need to remove + it from fts->cache->indexes list as well */ + if ((node->index->type & DICT_FTS) + && node->table->fts) { + fts_index_cache_t* index_cache; + + rw_lock_x_lock( + &node->table->fts->cache->init_lock); + + index_cache = (fts_index_cache_t*) + fts_find_index_cache( + node->table->fts->cache, + node->index); + + if (index_cache->words) { + rbt_free(index_cache->words); + index_cache->words = 0; + } + + ib_vector_remove( + node->table->fts->cache->indexes, + *reinterpret_cast<void**>(index_cache)); + + rw_lock_x_unlock( + &node->table->fts->cache->init_lock); + } + + dict_index_remove_from_cache(node->table, node->index); + node->index = NULL; + + goto function_exit; + } + + node->index->page = node->page_no; + /* These should have been set in + dict_build_index_def_step() and + dict_index_add_to_cache(). */ + ut_ad(node->index->trx_id == trx->id); + ut_ad(node->index->table->def_trx_id == trx->id); + node->state = INDEX_COMMIT_WORK; + } + + if (node->state == INDEX_COMMIT_WORK) { + + /* Index was correctly defined: do NOT commit the transaction + (CREATE INDEX does NOT currently do an implicit commit of + the current transaction) */ + + node->state = INDEX_CREATE_INDEX_TREE; + + /* thr->run_node = node->commit_node; + + return(thr); */ + } + +function_exit: + trx->error_state = err; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + + return(NULL); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/****************************************************************//** +Check whether a system table exists. Additionally, if it exists, +move it to the non-LRU end of the table LRU list. 
This is only used
+for system tables that can be upgraded or added to an older database,
+which include SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_TABLESPACES and
+SYS_DATAFILES.
+@return DB_SUCCESS if the sys table exists, DB_CORRUPTION if it exists
+but is not current, DB_TABLE_NOT_FOUND if it does not exist */
+static
+dberr_t
+dict_check_if_system_table_exists(
+/*==============================*/
+ const char* tablename, /*!< in: name of table */
+ ulint num_fields, /*!< in: number of fields */
+ ulint num_indexes) /*!< in: number of indexes */
+{
+ dict_table_t* sys_table;
+ dberr_t error = DB_SUCCESS;
+
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+ mutex_enter(&dict_sys->mutex);
+
+ sys_table = dict_table_get_low(tablename);
+
+ if (sys_table == NULL) {
+ error = DB_TABLE_NOT_FOUND;
+
+ } else if (UT_LIST_GET_LEN(sys_table->indexes) != num_indexes
+ || sys_table->n_cols != num_fields) {
+ error = DB_CORRUPTION;
+
+ } else {
+ /* This table has already been created, and it is OK.
+ Ensure that it can't be evicted from the table LRU cache. */
+
+ dict_table_move_from_lru_to_non_lru(sys_table);
+ }
+
+ mutex_exit(&dict_sys->mutex);
+
+ return(error);
+}
+
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_foreign_constraint_tables(void)
+/*================================================*/
+{
+ trx_t* trx;
+ my_bool srv_file_per_table_backup;
+ dberr_t err;
+ dberr_t sys_foreign_err;
+ dberr_t sys_foreign_cols_err;
+
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+ /* Note: The master thread has not been started at this point. */
+
+
+ sys_foreign_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+ sys_foreign_cols_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
+
+ if (sys_foreign_err == DB_SUCCESS
+ && sys_foreign_cols_err == DB_SUCCESS) {
+ return(DB_SUCCESS);
+ }
+
+ trx = trx_allocate_for_mysql();
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx->op_info = "creating foreign key sys tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Check which incomplete table definition to drop. */
+
+ if (sys_foreign_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_FOREIGN table.");
+ row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
+ }
+
+ if (sys_foreign_cols_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_FOREIGN_COLS table.");
+
+ row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Creating foreign key constraint system tables.");
+
+ /* NOTE: in dict_load_foreigns we use the fact that
+ there are 2 secondary indexes on SYS_FOREIGN, and they
+ are defined just like below */
+
+ /* NOTE: when designing InnoDB's foreign key support in 2001, we made
+ an error and made the table names and the foreign key id of type
+ 'CHAR' (internally, really a VARCHAR). We should have made the type
+ VARBINARY, like in other InnoDB system tables, to get a clean
+ design. */
+
+ srv_file_per_table_backup = srv_file_per_table;
+
+ /* We always want SYSTEM tables to be created inside the system
+ tablespace.
*/ + + srv_file_per_table = 0; + + err = que_eval_sql( + NULL, + "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," + " REF_NAME CHAR, N_COLS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN (ID);\n" + "CREATE INDEX FOR_IND" + " ON SYS_FOREIGN (FOR_NAME);\n" + "CREATE INDEX REF_IND" + " ON SYS_FOREIGN (REF_NAME);\n" + "CREATE TABLE\n" + "SYS_FOREIGN_COLS(ID CHAR, POS INT," + " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN_COLS (ID, POS);\n" + "END;\n", + FALSE, trx); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Creation of SYS_FOREIGN and SYS_FOREIGN_COLS " + "has failed with error %lu. Tablespace is full. " + "Dropping incompletely created tables.", + (ulong) err); + + ut_ad(err == DB_OUT_OF_FILE_SPACE + || err == DB_TOO_MANY_CONCURRENT_TRXS); + + row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); + row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); + + if (err == DB_OUT_OF_FILE_SPACE) { + err = DB_MUST_GET_MORE_FILE_SPACE; + } + } + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + srv_file_per_table = srv_file_per_table_backup; + + if (err == DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_INFO, + "Foreign key constraint system tables created"); + } + + /* Note: The master thread has not been started at this point. */ + /* Confirm and move to the non-LRU part of the table LRU list. */ + sys_foreign_err = dict_check_if_system_table_exists( + "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3); + ut_a(sys_foreign_err == DB_SUCCESS); + + sys_foreign_cols_err = dict_check_if_system_table_exists( + "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1); + ut_a(sys_foreign_cols_err == DB_SUCCESS); + + return(err); +} + +/****************************************************************//** +Evaluate the given foreign key SQL statement. +@return error code or DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +dict_foreign_eval_sql( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* sql, /*!< in: SQL string to evaluate */ + const char* name, /*!< in: table name (for diagnostics) */ + const char* id, /*!< in: foreign key id */ + trx_t* trx) /*!< in/out: transaction */ +{ + dberr_t error; + FILE* ef = dict_foreign_err_file; + + error = que_eval_sql(info, sql, FALSE, trx); + + if (error == DB_DUPLICATE_KEY) { + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in foreign key constraint creation for table ", + ef); + ut_print_name(ef, trx, TRUE, name); + fputs(".\nA foreign key constraint of name ", ef); + ut_print_name(ef, trx, TRUE, id); + fputs("\nalready exists." + " (Note that internally InnoDB adds 'databasename'\n" + "in front of the user-defined constraint name.)\n" + "Note that InnoDB's FOREIGN KEY system tables store\n" + "constraint names as case-insensitive, with the\n" + "MySQL standard latin1_swedish_ci collation. If you\n" + "create tables or databases whose names differ only in\n" + "the character case, then collisions in constraint\n" + "names can occur. 
Workaround: name your constraints\n" + "explicitly with unique names.\n", + ef); + + mutex_exit(&dict_foreign_err_mutex); + + return(error); + } + + if (error != DB_SUCCESS) { + fprintf(stderr, + "InnoDB: Foreign key constraint creation failed:\n" + "InnoDB: internal error number %lu\n", (ulong) error); + + mutex_enter(&dict_foreign_err_mutex); + ut_print_timestamp(ef); + fputs(" Internal error in foreign key constraint creation" + " for table ", ef); + ut_print_name(ef, trx, TRUE, name); + fputs(".\n" + "See the MySQL .err log in the datadir" + " for more information.\n", ef); + mutex_exit(&dict_foreign_err_mutex); + + return(error); + } + + return(DB_SUCCESS); +} + +/********************************************************************//** +Add a single foreign key field definition to the data dictionary tables in +the database. +@return error code or DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +dict_create_add_foreign_field_to_dictionary( +/*========================================*/ + ulint field_nr, /*!< in: field number */ + const char* table_name, /*!< in: table name */ + const dict_foreign_t* foreign, /*!< in: foreign */ + trx_t* trx) /*!< in/out: transaction */ +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + + pars_info_add_int4_literal(info, "pos", field_nr); + + pars_info_add_str_literal(info, "for_col_name", + foreign->foreign_col_names[field_nr]); + + pars_info_add_str_literal(info, "ref_col_name", + foreign->referenced_col_names[field_nr]); + + return(dict_foreign_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_FOREIGN_COLS VALUES" + "(:id, :pos, :for_col_name, :ref_col_name);\n" + "END;\n", + table_name, foreign->id, trx)); +} + +/********************************************************************//** +Add a foreign key definition to the data dictionary tables. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + const char* name, /*!< in: table name */ + const dict_foreign_t* foreign,/*!< in: foreign key */ + trx_t* trx) /*!< in/out: dictionary transaction */ +{ + dberr_t error; + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + + pars_info_add_str_literal(info, "for_name", name); + + pars_info_add_str_literal(info, "ref_name", + foreign->referenced_table_name); + + pars_info_add_int4_literal(info, "n_cols", + foreign->n_fields + (foreign->type << 24)); + + error = dict_foreign_eval_sql(info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_FOREIGN VALUES" + "(:id, :for_name, :ref_name, :n_cols);\n" + "END;\n" + , name, foreign->id, trx); + + if (error != DB_SUCCESS) { + + return(error); + } + + for (ulint i = 0; i < foreign->n_fields; i++) { + error = dict_create_add_foreign_field_to_dictionary( + i, name, foreign, trx); + + if (error != DB_SUCCESS) { + + return(error); + } + } + + return(error); +} + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. 
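+The rows are written with SQL INSERTs into SYS_FOREIGN and
+SYS_FOREIGN_COLS, and the transaction is committed before returning.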
+@param[in] local_fk_set set of foreign key objects, to be added to
+the dictionary tables
+@param[in] table table to which the foreign key objects in
+local_fk_set belong
+@param[in,out] trx transaction
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table,
+ trx_t* trx)
+{
+ dict_foreign_t* foreign;
+ dberr_t error;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ if (NULL == dict_table_get_low("SYS_FOREIGN")) {
+ fprintf(stderr,
+ "InnoDB: table SYS_FOREIGN not found"
+ " in internal data dictionary\n");
+
+ return(DB_ERROR);
+ }
+
+ for (dict_foreign_set::const_iterator it = local_fk_set.begin();
+ it != local_fk_set.end();
+ ++it) {
+
+ foreign = *it;
+ ut_ad(foreign->id != NULL);
+
+ error = dict_create_add_foreign_to_dictionary(table->name,
+ foreign, trx);
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+ }
+
+ trx->op_info = "committing foreign key definitions";
+
+ trx_commit(trx);
+
+ trx->op_info = "";
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Creates the tablespaces and datafiles system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_sys_tablespace(void)
+/*=====================================*/
+{
+ trx_t* trx;
+ my_bool srv_file_per_table_backup;
+ dberr_t err;
+ dberr_t sys_tablespaces_err;
+ dberr_t sys_datafiles_err;
+
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+ /* Note: The master thread has not been started at this point. */
+
+ sys_tablespaces_err = dict_check_if_system_table_exists(
+ "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
+ sys_datafiles_err = dict_check_if_system_table_exists(
+ "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
+
+ if (sys_tablespaces_err == DB_SUCCESS
+ && sys_datafiles_err == DB_SUCCESS) {
+ return(DB_SUCCESS);
+ }
+
+ trx = trx_allocate_for_mysql();
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx->op_info = "creating tablespace and datafile sys tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Check which incomplete table definition to drop. */
+
+ if (sys_tablespaces_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_TABLESPACES table.");
+ row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE);
+ }
+
+ if (sys_datafiles_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_DATAFILES table.");
+
+ row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Creating tablespace and datafile system tables.");
+
+ /* We always want SYSTEM tables to be created inside the system
+ tablespace.
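srv_file_per_table is therefore saved and forced to 0 for the duration of the statement.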
*/ + srv_file_per_table_backup = srv_file_per_table; + srv_file_per_table = 0; + + err = que_eval_sql( + NULL, + "PROCEDURE CREATE_SYS_TABLESPACE_PROC () IS\n" + "BEGIN\n" + "CREATE TABLE SYS_TABLESPACES(\n" + " SPACE INT, NAME CHAR, FLAGS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX SYS_TABLESPACES_SPACE" + " ON SYS_TABLESPACES (SPACE);\n" + "CREATE TABLE SYS_DATAFILES(\n" + " SPACE INT, PATH CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX SYS_DATAFILES_SPACE" + " ON SYS_DATAFILES (SPACE);\n" + "END;\n", + FALSE, trx); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Creation of SYS_TABLESPACES and SYS_DATAFILES " + "has failed with error %lu. Tablespace is full. " + "Dropping incompletely created tables.", + (ulong) err); + + ut_a(err == DB_OUT_OF_FILE_SPACE + || err == DB_TOO_MANY_CONCURRENT_TRXS); + + row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE); + row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE); + + if (err == DB_OUT_OF_FILE_SPACE) { + err = DB_MUST_GET_MORE_FILE_SPACE; + } + } + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + srv_file_per_table = srv_file_per_table_backup; + + if (err == DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_INFO, + "Tablespace and datafile system tables created."); + } + + /* Note: The master thread has not been started at this point. */ + /* Confirm and move to the non-LRU part of the table LRU list. */ + + sys_tablespaces_err = dict_check_if_system_table_exists( + "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1); + ut_a(sys_tablespaces_err == DB_SUCCESS); + + sys_datafiles_err = dict_check_if_system_table_exists( + "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1); + ut_a(sys_datafiles_err == DB_SUCCESS); + + return(err); +} + +/********************************************************************//** +Add a single tablespace definition to the data dictionary tables in the +database. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_tablespace_to_dictionary( +/*=====================================*/ + ulint space, /*!< in: tablespace id */ + const char* name, /*!< in: tablespace name */ + ulint flags, /*!< in: tablespace flags */ + const char* path, /*!< in: tablespace path */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true then commit the + transaction */ +{ + dberr_t error; + + pars_info_t* info = pars_info_create(); + + ut_a(space > TRX_SYS_SPACE); + + pars_info_add_int4_literal(info, "space", space); + + pars_info_add_str_literal(info, "name", name); + + pars_info_add_int4_literal(info, "flags", flags); + + pars_info_add_str_literal(info, "path", path); + + error = que_eval_sql(info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_TABLESPACES VALUES" + "(:space, :name, :flags);\n" + "INSERT INTO SYS_DATAFILES VALUES" + "(:space, :path);\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + return(error); + } + + if (commit) { + trx->op_info = "committing tablespace and datafile definition"; + trx_commit(trx); + } + + trx->op_info = ""; + + return(error); +} diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc new file mode 100644 index 00000000000..87a1caa31bb --- /dev/null +++ b/storage/xtradb/dict/dict0dict.cc @@ -0,0 +1,6750 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file dict/dict0dict.cc +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "dict0dict.h" +#include "fts0fts.h" +#include "fil0fil.h" +#include <algorithm> + +#ifdef UNIV_NONINL +#include "dict0dict.ic" +#include "dict0priv.ic" +#endif + +/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */ +UNIV_INTERN dict_index_t* dict_ind_redundant; +/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */ +UNIV_INTERN dict_index_t* dict_ind_compact; + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Flag to control insert buffer debugging. */ +extern UNIV_INTERN uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +/********************************************************************** +Issue a warning that the row is too big. */ +void +ib_warn_row_too_big(const dict_table_t* table); + +#ifndef UNIV_HOTBACKUP +#include "buf0buf.h" +#include "data0type.h" +#include "mach0data.h" +#include "dict0boot.h" +#include "dict0mem.h" +#include "dict0crea.h" +#include "dict0stats.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "os0once.h" +#include "page0zip.h" +#include "page0page.h" +#include "pars0pars.h" +#include "pars0sym.h" +#include "que0que.h" +#include "rem0cmp.h" +#include "fts0fts.h" +#include "fts0types.h" +#include "m_ctype.h" /* my_isspace() */ +#include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str() */ +#include "srv0mon.h" +#include "srv0start.h" +#include "lock0lock.h" +#include "dict0priv.h" +#include "row0upd.h" +#include "row0mysql.h" +#include "row0merge.h" +#include "row0log.h" +#include "ut0ut.h" /* ut_format_name() */ +#include "m_string.h" +#include "my_sys.h" +#include "mysqld.h" /* system_charset_info */ +#include "strfunc.h" /* strconvert() */ + +#include <ctype.h> + +/** the dictionary system */ +UNIV_INTERN dict_sys_t* dict_sys = NULL; + +/** @brief the data dictionary rw-latch protecting dict_sys + +table create, drop, etc. 
reserve this in X-mode; implicit or
+background operations (purge, rollback, foreign key checks) reserve this
+in S-mode; we cannot trust that MySQL protects implicit or background
+operations from a table drop since MySQL does not know of them; therefore
+we need this; NOTE: a transaction which reserves this must keep track
+of the mode in trx_t::dict_operation_lock_mode */
+UNIV_INTERN rw_lock_t	dict_operation_lock;
+
+/** Percentage of compression failures that are allowed in a single
+round */
+UNIV_INTERN ulong	zip_failure_threshold_pct = 5;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+UNIV_INTERN ulong	zip_pad_max = 50;
+
+/* Keys to register rwlocks and mutexes with performance schema */
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t	dict_operation_lock_key;
+UNIV_INTERN mysql_pfs_key_t	index_tree_rw_lock_key;
+UNIV_INTERN mysql_pfs_key_t	index_online_log_key;
+UNIV_INTERN mysql_pfs_key_t	dict_table_stats_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	zip_pad_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	dict_sys_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	dict_foreign_err_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#define DICT_HEAP_SIZE		100	/*!< initial memory heap size when
+					creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512	/*!< buffer pool max size per table
+					hash table fixed size in bytes */
+#define DICT_POOL_PER_VARYING	4	/*!< buffer pool max size per data
+					dictionary varying size in bytes */
+
+/** Identifies generated InnoDB foreign key names */
+static char	dict_ibfk[] = "_ibfk_";
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@return TRUE if the column names were found */
+static
+ibool
+dict_index_find_cols(
+/*=================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index);	/*!< in: index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	dict_index_t*		index);	/*!< in: user representation of
+					a clustered index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+	const dict_table_t*	table,	/*!< in: table */
+	dict_index_t*		index);	/*!< in: user representation of
+					a non-clustered index */
+/**********************************************************************//**
+Builds the internal dictionary cache representation for an FTS index.
+@return own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index);	/*!< in: user representation of an FTS index */
+/**********************************************************************//**
+Prints a column data.
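The dict_operation_lock comment above describes an asymmetric protocol: DDL takes the latch in X-mode, background work (purge, rollback, foreign key checks) takes it in S-mode, and every holder records its mode in trx_t::dict_operation_lock_mode. A toy model of that protocol using std::shared_mutex (C++17); toy_trx and all function names here are invented, and rw_lock_t itself works differently:

#include <shared_mutex>

enum latch_mode { LOCK_NONE, LOCK_S, LOCK_X };

struct toy_trx {
	latch_mode dict_operation_lock_mode = LOCK_NONE;
};

static std::shared_mutex dict_latch;

static void lock_for_ddl(toy_trx& trx)
{
	dict_latch.lock();		// X-mode: excludes all other users
	trx.dict_operation_lock_mode = LOCK_X;
}

static void lock_for_background(toy_trx& trx)
{
	dict_latch.lock_shared();	// S-mode: only DDL is blocked
	trx.dict_operation_lock_mode = LOCK_S;
}

static void release(toy_trx& trx)
{
	/* the recorded mode tells us how to release, which is exactly
	why the real code keeps trx_t::dict_operation_lock_mode */
	if (trx.dict_operation_lock_mode == LOCK_X) {
		dict_latch.unlock();
	} else if (trx.dict_operation_lock_mode == LOCK_S) {
		dict_latch.unlock_shared();
	}
	trx.dict_operation_lock_mode = LOCK_NONE;
}

int main()
{
	toy_trx purge_trx;
	lock_for_background(purge_trx);	// S-holders may run concurrently
	release(purge_trx);

	toy_trx ddl_trx;
	lock_for_ddl(ddl_trx);		// waits for every S- and X-holder
	release(ddl_trx);
	return 0;
}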
*/ +static +void +dict_col_print_low( +/*===============*/ + const dict_table_t* table, /*!< in: table */ + const dict_col_t* col); /*!< in: column */ +/**********************************************************************//** +Prints an index data. */ +static +void +dict_index_print_low( +/*=================*/ + dict_index_t* index); /*!< in: index */ +/**********************************************************************//** +Prints a field data. */ +static +void +dict_field_print_low( +/*=================*/ + const dict_field_t* field); /*!< in: field */ + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +static +void +dict_index_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index, /*!< in, own: index */ + ibool lru_evict); /*!< in: TRUE if page being evicted + to make room in the table LRU list */ +/**********************************************************************//** +Removes a table object from the dictionary cache. */ +static +void +dict_table_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in, own: table */ + ibool lru_evict); /*!< in: TRUE if evicting from LRU */ +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if validate OK */ +static +ibool +dict_lru_validate(void); +/*===================*/ +/**********************************************************************//** +Check if table is in the dictionary table LRU list. +@return TRUE if table found */ +static +ibool +dict_lru_find_table( +/*================*/ + const dict_table_t* find_table); /*!< in: table to find */ +/**********************************************************************//** +Check if a table exists in the dict table non-LRU list. +@return TRUE if table found */ +static +ibool +dict_non_lru_find_table( +/*====================*/ + const dict_table_t* find_table); /*!< in: table to find */ +#endif /* UNIV_DEBUG */ + +/* Stream for storing detailed information about the latest foreign key +and unique key errors. Only created if !srv_read_only_mode */ +UNIV_INTERN FILE* dict_foreign_err_file = NULL; +/* mutex protecting the foreign and unique error buffers */ +UNIV_INTERN ib_mutex_t dict_foreign_err_mutex; + +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +dict_casedn_str( +/*============*/ + char* a) /*!< in/out: string to put in lower case */ +{ + innobase_casedn_str(a); +} + +/********************************************************************//** +Checks if the database name in two table names is the same. +@return TRUE if same db name */ +UNIV_INTERN +ibool +dict_tables_have_same_db( +/*=====================*/ + const char* name1, /*!< in: table name in the form + dbname '/' tablename */ + const char* name2) /*!< in: table name in the form + dbname '/' tablename */ +{ + for (; *name1 == *name2; name1++, name2++) { + if (*name1 == '/') { + return(TRUE); + } + ut_a(*name1); /* the names must contain '/' */ + } + return(FALSE); +} + +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. 
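dict_tables_have_same_db() above and the dict_remove_db_name()/dict_get_db_name_len() helpers defined next all parse the internal "dbname/tablename" format. A standalone demonstration of the same parsing, with assert() standing in for ut_a() and hypothetical function names:

#include <cassert>
#include <cstdio>
#include <cstring>

/* mirrors dict_get_db_name_len(): length of the part before '/' */
static std::size_t db_name_len(const char* name)
{
	const char* s = std::strchr(name, '/');
	assert(s != NULL);	/* internal names must contain '/' */
	return (std::size_t) (s - name);
}

/* mirrors dict_remove_db_name(): everything after the '/' */
static const char* remove_db_name(const char* name)
{
	const char* s = std::strchr(name, '/');
	assert(s != NULL);
	return s + 1;
}

int main()
{
	const char* n1 = "test/t1";
	const char* n2 = "test/t2";

	/* same database iff the names match through the '/' */
	assert(db_name_len(n1) == 4 && std::strncmp(n1, n2, 5) == 0);
	std::printf("table part: %s\n", remove_db_name(n1));	/* t1 */
	return 0;
}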
+@return table name */ +UNIV_INTERN +const char* +dict_remove_db_name( +/*================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ +{ + const char* s = strchr(name, '/'); + ut_a(s); + + return(s + 1); +} + +/********************************************************************//** +Get the database name length in a table name. +@return database name length */ +UNIV_INTERN +ulint +dict_get_db_name_len( +/*=================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ +{ + const char* s; + s = strchr(name, '/'); + ut_a(s); + return(s - name); +} + +/********************************************************************//** +Reserves the dictionary system mutex for MySQL. */ +UNIV_INTERN +void +dict_mutex_enter_for_mysql(void) +/*============================*/ +{ + mutex_enter(&(dict_sys->mutex)); +} + +/********************************************************************//** +Releases the dictionary system mutex for MySQL. */ +UNIV_INTERN +void +dict_mutex_exit_for_mysql(void) +/*===========================*/ +{ + mutex_exit(&(dict_sys->mutex)); +} + +/** Allocate and init a dict_table_t's stats latch. +This function must not be called concurrently on the same table object. +@param[in,out] table_void table whose stats latch to create */ +static +void +dict_table_stats_latch_alloc( + void* table_void) +{ + dict_table_t* table = static_cast<dict_table_t*>(table_void); + + table->stats_latch = new(std::nothrow) rw_lock_t; + + ut_a(table->stats_latch != NULL); + + rw_lock_create(dict_table_stats_key, table->stats_latch, + SYNC_INDEX_TREE); +} + +/** Deinit and free a dict_table_t's stats latch. +This function must not be called concurrently on the same table object. +@param[in,out] table table whose stats latch to free */ +static +void +dict_table_stats_latch_free( + dict_table_t* table) +{ + rw_lock_free(table->stats_latch); + delete table->stats_latch; +} + +/** Create a dict_table_t's stats latch or delay for lazy creation. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. +@param[in,out] table table whose stats latch to create +@param[in] enabled if false then the latch is disabled +and dict_table_stats_lock()/unlock() become noop on this table. */ + +void +dict_table_stats_latch_create( + dict_table_t* table, + bool enabled) +{ + if (!enabled) { + table->stats_latch = NULL; + table->stats_latch_created = os_once::DONE; + return; + } + +#ifdef HAVE_ATOMIC_BUILTINS + /* We create this lazily the first time it is used. */ + table->stats_latch = NULL; + table->stats_latch_created = os_once::NEVER_DONE; +#else /* HAVE_ATOMIC_BUILTINS */ + + dict_table_stats_latch_alloc(table); + + table->stats_latch_created = os_once::DONE; +#endif /* HAVE_ATOMIC_BUILTINS */ +} + +/** Destroy a dict_table_t's stats latch. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. +@param[in,out] table table whose stats latch to destroy */ + +void +dict_table_stats_latch_destroy( + dict_table_t* table) +{ + if (table->stats_latch_created == os_once::DONE + && table->stats_latch != NULL) { + + dict_table_stats_latch_free(table); + } +} + +/**********************************************************************//** +Lock the appropriate latch to protect a given table's statistics. 
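dict_table_stats_lock(), defined next, creates the latch allocated by dict_table_stats_latch_alloc() above lazily: the first locker runs the allocator through os_once::do_or_wait_for_done() while every other thread waits for the DONE state. A sketch of the standard-library analogue of that pattern (C++17 for std::shared_mutex; toy_table and the function names are invented):

#include <memory>
#include <mutex>
#include <shared_mutex>

struct toy_table {
	std::once_flag latch_created;	// plays the os_once state word
	std::unique_ptr<std::shared_mutex> stats_latch;
};

static void stats_latch_alloc(toy_table* t)
{
	t->stats_latch.reset(new std::shared_mutex);
}

static void stats_lock_shared(toy_table* t)
{
	/* like do_or_wait_for_done(): exactly one caller allocates,
	everyone else blocks until the allocation is finished */
	std::call_once(t->latch_created, stats_latch_alloc, t);
	t->stats_latch->lock_shared();
}

int main()
{
	toy_table t;
	stats_lock_shared(&t);		// first call creates the latch
	t.stats_latch->unlock_shared();
	return 0;
}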
*/ +UNIV_INTERN +void +dict_table_stats_lock( +/*==================*/ + dict_table_t* table, /*!< in: table */ + ulint latch_mode) /*!< in: RW_S_LATCH or RW_X_LATCH */ +{ + ut_ad(table != NULL); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + +#ifdef HAVE_ATOMIC_BUILTINS + os_once::do_or_wait_for_done( + &table->stats_latch_created, + dict_table_stats_latch_alloc, table); +#else /* HAVE_ATOMIC_BUILTINS */ + ut_ad(table->stats_latch_created == os_once::DONE); +#endif /* HAVE_ATOMIC_BUILTINS */ + + if (table->stats_latch == NULL) { + /* This is a dummy table object that is private in the current + thread and is not shared between multiple threads, thus we + skip any locking. */ + return; + } + + switch (latch_mode) { + case RW_S_LATCH: + rw_lock_s_lock(table->stats_latch); + break; + case RW_X_LATCH: + rw_lock_x_lock(table->stats_latch); + break; + case RW_NO_LATCH: + /* fall through */ + default: + ut_error; + } +} + +/**********************************************************************//** +Unlock the latch that has been locked by dict_table_stats_lock() */ +UNIV_INTERN +void +dict_table_stats_unlock( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ulint latch_mode) /*!< in: RW_S_LATCH or + RW_X_LATCH */ +{ + ut_ad(table != NULL); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + if (table->stats_latch == NULL) { + /* This is a dummy table object that is private in the current + thread and is not shared between multiple threads, thus we + skip any locking. */ + return; + } + + switch (latch_mode) { + case RW_S_LATCH: + rw_lock_s_unlock(table->stats_latch); + break; + case RW_X_LATCH: + rw_lock_x_unlock(table->stats_latch); + break; + case RW_NO_LATCH: + /* fall through */ + default: + ut_error; + } +} + +/**********************************************************************//** +Try to drop any indexes after an aborted index creation. +This can also be after a server kill during DROP INDEX. */ +static +void +dict_table_try_drop_aborted( +/*========================*/ + dict_table_t* table, /*!< in: table, or NULL if it + needs to be looked up again */ + table_id_t table_id, /*!< in: table identifier */ + ulint ref_count) /*!< in: expected table->n_ref_count */ +{ + trx_t* trx; + + trx = trx_allocate_for_background(); + trx->op_info = "try to drop any indexes after an aborted index creation"; + row_mysql_lock_data_dictionary(trx); + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + if (table == NULL) { + table = dict_table_open_on_id_low( + table_id, DICT_ERR_IGNORE_NONE); + } else { + ut_ad(table->id == table_id); + } + + if (table && table->n_ref_count == ref_count && table->drop_aborted) { + /* Silence a debug assertion in row_merge_drop_indexes(). */ + ut_d(table->n_ref_count++); + row_merge_drop_indexes(trx, table, TRUE); + ut_d(table->n_ref_count--); + ut_ad(table->n_ref_count == ref_count); + trx_commit_for_mysql(trx); + } + + row_mysql_unlock_data_dictionary(trx); + trx_free_for_background(trx); +} + +/**********************************************************************//** +When opening a table, +try to drop any indexes after an aborted index creation. +Release the dict_sys->mutex. 
*/ +static +void +dict_table_try_drop_aborted_and_mutex_exit( +/*=======================================*/ + dict_table_t* table, /*!< in: table (may be NULL) */ + ibool try_drop) /*!< in: FALSE if should try to + drop indexes whose online creation + was aborted */ +{ + if (try_drop + && table != NULL + && table->drop_aborted + && table->n_ref_count == 1 + && dict_table_get_first_index(table)) { + + /* Attempt to drop the indexes whose online creation + was aborted. */ + table_id_t table_id = table->id; + + mutex_exit(&dict_sys->mutex); + + dict_table_try_drop_aborted(table, table_id, 1); + } else { + mutex_exit(&dict_sys->mutex); + } +} + +/********************************************************************//** +Decrements the count of open handles to a table. */ +UNIV_INTERN +void +dict_table_close( +/*=============*/ + dict_table_t* table, /*!< in/out: table */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop) /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ +{ + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + + ut_ad(mutex_own(&dict_sys->mutex)); + ut_a(table->n_ref_count > 0); + + --table->n_ref_count; + + /* Force persistent stats re-read upon next open of the table + so that FLUSH TABLE can be used to forcibly fetch stats from disk + if they have been manually modified. We reset table->stat_initialized + only if table reference count is 0 because we do not want too frequent + stats re-reads (e.g. in other cases than FLUSH TABLE). */ + if (strchr(table->name, '/') != NULL + && table->n_ref_count == 0 + && dict_stats_is_persistent_enabled(table)) { + + dict_stats_deinit(table); + } + + MONITOR_DEC(MONITOR_TABLE_REFERENCE); + + ut_ad(dict_lru_validate()); + +#ifdef UNIV_DEBUG + if (table->can_be_evicted) { + ut_ad(dict_lru_find_table(table)); + } else { + ut_ad(dict_non_lru_find_table(table)); + } +#endif /* UNIV_DEBUG */ + + if (!dict_locked) { + table_id_t table_id = table->id; + ibool drop_aborted; + + drop_aborted = try_drop + && table->drop_aborted + && table->n_ref_count == 1 + && dict_table_get_first_index(table); + + mutex_exit(&dict_sys->mutex); + + if (drop_aborted) { + dict_table_try_drop_aborted(NULL, table_id, 0); + } + } +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Returns a column's name. +@return column name. NOTE: not guaranteed to stay valid if table is +modified in any way (columns added, etc.). */ +UNIV_INTERN +const char* +dict_table_get_col_name( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + ulint col_nr) /*!< in: column number */ +{ + ulint i; + const char* s; + + ut_ad(table); + ut_ad(col_nr < table->n_def); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + s = table->col_names; + if (s) { + for (i = 0; i < col_nr; i++) { + s += strlen(s) + 1; + } + } + + return(s); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Acquire the autoinc lock. */ +UNIV_INTERN +void +dict_table_autoinc_lock( +/*====================*/ + dict_table_t* table) /*!< in/out: table */ +{ + mutex_enter(&table->autoinc_mutex); +} + +/********************************************************************//** +Unconditionally set the autoinc counter. 
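dict_table_get_col_name() above walks table->col_names, which packs all column names back to back as NUL-terminated strings. A standalone demo of the same walk (nth_name is a hypothetical stand-in):

#include <cassert>
#include <cstdio>
#include <cstring>

static const char* nth_name(const char* packed, unsigned n)
{
	const char* s = packed;
	for (unsigned i = 0; i < n; i++) {
		s += std::strlen(s) + 1;	/* hop over one name + NUL */
	}
	return s;
}

int main()
{
	/* "id", "name", "price" packed the way col_names stores them */
	const char packed[] = "id\0name\0price";
	assert(std::strcmp(nth_name(packed, 2), "price") == 0);
	std::printf("column 1 is %s\n", nth_name(packed, 1));	/* name */
	return 0;
}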
*/ +UNIV_INTERN +void +dict_table_autoinc_initialize( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + ib_uint64_t value) /*!< in: next value to assign to a row */ +{ + ut_ad(mutex_own(&table->autoinc_mutex)); + + table->autoinc = value; +} + +/************************************************************************ +Get all the FTS indexes on a table. +@return number of FTS indexes */ +UNIV_INTERN +ulint +dict_table_get_all_fts_indexes( +/*===========================*/ + dict_table_t* table, /*!< in: table */ + ib_vector_t* indexes) /*!< out: all FTS indexes on this + table */ +{ + dict_index_t* index; + + ut_a(ib_vector_size(indexes) == 0); + + for (index = dict_table_get_first_index(table); + index; + index = dict_table_get_next_index(index)) { + + if (index->type == DICT_FTS) { + ib_vector_push(indexes, &index); + } + } + + return(ib_vector_size(indexes)); +} + +/********************************************************************//** +Reads the next autoinc value (== autoinc counter value), 0 if not yet +initialized. +@return value for a new row, or 0 */ +UNIV_INTERN +ib_uint64_t +dict_table_autoinc_read( +/*====================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(mutex_own(&table->autoinc_mutex)); + + return(table->autoinc); +} + +/********************************************************************//** +Updates the autoinc counter if the value supplied is greater than the +current value. */ +UNIV_INTERN +void +dict_table_autoinc_update_if_greater( +/*=================================*/ + + dict_table_t* table, /*!< in/out: table */ + ib_uint64_t value) /*!< in: value which was assigned to a row */ +{ + ut_ad(mutex_own(&table->autoinc_mutex)); + + if (value > table->autoinc) { + + table->autoinc = value; + } +} + +/********************************************************************//** +Release the autoinc lock. */ +UNIV_INTERN +void +dict_table_autoinc_unlock( +/*======================*/ + dict_table_t* table) /*!< in/out: table */ +{ + mutex_exit(&table->autoinc_mutex); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INTERN +ulint +dict_index_get_nth_col_or_prefix_pos( +/*=================================*/ + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + ibool inc_prefix) /*!< in: TRUE=consider + column prefixes too */ +{ + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + col = dict_table_get_nth_col(index->table, n); + + if (dict_index_is_clust(index)) { + + return(dict_col_get_clust_pos(col, index)); + } + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col + && (inc_prefix || field->prefix_len == 0)) { + + return(pos); + } + } + + return(ULINT_UNDEFINED); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Returns TRUE if the index contains a column or a prefix of that column. 
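The autoinc helpers above and below form a small protocol: every access happens between dict_table_autoinc_lock() and dict_table_autoinc_unlock(), a read of 0 means "not yet initialized", and concurrent writers only ever ratchet the counter upwards. A self-contained toy version, with std::mutex in place of the InnoDB mutex and invented names:

#include <cassert>
#include <cstdint>
#include <mutex>

struct toy_table {
	std::mutex autoinc_mutex;
	uint64_t   autoinc = 0;		// 0 == not initialized
};

static uint64_t autoinc_read(toy_table& t)
{
	std::lock_guard<std::mutex> g(t.autoinc_mutex);
	return t.autoinc;
}

static void autoinc_update_if_greater(toy_table& t, uint64_t value)
{
	std::lock_guard<std::mutex> g(t.autoinc_mutex);
	if (value > t.autoinc) {
		t.autoinc = value;	/* never moves backwards */
	}
}

int main()
{
	toy_table t;
	autoinc_update_if_greater(t, 10);
	autoinc_update_if_greater(t, 7);	/* ignored: 7 < 10 */
	assert(autoinc_read(t) == 10);
	return 0;
}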
+@return TRUE if contains the column or its prefix */ +UNIV_INTERN +ibool +dict_index_contains_col_or_prefix( +/*==============================*/ + const dict_index_t* index, /*!< in: index */ + ulint n) /*!< in: column number */ +{ + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + if (dict_index_is_clust(index)) { + + return(TRUE); + } + + col = dict_table_get_nth_col(index->table, n); + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + + return(TRUE); + } + } + + return(FALSE); +} + +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INTERN +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + const dict_index_t* index, /*!< in: index from which to search */ + const dict_index_t* index2, /*!< in: index */ + ulint n) /*!< in: field number in index2 */ +{ + const dict_field_t* field; + const dict_field_t* field2; + ulint n_fields; + ulint pos; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + field2 = dict_index_get_nth_field(index2, n); + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (field->col == field2->col + && (field->prefix_len == 0 + || (field->prefix_len >= field2->prefix_len + && field2->prefix_len != 0))) { + + return(pos); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Returns a table object based on table id. +@return table, NULL if does not exist */ +UNIV_INTERN +dict_table_t* +dict_table_open_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + dict_table_op_t table_op) /*!< in: operation to perform */ +{ + dict_table_t* table; + + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + + ut_ad(mutex_own(&dict_sys->mutex)); + + table = dict_table_open_on_id_low( + table_id, + table_op == DICT_TABLE_OP_LOAD_TABLESPACE + ? DICT_ERR_IGNORE_RECOVER_LOCK + : DICT_ERR_IGNORE_NONE); + + if (table != NULL) { + + if (table->can_be_evicted) { + dict_move_to_mru(table); + } + + ++table->n_ref_count; + + MONITOR_INC(MONITOR_TABLE_REFERENCE); + } + + if (!dict_locked) { + dict_table_try_drop_aborted_and_mutex_exit( + table, table_op == DICT_TABLE_OP_DROP_ORPHAN); + } + + return(table); +} + +/********************************************************************//** +Looks for column n position in the clustered index. +@return position in internal representation of the clustered index */ +UNIV_INTERN +ulint +dict_table_get_nth_col_pos( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ +{ + return(dict_index_get_nth_col_pos(dict_table_get_first_index(table), + n)); +} + +/********************************************************************//** +Checks if a column is in the ordering columns of the clustered index of a +table. 
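The matching rule in dict_index_get_nth_field_pos() above reduces to a small predicate: a field can serve a requested field on the same column iff it covers the complete column, or its prefix is at least as long as the (non-zero) requested prefix. As a checked sketch (covers is a hypothetical name; prefix length 0 means "whole column", as in dict_field_t):

#include <cassert>

static bool covers(unsigned have_prefix, unsigned want_prefix)
{
	return have_prefix == 0			/* whole column */
	    || (want_prefix != 0 && have_prefix >= want_prefix);
}

int main()
{
	assert(covers(0, 0));	/* full column serves full column */
	assert(covers(0, 5));	/* full column serves any prefix */
	assert(covers(10, 5));	/* col(10) prefix can rebuild col(5) */
	assert(!covers(5, 10));	/* col(5) cannot rebuild col(10) */
	assert(!covers(5, 0));	/* a prefix never serves the full column */
	return 0;
}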
Column prefixes are treated like whole columns. +@return TRUE if the column, or its prefix, is in the clustered key */ +UNIV_INTERN +ibool +dict_table_col_in_clustered_key( +/*============================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ +{ + const dict_index_t* index; + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(table); + + col = dict_table_get_nth_col(table, n); + + index = dict_table_get_first_index(table); + + n_fields = dict_index_get_n_unique(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + + return(TRUE); + } + } + + return(FALSE); +} + +/**********************************************************************//** +Inits the data dictionary module. */ +UNIV_INTERN +void +dict_init(void) +/*===========*/ +{ + dict_sys = static_cast<dict_sys_t*>(mem_zalloc(sizeof(*dict_sys))); + + mutex_create(dict_sys_mutex_key, &dict_sys->mutex, SYNC_DICT); + + dict_sys->table_hash = hash_create(buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH + * UNIV_WORD_SIZE)); + dict_sys->table_id_hash = hash_create(buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH + * UNIV_WORD_SIZE)); + rw_lock_create(dict_operation_lock_key, + &dict_operation_lock, SYNC_DICT_OPERATION); + + if (!srv_read_only_mode) { + dict_foreign_err_file = os_file_create_tmpfile(); + ut_a(dict_foreign_err_file); + + mutex_create(dict_foreign_err_mutex_key, + &dict_foreign_err_mutex, SYNC_NO_ORDER_CHECK); + } +} + +/**********************************************************************//** +Move to the most recently used segment of the LRU list. */ +UNIV_INTERN +void +dict_move_to_mru( +/*=============*/ + dict_table_t* table) /*!< in: table to move to MRU */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(dict_lru_validate()); + ut_ad(dict_lru_find_table(table)); + + ut_a(table->can_be_evicted); + + UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); + + UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); + + ut_ad(dict_lru_validate()); +} + +/**********************************************************************//** +Returns a table object and increment its open handle count. +NOTE! This is a high-level function to be used mainly from outside the +'dict' module. Inside this directory dict_table_get_low +is usually the appropriate function. +@return table, NULL if does not exist */ +UNIV_INTERN +dict_table_t* +dict_table_open_on_name( +/*====================*/ + const char* table_name, /*!< in: table name */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop, /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ + dict_err_ignore_t + ignore_err) /*!< in: error to be ignored when + loading a table definition */ +{ + dict_table_t* table; + + if (!dict_locked) { + mutex_enter(&(dict_sys->mutex)); + } + + ut_ad(table_name); + ut_ad(mutex_own(&dict_sys->mutex)); + + table = dict_table_check_if_in_cache_low(table_name); + + if (table == NULL) { + table = dict_load_table(table_name, TRUE, ignore_err); + } + + ut_ad(!table || table->cached); + + if (table != NULL) { + + /* If table is corrupted, return NULL */ + if (ignore_err == DICT_ERR_IGNORE_NONE + && table->corrupted) { + + /* Make life easy for drop table. 
*/ + if (table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(table); + } + + if (!dict_locked) { + mutex_exit(&dict_sys->mutex); + } + + ut_print_timestamp(stderr); + + fprintf(stderr, " InnoDB: table "); + ut_print_name(stderr, NULL, TRUE, table->name); + fprintf(stderr, "is corrupted. Please drop the table " + "and recreate\n"); + + return(NULL); + } + + if (table->can_be_evicted) { + dict_move_to_mru(table); + } + + ++table->n_ref_count; + + MONITOR_INC(MONITOR_TABLE_REFERENCE); + } + + ut_ad(dict_lru_validate()); + + if (!dict_locked) { + dict_table_try_drop_aborted_and_mutex_exit(table, try_drop); + } + + return(table); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Adds system columns to a table object. */ +UNIV_INTERN +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ +{ + ut_ad(table); + ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->cached); + + /* NOTE: the system columns MUST be added in the following order + (so that they can be indexed by the numerical value of DATA_ROW_ID, + etc.) and as the last columns of the table memory object. + The clustered index will not always physically contain all + system columns. */ + + dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS, + DATA_ROW_ID | DATA_NOT_NULL, + DATA_ROW_ID_LEN); +#if DATA_ROW_ID != 0 +#error "DATA_ROW_ID != 0" +#endif + dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS, + DATA_TRX_ID | DATA_NOT_NULL, + DATA_TRX_ID_LEN); +#if DATA_TRX_ID != 1 +#error "DATA_TRX_ID != 1" +#endif + dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS, + DATA_ROLL_PTR | DATA_NOT_NULL, + DATA_ROLL_PTR_LEN); +#if DATA_ROLL_PTR != 2 +#error "DATA_ROLL_PTR != 2" +#endif + + /* This check reminds that if a new system column is added to + the program, it should be dealt with here */ +#if DATA_N_SYS_COLS != 3 +#error "DATA_N_SYS_COLS != 3" +#endif +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Adds a table object to the dictionary cache. */ +UNIV_INTERN +void +dict_table_add_to_cache( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ibool can_be_evicted, /*!< in: TRUE if can be evicted */ + mem_heap_t* heap) /*!< in: temporary heap */ +{ + ulint fold; + ulint id_fold; + ulint i; + ulint row_len; + + ut_ad(dict_lru_validate()); + + /* The lower limit for what we consider a "big" row */ +#define BIG_ROW_SIZE 1024 + + ut_ad(mutex_own(&(dict_sys->mutex))); + + dict_table_add_system_columns(table, heap); + + table->cached = TRUE; + + fold = ut_fold_string(table->name); + id_fold = ut_fold_ull(table->id); + + row_len = 0; + for (i = 0; i < table->n_def; i++) { + ulint col_len = dict_col_get_max_size( + dict_table_get_nth_col(table, i)); + + row_len += col_len; + + /* If we have a single unbounded field, or several gigantic + fields, mark the maximum row size as BIG_ROW_SIZE. 
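The loop below implements exactly the heuristic just described: sum the per-column maxima and stop early once either the running total or any single column reaches the 1024-byte threshold. The same computation extracted into a standalone function (is_big_row is a hypothetical name):

#include <cassert>
#include <cstddef>
#include <vector>

static const unsigned BIG_ROW_SIZE = 1024;	/* same threshold */

static bool is_big_row(const std::vector<unsigned>& max_col_sizes)
{
	unsigned row_len = 0;

	for (std::size_t i = 0; i < max_col_sizes.size(); i++) {
		unsigned col_len = max_col_sizes[i];
		row_len += col_len;

		if (row_len >= BIG_ROW_SIZE || col_len >= BIG_ROW_SIZE) {
			return true;	/* one huge field is enough */
		}
	}
	return false;
}

int main()
{
	unsigned blob_row[] = {4, 2000};	/* one unbounded field */
	unsigned wide_row[] = {600, 600};	/* several large fields */
	unsigned small_row[] = {4, 8, 255};

	assert(is_big_row(std::vector<unsigned>(blob_row, blob_row + 2)));
	assert(is_big_row(std::vector<unsigned>(wide_row, wide_row + 2)));
	assert(!is_big_row(std::vector<unsigned>(small_row, small_row + 3)));
	return 0;
}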
*/ + if (row_len >= BIG_ROW_SIZE || col_len >= BIG_ROW_SIZE) { + row_len = BIG_ROW_SIZE; + + break; + } + } + + table->big_rows = row_len >= BIG_ROW_SIZE; + + /* Look for a table with the same name: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(name_hash, dict_sys->table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + ut_strcmp(table2->name, table->name) == 0); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different name */ + HASH_SEARCH_ALL(name_hash, dict_sys->table_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + } + + /* Look for a table with the same id: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(id_hash, dict_sys->table_id_hash, id_fold, + dict_table_t*, table2, ut_ad(table2->cached), + table2->id == table->id); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different id */ + HASH_SEARCH_ALL(id_hash, dict_sys->table_id_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + } + + /* Add table to hash table of tables */ + HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, + table); + + /* Add table to hash table of tables based on table id */ + HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold, + table); + + table->can_be_evicted = can_be_evicted; + + if (table->can_be_evicted) { + UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); + } else { + UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_non_LRU, table); + } + + ut_ad(dict_lru_validate()); + + dict_sys->size += mem_heap_get_size(table->heap) + + strlen(table->name) + 1; +} + +/**********************************************************************//** +Test whether a table can be evicted from the LRU cache. +@return TRUE if table can be evicted. */ +static +ibool +dict_table_can_be_evicted( +/*======================*/ + const dict_table_t* table) /*!< in: table to test */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_a(table->can_be_evicted); + ut_a(table->foreign_set.empty()); + ut_a(table->referenced_set.empty()); + + if (table->n_ref_count == 0) { + dict_index_t* index; + + /* The transaction commit and rollback are called from + outside the handler interface. This means that there is + a window where the table->n_ref_count can be zero but + the table instance is in "use". */ + + if (lock_table_has_locks(table)) { + return(FALSE); + } + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + btr_search_t* info = btr_search_get_info(index); + + /* We are not allowed to free the in-memory index + struct dict_index_t until all entries in the adaptive + hash index that point to any of the page belonging to + his b-tree index are dropped. This is so because + dropping of these entries require access to + dict_index_t struct. To avoid such scenario we keep + a count of number of such pages in the search_info and + only free the dict_index_t struct when this count + drops to zero. 
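Condensed, the eviction test in dict_table_can_be_evicted() above is: no open handles, no record locks on the table, and no adaptive hash index pages still referencing any of its indexes. A sketch of that predicate over a toy table type (all names invented, C++11):

#include <cassert>
#include <vector>

struct toy_table {
	unsigned n_ref_count;
	bool     has_locks;		// lock_table_has_locks() result
	std::vector<unsigned> ahi_page_refs;	// one count per index
};

static bool can_be_evicted(const toy_table& t)
{
	if (t.n_ref_count != 0 || t.has_locks) {
		return false;
	}
	for (unsigned refs : t.ahi_page_refs) {
		if (refs > 0) {
			return false;	/* AHI still points into a B-tree */
		}
	}
	return true;
}

int main()
{
	assert(can_be_evicted({0, false, {0, 0}}));
	assert(!can_be_evicted({1, false, {0}}));	/* still open */
	assert(!can_be_evicted({0, false, {3}}));	/* AHI pages remain */
	return 0;
}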
+ + See also: dict_index_remove_from_cache_low() */ + + if (btr_search_info_get_ref_count(info, index) > 0) { + return(FALSE); + } + } + + return(TRUE); + } + + return(FALSE); +} + +/**********************************************************************//** +Make room in the table cache by evicting an unused table. The unused table +should not be part of FK relationship and currently not used in any user +transaction. There is no guarantee that it will remove a table. +@return number of tables evicted. If the number of tables in the dict_LRU +is less than max_tables it will not do anything. */ +UNIV_INTERN +ulint +dict_make_room_in_cache( +/*====================*/ + ulint max_tables, /*!< in: max tables allowed in cache */ + ulint pct_check) /*!< in: max percent to check */ +{ + ulint i; + ulint len; + dict_table_t* table; + ulint check_up_to; + ulint n_evicted = 0; + + ut_a(pct_check > 0); + ut_a(pct_check <= 100); + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(dict_lru_validate()); + + i = len = UT_LIST_GET_LEN(dict_sys->table_LRU); + + if (len < max_tables) { + return(0); + } + + check_up_to = len - ((len * pct_check) / 100); + + /* Check for overflow */ + ut_a(i == 0 || check_up_to <= i); + + /* Find a suitable candidate to evict from the cache. Don't scan the + entire LRU list. Only scan pct_check list entries. */ + + for (table = UT_LIST_GET_LAST(dict_sys->table_LRU); + table != NULL + && i > check_up_to + && (len - n_evicted) > max_tables; + --i) { + + dict_table_t* prev_table; + + prev_table = UT_LIST_GET_PREV(table_LRU, table); + + if (dict_table_can_be_evicted(table)) { + + dict_table_remove_from_cache_low(table, TRUE); + + ++n_evicted; + } + + table = prev_table; + } + + return(n_evicted); +} + +/**********************************************************************//** +Move a table to the non-LRU list from the LRU list. */ +UNIV_INTERN +void +dict_table_move_from_lru_to_non_lru( +/*================================*/ + dict_table_t* table) /*!< in: table to move from LRU to non-LRU */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(dict_lru_find_table(table)); + + ut_a(table->can_be_evicted); + + UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); + + UT_LIST_ADD_LAST(table_LRU, dict_sys->table_non_LRU, table); + + table->can_be_evicted = FALSE; +} + +/**********************************************************************//** +Move a table to the LRU list from the non-LRU list. */ +UNIV_INTERN +void +dict_table_move_from_non_lru_to_lru( +/*================================*/ + dict_table_t* table) /*!< in: table to move from non-LRU to LRU */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(dict_non_lru_find_table(table)); + + ut_a(!table->can_be_evicted); + + UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table); + + UT_LIST_ADD_LAST(table_LRU, dict_sys->table_LRU, table); + + table->can_be_evicted = TRUE; +} + +/**********************************************************************//** +Looks for an index with the given id given a table instance. 
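dict_make_room_in_cache() above bounds its work in two ways: it scans at most pct_check percent of the LRU list, starting from the cold end, and it stops as soon as the list fits under max_tables again. A deliberately simplified standalone model: it evicts every scanned entry unconditionally, where the real code first asks dict_table_can_be_evicted():

#include <cassert>
#include <list>

static unsigned make_room(std::list<const char*>& lru,
			  unsigned max_tables, unsigned pct_check)
{
	unsigned len = lru.size();
	if (len < max_tables) {
		return 0;		/* cache already fits */
	}

	unsigned scan = (len * pct_check) / 100;	/* bounded window */
	unsigned n_evicted = 0;

	while (scan-- > 0 && lru.size() > max_tables) {
		lru.pop_back();		/* back of the list = cold end */
		++n_evicted;
	}
	return n_evicted;
}

int main()
{
	std::list<const char*> lru = {"t1", "t2", "t3", "t4"};
	assert(make_room(lru, 2, 100) == 2 && lru.size() == 2);
	return 0;
}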
+@return index or NULL */ +static +dict_index_t* +dict_table_find_index_on_id( +/*========================*/ + const dict_table_t* table, /*!< in: table instance */ + index_id_t id) /*!< in: index id */ +{ + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (id == index->id) { + /* Found */ + + return(index); + } + } + + return(NULL); +} + +/**********************************************************************//** +Looks for an index with the given id. NOTE that we do not reserve +the dictionary mutex: this function is for emergency purposes like +printing info of a corrupt database page! +@return index or NULL if not found in cache */ +UNIV_INTERN +dict_index_t* +dict_index_find_on_id_low( +/*======================*/ + index_id_t id) /*!< in: index id */ +{ + dict_table_t* table; + + /* This can happen if the system tablespace is the wrong page size */ + if (dict_sys == NULL) { + return(NULL); + } + + for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + dict_index_t* index = dict_table_find_index_on_id(table, id); + + if (index != NULL) { + return(index); + } + } + + for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + dict_index_t* index = dict_table_find_index_on_id(table, id); + + if (index != NULL) { + return(index); + } + } + + return(NULL); +} + +/** Function object to remove a foreign key constraint from the +referenced_set of the referenced table. The foreign key object is +also removed from the dictionary cache. The foreign key constraint +is not removed from the foreign_set of the table containing the +constraint. */ +struct dict_foreign_remove_partial +{ + void operator()(dict_foreign_t* foreign) { + dict_table_t* table = foreign->referenced_table; + if (table != NULL) { + table->referenced_set.erase(foreign); + } + dict_foreign_free(foreign); + } +}; + +/**********************************************************************//** +Renames a table object. 
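dict_foreign_remove_partial above is applied with std::for_each over table->foreign_set: it unhooks each constraint from the referenced table's set before freeing it, and the caller then clears the owning set. The same shape in self-contained form (toy types; delete stands in for dict_foreign_free()):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <set>

struct constraint;
struct table {
	std::set<constraint*> referenced_set;
};
struct constraint {
	table* referenced_table;
};

struct remove_partial {
	void operator()(constraint* c) const {
		if (c->referenced_table != NULL) {
			c->referenced_table->referenced_set.erase(c);
		}
		delete c;	/* dict_foreign_free() in the original */
	}
};

int main()
{
	table parent;
	std::set<constraint*> foreign_set;	/* plays foreign_set */
	constraint* c = new constraint;
	c->referenced_table = &parent;
	parent.referenced_set.insert(c);
	foreign_set.insert(c);

	std::for_each(foreign_set.begin(), foreign_set.end(),
		      remove_partial());
	foreign_set.clear();		/* as the real caller does next */
	assert(parent.referenced_set.empty());
	return 0;
}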
+@return TRUE if success */ +UNIV_INTERN +dberr_t +dict_table_rename_in_cache( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char* new_name, /*!< in: new name */ + ibool rename_also_foreigns)/*!< in: in ALTER TABLE we want + to preserve the original table name + in constraints which reference it */ +{ + dict_foreign_t* foreign; + dict_index_t* index; + ulint fold; + char old_name[MAX_FULL_NAME_LEN + 1]; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* store the old/current name to an automatic variable */ + if (strlen(table->name) + 1 <= sizeof(old_name)) { + memcpy(old_name, table->name, strlen(table->name) + 1); + } else { + ut_print_timestamp(stderr); + fprintf(stderr, "InnoDB: too long table name: '%s', " + "max length is %d\n", table->name, + MAX_FULL_NAME_LEN); + ut_error; + } + + fold = ut_fold_string(new_name); + + /* Look for a table with the same name: error if such exists */ + dict_table_t* table2; + HASH_SEARCH(name_hash, dict_sys->table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + (ut_strcmp(table2->name, new_name) == 0)); + DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure", + if (table2 == NULL) { + table2 = (dict_table_t*) -1; + } ); + if (table2) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot rename table '%s' to '%s' since the " + "dictionary cache already contains '%s'.", + old_name, new_name, new_name); + return(DB_ERROR); + } + + /* If the table is stored in a single-table tablespace, rename the + .ibd file and rebuild the .isl file if needed. */ + + if (dict_table_is_discarded(table)) { + os_file_type_t type; + ibool exists; + char* filepath; + + ut_ad(table->space != TRX_SYS_SPACE); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + + dict_get_and_save_data_dir_path(table, true); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + + fil_delete_tablespace(table->space, BUF_REMOVE_ALL_NO_WRITE); + + /* Delete any temp file hanging around. */ + if (os_file_status(filepath, &exists, &type) + && exists + && !os_file_delete_if_exists(innodb_file_temp_key, + filepath)) { + + ib_logf(IB_LOG_LEVEL_INFO, + "Delete of %s failed.", filepath); + } + + mem_free(filepath); + + } else if (table->space != TRX_SYS_SPACE) { + char* new_path = NULL; + + if (table->dir_path_of_temp_table != NULL) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: trying to rename a" + " TEMPORARY TABLE ", stderr); + ut_print_name(stderr, NULL, TRUE, old_name); + fputs(" (", stderr); + ut_print_filename(stderr, + table->dir_path_of_temp_table); + fputs(" )\n", stderr); + return(DB_ERROR); + + } else if (DICT_TF_HAS_DATA_DIR(table->flags)) { + char* old_path; + + old_path = fil_space_get_first_path(table->space); + + new_path = os_file_make_new_pathname( + old_path, new_name); + + mem_free(old_path); + + dberr_t err = fil_create_link_file( + new_name, new_path); + + if (err != DB_SUCCESS) { + mem_free(new_path); + return(DB_TABLESPACE_EXISTS); + } + } + + ibool success = fil_rename_tablespace( + old_name, table->space, new_name, new_path); + + /* If the tablespace is remote, a new .isl file was created + If success, delete the old one. If not, delete the new one. */ + if (new_path) { + + mem_free(new_path); + fil_delete_link_file(success ? 
old_name : new_name); + } + + if (!success) { + return(DB_ERROR); + } + } + + /* Remove table from the hash tables of tables */ + HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash, + ut_fold_string(old_name), table); + + if (strlen(new_name) > strlen(table->name)) { + /* We allocate MAX_FULL_NAME_LEN + 1 bytes here to avoid + memory fragmentation, we assume a repeated calls of + ut_realloc() with the same size do not cause fragmentation */ + ut_a(strlen(new_name) <= MAX_FULL_NAME_LEN); + + table->name = static_cast<char*>( + ut_realloc(table->name, MAX_FULL_NAME_LEN + 1)); + } + memcpy(table->name, new_name, strlen(new_name) + 1); + + /* Add table to hash table of tables */ + HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, + table); + + dict_sys->size += strlen(new_name) - strlen(old_name); + ut_a(dict_sys->size > 0); + + /* Update the table_name field in indexes */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + index->table_name = table->name; + } + + if (!rename_also_foreigns) { + /* In ALTER TABLE we think of the rename table operation + in the direction table -> temporary table (#sql...) + as dropping the table with the old name and creating + a new with the new name. Thus we kind of drop the + constraints from the dictionary cache here. The foreign key + constraints will be inherited to the new table from the + system tables through a call of dict_load_foreigns. */ + + /* Remove the foreign constraints from the cache */ + std::for_each(table->foreign_set.begin(), + table->foreign_set.end(), + dict_foreign_remove_partial()); + table->foreign_set.clear(); + + /* Reset table field in referencing constraints */ + for (dict_foreign_set::iterator it + = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + + } + + /* Make the set of referencing constraints empty */ + table->referenced_set.clear(); + + return(DB_SUCCESS); + } + + /* Update the table name fields in foreign constraints, and update also + the constraint id of new format >= 4.0.18 constraints. Note that at + this point we have already changed table->name to the new name. */ + + dict_foreign_set fk_set; + + for (;;) { + + dict_foreign_set::iterator it + = table->foreign_set.begin(); + + if (it == table->foreign_set.end()) { + break; + } + + foreign = *it; + + if (foreign->referenced_table) { + foreign->referenced_table->referenced_set.erase(foreign); + } + + if (ut_strlen(foreign->foreign_table_name) + < ut_strlen(table->name)) { + /* Allocate a longer name buffer; + TODO: store buf len to save memory */ + + foreign->foreign_table_name = mem_heap_strdup( + foreign->heap, table->name); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + } else { + strcpy(foreign->foreign_table_name, table->name); + dict_mem_foreign_table_name_lookup_set(foreign, FALSE); + } + if (strchr(foreign->id, '/')) { + /* This is a >= 4.0.18 format id */ + + ulint db_len; + char* old_id; + char old_name_cs_filename[MAX_TABLE_NAME_LEN+20]; + uint errors = 0; + + /* All table names are internally stored in charset + my_charset_filename (except the temp tables and the + partition identifier suffix in partition tables). The + foreign key constraint names are internally stored + in UTF-8 charset. The variable fkid here is used + to store foreign key constraint name in charset + my_charset_filename for comparison further below. 
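The comparison performed just below recognises a generated constraint id as "<table name>" + "_ibfk_" + suffix, using memcmp so that only the prefix has to match. Extracted as a standalone predicate (is_generated_fk_id is a hypothetical name; the charset conversion is omitted):

#include <cassert>
#include <cstring>

static bool is_generated_fk_id(const char* id, const char* table_name)
{
	static const char ibfk[] = "_ibfk_";
	std::size_t tlen = std::strlen(table_name);

	return std::strlen(id) > tlen + sizeof(ibfk) - 1
	    && std::memcmp(id, table_name, tlen) == 0
	    && std::memcmp(id + tlen, ibfk, sizeof(ibfk) - 1) == 0;
}

int main()
{
	assert(is_generated_fk_id("test/t1_ibfk_2", "test/t1"));
	assert(!is_generated_fk_id("test/t1_fk_custom", "test/t1"));
	/* a longer table name is rejected because the byte after the
	prefix must start "_ibfk_" */
	assert(!is_generated_fk_id("test/t10_ibfk_1", "test/t1"));
	return 0;
}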
*/ + char fkid[MAX_TABLE_NAME_LEN+20]; + ibool on_tmp = FALSE; + + /* The old table name in my_charset_filename is stored + in old_name_cs_filename */ + + strncpy(old_name_cs_filename, old_name, + MAX_TABLE_NAME_LEN); + if (strstr(old_name, TEMP_TABLE_PATH_PREFIX) == NULL) { + + innobase_convert_to_system_charset( + strchr(old_name_cs_filename, '/') + 1, + strchr(old_name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* There has been an error to convert + old table into UTF-8. This probably + means that the old table name is + actually in UTF-8. */ + innobase_convert_to_filename_charset( + strchr(old_name_cs_filename, + '/') + 1, + strchr(old_name, '/') + 1, + MAX_TABLE_NAME_LEN); + } else { + /* Old name already in + my_charset_filename */ + strncpy(old_name_cs_filename, old_name, + MAX_TABLE_NAME_LEN); + } + } + + strncpy(fkid, foreign->id, MAX_TABLE_NAME_LEN); + + if (strstr(fkid, TEMP_TABLE_PATH_PREFIX) == NULL) { + innobase_convert_to_filename_charset( + strchr(fkid, '/') + 1, + strchr(foreign->id, '/') + 1, + MAX_TABLE_NAME_LEN+20); + } else { + on_tmp = TRUE; + } + + old_id = mem_strdup(foreign->id); + + if (ut_strlen(fkid) > ut_strlen(old_name_cs_filename) + + ((sizeof dict_ibfk) - 1) + && !memcmp(fkid, old_name_cs_filename, + ut_strlen(old_name_cs_filename)) + && !memcmp(fkid + ut_strlen(old_name_cs_filename), + dict_ibfk, (sizeof dict_ibfk) - 1)) { + + /* This is a generated >= 4.0.18 format id */ + + char table_name[MAX_TABLE_NAME_LEN] = ""; + uint errors = 0; + + if (strlen(table->name) > strlen(old_name)) { + foreign->id = static_cast<char*>( + mem_heap_alloc( + foreign->heap, + strlen(table->name) + + strlen(old_id) + 1)); + } + + /* Convert the table name to UTF-8 */ + strncpy(table_name, table->name, + MAX_TABLE_NAME_LEN); + innobase_convert_to_system_charset( + strchr(table_name, '/') + 1, + strchr(table->name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted + from charset my_charset_filename to + UTF-8. This means that the table name + is already in UTF-8 (#mysql#50). 
*/ + strncpy(table_name, table->name, + MAX_TABLE_NAME_LEN); + } + + /* Replace the prefix 'databasename/tablename' + with the new names */ + strcpy(foreign->id, table_name); + if (on_tmp) { + strcat(foreign->id, + old_id + ut_strlen(old_name)); + } else { + sprintf(strchr(foreign->id, '/') + 1, + "%s%s", + strchr(table_name, '/') +1, + strstr(old_id, "_ibfk_") ); + } + + } else { + /* This is a >= 4.0.18 format id where the user + gave the id name */ + db_len = dict_get_db_name_len(table->name) + 1; + + if (dict_get_db_name_len(table->name) + > dict_get_db_name_len(foreign->id)) { + + foreign->id = static_cast<char*>( + mem_heap_alloc( + foreign->heap, + db_len + strlen(old_id) + 1)); + } + + /* Replace the database prefix in id with the + one from table->name */ + + ut_memcpy(foreign->id, table->name, db_len); + + strcpy(foreign->id + db_len, + dict_remove_db_name(old_id)); + } + + mem_free(old_id); + } + + table->foreign_set.erase(it); + fk_set.insert(foreign); + + if (foreign->referenced_table) { + foreign->referenced_table->referenced_set.insert(foreign); + } + } + + ut_a(table->foreign_set.empty()); + table->foreign_set.swap(fk_set); + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (ut_strlen(foreign->referenced_table_name) + < ut_strlen(table->name)) { + /* Allocate a longer name buffer; + TODO: store buf len to save memory */ + + foreign->referenced_table_name = mem_heap_strdup( + foreign->heap, table->name); + + dict_mem_referenced_table_name_lookup_set( + foreign, TRUE); + } else { + /* Use the same buffer */ + strcpy(foreign->referenced_table_name, table->name); + + dict_mem_referenced_table_name_lookup_set( + foreign, FALSE); + } + } + + return(DB_SUCCESS); +} + +/**********************************************************************//** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. */ +UNIV_INTERN +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /*!< in/out: table object already in cache */ + table_id_t new_id) /*!< in: new id to set */ +{ + ut_ad(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the table from the hash table of id's */ + + HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_ull(table->id), table); + table->id = new_id; + + /* Add the table back to the hash table */ + HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_ull(table->id), table); +} + +/**********************************************************************//** +Removes a table object from the dictionary cache. 
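dict_table_change_id_in_cache() above re-keys a cached object: the entry must be removed from the id hash under its old fold before the id is changed, then re-inserted under the new one, or the lookup chain would go stale. The same idiom with std::unordered_map standing in for HASH_DELETE/HASH_INSERT (toy types, invented names):

#include <cassert>
#include <unordered_map>

struct toy_table {
	unsigned long long id;
};

static void change_id(
	std::unordered_map<unsigned long long, toy_table*>& id_hash,
	toy_table* t, unsigned long long new_id)
{
	id_hash.erase(t->id);	/* HASH_DELETE under the old fold */
	t->id = new_id;
	id_hash[t->id] = t;	/* HASH_INSERT under the new fold */
}

int main()
{
	std::unordered_map<unsigned long long, toy_table*> id_hash;
	toy_table t = {42};
	id_hash[t.id] = &t;
	change_id(id_hash, &t, 1042);	/* e.g. after DISCARD TABLESPACE */
	assert(id_hash.count(42) == 0 && id_hash.at(1042) == &t);
	return 0;
}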
*/ +static +void +dict_table_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in, own: table */ + ibool lru_evict) /*!< in: TRUE if table being evicted + to make room in the table LRU list */ +{ + dict_foreign_t* foreign; + dict_index_t* index; + ulint size; + + ut_ad(table); + ut_ad(dict_lru_validate()); + ut_a(table->n_ref_count == 0); + ut_a(table->n_rec_locks == 0); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the foreign constraints from the cache */ + std::for_each(table->foreign_set.begin(), table->foreign_set.end(), + dict_foreign_remove_partial()); + table->foreign_set.clear(); + + /* Reset table field in referencing constraints */ + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + } + + /* Remove the indexes from the cache */ + + for (index = UT_LIST_GET_LAST(table->indexes); + index != NULL; + index = UT_LIST_GET_LAST(table->indexes)) { + + dict_index_remove_from_cache_low(table, index, lru_evict); + } + + /* Remove table from the hash tables of tables */ + + HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash, + ut_fold_string(table->name), table); + + HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_ull(table->id), table); + + /* Remove table from LRU or non-LRU list. */ + if (table->can_be_evicted) { + ut_ad(dict_lru_find_table(table)); + UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); + } else { + ut_ad(dict_non_lru_find_table(table)); + UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table); + } + + ut_ad(dict_lru_validate()); + + if (lru_evict && table->drop_aborted) { + /* Do as dict_table_try_drop_aborted() does. */ + + trx_t* trx = trx_allocate_for_background(); + + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + /* Mimic row_mysql_lock_data_dictionary(). */ + trx->dict_operation_lock_mode = RW_X_LATCH; + + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + /* Silence a debug assertion in row_merge_drop_indexes(). */ + ut_d(table->n_ref_count++); + row_merge_drop_indexes(trx, table, TRUE); + ut_d(table->n_ref_count--); + ut_ad(table->n_ref_count == 0); + trx_commit_for_mysql(trx); + trx->dict_operation_lock_mode = 0; + trx_free_for_background(trx); + } + + size = mem_heap_get_size(table->heap) + strlen(table->name) + 1; + + ut_ad(dict_sys->size >= size); + + dict_sys->size -= size; + + dict_mem_table_free(table); +} + +/**********************************************************************//** +Removes a table object from the dictionary cache. */ +UNIV_INTERN +void +dict_table_remove_from_cache( +/*=========================*/ + dict_table_t* table) /*!< in, own: table */ +{ + dict_table_remove_from_cache_low(table, FALSE); +} + +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +UNIV_INTERN +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ +{ + /* This check reminds that if a new system column is added to + the program, it should be dealt with here. 
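dict_col_name_is_reserved(), defined just below, is a fixed-table, case-insensitive lookup. The same shape standalone, with POSIX strcasecmp() standing in for innobase_strcasecmp():

#include <cassert>
#include <strings.h>	/* POSIX strcasecmp() */

static bool col_name_is_reserved(const char* name)
{
	static const char* reserved[] = {
		"DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"
	};

	for (unsigned i = 0; i < sizeof(reserved) / sizeof(*reserved); i++) {
		if (strcasecmp(name, reserved[i]) == 0) {
			return true;
		}
	}
	return false;
}

int main()
{
	assert(col_name_is_reserved("db_trx_id"));	/* case-insensitive */
	assert(!col_name_is_reserved("user_id"));
	return 0;
}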
*/ +#if DATA_N_SYS_COLS != 3 +#error "DATA_N_SYS_COLS != 3" +#endif + + static const char* reserved_names[] = { + "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR" + }; + + ulint i; + + for (i = 0; i < UT_ARR_SIZE(reserved_names); i++) { + if (innobase_strcasecmp(name, reserved_names[i]) == 0) { + + return(TRUE); + } + } + + return(FALSE); +} + +#if 1 /* This function is not very accurate at determining + whether an UNDO record will be too big. See innodb_4k.test, + Bug 13336585, for a testcase that shows an index that can + be created but cannot be updated. */ + +/****************************************************************//** +If an undo log record for this table might not fit on a single page, +return TRUE. +@return TRUE if the undo log record could become too big */ +static +ibool +dict_index_too_big_for_undo( +/*========================*/ + const dict_table_t* table, /*!< in: table */ + const dict_index_t* new_index) /*!< in: index */ +{ + /* Make sure that all column prefixes will fit in the undo log record + in trx_undo_page_report_modify() right after trx_undo_page_init(). */ + + ulint i; + const dict_index_t* clust_index + = dict_table_get_first_index(table); + ulint undo_page_len + = TRX_UNDO_PAGE_HDR - TRX_UNDO_PAGE_HDR_SIZE + + 2 /* next record pointer */ + + 1 /* type_cmpl */ + + 11 /* trx->undo_no */ + 11 /* table->id */ + + 1 /* rec_get_info_bits() */ + + 11 /* DB_TRX_ID */ + + 11 /* DB_ROLL_PTR */ + + 10 + FIL_PAGE_DATA_END /* trx_undo_left() */ + + 2/* pointer to previous undo log record */; + + /* FTS index consists of auxiliary tables, they shall be excluded from + index row size check */ + if (new_index->type & DICT_FTS) { + return(false); + } + + if (!clust_index) { + ut_a(dict_index_is_clust(new_index)); + clust_index = new_index; + } + + /* Add the size of the ordering columns in the + clustered index. */ + for (i = 0; i < clust_index->n_uniq; i++) { + const dict_col_t* col + = dict_index_get_nth_col(clust_index, i); + + /* Use the maximum output size of + mach_write_compressed(), although the encoded + length should always fit in 2 bytes. */ + undo_page_len += 5 + dict_col_get_max_size(col); + } + + /* Add the old values of the columns to be updated. + First, the amount and the numbers of the columns. + These are written by mach_write_compressed() whose + maximum output length is 5 bytes. However, given that + the quantities are below REC_MAX_N_FIELDS (10 bits), + the maximum length is 2 bytes per item. */ + undo_page_len += 2 * (dict_table_get_n_cols(table) + 1); + + for (i = 0; i < clust_index->n_def; i++) { + const dict_col_t* col + = dict_index_get_nth_col(clust_index, i); + ulint max_size + = dict_col_get_max_size(col); + ulint fixed_size + = dict_col_get_fixed_size(col, + dict_table_is_comp(table)); + ulint max_prefix + = col->max_prefix; + + if (fixed_size) { + /* Fixed-size columns are stored locally. */ + max_size = fixed_size; + } else if (max_size <= BTR_EXTERN_FIELD_REF_SIZE * 2) { + /* Short columns are stored locally. */ + } else if (!col->ord_part + || (col->max_prefix + < (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table))) { + /* See if col->ord_part would be set + because of new_index. 
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(clust_index, i);
+ ulint max_size
+ = dict_col_get_max_size(col);
+ ulint fixed_size
+ = dict_col_get_fixed_size(col,
+ dict_table_is_comp(table));
+ ulint max_prefix
+ = col->max_prefix;
+
+ if (fixed_size) {
+ /* Fixed-size columns are stored locally. */
+ max_size = fixed_size;
+ } else if (max_size <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
+ /* Short columns are stored locally. */
+ } else if (!col->ord_part
+ || (col->max_prefix
+ < (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table))) {
+ /* See if col->ord_part would be set
+ because of new_index. Also check if the new
+ index could have a longer prefix on columns
+ that already had ord_part set. */
+ ulint j;
+
+ for (j = 0; j < new_index->n_uniq; j++) {
+ if (dict_index_get_nth_col(
+ new_index, j) == col) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(
+ new_index, j);
+
+ if (field->prefix_len
+ > col->max_prefix) {
+ max_prefix =
+ field->prefix_len;
+ }
+
+ goto is_ord_part;
+ }
+ }
+
+ if (col->ord_part) {
+ goto is_ord_part;
+ }
+
+ /* This is not an ordering column in any index.
+ Thus, it can be stored completely externally. */
+ max_size = BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ ulint max_field_len;
+is_ord_part:
+ max_field_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+
+ /* This is an ordering column in some index.
+ A long enough prefix must be written to the
+ undo log. See trx_undo_page_fetch_ext(). */
+ max_size = ut_min(max_size, max_field_len);
+
+ /* We only store the needed prefix length
+ in the undo log. */
+ if (max_prefix) {
+ ut_ad(dict_table_get_format(table)
+ >= UNIV_FORMAT_B);
+
+ max_size = ut_min(max_prefix, max_size);
+ }
+
+ max_size += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ undo_page_len += 5 + max_size;
+ }
+
+ return(undo_page_len >= UNIV_PAGE_SIZE);
+}
+#endif
+
+/****************************************************************//**
+If a record of this index might not fit on a single B-tree page,
+return TRUE.
+@return TRUE if the index record could become too big */
+static
+ibool
+dict_index_too_big_for_tree(
+/*========================*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* new_index) /*!< in: index */
+{
+ ulint zip_size;
+ ulint comp;
+ ulint i;
+ /* maximum possible storage size of a record */
+ ulint rec_max_size;
+ /* maximum allowed size of a record on a leaf page */
+ ulint page_rec_max;
+ /* maximum allowed size of a node pointer record */
+ ulint page_ptr_max;
+
+ /* An FTS index consists of auxiliary tables, so it is
+ excluded from the index row size check. */
+ if (new_index->type & DICT_FTS) {
+ return(false);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_force_create_table",
+ return(FALSE););
+
+ comp = dict_table_is_comp(table);
+ zip_size = dict_table_zip_size(table);
+
+ if (zip_size && zip_size < UNIV_PAGE_SIZE) {
+ /* On a compressed page, two records must fit in the
+ uncompressed page modification log. On compressed
+ pages with zip_size == UNIV_PAGE_SIZE, this limit will
+ never be reached. */
+ ut_ad(comp);
+ /* The maximum allowed record size is the size of
+ an empty page, minus a byte for recording the heap
+ number in the page modification log. The maximum
+ allowed node pointer size is half that. */
+ page_rec_max = page_zip_empty_size(new_index->n_fields,
+ zip_size);
+ if (page_rec_max) {
+ page_rec_max--;
+ }
+ page_ptr_max = page_rec_max / 2;
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. */
+ rec_max_size = 2;
+ } else {
+ /* The maximum allowed record size is half a B-tree
+ page. No additional sparse page directory entry will
+ be generated for the first few user records. */
+ page_rec_max = page_get_free_space_of_empty(comp) / 2;
+ page_ptr_max = page_rec_max;
+ /* Each record has a header. */
+ rec_max_size = comp
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES;
+ }
+
+ if (comp) {
+ /* Include the "null" flags in the
+ maximum possible record size. */
+ rec_max_size += UT_BITS_IN_BYTES(new_index->n_nullable);
+ } else {
+ /* For each column, include a 2-byte offset and a
+ "null" flag.
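As a standalone illustration of the two page-limit cases set up above, with stand-in numbers rather than the real page_zip_empty_size() and page_get_free_space_of_empty() values:

#include <cstdio>

struct Limits { unsigned rec_max; unsigned ptr_max; };

static Limits record_limits(unsigned page_free_empty,
                            unsigned zip_empty, bool compressed) {
    Limits l;
    if (compressed) {
        // one byte is reserved for the heap number in the
        // page modification log
        l.rec_max = zip_empty ? zip_empty - 1 : 0;
        l.ptr_max = l.rec_max / 2;
    } else {
        // at most half of an empty page per record
        l.rec_max = page_free_empty / 2;
        l.ptr_max = l.rec_max;
    }
    return l;
}

int main() {
    Limits u = record_limits(16252, 0, false);
    Limits z = record_limits(0, 7000, true);
    std::printf("%u %u %u %u\n", u.rec_max, u.ptr_max,
                z.rec_max, z.ptr_max);
}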
The 1-byte format is only used in short + records that do not contain externally stored columns. + Such records could never exceed the page limit, even + when using the 2-byte format. */ + rec_max_size += 2 * new_index->n_fields; + } + + /* Compute the maximum possible record size. */ + for (i = 0; i < new_index->n_fields; i++) { + const dict_field_t* field + = dict_index_get_nth_field(new_index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint field_max_size; + ulint field_ext_max_size; + + /* In dtuple_convert_big_rec(), variable-length columns + that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2 + may be chosen for external storage. + + Fixed-length columns, and all columns of secondary + index records are always stored inline. */ + + /* Determine the maximum length of the index field. + The field_ext_max_size should be computed as the worst + case in rec_get_converted_size_comp() for + REC_STATUS_ORDINARY records. */ + + field_max_size = dict_col_get_fixed_size(col, comp); + if (field_max_size) { + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || field->fixed_len == field->prefix_len); + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. */ + field_ext_max_size = 0; + goto add_field_size; + } + + field_max_size = dict_col_get_max_size(col); + field_ext_max_size = field_max_size < 256 ? 1 : 2; + + if (field->prefix_len) { + if (field->prefix_len < field_max_size) { + field_max_size = field->prefix_len; + } + } else if (field_max_size > BTR_EXTERN_FIELD_REF_SIZE * 2 + && dict_index_is_clust(new_index)) { + + /* In the worst case, we have a locally stored + column of BTR_EXTERN_FIELD_REF_SIZE * 2 bytes. + The length can be stored in one byte. If the + column were stored externally, the lengths in + the clustered index page would be + BTR_EXTERN_FIELD_REF_SIZE and 2. */ + field_max_size = BTR_EXTERN_FIELD_REF_SIZE * 2; + field_ext_max_size = 1; + } + + if (comp) { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to rec_max_size before this loop. */ + rec_max_size += field_ext_max_size; + } +add_field_size: + rec_max_size += field_max_size; + + /* Check the size limit on leaf pages. */ + if (UNIV_UNLIKELY(rec_max_size >= page_rec_max)) { + + return(TRUE); + } + + /* Check the size limit on non-leaf pages. Records + stored in non-leaf B-tree pages consist of the unique + columns of the record (the key columns of the B-tree) + and a node pointer field. When we have processed the + unique columns, rec_max_size equals the size of the + node pointer record minus the node pointer column. */ + if (i + 1 == dict_index_get_n_unique_in_tree(new_index) + && rec_max_size + REC_NODE_PTR_SIZE >= page_ptr_max) { + + return(TRUE); + } + } + + return(FALSE); +} + +/**********************************************************************//** +Adds an index to the dictionary cache. +@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */ +UNIV_INTERN +dberr_t +dict_index_add_to_cache( +/*====================*/ + dict_table_t* table, /*!< in: table on which the index is */ + dict_index_t* index, /*!< in, own: index; NOTE! The index memory + object is freed in this function! 
*/
+ ulint page_no, /*!< in: root page number of the index */
+ ibool strict) /*!< in: TRUE=refuse to create the index
+ if records could be too big to fit in
+ a B-tree page */
+{
+ dict_index_t* new_index;
+ ulint n_ord;
+ ulint i;
+
+ ut_ad(index);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(index->n_def == index->n_fields);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ ut_ad(mem_heap_validate(index->heap));
+ ut_a(!dict_index_is_clust(index)
+ || UT_LIST_GET_LEN(table->indexes) == 0);
+
+ if (!dict_index_find_cols(table, index)) {
+
+ dict_mem_index_free(index);
+ return(DB_CORRUPTION);
+ }
+
+ /* Build the cache internal representation of the index,
+ containing also the added system fields */
+
+ if (index->type == DICT_FTS) {
+ new_index = dict_index_build_internal_fts(table, index);
+ } else if (dict_index_is_clust(index)) {
+ new_index = dict_index_build_internal_clust(table, index);
+ } else {
+ new_index = dict_index_build_internal_non_clust(table, index);
+ }
+
+ /* Set the n_fields value in new_index to the actual defined
+ number of fields in the cache internal representation */
+
+ new_index->n_fields = new_index->n_def;
+ new_index->trx_id = index->trx_id;
+
+ if (dict_index_too_big_for_tree(table, new_index)) {
+
+ if (strict) {
+too_big:
+ dict_mem_index_free(new_index);
+ dict_mem_index_free(index);
+ return(DB_TOO_BIG_RECORD);
+ } else {
+
+ ib_warn_row_too_big(table);
+
+ }
+ }
+
+ if (dict_index_is_univ(index)) {
+ n_ord = new_index->n_fields;
+ } else {
+ n_ord = new_index->n_uniq;
+ }
+
+#if 1 /* The following code predetermines whether to call
+ dict_index_too_big_for_undo(). That function is not
+ accurate. See innodb_4k.test, Bug 13336585, for a
+ test case that shows an index that can be created but
+ cannot be updated. */
+
+ switch (dict_table_get_format(table)) {
+ case UNIV_FORMAT_A:
+ /* ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT store
+ prefixes of externally stored columns locally within
+ the record. There are no special considerations for
+ the undo log record size. */
+ goto undo_size_ok;
+
+ case UNIV_FORMAT_B:
+ /* In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED,
+ column prefix indexes require that prefixes of
+ externally stored columns are written to the undo log.
+ This may make the undo log record bigger than the
+ record on the B-tree page. The maximum size of an
+ undo log record is the page size. That must be
+ checked for below. */
+ break;
+
+#if UNIV_FORMAT_B != UNIV_FORMAT_MAX
+# error "UNIV_FORMAT_B != UNIV_FORMAT_MAX"
+#endif
+ }
+
+ for (i = 0; i < n_ord; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+
+ /* In dtuple_convert_big_rec(), variable-length columns
+ that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+ may be chosen for external storage. If the column is
+ an ordering column of an index, a longer prefix determined
+ by dict_max_field_len_store_undo() will be copied to the undo
+ log by trx_undo_page_report_modify() and
+ trx_undo_page_fetch_ext(). It suffices to check the
+ capacity of the undo log whenever new_index includes
+ a column prefix on a column that may be stored externally. */
+
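// Pulling the gate described above into one place: only the newer row
// formats, combined with a prefix index on a long variable-length column
// that is not already covered, force the undo-size check. A simplified
// standalone sketch; Field is a stand-in for dict_field_t/dict_col_t,
// and the "already covered" test is folded into a single flag.

#include <cstdio>

enum RowFormat { FORMAT_A, FORMAT_B };  // ~ UNIV_FORMAT_A / UNIV_FORMAT_B

struct Field {
    unsigned prefix_len;       // 0 = whole column indexed
    unsigned max_size;         // maximum column size in bytes
    bool     fixed_size;       // fixed-length column?
    bool     already_covered;  // ord_part set with a prefix >= ours
};

static bool needs_undo_check(RowFormat fmt, const Field& f) {
    const unsigned extern_ref = 20;     // ~ BTR_EXTERN_FIELD_REF_SIZE
    if (fmt == FORMAT_A) {
        return false;  // old formats store prefixes locally
    }
    return f.prefix_len != 0            // prefix index
        && !f.already_covered           // new or longer prefix
        && !f.fixed_size                // variable-length
        && f.max_size > extern_ref * 2; // long enough to go external
}

int main() {
    Field f = {767, 65535, false, false};
    std::printf("%d %d\n",
                needs_undo_check(FORMAT_A, f),   // 0
                needs_undo_check(FORMAT_B, f));  // 1
}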
+ if (field->prefix_len /* prefix index */
+ && (!col->ord_part /* not yet an ordering column */
+ || field->prefix_len > col->max_prefix)
+ && !dict_col_get_fixed_size(col, TRUE) /* variable-length */
+ && dict_col_get_max_size(col)
+ > BTR_EXTERN_FIELD_REF_SIZE * 2 /* long enough */) {
+
+ if (dict_index_too_big_for_undo(table, new_index)) {
+ /* An undo log record might not fit in
+ a single page. Refuse to create this index. */
+
+ goto too_big;
+ }
+
+ break;
+ }
+ }
+
+undo_size_ok:
+#endif
+ /* Flag the ordering columns and also set column max_prefix */
+
+ for (i = 0; i < n_ord; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+
+ field->col->ord_part = 1;
+
+ if (field->prefix_len > field->col->max_prefix) {
+ field->col->max_prefix = field->prefix_len;
+ }
+ }
+
+ if (!dict_index_is_univ(new_index)) {
+
+ new_index->stat_n_diff_key_vals =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_diff_key_vals)));
+
+ new_index->stat_n_sample_sizes =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_sample_sizes)));
+
+ new_index->stat_n_non_null_key_vals =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_non_null_key_vals)));
+ }
+
+ new_index->stat_index_size = 1;
+ new_index->stat_n_leaf_pages = 1;
+
+ /* Add the new index as the last index for the table */
+
+ UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
+ new_index->table = table;
+ new_index->table_name = table->name;
+ new_index->search_info = btr_search_info_create(new_index->heap);
+
+ new_index->page = page_no;
+ rw_lock_create(index_tree_rw_lock_key, &new_index->lock,
+ dict_index_is_ibuf(index)
+ ? SYNC_IBUF_INDEX_TREE : SYNC_INDEX_TREE);
+
+ dict_sys->size += mem_heap_get_size(new_index->heap);
+
+ dict_mem_index_free(index);
+
+ return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+static
+void
+dict_index_remove_from_cache_low(
+/*=============================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index, /*!< in, own: index */
+ ibool lru_evict) /*!< in: TRUE if the index is being evicted
+ to make room in the table LRU list */
+{
+ ulint size;
+ ulint retries = 0;
+ btr_search_t* info;
+
+ ut_ad(table && index);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* No need to acquire the dict_index_t::lock here because
+ there can't be any active operations on this index (or table). */
+
+ if (index->online_log) {
+ ut_ad(index->online_status == ONLINE_INDEX_CREATION);
+ row_log_free(index->online_log);
+ }
+
+ /* We always create search info, whether or not the adaptive
+ hash index is enabled. */
+ info = btr_search_get_info(index);
+ ut_ad(info);
+
+ /* We are not allowed to free the in-memory index struct
+ dict_index_t until all entries in the adaptive hash index
+ that point to any of the pages belonging to this B-tree index
+ are dropped, because dropping these entries requires access to
+ the dict_index_t struct. To avoid such a scenario, we keep a
+ count of the number of such pages in the search_info and only
+ free the dict_index_t struct when this count drops to
+ zero.
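A standalone sketch of the drain-and-free pattern described here: poll the reference count, sleep between polls, complain periodically, and give up after a hard bound. The names, interval and bounds below are illustrative, not InnoDB's:

#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

static bool wait_for_zero(std::atomic<unsigned>& ref_count,
                          unsigned max_retries = 60000) {
    for (unsigned retries = 1; ref_count.load() != 0; ++retries) {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        if (retries % 500 == 0) {  // every ~5 seconds of waiting
            std::fprintf(stderr, "waited %u s, ref_count=%u\n",
                         retries / 100, ref_count.load());
        }
        if (retries >= max_retries) {
            return false;  // caller decides how hard to fail
        }
    }
    return true;  // safe to free the structure now
}

int main() {
    std::atomic<unsigned> rc{3};
    std::thread user([&] { while (rc > 0) { --rc; } });
    std::printf("%s\n", wait_for_zero(rc) ? "drained" : "timed out");
    user.join();
}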
See also: dict_table_can_be_evicted() */ + + do { + ulint ref_count = btr_search_info_get_ref_count(info, + index); + + if (ref_count == 0) { + break; + } + + /* Sleep for 10ms before trying again. */ + os_thread_sleep(10000); + ++retries; + + if (retries % 500 == 0) { + /* No luck after 5 seconds of wait. */ + fprintf(stderr, "InnoDB: Error: Waited for" + " %lu secs for hash index" + " ref_count (%lu) to drop" + " to 0.\n" + "index: \"%s\"" + " table: \"%s\"\n", + retries/100, + ref_count, + index->name, + table->name); + } + + /* To avoid a hang here we commit suicide if the + ref_count doesn't drop to zero in 600 seconds. */ + if (retries >= 60000) { + ut_error; + } + } while (srv_shutdown_state == SRV_SHUTDOWN_NONE || !lru_evict); + + rw_lock_free(&index->lock); + + /* Remove the index from the list of indexes of the table */ + UT_LIST_REMOVE(indexes, table->indexes, index); + + size = mem_heap_get_size(index->heap); + + ut_ad(dict_sys->size >= size); + + dict_sys->size -= size; + + dict_mem_index_free(index); +} + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index) /*!< in, own: index */ +{ + dict_index_remove_from_cache_low(table, index, FALSE); +} + +/*******************************************************************//** +Tries to find column names for the index and sets the col field of the +index. +@return TRUE if the column names were found */ +static +ibool +dict_index_find_cols( +/*=================*/ + dict_table_t* table, /*!< in: table */ + dict_index_t* index) /*!< in: index */ +{ + ulint i; + + ut_ad(table && index); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(mutex_own(&(dict_sys->mutex))); + + for (i = 0; i < index->n_fields; i++) { + ulint j; + dict_field_t* field = dict_index_get_nth_field(index, i); + + for (j = 0; j < table->n_cols; j++) { + if (!strcmp(dict_table_get_col_name(table, j), + field->name)) { + field->col = dict_table_get_nth_col(table, j); + + goto found; + } + } + +#ifdef UNIV_DEBUG + /* It is an error not to find a matching column. */ + fputs("InnoDB: Error: no matching column for ", stderr); + ut_print_name(stderr, NULL, FALSE, field->name); + fputs(" in ", stderr); + dict_index_name_print(stderr, NULL, index); + fputs("!\n", stderr); +#endif /* UNIV_DEBUG */ + return(FALSE); + +found: + ; + } + + return(TRUE); +} +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************************//** +Adds a column to index. */ +UNIV_INTERN +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + const dict_table_t* table, /*!< in: table */ + dict_col_t* col, /*!< in: column */ + ulint prefix_len) /*!< in: column prefix length */ +{ + dict_field_t* field; + const char* col_name; + + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + + dict_mem_index_add_field(index, col_name, prefix_len); + + field = dict_index_get_nth_field(index, index->n_def - 1); + + field->col = col; + field->fixed_len = (unsigned int) dict_col_get_fixed_size( + col, dict_table_is_comp(table)); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = (unsigned int) prefix_len; + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. 
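A standalone sketch of this demotion rule, using the 768-byte limit asserted for DICT_MAX_FIXED_COL_LEN just below; the Field type is a stand-in, not dict_field_t:

#include <cstdio>

struct Field { unsigned fixed_len; unsigned prefix_len; };

// Clamp a fixed length to the indexed prefix, then demote fields whose
// fixed length exceeds the limit to variable-length (fixed_len = 0) so
// that they can be stored externally.
static void finish_add_col(Field& f) {
    const unsigned max_fixed = 768;
    if (f.prefix_len && f.fixed_len > f.prefix_len) {
        f.fixed_len = f.prefix_len;  // only the prefix is indexed
    }
    if (f.fixed_len > max_fixed) {
        f.fixed_len = 0;             // treat as variable-length
    }
}

int main() {
    Field a = {1024, 0};  // long CHAR-like column
    Field b = {8, 0};     // short fixed column
    finish_add_col(a);
    finish_add_col(b);
    std::printf("%u %u\n", a.fixed_len, b.fixed_len);  // 0 8
}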
*/ + + if (field->fixed_len > DICT_MAX_FIXED_COL_LEN) { + field->fixed_len = 0; + } +#if DICT_MAX_FIXED_COL_LEN != 768 + /* The comparison limit above must be constant. If it were + changed, the disk format of some fixed-length columns would + change, which would be a disaster. */ +# error "DICT_MAX_FIXED_COL_LEN != 768" +#endif + + if (!(col->prtype & DATA_NOT_NULL)) { + index->n_nullable++; + } +} + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Copies fields contained in index2 to index1. */ +static +void +dict_index_copy( +/*============*/ + dict_index_t* index1, /*!< in: index to copy to */ + dict_index_t* index2, /*!< in: index to copy from */ + const dict_table_t* table, /*!< in: table */ + ulint start, /*!< in: first position to copy */ + ulint end) /*!< in: last position to copy */ +{ + dict_field_t* field; + ulint i; + + /* Copy fields contained in index2 */ + + for (i = start; i < end; i++) { + + field = dict_index_get_nth_field(index2, i); + dict_index_add_col(index1, table, field->col, + field->prefix_len); + } +} + +/*******************************************************************//** +Copies types of fields contained in index to tuple. */ +UNIV_INTERN +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_index_t* index, /*!< in: index */ + ulint n_fields) /*!< in: number of + field types to copy */ +{ + ulint i; + + if (dict_index_is_univ(index)) { + dtuple_set_types_binary(tuple, n_fields); + + return; + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* ifield; + dtype_t* dfield_type; + + ifield = dict_index_get_nth_field(index, i); + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dict_col_copy_type(dict_field_get_col(ifield), dfield_type); + } +} + +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +UNIV_INTERN +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + dfield_t* dfield = dtuple_get_nth_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type(dict_table_get_nth_col(table, i), dtype); + } +} + +/******************************************************************** +Wait until all the background threads of the given table have exited, i.e., +bg_threads == 0. Note: bg_threads_mutex must be reserved when +calling this. */ +UNIV_INTERN +void +dict_table_wait_for_bg_threads_to_exit( +/*===================================*/ + dict_table_t* table, /*< in: table */ + ulint delay) /*< in: time in microseconds to wait between + checks of bg_threads. */ +{ + fts_t* fts = table->fts; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&fts->bg_threads_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + while (fts->bg_threads > 0) { + mutex_exit(&fts->bg_threads_mutex); + + os_thread_sleep(delay); + + mutex_enter(&fts->bg_threads_mutex); + } +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. 
+@return own: the internal representation of the clustered index */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index) /*!< in: user representation of + a clustered index */ +{ + dict_index_t* new_index; + dict_field_t* field; + ulint trx_id_pos; + ulint i; + ibool* indexed; + + ut_ad(table && index); + ut_ad(dict_index_is_clust(index)); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Create a new index object with certainly enough fields */ + new_index = dict_mem_index_create(table->name, + index->name, table->space, + index->type, + index->n_fields + table->n_cols); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + btr_search_index_init(new_index); + + /* Copy the fields of index */ + dict_index_copy(new_index, index, table, 0, index->n_fields); + + if (dict_index_is_univ(index)) { + /* No fixed number of fields determines an entry uniquely */ + + new_index->n_uniq = REC_MAX_N_FIELDS; + + } else if (dict_index_is_unique(index)) { + /* Only the fields defined so far are needed to identify + the index entry uniquely */ + + new_index->n_uniq = new_index->n_def; + } else { + /* Also the row id is needed to identify the entry */ + new_index->n_uniq = 1 + new_index->n_def; + } + + new_index->trx_id_offset = 0; + + if (!dict_index_is_ibuf(index)) { + /* Add system columns, trx id first */ + + trx_id_pos = new_index->n_def; + +#if DATA_ROW_ID != 0 +# error "DATA_ROW_ID != 0" +#endif +#if DATA_TRX_ID != 1 +# error "DATA_TRX_ID != 1" +#endif +#if DATA_ROLL_PTR != 2 +# error "DATA_ROLL_PTR != 2" +#endif + + if (!dict_index_is_unique(index)) { + dict_index_add_col(new_index, table, + dict_table_get_sys_col( + table, DATA_ROW_ID), + 0); + trx_id_pos++; + } + + dict_index_add_col(new_index, table, + dict_table_get_sys_col(table, DATA_TRX_ID), + 0); + + dict_index_add_col(new_index, table, + dict_table_get_sys_col(table, + DATA_ROLL_PTR), + 0); + + for (i = 0; i < trx_id_pos; i++) { + + ulint fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(new_index, i), + dict_table_is_comp(table)); + + if (fixed_size == 0) { + new_index->trx_id_offset = 0; + + break; + } + + if (dict_index_get_nth_field(new_index, i)->prefix_len + > 0) { + new_index->trx_id_offset = 0; + + break; + } + + /* Add fixed_size to new_index->trx_id_offset. + Because the latter is a bit-field, an overflow + can theoretically occur. Check for it. */ + fixed_size += new_index->trx_id_offset; + + new_index->trx_id_offset = fixed_size; + + if (new_index->trx_id_offset != fixed_size) { + /* Overflow. Pretend that this is a + variable-length PRIMARY KEY. 
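The check that follows stores a running sum into a narrow bit-field and reads it back; if the value read back differs, the field wrapped. A standalone sketch with an assumed 12-bit width (the real width of the trx_id_offset field may differ):

#include <cstdio>

struct Rec { unsigned trx_id_offset : 12; };

static bool set_offset(Rec& r, unsigned off) {
    r.trx_id_offset = off;
    if (r.trx_id_offset != off) {  // truncated: overflow occurred
        r.trx_id_offset = 0;       // pretend the key is variable-length
        return false;
    }
    return true;
}

int main() {
    Rec r = {};
    std::printf("%d %d\n", set_offset(r, 100),       // 1: fits
                           set_offset(r, 1 << 12));  // 0: wraps to 0
}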
*/ + ut_ad(0); + new_index->trx_id_offset = 0; + break; + } + } + + } + + /* Remember the table columns already contained in new_index */ + indexed = static_cast<ibool*>( + mem_zalloc(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index non-system columns of table not yet included + there */ + for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) { + + dict_col_t* col = dict_table_get_nth_col(table, i); + ut_ad(col->mtype != DATA_SYS); + + if (!indexed[col->ind]) { + dict_index_add_col(new_index, table, col, 0); + } + } + + mem_free(indexed); + + ut_ad(dict_index_is_ibuf(index) + || (UT_LIST_GET_LEN(table->indexes) == 0)); + + new_index->cached = TRUE; + + return(new_index); +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the non-clustered index */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index) /*!< in: user representation of + a non-clustered index */ +{ + dict_field_t* field; + dict_index_t* new_index; + dict_index_t* clust_index; + ulint i; + ibool* indexed; + + ut_ad(table && index); + ut_ad(!dict_index_is_clust(index)); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* The clustered index should be the first in the list of indexes */ + clust_index = UT_LIST_GET_FIRST(table->indexes); + + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + ut_ad(!dict_index_is_univ(clust_index)); + + /* Create a new index */ + new_index = dict_mem_index_create( + table->name, index->name, index->space, index->type, + index->n_fields + 1 + clust_index->n_uniq); + + /* Copy other relevant data from the old index + struct to the new struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + btr_search_index_init(new_index); + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, table, 0, index->n_fields); + + /* Remember the table columns already contained in new_index */ + indexed = static_cast<ibool*>( + mem_zalloc(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index the columns necessary to determine the clustered + index entry uniquely */ + + for (i = 0; i < clust_index->n_uniq; i++) { + + field = dict_index_get_nth_field(clust_index, i); + + if (!indexed[field->col->ind]) { + dict_index_add_col(new_index, table, field->col, + field->prefix_len); + } + } + + mem_free(indexed); + + if (dict_index_is_unique(index)) { + new_index->n_uniq = index->n_fields; + } else { + new_index->n_uniq = new_index->n_def; + } 
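// For illustration, a standalone sketch of the merge performed above:
// keep the user's columns, remember which table columns are fully
// covered, and append whichever clustered-key columns are still missing
// so a secondary record can always locate its clustered row. Integer
// ids stand in for dict_col_t pointers, and prefixes are ignored.

#include <cstdio>
#include <vector>

static std::vector<int> build_secondary(const std::vector<int>& user_cols,
                                        const std::vector<int>& clust_key,
                                        int n_table_cols) {
    std::vector<bool> indexed(n_table_cols, false);
    std::vector<int>  fields = user_cols;
    for (int c : user_cols) {
        indexed[c] = true;        // assume full columns, no prefixes
    }
    for (int c : clust_key) {
        if (!indexed[c]) {
            fields.push_back(c);  // needed to identify the clust entry
        }
    }
    return fields;
}

int main() {
    std::vector<int> f = build_secondary({2, 5}, {0, 1}, 6);
    for (int c : f) {
        std::printf("%d ", c);    // 2 5 0 1
    }
    std::printf("\n");
}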
+ + /* Set the n_fields value in new_index to the actual defined + number of fields */ + + new_index->n_fields = new_index->n_def; + + new_index->cached = TRUE; + + return(new_index); +} + +/*********************************************************************** +Builds the internal dictionary cache representation for an FTS index. +@return own: the internal representation of the FTS index */ +static +dict_index_t* +dict_index_build_internal_fts( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + dict_index_t* index) /*!< in: user representation of an FTS index */ +{ + dict_index_t* new_index; + + ut_ad(table && index); + ut_ad(index->type == DICT_FTS); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Create a new index */ + new_index = dict_mem_index_create( + table->name, index->name, index->space, index->type, + index->n_fields); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + btr_search_index_init(new_index); + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, table, 0, index->n_fields); + + new_index->n_uniq = 0; + new_index->cached = TRUE; + + if (table->fts->cache == NULL) { + table->fts->cache = fts_cache_create(table); + } + + rw_lock_x_lock(&table->fts->cache->init_lock); + /* Notify the FTS cache about this index. */ + fts_cache_index_cache_create(table, new_index); + rw_lock_x_unlock(&table->fts->cache->init_lock); + + return(new_index); +} +/*====================== FOREIGN KEY PROCESSING ========================*/ + +/*********************************************************************//** +Checks if a table is referenced by foreign keys. +@return TRUE if table is referenced by a foreign key */ +UNIV_INTERN +ibool +dict_table_is_referenced_by_foreign_key( +/*====================================*/ + const dict_table_t* table) /*!< in: InnoDB table */ +{ + return(!table->referenced_set.empty()); +} + +/*********************************************************************//** +Check if the index is referenced by a foreign key, if TRUE return foreign +else return NULL +@return pointer to foreign key struct if index is defined for foreign +key, otherwise NULL */ +UNIV_INTERN +dict_foreign_t* +dict_table_get_referenced_constraint( +/*=================================*/ + dict_table_t* table, /*!< in: InnoDB table */ + dict_index_t* index) /*!< in: InnoDB index */ +{ + dict_foreign_t* foreign; + + ut_ad(index != NULL); + ut_ad(table != NULL); + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (foreign->referenced_index == index) { + + return(foreign); + } + } + + return(NULL); +} + +/*********************************************************************//** +Checks if a index is defined for a foreign key constraint. Index is a part +of a foreign key constraint if the index is referenced by foreign key +or index is a foreign key index. 
+@return pointer to foreign key struct if index is defined for foreign +key, otherwise NULL */ +UNIV_INTERN +dict_foreign_t* +dict_table_get_foreign_constraint( +/*==============================*/ + dict_table_t* table, /*!< in: InnoDB table */ + dict_index_t* index) /*!< in: InnoDB index */ +{ + dict_foreign_t* foreign; + + ut_ad(index != NULL); + ut_ad(table != NULL); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_index == index) { + + return(foreign); + } + } + + return(NULL); +} + +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +UNIV_INTERN +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(foreign); + + if (foreign->referenced_table != NULL) { + foreign->referenced_table->referenced_set.erase(foreign); + } + + if (foreign->foreign_table != NULL) { + foreign->foreign_table->foreign_set.erase(foreign); + } + + dict_foreign_free(foreign); +} + +/**********************************************************************//** +Looks for the foreign constraint from the foreign and referenced lists +of a table. +@return foreign constraint */ +static +dict_foreign_t* +dict_foreign_find( +/*==============*/ + dict_table_t* table, /*!< in: table object */ + dict_foreign_t* foreign) /*!< in: foreign constraint */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + + ut_ad(dict_foreign_set_validate(table->foreign_set)); + ut_ad(dict_foreign_set_validate(table->referenced_set)); + + dict_foreign_set::iterator it = table->foreign_set.find(foreign); + + if (it != table->foreign_set.end()) { + return(*it); + } + + it = table->referenced_set.find(foreign); + + if (it != table->referenced_set.end()) { + return(*it); + } + + return(NULL); +} + + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ +{ + dict_index_t* index; + + ut_ad(mutex_own(&dict_sys->mutex)); + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (types_idx != index + && !(index->type & DICT_FTS) + && !index->to_be_dropped + && dict_foreign_qualify_index( + table, col_names, columns, n_cols, + index, types_idx, + check_charsets, check_null)) { + return(index); + } + + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/**********************************************************************//** +Report an error in a foreign key definition. 
*/ +static +void +dict_foreign_error_report_low( +/*==========================*/ + FILE* file, /*!< in: output stream */ + const char* name) /*!< in: table name */ +{ + rewind(file); + ut_print_timestamp(file); + fprintf(file, " Error in foreign key constraint of table %s:\n", + name); +} + +/**********************************************************************//** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report( +/*======================*/ + FILE* file, /*!< in: output stream */ + dict_foreign_t* fk, /*!< in: foreign key constraint */ + const char* msg) /*!< in: the error message */ +{ + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(file, fk->foreign_table_name); + fputs(msg, file); + fputs(" Constraint:\n", file); + dict_print_info_on_foreign_key_in_create_format(file, NULL, fk, TRUE); + putc('\n', file); + if (fk->foreign_index) { + fputs("The index in the foreign key in table is ", file); + ut_print_name(file, NULL, FALSE, fk->foreign_index->name); + fputs("\n" + "See " REFMAN "innodb-foreign-key-constraints.html\n" + "for correct foreign key definition.\n", + file); + } + mutex_exit(&dict_foreign_err_mutex); +} + +/**********************************************************************//** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in. +At least one of the foreign table and the referenced table must already +be in the dictionary cache! +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_foreign_add_to_cache( +/*======================*/ + dict_foreign_t* foreign, + /*!< in, own: foreign key constraint */ + const char** col_names, + /*!< in: column names, or NULL to use + foreign->foreign_table->col_names */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ +{ + dict_table_t* for_table; + dict_table_t* ref_table; + dict_foreign_t* for_in_cache = NULL; + dict_index_t* index; + ibool added_to_referenced_list= FALSE; + FILE* ef = dict_foreign_err_file; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + for_table = dict_table_check_if_in_cache_low( + foreign->foreign_table_name_lookup); + + ref_table = dict_table_check_if_in_cache_low( + foreign->referenced_table_name_lookup); + ut_a(for_table || ref_table); + + if (for_table) { + for_in_cache = dict_foreign_find(for_table, foreign); + } + + if (!for_in_cache && ref_table) { + for_in_cache = dict_foreign_find(ref_table, foreign); + } + + if (for_in_cache) { + /* Free the foreign object */ + mem_heap_free(foreign->heap); + } else { + for_in_cache = foreign; + } + + if (ref_table && !for_in_cache->referenced_table) { + index = dict_foreign_find_index( + ref_table, NULL, + for_in_cache->referenced_col_names, + for_in_cache->n_fields, for_in_cache->foreign_index, + check_charsets, false); + + if (index == NULL + && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in referenced table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "referenced table do not match" + " the ones in table."); + + if (for_in_cache == foreign) { + mem_heap_free(foreign->heap); + } + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->referenced_table = ref_table; + for_in_cache->referenced_index = index; + + std::pair<dict_foreign_set::iterator, bool> ret + = 
ref_table->referenced_set.insert(for_in_cache); + + ut_a(ret.second); /* second is true if the insertion + took place */ + added_to_referenced_list = TRUE; + } + + if (for_table && !for_in_cache->foreign_table) { + index = dict_foreign_find_index( + for_table, col_names, + for_in_cache->foreign_col_names, + for_in_cache->n_fields, + for_in_cache->referenced_index, check_charsets, + for_in_cache->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL)); + + if (index == NULL + && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in the table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "table do not match" + " the ones in the referenced table\n" + "or one of the ON ... SET NULL columns" + " is declared NOT NULL."); + + if (for_in_cache == foreign) { + if (added_to_referenced_list) { + const dict_foreign_set::size_type n + = ref_table->referenced_set + .erase(for_in_cache); + + ut_a(n == 1); /* the number of + elements removed must + be one */ + } + + mem_heap_free(foreign->heap); + } + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->foreign_table = for_table; + for_in_cache->foreign_index = index; + std::pair<dict_foreign_set::iterator, bool> ret + = for_table->foreign_set.insert(for_in_cache); + + ut_a(ret.second); /* second is true if the insertion + took place */ + } + + /* We need to move the table to the non-LRU end of the table LRU + list. Otherwise it will be evicted from the cache. */ + + if (ref_table != NULL && ref_table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(ref_table); + } + + if (for_table != NULL && for_table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(for_table); + } + + ut_ad(dict_lru_validate()); + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Scans from pointer onwards. Stops if is at the start of a copy of +'string' where characters are compared without case sensitivity, and +only outside `` or "" quotes. Stops also at NUL. +@return scanned up to this */ +static +const char* +dict_scan_to( +/*=========*/ + const char* ptr, /*!< in: scan from */ + const char* string) /*!< in: look for this */ +{ + char quote = '\0'; + bool escape = false; + + for (; *ptr; ptr++) { + if (*ptr == quote) { + /* Closing quote character: do not look for + starting quote or the keyword. */ + + /* If the quote character is escaped by a + backslash, ignore it. */ + if (escape) { + escape = false; + } else { + quote = '\0'; + } + } else if (quote) { + /* Within quotes: do nothing. */ + if (escape) { + escape = false; + } else if (*ptr == '\\') { + escape = true; + } + } else if (*ptr == '`' || *ptr == '"' || *ptr == '\'') { + /* Starting quote: remember the quote character. */ + quote = *ptr; + } else { + /* Outside quotes: look for the keyword. */ + ulint i; + for (i = 0; string[i]; i++) { + if (toupper((int)(unsigned char)(ptr[i])) + != toupper((int)(unsigned char) + (string[i]))) { + goto nomatch; + } + } + break; +nomatch: + ; + } + } + + return(ptr); +} + +/*********************************************************************//** +Accepts a specified string. Comparisons are case-insensitive. 
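A standalone sketch of the scanning technique used by dict_scan_to() above: walk the input once, track an open quote and backslash escapes, and match the keyword only outside quotes. The match is simplified to a case-insensitive prefix test via POSIX strncasecmp():

#include <cstdio>
#include <cstring>
#include <strings.h>  // strncasecmp (POSIX)

static const char* scan_to(const char* p, const char* keyword) {
    char quote = 0;
    bool escape = false;
    const std::size_t n = std::strlen(keyword);
    for (; *p; p++) {
        if (quote) {
            if (escape)           escape = false;
            else if (*p == '\\')  escape = true;
            else if (*p == quote) quote = 0;   // closing quote
        } else if (*p == '`' || *p == '"' || *p == '\'') {
            quote = *p;                        // opening quote
        } else if (strncasecmp(p, keyword, n) == 0) {
            break;                             // keyword outside quotes
        }
    }
    return p;
}

int main() {
    // The quoted occurrence is skipped; scanning stops at the real one.
    std::printf("%s\n", scan_to("`FOREIGN` x foreign KEY", "FOREIGN"));
}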
+@return if string was accepted, the pointer is moved after that, else +ptr is returned */ +static +const char* +dict_accept( +/*========*/ + struct charset_info_st* cs,/*!< in: the character set of ptr */ + const char* ptr, /*!< in: scan from this */ + const char* string, /*!< in: accept only this string as the next + non-whitespace string */ + ibool* success)/*!< out: TRUE if accepted */ +{ + const char* old_ptr = ptr; + const char* old_ptr2; + + *success = FALSE; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + old_ptr2 = ptr; + + ptr = dict_scan_to(ptr, string); + + if (*ptr == '\0' || old_ptr2 != ptr) { + return(old_ptr); + } + + *success = TRUE; + + return(ptr + ut_strlen(string)); +} + +/*********************************************************************//** +Scans an id. For the lexical definition of an 'id', see the code below. +Strips backquotes or double quotes from around the id. +@return scanned to */ +static +const char* +dict_scan_id( +/*=========*/ + struct charset_info_st* cs,/*!< in: the character set of ptr */ + const char* ptr, /*!< in: scanned to */ + mem_heap_t* heap, /*!< in: heap where to allocate the id + (NULL=id will not be allocated, but it + will point to string near ptr) */ + const char** id, /*!< out,own: the id; NULL if no id was + scannable */ + ibool table_id,/*!< in: TRUE=convert the allocated id + as a table name; FALSE=convert to UTF-8 */ + ibool accept_also_dot) + /*!< in: TRUE if also a dot can appear in a + non-quoted id; in a quoted id it can appear + always */ +{ + char quote = '\0'; + ulint len = 0; + const char* s; + char* str; + char* dst; + + *id = NULL; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + if (*ptr == '\0') { + + return(ptr); + } + + if (*ptr == '`' || *ptr == '"') { + quote = *ptr++; + } + + s = ptr; + + if (quote) { + for (;;) { + if (!*ptr) { + /* Syntax error */ + return(ptr); + } + if (*ptr == quote) { + ptr++; + if (*ptr != quote) { + break; + } + } + ptr++; + len++; + } + } else { + while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')' + && (accept_also_dot || *ptr != '.') + && *ptr != ',' && *ptr != '\0') { + + ptr++; + } + + len = ptr - s; + } + + if (UNIV_UNLIKELY(!heap)) { + /* no heap given: id will point to source string */ + *id = s; + return(ptr); + } + + if (quote) { + char* d; + + str = d = static_cast<char*>( + mem_heap_alloc(heap, len + 1)); + + while (len--) { + if ((*d++ = *s++) == quote) { + s++; + } + } + *d++ = 0; + len = d - str; + ut_ad(*s == quote); + ut_ad(s + 1 == ptr); + } else { + str = mem_heap_strdupl(heap, s, len); + } + + if (!table_id) { +convert_id: + /* Convert the identifier from connection character set + to UTF-8. */ + len = 3 * len + 1; + *id = dst = static_cast<char*>(mem_heap_alloc(heap, len)); + + innobase_convert_from_id(cs, dst, str, len); + } else if (!strncmp(str, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + /* This is a pre-5.1 table name + containing chars other than [A-Za-z0-9]. + Discard the prefix and use raw UTF-8 encoding. */ + str += sizeof(srv_mysql50_table_name_prefix) - 1; + len -= sizeof(srv_mysql50_table_name_prefix) - 1; + goto convert_id; + } else { + /* Encode using filename-safe characters. */ + len = 5 * len + 1; + *id = dst = static_cast<char*>(mem_heap_alloc(heap, len)); + + innobase_convert_from_table_id(cs, dst, str, len); + } + + return(ptr); +} + +/*********************************************************************//** +Tries to scan a column name. 
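A standalone sketch of the unquoting rule inside dict_scan_id() above: within a quoted identifier a doubled quote character stands for one literal quote, and a single one ends the identifier:

#include <cstdio>
#include <string>

static std::string unquote(const char* s, char quote) {
    std::string out;
    for (const char* p = s; *p; p++) {
        if (*p == quote) {
            if (p[1] != quote) {
                break;  // single quote character: end of identifier
            }
            p++;        // doubled quote character: keep one copy
        }
        out += *p;
    }
    return out;
}

int main() {
    std::printf("%s\n", unquote("a``b` rest", '`').c_str());  // a`b
}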
+@return scanned to */ +static +const char* +dict_scan_col( +/*==========*/ + struct charset_info_st* cs, /*!< in: the character set of ptr */ + const char* ptr, /*!< in: scanned to */ + ibool* success,/*!< out: TRUE if success */ + dict_table_t* table, /*!< in: table in which the column is */ + const dict_col_t** column, /*!< out: pointer to column if success */ + mem_heap_t* heap, /*!< in: heap where to allocate */ + const char** name) /*!< out,own: the column name; + NULL if no name was scannable */ +{ + ulint i; + + *success = FALSE; + + ptr = dict_scan_id(cs, ptr, heap, name, FALSE, TRUE); + + if (*name == NULL) { + + return(ptr); /* Syntax error */ + } + + if (table == NULL) { + *success = TRUE; + *column = NULL; + } else { + for (i = 0; i < dict_table_get_n_cols(table); i++) { + + const char* col_name = dict_table_get_col_name( + table, i); + + if (0 == innobase_strcasecmp(col_name, *name)) { + /* Found */ + + *success = TRUE; + *column = dict_table_get_nth_col(table, i); + strcpy((char*) *name, col_name); + + break; + } + } + } + + return(ptr); +} + + +/*********************************************************************//** +Open a table from its database and table name, this is currently used by +foreign constraint parser to get the referenced table. +@return complete table name with database and table name, allocated from +heap memory passed in */ +UNIV_INTERN +char* +dict_get_referenced_table( +/*======================*/ + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len, /*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap) /*!< in/out: heap memory */ +{ + char* ref; + const char* db_name; + + if (!database_name) { + /* Use the database name of the foreign key table */ + + db_name = name; + database_name_len = dict_get_db_name_len(name); + } else { + db_name = database_name; + } + + /* Copy database_name, '/', table_name, '\0' */ + ref = static_cast<char*>( + mem_heap_alloc(heap, database_name_len + table_name_len + 2)); + + memcpy(ref, db_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + /* Values; 0 = Store and compare as given; case sensitive + 1 = Store and compare in lower; case insensitive + 2 = Store as given, compare in lower; case semi-sensitive */ + if (innobase_get_lower_case_table_names() == 2) { + innobase_casedn_str(ref); + *table = dict_table_get_low(ref); + memcpy(ref, db_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + } else { +#ifndef __WIN__ + if (innobase_get_lower_case_table_names() == 1) { + innobase_casedn_str(ref); + } +#else + innobase_casedn_str(ref); +#endif /* !__WIN__ */ + *table = dict_table_get_low(ref); + } + + return(ref); +} +/*********************************************************************//** +Scans a table name from an SQL string. 
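A standalone sketch of the lookup-name construction in dict_get_referenced_table() above: join database and table into the internal "db/table" form and case-fold it when a mode like lower_case_table_names calls for case-insensitive lookups. The mode handling here is reduced to fold or do not fold:

#include <algorithm>
#include <cctype>
#include <cstdio>
#include <string>

static std::string ref_name(const std::string& db,
                            const std::string& table, int fold_mode) {
    std::string ref = db + "/" + table;
    if (fold_mode != 0) {
        // ~ innobase_casedn_str(): fold for case-insensitive lookup
        std::transform(ref.begin(), ref.end(), ref.begin(),
                       [](unsigned char c) { return std::tolower(c); });
    }
    return ref;
}

int main() {
    std::printf("%s\n", ref_name("Test", "T1", 1).c_str());  // test/t1
}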
+@return scanned to */ +static +const char* +dict_scan_table_name( +/*=================*/ + struct charset_info_st* cs,/*!< in: the character set of ptr */ + const char* ptr, /*!< in: scanned to */ + dict_table_t** table, /*!< out: table object or NULL */ + const char* name, /*!< in: foreign key table name */ + ibool* success,/*!< out: TRUE if ok name found */ + mem_heap_t* heap, /*!< in: heap where to allocate the id */ + const char** ref_name)/*!< out,own: the table name; + NULL if no name was scannable */ +{ + const char* database_name = NULL; + ulint database_name_len = 0; + const char* table_name = NULL; + const char* scan_name; + + *success = FALSE; + *table = NULL; + + ptr = dict_scan_id(cs, ptr, heap, &scan_name, TRUE, FALSE); + + if (scan_name == NULL) { + + return(ptr); /* Syntax error */ + } + + if (*ptr == '.') { + /* We scanned the database name; scan also the table name */ + + ptr++; + + database_name = scan_name; + database_name_len = strlen(database_name); + + ptr = dict_scan_id(cs, ptr, heap, &table_name, TRUE, FALSE); + + if (table_name == NULL) { + + return(ptr); /* Syntax error */ + } + } else { + /* To be able to read table dumps made with InnoDB-4.0.17 or + earlier, we must allow the dot separator between the database + name and the table name also to appear within a quoted + identifier! InnoDB used to print a constraint as: + ... REFERENCES `databasename.tablename` ... + starting from 4.0.18 it is + ... REFERENCES `databasename`.`tablename` ... */ + const char* s; + + for (s = scan_name; *s; s++) { + if (*s == '.') { + database_name = scan_name; + database_name_len = s - scan_name; + scan_name = ++s; + break;/* to do: multiple dots? */ + } + } + + table_name = scan_name; + } + + *ref_name = dict_get_referenced_table( + name, database_name, database_name_len, + table_name, strlen(table_name), table, heap); + + *success = TRUE; + return(ptr); +} + +/*********************************************************************//** +Skips one id. The id is allowed to contain also '.'. +@return scanned to */ +static +const char* +dict_skip_word( +/*===========*/ + struct charset_info_st* cs,/*!< in: the character set of ptr */ + const char* ptr, /*!< in: scanned to */ + ibool* success)/*!< out: TRUE if success, FALSE if just spaces + left in string or a syntax error */ +{ + const char* start; + + *success = FALSE; + + ptr = dict_scan_id(cs, ptr, NULL, &start, FALSE, TRUE); + + if (start) { + *success = TRUE; + } + + return(ptr); +} + +/*********************************************************************//** +Removes MySQL comments from an SQL string. A comment is either +(a) '#' to the end of the line, +(b) '--[space]' to the end of the line, or +(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar +C comment syntax). +@return own: SQL string stripped from comments; the caller must free +this with mem_free()! 
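A standalone sketch of the one-pass stripper this comment describes: copy input to output while skipping the three comment forms, but never inside quoted strings. Backslash escapes are omitted for brevity, and the "*" "/" string literals are spliced only so that this sketch can sit inside a block comment:

#include <cstdio>
#include <string>

static std::string strip_comments(const std::string& in) {
    std::string out;
    char quote = 0;
    for (std::size_t i = 0; i < in.size(); i++) {
        const char c = in[i];
        if (quote) {
            if (c == quote) quote = 0;
        } else if (c == '\'' || c == '"' || c == '`') {
            quote = c;
        } else if (c == '#' || in.compare(i, 3, "-- ") == 0) {
            const std::size_t nl = in.find('\n', i);
            if (nl == std::string::npos) break;  // comment ends input
            i = nl - 1;                          // keep the newline
            continue;
        } else if (in.compare(i, 2, "/" "*") == 0) {
            const std::size_t end = in.find("*" "/", i + 2);
            if (end == std::string::npos) break; // unclosed comment
            i = end + 1;                         // skip past the close
            continue;
        }
        out += c;
    }
    return out;
}

int main() {
    std::puts(strip_comments("a /" "* b *" "/ c # d\ne -- f\ng '#h'").c_str());
}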
*/ +static +char* +dict_strip_comments( +/*================*/ + const char* sql_string, /*!< in: SQL string */ + size_t sql_length) /*!< in: length of sql_string */ +{ + char* str; + const char* sptr; + const char* eptr = sql_string + sql_length; + char* ptr; + /* unclosed quote character (0 if none) */ + char quote = 0; + bool escape = false; + + DBUG_ENTER("dict_strip_comments"); + + DBUG_PRINT("dict_strip_comments", ("%s", sql_string)); + + str = static_cast<char*>(mem_alloc(sql_length + 1)); + + sptr = sql_string; + ptr = str; + + for (;;) { +scan_more: + if (sptr >= eptr || *sptr == '\0') { +end_of_string: + *ptr = '\0'; + + ut_a(ptr <= str + sql_length); + + DBUG_PRINT("dict_strip_comments", ("%s", str)); + DBUG_RETURN(str); + } + + if (*sptr == quote) { + /* Closing quote character: do not look for + starting quote or comments. */ + + /* If the quote character is escaped by a + backslash, ignore it. */ + if (escape) { + escape = false; + } else { + quote = 0; + } + } else if (quote) { + /* Within quotes: do not look for + starting quotes or comments. */ + if (escape) { + escape = false; + } else if (*sptr == '\\') { + escape = true; + } + } else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') { + /* Starting quote: remember the quote character. */ + quote = *sptr; + } else if (*sptr == '#' + || (sptr[0] == '-' && sptr[1] == '-' + && sptr[2] == ' ')) { + for (;;) { + if (++sptr >= eptr) { + goto end_of_string; + } + + /* In Unix a newline is 0x0A while in Windows + it is 0x0D followed by 0x0A */ + + switch (*sptr) { + case (char) 0X0A: + case (char) 0x0D: + case '\0': + goto scan_more; + } + } + } else if (!quote && *sptr == '/' && *(sptr + 1) == '*') { + sptr += 2; + for (;;) { + if (sptr >= eptr) { + goto end_of_string; + } + + switch (*sptr) { + case '\0': + goto scan_more; + case '*': + if (sptr[1] == '/') { + sptr += 2; + goto scan_more; + } + } + + sptr++; + } + } + + *ptr = *sptr; + + ptr++; + sptr++; + } +} + +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +UNIV_INTERN +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table) /*!< in: table in the dictionary memory cache */ +{ + dict_foreign_t* foreign; + char* endp; + ulint biggest_id = 0; + ulint id; + ulint len; + + ut_a(table); + + len = ut_strlen(table->name); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + foreign = *it; + + if (ut_strlen(foreign->id) > ((sizeof dict_ibfk) - 1) + len + && 0 == ut_memcmp(foreign->id, table->name, len) + && 0 == ut_memcmp(foreign->id + len, + dict_ibfk, (sizeof dict_ibfk) - 1) + && foreign->id[len + ((sizeof dict_ibfk) - 1)] != '0') { + /* It is of the >= 4.0.18 format */ + + id = strtoul(foreign->id + len + + ((sizeof dict_ibfk) - 1), + &endp, 10); + if (*endp == '\0') { + ut_a(id != biggest_id); + + if (id > biggest_id) { + biggest_id = id; + } + } + } + } + + return(biggest_id); +} + +/*********************************************************************//** +Reports a simple foreign key create clause syntax error. 
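A standalone sketch of the numbering scan in dict_table_get_highest_foreign_id() above: match ids of the form <table>_ibfk_<number>, reject a leading zero, require the number to end the id, and keep the maximum:

#include <cstdio>
#include <cstdlib>
#include <cstring>

static unsigned long highest_ibfk(const char* table,
                                  const char* const* ids, int n_ids) {
    static const char suffix[] = "_ibfk_";
    const std::size_t tlen = std::strlen(table);
    const std::size_t slen = sizeof(suffix) - 1;
    unsigned long best = 0;
    for (int i = 0; i < n_ids; i++) {
        const char* id = ids[i];
        if (std::strncmp(id, table, tlen) != 0
            || std::strncmp(id + tlen, suffix, slen) != 0
            || id[tlen + slen] == '0') {  // reject a leading zero
            continue;
        }
        char* end;
        const unsigned long v = std::strtoul(id + tlen + slen, &end, 10);
        if (*end == '\0' && v > best) {
            best = v;  // the number must end the id
        }
    }
    return best;
}

int main() {
    const char* ids[] = {"test/t_ibfk_2", "test/t_ibfk_10", "test/t_fk"};
    std::printf("%lu\n", highest_ibfk("test/t", ids, 3));  // prints 10
}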
*/ +static +void +dict_foreign_report_syntax_err( +/*===========================*/ + const char* name, /*!< in: table name */ + const char* start_of_latest_foreign, + /*!< in: start of the foreign key clause + in the SQL string */ + const char* ptr) /*!< in: place of the syntax error */ +{ + ut_ad(!srv_read_only_mode); + + FILE* ef = dict_foreign_err_file; + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nSyntax error close to:\n%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Scans a table create SQL string and adds to the data dictionary the foreign +key constraints declared in the string. This function should be called after +the indexes for a table have been created. Each foreign key constraint must +be accompanied with indexes in both participating tables. The indexes are +allowed to contain more fields than mentioned in the constraint. +@return error code or DB_SUCCESS */ +static +dberr_t +dict_create_foreign_constraints_low( +/*================================*/ + trx_t* trx, /*!< in: transaction */ + mem_heap_t* heap, /*!< in: memory heap */ + struct charset_info_st* cs,/*!< in: the character set of sql_string */ + const char* sql_string, + /*!< in: CREATE TABLE or ALTER TABLE statement + where foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES table2(c, d), + table2 can be written also with the database + name before it: test.table2; the default + database is the database of parameter name */ + const char* name, /*!< in: table full name in the normalized form + database_name/table_name */ + ibool reject_fks) + /*!< in: if TRUE, fail with error code + DB_CANNOT_ADD_CONSTRAINT if any foreign + keys are found. 
*/
+{
+ dict_table_t* table;
+ dict_table_t* referenced_table;
+ dict_table_t* table_to_alter;
+ ulint highest_id_so_far = 0;
+ ulint number = 1;
+ dict_index_t* index;
+ dict_foreign_t* foreign;
+ const char* ptr = sql_string;
+ const char* start_of_latest_foreign = sql_string;
+ FILE* ef = dict_foreign_err_file;
+ const char* constraint_name;
+ ibool success;
+ dberr_t error;
+ const char* ptr1;
+ const char* ptr2;
+ ulint i;
+ ulint j;
+ ibool is_on_delete;
+ ulint n_on_deletes;
+ ulint n_on_updates;
+ const dict_col_t* columns[500];
+ const char* column_names[500];
+ const char* referenced_table_name;
+ dict_foreign_set local_fk_set;
+ dict_foreign_set_free local_fk_set_free(local_fk_set);
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = dict_table_get_low(name);
+
+ if (table == NULL) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef,
+ "Cannot find the table in the internal"
+ " data dictionary of InnoDB.\n"
+ "Create table statement:\n%s\n", sql_string);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_ERROR);
+ }
+
+ /* First check whether we are actually doing an ALTER TABLE, and in
+ that case look for the table being altered */
+
+ ptr = dict_accept(cs, ptr, "ALTER", &success);
+
+ if (!success) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "TABLE", &success);
+
+ if (!success) {
+
+ goto loop;
+ }
+
+ /* We are doing an ALTER TABLE: scan the table name we are altering */
+
+ ptr = dict_scan_table_name(cs, ptr, &table_to_alter, name,
+ &success, heap, &referenced_table_name);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Error: could not find"
+ " the table being ALTERED in:\n%s\n",
+ sql_string);
+
+ return(DB_ERROR);
+ }
+
+ /* Starting from 4.0.18 and 4.1.2, we generate foreign key ids in the
+ format databasename/tablename_ibfk_[number], where [number] is local
+ to the table; look for the highest [number] for table_to_alter, so
+ that we can assign higher numbers to new constraints. */
+
+ /* If we are altering a temporary table, the table name after ALTER
+ TABLE does not correspond to the internal table name, and
+ table_to_alter is NULL. TODO: should we fix this somehow? */
+
+ if (table_to_alter == NULL) {
+ highest_id_so_far = 0;
+ } else {
+ highest_id_so_far = dict_table_get_highest_foreign_id(
+ table_to_alter);
+ }
+
+ number = highest_id_so_far + 1;
+ /* Scan for foreign key declarations in a loop */
+loop:
+ /* Scan either to "CONSTRAINT" or "FOREIGN", whichever is closer */
+
+ ptr1 = dict_scan_to(ptr, "CONSTRAINT");
+ ptr2 = dict_scan_to(ptr, "FOREIGN");
+
+ constraint_name = NULL;
+
+ if (ptr1 < ptr2) {
+ /* The user may have specified a constraint name. Pick it so
+ that we can store 'databasename/constraintname' as the id
+ of the constraint in the system tables.
*/ + ptr = ptr1; + + ptr = dict_accept(cs, ptr, "CONSTRAINT", &success); + + ut_a(success); + + if (!my_isspace(cs, *ptr) && *ptr != '"' && *ptr != '`') { + goto loop; + } + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + /* read constraint name unless got "CONSTRAINT FOREIGN" */ + if (ptr != ptr2) { + ptr = dict_scan_id(cs, ptr, heap, + &constraint_name, FALSE, FALSE); + } + } else { + ptr = ptr2; + } + + if (*ptr == '\0') { + /* The proper way to reject foreign keys for temporary + tables would be to split the lexing and syntactical + analysis of foreign key clauses from the actual adding + of them, so that ha_innodb.cc could first parse the SQL + command, determine if there are any foreign keys, and + if so, immediately reject the command if the table is a + temporary one. For now, this kludge will work. */ + if (reject_fks && !local_fk_set.empty()) { + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /**********************************************************/ + /* The following call adds the foreign key constraints + to the data dictionary system tables on disk */ + + error = dict_create_add_foreigns_to_dictionary( + local_fk_set, table, trx); + + if (error == DB_SUCCESS) { + + table->foreign_set.insert(local_fk_set.begin(), + local_fk_set.end()); + std::for_each(local_fk_set.begin(), + local_fk_set.end(), + dict_foreign_add_to_referenced_table()); + local_fk_set.clear(); + } + return(error); + } + + start_of_latest_foreign = ptr; + + ptr = dict_accept(cs, ptr, "FOREIGN", &success); + + if (!success) { + goto loop; + } + + if (!my_isspace(cs, *ptr)) { + goto loop; + } + + ptr = dict_accept(cs, ptr, "KEY", &success); + + if (!success) { + goto loop; + } + + ptr = dict_accept(cs, ptr, "(", &success); + + if (!success) { + /* MySQL allows also an index id before the '('; we + skip it */ + ptr = dict_skip_word(cs, ptr, &success); + + if (!success) { + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, "(", &success); + + if (!success) { + /* We do not flag a syntax error here because in an + ALTER TABLE we may also have DROP FOREIGN KEY abc */ + + goto loop; + } + } + + i = 0; + + /* Scan the columns in the first list */ +col_loop1: + ut_a(i < (sizeof column_names) / sizeof *column_names); + ptr = dict_scan_col(cs, ptr, &success, table, columns + i, + heap, column_names + i); + if (!success) { + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nCannot resolve column name close to:\n%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + i++; + + ptr = dict_accept(cs, ptr, ",", &success); + + if (success) { + goto col_loop1; + } + + ptr = dict_accept(cs, ptr, ")", &success); + + if (!success) { + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Try to find an index which contains the columns + as the first fields and in the right order. There is + no need to check column type match (on types_idx), since + the referenced table can be NULL if foreign_key_checks is + set to 0 */ + + index = dict_foreign_find_index( + table, NULL, column_names, i, NULL, TRUE, FALSE); + + if (!index) { + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fputs("There is no index in table ", ef); + ut_print_name(ef, NULL, TRUE, name); + fprintf(ef, " where the columns appear\n" + "as the first columns. 
Constraint:\n%s\n" + "See " REFMAN "innodb-foreign-key-constraints.html\n" + "for correct foreign key definition.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CHILD_NO_INDEX); + } + ptr = dict_accept(cs, ptr, "REFERENCES", &success); + + if (!success || !my_isspace(cs, *ptr)) { + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Let us create a constraint struct */ + + foreign = dict_mem_foreign_create(); + + if (constraint_name) { + ulint db_len; + + /* Catenate 'databasename/' to the constraint name specified + by the user: we conceive the constraint as belonging to the + same MySQL 'database' as the table itself. We store the name + to foreign->id. */ + + db_len = dict_get_db_name_len(table->name); + + foreign->id = static_cast<char*>(mem_heap_alloc( + foreign->heap, db_len + strlen(constraint_name) + 2)); + + ut_memcpy(foreign->id, table->name, db_len); + foreign->id[db_len] = '/'; + strcpy(foreign->id + db_len + 1, constraint_name); + } + + if (foreign->id == NULL) { + error = dict_create_add_foreign_id(&number, + table->name, foreign); + if (error != DB_SUCCESS) { + dict_foreign_free(foreign); + return(error); + } + } + + std::pair<dict_foreign_set::iterator, bool> ret + = local_fk_set.insert(foreign); + + if (!ret.second) { + /* A duplicate foreign key name has been found */ + dict_foreign_free(foreign); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + foreign->foreign_table = table; + foreign->foreign_table_name = mem_heap_strdup( + foreign->heap, table->name); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + foreign->foreign_index = index; + foreign->n_fields = (unsigned int) i; + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, i * sizeof(void*))); + + for (i = 0; i < foreign->n_fields; i++) { + foreign->foreign_col_names[i] = mem_heap_strdup( + foreign->heap, + dict_table_get_col_name(table, + dict_col_get_no(columns[i]))); + } + + ptr = dict_scan_table_name(cs, ptr, &referenced_table, name, + &success, heap, &referenced_table_name); + + /* Note that referenced_table can be NULL if the user has suppressed + checking of foreign key constraints! 
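The foreign->id construction above reserves db_len + strlen(constraint_name) + 2 bytes: one extra byte for the '/' separator and one for the terminating NUL. A sketch of the same arithmetic, with plain malloc() standing in for mem_heap_alloc() and strcspn() standing in for dict_get_db_name_len():

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Build "databasename/constraintname" from a "db/table" name and a
// user-supplied constraint name.
static char* make_constraint_id(const char* table_name,
                                const char* constraint_name)
{
    size_t db_len = strcspn(table_name, "/");
    char* id = static_cast<char*>(
        malloc(db_len + strlen(constraint_name) + 2));

    memcpy(id, table_name, db_len);     /* database prefix */
    id[db_len] = '/';                   /* separator: the first extra byte */
    strcpy(id + db_len + 1, constraint_name); /* copies the NUL: the second */
    return id;
}

int main()
{
    char* id = make_constraint_id("test/child", "fk_parent");
    printf("%s\n", id); /* prints test/fk_parent */
    free(id);
}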
*/ + + if (!success || (!referenced_table && trx->check_foreigns)) { + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nCannot resolve table name close to:\n" + "%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, "(", &success); + + if (!success) { + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Scan the columns in the second list */ + i = 0; + +col_loop2: + ptr = dict_scan_col(cs, ptr, &success, referenced_table, columns + i, + heap, column_names + i); + i++; + + if (!success) { + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nCannot resolve column name close to:\n" + "%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, ",", &success); + + if (success) { + goto col_loop2; + } + + ptr = dict_accept(cs, ptr, ")", &success); + + if (!success || foreign->n_fields != i) { + + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + n_on_deletes = 0; + n_on_updates = 0; + +scan_on_conditions: + /* Loop here as long as we can find ON ... conditions */ + + ptr = dict_accept(cs, ptr, "ON", &success); + + if (!success) { + + goto try_find_index; + } + + ptr = dict_accept(cs, ptr, "DELETE", &success); + + if (!success) { + ptr = dict_accept(cs, ptr, "UPDATE", &success); + + if (!success) { + + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + is_on_delete = FALSE; + n_on_updates++; + } else { + is_on_delete = TRUE; + n_on_deletes++; + } + + ptr = dict_accept(cs, ptr, "RESTRICT", &success); + + if (success) { + goto scan_on_conditions; + } + + ptr = dict_accept(cs, ptr, "CASCADE", &success); + + if (success) { + if (is_on_delete) { + foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE; + } else { + foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE; + } + + goto scan_on_conditions; + } + + ptr = dict_accept(cs, ptr, "NO", &success); + + if (success) { + ptr = dict_accept(cs, ptr, "ACTION", &success); + + if (!success) { + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + if (is_on_delete) { + foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION; + } else { + foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION; + } + + goto scan_on_conditions; + } + + ptr = dict_accept(cs, ptr, "SET", &success); + + if (!success) { + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, "NULL", &success); + + if (!success) { + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + for (j = 0; j < foreign->n_fields; j++) { + if ((dict_index_get_nth_col(foreign->foreign_index, j)->prtype) + & DATA_NOT_NULL) { + + /* It is not sensible to define SET NULL + if the column is not allowed to be NULL! 
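A standalone model of how the scanner above accumulates referential actions in a bitmask, including the two rejections it enforces (a second ON DELETE or ON UPDATE clause, and SET NULL on a NOT NULL column). The flag values are illustrative; InnoDB's real DICT_FOREIGN_ON_* constants live elsewhere, and the duplicate-clause check actually fires later, at try_find_index.

#include <cstdio>

enum {
    FK_ON_DELETE_CASCADE  = 1 << 0,
    FK_ON_DELETE_SET_NULL = 1 << 1,
    FK_ON_UPDATE_CASCADE  = 1 << 2,
    FK_ON_UPDATE_SET_NULL = 1 << 3
};

// Returns false on the error cases rejected above.
static bool add_action(unsigned& type, bool is_on_delete, bool set_null,
                       bool any_col_not_null,
                       unsigned& n_on_deletes, unsigned& n_on_updates)
{
    unsigned& n = is_on_delete ? n_on_deletes : n_on_updates;
    if (++n > 1) {
        return false;   /* twice an ON DELETE or ON UPDATE clause */
    }
    if (set_null && any_col_not_null) {
        return false;   /* SET NULL on a NOT NULL column */
    }
    if (set_null) {
        type |= is_on_delete ? FK_ON_DELETE_SET_NULL : FK_ON_UPDATE_SET_NULL;
    } else {
        type |= is_on_delete ? FK_ON_DELETE_CASCADE : FK_ON_UPDATE_CASCADE;
    }
    return true;
}

int main()
{
    unsigned type = 0, nd = 0, nu = 0;
    bool ok = add_action(type, true, false, false, nd, nu)  /* ON DELETE CASCADE */
           && add_action(type, false, true, false, nd, nu); /* ON UPDATE SET NULL */
    printf("type=%#x ok=%d\n", type, ok);
}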
*/ + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\n" + "You have defined a SET NULL condition" + " though some of the\n" + "columns are defined as NOT NULL.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + } + + if (is_on_delete) { + foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL; + } else { + foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL; + } + + goto scan_on_conditions; + +try_find_index: + if (n_on_deletes > 1 || n_on_updates > 1) { + /* It is an error to define more than 1 action */ + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\n" + "You have twice an ON DELETE clause" + " or twice an ON UPDATE clause.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Try to find an index which contains the columns as the first fields + and in the right order, and the types are the same as in + foreign->foreign_index */ + + if (referenced_table) { + index = dict_foreign_find_index(referenced_table, NULL, + column_names, i, + foreign->foreign_index, + TRUE, FALSE); + if (!index) { + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\n" + "Cannot find an index in the" + " referenced table where the\n" + "referenced columns appear as the" + " first columns, or column types\n" + "in the table and the referenced table" + " do not match for constraint.\n" + "Note that the internal storage type of" + " ENUM and SET changed in\n" + "tables created with >= InnoDB-4.1.12," + " and such columns in old tables\n" + "cannot be referenced by such columns" + " in new tables.\n" + "See " REFMAN + "innodb-foreign-key-constraints.html\n" + "for correct foreign key definition.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_PARENT_NO_INDEX); + } + } else { + ut_a(trx->check_foreigns == FALSE); + index = NULL; + } + + foreign->referenced_index = index; + foreign->referenced_table = referenced_table; + + foreign->referenced_table_name = mem_heap_strdup( + foreign->heap, referenced_table_name); + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + foreign->referenced_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, i * sizeof(void*))); + + for (i = 0; i < foreign->n_fields; i++) { + foreign->referenced_col_names[i] + = mem_heap_strdup(foreign->heap, column_names[i]); + } + + goto loop; +} +/************************************************************************** +Determines whether a string starts with the specified keyword. +@return TRUE if str starts with keyword */ +UNIV_INTERN +ibool +dict_str_starts_with_keyword( +/*=========================*/ + THD* thd, /*!< in: MySQL thread handle */ + const char* str, /*!< in: string to scan for keyword */ + const char* keyword) /*!< in: keyword to look for */ +{ + struct charset_info_st* cs = innobase_get_charset(thd); + ibool success; + + dict_accept(cs, str, keyword, &success); + return(success); +} + +/*********************************************************************//** +Scans a table create SQL string and adds to the data dictionary the foreign +key constraints declared in the string. This function should be called after +the indexes for a table have been created. Each foreign key constraint must +be accompanied with indexes in both participating tables. 
The indexes are +allowed to contain more fields than mentioned in the constraint. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_foreign_constraints( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + const char* sql_string, /*!< in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES + table2(c, d), table2 can be written + also with the database + name before it: test.table2; the + default database id the database of + parameter name */ + size_t sql_length, /*!< in: length of sql_string */ + const char* name, /*!< in: table full name in the + normalized form + database_name/table_name */ + ibool reject_fks) /*!< in: if TRUE, fail with error + code DB_CANNOT_ADD_CONSTRAINT if + any foreign keys are found. */ +{ + char* str; + dberr_t err; + mem_heap_t* heap; + + ut_a(trx); + ut_a(trx->mysql_thd); + + str = dict_strip_comments(sql_string, sql_length); + heap = mem_heap_create(10000); + + err = dict_create_foreign_constraints_low( + trx, heap, innobase_get_charset(trx->mysql_thd), str, name, + reject_fks); + + mem_heap_free(heap); + mem_free(str); + + return(err); +} + +/**********************************************************************//** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. +@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the +constraint id does not match */ +UNIV_INTERN +dberr_t +dict_foreign_parse_drop_constraints( +/*================================*/ + mem_heap_t* heap, /*!< in: heap from which we can + allocate memory */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table */ + ulint* n, /*!< out: number of constraints + to drop */ + const char*** constraints_to_drop) /*!< out: id's of the + constraints to drop */ +{ + ibool success; + char* str; + size_t len; + const char* ptr; + const char* id; + struct charset_info_st* cs; + + ut_a(trx); + ut_a(trx->mysql_thd); + + cs = innobase_get_charset(trx->mysql_thd); + + *n = 0; + + *constraints_to_drop = static_cast<const char**>( + mem_heap_alloc(heap, 1000 * sizeof(char*))); + + ptr = innobase_get_stmt(trx->mysql_thd, &len); + + str = dict_strip_comments(ptr, len); + + ptr = str; + + ut_ad(mutex_own(&(dict_sys->mutex))); +loop: + ptr = dict_scan_to(ptr, "DROP"); + + if (*ptr == '\0') { + mem_free(str); + + return(DB_SUCCESS); + } + + ptr = dict_accept(cs, ptr, "DROP", &success); + + if (!my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "FOREIGN", &success); + + if (!success || !my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "KEY", &success); + + if (!success) { + + goto syntax_error; + } + + ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE); + + if (id == NULL) { + + goto syntax_error; + } + + ut_a(*n < 1000); + (*constraints_to_drop)[*n] = id; + (*n)++; + + if (std::find_if(table->foreign_set.begin(), + table->foreign_set.end(), + dict_foreign_matches_id(id)) + == table->foreign_set.end()) { + + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in dropping of a foreign key " + "constraint of table ", ef); + ut_print_name(ef, NULL, TRUE, table->name); + fputs(",\nin SQL command\n", ef); + fputs(str, ef); + fputs("\nCannot find a constraint with the " + "given id ", ef); + ut_print_name(ef, NULL, FALSE, id); + fputs(".\n", ef); + mutex_exit(&dict_foreign_err_mutex); + } + + mem_free(str); + + 
return(DB_CANNOT_DROP_CONSTRAINT); + } + + goto loop; + +syntax_error: + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Syntax error in dropping of a" + " foreign key constraint of table ", ef); + ut_print_name(ef, NULL, TRUE, table->name); + fprintf(ef, ",\n" + "close to:\n%s\n in SQL command\n%s\n", ptr, str); + mutex_exit(&dict_foreign_err_mutex); + } + + mem_free(str); + + return(DB_CANNOT_DROP_CONSTRAINT); +} + +/*==================== END OF FOREIGN KEY PROCESSING ====================*/ + +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys->mutex is already being held. +@return index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + index_id_t index_id) /*!< in: index id */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + + return(dict_index_find_on_id_low(index_id)); +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +@return index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + index_id_t index_id) /*!< in: index id */ +{ + dict_index_t* index; + + if (dict_sys == NULL) { + return(NULL); + } + + mutex_enter(&(dict_sys->mutex)); + + index = dict_index_get_if_in_cache_low(index_id); + + mutex_exit(&(dict_sys->mutex)); + + return(index); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. +@return TRUE if ok */ +UNIV_INTERN +ibool +dict_index_check_search_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index tree */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ +{ + ut_a(index); + ut_a(dtuple_get_n_fields_cmp(tuple) + <= dict_index_get_n_unique_in_tree(index)); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Builds a node pointer out of a physical record and a page number. 
+@return own: node pointer */ +UNIV_INTERN +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap, /*!< in: memory heap where pointer + created */ + ulint level) /*!< in: level of rec in tree: + 0 means leaf level */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + ulint n_unique; + + if (dict_index_is_univ(index)) { + /* In a universal index tree, we take the whole record as + the node pointer if the record is on the leaf level, + on non-leaf levels we remove the last field, which + contains the page number of the child page */ + + ut_a(!dict_table_is_comp(index->table)); + n_unique = rec_get_n_fields_old(rec); + + if (level > 0) { + ut_a(n_unique > 1); + n_unique--; + } + } else { + n_unique = dict_index_get_n_unique_in_tree(index); + } + + tuple = dtuple_create(heap, n_unique + 1); + + /* When searching in the tree for the node pointer, we must not do + comparison on the last field, the page number field, as on upper + levels in the tree there may be identical node pointers with a + different page number; therefore, we set the n_fields_cmp to one + less: */ + + dtuple_set_n_fields_cmp(tuple, n_unique); + + dict_index_copy_types(tuple, index, n_unique); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + field = dtuple_get_nth_field(tuple, n_unique); + dfield_set_data(field, buf, 4); + + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); + + rec_copy_prefix_to_dtuple(tuple, rec, index, n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_STATUS_NODE_PTR); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/**********************************************************************//** +Copies an initial segment of a physical record, long enough to specify an +index entry uniquely. +@return pointer to the prefix record */ +UNIV_INTERN +rec_t* +dict_index_copy_rec_order_prefix( +/*=============================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to + copy prefix */ + ulint* n_fields,/*!< out: number of fields copied */ + byte** buf, /*!< in/out: memory buffer for the + copied prefix, or NULL */ + ulint* buf_size)/*!< in/out: buffer size */ +{ + ulint n; + + UNIV_PREFETCH_R(rec); + + if (dict_index_is_univ(index)) { + ut_a(!dict_table_is_comp(index->table)); + n = rec_get_n_fields_old(rec); + } else { + n = dict_index_get_n_unique_in_tree(index); + } + + *n_fields = n; + return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size)); +} + +/**********************************************************************//** +Builds a typed data tuple out of a physical record. 
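A small sketch of the node-pointer field appended above: the child page number is stored as 4 big-endian bytes, which is what mach_write_to_4() does, while n_fields_cmp is set one lower so searches never compare that field.

#include <cstdio>
#include <cstdint>

// Big-endian 4-byte encoding, mirroring mach_write_to_4().
static void write_be32(uint8_t* buf, uint32_t page_no)
{
    buf[0] = static_cast<uint8_t>(page_no >> 24);
    buf[1] = static_cast<uint8_t>(page_no >> 16);
    buf[2] = static_cast<uint8_t>(page_no >> 8);
    buf[3] = static_cast<uint8_t>(page_no);
}

int main()
{
    uint8_t buf[4];
    write_be32(buf, 42);
    // On upper B-tree levels two node pointers may carry identical key
    // prefixes and differ only in this page-number field, which is why
    // it must be excluded from comparisons.
    printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
}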
+@return own: data tuple */ +UNIV_INTERN +dtuple_t* +dict_index_build_data_tuple( +/*========================*/ + dict_index_t* index, /*!< in: index tree */ + rec_t* rec, /*!< in: record for which to build data tuple */ + ulint n_fields,/*!< in: number of data fields */ + mem_heap_t* heap) /*!< in: memory heap where tuple created */ +{ + dtuple_t* tuple; + + ut_ad(dict_table_is_comp(index->table) + || n_fields <= rec_get_n_fields_old(rec)); + + tuple = dtuple_create(heap, n_fields); + + dict_index_copy_types(tuple, index, n_fields); + + rec_copy_prefix_to_dtuple(tuple, rec, index, n_fields, heap); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/*********************************************************************//** +Calculates the minimum record length in an index. */ +UNIV_INTERN +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index) /*!< in: index */ +{ + ulint sum = 0; + ulint i; + ulint comp = dict_table_is_comp(index->table); + + if (comp) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_col_t* col + = dict_index_get_nth_col(index, i); + ulint size = dict_col_get_fixed_size(col, comp); + sum += size; + if (!size) { + size = col->len; + sum += size < 128 ? 1 : 2; + } + if (!(col->prtype & DATA_NOT_NULL)) { + nullable++; + } + } + + /* round the NULL flags up to full bytes */ + sum += UT_BITS_IN_BYTES(nullable); + + return(sum); + } + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + sum += dict_col_get_fixed_size( + dict_index_get_nth_col(index, i), comp); + } + + if (sum > 127) { + sum += 2 * dict_index_get_n_fields(index); + } else { + sum += dict_index_get_n_fields(index); + } + + sum += REC_N_OLD_EXTRA_BYTES; + + return(sum); +} + +/**********************************************************************//** +Prints info of a foreign key constraint. */ +static +void +dict_foreign_print_low( +/*===================*/ + dict_foreign_t* foreign) /*!< in: foreign key constraint */ +{ + ulint i; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + fprintf(stderr, " FOREIGN KEY CONSTRAINT %s: %s (", + foreign->id, foreign->foreign_table_name); + + for (i = 0; i < foreign->n_fields; i++) { + fprintf(stderr, " %s", foreign->foreign_col_names[i]); + } + + fprintf(stderr, " )\n" + " REFERENCES %s (", + foreign->referenced_table_name); + + for (i = 0; i < foreign->n_fields; i++) { + fprintf(stderr, " %s", foreign->referenced_col_names[i]); + } + + fputs(" )\n", stderr); +} + +/**********************************************************************//** +Prints a table data. 
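A standalone sketch of the COMPACT-format branch of the minimum record length computed above: per-column fixed sizes, 1 or 2 length bytes for each variable-length column (2 when the maximum length is 128 or more), and the NULL-flag bitmap rounded up to whole bytes. REC_N_NEW_EXTRA_BYTES is assumed to be 5 here.

#include <cstdio>
#include <vector>

struct Col { unsigned fixed_size; unsigned max_len; bool nullable; };

static unsigned min_rec_len_compact(const std::vector<Col>& cols)
{
    const unsigned REC_N_NEW_EXTRA_BYTES = 5;
    unsigned sum = REC_N_NEW_EXTRA_BYTES;
    unsigned nullable = 0;

    for (const Col& c : cols) {
        sum += c.fixed_size;
        if (c.fixed_size == 0) {            /* variable-length column */
            sum += c.max_len < 128 ? 1 : 2; /* length byte(s) */
        }
        if (c.nullable) {
            nullable++;
        }
    }
    return sum + (nullable + 7) / 8;        /* NULL bitmap, whole bytes */
}

int main()
{
    // INT NOT NULL (4 bytes fixed) plus VARCHAR(200) NULL (variable).
    std::vector<Col> cols = {{4, 4, false}, {0, 200, true}};
    printf("%u\n", min_rec_len_compact(cols));
}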
*/ +UNIV_INTERN +void +dict_table_print( +/*=============*/ + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ulint i; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + dict_table_stats_lock(table, RW_X_LATCH); + + if (!table->stat_initialized) { + dict_stats_update_transient(table); + } + + fprintf(stderr, + "--------------------------------------\n" + "TABLE: name %s, id %llu, flags %lx, columns %lu," + " indexes %lu, appr.rows " UINT64PF "\n" + " COLUMNS: ", + table->name, + (ullint) table->id, + (ulong) table->flags, + (ulong) table->n_cols, + (ulong) UT_LIST_GET_LEN(table->indexes), + table->stat_n_rows); + + for (i = 0; i < (ulint) table->n_cols; i++) { + dict_col_print_low(table, dict_table_get_nth_col(table, i)); + fputs("; ", stderr); + } + + putc('\n', stderr); + + index = UT_LIST_GET_FIRST(table->indexes); + + while (index != NULL) { + dict_index_print_low(index); + index = UT_LIST_GET_NEXT(indexes, index); + } + + dict_table_stats_unlock(table, RW_X_LATCH); + + std::for_each(table->foreign_set.begin(), + table->foreign_set.end(), + dict_foreign_print_low); + + std::for_each(table->referenced_set.begin(), + table->referenced_set.end(), + dict_foreign_print_low); +} + +/**********************************************************************//** +Prints a column data. */ +static +void +dict_col_print_low( +/*===============*/ + const dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column */ +{ + dtype_t type; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + dict_col_copy_type(col, &type); + fprintf(stderr, "%s: ", dict_table_get_col_name(table, + dict_col_get_no(col))); + + dtype_print(&type); +} + +/**********************************************************************//** +Prints an index data. */ +static +void +dict_index_print_low( +/*=================*/ + dict_index_t* index) /*!< in: index */ +{ + ib_int64_t n_vals; + ulint i; + + ut_a(index->table->stat_initialized); + + ut_ad(mutex_own(&(dict_sys->mutex))); + + if (index->n_user_defined_cols > 0) { + n_vals = index->stat_n_diff_key_vals[ + index->n_user_defined_cols - 1]; + } else { + n_vals = index->stat_n_diff_key_vals[0]; + } + + fprintf(stderr, + " INDEX: name %s, id %llu, fields %lu/%lu," + " uniq %lu, type %lu\n" + " root page %lu, appr.key vals %lu," + " leaf pages %lu, size pages %lu\n" + " FIELDS: ", + index->name, + (ullint) index->id, + (ulong) index->n_user_defined_cols, + (ulong) index->n_fields, + (ulong) index->n_uniq, + (ulong) index->type, + (ulong) index->page, + (ulong) n_vals, + (ulong) index->stat_n_leaf_pages, + (ulong) index->stat_index_size); + + for (i = 0; i < index->n_fields; i++) { + dict_field_print_low(dict_index_get_nth_field(index, i)); + } + + putc('\n', stderr); + +#ifdef UNIV_BTR_PRINT + btr_print_size(index); + + btr_print_index(index, 7); +#endif /* UNIV_BTR_PRINT */ +} + +/**********************************************************************//** +Prints a field data. */ +static +void +dict_field_print_low( +/*=================*/ + const dict_field_t* field) /*!< in: field */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + + fprintf(stderr, " %s", field->name); + + if (field->prefix_len != 0) { + fprintf(stderr, "(%lu)", (ulong) field->prefix_len); + } +} + +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. 
*/ +UNIV_INTERN +void +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + FILE* file, /*!< in: file where to print */ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline) /*!< in: whether to add a newline */ +{ + const char* stripped_id; + ulint i; + + if (strchr(foreign->id, '/')) { + /* Strip the preceding database name from the constraint id */ + stripped_id = foreign->id + 1 + + dict_get_db_name_len(foreign->id); + } else { + stripped_id = foreign->id; + } + + putc(',', file); + + if (add_newline) { + /* SHOW CREATE TABLE wants constraints each printed nicely + on its own line, while error messages want no newlines + inserted. */ + fputs("\n ", file); + } + + fputs(" CONSTRAINT ", file); + ut_print_name(file, trx, FALSE, stripped_id); + fputs(" FOREIGN KEY (", file); + + for (i = 0;;) { + ut_print_name(file, trx, FALSE, foreign->foreign_col_names[i]); + if (++i < foreign->n_fields) { + fputs(", ", file); + } else { + break; + } + } + + fputs(") REFERENCES ", file); + + if (dict_tables_have_same_db(foreign->foreign_table_name_lookup, + foreign->referenced_table_name_lookup)) { + /* Do not print the database name of the referenced table */ + ut_print_name(file, trx, TRUE, + dict_remove_db_name( + foreign->referenced_table_name)); + } else { + ut_print_name(file, trx, TRUE, + foreign->referenced_table_name); + } + + putc(' ', file); + putc('(', file); + + for (i = 0;;) { + ut_print_name(file, trx, FALSE, + foreign->referenced_col_names[i]); + if (++i < foreign->n_fields) { + fputs(", ", file); + } else { + break; + } + } + + putc(')', file); + + if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) { + fputs(" ON DELETE CASCADE", file); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + fputs(" ON DELETE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + fputs(" ON DELETE NO ACTION", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + fputs(" ON UPDATE CASCADE", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + fputs(" ON UPDATE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + fputs(" ON UPDATE NO ACTION", file); + } +} + +/**********************************************************************//** +Outputs info on foreign keys of a table. 
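A sketch of the clause format produced above, using std::string in place of the FILE*/ut_print_name() plumbing; quoting is simplified and the database-name stripping that the real function performs is omitted.

#include <cstdio>
#include <string>
#include <vector>

static std::string fk_clause(const std::string& id,
                             const std::vector<std::string>& cols,
                             const std::string& ref_table,
                             const std::vector<std::string>& ref_cols,
                             const std::string& actions)
{
    std::string s = " CONSTRAINT `" + id + "` FOREIGN KEY (";
    for (size_t i = 0; i < cols.size(); i++) {
        s += (i ? ", `" : "`") + cols[i] + "`";
    }
    s += ") REFERENCES `" + ref_table + "` (";
    for (size_t i = 0; i < ref_cols.size(); i++) {
        s += (i ? ", `" : "`") + ref_cols[i] + "`";
    }
    return s + ")" + actions;
}

int main()
{
    printf("%s\n", fk_clause("fk1", {"a", "b"}, "parent", {"x", "y"},
                             " ON DELETE CASCADE").c_str());
}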
*/ +UNIV_INTERN +void +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + FILE* file, /*!< in: file where to print */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table */ +{ + dict_foreign_t* foreign; + + mutex_enter(&(dict_sys->mutex)); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (create_table_format) { + dict_print_info_on_foreign_key_in_create_format( + file, trx, foreign, TRUE); + } else { + ulint i; + fputs("; (", file); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + putc(' ', file); + } + + ut_print_name(file, trx, FALSE, + foreign->foreign_col_names[i]); + } + + fputs(") REFER ", file); + ut_print_name(file, trx, TRUE, + foreign->referenced_table_name); + putc('(', file); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + putc(' ', file); + } + ut_print_name( + file, trx, FALSE, + foreign->referenced_col_names[i]); + } + + putc(')', file); + + if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) { + fputs(" ON DELETE CASCADE", file); + } + + if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) { + fputs(" ON DELETE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + fputs(" ON DELETE NO ACTION", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + fputs(" ON UPDATE CASCADE", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + fputs(" ON UPDATE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + fputs(" ON UPDATE NO ACTION", file); + } + } + } + + mutex_exit(&(dict_sys->mutex)); +} + +/********************************************************************//** +Displays the names of the index and the table. */ +UNIV_INTERN +void +dict_index_name_print( +/*==================*/ + FILE* file, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to print */ +{ + fputs("index ", file); + ut_print_name(file, trx, FALSE, index->name); + fputs(" of table ", file); + ut_print_name(file, trx, TRUE, index->table_name); +} + +/**********************************************************************//** +Find a table in dict_sys->table_LRU list with specified space id +@return table if found, NULL if not */ +static +dict_table_t* +dict_find_table_by_space( +/*=====================*/ + ulint space_id) /*!< in: space ID */ +{ + dict_table_t* table; + ulint num_item; + ulint count = 0; + + ut_ad(space_id > 0); + + if (dict_sys == NULL) { + /* This could happen when it's in redo processing. */ + return(NULL); + } + + table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + num_item = UT_LIST_GET_LEN(dict_sys->table_LRU); + + /* This function intentionally does not acquire mutex as it is used + by error handling code in deep call stack as last means to avoid + killing the server, so it worth to risk some consequencies for + the action. 
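The walk this comment describes deliberately runs without dict_sys->mutex because it is reached from deep error-handling paths; the element count captured up front bounds the loop so a list that shrinks concurrently cannot make it run away. A generic sketch of the same bounded-walk pattern, on a hand-rolled list rather than UT_LIST:

#include <cstddef>

struct Node { unsigned space_id; Node* next; };

static Node* find_by_space(Node* head, size_t len, unsigned space_id)
{
    size_t count = 0;
    for (Node* n = head; n != nullptr && count < len; n = n->next, count++) {
        if (n->space_id == space_id) {
            return n;
        }
    }
    return nullptr; /* not found, or the list changed under us */
}

int main()
{
    Node c = {30, nullptr}, b = {20, &c}, a = {10, &b};
    return find_by_space(&a, 3, 20) ? 0 : 1;
}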
*/ + while (table && count < num_item) { + if (table->space == space_id) { + return(table); + } + + table = UT_LIST_GET_NEXT(table_LRU, table); + count++; + } + + return(NULL); +} + +/**********************************************************************//** +Flags a table with specified space_id corrupted in the data dictionary +cache +@return TRUE if successful */ +UNIV_INTERN +ibool +dict_set_corrupted_by_space( +/*========================*/ + ulint space_id) /*!< in: space ID */ +{ + dict_table_t* table; + + table = dict_find_table_by_space(space_id); + + if (!table) { + return(FALSE); + } + + /* mark the table->corrupted bit only, since the caller + could be too deep in the stack for SYS_INDEXES update */ + table->corrupted = TRUE; + + return(TRUE); +} + +/**********************************************************************//** +Flags an index corrupted both in the data dictionary cache +and in the SYS_INDEXES */ +UNIV_INTERN +void +dict_set_corrupted( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx, /*!< in/out: transaction */ + const char* ctx) /*!< in: context */ +{ + mem_heap_t* heap; + mtr_t mtr; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + char* table_name; + const char* status; + btr_cur_t cursor; + bool locked = RW_X_LATCH == trx->dict_operation_lock_mode; + + if (!locked) { + row_mysql_lock_data_dictionary(trx); + } + + ut_ad(index); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(!dict_table_is_comp(dict_sys->sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys->sys_indexes)); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_except_dict()); +#endif + + /* Mark the table as corrupted only if the clustered index + is corrupted */ + if (dict_index_is_clust(index)) { + index->table->corrupted = TRUE; + } + + if (index->type & DICT_CORRUPT) { + /* The index was already flagged corrupted. 
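A sketch of the SYS_INDEXES search key built just below: TABLE_ID and INDEX_ID are each stored as 8 big-endian bytes (the job of mach_write_to_8()), so the in-memory tuple compares the same way as the on-disk index records.

#include <cstdio>
#include <cstdint>

// Big-endian 8-byte encoding, mirroring mach_write_to_8().
static void write_be64(uint8_t* buf, uint64_t v)
{
    for (int i = 0; i < 8; i++) {
        buf[i] = static_cast<uint8_t>(v >> (8 * (7 - i)));
    }
}

int main()
{
    uint8_t key[16];
    write_be64(key, 1234);     /* TABLE_ID  */
    write_be64(key + 8, 5678); /* INDEX_ID  */
    for (uint8_t b : key) {
        printf("%02x", b);
    }
    printf("\n");
}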
*/ + ut_ad(!dict_index_is_clust(index) || index->table->corrupted); + goto func_exit; + } + + heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + + sizeof(que_fork_t) + sizeof(upd_node_t) + + sizeof(upd_t) + 12)); + mtr_start(&mtr); + index->type |= DICT_CORRUPT; + + sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes); + + /* Find the index row in SYS_INDEXES */ + tuple = dtuple_create(heap, 2); + + dfield = dtuple_get_nth_field(tuple, 0); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->table->id); + dfield_set_data(dfield, buf, 8); + + dfield = dtuple_get_nth_field(tuple, 1); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + dfield_set_data(dfield, buf, 8); + + dict_index_copy_types(tuple, sys_index, 2); + + btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_LE, + BTR_MODIFY_LEAF, + &cursor, 0, __FILE__, __LINE__, &mtr); + + if (cursor.low_match == dtuple_get_n_fields(tuple)) { + /* UPDATE SYS_INDEXES SET TYPE=index->type + WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */ + ulint len; + byte* field = rec_get_nth_field_old( + btr_cur_get_rec(&cursor), + DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto fail; + } + mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr); + status = "Flagged"; + } else { +fail: + status = "Unable to flag"; + } + + mtr_commit(&mtr); + mem_heap_empty(heap); + table_name = static_cast<char*>(mem_heap_alloc(heap, FN_REFLEN + 1)); + *innobase_convert_name( + table_name, FN_REFLEN, + index->table_name, strlen(index->table_name), + NULL, TRUE) = 0; + + ib_logf(IB_LOG_LEVEL_ERROR, "%s corruption of %s in table %s in %s", + status, index->name, table_name, ctx); + + mem_heap_free(heap); + +func_exit: + if (!locked) { + row_mysql_unlock_data_dictionary(trx); + } +} + +/**********************************************************************//** +Flags an index corrupted in the data dictionary cache only. This +is used mostly to mark a corrupted index when index's own dictionary +is corrupted, and we force to load such index for repair purpose */ +UNIV_INTERN +void +dict_set_corrupted_index_cache_only( +/*================================*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(index); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(!dict_table_is_comp(dict_sys->sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys->sys_indexes)); + + /* Mark the table as corrupted only if the clustered index + is corrupted */ + if (dict_index_is_clust(index)) { + dict_table_t* corrupt_table; + + corrupt_table = table ? 
table : index->table; + ut_ad(!index->table || !table || index->table == table); + + if (corrupt_table) { + corrupt_table->corrupted = TRUE; + } + } + + index->type |= DICT_CORRUPT; +} + +/************************************************************************* +set is_corrupt flag by space_id*/ + +void +dict_table_set_corrupt_by_space( +/*============================*/ + ulint space_id, + ibool need_mutex) +{ + dict_table_t* table; + ibool found = FALSE; + + ut_a(space_id != 0 && space_id < SRV_LOG_SPACE_FIRST_ID); + + if (need_mutex) + mutex_enter(&(dict_sys->mutex)); + + table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + + while (table) { + if (table->space == space_id) { + table->is_corrupt = TRUE; + found = TRUE; + } + + table = UT_LIST_GET_NEXT(table_LRU, table); + } + + if (need_mutex) + mutex_exit(&(dict_sys->mutex)); + + if (!found) { + fprintf(stderr, "InnoDB: space to be marked as " + "crashed was not found for id " ULINTPF ".\n", + space_id); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Inits dict_ind_redundant and dict_ind_compact. */ +UNIV_INTERN +void +dict_ind_init(void) +/*===============*/ +{ + dict_table_t* table; + + /* create dummy table and index for REDUNDANT infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0, 0, + true); + dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8); + + dict_ind_redundant = dict_mem_index_create("SYS_DUMMY1", "SYS_DUMMY1", + DICT_HDR_SPACE, 0, 1); + dict_index_add_col(dict_ind_redundant, table, + dict_table_get_nth_col(table, 0), 0); + dict_ind_redundant->table = table; + + /* create dummy table and index for COMPACT infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY2", + DICT_HDR_SPACE, 1, + DICT_TF_COMPACT, 0, true); + dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8); + dict_ind_compact = dict_mem_index_create("SYS_DUMMY2", "SYS_DUMMY2", + DICT_HDR_SPACE, 0, 1); + dict_index_add_col(dict_ind_compact, table, + dict_table_get_nth_col(table, 0), 0); + dict_ind_compact->table = table; + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + dict_ind_redundant->cached = dict_ind_compact->cached = TRUE; +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Frees dict_ind_redundant and dict_ind_compact. 
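dict_table_set_corrupt_by_space() above takes dict_sys->mutex only when need_mutex is set, because some callers already hold it. A modern-C++ sketch of the same conditional-locking pattern, with std::mutex standing in for the InnoDB mutex:

#include <cstdio>
#include <mutex>

static std::mutex dict_mutex; /* stand-in for dict_sys->mutex */

static void mark_corrupt(unsigned space_id, bool need_mutex)
{
    std::unique_lock<std::mutex> lock(dict_mutex, std::defer_lock);
    if (need_mutex) {
        lock.lock(); /* released automatically on return */
    }
    printf("marking space %u\n", space_id);
}

int main()
{
    mark_corrupt(7, true); /* caller does not already hold the mutex */
}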
*/ +static +void +dict_ind_free(void) +/*===============*/ +{ + dict_table_t* table; + + table = dict_ind_compact->table; + dict_mem_index_free(dict_ind_compact); + dict_ind_compact = NULL; + dict_mem_table_free(table); + + table = dict_ind_redundant->table; + dict_mem_index_free(dict_ind_redundant); + dict_ind_redundant = NULL; + dict_mem_table_free(table); +} + +/**********************************************************************//** +Get index by name +@return index, NULL if does not exist */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name( +/*=========================*/ + dict_table_t* table, /*!< in: table */ + const char* name) /*!< in: name of the index to find */ +{ + dict_index_t* index; + + /* If name is NULL, just return */ + if (!name) { + return(NULL); + } + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (innobase_strcasecmp(index->name, name) == 0) { + + return(index); + } + + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table. +@return whether all replacements were found */ +UNIV_INTERN +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ +{ + bool found = true; + dict_foreign_t* foreign; + + ut_ad(index->to_be_dropped); + ut_ad(index->table == table); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + if (foreign->foreign_index == index) { + ut_ad(foreign->foreign_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->foreign_table, col_names, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE); + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->foreign_index = new_index; + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + if (foreign->referenced_index == index) { + ut_ad(foreign->referenced_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE); + /* There must exist an alternative index, + since this must have been checked earlier. */ + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->referenced_index = new_index; + } + } + + return(found); +} + +/**********************************************************************//** +In case there is more than one index with the same name return the index +with the min(id). 
+@return index, NULL if does not exist */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name_and_min_id( +/*=====================================*/ + dict_table_t* table, /*!< in: table */ + const char* name) /*!< in: name of the index to find */ +{ + dict_index_t* index; + dict_index_t* min_index; /* Index with matching name and min(id) */ + + min_index = NULL; + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (ut_strcmp(index->name, name) == 0) { + if (!min_index || index->id < min_index->id) { + + min_index = index; + } + } + + index = dict_table_get_next_index(index); + } + + return(min_index); + +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Check for duplicate index entries in a table [using the index name] */ +UNIV_INTERN +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table, /*!< in: Check for dup indexes + in this table */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ +{ + /* Check for duplicates, ignoring indexes that are marked + as to be dropped */ + + const dict_index_t* index1; + const dict_index_t* index2; + + ut_ad(mutex_own(&dict_sys->mutex)); + + /* The primary index _must_ exist */ + ut_a(UT_LIST_GET_LEN(table->indexes) > 0); + + index1 = UT_LIST_GET_FIRST(table->indexes); + + do { + if (*index1->name == TEMP_INDEX_PREFIX) { + ut_a(!dict_index_is_clust(index1)); + + switch (check) { + case CHECK_ALL_COMPLETE: + ut_error; + case CHECK_ABORTED_OK: + switch (dict_index_get_online_status(index1)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + ut_error; + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + /* fall through */ + case CHECK_PARTIAL_OK: + break; + } + } + + for (index2 = UT_LIST_GET_NEXT(indexes, index1); + index2 != NULL; + index2 = UT_LIST_GET_NEXT(indexes, index2)) { + ut_ad(ut_strcmp(index1->name, index2->name)); + } + + index1 = UT_LIST_GET_NEXT(indexes, index1); + } while (index1); +} +#endif /* UNIV_DEBUG */ + +/** Auxiliary macro used inside dict_table_schema_check(). */ +#define CREATE_TYPES_NAMES() \ + dtype_sql_name((unsigned) req_schema->columns[i].mtype, \ + (unsigned) req_schema->columns[i].prtype_mask, \ + (unsigned) req_schema->columns[i].len, \ + req_type, sizeof(req_type)); \ + dtype_sql_name(table->cols[j].mtype, \ + table->cols[j].prtype, \ + table->cols[j].len, \ + actual_type, sizeof(actual_type)) + +/*********************************************************************//** +Checks whether a table exists and whether it has the given structure. +The table must have the same number of columns with the same names and +types. The order of the columns does not matter. +The caller must own the dictionary mutex. 
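A sketch of the column-matching strategy used below: try position i first, which makes the whole check O(n_cols) when both arrays list columns in the same order, and fall back to a full scan otherwise. strcmp() stands in for the case-insensitive innobase_strcasecmp() used by the real code.

#include <cstdio>
#include <cstring>

static int find_col(const char* const* actual, int n_actual,
                    const char* wanted, int hint)
{
    if (hint < n_actual && strcmp(actual[hint], wanted) == 0) {
        return hint;              /* fast path: same order */
    }
    for (int j = 0; j < n_actual; j++) {
        if (strcmp(actual[j], wanted) == 0) {
            return j;             /* slow path: full scan */
        }
    }
    return -1;                    /* required column missing */
}

int main()
{
    const char* cols[] = {"id", "name", "flags"};
    printf("%d %d\n", find_col(cols, 3, "name", 1),   /* hit via hint */
                      find_col(cols, 3, "flags", 0)); /* hit via scan */
}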
+dict_table_schema_check() @{ +@return DB_SUCCESS if the table exists and contains the necessary columns */ +UNIV_INTERN +dberr_t +dict_table_schema_check( +/*====================*/ + dict_table_schema_t* req_schema, /*!< in/out: required table + schema */ + char* errstr, /*!< out: human readable error + message if != DB_SUCCESS is + returned */ + size_t errstr_sz) /*!< in: errstr size */ +{ + char buf[MAX_FULL_NAME_LEN]; + char req_type[64]; + char actual_type[64]; + dict_table_t* table; + ulint i; + + ut_ad(mutex_own(&dict_sys->mutex)); + + table = dict_table_get_low(req_schema->table_name); + + if (table == NULL) { + /* no such table */ + + ut_snprintf(errstr, errstr_sz, + "Table %s not found.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf))); + + return(DB_TABLE_NOT_FOUND); + } + + if (table->ibd_file_missing) { + /* missing tablespace */ + + ut_snprintf(errstr, errstr_sz, + "Tablespace for table %s is missing.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf))); + + return(DB_TABLE_NOT_FOUND); + } + + if ((ulint) table->n_def - DATA_N_SYS_COLS != req_schema->n_cols) { + /* the table has a different number of columns than + required */ + + ut_snprintf(errstr, errstr_sz, + "%s has %d columns but should have %lu.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + table->n_def - DATA_N_SYS_COLS, + req_schema->n_cols); + + return(DB_ERROR); + } + + /* For each column from req_schema->columns[] search + whether it is present in table->cols[]. + The following algorithm is O(n_cols^2), but is optimized to + be O(n_cols) if the columns are in the same order in both arrays. */ + + for (i = 0; i < req_schema->n_cols; i++) { + ulint j; + + /* check if i'th column is the same in both arrays */ + if (innobase_strcasecmp(req_schema->columns[i].name, + dict_table_get_col_name(table, i)) == 0) { + + /* we found the column in table->cols[] quickly */ + j = i; + } else { + + /* columns in both arrays are not in the same order, + do a full scan of the second array */ + for (j = 0; j < table->n_def; j++) { + const char* name; + + name = dict_table_get_col_name(table, j); + + if (innobase_strcasecmp(name, + req_schema->columns[i].name) == 0) { + + /* found the column on j'th + position */ + break; + } + } + + if (j == table->n_def) { + + ut_snprintf(errstr, errstr_sz, + "required column %s " + "not found in table %s.", + req_schema->columns[i].name, + ut_format_name( + req_schema->table_name, + TRUE, buf, sizeof(buf))); + + return(DB_ERROR); + } + } + + /* we found a column with the same name on j'th position, + compare column types and flags */ + + /* check length for exact match */ + if (req_schema->columns[i].len != table->cols[j].len) { + + CREATE_TYPES_NAMES(); + + ut_snprintf(errstr, errstr_sz, + "Column %s in table %s is %s " + "but should be %s (length mismatch).", + req_schema->columns[i].name, + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + actual_type, req_type); + + return(DB_ERROR); + } + + /* check mtype for exact match */ + if (req_schema->columns[i].mtype != table->cols[j].mtype) { + + CREATE_TYPES_NAMES(); + + ut_snprintf(errstr, errstr_sz, + "Column %s in table %s is %s " + "but should be %s (type mismatch).", + req_schema->columns[i].name, + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + actual_type, req_type); + + return(DB_ERROR); + } + + /* check whether required prtype mask is set */ + if (req_schema->columns[i].prtype_mask != 0 + && (table->cols[j].prtype + & 
req_schema->columns[i].prtype_mask) + != req_schema->columns[i].prtype_mask) { + + CREATE_TYPES_NAMES(); + + ut_snprintf(errstr, errstr_sz, + "Column %s in table %s is %s " + "but should be %s (flags mismatch).", + req_schema->columns[i].name, + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + actual_type, req_type); + + return(DB_ERROR); + } + } + + if (req_schema->n_foreign != table->foreign_set.size()) { + ut_snprintf( + errstr, errstr_sz, + "Table %s has " ULINTPF " foreign key(s) pointing" + " to other tables, but it must have %lu.", + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + static_cast<ulint>(table->foreign_set.size()), + req_schema->n_foreign); + return(DB_ERROR); + } + + if (req_schema->n_referenced != table->referenced_set.size()) { + ut_snprintf( + errstr, errstr_sz, + "There are " ULINTPF " foreign key(s) pointing to %s, " + "but there must be %lu.", + static_cast<ulint>(table->referenced_set.size()), + ut_format_name(req_schema->table_name, + TRUE, buf, sizeof(buf)), + req_schema->n_referenced); + return(DB_ERROR); + } + + return(DB_SUCCESS); +} +/* @} */ + +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +UNIV_INTERN +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g. aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ +{ + char db[MAX_DATABASE_NAME_LEN + 1]; + ulint db_len; + uint errors; + + db_len = dict_get_db_name_len(db_and_table); + + ut_a(db_len <= sizeof(db)); + + memcpy(db, db_and_table, db_len); + db[db_len] = '\0'; + + strconvert( + &my_charset_filename, db, system_charset_info, + db_utf8, static_cast<uint>(db_utf8_size), &errors); + + /* convert each # to @0023 in table name and store the result in buf */ + const char* table = dict_remove_db_name(db_and_table); + const char* table_p; + char buf[MAX_TABLE_NAME_LEN * 5 + 1]; + char* buf_p; + for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) { + if (table_p[0] != '#') { + buf_p[0] = table_p[0]; + buf_p++; + } else { + buf_p[0] = '@'; + buf_p[1] = '0'; + buf_p[2] = '0'; + buf_p[3] = '2'; + buf_p[4] = '3'; + buf_p += 5; + } + ut_a((size_t) (buf_p - buf) < sizeof(buf)); + } + buf_p[0] = '\0'; + + errors = 0; + strconvert( + &my_charset_filename, buf, system_charset_info, + table_utf8, static_cast<uint>(table_utf8_size), + &errors); + + if (errors != 0) { + ut_snprintf(table_utf8, table_utf8_size, "%s%s", + srv_mysql50_table_name_prefix, table); + } +} + +/**********************************************************************//** +Closes the data dictionary module. */ +UNIV_INTERN +void +dict_close(void) +/*============*/ +{ + ulint i; + + /* Free the hash elements. We don't remove them from the table + because we are going to destroy the table anyway. 
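A sketch of the '#' to "@0023" rewriting done in dict_fs2utf8() above: each '#' expands to five bytes, which is why the temporary buffer is sized MAX_TABLE_NAME_LEN * 5 + 1.

#include <cstdio>
#include <cstring>

// out must be large enough for a fivefold expansion plus the NUL;
// a real implementation would bound-check against out_size.
static void escape_hash(const char* in, char* out, size_t out_size)
{
    char* p = out;
    for (; *in != '\0'; in++) {
        if (*in != '#') {
            *p++ = *in;
        } else {
            memcpy(p, "@0023", 5); /* '#' is code point 0x23 */
            p += 5;
        }
    }
    *p = '\0';
    (void) out_size;
}

int main()
{
    char buf[64];
    escape_hash("t#p#0", buf, sizeof(buf));
    printf("%s\n", buf); /* t@0023p@00230 */
}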
*/ + for (i = 0; i < hash_get_n_cells(dict_sys->table_hash); i++) { + dict_table_t* table; + + table = static_cast<dict_table_t*>( + HASH_GET_FIRST(dict_sys->table_hash, i)); + + while (table) { + dict_table_t* prev_table = table; + + table = static_cast<dict_table_t*>( + HASH_GET_NEXT(name_hash, prev_table)); +#ifdef UNIV_DEBUG + ut_a(prev_table->magic_n == DICT_TABLE_MAGIC_N); +#endif + /* Acquire only because it's a pre-condition. */ + mutex_enter(&dict_sys->mutex); + + dict_table_remove_from_cache(prev_table); + + mutex_exit(&dict_sys->mutex); + } + } + + hash_table_free(dict_sys->table_hash); + + /* The elements are the same instance as in dict_sys->table_hash, + therefore we don't delete the individual elements. */ + hash_table_free(dict_sys->table_id_hash); + + dict_ind_free(); + + mutex_free(&dict_sys->mutex); + + rw_lock_free(&dict_operation_lock); + memset(&dict_operation_lock, 0x0, sizeof(dict_operation_lock)); + + if (!srv_read_only_mode) { + mutex_free(&dict_foreign_err_mutex); + } + + mem_free(dict_sys); + dict_sys = NULL; +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if valid */ +static +ibool +dict_lru_validate(void) +/*===================*/ +{ + dict_table_t* table; + + ut_ad(mutex_own(&dict_sys->mutex)); + + for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(table->can_be_evicted); + } + + for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(!table->can_be_evicted); + } + + return(TRUE); +} + +/**********************************************************************//** +Check if a table exists in the dict table LRU list. +@return TRUE if table found in LRU list */ +static +ibool +dict_lru_find_table( +/*================*/ + const dict_table_t* find_table) /*!< in: table to find */ +{ + dict_table_t* table; + + ut_ad(find_table != NULL); + ut_ad(mutex_own(&dict_sys->mutex)); + + for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(table->can_be_evicted); + + if (table == find_table) { + return(TRUE); + } + } + + return(FALSE); +} + +/**********************************************************************//** +Check if a table exists in the dict table non-LRU list. +@return TRUE if table found in non-LRU list */ +static +ibool +dict_non_lru_find_table( +/*====================*/ + const dict_table_t* find_table) /*!< in: table to find */ +{ + dict_table_t* table; + + ut_ad(find_table != NULL); + ut_ad(mutex_own(&dict_sys->mutex)); + + for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(!table->can_be_evicted); + + if (table == find_table) { + return(TRUE); + } + } + + return(FALSE); +} +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Check an index to see whether its first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. 
+@return true if the index qualifies, otherwise false */ +UNIV_INTERN +bool +dict_foreign_qualify_index( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ +{ + if (dict_index_get_n_fields(index) < n_cols) { + return(false); + } + + for (ulint i = 0; i < n_cols; i++) { + dict_field_t* field; + const char* col_name; + ulint col_no; + + field = dict_index_get_nth_field(index, i); + col_no = dict_col_get_no(field->col); + + if (field->prefix_len != 0) { + /* We do not accept column prefix + indexes here */ + return(false); + } + + if (check_null + && (field->col->prtype & DATA_NOT_NULL)) { + return(false); + } + + col_name = col_names + ? col_names[col_no] + : dict_table_get_col_name(table, col_no); + + if (0 != innobase_strcasecmp(columns[i], col_name)) { + return(false); + } + + if (types_idx && !cmp_cols_are_equal( + dict_index_get_nth_col(index, i), + dict_index_get_nth_col(types_idx, i), + check_charsets)) { + return(false); + } + } + + return(true); +} + +/*********************************************************************//** +Update the state of compression failure padding heuristics. This is +called whenever a compression operation succeeds or fails. +The caller must be holding info->mutex */ +static +void +dict_index_zip_pad_update( +/*======================*/ + zip_pad_info_t* info, /*<! in/out: info to be updated */ + ulint zip_threshold) /*<! in: zip threshold value */ +{ + ulint total; + ulint fail_pct; + + ut_ad(info); + + total = info->success + info->failure; + + ut_ad(total > 0); + + if(zip_threshold == 0) { + /* User has just disabled the padding. */ + return; + } + + if (total < ZIP_PAD_ROUND_LEN) { + /* We are in middle of a round. Do nothing. */ + return; + } + + /* We are at a 'round' boundary. Reset the values but first + calculate fail rate for our heuristic. */ + fail_pct = (info->failure * 100) / total; + info->failure = 0; + info->success = 0; + + if (fail_pct > zip_threshold) { + /* Compression failures are more then user defined + threshold. Increase the pad size to reduce chances of + compression failures. */ + ut_ad(info->pad % ZIP_PAD_INCR == 0); + + /* Only do increment if it won't increase padding + beyond max pad size. */ + if (info->pad + ZIP_PAD_INCR + < (UNIV_PAGE_SIZE * zip_pad_max) / 100) { +#ifdef HAVE_ATOMIC_BUILTINS + /* Use atomics even though we have the mutex. + This is to ensure that we are able to read + info->pad atomically where atomics are + supported. */ + os_atomic_increment_ulint(&info->pad, ZIP_PAD_INCR); +#else /* HAVE_ATOMIC_BUILTINS */ + info->pad += ZIP_PAD_INCR; +#endif /* HAVE_ATOMIC_BUILTINS */ + + MONITOR_INC(MONITOR_PAD_INCREMENTS); + } + + info->n_rounds = 0; + + } else { + /* Failure rate was OK. Another successful round + completed. */ + ++info->n_rounds; + + /* If enough successful rounds are completed with + compression failure rate in control, decrease the + padding. 
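A standalone model of the padding heuristic above: at each round boundary, raise the pad when the failure rate exceeds the threshold, and lower it again only after several consecutive calm rounds. The constants here are assumptions for illustration, not InnoDB's actual ZIP_PAD_* values, and the atomics/mutex handling is omitted.

#include <cstdio>

struct PadState { unsigned pad = 0, n_rounds = 0; };

static void end_of_round(PadState& s, unsigned failures, unsigned total,
                         unsigned threshold_pct)
{
    const unsigned ZIP_PAD_INCR = 128, PAD_MAX = 4096;
    const unsigned SUCCESSFUL_ROUND_LIMIT = 5;

    if (failures * 100 / total > threshold_pct) {
        if (s.pad + ZIP_PAD_INCR < PAD_MAX) {
            s.pad += ZIP_PAD_INCR; /* back off: compress smaller pages */
        }
        s.n_rounds = 0;
    } else if (++s.n_rounds >= SUCCESSFUL_ROUND_LIMIT && s.pad > 0) {
        s.pad -= ZIP_PAD_INCR;     /* calm again: reclaim some space */
        s.n_rounds = 0;
    }
}

int main()
{
    PadState s;
    end_of_round(s, 20, 100, 5); /* 20% failures exceeds a 5% threshold */
    printf("pad=%u\n", s.pad);   /* pad=128 */
}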
*/ + if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT + && info->pad > 0) { + + ut_ad(info->pad % ZIP_PAD_INCR == 0); +#ifdef HAVE_ATOMIC_BUILTINS + /* Use atomics even though we have the mutex. + This is to ensure that we are able to read + info->pad atomically where atomics are + supported. */ + os_atomic_decrement_ulint(&info->pad, ZIP_PAD_INCR); +#else /* HAVE_ATOMIC_BUILTINS */ + info->pad -= ZIP_PAD_INCR; +#endif /* HAVE_ATOMIC_BUILTINS */ + + info->n_rounds = 0; + + MONITOR_INC(MONITOR_PAD_DECREMENTS); + } + } +} + +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ut_ad(index); + + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + os_fast_mutex_lock(&index->zip_pad.mutex); + ++index->zip_pad.success; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + os_fast_mutex_unlock(&index->zip_pad.mutex); +} + +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ut_ad(index); + + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + os_fast_mutex_lock(&index->zip_pad.mutex); + ++index->zip_pad.failure; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + os_fast_mutex_unlock(&index->zip_pad.mutex); +} + + +/*********************************************************************//** +Return the optimal page size, for which the page will likely compress. +@return the page size beyond which the page might not compress */ +UNIV_INTERN +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which the page size + is requested */ +{ + ulint pad; + ulint min_sz; + ulint sz; + + ut_ad(index); + + if (!zip_failure_threshold_pct) { + /* Disabled by user. */ + return(UNIV_PAGE_SIZE); + } + + /* We use atomics to read index->zip_pad.pad. Here we use zero + as the increment, as we are not changing the value of 'pad'. On + platforms where atomics are not available we grab the mutex. */ + +#ifdef HAVE_ATOMIC_BUILTINS + pad = os_atomic_increment_ulint(&index->zip_pad.pad, 0); +#else /* HAVE_ATOMIC_BUILTINS */ + os_fast_mutex_lock(&index->zip_pad.mutex); + pad = index->zip_pad.pad; + os_fast_mutex_unlock(&index->zip_pad.mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + ut_ad(pad < UNIV_PAGE_SIZE); + sz = UNIV_PAGE_SIZE - pad; + + /* Min size allowed by user. */ + ut_ad(zip_pad_max < 100); + min_sz = (UNIV_PAGE_SIZE * (100 - zip_pad_max)) / 100; + + return(ut_max(sz, min_sz)); +} + +/*************************************************************//** +Convert table flags to a row format string. +@return the row format name.
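For example, a table created with ROW_FORMAT=COMPRESSED carries +REC_FORMAT_COMPRESSED in its flags and is reported here as +"ROW_TYPE_COMPRESSED".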
*/ +UNIV_INTERN +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag) /*!< in: row format setting */ +{ + switch (dict_tf_get_rec_format(table_flag)) { + case REC_FORMAT_REDUNDANT: + return("ROW_TYPE_REDUNDANT"); + case REC_FORMAT_COMPACT: + return("ROW_TYPE_COMPACT"); + case REC_FORMAT_COMPRESSED: + return("ROW_TYPE_COMPRESSED"); + case REC_FORMAT_DYNAMIC: + return("ROW_TYPE_DYNAMIC"); + } + + ut_error; + return(0); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/dict/dict0load.cc b/storage/xtradb/dict/dict0load.cc new file mode 100644 index 00000000000..874614bfb5c --- /dev/null +++ b/storage/xtradb/dict/dict0load.cc @@ -0,0 +1,3149 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0load.cc +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0load.h" +#include "mysql_version.h" + +#ifdef UNIV_NONINL +#include "dict0load.ic" +#endif + +#include "btr0pcur.h" +#include "btr0btr.h" +#include "page0page.h" +#include "mach0data.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0stats.h" +#include "rem0cmp.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "dict0crea.h" +#include "dict0priv.h" +#include "ha_prototypes.h" /* innobase_casedn_str() */ +#include "fts0priv.h" + +/** Following are the InnoDB system tables. The positions in +this array are referenced by enum dict_system_table_id. */ +static const char* SYSTEM_TABLE_NAME[] = { + "SYS_TABLES", + "SYS_INDEXES", + "SYS_COLUMNS", + "SYS_FIELDS", + "SYS_FOREIGN", + "SYS_FOREIGN_COLS", + "SYS_TABLESPACES", + "SYS_DATAFILES" +}; + +/* If this flag is TRUE, then we will load the cluster index's (and tables') +metadata even if it is marked as "corrupted". */ +UNIV_INTERN my_bool srv_load_corrupted = FALSE; + +#ifdef UNIV_DEBUG +/****************************************************************//** +Compare the name of an index column. +@return TRUE if the i'th column of index is 'name'. */ +static +ibool +name_of_col_is( +/*===========*/ + const dict_table_t* table, /*!< in: table */ + const dict_index_t* index, /*!< in: index */ + ulint i, /*!< in: index field offset */ + const char* name) /*!< in: name to compare to */ +{ + ulint tmp = dict_col_get_no(dict_field_get_col( + dict_index_get_nth_field( + index, i))); + + return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Finds the first table name in the given database. 
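+For example (a sketch; "test/" is an illustrative database name): + + mutex_enter(&(dict_sys->mutex)); + char* first = dict_get_first_table_name_in_db("test/"); + mutex_exit(&(dict_sys->mutex)); + + if (first != NULL) { + ut_print_name(stderr, NULL, TRUE, first); + mem_free(first); + } +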
+@return own: table name, NULL if does not exist; the caller must free +the memory in the string! */ +UNIV_INTERN +char* +dict_get_first_table_name_in_db( +/*============================*/ + const char* name) /*!< in: database name which ends in '/' */ +{ + dict_table_t* sys_tables; + btr_pcur_t pcur; + dict_index_t* sys_index; + dtuple_t* tuple; + mem_heap_t* heap; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + heap = mem_heap_create(1000); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_ad(!dict_table_is_comp(sys_tables)); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, name, ut_strlen(name)); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); +loop: + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + + if (len < strlen(name) + || ut_memcmp(name, field, strlen(name)) != 0) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + if (!rec_get_deleted_flag(rec, 0)) { + + /* We found one */ + + char* table_name = mem_strdupl((char*) field, len); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(table_name); + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + goto loop; +} + +/********************************************************************//** +Prints to the standard output information on all tables found in the data +dictionary system table. */ +UNIV_INTERN +void +dict_print(void) +/*============*/ +{ + dict_table_t* table; + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + /* Enlarge the fatal semaphore wait timeout during the InnoDB table + monitor printout */ + + os_increment_counter_by_amount( + server_mutex, + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); + + heap = mem_heap_create(1000); + mutex_enter(&(dict_sys->mutex)); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES); + + while (rec) { + const char* err_msg; + + err_msg = static_cast<const char*>( + dict_process_sys_tables_rec_and_mtr_commit( + heap, rec, &table, DICT_TABLE_LOAD_FROM_CACHE, + &mtr)); + + if (!err_msg) { + dict_table_print(table); + } else { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: %s\n", err_msg); + } + + mem_heap_empty(heap); + + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&(dict_sys->mutex)); + mem_heap_free(heap); + + /* Restore the fatal semaphore wait timeout */ + os_decrement_counter_by_amount( + server_mutex, + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); +} + +/********************************************************************//** +This function gets the next system table record as it scans the table. 
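+It is the common engine under dict_startscan_system() and +dict_getnext_system() below; dict_print() above shows the typical +scan loop: start the scan, process each record, restart the mtr and +fetch the next record until NULL is returned.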
+@return the next record if found, NULL if end of scan */ +static +const rec_t* +dict_getnext_system_low( +/*====================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor to the + record */ + mtr_t* mtr) /*!< in: the mini-transaction */ +{ + rec_t* rec = NULL; + + while (!rec || rec_get_deleted_flag(rec, 0)) { + btr_pcur_move_to_next_user_rec(pcur, mtr); + + rec = btr_pcur_get_rec(pcur); + + if (!btr_pcur_is_on_user_rec(pcur)) { + /* end of index */ + btr_pcur_close(pcur); + + return(NULL); + } + } + + /* We have a record; save the position. */ + btr_pcur_store_position(pcur, mtr); + + return(rec); +} + +/********************************************************************//** +This function opens a system table, and returns the first record. +@return first record of the system table */ +UNIV_INTERN +const rec_t* +dict_startscan_system( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor to + the record */ + mtr_t* mtr, /*!< in: the mini-transaction */ + dict_system_id_t system_id) /*!< in: which system table to open */ +{ + dict_table_t* system_table; + dict_index_t* clust_index; + const rec_t* rec; + + ut_a(system_id < SYS_NUM_SYSTEM_TABLES); + + system_table = dict_table_get_low(SYSTEM_TABLE_NAME[system_id]); + + clust_index = UT_LIST_GET_FIRST(system_table->indexes); + + btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, pcur, + true, 0, mtr); + + rec = dict_getnext_system_low(pcur, mtr); + + return(rec); +} + +/********************************************************************//** +This function gets the next system table record as it scans the table. +@return the next record if found, NULL if end of scan */ +UNIV_INTERN +const rec_t* +dict_getnext_system( +/*================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor + to the record */ + mtr_t* mtr) /*!< in: the mini-transaction */ +{ + const rec_t* rec; + + /* Restore the position */ + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + + /* Get the next record */ + rec = dict_getnext_system_low(pcur, mtr); + + return(rec); +} + +/********************************************************************//** +This function processes one SYS_TABLES record and populates the +dict_table_t struct for the table. Extracted out of dict_print() to be +used by both the monitor table output and the information schema +innodb_sys_tables output.
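+For example, dict_print() above passes DICT_TABLE_LOAD_FROM_CACHE: +the record is then only used to extract the table name, the +mini-transaction is committed, and the dict_table_t itself is looked +up in the dictionary cache.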
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_tables_rec_and_mtr_commit( +/*=======================================*/ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_TABLES record */ + dict_table_t** table, /*!< out: dict_table_t to fill */ + dict_table_info_t status, /*!< in: status bit that controls + options such as whether we shall + look up the dict_table_t in the + cache first */ + mtr_t* mtr) /*!< in/out: mini-transaction, + will be committed */ +{ + ulint len; + const char* field; + const char* err_msg = NULL; + char* table_name; + + field = (const char*) rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + + ut_a(!rec_get_deleted_flag(rec, 0)); + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + + /* Get the table name */ + table_name = mem_heap_strdupl(heap, field, len); + + /* If DICT_TABLE_LOAD_FROM_CACHE is set, first check + whether there is a cached dict_table_t struct */ + if (status & DICT_TABLE_LOAD_FROM_CACHE) { + + /* Commit before loading the table again */ + mtr_commit(mtr); + + *table = dict_table_get_low(table_name); + + if (!(*table)) { + err_msg = "Table not found in cache"; + } + } else { + err_msg = dict_load_table_low(table_name, rec, table); + mtr_commit(mtr); + } + + if (err_msg) { + return(err_msg); + } + + return(NULL); +} + +/********************************************************************//** +This function parses a SYS_INDEXES record and populates a dict_index_t +structure with the information from the record. For detailed information +about SYS_INDEXES fields, please refer to the dict_boot() function. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_indexes_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_INDEXES rec */ + dict_index_t* index, /*!< out: index to be filled */ + table_id_t* table_id) /*!< out: index table id */ +{ + const char* err_msg; + byte* buf; + + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + + /* Parse the record, and get the "dict_index_t" struct filled */ + err_msg = dict_load_index_low(buf, NULL, + heap, rec, FALSE, &index); + + *table_id = mach_read_from_8(buf); + + return(err_msg); +} + +/********************************************************************//** +This function parses a SYS_COLUMNS record and populates a dict_column_t +structure with the information from the record. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_columns_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_COLUMNS rec */ + dict_col_t* column, /*!< out: dict_col_t to be filled */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name) /*!< out: column name */ +{ + const char* err_msg; + + /* Parse the record, and get the "dict_col_t" struct filled */ + err_msg = dict_load_column_low(NULL, heap, column, + table_id, col_name, rec); + + return(err_msg); +} + +/********************************************************************//** +This function parses a SYS_FIELDS record and populates a dict_field_t +structure with the information from the record.
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_fields_rec( +/*========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FIELDS rec */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: field position */ + index_id_t* index_id, /*!< out: current index id */ + index_id_t last_id) /*!< in: previous index id */ +{ + byte* buf; + byte* last_index_id; + const char* err_msg; + + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + + last_index_id = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(last_index_id, last_id); + + err_msg = dict_load_field_low(buf, NULL, sys_field, + pos, last_index_id, heap, rec); + + *index_id = mach_read_from_8(buf); + + return(err_msg); +} + +/********************************************************************//** +This function parses a SYS_FOREIGN record and populates a dict_foreign_t +structure with the information from the record. For detailed information +about SYS_FOREIGN fields, please refer to the dict_load_foreign() function. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_foreign_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN rec */ + dict_foreign_t* foreign) /*!< out: dict_foreign_t struct + to be filled */ +{ + ulint len; + const byte* field; + ulint n_fields_and_type; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) { + return("wrong number of columns in SYS_FOREIGN record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN"); + } + + /* This function receives a dict_foreign_t* that points to a + stack variable, so mem_heap_free(foreign->heap) is not used as + it is elsewhere. Since the heap used here is freed elsewhere, + foreign->heap is not assigned.
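Note also how the N_COLS field read below packs two values into one +integer: the constraint type is kept in the high 8 bits and the +column count in the low bits, so the illustrative value +(1 << 24) | 2 == 0x01000002 describes a two-column constraint whose +type value is 1.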
*/ + foreign->id = mem_heap_strdupl(heap, (const char*) field, len); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + /* The _lookup versions of the referenced and foreign table names + are not assigned since they are not used in this dict_foreign_t */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->foreign_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->referenced_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len); + if (len != 4) { + goto err_len; + } + n_fields_and_type = mach_read_from_4(field); + + foreign->type = (unsigned int) (n_fields_and_type >> 24); + foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL); + + return(NULL); +} + +/********************************************************************//** +This function parses a SYS_FOREIGN_COLS record and extract necessary +information from the record and return to caller. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_foreign_col_rec( +/*=============================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */ + const char** name, /*!< out: foreign key constraint name */ + const char** for_col_name, /*!< out: referencing column name */ + const char** ref_col_name, /*!< out: referenced column name + in referenced table */ + ulint* pos) /*!< out: column position */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN_COLS"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) { + return("wrong number of columns in SYS_FOREIGN_COLS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN_COLS"); + } + *name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + if (len != 4) { + goto err_len; + } + *pos = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *for_col_name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *ref_col_name = mem_heap_strdupl(heap, (char*) field, len); + + return(NULL); +} + +/********************************************************************//** +This function 
parses a SYS_TABLESPACES record, extracts necessary +information from the record and returns to caller. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_tablespaces( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */ + ulint* space, /*!< out: space id */ + const char** name, /*!< out: tablespace name */ + ulint* flags) /*!< out: tablespace flags */ +{ + ulint len; + const byte* field; + + /* Initialize the output values */ + *space = ULINT_UNDEFINED; + *name = NULL; + *flags = ULINT_UNDEFINED; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_TABLESPACES"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLESPACES) { + return("wrong number of columns in SYS_TABLESPACES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len); + if (len != DICT_FLD_LEN_SPACE) { +err_len: + return("incorrect column length in SYS_TABLESPACES"); + } + *space = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLESPACES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLESPACES__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len); + if (len != DICT_FLD_LEN_FLAGS) { + goto err_len; + } + *flags = mach_read_from_4(field); + + return(NULL); +} + +/********************************************************************//** +This function parses a SYS_DATAFILES record, extracts necessary +information from the record and returns it to the caller. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_datafiles( +/*=======================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_DATAFILES rec */ + ulint* space, /*!< out: space id */ + const char** path) /*!< out: datafile paths */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_DATAFILES"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_DATAFILES) { + return("wrong number of columns in SYS_DATAFILES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_DATAFILES__SPACE, &len); + if (len != DICT_FLD_LEN_SPACE) { +err_len: + return("incorrect column length in SYS_DATAFILES"); + } + *space = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_DATAFILES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_DATAFILES__PATH, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *path = mem_heap_strdupl(heap, (char*) field, len); + + return(NULL); +} + +/********************************************************************//** +Determine the flags of a table as stored in SYS_TABLES.TYPE and N_COLS. 
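+For example, for a ROW_FORMAT=REDUNDANT table TYPE is 1 and the +high-order bit of N_COLS is clear, while for ROW_FORMAT=COMPACT and +later the high-order bit of N_COLS (DICT_N_COLS_COMPACT) is set; the +two fields are combined below into a single dict_table_t::flags +value.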
+@return ULINT_UNDEFINED if error, else a valid dict_table_t::flags. */ +static +ulint +dict_sys_tables_get_flags( +/*======================*/ + const rec_t* rec) /*!< in: a record of SYS_TABLES */ +{ + const byte* field; + ulint len; + ulint type; + ulint n_cols; + + /* read the 4 byte flags from the TYPE field */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__TYPE, &len); + ut_a(len == 4); + type = mach_read_from_4(field); + + /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in + dict_table_t::flags the low order bit is used to determine if the + row format is Redundant or Compact when the format is Antelope. + Read the 4 byte N_COLS field and look at the high order bit. It + should be set for COMPACT and later. It should not be set for + REDUNDANT. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + ut_a(len == 4); + n_cols = mach_read_from_4(field); + + /* This validation function also combines the DICT_N_COLS_COMPACT + flag in n_cols into the type field to effectively make it a + dict_table_t::flags. */ + + if (ULINT_UNDEFINED == dict_sys_tables_type_validate(type, n_cols)) { + return(ULINT_UNDEFINED); + } + + return(dict_sys_tables_type_to_tf(type, n_cols)); +} + +/********************************************************************//** +Gets the filepath for a spaceid from SYS_DATAFILES and checks it against +the contents of a link file. This function is called when there is no +fil_node_t entry for this space ID so both durable locations on disk +must be checked and compared. +We use a temporary heap here for the table lookup, but not for the path +returned which the caller must free. +This function can return NULL if the space ID is not found in SYS_DATAFILES, +then the caller will assume that the ibd file is in the normal datadir. +@return own: A copy of the first datafile found in SYS_DATAFILES.PATH for +the given space ID. NULL if space ID is zero or not found. */ +UNIV_INTERN +char* +dict_get_first_path( +/*================*/ + ulint space, /*!< in: space id */ + const char* name) /*!< in: tablespace name */ +{ + mtr_t mtr; + dict_table_t* sys_datafiles; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + btr_pcur_t pcur; + const rec_t* rec; + const byte* field; + ulint len; + char* dict_filepath = NULL; + mem_heap_t* heap = mem_heap_create(1024); + + ut_ad(mutex_own(&(dict_sys->mutex))); + + mtr_start(&mtr); + + sys_datafiles = dict_table_get_low("SYS_DATAFILES"); + sys_index = UT_LIST_GET_FIRST(sys_datafiles->indexes); + ut_ad(!dict_table_is_comp(sys_datafiles)); + ut_ad(name_of_col_is(sys_datafiles, sys_index, + DICT_FLD__SYS_DATAFILES__SPACE, "SPACE")); + ut_ad(name_of_col_is(sys_datafiles, sys_index, + DICT_FLD__SYS_DATAFILES__PATH, "PATH")); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, DICT_FLD__SYS_DATAFILES__SPACE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(buf, space); + + dfield_set_data(dfield, buf, 4); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + /* If the file-per-table tablespace was created with + an earlier version of InnoDB, then this record is not + in SYS_DATAFILES. But a link file still might exist. */ + + if (btr_pcur_is_on_user_rec(&pcur)) { + /* A record for this space ID was found. 
*/ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_DATAFILES__PATH, &len); + ut_a(len > 0 || len == UNIV_SQL_NULL); + ut_a(len < OS_FILE_MAX_PATH); + dict_filepath = mem_strdupl((char*) field, len); + ut_a(dict_filepath); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(dict_filepath); +} + +/********************************************************************//** +Update the record for space_id in SYS_DATAFILES to this filepath. +@return DB_SUCCESS if OK, dberr_t if the update failed */ +UNIV_INTERN +dberr_t +dict_update_filepath( +/*=================*/ + ulint space_id, /*!< in: space id */ + const char* filepath) /*!< in: filepath */ +{ + dberr_t err = DB_SUCCESS; + trx_t* trx; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = trx_allocate_for_background(); + trx->op_info = "update filepath"; + trx->dict_operation_lock_mode = RW_X_LATCH; + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "space", space_id); + pars_info_add_str_literal(info, "path", filepath); + + err = que_eval_sql(info, + "PROCEDURE UPDATE_FILEPATH () IS\n" + "BEGIN\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :path\n" + " WHERE SPACE = :space;\n" + "END;\n", FALSE, trx); + + trx_commit_for_mysql(trx); + trx->dict_operation_lock_mode = 0; + trx_free_for_background(trx); + + if (err == DB_SUCCESS) { + /* We just updated SYS_DATAFILES due to the contents in + a link file. Make a note that we did this. */ + ib_logf(IB_LOG_LEVEL_INFO, + "The InnoDB data dictionary table SYS_DATAFILES " + "for tablespace ID %lu was updated to use file %s.", + (ulong) space_id, filepath); + } else { + ib_logf(IB_LOG_LEVEL_WARN, + "Problem updating InnoDB data dictionary table " + "SYS_DATAFILES for tablespace ID %lu to file %s.", + (ulong) space_id, filepath); + } + + return(err); +} + +/********************************************************************//** +Insert records into SYS_TABLESPACES and SYS_DATAFILES. +@return DB_SUCCESS if OK, dberr_t if the insert failed */ +UNIV_INTERN +dberr_t +dict_insert_tablespace_and_filepath( +/*================================*/ + ulint space, /*!< in: space id */ + const char* name, /*!< in: tablespace name */ + const char* filepath, /*!< in: filepath */ + ulint fsp_flags) /*!< in: tablespace flags */ +{ + dberr_t err = DB_SUCCESS; + trx_t* trx; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(filepath); + + trx = trx_allocate_for_background(); + trx->op_info = "insert tablespace and filepath"; + trx->dict_operation_lock_mode = RW_X_LATCH; + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + /* A record for this space ID was not found in + SYS_DATAFILES. Assume the record is also missing in + SYS_TABLESPACES. Insert records into them both. */ + err = dict_create_add_tablespace_to_dictionary( + space, name, fsp_flags, filepath, trx, false); + + trx_commit_for_mysql(trx); + trx->dict_operation_lock_mode = 0; + trx_free_for_background(trx); + + return(err); +} + +/********************************************************************//** +This function looks at each table defined in SYS_TABLES. It checks the +tablespace for any table with a space_id > 0. It looks up the tablespace +in SYS_DATAFILES to ensure the correct path.
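+(A remote tablespace here is one whose table was created with a +DATA DIRECTORY clause: its .ibd file lives outside the datadir, and +its real location is recorded in SYS_DATAFILES and in an .isl link +file.)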
+ +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). + +In a normal startup, we create the tablespace objects for every table in +InnoDB's data dictionary, if the corresponding .ibd file exists. +We also scan the biggest space id, and store it to fil_system. */ +UNIV_INTERN +void +dict_check_tablespaces_and_store_max_id( +/*====================================*/ + dict_check_t dict_check) /*!< in: how to check */ +{ + dict_table_t* sys_tables; + dict_index_t* sys_index; + btr_pcur_t pcur; + const rec_t* rec; + ulint max_space_id; + mtr_t mtr; + + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&(dict_sys->mutex)); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_ad(!dict_table_is_comp(sys_tables)); + + max_space_id = mtr_read_ulint(dict_hdr_get(&mtr) + + DICT_HDR_MAX_SPACE_ID, + MLOG_4BYTES, &mtr); + fil_set_max_space_id_if_bigger(max_space_id); + + btr_pcur_open_at_index_side(true, sys_index, BTR_SEARCH_LEAF, &pcur, + true, 0, &mtr); +loop: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* end of index */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + /* We must make the tablespace cache aware of the biggest + known space id */ + + /* printf("Biggest space id in data dictionary %lu\n", + max_space_id); */ + fil_set_max_space_id_if_bigger(max_space_id); + + mutex_exit(&(dict_sys->mutex)); + rw_lock_x_unlock(&dict_operation_lock); + + return; + } + + if (!rec_get_deleted_flag(rec, 0)) { + + /* We found one */ + const byte* field; + ulint len; + ulint space_id; + ulint flags; + char* name; + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + + name = mem_strdupl((char*) field, len); + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), name, FALSE); + + flags = dict_sys_tables_get_flags(rec); + if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) { + /* Read again the 4 bytes from rec. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__TYPE, &len); + ut_ad(len == 4); /* this was checked earlier */ + flags = mach_read_from_4(field); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Table '%s' in InnoDB data dictionary" + " has unknown type %lx", table_name, flags); + mem_free(name); + goto loop; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + ut_a(len == 4); + + space_id = mach_read_from_4(field); + + btr_pcur_store_position(&pcur, &mtr); + + mtr_commit(&mtr); + + /* For tables created with old versions of InnoDB, + SYS_TABLES.MIX_LEN may contain garbage. Such tables + would always be in ROW_FORMAT=REDUNDANT. Pretend that + all such tables are non-temporary. That is, do not + suppress error printouts about temporary or discarded + tablespaces not being found. */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + + bool is_temp = false; + bool discarded = false; + ib_uint32_t flags2 = static_cast<ib_uint32_t>( + mach_read_from_4(field)); + + /* Check that the tablespace (the .ibd file) really + exists; print a warning to the .err log if not. + Do not print warnings for temporary tables or for + tablespaces that have been discarded. 
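For example, a tablespace removed with ALTER TABLE ... DISCARD + TABLESPACE has DICT_TF2_DISCARDED set in MIX_LEN, so its missing + .ibd file is expected rather than an error.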
*/ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + + /* MIX_LEN is valid only for ROW_FORMAT > REDUNDANT. */ + if (mach_read_from_4(field) & DICT_N_COLS_COMPACT) { + + is_temp = !!(flags2 & DICT_TF2_TEMPORARY); + discarded = !!(flags2 & DICT_TF2_DISCARDED); + } + + if (space_id == 0) { + /* The system tablespace always exists. */ + ut_ad(!discarded); + goto next_tablespace; + } + + switch (dict_check) { + case DICT_CHECK_ALL_LOADED: + /* All tablespaces should have been found in + fil_load_single_table_tablespaces(). */ + if (fil_space_for_table_exists_in_mem( + space_id, name, TRUE, !(is_temp || discarded), + false, NULL, 0) + && !(is_temp || discarded)) { + /* If the user changes the path of an .ibd + file in its *.isl file before crash recovery, + the table is loaded from the updated path, + but SYS_DATAFILES still points to the old + path, making the two inconsistent. Therefore, + after crash recovery, update SYS_DATAFILES + with the updated path. */ + ut_ad(space_id); + ut_ad(recv_needed_recovery); + char* dict_path = dict_get_first_path(space_id, + name); + char* remote_path = fil_read_link_file(name); + if (dict_path && remote_path) { + if (strcmp(dict_path, remote_path)) { + dict_update_filepath(space_id, + remote_path); + } + } + if (dict_path) { + mem_free(dict_path); + } + if (remote_path) { + mem_free(remote_path); + } + } + break; + + case DICT_CHECK_SOME_LOADED: + /* Some tablespaces may have been opened in + trx_resurrect_table_locks(). */ + if (fil_space_for_table_exists_in_mem( + space_id, name, FALSE, FALSE, + false, NULL, 0)) { + break; + } + /* fall through */ + case DICT_CHECK_NONE_LOADED: + if (discarded) { + ib_logf(IB_LOG_LEVEL_INFO, + "DISCARD flag set for table '%s'," + " ignored.", + table_name); + break; + } + + /* It is a normal database startup: create the + space object and check that the .ibd file exists. + If the table uses a remote tablespace, look for the + space_id in SYS_DATAFILES to find the filepath */ + + /* Use the remote filepath if known. */ + char* filepath = NULL; + if (DICT_TF_HAS_DATA_DIR(flags)) { + filepath = dict_get_first_path( + space_id, name); + } + + /* We set the 2nd param (fix_dict = true) + here because we already have an x-lock on + dict_operation_lock and dict_sys->mutex. Besides, + this is at startup and we are now single threaded. + If the filepath is not known, it will need to + be discovered. */ + dberr_t err = fil_open_single_table_tablespace( + false, srv_read_only_mode ? false : true, + space_id, dict_tf_to_fsp_flags(flags), + name, filepath); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespace open failed for '%s', " + "ignored.", table_name); + } + + if (filepath) { + mem_free(filepath); + } + + break; + } + + if (space_id > max_space_id) { + max_space_id = space_id; + } + +next_tablespace: + mem_free(name); + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + } + + goto loop; +} + +/********************************************************************//** +Loads a table column definition from a SYS_COLUMNS record to +dict_table_t.
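+This is used in two ways: when loading a full table definition, the +caller passes table != NULL and column == NULL and the column is +added to the table; dict_process_sys_columns_rec() above instead +passes table == NULL to fill a stand-alone dict_col_t.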
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_column_low( +/*=================*/ + dict_table_t* table, /*!< in/out: table, could be NULL + if we just populate a dict_column_t + struct with information from + a SYS_COLUMNS record */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + dict_col_t* column, /*!< out: dict_column_t to fill, + or NULL if table != NULL */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name, /*!< out: column name */ + const rec_t* rec) /*!< in: SYS_COLUMNS record */ +{ + char* name; + const byte* field; + ulint len; + ulint mtype; + ulint prtype; + ulint col_len; + ulint pos; + + ut_ad(table || column); + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_COLUMNS"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) { + return("wrong number of columns in SYS_COLUMNS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_COLUMNS"); + } + + if (table_id) { + *table_id = mach_read_from_8(field); + } else if (table->id != mach_read_from_8(field)) { + return("SYS_COLUMNS.TABLE_ID mismatch"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__POS, &len); + if (len != 4) { + + goto err_len; + } + + pos = mach_read_from_4(field); + + if (table && table->n_def != pos) { + return("SYS_COLUMNS.POS mismatch"); + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + name = mem_heap_strdupl(heap, (const char*) field, len); + + if (col_name) { + *col_name = name; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len); + if (len != 4) { + goto err_len; + } + + mtype = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len); + if (len != 4) { + goto err_len; + } + prtype = mach_read_from_4(field); + + if (dtype_get_charset_coll(prtype) == 0 + && dtype_is_string_type(mtype)) { + /* The table was created with < 4.1.2. */ + + if (dtype_is_binary_string_type(mtype, prtype)) { + /* Use the binary collation for + string columns of binary type. */ + + prtype = dtype_form_prtype( + prtype, + DATA_MYSQL_BINARY_CHARSET_COLL); + } else { + /* Use the default charset for + other than binary columns. */ + + prtype = dtype_form_prtype( + prtype, + data_mysql_default_charset_coll); + } + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__LEN, &len); + if (len != 4) { + goto err_len; + } + col_len = mach_read_from_4(field); + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PREC, &len); + if (len != 4) { + goto err_len; + } + + if (!column) { + dict_mem_table_add_col(table, heap, name, mtype, + prtype, col_len); + } else { + dict_mem_fill_column_struct(column, pos, mtype, + prtype, col_len); + } + + return(NULL); +} + +/********************************************************************//** +Loads definitions for table columns. 
*/ +static +void +dict_load_columns( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in/out: memory heap + for temporary storage */ +{ + dict_table_t* sys_columns; + dict_index_t* sys_index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + const rec_t* rec; + byte* buf; + ulint i; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + mtr_start(&mtr); + + sys_columns = dict_table_get_low("SYS_COLUMNS"); + sys_index = UT_LIST_GET_FIRST(sys_columns->indexes); + ut_ad(!dict_table_is_comp(sys_columns)); + + ut_ad(name_of_col_is(sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__NAME, "NAME")); + ut_ad(name_of_col_is(sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__PREC, "PREC")); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) { + const char* err_msg; + const char* name = NULL; + + rec = btr_pcur_get_rec(&pcur); + + ut_a(btr_pcur_is_on_user_rec(&pcur)); + + err_msg = dict_load_column_low(table, heap, NULL, NULL, + &name, rec); + + if (err_msg) { + fprintf(stderr, "InnoDB: %s\n", err_msg); + ut_error; + } + + /* Note: Currently we have one DOC_ID column that is + shared by all FTS indexes on a table. */ + if (innobase_strcasecmp(name, + FTS_DOC_ID_COL_NAME) == 0) { + dict_col_t* col; + /* As part of normal loading of tables the + table->flag is not set for tables with FTS + till after the FTS indexes are loaded. So we + create the fts_t instance here if there isn't + one already created. + + This case does not arise for table create as + the flag is set before the table is created. */ + if (table->fts == NULL) { + table->fts = fts_create(table); + fts_optimize_add_table(table); + } + + ut_a(table->fts->doc_col == ULINT_UNDEFINED); + + col = dict_table_get_nth_col(table, i); + + ut_ad(col->len == sizeof(doc_id_t)); + + if (col->prtype & DATA_FTS_DOC_ID) { + DICT_TF2_FLAG_SET( + table, DICT_TF2_FTS_HAS_DOC_ID); + DICT_TF2_FLAG_UNSET( + table, DICT_TF2_FTS_ADD_DOC_ID); + } + + table->fts->doc_col = i; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); +} + +/** Error message for a delete-marked record in dict_load_field_low() */ +static const char* dict_load_field_del = "delete-marked record in SYS_FIELDS"; + +/********************************************************************//** +Loads an index field definition from a SYS_FIELDS record to +dict_index_t. 
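+As a worked example of the POS encoding described inside the +function (illustrative values): if the index contains at least one +column prefix, POS == 0x0002000A means field number 2 with a 10-byte +prefix length; if it contains none, POS stores the field number +directly, so field number 2 is stored simply as 2.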
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_field_low( +/*================*/ + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + const rec_t* rec) /*!< in: SYS_FIELDS record */ +{ + const byte* field; + ulint len; + ulint pos_and_prefix_len; + ulint prefix_len; + ibool first_field; + ulint position; + + /* Either index or sys_field is supplied, not both */ + ut_a((!index) || (!sys_field)); + + if (rec_get_deleted_flag(rec, 0)) { + return(dict_load_field_del); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) { + return("wrong number of columns in SYS_FIELDS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_FIELDS"); + } + + if (!index) { + ut_a(last_index_id); + memcpy(index_id, (const char*) field, 8); + first_field = memcmp(index_id, last_index_id, 8); + } else { + first_field = (index->n_def == 0); + if (memcmp(field, index_id, 8)) { + return("SYS_FIELDS.INDEX_ID mismatch"); + } + } + + /* The next field stores the field position in the index and a + possible column prefix length if the index field does not + contain the whole column. The storage format is like this: if + there is at least one prefix field in the index, then the HIGH + 2 bytes contain the field number (index->n_def) and the low 2 + bytes the prefix length for the field. Otherwise the field + number (index->n_def) is contained in the 2 LOW bytes. */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__POS, &len); + if (len != 4) { + goto err_len; + } + + pos_and_prefix_len = mach_read_from_4(field); + + if (index && UNIV_UNLIKELY + ((pos_and_prefix_len & 0xFFFFUL) != index->n_def + && (pos_and_prefix_len >> 16 & 0xFFFF) != index->n_def)) { + return("SYS_FIELDS.POS mismatch"); + } + + if (first_field || pos_and_prefix_len > 0xFFFFUL) { + prefix_len = pos_and_prefix_len & 0xFFFFUL; + position = (pos_and_prefix_len & 0xFFFF0000UL) >> 16; + } else { + prefix_len = 0; + position = pos_and_prefix_len & 0xFFFFUL; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + if (index) { + dict_mem_index_add_field( + index, mem_heap_strdupl(heap, (const char*) field, len), + prefix_len); + } else { + ut_a(sys_field); + ut_a(pos); + + sys_field->name = mem_heap_strdupl( + heap, (const char*) field, len); + sys_field->prefix_len = prefix_len; + *pos = position; + } + + return(NULL); +} + +/********************************************************************//** +Loads definitions for index fields. 
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption */ +static +ulint +dict_load_fields( +/*=============*/ + dict_index_t* index, /*!< in/out: index whose fields to load */ + mem_heap_t* heap) /*!< in: memory heap for temporary storage */ +{ + dict_table_t* sys_fields; + dict_index_t* sys_index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + const rec_t* rec; + byte* buf; + ulint i; + mtr_t mtr; + dberr_t error; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + mtr_start(&mtr); + + sys_fields = dict_table_get_low("SYS_FIELDS"); + sys_index = UT_LIST_GET_FIRST(sys_fields->indexes); + ut_ad(!dict_table_is_comp(sys_fields)); + ut_ad(name_of_col_is(sys_fields, sys_index, + DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME")); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + + dfield_set_data(dfield, buf, 8); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (i = 0; i < index->n_fields; i++) { + const char* err_msg; + + rec = btr_pcur_get_rec(&pcur); + + ut_a(btr_pcur_is_on_user_rec(&pcur)); + + err_msg = dict_load_field_low(buf, index, NULL, NULL, NULL, + heap, rec); + + if (err_msg == dict_load_field_del) { + /* There could be delete marked records in + SYS_FIELDS because SYS_FIELDS.INDEX_ID can be + updated by ALTER TABLE ADD INDEX. */ + + goto next_rec; + } else if (err_msg) { + fprintf(stderr, "InnoDB: %s\n", err_msg); + error = DB_CORRUPTION; + goto func_exit; + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + error = DB_SUCCESS; +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + return(error); +} + +/** Error message for a delete-marked record in dict_load_index_low() */ +static const char* dict_load_index_del = "delete-marked record in SYS_INDEXES"; +/** Error message for table->id mismatch in dict_load_index_low() */ +static const char* dict_load_index_id_err = "SYS_INDEXES.TABLE_ID mismatch"; + +/********************************************************************//** +Loads an index definition from a SYS_INDEXES record to dict_index_t. +If allocate=TRUE, we will create a dict_index_t structure and fill it +accordingly. If allocate=FALSE, the dict_index_t will be supplied by +the caller and filled with information read from the record. @return +error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_index_low( +/*================*/ + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if allocate=TRUE + and "out" when allocate=FALSE */ + const char* table_name, /*!< in: table name */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + ibool allocate, /*!< in: TRUE=allocate *index, + FALSE=fill in a pre-allocated + *index */ + dict_index_t** index) /*!< out,own: index, or NULL */ +{ + const byte* field; + ulint len; + ulint name_len; + char* name_buf; + index_id_t id; + ulint n_fields; + ulint type; + ulint space; + + if (allocate) { + /* If allocate=TRUE, no dict_index_t will + be supplied.
Initialize "*index" to NULL */ + *index = NULL; + } + + if (rec_get_deleted_flag(rec, 0)) { + return(dict_load_index_del); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_INDEXES) { + return("wrong number of columns in SYS_INDEXES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_INDEXES"); + } + + if (!allocate) { + /* We are reading a SYS_INDEXES record. Copy the table_id */ + memcpy(table_id, (const char*) field, 8); + } else if (memcmp(field, table_id, 8)) { + /* Caller supplied table_id, verify it is the same + id as on the index record */ + return(dict_load_index_id_err); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__ID, &len); + if (len != 8) { + goto err_len; + } + + id = mach_read_from_8(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__NAME, &name_len); + if (name_len == UNIV_SQL_NULL) { + goto err_len; + } + + name_buf = mem_heap_strdupl(heap, (const char*) field, + name_len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len); + if (len != 4) { + goto err_len; + } + n_fields = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto err_len; + } + type = mach_read_from_4(field); + if (type & (~0 << DICT_IT_BITS)) { + return("unknown SYS_INDEXES.TYPE bits"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__SPACE, &len); + if (len != 4) { + goto err_len; + } + space = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) { + goto err_len; + } + + if (allocate) { + *index = dict_mem_index_create(table_name, name_buf, + space, type, n_fields); + } else { + ut_a(*index); + + dict_mem_fill_index_struct(*index, NULL, NULL, name_buf, + space, type, n_fields); + } + + (*index)->id = id; + (*index)->page = mach_read_from_4(field); + btr_search_index_init(*index); + ut_ad((*index)->page); + + return(NULL); +} + +/********************************************************************//** +Loads definitions for table indexes. Adds them to the data dictionary +cache. 
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary +table or DB_UNSUPPORTED if table has unknown index type */ +static __attribute__((nonnull)) +dberr_t +dict_load_indexes( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap, /*!< in: memory heap for temporary storage */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored when + loading the index definition */ +{ + dict_table_t* sys_indexes; + dict_index_t* sys_index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + const rec_t* rec; + byte* buf; + mtr_t mtr; + dberr_t error = DB_SUCCESS; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + mtr_start(&mtr); + + sys_indexes = dict_table_get_low("SYS_INDEXES"); + sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes); + ut_ad(!dict_table_is_comp(sys_indexes)); + ut_ad(name_of_col_is(sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__NAME, "NAME")); + ut_ad(name_of_col_is(sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO")); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (;;) { + dict_index_t* index = NULL; + const char* err_msg; + + if (!btr_pcur_is_on_user_rec(&pcur)) { + + /* We should allow the table to open even + without index when DICT_ERR_IGNORE_CORRUPT is set. + DICT_ERR_IGNORE_CORRUPT is currently only set + for drop table */ + if (dict_table_get_first_index(table) == NULL + && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) { + ib_logf(IB_LOG_LEVEL_WARN, + "Cannot load table %s " + "because it has no indexes in " + "InnoDB internal data dictionary.", + table->name); + error = DB_CORRUPTION; + goto func_exit; + } + + break; + } + + rec = btr_pcur_get_rec(&pcur); + + if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES) { + const byte* field; + ulint len; + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__NAME, &len); + + if (len != UNIV_SQL_NULL + && char(*field) == char(TEMP_INDEX_PREFIX)) { + /* Skip indexes whose name starts with + TEMP_INDEX_PREFIX, because they will + be dropped during crash recovery. */ + goto next_rec; + } + } + + err_msg = dict_load_index_low(buf, table->name, heap, rec, + TRUE, &index); + ut_ad((index == NULL && err_msg != NULL) + || (index != NULL && err_msg == NULL)); + + if (err_msg == dict_load_index_id_err) { + /* TABLE_ID mismatch means that we have + run out of index definitions for the table. */ + + if (dict_table_get_first_index(table) == NULL + && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) { + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to load the " + "clustered index for table %s " + "because of the following error: %s. " + "Refusing to load the rest of the " + "indexes (if any) and the whole table " + "altogether.", table->name, err_msg); + error = DB_CORRUPTION; + goto func_exit; + } + + break; + } else if (err_msg == dict_load_index_del) { + /* Skip delete-marked records. 
*/ + goto next_rec; + } else if (err_msg) { + fprintf(stderr, "InnoDB: %s\n", err_msg); + if (ignore_err & DICT_ERR_IGNORE_CORRUPT) { + goto next_rec; + } + error = DB_CORRUPTION; + goto func_exit; + } + + ut_ad(index); + + /* Check whether the index is corrupted */ + if (dict_index_is_corrupted(index)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: ", stderr); + dict_index_name_print(stderr, NULL, index); + fputs(" is corrupted\n", stderr); + + if (!srv_load_corrupted + && !(ignore_err & DICT_ERR_IGNORE_CORRUPT) + && dict_index_is_clust(index)) { + dict_mem_index_free(index); + + error = DB_INDEX_CORRUPT; + goto func_exit; + } else { + /* We will load the index if + 1) srv_load_corrupted is TRUE + 2) ignore_err is set with + DICT_ERR_IGNORE_CORRUPT + 3) if the index corrupted is a secondary + index */ + ut_print_timestamp(stderr); + fputs(" InnoDB: load corrupted index ", stderr); + dict_index_name_print(stderr, NULL, index); + putc('\n', stderr); + } + } + + if (index->type & DICT_FTS + && !DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) { + /* This should have been created by now. */ + ut_a(table->fts != NULL); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS); + } + + /* We check for unsupported types first, so that the + subsequent checks are relevant for the supported types. */ + if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE + | DICT_CORRUPT | DICT_FTS)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown type %lu of index %s of table %s", + (ulong) index->type, index->name, table->name); + + error = DB_UNSUPPORTED; + dict_mem_index_free(index); + goto func_exit; + } else if (index->page == FIL_NULL + && !table->ibd_file_missing + && (!(index->type & DICT_FTS))) { + + fprintf(stderr, + "InnoDB: Error: trying to load index %s" + " for table %s\n" + "InnoDB: but the index tree has been freed!\n", + index->name, table->name); + + if (ignore_err & DICT_ERR_IGNORE_INDEX_ROOT) { + /* If caller can tolerate this error, + we will continue to load the index and + let caller deal with this error. However + mark the index and table corrupted. We + only need to mark such in the index + dictionary cache for such metadata corruption, + since we would always be able to set it + when loading the dictionary cache */ + dict_set_corrupted_index_cache_only( + index, table); + + fprintf(stderr, + "InnoDB: Index is corrupt but forcing" + " load into data dictionary\n"); + } else { +corrupted: + dict_mem_index_free(index); + error = DB_CORRUPTION; + goto func_exit; + } + } else if (!dict_index_is_clust(index) + && NULL == dict_table_get_first_index(table)) { + + fputs("InnoDB: Error: trying to load index ", + stderr); + ut_print_name(stderr, NULL, FALSE, index->name); + fputs(" for table ", stderr); + ut_print_name(stderr, NULL, TRUE, table->name); + fputs("\nInnoDB: but the first index" + " is not clustered!\n", stderr); + + goto corrupted; + } else if (dict_is_sys_table(table->id) + && (dict_index_is_clust(index) + || ((table == dict_sys->sys_tables) + && !strcmp("ID_IND", index->name)))) { + + /* The index was created in memory already at booting + of the database server */ + dict_mem_index_free(index); + } else { + dict_load_fields(index, heap); + + error = dict_index_add_to_cache( + table, index, index->page, FALSE); + + /* The data dictionary tables should never contain + invalid index definitions. If we ignored this error + and simply did not load this index definition, the + .frm file would disagree with the index definitions + inside InnoDB. 
*/ + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + + goto func_exit; + } + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + /* If the table contains FTS indexes, populate table->fts->indexes */ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) { + /* table->fts->indexes should have been created. */ + ut_a(table->fts->indexes != NULL); + dict_table_get_all_fts_indexes(table, table->fts->indexes); + } + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(error); +} + +/********************************************************************//** +Loads a table definition from a SYS_TABLES record to dict_table_t. +Does not load any columns or indexes. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_table_low( +/*================*/ + const char* name, /*!< in: table name */ + const rec_t* rec, /*!< in: SYS_TABLES record */ + dict_table_t** table) /*!< out,own: table, or NULL */ +{ + const byte* field; + ulint len; + ulint space; + ulint n_cols; + ulint flags = 0; + ulint flags2; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_TABLES"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) { + return("wrong number of columns in SYS_TABLES record"); + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_TABLES"); + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + if (len != 4) { + goto err_len; + } + + n_cols = mach_read_from_4(field); + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len); + if (len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__MIX_ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + if (len != 4) { + goto err_len; + } + + /* MIX_LEN may hold additional flags in post-antelope file formats. */ + flags2 = mach_read_from_4(field); + + /* DICT_TF2_FTS will be set when indexes is being loaded */ + flags2 &= ~DICT_TF2_FTS; + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len); + if (len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + if (len != 4) { + goto err_len; + } + + space = mach_read_from_4(field); + + /* Check if the tablespace exists and has the right name */ + flags = dict_sys_tables_get_flags(rec); + + if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) { + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__TYPE, &len); + ut_ad(len == 4); /* this was checked earlier */ + flags = mach_read_from_4(field); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary" + " has unknown type %lx.\n", + (ulong) flags); + return("incorrect flags in SYS_TABLES"); + } + + /* The high-order bit of N_COLS is the "compact format" flag. 
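As a worked example with a hypothetical raw value: if SYS_TABLES.N_COLS holds 0x80000005, then (n_cols & DICT_N_COLS_COMPACT) is nonzero, so the row format is at least COMPACT, and n_cols & ~DICT_N_COLS_COMPACT == 5 is the real column count passed to dict_mem_table_create() below.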
+ For tables in that format, MIX_LEN may hold additional flags. */ + if (n_cols & DICT_N_COLS_COMPACT) { + ut_ad(flags & DICT_TF_COMPACT); + + if (flags2 & ~DICT_TF2_BIT_MASK) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary" + " has unknown flags %lx.\n", + (ulong) flags2); + + /* Clean it up and keep going */ + flags2 &= DICT_TF2_BIT_MASK; + } + } else { + /* Do not trust the MIX_LEN field when the + row format is Redundant. */ + flags2 = 0; + } + + /* See if the tablespace is available. */ + *table = dict_mem_table_create( + name, space, n_cols & ~DICT_N_COLS_COMPACT, flags, flags2, + false); + + field = rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__ID, &len); + ut_ad(len == 8); /* this was checked earlier */ + + (*table)->id = mach_read_from_8(field); + + (*table)->ibd_file_missing = FALSE; + + return(NULL); +} + +/********************************************************************//** +Using the table->heap, copy the null-terminated filepath into +table->data_dir_path and replace the 'databasename/tablename.ibd' +portion with 'tablename'. +This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path. +Make this data directory path only if it has not yet been saved. */ +UNIV_INTERN +void +dict_save_data_dir_path( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + char* filepath) /*!< in: filepath of tablespace */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(DICT_TF_HAS_DATA_DIR(table->flags)); + + ut_a(!table->data_dir_path); + ut_a(filepath); + + /* Be sure this filepath is not the default filepath. */ + char* default_filepath = fil_make_ibd_name(table->name, false); + if (strcmp(filepath, default_filepath)) { + ulint pathlen = strlen(filepath); + ut_a(pathlen < OS_FILE_MAX_PATH); + ut_a(0 == strcmp(filepath + pathlen - 4, ".ibd")); + + table->data_dir_path = mem_heap_strdup(table->heap, filepath); + os_file_make_data_dir_path(table->data_dir_path); + } else { + /* This does not change SYS_DATAFILES or SYS_TABLES + or FSP_FLAGS on the header page of the tablespace, + but it makes dict_table_t consistent */ + table->flags &= ~DICT_TF_MASK_DATA_DIR; + } + mem_free(default_filepath); +} + +/*****************************************************************//** +Make sure the data_file_name is saved in dict_table_t if needed. Try to +read it from the file dictionary first, then from SYS_DATAFILES. */ +UNIV_INTERN +void +dict_get_and_save_data_dir_path( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + bool dict_mutex_own) /*!< in: true if dict_sys->mutex + is owned already */ +{ + if (DICT_TF_HAS_DATA_DIR(table->flags) + && (!table->data_dir_path)) { + char* path = fil_space_get_first_path(table->space); + + if (!dict_mutex_own) { + dict_mutex_enter_for_mysql(); + } + if (!path) { + path = dict_get_first_path( + table->space, table->name); + } + + if (path) { + dict_save_data_dir_path(table, path); + mem_free(path); + } + + if (!dict_mutex_own) { + dict_mutex_exit_for_mysql(); + } + } +} + +/********************************************************************//** +Loads a table definition and also all its index definitions, and also +the cluster definition if the table is a member in a cluster. Also loads +all foreign key constraints where the foreign key is in the table or where +a foreign key references columns in this table. Adds all these to the data +dictionary cache. 
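A minimal caller-side sketch (hypothetical table name; dict_sys->mutex must already be held, as the assertion in the function body requires):

	dict_table_t*	t = dict_load_table(
		"test/t1", TRUE, DICT_ERR_IGNORE_NONE);

A NULL result means no matching committed SYS_TABLES record was found; a non-NULL table with ibd_file_missing set means the definition was loaded but no usable .ibd file could be opened.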
+@return table, NULL if does not exist; if the table is stored in an +.ibd file, but the file does not exist, then we set the +ibd_file_missing flag TRUE in the table object we return */ +UNIV_INTERN +dict_table_t* +dict_load_table( +/*============*/ + const char* name, /*!< in: table name in the + databasename/tablename format */ + ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored when loading + table and its indexes' definition */ +{ + dberr_t err; + dict_table_t* table; + dict_table_t* sys_tables; + btr_pcur_t pcur; + dict_index_t* sys_index; + dtuple_t* tuple; + mem_heap_t* heap; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + char* filepath = NULL; + const char* err_msg; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + heap = mem_heap_create(32000); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_ad(!dict_table_is_comp(sys_tables)); + ut_ad(name_of_col_is(sys_tables, sys_index, + DICT_FLD__SYS_TABLES__ID, "ID")); + ut_ad(name_of_col_is(sys_tables, sys_index, + DICT_FLD__SYS_TABLES__N_COLS, "N_COLS")); + ut_ad(name_of_col_is(sys_tables, sys_index, + DICT_FLD__SYS_TABLES__TYPE, "TYPE")); + ut_ad(name_of_col_is(sys_tables, sys_index, + DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN")); + ut_ad(name_of_col_is(sys_tables, sys_index, + DICT_FLD__SYS_TABLES__SPACE, "SPACE")); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, name, ut_strlen(name)); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur) + || rec_get_deleted_flag(rec, 0)) { + /* Not found */ +err_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + + /* Check if the table name in record is the searched one */ + if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) { + + goto err_exit; + } + + err_msg = dict_load_table_low(name, rec, &table); + + if (err_msg) { + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: %s\n", err_msg); + goto err_exit; + } + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name(table_name, sizeof(table_name), name, FALSE); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (table->space == 0) { + /* The system tablespace is always available. */ + } else if (table->flags2 & DICT_TF2_DISCARDED) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Table '%s' tablespace is set as discarded.", + table_name); + + table->ibd_file_missing = TRUE; + + } else if (!fil_space_for_table_exists_in_mem( + table->space, name, FALSE, FALSE, true, heap, + table->id)) { + + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)) { + /* Do not bother to retry opening temporary tables. */ + table->ibd_file_missing = TRUE; + + } else { + if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Failed to find tablespace for " + "table '%s' in the cache. " + "Attempting to load the tablespace " + "with space id %lu.", + table_name, (ulong) table->space); + } + + /* Use the remote filepath if needed. 
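For a table created with a DATA DIRECTORY clause only the directory is remembered, so the full path must be rebuilt here. With hypothetical values table->data_dir_path == "/ssd1/data" and table->name == "db/t1", the os_file_make_remote_pathname() call below is expected to produce "/ssd1/data/db/t1.ibd".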
*/ + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + /* This needs to be added to the table + from SYS_DATAFILES */ + dict_get_and_save_data_dir_path(table, true); + + if (table->data_dir_path) { + filepath = os_file_make_remote_pathname( + table->data_dir_path, + table->name, "ibd"); + } + } + + /* Try to open the tablespace. We set the + 2nd param (fix_dict = false) here because we + do not have an x-lock on dict_operation_lock */ + err = fil_open_single_table_tablespace( + true, false, table->space, + dict_tf_to_fsp_flags(table->flags), + name, filepath); + + if (err != DB_SUCCESS) { + /* We failed to find a sensible + tablespace file */ + + table->ibd_file_missing = TRUE; + } + if (filepath) { + mem_free(filepath); + } + } + } + + dict_load_columns(table, heap); + + if (cached) { + dict_table_add_to_cache(table, TRUE, heap); + } else { + dict_table_add_system_columns(table, heap); + } + + mem_heap_empty(heap); + + /* If there is no tablespace for the table then we only need to + load the index definitions. So that we can IMPORT the tablespace + later. When recovering table locks for resurrected incomplete + transactions, the tablespace should exist, because DDL operations + were not allowed while the table is being locked by a transaction. */ + dict_err_ignore_t index_load_err = + !(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && table->ibd_file_missing + ? DICT_ERR_IGNORE_ALL + : ignore_err; + err = dict_load_indexes(table, heap, index_load_err); + + if (err == DB_INDEX_CORRUPT) { + /* Refuse to load the table if the table has a corrupted + cluster index */ + if (!srv_load_corrupted) { + fprintf(stderr, "InnoDB: Error: Load table "); + ut_print_name(stderr, NULL, TRUE, table->name); + fprintf(stderr, " failed, the table has corrupted" + " clustered indexes. Turn on" + " 'innodb_force_load_corrupted'" + " to drop it\n"); + + dict_table_remove_from_cache(table); + table = NULL; + goto func_exit; + } else { + dict_index_t* clust_index; + clust_index = dict_table_get_first_index(table); + + if (dict_index_is_corrupted(clust_index)) { + table->corrupted = TRUE; + } + } + } + + /* Initialize table foreign_child value. Its value could be + changed when dict_load_foreigns() is called below */ + table->fk_max_recusive_level = 0; + + /* If the force recovery flag is set, we open the table irrespective + of the error condition, since the user may want to dump data from the + clustered index. However we load the foreign key information only if + all indexes were loaded. */ + if (!cached || table->ibd_file_missing) { + /* Don't attempt to load the indexes from disk. */ + } else if (err == DB_SUCCESS) { + err = dict_load_foreigns(table->name, NULL, true, true, + ignore_err); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_WARN, + "Load table '%s' failed, the table has missing " + "foreign key indexes. Turn off " + "'foreign_key_checks' and try again.", + table->name); + + dict_table_remove_from_cache(table); + table = NULL; + } else { + table->fk_max_recusive_level = 0; + } + } else { + dict_index_t* index; + + /* Make sure that at least the clustered index was loaded. + Otherwise refuse to load the table */ + index = dict_table_get_first_index(table); + + if (!srv_force_recovery + || !index + || !dict_index_is_clust(index)) { + + dict_table_remove_from_cache(table); + table = NULL; + + } else if (dict_index_is_corrupted(index) + && !table->ibd_file_missing) { + + /* It is possible we force to load a corrupted + clustered index if srv_load_corrupted is set. 
+ Mark the table as corrupted in this case */ + table->corrupted = TRUE; + } + } + +func_exit: + mem_heap_free(heap); + + ut_ad(!table + || ignore_err != DICT_ERR_IGNORE_NONE + || table->ibd_file_missing + || !table->corrupted); + + if (table && table->fts) { + if (!(dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) { + /* the table->fts could be created in dict_load_column + when a user defined FTS_DOC_ID is present, but no + FTS */ + fts_free(table); + } else { + fts_optimize_add_table(table); + } + } + + ut_ad(err != DB_SUCCESS || dict_foreign_set_validate(*table)); + + return(table); +} + +/***********************************************************************//** +Loads a table object based on the table id. +@return table; NULL if table does not exist */ +UNIV_INTERN +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + byte id_buf[8]; + btr_pcur_t pcur; + mem_heap_t* heap; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_table_ids; + dict_table_t* sys_tables; + const rec_t* rec; + const byte* field; + ulint len; + dict_table_t* table; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = NULL; + + /* NOTE that the operation of this function is protected by + the dictionary mutex, and therefore no deadlocks can occur + with other dictionary operations. */ + + mtr_start(&mtr); + /*---------------------------------------------------*/ + /* Get the secondary index based on ID for table SYS_TABLES */ + sys_tables = dict_sys->sys_tables; + sys_table_ids = dict_table_get_next_index( + dict_table_get_first_index(sys_tables)); + ut_ad(!dict_table_is_comp(sys_tables)); + ut_ad(!dict_index_is_clust(sys_table_ids)); + heap = mem_heap_create(256); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + /* Write the table id in byte format to id_buf */ + mach_write_to_8(id_buf, table_id); + + dfield_set_data(dfield, id_buf, 8); + dict_index_copy_types(tuple, sys_table_ids, 1); + + btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + +check_rec: + rec = btr_pcur_get_rec(&pcur); + + if (page_rec_is_user_rec(rec)) { + /*---------------------------------------------------*/ + /* Now we have the record in the secondary index + containing the table ID and NAME */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLE_IDS__ID, &len); + ut_ad(len == 8); + + /* Check if the table id in record is the one searched for */ + if (table_id == mach_read_from_8(field)) { + if (rec_get_deleted_flag(rec, 0)) { + /* Until purge has completed, there + may be delete-marked duplicate records + for the same SYS_TABLES.ID. + Due to Bug #60049, some delete-marked + records may survive the purge forever. */ + if (btr_pcur_move_to_next(&pcur, &mtr)) { + + goto check_rec; + } + } else { + /* Now we get the table name from the record */ + field = rec_get_nth_field_old(rec, + DICT_FLD__SYS_TABLE_IDS__NAME, &len); + /* Load the table definition to memory */ + table = dict_load_table( + mem_heap_strdupl( + heap, (char*) field, len), + TRUE, ignore_err); + } + } + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(table); +} + +/********************************************************************//** +This function is called when the database is booted. 
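It runs with dict_sys->mutex held, once per hard-coded system table; a bootstrap-time sketch (the concrete argument is illustrative):

	dict_load_sys_table(dict_sys->sys_tables);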
Loads system table +index definitions, except for the clustered index, which is added to the +dictionary cache at booting before calling this function. */ +UNIV_INTERN +void +dict_load_sys_table( +/*================*/ + dict_table_t* table) /*!< in: system table */ +{ + mem_heap_t* heap; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + heap = mem_heap_create(1000); + + dict_load_indexes(table, heap, DICT_ERR_IGNORE_NONE); + + mem_heap_free(heap); +} + +/********************************************************************//** +Loads foreign key constraint column names (also for the referenced table). +Members that must be set (and valid) in foreign: +foreign->heap +foreign->n_fields +foreign->id ('\0'-terminated) +Members that will be created and set by this function: +foreign->foreign_col_names[i] +foreign->referenced_col_names[i] +(for i=0..foreign->n_fields-1) */ +static +void +dict_load_foreign_cols( +/*===================*/ + dict_foreign_t* foreign)/*!< in/out: foreign constraint object */ +{ + dict_table_t* sys_foreign_cols; + dict_index_t* sys_index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + ulint i; + mtr_t mtr; + size_t id_len; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + id_len = strlen(foreign->id); + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + foreign->referenced_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + mtr_start(&mtr); + + sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS"); + + sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes); + ut_ad(!dict_table_is_comp(sys_foreign_cols)); + + tuple = dtuple_create(foreign->heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, foreign->id, id_len); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (i = 0; i < foreign->n_fields; i++) { + + rec = btr_pcur_get_rec(&pcur); + + ut_a(btr_pcur_is_on_user_rec(&pcur)); + ut_a(!rec_get_deleted_flag(rec, 0)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + + if (len != id_len || ut_memcmp(foreign->id, field, len) != 0) { + const rec_t* pos; + ulint pos_len; + const rec_t* for_col_name; + ulint for_col_name_len; + const rec_t* ref_col_name; + ulint ref_col_name_len; + + pos = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, + &pos_len); + + for_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, + &for_col_name_len); + + ref_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, + &ref_col_name_len); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to load column names for foreign " + "key '%s' because it was not found in " + "InnoDB internal table SYS_FOREIGN_COLS. 
The " + "closest entry we found is: " + "(ID='%.*s', POS=%lu, FOR_COL_NAME='%.*s', " + "REF_COL_NAME='%.*s')", + foreign->id, + (int) len, field, + mach_read_from_4(pos), + (int) for_col_name_len, for_col_name, + (int) ref_col_name_len, ref_col_name); + + ut_error; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + ut_a(len == 4); + ut_a(i == mach_read_from_4(field)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + foreign->foreign_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + foreign->referenced_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); +} + +/***********************************************************************//** +Loads a foreign key constraint to the dictionary cache. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull(1), warn_unused_result)) +dberr_t +dict_load_foreign( +/*==============*/ + const char* id, + /*!< in: foreign constraint id, must be + '\0'-terminated */ + const char** col_names, + /*!< in: column names, or NULL + to use foreign->foreign_table->col_names */ + bool check_recursive, + /*!< in: whether to record the foreign table + parent count to avoid unlimited recursive + load of chained foreign tables */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ +{ + dict_foreign_t* foreign; + dict_table_t* sys_foreign; + btr_pcur_t pcur; + dict_index_t* sys_index; + dtuple_t* tuple; + mem_heap_t* heap2; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + ulint n_fields_and_type; + mtr_t mtr; + dict_table_t* for_table; + dict_table_t* ref_table; + size_t id_len; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + id_len = strlen(id); + + heap2 = mem_heap_create(1000); + + mtr_start(&mtr); + + sys_foreign = dict_table_get_low("SYS_FOREIGN"); + + sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes); + ut_ad(!dict_table_is_comp(sys_foreign)); + + tuple = dtuple_create(heap2, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, id, id_len); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur) + || rec_get_deleted_flag(rec, 0)) { + /* Not found */ + + fprintf(stderr, + "InnoDB: Error: cannot load foreign constraint " + "%s: could not find the relevant record in " + "SYS_FOREIGN\n", id); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap2); + + return(DB_ERROR); + } + + field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len); + + /* Check if the id in record is the searched one */ + if (len != id_len || ut_memcmp(id, field, len) != 0) { + + fprintf(stderr, + "InnoDB: Error: cannot load foreign constraint " + "%s: found %.*s instead in SYS_FOREIGN\n", + id, (int) len, field); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap2); + + return(DB_ERROR); + } + + /* Read the table names and the number of columns associated + with the constraint */ + + mem_heap_free(heap2); + + foreign = dict_mem_foreign_create(); + + n_fields_and_type = mach_read_from_4( + rec_get_nth_field_old( + rec, 
DICT_FLD__SYS_FOREIGN__N_COLS, &len)); + + ut_a(len == 4); + + /* We store the type in the bits 24..29 of n_fields_and_type. */ + + foreign->type = (unsigned int) (n_fields_and_type >> 24); + foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL); + + foreign->id = mem_heap_strdupl(foreign->heap, id, id_len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + + foreign->foreign_table_name = mem_heap_strdupl( + foreign->heap, (char*) field, len); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + foreign->referenced_table_name = mem_heap_strdupl( + foreign->heap, (char*) field, len); + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + dict_load_foreign_cols(foreign); + + ref_table = dict_table_check_if_in_cache_low( + foreign->referenced_table_name_lookup); + + /* We could possibly wind up in deep recursive calls if + we call dict_table_get_low() again here when there + is a chain of tables concatenated together with + foreign constraints. In such a case, each table is + both a parent and a child of the other tables, and + acts as a "link" in such table chains. + To avoid such a scenario, we would need to check the + number of ancestors the current table has. If that + exceeds DICT_FK_MAX_CHAIN_LEN, we will stop loading + the child table. + Foreign constraints are loaded in a breadth-first fashion, + that is, the index on FOR_NAME is scanned first, and then + the index on REF_NAME. So foreign constraints in which the + current table is a child (foreign table) are loaded first, + and then those constraints where the current table is a + parent (referenced) table. + Thus we could check the parent (ref_table) table's + reference count (fk_max_recusive_level) to know how deep the + recursive call is. If the parent table (ref_table) is already + loaded, and its fk_max_recusive_level is larger than + DICT_FK_MAX_CHAIN_LEN, we will stop the recursive loading + by skipping loading the child table. It will not affect foreign + constraint checks for DMLs since the child table will be loaded + at that time for the constraint check. */ + if (!ref_table + || ref_table->fk_max_recusive_level < DICT_FK_MAX_RECURSIVE_LOAD) { + + /* If the foreign table is not yet in the dictionary cache, we + have to load it so that we are able to make type comparisons + in the next function call. */ + + for_table = dict_table_get_low(foreign->foreign_table_name_lookup); + + if (for_table && ref_table && check_recursive) { + /* This is to record the longest chain of ancestors + this table has; if the parent has more ancestors + than this table has, record it after adding 1 (for + this parent). */ + if (ref_table->fk_max_recusive_level + >= for_table->fk_max_recusive_level) { + for_table->fk_max_recusive_level = + ref_table->fk_max_recusive_level + 1; + } + } + } + + /* Note that there may already be a foreign constraint object in + the dictionary cache for this constraint: then the following + call only sets the pointers in it to point to the appropriate table + and index objects and frees the newly created object foreign. + Adding to the cache should always succeed since we are not creating + a new foreign key constraint but loading one from the data + dictionary. 
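As a worked example of the SYS_FOREIGN.N_COLS packing decoded above (the raw value is hypothetical):

	n_fields_and_type == 0x26000002
	type     == 0x26000002 >> 24  == 0x26  (referential action bits)
	n_fields == 0x26000002 & 0x3FF == 2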
*/ + + return(dict_foreign_add_to_cache(foreign, col_names, check_charsets, + ignore_err)); +} + +/***********************************************************************//** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. Note that we know that the dictionary +cache already contains all constraints where the other relevant table is +already in the dictionary cache. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_load_foreigns( +/*===============*/ + const char* table_name, /*!< in: table name */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + bool check_recursive,/*!< in: Whether to check + recursive load of tables + chained by FK */ + bool check_charsets, /*!< in: whether to check + charset compatibility */ + dict_err_ignore_t ignore_err) /*!< in: error to be ignored */ +{ + ulint tuple_buf[(DTUPLE_EST_ALLOC(1) + sizeof(ulint) - 1) + / sizeof(ulint)]; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sec_index; + dict_table_t* sys_foreign; + const rec_t* rec; + const byte* field; + ulint len; + dberr_t err; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + sys_foreign = dict_table_get_low("SYS_FOREIGN"); + + if (sys_foreign == NULL) { + /* No foreign keys defined yet in this database */ + + fprintf(stderr, + "InnoDB: Error: no foreign key system tables" + " in the database\n"); + + return(DB_ERROR); + } + + ut_ad(!dict_table_is_comp(sys_foreign)); + mtr_start(&mtr); + + /* Get the secondary index based on FOR_NAME from table + SYS_FOREIGN */ + + sec_index = dict_table_get_next_index( + dict_table_get_first_index(sys_foreign)); + ut_ad(!dict_index_is_clust(sec_index)); +start_load: + + tuple = dtuple_create_from_mem(tuple_buf, sizeof(tuple_buf), 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, table_name, ut_strlen(table_name)); + dict_index_copy_types(tuple, sec_index, 1); + + btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); +loop: + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* End of index */ + + goto load_next_index; + } + + /* Now we have the record in the secondary index containing a table + name and a foreign constraint ID */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len); + + /* Check if the table name in the record is the one searched for; the + following call does the comparison in the latin1_swedish_ci + charset-collation, in a case-insensitive way. */ + + if (0 != cmp_data_data(dfield_get_type(dfield)->mtype, + dfield_get_type(dfield)->prtype, + static_cast<const byte*>( + dfield_get_data(dfield)), + dfield_get_len(dfield), + field, len)) { + + goto load_next_index; + } + + /* Since table names in SYS_FOREIGN are stored in a case-insensitive + order, we have to check that the table name matches also in a binary + string comparison. On Unix, MySQL allows table names that only differ + in character case. If lower_case_table_names=2 then what is stored + may not be the same case, but the previous comparison showed that they + match with no-case. 
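An illustrative pair of hypothetical values: a search key "test/t1" may land on a record naming "test/T1"; the collation-aware comparison above accepts the pair as equal, while with lower_case_table_names != 2 the binary ut_memcmp() below rejects it, so the constraint is not attached to a table that merely matches case-insensitively.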
*/ + + if (rec_get_deleted_flag(rec, 0)) { + goto next_rec; + } + + if ((innobase_get_lower_case_table_names() != 2) + && (0 != ut_memcmp(field, table_name, len))) { + goto next_rec; + } + + /* Now we get a foreign key constraint id */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len); + + /* Copy the string because the page may be modified or evicted + after mtr_commit() below. */ + char fk_id[MAX_TABLE_NAME_LEN + 1]; + + ut_a(len <= MAX_TABLE_NAME_LEN); + memcpy(fk_id, field, len); + fk_id[len] = '\0'; + + btr_pcur_store_position(&pcur, &mtr); + + mtr_commit(&mtr); + + /* Load the foreign constraint definition to the dictionary cache */ + + err = dict_load_foreign(fk_id, col_names, + check_recursive, check_charsets, ignore_err); + + if (err != DB_SUCCESS) { + btr_pcur_close(&pcur); + + return(err); + } + + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + goto loop; + +load_next_index: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + sec_index = dict_table_get_next_index(sec_index); + + if (sec_index != NULL) { + + mtr_start(&mtr); + + /* Switch to scan index on REF_NAME, fk_max_recusive_level + already been updated when scanning FOR_NAME index, no need to + update again */ + check_recursive = FALSE; + + goto start_load; + } + + return(DB_SUCCESS); +} diff --git a/storage/xtradb/dict/dict0mem.cc b/storage/xtradb/dict/dict0mem.cc new file mode 100644 index 00000000000..44b074dd718 --- /dev/null +++ b/storage/xtradb/dict/dict0mem.cc @@ -0,0 +1,755 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file dict/dict0mem.cc +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "dict0mem.h" + +#ifdef UNIV_NONINL +#include "dict0mem.ic" +#endif + +#include "rem0rec.h" +#include "data0type.h" +#include "mach0data.h" +#include "dict0dict.h" +#include "fts0priv.h" +#include "ut0crc32.h" +#ifndef UNIV_HOTBACKUP +# include "ha_prototypes.h" /* innobase_casedn_str(), + innobase_get_lower_case_table_names */ +# include "mysql_com.h" /* NAME_LEN */ +# include "lock0lock.h" +#endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +#endif /* UNIV_BLOB_DEBUG */ +#include <iostream> + +#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when + creating a table or index object */ + +#ifdef UNIV_PFS_MUTEX +/* Key to register autoinc_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t autoinc_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/** An integer, randomly initialized at startup, used to make a temporary +table name as unique as possible. */ +static ib_uint32_t dict_temp_file_num; + +/**********************************************************************//** +Creates a table memory object. +@return own: table object */ +UNIV_INTERN +dict_table_t* +dict_mem_table_create( +/*==================*/ + const char* name, /*!< in: table name */ + ulint space, /*!< in: space where the clustered index of + the table is placed */ + ulint n_cols, /*!< in: number of columns */ + ulint flags, /*!< in: table flags */ + ulint flags2, /*!< in: table flags2 */ + bool nonshared)/*!< in: whether the table object is a dummy + one that does not need the initialization of + locking-related fields. */ +{ + dict_table_t* table; + mem_heap_t* heap; + + ut_ad(name); + ut_a(dict_tf_is_valid(flags)); + ut_a(!(flags2 & ~DICT_TF2_BIT_MASK)); + + heap = mem_heap_create(DICT_HEAP_SIZE); + + table = static_cast<dict_table_t*>( + mem_heap_zalloc(heap, sizeof(dict_table_t))); + + table->heap = heap; + + table->flags = (unsigned int) flags; + table->flags2 = (unsigned int) flags2; + table->name = static_cast<char*>(ut_malloc(strlen(name) + 1)); + memcpy(table->name, name, strlen(name) + 1); + table->space = (unsigned int) space; + table->n_cols = (unsigned int) (n_cols + DATA_N_SYS_COLS); + + table->cols = static_cast<dict_col_t*>( + mem_heap_alloc(heap, + (n_cols + DATA_N_SYS_COLS) + * sizeof(dict_col_t))); + + ut_d(table->magic_n = DICT_TABLE_MAGIC_N); + + /* true means that the stats latch will be enabled - + dict_table_stats_lock() will not be a no-op. */ + dict_table_stats_latch_create(table, true); + +#ifndef UNIV_HOTBACKUP + + if (!nonshared) { + + table->autoinc_lock = static_cast<ib_lock_t*>( + mem_heap_alloc(heap, lock_get_size())); + + mutex_create(autoinc_mutex_key, + &table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX); + } else { + + table->autoinc_lock = NULL; + } + + table->autoinc = 0; + + /* The number of transactions that are either waiting on the + AUTOINC lock or have been granted the lock. 
*/ + table->n_waiting_or_granted_auto_inc_locks = 0; + + /* If the table has an FTS index or we are in the process + of building one, create the table->fts */ + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + table->fts = fts_create(table); + table->fts->cache = fts_cache_create(table); + } else { + table->fts = NULL; + } + + table->is_corrupt = FALSE; + +#endif /* !UNIV_HOTBACKUP */ + + new(&table->foreign_set) dict_foreign_set(); + new(&table->referenced_set) dict_foreign_set(); + + return(table); +} + +/****************************************************************//** +Free a table memory object. */ +UNIV_INTERN +void +dict_mem_table_free( +/*================*/ + dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_d(table->cached = FALSE); + + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + if (table->fts) { + if (table->cached) { + fts_optimize_remove_table(table); + } + + fts_free(table); + } + } +#ifndef UNIV_HOTBACKUP + if (table->autoinc_lock) { + + mutex_free(&(table->autoinc_mutex)); + } +#endif /* UNIV_HOTBACKUP */ + + dict_table_stats_latch_destroy(table); + + table->foreign_set.~dict_foreign_set(); + table->referenced_set.~dict_foreign_set(); + + ut_free(table->name); + mem_heap_free(table->heap); +} + +/****************************************************************//** +Append 'name' to 'col_names'. @see dict_table_t::col_names +@return new column names array */ +static +const char* +dict_add_col_name( +/*==============*/ + const char* col_names, /*!< in: existing column names, or + NULL */ + ulint cols, /*!< in: number of existing columns */ + const char* name, /*!< in: new column name */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint old_len; + ulint new_len; + ulint total_len; + char* res; + + ut_ad(!cols == !col_names); + + /* Find out length of existing array. */ + if (col_names) { + const char* s = col_names; + ulint i; + + for (i = 0; i < cols; i++) { + s += strlen(s) + 1; + } + + old_len = s - col_names; + } else { + old_len = 0; + } + + new_len = strlen(name) + 1; + total_len = old_len + new_len; + + res = static_cast<char*>(mem_heap_alloc(heap, total_len)); + + if (old_len > 0) { + memcpy(res, col_names, old_len); + } + + memcpy(res + old_len, name, new_len); + + return(res); +} + +/**********************************************************************//** +Adds a column definition to a table. */ +UNIV_INTERN +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /*!< in: table */ + mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */ + const char* name, /*!< in: column name, or NULL */ + ulint mtype, /*!< in: main datatype */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision */ +{ + dict_col_t* col; + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + i = table->n_def++; + + if (name) { + if (UNIV_UNLIKELY(table->n_def == table->n_cols)) { + heap = table->heap; + } + if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) { + /* All preceding column names are empty. 
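table->col_names is a single packed buffer of '\0'-terminated names laid end to end; for hypothetical columns a, bb and ccc it would hold "a\0bb\0ccc\0". No name has been stored yet at this point, so the zero-filled bytes from the mem_heap_zalloc() below stand in for the preceding empty names, after which dict_add_col_name() can append the first real one.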
*/ + char* s = static_cast<char*>( + mem_heap_zalloc(heap, table->n_def)); + + table->col_names = s; + } + + table->col_names = dict_add_col_name(table->col_names, + i, name, heap); + } + + col = dict_table_get_nth_col(table, i); + + dict_mem_fill_column_struct(col, i, mtype, prtype, len); +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +static __attribute__((nonnull)) +void +dict_mem_table_col_rename_low( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned i, /*!< in: column offset corresponding to s */ + const char* to, /*!< in: new column name */ + const char* s) /*!< in: pointer to table->col_names */ +{ + size_t from_len = strlen(s), to_len = strlen(to); + + ut_ad(i < table->n_def); + ut_ad(from_len <= NAME_LEN); + ut_ad(to_len <= NAME_LEN); + + if (from_len == to_len) { + /* The easy case: simply replace the column name in + table->col_names. */ + strcpy(const_cast<char*>(s), to); + } else { + /* We need to adjust all affected index->field + pointers, as in dict_index_add_col(). First, copy + table->col_names. */ + ulint prefix_len = s - table->col_names; + + for (; i < table->n_def; i++) { + s += strlen(s) + 1; + } + + ulint full_len = s - table->col_names; + char* col_names; + + if (to_len > from_len) { + col_names = static_cast<char*>( + mem_heap_alloc( + table->heap, + full_len + to_len - from_len)); + + memcpy(col_names, table->col_names, prefix_len); + } else { + col_names = const_cast<char*>(table->col_names); + } + + memcpy(col_names + prefix_len, to, to_len); + memmove(col_names + prefix_len + to_len, + table->col_names + (prefix_len + from_len), + full_len - (prefix_len + from_len)); + + /* Replace the field names in every index. */ + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + ulint n_fields = dict_index_get_n_fields(index); + + for (ulint i = 0; i < n_fields; i++) { + dict_field_t* field + = dict_index_get_nth_field( + index, i); + ulint name_ofs + = field->name - table->col_names; + if (name_ofs <= prefix_len) { + field->name = col_names + name_ofs; + } else { + ut_a(name_ofs < full_len); + field->name = col_names + + name_ofs + to_len - from_len; + } + } + } + + table->col_names = col_names; + } + + dict_foreign_t* foreign; + + /* Replace the field names in every foreign key constraint. */ + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* These can point straight to + table->col_names, because the foreign key + constraints will be freed at the same time + when the table object is freed. */ + foreign->foreign_col_names[f] + = dict_index_get_nth_field( + foreign->foreign_index, f)->name; + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* foreign->referenced_col_names[] need to be + copies, because the constraint may become + orphan when foreign_key_checks=0 and the + parent table is dropped. 
*/ + + const char* col_name = dict_index_get_nth_field( + foreign->referenced_index, f)->name; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + char** rc = const_cast<char**>( + foreign->referenced_col_names + f); + size_t col_name_len_1 = strlen(col_name) + 1; + + if (col_name_len_1 <= strlen(*rc) + 1) { + memcpy(*rc, col_name, col_name_len_1); + } else { + *rc = static_cast<char*>( + mem_heap_dup( + foreign->heap, + col_name, + col_name_len_1)); + } + } + } + } +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +UNIV_INTERN +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to) /*!< in: new column name */ +{ + const char* s = table->col_names; + + ut_ad(nth_col < table->n_def); + + for (unsigned i = 0; i < nth_col; i++) { + size_t len = strlen(s); + ut_ad(len > 0); + s += len + 1; + } + + /* This could fail if the data dictionaries are out of sync. + Proceed with the renaming anyway. */ + ut_ad(!strcmp(from, s)); + + dict_mem_table_col_rename_low(table, nth_col, to, s); +} + +/**********************************************************************//** +This function populates a dict_col_t memory structure with +supplied information. */ +UNIV_INTERN +void +dict_mem_fill_column_struct( +/*========================*/ + dict_col_t* column, /*!< out: column struct to be + filled */ + ulint col_pos, /*!< in: column position */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint col_len) /*!< in: column length */ +{ +#ifndef UNIV_HOTBACKUP + ulint mbminlen; + ulint mbmaxlen; +#endif /* !UNIV_HOTBACKUP */ + + column->ind = (unsigned int) col_pos; + column->ord_part = 0; + column->max_prefix = 0; + column->mtype = (unsigned int) mtype; + column->prtype = (unsigned int) prtype; + column->len = (unsigned int) col_len; +#ifndef UNIV_HOTBACKUP + dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen); + dict_col_set_mbminmaxlen(column, mbminlen, mbmaxlen); +#endif /* !UNIV_HOTBACKUP */ +} + +/**********************************************************************//** +Creates an index memory object. +@return own: index object */ +UNIV_INTERN +dict_index_t* +dict_mem_index_create( +/*==================*/ + const char* table_name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + ulint space, /*!< in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields) /*!< in: number of fields */ +{ + dict_index_t* index; + mem_heap_t* heap; + + ut_ad(table_name && index_name); + + heap = mem_heap_create(DICT_HEAP_SIZE); + + index = static_cast<dict_index_t*>( + mem_heap_zalloc(heap, sizeof(*index))); + + dict_mem_fill_index_struct(index, heap, table_name, index_name, + space, type, n_fields); + + os_fast_mutex_init(zip_pad_mutex_key, &index->zip_pad.mutex); + + return(index); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Creates and initializes a foreign constraint memory object. 
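Everything later attached to the constraint (id, table names, and the column name arrays filled in by dict_load_foreign_cols()) is allocated from foreign->heap, so freeing that heap releases the whole object. A minimal usage sketch (the id value is hypothetical):

	dict_foreign_t*	foreign = dict_mem_foreign_create();

	foreign->id = mem_heap_strdup(foreign->heap, "test/fk_1");
	...
	mem_heap_free(foreign->heap);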
+@return own: foreign constraint struct */ +UNIV_INTERN +dict_foreign_t* +dict_mem_foreign_create(void) +/*=========================*/ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + + heap = mem_heap_create(100); + + foreign = static_cast<dict_foreign_t*>( + mem_heap_zalloc(heap, sizeof(dict_foreign_t))); + + foreign->heap = heap; + + return(foreign); +} + +/**********************************************************************//** +Sets the foreign_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup +will point to foreign_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. */ +UNIV_INTERN +void +dict_mem_foreign_table_name_lookup_set( +/*===================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (innobase_get_lower_case_table_names() == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->foreign_table_name) + 1; + + foreign->foreign_table_name_lookup = + static_cast<char*>( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->foreign_table_name_lookup, + foreign->foreign_table_name); + innobase_casedn_str(foreign->foreign_table_name_lookup); + } else { + foreign->foreign_table_name_lookup + = foreign->foreign_table_name; + } +} + +/**********************************************************************//** +Sets the referenced_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup +will point to referenced_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. */ +UNIV_INTERN +void +dict_mem_referenced_table_name_lookup_set( +/*======================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (innobase_get_lower_case_table_names() == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->referenced_table_name) + 1; + + foreign->referenced_table_name_lookup = + static_cast<char*>( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->referenced_table_name_lookup, + foreign->referenced_table_name); + innobase_casedn_str(foreign->referenced_table_name_lookup); + } else { + foreign->referenced_table_name_lookup + = foreign->referenced_table_name; + } +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Adds a field definition to an index. NOTE: does not take a copy +of the column name if the field is a column. The memory occupied +by the column name may be released only after publishing the index. */ +UNIV_INTERN +void +dict_mem_index_add_field( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + const char* name, /*!< in: column name */ + ulint prefix_len) /*!< in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ +{ + dict_field_t* field; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->n_def++; + + field = dict_index_get_nth_field(index, index->n_def - 1); + + field->name = name; + field->prefix_len = (unsigned int) prefix_len; +} + +/**********************************************************************//** +Frees an index memory object. 
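Since dict_mem_index_add_field() stores only pointers to the field names, those strings must outlive the index; freeing here releases index->heap and the zip_pad mutex, not the name strings. A minimal create/free pairing (all names hypothetical):

	dict_index_t*	index = dict_mem_index_create(
		"test/t1", "PRIMARY", 0,
		DICT_CLUSTERED | DICT_UNIQUE, 1);

	dict_mem_index_add_field(index, "id", 0);
	...
	dict_mem_index_free(index);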
*/ +UNIV_INTERN +void +dict_mem_index_free( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); +#ifdef UNIV_BLOB_DEBUG + if (index->blobs) { + mutex_free(&index->blobs_mutex); + rbt_free(index->blobs); + } +#endif /* UNIV_BLOB_DEBUG */ + + os_fast_mutex_free(&index->zip_pad.mutex); + + mem_heap_free(index->heap); +} + +/** Create a temporary tablename like "#sql-ibtid-inc where + tid = the Table ID + inc = a randomly initialized number that is incremented for each file +The table ID is a 64 bit integer, can use up to 20 digits, and is +initialized at bootstrap. The second number is 32 bits, can use up to 10 +digits, and is initialized at startup to a randomly distributed number. +It is hoped that the combination of these two numbers will provide a +reasonably unique temporary file name. +@param[in] heap A memory heap +@param[in] dbtab Table name in the form database/table name +@param[in] id Table id +@return A unique temporary tablename suitable for InnoDB use */ +UNIV_INTERN +char* +dict_mem_create_temporary_tablename( + mem_heap_t* heap, + const char* dbtab, + table_id_t id) +{ + size_t size; + char* name; + const char* dbend = strchr(dbtab, '/'); + ut_ad(dbend); + size_t dblen = dbend - dbtab + 1; + +#ifdef HAVE_ATOMIC_BUILTINS + /* Increment a randomly initialized number for each temp file. */ + os_atomic_increment_uint32(&dict_temp_file_num, 1); +#else /* HAVE_ATOMIC_BUILTINS */ + dict_temp_file_num++; +#endif /* HAVE_ATOMIC_BUILTINS */ + + size = tmp_file_prefix_length + 3 + 20 + 1 + 10 + dblen; + name = static_cast<char*>(mem_heap_alloc(heap, size)); + memcpy(name, dbtab, dblen); + ut_snprintf(name + dblen, size - dblen, + TEMP_FILE_PREFIX_INNODB UINT64PF "-" UINT32PF, + id, dict_temp_file_num); + + return(name); +} + +/** Initialize dict memory variables */ + +void +dict_mem_init(void) +{ + /* Initialize a randomly distributed temporary file number */ + ib_uint32_t now = static_cast<ib_uint32_t>(ut_time()); + + const byte* buf = reinterpret_cast<const byte*>(&now); + ut_ad(ut_crc32 != NULL); + + dict_temp_file_num = ut_crc32(buf, sizeof(now)); + + DBUG_PRINT("dict_mem_init", + ("Starting Temporary file number is " UINT32PF, + dict_temp_file_num)); +} + +/** Validate the search order in the foreign key set. +@param[in] fk_set the foreign key set to be validated +@return true if search order is fine in the set, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_foreign_set& fk_set) +{ + dict_foreign_not_exists not_exists(fk_set); + + dict_foreign_set::iterator it = std::find_if( + fk_set.begin(), fk_set.end(), not_exists); + + if (it == fk_set.end()) { + return(true); + } + + dict_foreign_t* foreign = *it; + std::cerr << "Foreign key lookup failed: " << *foreign; + std::cerr << fk_set; + ut_ad(0); + return(false); +} + +/** Validate the search order in the foreign key sets of the table +(foreign_set and referenced_set). +@param[in] table table whose foreign key sets are to be validated +@return true if foreign key sets are fine, false otherwise. 
*/ +bool +dict_foreign_set_validate( + const dict_table_t& table) +{ + return(dict_foreign_set_validate(table.foreign_set) + && dict_foreign_set_validate(table.referenced_set)); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_t& foreign) +{ + out << "[dict_foreign_t: id='" << foreign.id << "'"; + + if (foreign.foreign_table_name != NULL) { + out << ",for: '" << foreign.foreign_table_name << "'"; + } + + out << "]"; + return(out); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_set& fk_set) +{ + out << "[dict_foreign_set:"; + std::for_each(fk_set.begin(), fk_set.end(), dict_foreign_print(out)); + out << "]" << std::endl; + return(out); +} + diff --git a/storage/xtradb/dict/dict0stats.cc b/storage/xtradb/dict/dict0stats.cc new file mode 100644 index 00000000000..9cd909686ed --- /dev/null +++ b/storage/xtradb/dict/dict0stats.cc @@ -0,0 +1,4182 @@ +/***************************************************************************** + +Copyright (c) 2009, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0stats.cc +Code used for calculating and manipulating table statistics. + +Created Jan 06, 2010 Vasil Dimov +*******************************************************/ + +#ifndef UNIV_HOTBACKUP + +#include "univ.i" + +#include "btr0btr.h" /* btr_get_size() */ +#include "btr0cur.h" /* btr_estimate_number_of_different_key_vals() */ +#include "dict0dict.h" /* dict_table_get_first_index(), dict_fs2utf8() */ +#include "dict0mem.h" /* DICT_TABLE_MAGIC_N */ +#include "dict0stats.h" +#include "data0type.h" /* dtype_t */ +#include "db0err.h" /* dberr_t */ +#include "page0page.h" /* page_align() */ +#include "pars0pars.h" /* pars_info_create() */ +#include "pars0types.h" /* pars_info_t */ +#include "que0que.h" /* que_eval_sql() */ +#include "rem0cmp.h" /* REC_MAX_N_FIELDS,cmp_rec_rec_with_match() */ +#include "row0sel.h" /* sel_node_t */ +#include "row0types.h" /* sel_node_t */ +#include "trx0trx.h" /* trx_create() */ +#include "trx0roll.h" /* trx_rollback_to_savepoint() */ +#include "ut0rnd.h" /* ut_rnd_interval() */ +#include "ut0ut.h" /* ut_format_name(), ut_time() */ + +#include <algorithm> +#include <map> +#include <vector> + +/* Sampling algorithm description @{ + +The algorithm is controlled by one number - N_SAMPLE_PAGES(index), +let it be A, which is the number of leaf pages to analyze for a given index +for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be +analyzed). + +Let the total number of leaf pages in the table be T. +Level 0 - leaf pages, level H - root. 
+
+Definition: an n-prefix-boring record is a record on a non-leaf page that
+equals the next (to the right, crossing page boundaries, skipping the
+supremum and infimum) record on the same level when looking at the first
+n-prefix columns. The last (user) record on a level is not boring (it does
+not match the non-existent user record to the right). We call the records
+boring because all the records on the page below a boring record are equal
+to that boring record.
+
+We avoid diving below boring records when searching for a leaf page to
+estimate the number of distinct records because we know that such a leaf
+page will have number of distinct records == 1.
+
+For each n-prefix: start from the root level and fully scan subsequent lower
+levels until a level that contains at least A*10 distinct records is found.
+Let's call this level LA.
+As an optimization the search is canceled if it has reached level 1 (never
+descend to level 0, the leaf level) and also if the next level to be scanned
+would contain more than A pages. The latter is because the user has asked
+to analyze A leaf pages and it does not make sense to scan much more than
+A non-leaf pages with the sole purpose of finding a good sample of A leaf
+pages.
+
+After finding the appropriate level LA with >A*10 distinct records (or fewer,
+in the exceptions described above), divide it into groups of equal records
+and pick A such groups. Then pick the last record from each group. For
+example, let the level be:
+
+index:  0,1,2,3,4,5,6,7,8,9,10
+record: 1,1,1,2,2,7,7,7,7,7,9
+
+There are 4 groups of distinct records and if A=2 random ones are selected,
+e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected.
+
+After selecting A records as described above, dive below them to find A leaf
+pages and analyze them, finding the total number of distinct records. The
+dive to the leaf level is performed by selecting a non-boring record from
+each page and diving below it.
+
+This way, a total of A leaf pages are analyzed for the given n-prefix.
+
+Let the number of different key values found in each leaf page i be Pi (i=1..A).
+Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A.
+Let the number of different key values on level LA be N_DIFF_LA.
+Let the total number of records on level LA be TOTAL_LA.
+Let R be N_DIFF_LA / TOTAL_LA; we assume this ratio is the same on the
+leaf level.
+Let the number of leaf pages be N.
+Then the total number of different key values on the leaf level is:
+N * R * N_DIFF_AVG_LEAF.
+For example (made-up numbers): with N = 1000 leaf pages, R = 0.4 and
+N_DIFF_AVG_LEAF = 3 the estimated cardinality is 1000 * 0.4 * 3 = 1200.
+See REF01 for the implementation.
+
+The above describes how to calculate the cardinality of an index.
+This algorithm is executed for each n-prefix of a multi-column index
+where n=1..n_uniq.
+@} */
+
+/* names of the tables from the persistent statistics storage */
+#define TABLE_STATS_NAME	"mysql/innodb_table_stats"
+#define TABLE_STATS_NAME_PRINT	"mysql.innodb_table_stats"
+#define INDEX_STATS_NAME	"mysql/innodb_index_stats"
+#define INDEX_STATS_NAME_PRINT	"mysql.innodb_index_stats"
+
+#ifdef UNIV_STATS_DEBUG
+#define DEBUG_PRINTF(fmt, ...)	printf(fmt, ## __VA_ARGS__)
+#else /* UNIV_STATS_DEBUG */
+#define DEBUG_PRINTF(fmt, ...)	/* noop */
+#endif /* UNIV_STATS_DEBUG */
+
+/* Gets the number of leaf pages to sample in persistent stats estimation */
+#define N_SAMPLE_PAGES(index)					\
+	static_cast<ib_uint64_t>(				\
+		(index)->table->stats_sample_pages != 0		\
+		?
(index)->table->stats_sample_pages \ + : srv_stats_persistent_sample_pages) + +/* number of distinct records on a given level that are required to stop +descending to lower levels and fetch N_SAMPLE_PAGES(index) records +from that level */ +#define N_DIFF_REQUIRED(index) (N_SAMPLE_PAGES(index) * 10) + +/* A dynamic array where we store the boundaries of each distinct group +of keys. For example if a btree level is: +index: 0,1,2,3,4,5,6,7,8,9,10,11,12 +data: b,b,b,b,b,b,g,g,j,j,j, x, y +then we would store 5,7,10,11,12 in the array. */ +typedef std::vector<ib_uint64_t> boundaries_t; + +/* This is used to arrange the index based on the index name. +@return true if index_name1 is smaller than index_name2. */ +struct index_cmp +{ + bool operator()(const char* index_name1, const char* index_name2) const { + return(strcmp(index_name1, index_name2) < 0); + } +}; + +typedef std::map<const char*, dict_index_t*, index_cmp> index_map_t; + +/*********************************************************************//** +Checks whether an index should be ignored in stats manipulations: +* stats fetch +* stats recalc +* stats save +@return true if exists and all tables are ok */ +UNIV_INLINE +bool +dict_stats_should_ignore_index( +/*===========================*/ + const dict_index_t* index) /*!< in: index */ +{ + return((index->type & DICT_FTS) + || dict_index_is_corrupted(index) + || index->to_be_dropped + || *index->name == TEMP_INDEX_PREFIX); +} + +/*********************************************************************//** +Checks whether the persistent statistics storage exists and that all +tables have the proper structure. +@return true if exists and all tables are ok */ +static +bool +dict_stats_persistent_storage_check( +/*================================*/ + bool caller_has_dict_sys_mutex) /*!< in: true if the caller + owns dict_sys->mutex */ +{ + /* definition for the table TABLE_STATS_NAME */ + dict_col_meta_t table_stats_columns[] = { + {"database_name", DATA_VARMYSQL, + DATA_NOT_NULL, 192}, + + {"table_name", DATA_VARMYSQL, + DATA_NOT_NULL, 192}, + + {"last_update", DATA_FIXBINARY, + DATA_NOT_NULL, 4}, + + {"n_rows", DATA_INT, + DATA_NOT_NULL | DATA_UNSIGNED, 8}, + + {"clustered_index_size", DATA_INT, + DATA_NOT_NULL | DATA_UNSIGNED, 8}, + + {"sum_of_other_index_sizes", DATA_INT, + DATA_NOT_NULL | DATA_UNSIGNED, 8} + }; + dict_table_schema_t table_stats_schema = { + TABLE_STATS_NAME, + UT_ARR_SIZE(table_stats_columns), + table_stats_columns, + 0 /* n_foreign */, + 0 /* n_referenced */ + }; + + /* definition for the table INDEX_STATS_NAME */ + dict_col_meta_t index_stats_columns[] = { + {"database_name", DATA_VARMYSQL, + DATA_NOT_NULL, 192}, + + {"table_name", DATA_VARMYSQL, + DATA_NOT_NULL, 192}, + + {"index_name", DATA_VARMYSQL, + DATA_NOT_NULL, 192}, + + {"last_update", DATA_FIXBINARY, + DATA_NOT_NULL, 4}, + + {"stat_name", DATA_VARMYSQL, + DATA_NOT_NULL, 64*3}, + + {"stat_value", DATA_INT, + DATA_NOT_NULL | DATA_UNSIGNED, 8}, + + {"sample_size", DATA_INT, + DATA_UNSIGNED, 8}, + + {"stat_description", DATA_VARMYSQL, + DATA_NOT_NULL, 1024*3} + }; + dict_table_schema_t index_stats_schema = { + INDEX_STATS_NAME, + UT_ARR_SIZE(index_stats_columns), + index_stats_columns, + 0 /* n_foreign */, + 0 /* n_referenced */ + }; + + char errstr[512]; + dberr_t ret; + + if (!caller_has_dict_sys_mutex) { + mutex_enter(&(dict_sys->mutex)); + } + + ut_ad(mutex_own(&dict_sys->mutex)); + + /* first check table_stats */ + ret = dict_table_schema_check(&table_stats_schema, errstr, + sizeof(errstr)); + if (ret == 
DB_SUCCESS) { + /* if it is ok, then check index_stats */ + ret = dict_table_schema_check(&index_stats_schema, errstr, + sizeof(errstr)); + } + + if (!caller_has_dict_sys_mutex) { + mutex_exit(&(dict_sys->mutex)); + } + + if (ret != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: %s\n", errstr); + return(false); + } + /* else */ + + return(true); +} + +/** Executes a given SQL statement using the InnoDB internal SQL parser. +This function will free the pinfo object. +@param[in,out] pinfo pinfo to pass to que_eval_sql() must already +have any literals bound to it +@param[in] sql SQL string to execute +@param[in,out] trx in case of NULL the function will allocate and +free the trx object. If it is not NULL then it will be rolled back +only in the case of error, but not freed. +@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_exec_sql( + pars_info_t* pinfo, + const char* sql, + trx_t* trx) +{ + dberr_t err; + bool trx_started = false; +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + if (!dict_stats_persistent_storage_check(true)) { + pars_info_free(pinfo); + return(DB_STATS_DO_NOT_EXIST); + } + + if (trx == NULL) { + trx = trx_allocate_for_background(); + trx_start_if_not_started(trx); + trx_started = true; + } + + err = que_eval_sql(pinfo, sql, FALSE, trx); /* pinfo is freed here */ + + DBUG_EXECUTE_IF("stats_index_error", + if (!trx_started) { + err = DB_STATS_DO_NOT_EXIST; + trx->error_state = DB_STATS_DO_NOT_EXIST; + }); + + if (!trx_started && err == DB_SUCCESS) { + return(DB_SUCCESS); + } + + if (err == DB_SUCCESS) { + trx_commit_for_mysql(trx); + } else { + trx->op_info = "rollback of internal trx on stats tables"; + trx->dict_operation_lock_mode = RW_X_LATCH; + trx_rollback_to_savepoint(trx, NULL); + trx->dict_operation_lock_mode = 0; + trx->op_info = ""; + ut_a(trx->error_state == DB_SUCCESS); + } + + if (trx_started) { + trx_free_for_background(trx); + } + + return(err); +} + +/*********************************************************************//** +Duplicate a table object and its indexes. +This function creates a dummy dict_table_t object and initializes the +following table and index members: +dict_table_t::id (copied) +dict_table_t::heap (newly created) +dict_table_t::name (copied) +dict_table_t::corrupted (copied) +dict_table_t::indexes<> (newly created) +dict_table_t::magic_n +for each entry in dict_table_t::indexes, the following are initialized: +(indexes that have DICT_FTS set in index->type are skipped) +dict_index_t::id (copied) +dict_index_t::name (copied) +dict_index_t::table_name (points to the copied table name) +dict_index_t::table (points to the above semi-initialized object) +dict_index_t::type (copied) +dict_index_t::to_be_dropped (copied) +dict_index_t::online_status (copied) +dict_index_t::n_uniq (copied) +dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name) +dict_index_t::indexes<> (newly created) +dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized) +dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized) +dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized) +dict_index_t::magic_n +The returned object should be freed with dict_stats_table_clone_free() +when no longer needed. 
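+A minimal usage sketch (illustrative only; this is not an actual call site
+in this file):
+
+	dict_table_t*	t = dict_stats_table_clone_create(table);
+	... read or fill in the stats members of the private copy ...
+	dict_stats_table_clone_free(t);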
+@return incomplete table object */ +static +dict_table_t* +dict_stats_table_clone_create( +/*==========================*/ + const dict_table_t* table) /*!< in: table whose stats to copy */ +{ + size_t heap_size; + dict_index_t* index; + + /* Estimate the size needed for the table and all of its indexes */ + + heap_size = 0; + heap_size += sizeof(dict_table_t); + heap_size += strlen(table->name) + 1; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); + + ulint n_uniq = dict_index_get_n_unique(index); + + heap_size += sizeof(dict_index_t); + heap_size += strlen(index->name) + 1; + heap_size += n_uniq * sizeof(index->fields[0]); + for (ulint i = 0; i < n_uniq; i++) { + heap_size += strlen(index->fields[i].name) + 1; + } + heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]); + heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]); + heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]); + } + + /* Allocate the memory and copy the members */ + + mem_heap_t* heap; + + heap = mem_heap_create(heap_size); + + dict_table_t* t; + + t = (dict_table_t*) mem_heap_alloc(heap, sizeof(*t)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->id, sizeof(table->id)); + t->id = table->id; + + t->heap = heap; + + UNIV_MEM_ASSERT_RW_ABORT(table->name, strlen(table->name) + 1); + t->name = (char*) mem_heap_strdup(heap, table->name); + + t->corrupted = table->corrupted; + + /* This private object "t" is not shared with other threads, so + we do not need the stats_latch (thus we pass false below). The + dict_table_stats_lock()/unlock() routines will do nothing. */ + dict_table_stats_latch_create(t, false); + + UT_LIST_INIT(t->indexes); + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); + + dict_index_t* idx; + + idx = (dict_index_t*) mem_heap_alloc(heap, sizeof(*idx)); + + UNIV_MEM_ASSERT_RW_ABORT(&index->id, sizeof(index->id)); + idx->id = index->id; + + UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name) + 1); + idx->name = (char*) mem_heap_strdup(heap, index->name); + + idx->table_name = t->name; + + idx->table = t; + + idx->type = index->type; + + idx->to_be_dropped = 0; + + idx->online_status = ONLINE_INDEX_COMPLETE; + + idx->n_uniq = index->n_uniq; + + idx->fields = (dict_field_t*) mem_heap_alloc( + heap, idx->n_uniq * sizeof(idx->fields[0])); + + for (ulint i = 0; i < idx->n_uniq; i++) { + UNIV_MEM_ASSERT_RW_ABORT(index->fields[i].name, strlen(index->fields[i].name) + 1); + idx->fields[i].name = (char*) mem_heap_strdup( + heap, index->fields[i].name); + } + + /* hook idx into t->indexes */ + UT_LIST_ADD_LAST(indexes, t->indexes, idx); + + idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_alloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0])); + + idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_alloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0])); + + idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_alloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); + ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + } + + ut_d(t->magic_n = DICT_TABLE_MAGIC_N); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by 
+dict_stats_table_clone_create(). */ +static +void +dict_stats_table_clone_free( +/*========================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + dict_table_stats_latch_destroy(t); + mem_heap_free(t->heap); +} + +/*********************************************************************//** +Write all zeros (or 1 where it makes sense) into an index +statistics members. The resulting stats correspond to an empty index. +The caller must own index's table stats latch in X mode +(dict_table_stats_lock(table, RW_X_LATCH)) */ +static +void +dict_stats_empty_index( +/*===================*/ + dict_index_t* index) /*!< in/out: index */ +{ + ut_ad(!(index->type & DICT_FTS)); + ut_ad(!dict_index_is_univ(index)); + + ulint n_uniq = index->n_uniq; + + for (ulint i = 0; i < n_uniq; i++) { + index->stat_n_diff_key_vals[i] = 0; + index->stat_n_sample_sizes[i] = 1; + index->stat_n_non_null_key_vals[i] = 0; + } + + index->stat_index_size = 1; + index->stat_n_leaf_pages = 1; +} + +/*********************************************************************//** +Write all zeros (or 1 where it makes sense) into a table and its indexes' +statistics members. The resulting stats correspond to an empty table. */ +static +void +dict_stats_empty_table( +/*===================*/ + dict_table_t* table) /*!< in/out: table */ +{ + /* Zero the stats members */ + + dict_table_stats_lock(table, RW_X_LATCH); + + table->stat_n_rows = 0; + table->stat_clustered_index_size = 1; + /* 1 page for each index, not counting the clustered */ + table->stat_sum_of_other_index_sizes + = UT_LIST_GET_LEN(table->indexes) - 1; + table->stat_modified_counter = 0; + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index->type & DICT_FTS) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); + + dict_stats_empty_index(index); + } + + table->stat_initialized = TRUE; + + dict_table_stats_unlock(table, RW_X_LATCH); +} + +/*********************************************************************//** +Check whether index's stats are initialized (assert if they are not). */ +static +void +dict_stats_assert_initialized_index( +/*================================*/ + const dict_index_t* index) /*!< in: index */ +{ + UNIV_MEM_ASSERT_RW_ABORT( + index->stat_n_diff_key_vals, + index->n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + + UNIV_MEM_ASSERT_RW_ABORT( + index->stat_n_sample_sizes, + index->n_uniq * sizeof(index->stat_n_sample_sizes[0])); + + UNIV_MEM_ASSERT_RW_ABORT( + index->stat_n_non_null_key_vals, + index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + + UNIV_MEM_ASSERT_RW_ABORT( + &index->stat_index_size, + sizeof(index->stat_index_size)); + + UNIV_MEM_ASSERT_RW_ABORT( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); +} + +/*********************************************************************//** +Check whether table's stats are initialized (assert if they are not). 
*/ +static +void +dict_stats_assert_initialized( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_a(table->stat_initialized); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_last_recalc, + sizeof(table->stats_last_recalc)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_persistent, + sizeof(table->stat_persistent)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_auto_recalc, + sizeof(table->stats_auto_recalc)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_sample_pages, + sizeof(table->stats_sample_pages)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_n_rows, + sizeof(table->stat_n_rows)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_clustered_index_size, + sizeof(table->stat_clustered_index_size)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_sum_of_other_index_sizes, + sizeof(table->stat_sum_of_other_index_sizes)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stat_modified_counter, + sizeof(table->stat_modified_counter)); + + UNIV_MEM_ASSERT_RW_ABORT(&table->stats_bg_flag, + sizeof(table->stats_bg_flag)); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (!dict_stats_should_ignore_index(index)) { + dict_stats_assert_initialized_index(index); + } + } +} + +#define INDEX_EQ(i1, i2) \ + ((i1) != NULL \ + && (i2) != NULL \ + && (i1)->id == (i2)->id \ + && strcmp((i1)->name, (i2)->name) == 0) + +/*********************************************************************//** +Copy table and index statistics from one table to another, including index +stats. Extra indexes in src are ignored and extra indexes in dst are +initialized to correspond to an empty index. */ +static +void +dict_stats_copy( +/*============*/ + dict_table_t* dst, /*!< in/out: destination table */ + const dict_table_t* src) /*!< in: source table */ +{ + dst->stats_last_recalc = src->stats_last_recalc; + dst->stat_n_rows = src->stat_n_rows; + dst->stat_clustered_index_size = src->stat_clustered_index_size; + dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes; + dst->stat_modified_counter = src->stat_modified_counter; + + dict_index_t* dst_idx; + dict_index_t* src_idx; + + for (dst_idx = dict_table_get_first_index(dst), + src_idx = dict_table_get_first_index(src); + dst_idx != NULL; + dst_idx = dict_table_get_next_index(dst_idx), + (src_idx != NULL + && (src_idx = dict_table_get_next_index(src_idx)))) { + + if (dict_stats_should_ignore_index(dst_idx)) { + continue; + } + + ut_ad(!dict_index_is_univ(dst_idx)); + + if (!INDEX_EQ(src_idx, dst_idx)) { + for (src_idx = dict_table_get_first_index(src); + src_idx != NULL; + src_idx = dict_table_get_next_index(src_idx)) { + + if (INDEX_EQ(src_idx, dst_idx)) { + break; + } + } + } + + if (!INDEX_EQ(src_idx, dst_idx)) { + dict_stats_empty_index(dst_idx); + continue; + } + + ulint n_copy_el; + + if (dst_idx->n_uniq > src_idx->n_uniq) { + n_copy_el = src_idx->n_uniq; + /* Since src is smaller some elements in dst + will remain untouched by the following memmove(), + thus we init all of them here. 
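+			For example (hypothetical sizes): if
+			src_idx->n_uniq == 2 and dst_idx->n_uniq == 4, only
+			elements [0] and [1] of the stat arrays are copied
+			below, so [2] and [3] would otherwise keep stale
+			values.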
*/ + dict_stats_empty_index(dst_idx); + } else { + n_copy_el = dst_idx->n_uniq; + } + + memmove(dst_idx->stat_n_diff_key_vals, + src_idx->stat_n_diff_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0])); + + memmove(dst_idx->stat_n_sample_sizes, + src_idx->stat_n_sample_sizes, + n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0])); + + memmove(dst_idx->stat_n_non_null_key_vals, + src_idx->stat_n_non_null_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0])); + + dst_idx->stat_index_size = src_idx->stat_index_size; + + dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + } + + dst->stat_initialized = TRUE; +} + +/*********************************************************************//** +Duplicate the stats of a table and its indexes. +This function creates a dummy dict_table_t object and copies the input +table's stats into it. The returned table object is not in the dictionary +cache and cannot be accessed by any other threads. In addition to the +members copied in dict_stats_table_clone_create() this function initializes +the following: +dict_table_t::stat_initialized +dict_table_t::stat_persistent +dict_table_t::stat_n_rows +dict_table_t::stat_clustered_index_size +dict_table_t::stat_sum_of_other_index_sizes +dict_table_t::stat_modified_counter +dict_index_t::stat_n_diff_key_vals[] +dict_index_t::stat_n_sample_sizes[] +dict_index_t::stat_n_non_null_key_vals[] +dict_index_t::stat_index_size +dict_index_t::stat_n_leaf_pages +The returned object should be freed with dict_stats_snapshot_free() +when no longer needed. +@return incomplete table object */ +static +dict_table_t* +dict_stats_snapshot_create( +/*=======================*/ + dict_table_t* table) /*!< in: table whose stats to copy */ +{ + mutex_enter(&dict_sys->mutex); + + dict_table_stats_lock(table, RW_S_LATCH); + + dict_stats_assert_initialized(table); + + dict_table_t* t; + + t = dict_stats_table_clone_create(table); + + dict_stats_copy(t, table); + + t->stat_persistent = table->stat_persistent; + t->stats_auto_recalc = table->stats_auto_recalc; + t->stats_sample_pages = table->stats_sample_pages; + t->stats_bg_flag = table->stats_bg_flag; + + dict_table_stats_unlock(table, RW_S_LATCH); + + mutex_exit(&dict_sys->mutex); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by +dict_stats_snapshot_create(). */ +static +void +dict_stats_snapshot_free( +/*=====================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + dict_stats_table_clone_free(t); +} + +/*********************************************************************//** +Calculates new estimates for index statistics. This function is +relatively quick and is used to calculate transient statistics that +are not saved on disk. This was the only way to calculate statistics +before the Persistent Statistics feature was introduced. */ +static +void +dict_stats_update_transient_for_index( +/*==================================*/ + dict_index_t* index) /*!< in/out: index */ +{ + if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO + && (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO + || !dict_index_is_clust(index))) { + /* If we have set a high innodb_force_recovery + level, do not calculate statistics, as a badly + corrupted index can cause a crash in it. + Initialize some bogus index cardinality + statistics, so that the data can be queried in + various means, also via secondary indexes. 
*/ + dict_stats_empty_index(index); +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + } else if (ibuf_debug && !dict_index_is_clust(index)) { + dict_stats_empty_index(index); +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + } else { + mtr_t mtr; + ulint size; + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + + size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); + + if (size != ULINT_UNDEFINED) { + index->stat_index_size = size; + + size = btr_get_size( + index, BTR_N_LEAF_PAGES, &mtr); + } + + mtr_commit(&mtr); + + switch (size) { + case ULINT_UNDEFINED: + dict_stats_empty_index(index); + return; + case 0: + /* The root node of the tree is a leaf */ + size = 1; + } + + index->stat_n_leaf_pages = size; + + btr_estimate_number_of_different_key_vals(index); + } +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively quick and is used to calculate transient statistics that +are not saved on disk. +This was the only way to calculate statistics before the +Persistent Statistics feature was introduced. */ +UNIV_INTERN +void +dict_stats_update_transient( +/*========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + dict_index_t* index; + ulint sum_of_index_sizes = 0; + + /* Find out the sizes of the indexes and how many different values + for the key they approximately have */ + + index = dict_table_get_first_index(table); + + if (dict_table_is_discarded(table)) { + /* Nothing to do. */ + dict_stats_empty_table(table); + return; + } else if (index == NULL) { + /* Table definition is corrupt */ + + char buf[MAX_FULL_NAME_LEN]; + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: table %s has no indexes. " + "Cannot calculate statistics.\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf))); + dict_stats_empty_table(table); + return; + } + + for (; index != NULL; index = dict_table_get_next_index(index)) { + + ut_ad(!dict_index_is_univ(index)); + + if (index->type & DICT_FTS) { + continue; + } + + dict_stats_empty_index(index); + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + dict_stats_update_transient_for_index(index); + + sum_of_index_sizes += index->stat_index_size; + } + + index = dict_table_get_first_index(table); + + table->stat_n_rows = index->stat_n_diff_key_vals[ + dict_index_get_n_unique(index) - 1]; + + table->stat_clustered_index_size = index->stat_index_size; + + table->stat_sum_of_other_index_sizes = sum_of_index_sizes + - index->stat_index_size; + + table->stats_last_recalc = ut_time(); + + table->stat_modified_counter = 0; + + table->stat_initialized = TRUE; +} + +/* @{ Pseudo code about the relation between the following functions + +let N = N_SAMPLE_PAGES(index) + +dict_stats_analyze_index() + for each n_prefix + search for good enough level: + dict_stats_analyze_index_level() // only called if level has <= N pages + // full scan of the level in one mtr + collect statistics about the given level + if we are not satisfied with the level, search next lower level + we have found a good enough level here + dict_stats_analyze_index_for_n_prefix(that level, stats collected above) + // full scan of the level in one mtr + dive below some records and analyze the leaf page there: + dict_stats_analyze_index_below_cur() +@} */ + +/*********************************************************************//** +Find the total number and the number of distinct keys on a given level in +an index. 
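+For example (made-up data): for a two-column index, a level holding the
+records (1,a), (1,b), (2,a) has 3 records in total, 2 distinct 1-column
+prefixes and 3 distinct 2-column prefixes.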
+Each of the 1..n_uniq prefixes is looked up and the results are
+saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of
+records on the level is saved in total_recs.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[0..n_uniq - 1]; record indexing starts from the leftmost
+record on the level, counts from 0 and continues across page boundaries. */
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		level,		/*!< in: level */
+	ib_uint64_t*	n_diff,		/*!< out: array for number of
+					distinct keys for all prefixes */
+	ib_uint64_t*	total_recs,	/*!< out: total number of records */
+	ib_uint64_t*	total_pages,	/*!< out: total number of pages */
+	boundaries_t*	n_diff_boundaries,/*!< out: boundaries of the groups
+					of distinct keys */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	ulint		n_uniq;
+	mem_heap_t*	heap;
+	btr_pcur_t	pcur;
+	const page_t*	page;
+	const rec_t*	rec;
+	const rec_t*	prev_rec;
+	bool		prev_rec_is_copied;
+	byte*		prev_rec_buf = NULL;
+	ulint		prev_rec_buf_size = 0;
+	ulint*		rec_offsets;
+	ulint*		prev_rec_offsets;
+	ulint		i;
+
+	DEBUG_PRINTF("    %s(table=%s, index=%s, level=%lu)\n", __func__,
+		     index->table->name, index->name, level);
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_S_LOCK));
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	/* elements in the n_diff array are 0..n_uniq-1 (inclusive) */
+	memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0]));
+
+	/* Allocate space for the offsets header (the allocation size at
+	offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+	so that this will never be less than the size calculated in
+	rec_get_offsets_func(). */
+	i = (REC_OFFS_HEADER_SIZE + 1 + 1) + index->n_fields;
+
+	heap = mem_heap_create((2 * sizeof *rec_offsets) * i);
+	rec_offsets = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof *rec_offsets));
+	prev_rec_offsets = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof *prev_rec_offsets));
+	rec_offs_set_n_alloc(rec_offsets, i);
+	rec_offs_set_n_alloc(prev_rec_offsets, i);
+
+	/* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */
+	if (n_diff_boundaries != NULL) {
+		for (i = 0; i < n_uniq; i++) {
+			n_diff_boundaries[i].erase(
+				n_diff_boundaries[i].begin(),
+				n_diff_boundaries[i].end());
+		}
+	}
+
+	/* Position pcur on the leftmost record on the leftmost page
+	on the desired level. */
+
+	btr_pcur_open_at_index_side(
+		true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED,
+		&pcur, true, level, mtr);
+	btr_pcur_move_to_next_on_page(&pcur);
+
+	page = btr_pcur_get_page(&pcur);
+
+	/* The page must not be empty, except when
+	it is the root page (and the whole index is empty). */
+	ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
+	ut_ad(btr_pcur_get_rec(&pcur)
+	      == page_rec_get_next_const(page_get_infimum_rec(page)));
+
+	/* check that we are indeed on the desired level */
+	ut_a(btr_page_get_level(page, mtr) == level);
+
+	/* there should not be any pages on the left */
+	ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+	/* check whether the first record on the leftmost page is marked
+	as such, if we are on a non-leaf level */
+	ut_a((level == 0)
+	     == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+			  btr_pcur_get_rec(&pcur), page_is_comp(page))));
+
+	prev_rec = NULL;
+	prev_rec_is_copied = false;
+
+	/* no records by default */
+	*total_recs = 0;
+
+	*total_pages = 0;
+
+	/* iterate over all user records on this level and compare every
+	two adjacent records, including the last record on page X and the
+	first record on page X+1 */
+	for (;
+	     btr_pcur_is_on_user_rec(&pcur);
+	     btr_pcur_move_to_next_user_rec(&pcur, mtr)) {
+
+		ulint	matched_fields = 0;
+		ulint	matched_bytes = 0;
+		bool	rec_is_last_on_page;
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		/* If rec and prev_rec are on different pages, then prev_rec
+		must have been copied, because we hold latch only on the page
+		where rec resides. */
+		if (prev_rec != NULL
+		    && page_align(rec) != page_align(prev_rec)) {
+
+			ut_a(prev_rec_is_copied);
+		}
+
+		rec_is_last_on_page =
+			page_rec_is_supremum(page_rec_get_next_const(rec));
+
+		/* increment the pages counter at the end of each page */
+		if (rec_is_last_on_page) {
+
+			(*total_pages)++;
+		}
+
+		/* Skip delete-marked records on the leaf level. If we
+		do not skip them, then ANALYZE quickly after DELETE
+		could count them or not (purge may have already wiped
+		them away) which brings non-determinism. We skip only
+		leaf-level delete marks because delete marks on
+		non-leaf level do not make sense. */
+		if (level == 0 &&
+		    rec_get_deleted_flag(
+			    rec,
+			    page_is_comp(btr_pcur_get_page(&pcur)))) {
+
+			if (rec_is_last_on_page
+			    && !prev_rec_is_copied
+			    && prev_rec != NULL) {
+				/* copy prev_rec */
+
+				prev_rec_offsets = rec_get_offsets(
+					prev_rec, index, prev_rec_offsets,
+					n_uniq, &heap);
+
+				prev_rec = rec_copy_prefix_to_buf(
+					prev_rec, index,
+					rec_offs_n_fields(prev_rec_offsets),
+					&prev_rec_buf, &prev_rec_buf_size);
+
+				prev_rec_is_copied = true;
+			}
+
+			continue;
+		}
+
+		rec_offsets = rec_get_offsets(
+			rec, index, rec_offsets, n_uniq, &heap);
+
+		(*total_recs)++;
+
+		if (prev_rec != NULL) {
+			prev_rec_offsets = rec_get_offsets(
+				prev_rec, index, prev_rec_offsets,
+				n_uniq, &heap);
+
+			cmp_rec_rec_with_match(rec,
+					       prev_rec,
+					       rec_offsets,
+					       prev_rec_offsets,
+					       index,
+					       FALSE,
+					       &matched_fields,
+					       &matched_bytes);
+
+			for (i = matched_fields; i < n_uniq; i++) {
+
+				if (n_diff_boundaries != NULL) {
+					/* push the index of the previous
+					record, that is - the last one from
+					a group of equal keys */
+
+					ib_uint64_t	idx;
+
+					/* the index of the current record
+					is total_recs - 1, the index of the
+					previous record is total_recs - 2;
+					we know that idx is not going to
+					become negative here because if we
+					are in this branch then there is a
+					previous record and thus
+					total_recs >= 2 */
+					idx = *total_recs - 2;
+
+					n_diff_boundaries[i].push_back(idx);
+				}
+
+				/* increment the number of different keys
+				for n_prefix=i+1 (e.g.
if i=0 then we increment + for n_prefix=1 which is stored in n_diff[0]) */ + n_diff[i]++; + } + } else { + /* this is the first non-delete marked record */ + for (i = 0; i < n_uniq; i++) { + n_diff[i] = 1; + } + } + + if (rec_is_last_on_page) { + /* end of a page has been reached */ + + /* we need to copy the record instead of assigning + like prev_rec = rec; because when we traverse the + records on this level at some point we will jump from + one page to the next and then rec and prev_rec will + be on different pages and + btr_pcur_move_to_next_user_rec() will release the + latch on the page that prev_rec is on */ + prev_rec = rec_copy_prefix_to_buf( + rec, index, rec_offs_n_fields(rec_offsets), + &prev_rec_buf, &prev_rec_buf_size); + prev_rec_is_copied = true; + + } else { + /* still on the same page, the next call to + btr_pcur_move_to_next_user_rec() will not jump + on the next page, we can simply assign pointers + instead of copying the records like above */ + + prev_rec = rec; + prev_rec_is_copied = false; + } + } + + /* if *total_pages is left untouched then the above loop was not + entered at all and there is one page in the whole tree which is + empty or the loop was entered but this is level 0, contains one page + and all records are delete-marked */ + if (*total_pages == 0) { + + ut_ad(level == 0); + ut_ad(*total_recs == 0); + + *total_pages = 1; + } + + /* if there are records on this level and boundaries + should be saved */ + if (*total_recs > 0 && n_diff_boundaries != NULL) { + + /* remember the index of the last record on the level as the + last one from the last group of equal keys; this holds for + all possible prefixes */ + for (i = 0; i < n_uniq; i++) { + ib_uint64_t idx; + + idx = *total_recs - 1; + + n_diff_boundaries[i].push_back(idx); + } + } + + /* now in n_diff_boundaries[i] there are exactly n_diff[i] integers, + for i=0..n_uniq-1 */ + +#ifdef UNIV_STATS_DEBUG + for (i = 0; i < n_uniq; i++) { + + DEBUG_PRINTF(" %s(): total recs: " UINT64PF + ", total pages: " UINT64PF + ", n_diff[%lu]: " UINT64PF "\n", + __func__, *total_recs, + *total_pages, + i, n_diff[i]); + +#if 0 + if (n_diff_boundaries != NULL) { + ib_uint64_t j; + + DEBUG_PRINTF(" %s(): boundaries[%lu]: ", + __func__, i); + + for (j = 0; j < n_diff[i]; j++) { + ib_uint64_t idx; + + idx = n_diff_boundaries[i][j]; + + DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ", + j, idx); + } + DEBUG_PRINTF("\n"); + } +#endif + } +#endif /* UNIV_STATS_DEBUG */ + + /* Release the latch on the last page, because that is not done by + btr_pcur_close(). This function works also for non-leaf pages. */ + btr_leaf_page_release(btr_pcur_get_block(&pcur), BTR_SEARCH_LEAF, mtr); + + btr_pcur_close(&pcur); + + if (prev_rec_buf != NULL) { + + mem_free(prev_rec_buf); + } + + mem_heap_free(heap); +} + +/* aux enum for controlling the behavior of dict_stats_scan_page() @{ */ +enum page_scan_method_t { + COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED,/* scan all records on + the given page and count the number of + distinct ones, also ignore delete marked + records */ + QUIT_ON_FIRST_NON_BORING/* quit when the first record that differs + from its right neighbor is found */ +}; +/* @} */ + +/** Scan a page, reading records from left to right and counting the number +of distinct records (looking only at the first n_prefix +columns) and the number of external pages pointed by records from this page. 
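+For example (made-up page contents): with n_prefix=1, a page holding the
+records b,b,g,g,j contains n_diff=3 distinct records when scanned to the
+end (QUIT_ON_FIRST_NON_BORING would stop after b != g and report 2).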
+If scan_method is QUIT_ON_FIRST_NON_BORING then the function +will return as soon as it finds a record that does not match its neighbor +to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the +returned n_diff can either be 0 (empty page), 1 (the whole page has all keys +equal) or 2 (the function found a non-boring record and returned). +@param[out] out_rec record, or NULL +@param[out] offsets1 rec_get_offsets() working space (must +be big enough) +@param[out] offsets2 rec_get_offsets() working space (must +be big enough) +@param[in] index index of the page +@param[in] page the page to scan +@param[in] n_prefix look at the first n_prefix columns +@param[in] scan_method scan to the end of the page or not +@param[out] n_diff number of distinct records encountered +@param[out] n_external_pages if this is non-NULL then it will be set +to the number of externally stored pages which were encountered +@return offsets1 or offsets2 (the offsets of *out_rec), +or NULL if the page is empty and does not contain user records. */ +UNIV_INLINE +ulint* +dict_stats_scan_page( + const rec_t** out_rec, + ulint* offsets1, + ulint* offsets2, + dict_index_t* index, + const page_t* page, + ulint n_prefix, + page_scan_method_t scan_method, + ib_uint64_t* n_diff, + ib_uint64_t* n_external_pages) +{ + ulint* offsets_rec = offsets1; + ulint* offsets_next_rec = offsets2; + const rec_t* rec; + const rec_t* next_rec; + /* A dummy heap, to be passed to rec_get_offsets(). + Because offsets1,offsets2 should be big enough, + this memory heap should never be used. */ + mem_heap_t* heap = NULL; + const rec_t* (*get_next)(const rec_t*); + + if (scan_method == COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED) { + get_next = page_rec_get_next_non_del_marked; + } else { + get_next = page_rec_get_next_const; + } + + const bool should_count_external_pages = n_external_pages != NULL; + + if (should_count_external_pages) { + *n_external_pages = 0; + } + + rec = get_next(page_get_infimum_rec(page)); + + if (page_rec_is_supremum(rec)) { + /* the page is empty or contains only delete-marked records */ + *n_diff = 0; + *out_rec = NULL; + return(NULL); + } + + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + ULINT_UNDEFINED, &heap); + + if (should_count_external_pages) { + *n_external_pages += btr_rec_get_externally_stored_len( + rec, offsets_rec); + } + + next_rec = get_next(rec); + + *n_diff = 1; + + while (!page_rec_is_supremum(next_rec)) { + + ulint matched_fields = 0; + ulint matched_bytes = 0; + + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, + ULINT_UNDEFINED, + &heap); + + /* check whether rec != next_rec when looking at + the first n_prefix fields */ + cmp_rec_rec_with_match(rec, next_rec, + offsets_rec, offsets_next_rec, + index, FALSE, &matched_fields, + &matched_bytes); + + if (matched_fields < n_prefix) { + /* rec != next_rec, => rec is non-boring */ + + (*n_diff)++; + + if (scan_method == QUIT_ON_FIRST_NON_BORING) { + goto func_exit; + } + } + + rec = next_rec; + { + /* Assign offsets_rec = offsets_next_rec + so that offsets_rec matches with rec which + was just assigned rec = next_rec above. + Also need to point offsets_next_rec to the + place where offsets_rec was pointing before + because we have just 2 placeholders where + data is actually stored: + offsets_onstack1 and offsets_onstack2 and we + are using them in circular fashion + (offsets[_next]_rec are just pointers to + those placeholders). 
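+			In other words, after the swap offsets_rec again
+			describes rec and the buffer behind offsets_next_rec
+			is free to be reused for the following record. */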
+			ulint*	offsets_tmp;
+			offsets_tmp = offsets_rec;
+			offsets_rec = offsets_next_rec;
+			offsets_next_rec = offsets_tmp;
+		}
+
+		if (should_count_external_pages) {
+			*n_external_pages += btr_rec_get_externally_stored_len(
+				rec, offsets_rec);
+		}
+
+		next_rec = get_next(next_rec);
+	}
+
+func_exit:
+	/* offsets1,offsets2 should have been big enough */
+	ut_a(heap == NULL);
+	*out_rec = rec;
+	return(offsets_rec);
+}
+
+/** Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the first n_prefix
+columns. Also calculate the number of external pages pointed to by records
+on the leaf page. The results are returned in the n_diff and
+n_external_pages output parameters.
+@param[in]	cur	cursor
+@param[in]	n_prefix	look at the first n_prefix columns
+when comparing records
+@param[out]	n_diff	number of distinct records
+@param[out]	n_external_pages	number of external pages
+@param[in,out]	mtr	mini-transaction */
+static
+void
+dict_stats_analyze_index_below_cur(
+	const btr_cur_t*	cur,
+	ulint			n_prefix,
+	ib_uint64_t*		n_diff,
+	ib_uint64_t*		n_external_pages,
+	mtr_t*			mtr)
+{
+	dict_index_t*	index;
+	ulint		space;
+	ulint		zip_size;
+	buf_block_t*	block;
+	ulint		page_no;
+	const page_t*	page;
+	mem_heap_t*	heap;
+	const rec_t*	rec;
+	ulint*		offsets1;
+	ulint*		offsets2;
+	ulint*		offsets_rec;
+	ulint		size;
+
+	index = btr_cur_get_index(cur);
+
+	/* Allocate offsets for the record and the node pointer, for
+	node pointer records. In a secondary index, the node pointer
+	record will consist of all index fields followed by a child
+	page number.
+	Allocate space for the offsets header (the allocation size at
+	offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+	so that this will never be less than the size calculated in
+	rec_get_offsets_func().
*/ + size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index); + + heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2)); + + offsets1 = static_cast<ulint*>(mem_heap_alloc( + heap, size * sizeof *offsets1)); + + offsets2 = static_cast<ulint*>(mem_heap_alloc( + heap, size * sizeof *offsets2)); + + rec_offs_set_n_alloc(offsets1, size); + rec_offs_set_n_alloc(offsets2, size); + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + rec = btr_cur_get_rec(cur); + + offsets_rec = rec_get_offsets(rec, index, offsets1, + ULINT_UNDEFINED, &heap); + + page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec); + + /* assume no external pages by default - in case we quit from this + function without analyzing any leaf pages */ + *n_external_pages = 0; + + /* descend to the leaf level on the B-tree */ + for (;;) { + + block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, + NULL /* no guessed block */, + BUF_GET, __FILE__, __LINE__, mtr); + + page = buf_block_get_frame(block); + + if (btr_page_get_level(page, mtr) == 0) { + /* leaf level */ + break; + } + /* else */ + + /* search for the first non-boring record on the page */ + offsets_rec = dict_stats_scan_page( + &rec, offsets1, offsets2, index, page, n_prefix, + QUIT_ON_FIRST_NON_BORING, n_diff, NULL); + + /* pages on level > 0 are not allowed to be empty */ + ut_a(offsets_rec != NULL); + /* if page is not empty (offsets_rec != NULL) then n_diff must + be > 0, otherwise there is a bug in dict_stats_scan_page() */ + ut_a(*n_diff > 0); + + if (*n_diff == 1) { + /* page has all keys equal and the end of the page + was reached by dict_stats_scan_page(), no need to + descend to the leaf level */ + mem_heap_free(heap); + /* can't get an estimate for n_external_pages here + because we do not dive to the leaf level, assume no + external pages (*n_external_pages was assigned to 0 + above). */ + return; + } + /* else */ + + /* when we instruct dict_stats_scan_page() to quit on the + first non-boring record it finds, then the returned n_diff + can either be 0 (empty page), 1 (page has all keys equal) or + 2 (non-boring record was found) */ + ut_a(*n_diff == 2); + + /* we have a non-boring record in rec, descend below it */ + + page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec); + } + + /* make sure we got a leaf page as a result from the above loop */ + ut_ad(btr_page_get_level(page, mtr) == 0); + + /* scan the leaf page and find the number of distinct keys, + when looking only at the first n_prefix columns; also estimate + the number of externally stored pages pointed by records on this + page */ + + offsets_rec = dict_stats_scan_page( + &rec, offsets1, offsets2, index, page, n_prefix, + COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, n_diff, + n_external_pages); + +#if 0 + DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n", + __func__, page_no, n_diff); +#endif + + mem_heap_free(heap); +} + +/** Input data that is used to calculate dict_index_t::stat_n_diff_key_vals[] +for each n-columns prefix (n from 1 to n_uniq). */ +struct n_diff_data_t { + /** Index of the level on which the descent through the btree + stopped. level 0 is the leaf level. 
This is >= 1 because we + avoid scanning the leaf level because it may contain too many + pages and doing so is useless when combined with the random dives - + if we are to scan the leaf level, this means a full scan and we can + simply do that instead of fiddling with picking random records higher + in the tree and to dive below them. At the start of the analyzing + we may decide to do full scan of the leaf level, but then this + structure is not used in that code path. */ + ulint level; + + /** Number of records on the level where the descend through the btree + stopped. When we scan the btree from the root, we stop at some mid + level, choose some records from it and dive below them towards a leaf + page to analyze. */ + ib_uint64_t n_recs_on_level; + + /** Number of different key values that were found on the mid level. */ + ib_uint64_t n_diff_on_level; + + /** Number of leaf pages that are analyzed. This is also the same as + the number of records that we pick from the mid level and dive below + them. */ + ib_uint64_t n_leaf_pages_to_analyze; + + /** Cumulative sum of the number of different key values that were + found on all analyzed pages. */ + ib_uint64_t n_diff_all_analyzed_pages; + + /** Cumulative sum of the number of external pages (stored outside of + the btree but in the same file segment). */ + ib_uint64_t n_external_pages_sum; +}; + +/** Estimate the number of different key values in an index when looking at +the first n_prefix columns. For a given level in an index select +n_diff_data->n_leaf_pages_to_analyze records from that level and dive below +them to the corresponding leaf pages, then scan those leaf pages and save the +sampling results in n_diff_data->n_diff_all_analyzed_pages. +@param[in] index index +@param[in] n_prefix look at first 'n_prefix' columns when +comparing records +@param[in] boundaries a vector that contains +n_diff_data->n_diff_on_level integers each of which represents the index (on +level 'level', counting from left/smallest to right/biggest from 0) of the +last record from each group of distinct keys +@param[in,out] n_diff_data n_diff_all_analyzed_pages and +n_external_pages_sum in this structure will be set by this function. The +members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the +caller in advance - they are used by some calculations inside this function +@param[in,out] mtr mini-transaction */ +static +void +dict_stats_analyze_index_for_n_prefix( + dict_index_t* index, + ulint n_prefix, + const boundaries_t* boundaries, + n_diff_data_t* n_diff_data, + mtr_t* mtr) +{ + btr_pcur_t pcur; + const page_t* page; + ib_uint64_t rec_idx; + ib_uint64_t i; + +#if 0 + DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu, " + "n_diff_on_level=" UINT64PF ")\n", + __func__, index->table->name, index->name, level, + n_prefix, n_diff_data->n_diff_on_level); +#endif + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_S_LOCK)); + + /* Position pcur on the leftmost record on the leftmost page + on the desired level. */ + + btr_pcur_open_at_index_side( + true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED, + &pcur, true, n_diff_data->level, mtr); + btr_pcur_move_to_next_on_page(&pcur); + + page = btr_pcur_get_page(&pcur); + + const rec_t* first_rec = btr_pcur_get_rec(&pcur); + + /* We shouldn't be scanning the leaf level. The caller of this function + should have stopped the descend on level 1 or higher. 
*/ + ut_ad(n_diff_data->level > 0); + ut_ad(!page_is_leaf(page)); + + /* The page must not be empty, except when + it is the root page (and the whole index is empty). */ + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + ut_ad(first_rec == page_rec_get_next_const(page_get_infimum_rec(page))); + + /* check that we are indeed on the desired level */ + ut_a(btr_page_get_level(page, mtr) == n_diff_data->level); + + /* there should not be any pages on the left */ + ut_a(btr_page_get_prev(page, mtr) == FIL_NULL); + + /* check whether the first record on the leftmost page is marked + as such; we are on a non-leaf level */ + ut_a(rec_get_info_bits(first_rec, page_is_comp(page)) + & REC_INFO_MIN_REC_FLAG); + + const ib_uint64_t last_idx_on_level = boundaries->at( + static_cast<unsigned>(n_diff_data->n_diff_on_level - 1)); + + rec_idx = 0; + + n_diff_data->n_diff_all_analyzed_pages = 0; + n_diff_data->n_external_pages_sum = 0; + + for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) { + /* there are n_diff_on_level elements + in 'boundaries' and we divide those elements + into n_leaf_pages_to_analyze segments, for example: + + let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then: + segment i=0: [0, 24] + segment i=1: [25, 49] + segment i=2: [50, 74] + segment i=3: [75, 99] or + + let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then: + segment i=0: [0, 0] or + + let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then: + segment i=0: [0, 0] + segment i=1: [1, 1] or + + let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then: + segment i=0: [0, 0] + segment i=1: [1, 2] + segment i=2: [3, 4] + segment i=3: [5, 6] + segment i=4: [7, 8] + segment i=5: [9, 10] + segment i=6: [11, 12] + + then we select a random record from each segment and dive + below it */ + const ib_uint64_t n_diff = n_diff_data->n_diff_on_level; + const ib_uint64_t n_pick + = n_diff_data->n_leaf_pages_to_analyze; + + const ib_uint64_t left = n_diff * i / n_pick; + const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1; + + ut_a(left <= right); + ut_a(right <= last_idx_on_level); + + /* we do not pass (left, right) because we do not want to ask + ut_rnd_interval() to work with too big numbers since + ib_uint64_t could be bigger than ulint */ + const ulint rnd = ut_rnd_interval( + 0, static_cast<ulint>(right - left)); + + const ib_uint64_t dive_below_idx + = boundaries->at(static_cast<unsigned>(left + rnd)); + +#if 0 + DEBUG_PRINTF(" %s(): dive below record with index=" + UINT64PF "\n", __func__, dive_below_idx); +#endif + + /* seek to the record with index dive_below_idx */ + while (rec_idx < dive_below_idx + && btr_pcur_is_on_user_rec(&pcur)) { + + btr_pcur_move_to_next_user_rec(&pcur, mtr); + rec_idx++; + } + + /* if the level has finished before the record we are + searching for, this means that the B-tree has changed in + the meantime, quit our sampling and use whatever stats + we have collected so far */ + if (rec_idx < dive_below_idx) { + + ut_ad(!btr_pcur_is_on_user_rec(&pcur)); + break; + } + + /* it could be that the tree has changed in such a way that + the record under dive_below_idx is the supremum record, in + this case rec_idx == dive_below_idx and pcur is positioned + on the supremum, we do not want to dive below it */ + if (!btr_pcur_is_on_user_rec(&pcur)) { + break; + } + + ut_a(rec_idx == dive_below_idx); + + ib_uint64_t n_diff_on_leaf_page; + ib_uint64_t n_external_pages; + + dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur), + n_prefix, + &n_diff_on_leaf_page, + &n_external_pages, + mtr); + + /* We 
adjust n_diff_on_leaf_page here to avoid counting + one record twice - once as the last on some page and once + as the first on another page. Consider the following example: + Leaf level: + page: (2,2,2,2,3,3) + ... many pages like (3,3,3,3,3,3) ... + page: (3,3,3,3,5,5) + ... many pages like (5,5,5,5,5,5) ... + page: (5,5,5,5,8,8) + page: (8,8,8,8,9,9) + our algo would (correctly) get an estimate that there are + 2 distinct records per page (average). Having 4 pages below + non-boring records, it would (wrongly) estimate the number + of distinct records to 8. */ + if (n_diff_on_leaf_page > 0) { + n_diff_on_leaf_page--; + } + + n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page; + + n_diff_data->n_external_pages_sum += n_external_pages; + } + + btr_pcur_close(&pcur); +} + +/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[]. +@param[in] n_diff_data input data to use to derive the results +@param[in,out] index index whose stat_n_diff_key_vals[] to set */ +UNIV_INLINE +void +dict_stats_index_set_n_diff( + const n_diff_data_t* n_diff_data, + dict_index_t* index) +{ + for (ulint n_prefix = dict_index_get_n_unique(index); + n_prefix >= 1; + n_prefix--) { + /* n_diff_all_analyzed_pages can be 0 here if + all the leaf pages sampled contained only + delete-marked records. In this case we should assign + 0 to index->stat_n_diff_key_vals[n_prefix - 1], which + the formula below does. */ + + const n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + ut_ad(data->n_leaf_pages_to_analyze > 0); + ut_ad(data->n_recs_on_level > 0); + + ulint n_ordinary_leaf_pages; + + if (data->level == 1) { + /* If we know the number of records on level 1, then + this number is the same as the number of pages on + level 0 (leaf). */ + n_ordinary_leaf_pages = data->n_recs_on_level; + } else { + /* If we analyzed D ordinary leaf pages and found E + external pages in total linked from those D ordinary + leaf pages, then this means that the ratio + ordinary/external is D/E. Then the ratio ordinary/total + is D / (D + E). Knowing that the total number of pages + is T (including ordinary and external) then we estimate + that the total number of ordinary leaf pages is + T * D / (D + E). */ + n_ordinary_leaf_pages + = index->stat_n_leaf_pages + * data->n_leaf_pages_to_analyze + / (data->n_leaf_pages_to_analyze + + data->n_external_pages_sum); + } + + /* See REF01 for an explanation of the algorithm */ + index->stat_n_diff_key_vals[n_prefix - 1] + = n_ordinary_leaf_pages + + * data->n_diff_on_level + / data->n_recs_on_level + + * data->n_diff_all_analyzed_pages + / data->n_leaf_pages_to_analyze; + + index->stat_n_sample_sizes[n_prefix - 1] + = data->n_leaf_pages_to_analyze; + + DEBUG_PRINTF(" %s(): n_diff=" UINT64PF " for n_prefix=%lu" + " (%lu" + " * " UINT64PF " / " UINT64PF + " * " UINT64PF " / " UINT64PF ")\n", + __func__, + index->stat_n_diff_key_vals[n_prefix - 1], + n_prefix, + index->stat_n_leaf_pages, + data->n_diff_on_level, + data->n_recs_on_level, + data->n_diff_all_analyzed_pages, + data->n_leaf_pages_to_analyze); + } +} + +/*********************************************************************//** +Calculates new statistics for a given index and saves them to the index +members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and +stat_n_leaf_pages. This function could be slow. 
*/ +static +void +dict_stats_analyze_index( +/*=====================*/ + dict_index_t* index) /*!< in/out: index to analyze */ +{ + ulint root_level; + ulint level; + bool level_is_analyzed; + ulint n_uniq; + ulint n_prefix; + ib_uint64_t total_recs; + ib_uint64_t total_pages; + mtr_t mtr; + ulint size; + DBUG_ENTER("dict_stats_analyze_index"); + + DBUG_PRINT("info", ("index: %s, online status: %d", index->name, + dict_index_get_online_status(index))); + + DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name); + + dict_stats_empty_index(index); + + mtr_start(&mtr); + + mtr_s_lock(dict_index_get_lock(index), &mtr); + + size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); + + if (size != ULINT_UNDEFINED) { + index->stat_index_size = size; + size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr); + } + + /* Release the X locks on the root page taken by btr_get_size() */ + mtr_commit(&mtr); + + switch (size) { + case ULINT_UNDEFINED: + dict_stats_assert_initialized_index(index); + DBUG_VOID_RETURN; + case 0: + /* The root node of the tree is a leaf */ + size = 1; + } + + index->stat_n_leaf_pages = size; + + mtr_start(&mtr); + + mtr_s_lock(dict_index_get_lock(index), &mtr); + + root_level = btr_height_get(index, &mtr); + + n_uniq = dict_index_get_n_unique(index); + + /* If the tree has just one level (and one page) or if the user + has requested to sample too many pages then do full scan. + + For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index) + will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf + pages will be sampled. If that number is bigger than the total + number of leaf pages then do full scan of the leaf level instead + since it will be faster and will give better results. */ + + if (root_level == 0 + || N_SAMPLE_PAGES(index) * n_uniq > index->stat_n_leaf_pages) { + + if (root_level == 0) { + DEBUG_PRINTF(" %s(): just one page, " + "doing full scan\n", __func__); + } else { + DEBUG_PRINTF(" %s(): too many pages requested for " + "sampling, doing full scan\n", __func__); + } + + /* do full scan of level 0; save results directly + into the index */ + + dict_stats_analyze_index_level(index, + 0 /* leaf level */, + index->stat_n_diff_key_vals, + &total_recs, + &total_pages, + NULL /* boundaries not needed */, + &mtr); + + for (ulint i = 0; i < n_uniq; i++) { + index->stat_n_sample_sizes[i] = total_pages; + } + + mtr_commit(&mtr); + + dict_stats_assert_initialized_index(index); + DBUG_VOID_RETURN; + } + + /* For each level that is being scanned in the btree, this contains the + number of different key values for all possible n-column prefixes. */ + ib_uint64_t* n_diff_on_level = new ib_uint64_t[n_uniq]; + + /* For each level that is being scanned in the btree, this contains the + index of the last record from each group of equal records (when + comparing only the first n columns, n=1..n_uniq). */ + boundaries_t* n_diff_boundaries = new boundaries_t[n_uniq]; + + /* For each n-column prefix this array contains the input data that is + used to calculate dict_index_t::stat_n_diff_key_vals[]. 
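+	There is one element per n-column prefix: n_diff_data[0] describes
+	the 1-column prefixes and n_diff_data[n_uniq - 1] the full
+	n_uniq-column prefix.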
*/ + n_diff_data_t* n_diff_data = new n_diff_data_t[n_uniq]; + + /* total_recs is also used to estimate the number of pages on one + level below, so at the start we have 1 page (the root) */ + total_recs = 1; + + /* Here we use the following optimization: + If we find that level L is the first one (searching from the + root) that contains at least D distinct keys when looking at + the first n_prefix columns, then: + if we look at the first n_prefix-1 columns then the first + level that contains D distinct keys will be either L or a + lower one. + So if we find that the first level containing D distinct + keys (on n_prefix columns) is L, we continue from L when + searching for D distinct keys on n_prefix-1 columns. */ + level = root_level; + level_is_analyzed = false; + + for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) { + + DEBUG_PRINTF(" %s(): searching level with >=%llu " + "distinct records, n_prefix=%lu\n", + __func__, N_DIFF_REQUIRED(index), n_prefix); + + /* Commit the mtr to release the tree S lock to allow + other threads to do some work too. */ + mtr_commit(&mtr); + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + if (root_level != btr_height_get(index, &mtr)) { + /* Just quit if the tree has changed beyond + recognition here. The old stats from previous + runs will remain in the values that we have + not calculated yet. Initially when the index + object is created the stats members are given + some sensible values so leaving them untouched + here even the first time will not cause us to + read uninitialized memory later. */ + break; + } + + /* check whether we should pick the current level; + we pick level 1 even if it does not have enough + distinct records because we do not want to scan the + leaf level because it may contain too many records */ + if (level_is_analyzed + && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index) + || level == 1)) { + + goto found_level; + } + + /* search for a level that contains enough distinct records */ + + if (level_is_analyzed && level > 1) { + + /* if this does not hold we should be on + "found_level" instead of here */ + ut_ad(n_diff_on_level[n_prefix - 1] + < N_DIFF_REQUIRED(index)); + + level--; + level_is_analyzed = false; + } + + /* descend into the tree, searching for "good enough" level */ + for (;;) { + + /* make sure we do not scan the leaf level + accidentally, it may contain too many pages */ + ut_ad(level > 0); + + /* scanning the same level twice is an optimization + bug */ + ut_ad(!level_is_analyzed); + + /* Do not scan if this would read too many pages. + Here we use the following fact: + the number of pages on level L equals the number + of records on level L+1, thus we deduce that the + following call would scan total_recs pages, because + total_recs is left from the previous iteration when + we scanned one level upper or we have not scanned any + levels yet in which case total_recs is 1. 
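The "records on level L+1 equal pages on level L" fact used just above follows from each non-leaf record being a node pointer to exactly one child page; that is why total_recs, left over from the last level scanned, doubles as the page count of the next level down. A toy consistency check (page and record counts invented):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
	/* hypothetical 3-level tree: 1 root page with 3 records,
	3 pages with 40 records on level 1, 40 leaf pages */
	std::vector<std::size_t> pages = {40, 3, 1};	/* [level] */
	std::vector<std::size_t> recs = {900, 40, 3};	/* [level] */

	for (std::size_t level = 1; level < pages.size(); level++) {
		/* each record on level L points to one page on L - 1 */
		assert(recs[level] == pages[level - 1]);
	}
	return(0);
}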
*/ + if (total_recs > N_SAMPLE_PAGES(index)) { + + /* if the above cond is true then we are + not at the root level since on the root + level total_recs == 1 (set before we + enter the n-prefix loop) and cannot + be > N_SAMPLE_PAGES(index) */ + ut_a(level != root_level); + + /* step one level back and be satisfied with + whatever it contains */ + level++; + level_is_analyzed = true; + + break; + } + + dict_stats_analyze_index_level(index, + level, + n_diff_on_level, + &total_recs, + &total_pages, + n_diff_boundaries, + &mtr); + + level_is_analyzed = true; + + if (level == 1 + || n_diff_on_level[n_prefix - 1] + >= N_DIFF_REQUIRED(index)) { + /* we have reached the last level we could scan + or we found a good level with many distinct + records */ + break; + } + + level--; + level_is_analyzed = false; + } +found_level: + + DEBUG_PRINTF(" %s(): found level %lu that has " UINT64PF + " distinct records for n_prefix=%lu\n", + __func__, level, n_diff_on_level[n_prefix - 1], + n_prefix); + /* here we are either on level 1 or the level that we are on + contains >= N_DIFF_REQUIRED distinct keys or we did not scan + deeper levels because they would contain too many pages */ + + ut_ad(level > 0); + + ut_ad(level_is_analyzed); + + /* if any of these is 0 then there is exactly one page in the + B-tree and it is empty and we should have done full scan and + should not be here */ + ut_ad(total_recs > 0); + ut_ad(n_diff_on_level[n_prefix - 1] > 0); + + ut_ad(N_SAMPLE_PAGES(index) > 0); + + n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + data->level = level; + + data->n_recs_on_level = total_recs; + + data->n_diff_on_level = n_diff_on_level[n_prefix - 1]; + + data->n_leaf_pages_to_analyze = std::min( + N_SAMPLE_PAGES(index), + n_diff_on_level[n_prefix - 1]); + + /* pick some records from this level and dive below them for + the given n_prefix */ + + dict_stats_analyze_index_for_n_prefix( + index, n_prefix, &n_diff_boundaries[n_prefix - 1], + data, &mtr); + } + + mtr_commit(&mtr); + + delete[] n_diff_boundaries; + + delete[] n_diff_on_level; + + /* n_prefix == 0 means that the above loop did not end up prematurely + due to tree being changed and so n_diff_data[] is set up. */ + if (n_prefix == 0) { + dict_stats_index_set_n_diff(n_diff_data, index); + } + + delete[] n_diff_data; + + dict_stats_assert_initialized_index(index); + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively slow and is used to calculate persistent statistics that +will be saved on disk. 
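One detail above is easy to miss: the number of dives is capped with std::min() at n_diff_on_level, because each dive descends below the last record of one group of equal n_prefix values, and there cannot be more such dive points than there are distinct values on the chosen level. A two-line illustration (figures hypothetical):

#include <algorithm>
#include <cstdint>

int main()
{
	std::uint64_t	n_sample_pages = 20;	/* sampling budget */
	std::uint64_t	n_diff_on_level = 7;	/* only 7 distinct groups */

	/* only 7 dive points exist, so only 7 pages get analyzed */
	std::uint64_t	n_dives = std::min(n_sample_pages, n_diff_on_level);

	return(n_dives == 7 ? 0 : 1);
}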
+@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_update_persistent( +/*=========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + dict_index_t* index; + + DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name); + + dict_table_stats_lock(table, RW_X_LATCH); + + /* analyze the clustered index first */ + + index = dict_table_get_first_index(table); + + if (index == NULL + || dict_index_is_corrupted(index) + || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) { + + /* Table definition is corrupt */ + dict_table_stats_unlock(table, RW_X_LATCH); + dict_stats_empty_table(table); + + return(DB_CORRUPTION); + } + + ut_ad(!dict_index_is_univ(index)); + + dict_stats_analyze_index(index); + + ulint n_unique = dict_index_get_n_unique(index); + + table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1]; + + table->stat_clustered_index_size = index->stat_index_size; + + /* analyze other indexes from the table, if any */ + + table->stat_sum_of_other_index_sizes = 0; + + for (index = dict_table_get_next_index(index); + index != NULL; + index = dict_table_get_next_index(index)) { + + ut_ad(!dict_index_is_univ(index)); + + if (index->type & DICT_FTS) { + continue; + } + + dict_stats_empty_index(index); + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + if (!(table->stats_bg_flag & BG_STAT_SHOULD_QUIT)) { + dict_stats_analyze_index(index); + } + + table->stat_sum_of_other_index_sizes + += index->stat_index_size; + } + + table->stats_last_recalc = ut_time(); + + table->stat_modified_counter = 0; + + table->stat_initialized = TRUE; + + dict_stats_assert_initialized(table); + + dict_table_stats_unlock(table, RW_X_LATCH); + + return(DB_SUCCESS); +} + +#include "mysql_com.h" +/** Save an individual index's statistic into the persistent statistics +storage. +@param[in] index index to be updated +@param[in] last_update timestamp of the stat +@param[in] stat_name name of the stat +@param[in] stat_value value of the stat +@param[in] sample_size n pages sampled or NULL +@param[in] stat_description description of the stat +@param[in,out] trx in case of NULL the function will +allocate and free the trx object. If it is not NULL then it will be +rolled back only in the case of error, but not freed. 
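The bitwise check above accepts exactly two values of index->type for the first index: DICT_CLUSTERED and DICT_CLUSTERED | DICT_UNIQUE. OR-ing DICT_UNIQUE into the type erases the one bit that is allowed to vary, so any other set bit (or a missing DICT_CLUSTERED) fails the comparison. A self-contained check, assuming the conventional flag values DICT_CLUSTERED = 1 and DICT_UNIQUE = 2 (the real constants live in dict0mem.h and are assumed here, not quoted):

#include <cassert>

static const unsigned	CLUSTERED = 1;	/* assumed DICT_CLUSTERED */
static const unsigned	UNIQUE = 2;	/* assumed DICT_UNIQUE */

static bool
clustered_type_ok(unsigned type)
{
	return((type | UNIQUE) == (CLUSTERED | UNIQUE));
}

int main()
{
	assert(clustered_type_ok(CLUSTERED));		/* plain clustered */
	assert(clustered_type_ok(CLUSTERED | UNIQUE));	/* unique clustered */
	assert(!clustered_type_ok(UNIQUE));		/* secondary unique */
	assert(!clustered_type_ok(CLUSTERED | 4));	/* stray extra bit */
	return(0);
}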
+@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_save_index_stat( + dict_index_t* index, + lint last_update, + const char* stat_name, + ib_uint64_t stat_value, + ib_uint64_t* sample_size, + const char* stat_description, + trx_t* trx) +{ + pars_info_t* pinfo; + dberr_t ret; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + dict_fs2utf8(index->table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name)); + pars_info_add_str_literal(pinfo, "index_name", index->name); + UNIV_MEM_ASSERT_RW_ABORT(&last_update, 4); + pars_info_add_int4_literal(pinfo, "last_update", last_update); + UNIV_MEM_ASSERT_RW_ABORT(stat_name, strlen(stat_name)); + pars_info_add_str_literal(pinfo, "stat_name", stat_name); + UNIV_MEM_ASSERT_RW_ABORT(&stat_value, 8); + pars_info_add_ull_literal(pinfo, "stat_value", stat_value); + if (sample_size != NULL) { + UNIV_MEM_ASSERT_RW_ABORT(sample_size, 8); + pars_info_add_ull_literal(pinfo, "sample_size", *sample_size); + } else { + pars_info_add_literal(pinfo, "sample_size", NULL, + UNIV_SQL_NULL, DATA_FIXBINARY, 0); + } + UNIV_MEM_ASSERT_RW_ABORT(stat_description, strlen(stat_description)); + pars_info_add_str_literal(pinfo, "stat_description", + stat_description); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE INDEX_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" INDEX_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name AND\n" + "stat_name = :stat_name;\n" + + "INSERT INTO \"" INDEX_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":index_name,\n" + ":last_update,\n" + ":stat_name,\n" + ":stat_value,\n" + ":sample_size,\n" + ":stat_description\n" + ");\n" + "END;", trx); + + if (ret != DB_SUCCESS) { + char buf_table[MAX_FULL_NAME_LEN]; + char buf_index[MAX_FULL_NAME_LEN]; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save index statistics for table " + "%s, index %s, stat name \"%s\": %s\n", + ut_format_name(index->table->name, TRUE, + buf_table, sizeof(buf_table)), + ut_format_name(index->name, FALSE, + buf_index, sizeof(buf_index)), + stat_name, ut_strerr(ret)); + } + + return(ret); +} + +/** Save the table's statistics into the persistent statistics storage. 
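INDEX_STATS_SAVE above is a hand-rolled upsert: the DELETE clears any row with the same (database_name, table_name, index_name, stat_name) key, the INSERT writes the fresh value, and both run in the caller's transaction so an error rolls the pair back together. In everyday SQL the net effect resembles a single REPLACE keyed on that primary key; a sketch of the equivalent, for intuition only (the embedded procedure above is what actually executes):

/* illustrative equivalent, not executed anywhere in this file */
static const char*	index_stats_upsert_equivalent =
	"REPLACE INTO mysql.innodb_index_stats\n"
	"(database_name, table_name, index_name, last_update,\n"
	" stat_name, stat_value, sample_size, stat_description)\n"
	"VALUES (?, ?, ?, ?, ?, ?, ?, ?);";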
+@param[in] table_orig table whose stats to save +@param[in] only_for_index if this is non-NULL, then stats for indexes +that are not equal to it will not be saved, if NULL, then all +indexes' stats are saved +@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_save( +/*============*/ + dict_table_t* table_orig, + const index_id_t* only_for_index) +{ + pars_info_t* pinfo; + lint now; + dberr_t ret; + dict_table_t* table; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + table = dict_stats_snapshot_create(table_orig); + + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + + /* MySQL's timestamp is 4 byte, so we use + pars_info_add_int4_literal() which takes a lint arg, so "now" is + lint */ + now = (lint) ut_time(); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + pars_info_add_int4_literal(pinfo, "last_update", now); + pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows); + pars_info_add_ull_literal(pinfo, "clustered_index_size", + table->stat_clustered_index_size); + pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes", + table->stat_sum_of_other_index_sizes); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE TABLE_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" TABLE_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + + "INSERT INTO \"" TABLE_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":last_update,\n" + ":n_rows,\n" + ":clustered_index_size,\n" + ":sum_of_other_index_sizes\n" + ");\n" + "END;", NULL); + + if (ret != DB_SUCCESS) { + char buf[MAX_FULL_NAME_LEN]; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save table statistics for table " + "%s: %s\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf)), + ut_strerr(ret)); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + dict_stats_snapshot_free(table); + + return(ret); + } + + trx_t* trx = trx_allocate_for_background(); + trx_start_if_not_started(trx); + + dict_index_t* index; + index_map_t indexes; + + /* Below we do all the modifications in innodb_index_stats in a single + transaction for performance reasons. Modifying more than one row in a + single transaction may deadlock with other transactions if they + lock the rows in different order. Other transaction could be for + example when we DROP a table and do + DELETE FROM innodb_index_stats WHERE database_name = '...' + AND table_name = '...'; which will affect more than one row. To + prevent deadlocks we always lock the rows in the same order - the + order of the PK, which is (database_name, table_name, index_name, + stat_name). This is why below we sort the indexes by name and then + for each index, do the mods ordered by stat_name. 
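The name ordering alone is sufficient because the per-index stats are also written in ascending stat_name order: n_diff_pfx01 .. n_diff_pfxNN sort before n_leaf_pages ('d' < 'l'), which sorts before size ('n' < 's'). index_map_t (declared elsewhere in this file) is presumably an ordered map keyed by index name, so iterating it visits indexes in sorted name order; a minimal demonstration of that property with std::map (index names invented):

#include <cassert>
#include <map>
#include <string>

int main()
{
	std::map<std::string, int>	indexes;

	/* inserted out of order on purpose */
	indexes["idx_b"] = 2;
	indexes["PRIMARY"] = 1;
	indexes["idx_a"] = 3;

	/* std::map iterates in ascending key order, so the rows reach
	innodb_index_stats in primary-key order regardless */
	std::string	prev;
	for (std::map<std::string, int>::const_iterator it = indexes.begin();
	     it != indexes.end(); ++it) {
		assert(prev <= it->first);
		prev = it->first;
	}
	return(0);
}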
*/ + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + indexes[index->name] = index; + } + + index_map_t::const_iterator it; + + for (it = indexes.begin(); it != indexes.end(); ++it) { + + index = it->second; + + if (only_for_index != NULL && index->id != *only_for_index) { + continue; + } + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_univ(index)); + + for (ulint i = 0; i < index->n_uniq; i++) { + + char stat_name[16]; + char stat_description[1024]; + ulint j; + + ut_snprintf(stat_name, sizeof(stat_name), + "n_diff_pfx%02lu", i + 1); + + /* craft a string that contains the columns names */ + ut_snprintf(stat_description, + sizeof(stat_description), + "%s", index->fields[0].name); + for (j = 1; j <= i; j++) { + size_t len; + + len = strlen(stat_description); + + ut_snprintf(stat_description + len, + sizeof(stat_description) - len, + ",%s", index->fields[j].name); + } + + ret = dict_stats_save_index_stat( + index, now, stat_name, + index->stat_n_diff_key_vals[i], + &index->stat_n_sample_sizes[i], + stat_description, trx); + + if (ret != DB_SUCCESS) { + goto end; + } + } + + ret = dict_stats_save_index_stat(index, now, "n_leaf_pages", + index->stat_n_leaf_pages, + NULL, + "Number of leaf pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat(index, now, "size", + index->stat_index_size, + NULL, + "Number of pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto end; + } + } + + trx_commit_for_mysql(trx); + +end: + trx_free_for_background(trx); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + dict_stats_snapshot_free(table); + + return(ret); +} + +/*********************************************************************//** +Called for the row that is selected by +SELECT ... FROM mysql.innodb_table_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to it. 
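Spelled out, the description loop above produces one row per prefix length: for an index on columns (a, b, c) the saved stat_description values are "a", "a,b" and "a,b,c" under stat_name n_diff_pfx01, n_diff_pfx02 and n_diff_pfx03. The same construction as a standalone sketch (column names hypothetical):

#include <cstdio>
#include <cstring>

int main()
{
	const char*	fields[] = {"a", "b", "c"};	/* hypothetical */
	const unsigned	n_uniq = 3;

	for (unsigned i = 0; i < n_uniq; i++) {
		char	stat_name[16];
		char	desc[1024];

		std::snprintf(stat_name, sizeof(stat_name),
			      "n_diff_pfx%02u", i + 1);

		/* first column, then ",col" appended per extra column,
		mirroring the ut_snprintf() loop above */
		std::snprintf(desc, sizeof(desc), "%s", fields[0]);
		for (unsigned j = 1; j <= i; j++) {
			std::size_t	len = std::strlen(desc);

			std::snprintf(desc + len, sizeof(desc) - len,
				      ",%s", fields[j]);
		}

		std::printf("%s -> \"%s\"\n", stat_name, desc);
	}
	return(0);
}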
+@return non-NULL dummy */ +static +ibool +dict_stats_fetch_table_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* table_void) /*!< out: table */ +{ + sel_node_t* node = (sel_node_t*) node_void; + dict_table_t* table = (dict_table_t*) table_void; + que_common_t* cnode; + int i; + + /* this should loop exactly 3 times - for + n_rows,clustered_index_size,sum_of_other_index_sizes */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_table_stats.n_rows */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_n_rows = mach_read_from_8(data); + + break; + + case 1: /* mysql.innodb_table_stats.clustered_index_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_clustered_index_size + = (ulint) mach_read_from_8(data); + + break; + + case 2: /* mysql.innodb_table_stats.sum_of_other_index_sizes */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_sum_of_other_index_sizes + = (ulint) mach_read_from_8(data); + + break; + + default: + + /* someone changed SELECT + n_rows,clustered_index_size,sum_of_other_index_sizes + to select more columns from innodb_table_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 3 this means someone changed the + SELECT n_rows,clustered_index_size,sum_of_other_index_sizes + to select less columns from innodb_table_stats without adjusting here; + if i > 3 we would have ut_error'ed earlier */ + ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/); + + /* XXX this is not used but returning non-NULL is necessary */ + return(TRUE); +} + +/** Aux struct used to pass a table and a boolean to +dict_stats_fetch_index_stats_step(). */ +struct index_fetch_t { + dict_table_t* table; /*!< table whose indexes are to be modified */ + bool stats_were_modified; /*!< will be set to true if at + least one index stats were modified */ +}; + +/*********************************************************************//** +Called for the rows that are selected by +SELECT ... FROM mysql.innodb_index_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to its indexes. +Let a table has N indexes and each index has Ui unique columns for i=1..N, +then mysql.innodb_index_stats will have SUM(Ui) i=1..N rows for that table. +So this function will be called SUM(Ui) times where SUM(Ui) is of magnitude +N*AVG(Ui). In each call it searches for the currently fetched index into +table->indexes linearly, assuming this list is not sorted. Thus, overall, +fetching all indexes' stats from mysql.innodb_index_stats is O(N^2) where N +is the number of indexes. +This can be improved if we sort table->indexes in a temporary area just once +and then search in that sorted list. Then the complexity will be O(N*log(N)). +We assume a table will not have more than 100 indexes, so we go with the +simpler N^2 algorithm. 
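If the O(N^2) behaviour described above ever mattered, the suggested fix is mechanical: collect the index pointers once, sort them by name, and binary-search per fetched row. A sketch under the assumption that index names are unique within a table (the fetched rows really carry a length instead of a NUL terminator, which is glossed over here):

#include <algorithm>
#include <cstring>
#include <vector>

struct index_stub {		/* stand-in for dict_index_t */
	const char*	name;
};

static bool
by_name(const index_stub* a, const index_stub* b)
{
	return(std::strcmp(a->name, b->name) < 0);
}

/* build once per table: O(N log N) */
static void
sort_indexes(std::vector<index_stub*>& v)
{
	std::sort(v.begin(), v.end(), by_name);
}

/* per fetched row: O(log N) instead of the linear scan above */
static index_stub*
find_index(const std::vector<index_stub*>& v, const char* name)
{
	index_stub	key = {name};
	std::vector<index_stub*>::const_iterator it
		= std::lower_bound(v.begin(), v.end(), &key, by_name);

	if (it != v.end() && std::strcmp((*it)->name, name) == 0) {
		return(*it);
	}
	return(0);	/* not found */
}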
+@return non-NULL dummy */ +static +ibool +dict_stats_fetch_index_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* arg_void) /*!< out: table + a flag that tells if we + modified anything */ +{ + sel_node_t* node = (sel_node_t*) node_void; + index_fetch_t* arg = (index_fetch_t*) arg_void; + dict_table_t* table = arg->table; + dict_index_t* index = NULL; + que_common_t* cnode; + const char* stat_name = NULL; + ulint stat_name_len = ULINT_UNDEFINED; + ib_uint64_t stat_value = UINT64_UNDEFINED; + ib_uint64_t sample_size = UINT64_UNDEFINED; + int i; + + /* this should loop exactly 4 times - for the columns that + were selected: index_name,stat_name,stat_value,sample_size */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_index_stats.index_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + /* search for index in table's indexes whose name + matches data; the fetched index name is in data, + has no terminating '\0' and has length len */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (strlen(index->name) == len + && memcmp(index->name, data, len) == 0) { + /* the corresponding index was found */ + break; + } + } + + /* if index is NULL here this means that + mysql.innodb_index_stats contains more rows than the + number of indexes in the table; this is ok, we just + return ignoring those extra rows; in other words + dict_stats_fetch_index_stats_step() has been called + for a row from index_stats with unknown index_name + column */ + if (index == NULL) { + + return(TRUE); + } + + break; + + case 1: /* mysql.innodb_index_stats.stat_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + ut_a(index != NULL); + + stat_name = (const char*) data; + stat_name_len = len; + + break; + + case 2: /* mysql.innodb_index_stats.stat_value */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + + stat_value = mach_read_from_8(data); + + break; + + case 3: /* mysql.innodb_index_stats.sample_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8 || len == UNIV_SQL_NULL); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + + if (len == UNIV_SQL_NULL) { + break; + } + /* else */ + + sample_size = mach_read_from_8(data); + + break; + + default: + + /* someone changed + SELECT index_name,stat_name,stat_value,sample_size + to select more columns from innodb_index_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 4 this means someone changed the + SELECT index_name,stat_name,stat_value,sample_size + to select less columns from innodb_index_stats without adjusting here; + if i > 4 we would have ut_error'ed earlier */ + ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + /* sample_size could be UINT64_UNDEFINED here, if it is NULL */ + +#define PFX "n_diff_pfx" 
+#define PFX_LEN 10 + + if (stat_name_len == 4 /* strlen("size") */ + && strncasecmp("size", stat_name, stat_name_len) == 0) { + index->stat_index_size = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */ + && strncasecmp("n_leaf_pages", stat_name, stat_name_len) + == 0) { + index->stat_n_leaf_pages = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ + && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { + + const char* num_ptr; + unsigned long n_pfx; + + /* point num_ptr into "1" from "n_diff_pfx12..." */ + num_ptr = stat_name + PFX_LEN; + + /* stat_name should have exactly 2 chars appended to PFX + and they should be digits */ + if (stat_name_len != PFX_LEN + 2 + || num_ptr[0] < '0' || num_ptr[0] > '9' + || num_ptr[1] < '0' || num_ptr[1] > '9') { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Ignoring strange row from " + "%s WHERE " + "database_name = '%s' AND " + "table_name = '%s' AND " + "index_name = '%s' AND " + "stat_name = '%.*s'; because stat_name " + "is malformed\n", + INDEX_STATS_NAME_PRINT, + db_utf8, + table_utf8, + index->name, + (int) stat_name_len, + stat_name); + return(TRUE); + } + /* else */ + + /* extract 12 from "n_diff_pfx12..." into n_pfx + note that stat_name does not have a terminating '\0' */ + n_pfx = (num_ptr[0] - '0') * 10 + (num_ptr[1] - '0'); + + ulint n_uniq = index->n_uniq; + + if (n_pfx == 0 || n_pfx > n_uniq) { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Ignoring strange row from " + "%s WHERE " + "database_name = '%s' AND " + "table_name = '%s' AND " + "index_name = '%s' AND " + "stat_name = '%.*s'; because stat_name is " + "out of range, the index has %lu unique " + "columns\n", + INDEX_STATS_NAME_PRINT, + db_utf8, + table_utf8, + index->name, + (int) stat_name_len, + stat_name, + n_uniq); + return(TRUE); + } + /* else */ + + index->stat_n_diff_key_vals[n_pfx - 1] = stat_value; + + if (sample_size != UINT64_UNDEFINED) { + index->stat_n_sample_sizes[n_pfx - 1] = sample_size; + } else { + /* hmm, strange... the user must have UPDATEd the + table manually and SET sample_size = NULL */ + index->stat_n_sample_sizes[n_pfx - 1] = 0; + } + + index->stat_n_non_null_key_vals[n_pfx - 1] = 0; + + arg->stats_were_modified = true; + } else { + /* silently ignore rows with unknown stat_name, the + user may have developed her own stats */ + } + + /* XXX this is not used but returning non-NULL is necessary */ + return(TRUE); +} + +/*********************************************************************//** +Read table's statistics from the persistent statistics storage. +@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_fetch_from_ps( +/*=====================*/ + dict_table_t* table) /*!< in/out: table */ +{ + index_fetch_t index_fetch_arg; + trx_t* trx; + pars_info_t* pinfo; + dberr_t ret; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + ut_ad(!mutex_own(&dict_sys->mutex)); + + /* Initialize all stats to dummy values before fetching because if + the persistent storage contains incomplete stats (e.g. 
missing stats + for some index) then we would end up with (partially) uninitialized + stats. */ + dict_stats_empty_table(table); + + trx = trx_allocate_for_background(); + + /* Use 'read-uncommitted' so that the SELECTs we execute + do not get blocked in case some user has locked the rows we + are SELECTing */ + + trx->isolation_level = TRX_ISO_READ_UNCOMMITTED; + + trx_start_if_not_started(trx); + + dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + + pars_info_bind_function(pinfo, + "fetch_table_stats_step", + dict_stats_fetch_table_stats_step, + table); + + index_fetch_arg.table = table; + index_fetch_arg.stats_were_modified = false; + pars_info_bind_function(pinfo, + "fetch_index_stats_step", + dict_stats_fetch_index_stats_step, + &index_fetch_arg); + + ret = que_eval_sql(pinfo, + "PROCEDURE FETCH_STATS () IS\n" + "found INT;\n" + "DECLARE FUNCTION fetch_table_stats_step;\n" + "DECLARE FUNCTION fetch_index_stats_step;\n" + "DECLARE CURSOR table_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_table_stats_step() */ + " n_rows,\n" + " clustered_index_size,\n" + " sum_of_other_index_sizes\n" + " FROM \"" TABLE_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + "DECLARE CURSOR index_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_index_stats_step() */ + " index_name,\n" + " stat_name,\n" + " stat_value,\n" + " sample_size\n" + " FROM \"" INDEX_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + + "BEGIN\n" + + "OPEN table_stats_cur;\n" + "FETCH table_stats_cur INTO\n" + " fetch_table_stats_step();\n" + "IF (SQL % NOTFOUND) THEN\n" + " CLOSE table_stats_cur;\n" + " RETURN;\n" + "END IF;\n" + "CLOSE table_stats_cur;\n" + + "OPEN index_stats_cur;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_stats_cur INTO\n" + " fetch_index_stats_step();\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_stats_cur;\n" + + "END;", + TRUE, trx); + /* pinfo is freed by que_eval_sql() */ + + trx_commit_for_mysql(trx); + + trx_free_for_background(trx); + + if (!index_fetch_arg.stats_were_modified) { + return(DB_STATS_DO_NOT_EXIST); + } + + return(ret); +} + +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. 
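For orientation, the two cursors in the FETCH_STATS procedure above boil down to these two ordinary SELECTs, one row of which is handed to the bound callback per FETCH (sketch only; the embedded procedure is what really runs, under READ UNCOMMITTED so it cannot block on user locks):

/* illustrative equivalents of the two cursors, not executed here */
static const char*	table_stats_select =
	"SELECT n_rows, clustered_index_size, sum_of_other_index_sizes\n"
	" FROM mysql.innodb_table_stats\n"
	" WHERE database_name = ? AND table_name = ?;";

static const char*	index_stats_select =
	"SELECT index_name, stat_name, stat_value, sample_size\n"
	" FROM mysql.innodb_index_stats\n"
	" WHERE database_name = ? AND table_name = ?;";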
*/ +UNIV_INTERN +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ +{ + DBUG_ENTER("dict_stats_update_for_index"); + + ut_ad(!mutex_own(&dict_sys->mutex)); + + if (dict_stats_is_persistent_enabled(index->table)) { + + if (dict_stats_persistent_storage_check(false)) { + dict_table_stats_lock(index->table, RW_X_LATCH); + dict_stats_analyze_index(index); + dict_table_stats_unlock(index->table, RW_X_LATCH); + dict_stats_save(index->table, &index->id); + DBUG_VOID_RETURN; + } + /* else */ + + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + char buf_table[MAX_FULL_NAME_LEN]; + char buf_index[MAX_FULL_NAME_LEN]; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Recalculation of persistent statistics " + "requested for table %s index %s but the required " + "persistent statistics storage is not present or is " + "corrupted. Using transient stats instead.\n", + ut_format_name(index->table->name, TRUE, + buf_table, sizeof(buf_table)), + ut_format_name(index->name, FALSE, + buf_index, sizeof(buf_index))); + } + + dict_table_stats_lock(index->table, RW_X_LATCH); + dict_stats_update_transient_for_index(index); + dict_table_stats_unlock(index->table, RW_X_LATCH); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_update( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + dict_stats_upd_option_t stats_upd_option) + /*!< in: whether to (re) calc + the stats or to fetch them from + the persistent statistics + storage */ +{ + char buf[MAX_FULL_NAME_LEN]; + + ut_ad(!mutex_own(&dict_sys->mutex)); + + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: cannot calculate statistics for table %s " + "because the .ibd file is missing. For help, please " + "refer to " REFMAN "innodb-troubleshooting.html\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf))); + dict_stats_empty_table(table); + return(DB_TABLESPACE_DELETED); + } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + /* If we have set a high innodb_force_recovery level, do + not calculate statistics, as a badly corrupted index can + cause a crash in it. */ + dict_stats_empty_table(table); + return(DB_SUCCESS); + } + + switch (stats_upd_option) { + case DICT_STATS_RECALC_PERSISTENT: + + if (srv_read_only_mode) { + goto transient; + } + + /* Persistent recalculation requested, called from + 1) ANALYZE TABLE, or + 2) the auto recalculation background thread, or + 3) open table if stats do not exist on disk and auto recalc + is enabled */ + + /* InnoDB internal tables (e.g. 
SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name, '/') != NULL); + + /* check if the persistent statistics storage exists + before calling the potentially slow function + dict_stats_update_persistent(); that is a + prerequisite for dict_stats_save() succeeding */ + if (dict_stats_persistent_storage_check(false)) { + + dberr_t err; + + err = dict_stats_update_persistent(table); + + if (err != DB_SUCCESS) { + return(err); + } + + err = dict_stats_save(table, NULL); + + return(err); + } + + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Recalculation of persistent statistics " + "requested for table %s but the required persistent " + "statistics storage is not present or is corrupted. " + "Using transient stats instead.\n", + ut_format_name(table->name, TRUE, buf, sizeof(buf))); + + goto transient; + + case DICT_STATS_RECALC_TRANSIENT: + + goto transient; + + case DICT_STATS_EMPTY_TABLE: + + dict_stats_empty_table(table); + + /* If table is using persistent stats, + then save the stats on disk */ + + if (dict_stats_is_persistent_enabled(table)) { + + if (dict_stats_persistent_storage_check(false)) { + + return(dict_stats_save(table, NULL)); + } + + return(DB_STATS_DO_NOT_EXIST); + } + + return(DB_SUCCESS); + + case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: + + /* fetch requested, either fetch from persistent statistics + storage or use the old method */ + + if (table->stat_initialized) { + return(DB_SUCCESS); + } + + /* InnoDB internal tables (e.g. SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name, '/') != NULL); + + if (!dict_stats_persistent_storage_check(false)) { + /* persistent statistics storage does not exist + or is corrupted, calculate the transient stats */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Fetch of persistent " + "statistics requested for table %s but the " + "required system tables %s and %s are not " + "present or have unexpected structure. " + "Using transient stats instead.\n", + ut_format_name(table->name, TRUE, + buf, sizeof(buf)), + TABLE_STATS_NAME_PRINT, + INDEX_STATS_NAME_PRINT); + + goto transient; + } + + dict_table_t* t; + + /* Create a dummy table object with the same name and + indexes, suitable for fetching the stats into it. */ + t = dict_stats_table_clone_create(table); + + dberr_t err = dict_stats_fetch_from_ps(t); + + t->stats_last_recalc = table->stats_last_recalc; + t->stat_modified_counter = 0; + + switch (err) { + case DB_SUCCESS: + + dict_table_stats_lock(table, RW_X_LATCH); + + /* Initialize all stats to dummy values before + copying because dict_stats_table_clone_create() does + skip corrupted indexes so our dummy object 't' may + have less indexes than the real object 'table'. 
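Note the shape of the fetch path above: the slow disk reads in dict_stats_fetch_from_ps() target the private clone 't' with no stats latch held, and only the in-memory copy into the live object runs under the X latch. Reduced to a sketch (all names hypothetical):

/* pattern sketch only: fetch latch-free into a scratch copy,
then publish under the latch */
struct stats_t { long n_rows; /* ... */ };
struct table_t { stats_t stats; };

static void slow_fetch_from_disk(stats_t*) { /* may block on I/O */ }
static void stats_latch_x(table_t*) {}
static void stats_unlatch_x(table_t*) {}

static void
refresh_stats(table_t* table)
{
	stats_t	scratch = stats_t();	/* private clone, no latch */

	slow_fetch_from_disk(&scratch);

	stats_latch_x(table);		/* short critical section: */
	table->stats = scratch;		/* copy only, no I/O */
	stats_unlatch_x(table);
}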
*/ + dict_stats_empty_table(table); + + dict_stats_copy(table, t); + + dict_stats_assert_initialized(table); + + dict_table_stats_unlock(table, RW_X_LATCH); + + dict_stats_table_clone_free(t); + + return(DB_SUCCESS); + case DB_STATS_DO_NOT_EXIST: + + dict_stats_table_clone_free(t); + + if (srv_read_only_mode) { + goto transient; + } + + if (dict_stats_auto_recalc_is_enabled(table)) { + return(dict_stats_update( + table, + DICT_STATS_RECALC_PERSISTENT)); + } + + ut_format_name(table->name, TRUE, buf, sizeof(buf)); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Trying to use table %s which has " + "persistent statistics enabled, but auto " + "recalculation turned off and the statistics " + "do not exist in %s and %s. Please either run " + "\"ANALYZE TABLE %s;\" manually or enable the " + "auto recalculation with " + "\"ALTER TABLE %s STATS_AUTO_RECALC=1;\". " + "InnoDB will now use transient statistics for " + "%s.\n", + buf, TABLE_STATS_NAME, INDEX_STATS_NAME, buf, + buf, buf); + + goto transient; + default: + + dict_stats_table_clone_free(t); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error fetching persistent statistics " + "for table %s from %s and %s: %s. " + "Using transient stats method instead.\n", + ut_format_name(table->name, TRUE, buf, + sizeof(buf)), + TABLE_STATS_NAME, + INDEX_STATS_NAME, + ut_strerr(err)); + + goto transient; + } + /* no "default:" in order to produce a compilation warning + about unhandled enumeration value */ + } + +transient: + + dict_table_stats_lock(table, RW_X_LATCH); + + dict_stats_update_transient(table); + + dict_table_stats_unlock(table, RW_X_LATCH); + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Removes the information for a particular index's stats from the persistent +storage if it exists and if there is data stored for this index. +This function creates its own trx and commits it. +A note from Marko why we cannot edit user and sys_* tables in one trx: +marko: The problem is that ibuf merges should be disabled while we are +rolling back dict transactions. +marko: If ibuf merges are not disabled, we need to scan the *.ibd files. +But we shouldn't open *.ibd files before we have rolled back dict +transactions and opened the SYS_* records for the *.ibd files. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_drop_index( +/*==================*/ + const char* db_and_table,/*!< in: db and table, e.g. 'db/table' */ + const char* iname, /*!< in: index name */ + char* errstr, /*!< out: error message if != DB_SUCCESS + is returned */ + ulint errstr_sz)/*!< in: size of the errstr buffer */ +{ + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + pars_info_t* pinfo; + dberr_t ret; + + ut_ad(!mutex_own(&dict_sys->mutex)); + + /* skip indexes whose table names do not contain a database name + e.g. 
if we are dropping an index from SYS_TABLES */ + if (strchr(db_and_table, '/') == NULL) { + + return(DB_SUCCESS); + } + + dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + + pars_info_add_str_literal(pinfo, "index_name", iname); + + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE DROP_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name;\n" + "END;\n", NULL); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } + + if (ret != DB_SUCCESS) { + ut_snprintf(errstr, errstr_sz, + "Unable to delete statistics for index %s " + "from %s%s: %s. They can be deleted later using " + "DELETE FROM %s WHERE " + "database_name = '%s' AND " + "table_name = '%s' AND " + "index_name = '%s';", + iname, + INDEX_STATS_NAME_PRINT, + (ret == DB_LOCK_WAIT_TIMEOUT + ? " because the rows are locked" + : ""), + ut_strerr(ret), + INDEX_STATS_NAME_PRINT, + db_utf8, + table_utf8, + iname); + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: %s\n", errstr); + } + + return(ret); +} + +/*********************************************************************//** +Executes +DELETE FROM mysql.innodb_table_stats +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_delete_from_table_stats( +/*===============================*/ + const char* database_name, /*!< in: database name, e.g. 'db' */ + const char* table_name) /*!< in: table name, e.g. 'table' */ +{ + pars_info_t* pinfo; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_TABLE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", NULL); + + return(ret); +} + +/*********************************************************************//** +Executes +DELETE FROM mysql.innodb_index_stats +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_delete_from_index_stats( +/*===============================*/ + const char* database_name, /*!< in: database name, e.g. 'db' */ + const char* table_name) /*!< in: table name, e.g. 
'table' */ +{ + pars_info_t* pinfo; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", NULL); + + return(ret); +} + +/*********************************************************************//** +Removes the statistics for a table and all of its indexes from the +persistent statistics storage if it exists and if there is data stored for +the table. This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_drop_table( +/*==================*/ + const char* db_and_table, /*!< in: db and table, e.g. 'db/table' */ + char* errstr, /*!< out: error message + if != DB_SUCCESS is returned */ + ulint errstr_sz) /*!< in: size of errstr buffer */ +{ + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + /* skip tables that do not contain a database name + e.g. if we are dropping SYS_TABLES */ + if (strchr(db_and_table, '/') == NULL) { + + return(DB_SUCCESS); + } + + /* skip innodb_table_stats and innodb_index_stats themselves */ + if (strcmp(db_and_table, TABLE_STATS_NAME) == 0 + || strcmp(db_and_table, INDEX_STATS_NAME) == 0) { + + return(DB_SUCCESS); + } + + dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ret = dict_stats_delete_from_table_stats(db_utf8, table_utf8); + + if (ret == DB_SUCCESS) { + ret = dict_stats_delete_from_index_stats(db_utf8, table_utf8); + } + + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } + + if (ret != DB_SUCCESS) { + + ut_snprintf(errstr, errstr_sz, + "Unable to delete statistics for table %s.%s: %s. " + "They can be deleted later using " + + "DELETE FROM %s WHERE " + "database_name = '%s' AND " + "table_name = '%s'; " + + "DELETE FROM %s WHERE " + "database_name = '%s' AND " + "table_name = '%s';", + + db_utf8, table_utf8, + ut_strerr(ret), + + INDEX_STATS_NAME_PRINT, + db_utf8, table_utf8, + + TABLE_STATS_NAME_PRINT, + db_utf8, table_utf8); + } + + return(ret); +} + +/*********************************************************************//** +Executes +UPDATE mysql.innodb_table_stats SET +database_name = '...', table_name = '...' +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_rename_in_table_stats( +/*=============================*/ + const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */ + const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */ + const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */ + const char* new_tablename_utf8)/*!< in: table name, e.g. 
'newtable' */ +{ + pars_info_t* pinfo; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8); + pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8); + pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8); + pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE RENAME_IN_TABLE_STATS () IS\n" + "BEGIN\n" + "UPDATE \"" TABLE_STATS_NAME "\" SET\n" + "database_name = :new_dbname_utf8,\n" + "table_name = :new_tablename_utf8\n" + "WHERE\n" + "database_name = :old_dbname_utf8 AND\n" + "table_name = :old_tablename_utf8;\n" + "END;\n", NULL); + + return(ret); +} + +/*********************************************************************//** +Executes +UPDATE mysql.innodb_index_stats SET +database_name = '...', table_name = '...' +WHERE database_name = '...' AND table_name = '...'; +Creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INLINE +dberr_t +dict_stats_rename_in_index_stats( +/*=============================*/ + const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */ + const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */ + const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */ + const char* new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */ +{ + pars_info_t* pinfo; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&dict_sys->mutex)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8); + pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8); + pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8); + pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE RENAME_IN_INDEX_STATS () IS\n" + "BEGIN\n" + "UPDATE \"" INDEX_STATS_NAME "\" SET\n" + "database_name = :new_dbname_utf8,\n" + "table_name = :new_tablename_utf8\n" + "WHERE\n" + "database_name = :old_dbname_utf8 AND\n" + "table_name = :old_tablename_utf8;\n" + "END;\n", NULL); + + return(ret); +} + +/*********************************************************************//** +Renames a table in InnoDB persistent stats storage. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_rename_table( +/*====================*/ + const char* old_name, /*!< in: old name, e.g. 'db/table' */ + const char* new_name, /*!< in: new name, e.g. 
'db/table' */ + char* errstr, /*!< out: error string if != DB_SUCCESS + is returned */ + size_t errstr_sz) /*!< in: errstr size */ +{ + char old_db_utf8[MAX_DB_UTF8_LEN]; + char new_db_utf8[MAX_DB_UTF8_LEN]; + char old_table_utf8[MAX_TABLE_UTF8_LEN]; + char new_table_utf8[MAX_TABLE_UTF8_LEN]; + dberr_t ret; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!mutex_own(&dict_sys->mutex)); + + /* skip innodb_table_stats and innodb_index_stats themselves */ + if (strcmp(old_name, TABLE_STATS_NAME) == 0 + || strcmp(old_name, INDEX_STATS_NAME) == 0 + || strcmp(new_name, TABLE_STATS_NAME) == 0 + || strcmp(new_name, INDEX_STATS_NAME) == 0) { + + return(DB_SUCCESS); + } + + dict_fs2utf8(old_name, old_db_utf8, sizeof(old_db_utf8), + old_table_utf8, sizeof(old_table_utf8)); + + dict_fs2utf8(new_name, new_db_utf8, sizeof(new_db_utf8), + new_table_utf8, sizeof(new_table_utf8)); + + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + + ulint n_attempts = 0; + do { + n_attempts++; + + ret = dict_stats_rename_in_table_stats( + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8); + + if (ret == DB_DUPLICATE_KEY) { + dict_stats_delete_from_table_stats( + new_db_utf8, new_table_utf8); + } + + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } + + if (ret != DB_SUCCESS) { + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + os_thread_sleep(200000 /* 0.2 sec */); + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + } + } while ((ret == DB_DEADLOCK + || ret == DB_DUPLICATE_KEY + || ret == DB_LOCK_WAIT_TIMEOUT) + && n_attempts < 5); + + if (ret != DB_SUCCESS) { + ut_snprintf(errstr, errstr_sz, + "Unable to rename statistics from " + "%s.%s to %s.%s in %s: %s. " + "They can be renamed later using " + + "UPDATE %s SET " + "database_name = '%s', " + "table_name = '%s' " + "WHERE " + "database_name = '%s' AND " + "table_name = '%s';", + + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8, + TABLE_STATS_NAME_PRINT, + ut_strerr(ret), + + TABLE_STATS_NAME_PRINT, + new_db_utf8, new_table_utf8, + old_db_utf8, old_table_utf8); + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + return(ret); + } + /* else */ + + n_attempts = 0; + do { + n_attempts++; + + ret = dict_stats_rename_in_index_stats( + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8); + + if (ret == DB_DUPLICATE_KEY) { + dict_stats_delete_from_index_stats( + new_db_utf8, new_table_utf8); + } + + if (ret == DB_STATS_DO_NOT_EXIST) { + ret = DB_SUCCESS; + } + + if (ret != DB_SUCCESS) { + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + os_thread_sleep(200000 /* 0.2 sec */); + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + } + } while ((ret == DB_DEADLOCK + || ret == DB_DUPLICATE_KEY + || ret == DB_LOCK_WAIT_TIMEOUT) + && n_attempts < 5); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + if (ret != DB_SUCCESS) { + ut_snprintf(errstr, errstr_sz, + "Unable to rename statistics from " + "%s.%s to %s.%s in %s: %s. 
" + "They can be renamed later using " + + "UPDATE %s SET " + "database_name = '%s', " + "table_name = '%s' " + "WHERE " + "database_name = '%s' AND " + "table_name = '%s';", + + old_db_utf8, old_table_utf8, + new_db_utf8, new_table_utf8, + INDEX_STATS_NAME_PRINT, + ut_strerr(ret), + + INDEX_STATS_NAME_PRINT, + new_db_utf8, new_table_utf8, + old_db_utf8, old_table_utf8); + } + + return(ret); +} + +/* tests @{ */ +#ifdef UNIV_COMPILE_TEST_FUNCS + +/* The following unit tests test some of the functions in this file +individually, such testing cannot be performed by the mysql-test framework +via SQL. */ + +/* test_dict_table_schema_check() @{ */ +void +test_dict_table_schema_check() +{ + /* + CREATE TABLE tcheck ( + c01 VARCHAR(123), + c02 INT, + c03 INT NOT NULL, + c04 INT UNSIGNED, + c05 BIGINT, + c06 BIGINT UNSIGNED NOT NULL, + c07 TIMESTAMP + ) ENGINE=INNODB; + */ + /* definition for the table 'test/tcheck' */ + dict_col_meta_t columns[] = { + {"c01", DATA_VARCHAR, 0, 123}, + {"c02", DATA_INT, 0, 4}, + {"c03", DATA_INT, DATA_NOT_NULL, 4}, + {"c04", DATA_INT, DATA_UNSIGNED, 4}, + {"c05", DATA_INT, 0, 8}, + {"c06", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, + {"c07", DATA_INT, 0, 4}, + {"c_extra", DATA_INT, 0, 4} + }; + dict_table_schema_t schema = { + "test/tcheck", + 0 /* will be set individually for each test below */, + columns + }; + char errstr[512]; + + ut_snprintf(errstr, sizeof(errstr), "Table not found"); + + /* prevent any data dictionary modifications while we are checking + the tables' structure */ + + mutex_enter(&(dict_sys->mutex)); + + /* check that a valid table is reported as valid */ + schema.n_cols = 7; + if (dict_table_schema_check(&schema, errstr, sizeof(errstr)) + == DB_SUCCESS) { + printf("OK: test.tcheck ok\n"); + } else { + printf("ERROR: %s\n", errstr); + printf("ERROR: test.tcheck not present or corrupted\n"); + goto test_dict_table_schema_check_end; + } + + /* check columns with wrong length */ + schema.columns[1].len = 8; + if (dict_table_schema_check(&schema, errstr, sizeof(errstr)) + != DB_SUCCESS) { + printf("OK: test.tcheck.c02 has different length and is " + "reported as corrupted\n"); + } else { + printf("OK: test.tcheck.c02 has different length but is " + "reported as ok\n"); + goto test_dict_table_schema_check_end; + } + schema.columns[1].len = 4; + + /* request that c02 is NOT NULL while actually it does not have + this flag set */ + schema.columns[1].prtype_mask |= DATA_NOT_NULL; + if (dict_table_schema_check(&schema, errstr, sizeof(errstr)) + != DB_SUCCESS) { + printf("OK: test.tcheck.c02 does not have NOT NULL while " + "it should and is reported as corrupted\n"); + } else { + printf("ERROR: test.tcheck.c02 does not have NOT NULL while " + "it should and is not reported as corrupted\n"); + goto test_dict_table_schema_check_end; + } + schema.columns[1].prtype_mask &= ~DATA_NOT_NULL; + + /* check a table that contains some extra columns */ + schema.n_cols = 6; + if (dict_table_schema_check(&schema, errstr, sizeof(errstr)) + == DB_SUCCESS) { + printf("ERROR: test.tcheck has more columns but is not " + "reported as corrupted\n"); + goto test_dict_table_schema_check_end; + } else { + printf("OK: test.tcheck has more columns and is " + "reported as corrupted\n"); + } + + /* check a table that has some columns missing */ + schema.n_cols = 8; + if (dict_table_schema_check(&schema, errstr, sizeof(errstr)) + != DB_SUCCESS) { + printf("OK: test.tcheck has missing columns and is " + "reported as corrupted\n"); + } else { + printf("ERROR: test.tcheck has 
missing columns but is " + "reported as ok\n"); + goto test_dict_table_schema_check_end; + } + + /* check non-existent table */ + schema.table_name = "test/tcheck_nonexistent"; + if (dict_table_schema_check(&schema, errstr, sizeof(errstr)) + != DB_SUCCESS) { + printf("OK: test.tcheck_nonexistent is not present\n"); + } else { + printf("ERROR: test.tcheck_nonexistent is present!?\n"); + goto test_dict_table_schema_check_end; + } + +test_dict_table_schema_check_end: + + mutex_exit(&(dict_sys->mutex)); +} +/* @} */ + +/* save/fetch aux macros @{ */ +#define TEST_DATABASE_NAME "foobardb" +#define TEST_TABLE_NAME "test_dict_stats" + +#define TEST_N_ROWS 111 +#define TEST_CLUSTERED_INDEX_SIZE 222 +#define TEST_SUM_OF_OTHER_INDEX_SIZES 333 + +#define TEST_IDX1_NAME "tidx1" +#define TEST_IDX1_COL1_NAME "tidx1_col1" +#define TEST_IDX1_INDEX_SIZE 123 +#define TEST_IDX1_N_LEAF_PAGES 234 +#define TEST_IDX1_N_DIFF1 50 +#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE 500 + +#define TEST_IDX2_NAME "tidx2" +#define TEST_IDX2_COL1_NAME "tidx2_col1" +#define TEST_IDX2_COL2_NAME "tidx2_col2" +#define TEST_IDX2_COL3_NAME "tidx2_col3" +#define TEST_IDX2_COL4_NAME "tidx2_col4" +#define TEST_IDX2_INDEX_SIZE 321 +#define TEST_IDX2_N_LEAF_PAGES 432 +#define TEST_IDX2_N_DIFF1 60 +#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE 600 +#define TEST_IDX2_N_DIFF2 61 +#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE 610 +#define TEST_IDX2_N_DIFF3 62 +#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE 620 +#define TEST_IDX2_N_DIFF4 63 +#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE 630 +/* @} */ + +/* test_dict_stats_save() @{ */ +void +test_dict_stats_save() +{ + dict_table_t table; + dict_index_t index1; + dict_field_t index1_fields[1]; + ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; + dict_index_t index2; + dict_field_t index2_fields[4]; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; + + /* craft a dummy dict_table_t */ + table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); + table.stat_n_rows = TEST_N_ROWS; + table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE; + table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES; + UT_LIST_INIT(table.indexes); + UT_LIST_ADD_LAST(indexes, table.indexes, &index1); + UT_LIST_ADD_LAST(indexes, table.indexes, &index2); + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); + + index1.name = TEST_IDX1_NAME; + index1.table = &table; + index1.cached = 1; + index1.n_uniq = 1; + index1.fields = index1_fields; + index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; + index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; + index1.stat_index_size = TEST_IDX1_INDEX_SIZE; + index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES; + index1_fields[0].name = TEST_IDX1_COL1_NAME; + index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1; + index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE; + + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); + index2.name = TEST_IDX2_NAME; + index2.table = &table; + index2.cached = 1; + index2.n_uniq = 4; + index2.fields = index2_fields; + index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; + index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; + index2.stat_index_size = TEST_IDX2_INDEX_SIZE; + index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES; + index2_fields[0].name = TEST_IDX2_COL1_NAME; + index2_fields[1].name = TEST_IDX2_COL2_NAME; + index2_fields[2].name = TEST_IDX2_COL3_NAME; + index2_fields[3].name = TEST_IDX2_COL4_NAME; + 
index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1; + index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2; + index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3; + index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4; + index2_stat_n_sample_sizes[0] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE; + index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE; + index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; + index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; + + ret = dict_stats_save(&table, NULL); + + ut_a(ret == DB_SUCCESS); + + printf("\nOK: stats saved successfully, now go ahead and read " + "what's inside %s and %s:\n\n", + TABLE_STATS_NAME_PRINT, + INDEX_STATS_NAME_PRINT); + + printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "n_rows = %d AND\n" + "clustered_index_size = %d AND\n" + "sum_of_other_index_sizes = %d;\n" + "\n", + TABLE_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_N_ROWS, + TEST_CLUSTERED_INDEX_SIZE, + TEST_SUM_OF_OTHER_INDEX_SIZES); + + printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s')\n" + ");\n" + "\n", + INDEX_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_IDX1_NAME, + TEST_IDX1_INDEX_SIZE, + TEST_IDX1_N_LEAF_PAGES, + TEST_IDX1_N_DIFF1, + TEST_IDX1_N_DIFF1_SAMPLE_SIZE, + TEST_IDX1_COL1_NAME); + + printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s') OR\n" + " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s') OR\n" + " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n" + " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n" + ");\n" + "\n", + INDEX_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_IDX2_NAME, + TEST_IDX2_INDEX_SIZE, + TEST_IDX2_N_LEAF_PAGES, + TEST_IDX2_N_DIFF1, + TEST_IDX2_N_DIFF1_SAMPLE_SIZE, TEST_IDX2_COL1_NAME, + TEST_IDX2_N_DIFF2, + TEST_IDX2_N_DIFF2_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, + TEST_IDX2_N_DIFF3, + TEST_IDX2_N_DIFF3_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME, + TEST_IDX2_N_DIFF4, + TEST_IDX2_N_DIFF4_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME, + TEST_IDX2_COL4_NAME); +} +/* @} */ + +/* test_dict_stats_fetch_from_ps() @{ */ +void +test_dict_stats_fetch_from_ps() +{ + dict_table_t table; + dict_index_t index1; + ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; + dict_index_t index2; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t 
index2_stat_n_sample_sizes[4]; + dberr_t ret; + + /* craft a dummy dict_table_t */ + table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); + UT_LIST_INIT(table.indexes); + UT_LIST_ADD_LAST(indexes, table.indexes, &index1); + UT_LIST_ADD_LAST(indexes, table.indexes, &index2); + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + + index1.name = TEST_IDX1_NAME; + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); + index1.cached = 1; + index1.n_uniq = 1; + index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; + index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; + + index2.name = TEST_IDX2_NAME; + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); + index2.cached = 1; + index2.n_uniq = 4; + index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; + index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; + + ret = dict_stats_fetch_from_ps(&table); + + ut_a(ret == DB_SUCCESS); + + ut_a(table.stat_n_rows == TEST_N_ROWS); + ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE); + ut_a(table.stat_sum_of_other_index_sizes + == TEST_SUM_OF_OTHER_INDEX_SIZES); + + ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE); + ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES); + ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1); + ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE); + + ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE); + ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES); + ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1); + ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2); + ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3); + ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4); + ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE); + + printf("OK: fetch successful\n"); +} +/* @} */ + +/* test_dict_stats_all() @{ */ +void +test_dict_stats_all() +{ + test_dict_table_schema_check(); + + test_dict_stats_save(); + + test_dict_stats_fetch_from_ps(); +} +/* @} */ + +#endif /* UNIV_COMPILE_TEST_FUNCS */ +/* @} */ + +#endif /* UNIV_HOTBACKUP */ diff --git a/storage/xtradb/dict/dict0stats_bg.cc b/storage/xtradb/dict/dict0stats_bg.cc new file mode 100644 index 00000000000..9e1f75a13a9 --- /dev/null +++ b/storage/xtradb/dict/dict0stats_bg.cc @@ -0,0 +1,367 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0stats_bg.cc +Code used for background table and index stats gathering. 
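+Tables are queued with dict_stats_recalc_pool_add() and picked up
+asynchronously by dict_stats_thread().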
+ +Created Apr 25, 2012 Vasil Dimov +*******************************************************/ + +#include "row0mysql.h" +#include "srv0start.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" + +#ifdef UNIV_NONINL +# include "dict0stats_bg.ic" +#endif + +#include <vector> + +/** Minimum time interval between stats recalc for a given table */ +#define MIN_RECALC_INTERVAL 10 /* seconds */ + +#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE) + +/** Event to wake up the stats thread */ +UNIV_INTERN os_event_t dict_stats_event = NULL; + +/** This mutex protects the "recalc_pool" variable. */ +static ib_mutex_t recalc_pool_mutex; +#ifdef HAVE_PSI_INTERFACE +static mysql_pfs_key_t recalc_pool_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ + +/** The number of tables that can be added to "recalc_pool" before +it is enlarged */ +static const ulint RECALC_POOL_INITIAL_SLOTS = 128; + +/** The multitude of tables whose stats are to be automatically +recalculated - an STL vector */ +typedef std::vector<table_id_t> recalc_pool_t; +static recalc_pool_t recalc_pool; + +typedef recalc_pool_t::iterator recalc_pool_iterator_t; + +/*****************************************************************//** +Initialize the recalc pool, called once during thread initialization. */ +static +void +dict_stats_recalc_pool_init() +/*=========================*/ +{ + ut_ad(!srv_read_only_mode); + + recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS); +} + +/*****************************************************************//** +Free the resources occupied by the recalc pool, called once during +thread de-initialization. */ +static +void +dict_stats_recalc_pool_deinit() +/*===========================*/ +{ + ut_ad(!srv_read_only_mode); + + recalc_pool.clear(); +} + +/*****************************************************************//** +Add a table to the recalc pool, which is processed by the +background stats gathering thread. Only the table id is added to the +list, so the table can be closed after being enqueued and it will be +opened when needed. If the table does not exist later (has been DROPped), +then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_recalc_pool_add( +/*=======================*/ + const dict_table_t* table) /*!< in: table to add */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&recalc_pool_mutex); + + /* quit if already in the list */ + for (recalc_pool_iterator_t iter = recalc_pool.begin(); + iter != recalc_pool.end(); + ++iter) { + + if (*iter == table->id) { + mutex_exit(&recalc_pool_mutex); + return; + } + } + + recalc_pool.push_back(table->id); + + mutex_exit(&recalc_pool_mutex); + + os_event_set(dict_stats_event); +} + +/*****************************************************************//** +Get a table from the auto recalc pool. The returned table id is removed +from the pool. +@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_recalc_pool_get( +/*=======================*/ + table_id_t* id) /*!< out: table id, or unmodified if list is + empty */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&recalc_pool_mutex); + + if (recalc_pool.empty()) { + mutex_exit(&recalc_pool_mutex); + return(false); + } + + *id = recalc_pool[0]; + + recalc_pool.erase(recalc_pool.begin()); + + mutex_exit(&recalc_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Delete a given table from the auto recalc pool. 
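+The caller must hold dict_sys->mutex; this is done e.g. when a table is
+being dropped, so that the background thread cannot pick up a stale id.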
+dict_stats_recalc_pool_del() */ +UNIV_INTERN +void +dict_stats_recalc_pool_del( +/*=======================*/ + const dict_table_t* table) /*!< in: table to remove */ +{ + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + mutex_enter(&recalc_pool_mutex); + + ut_ad(table->id > 0); + + for (recalc_pool_iterator_t iter = recalc_pool.begin(); + iter != recalc_pool.end(); + ++iter) { + + if (*iter == table->id) { + /* erase() invalidates the iterator */ + recalc_pool.erase(iter); + break; + } + } + + mutex_exit(&recalc_pool_mutex); +} + +/*****************************************************************//** +Wait until background stats thread has stopped using the specified table. +The caller must have locked the data dictionary using +row_mysql_lock_data_dictionary() and this function may unlock it temporarily +and restore the lock before it exits. +The background stats thread is guaranteed not to start using the specified +table after this function returns and before the caller unlocks the data +dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag +under dict_sys->mutex. */ +UNIV_INTERN +void +dict_stats_wait_bg_to_stop_using_table( +/*===================================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx) /*!< in/out: transaction to use for + unlocking/locking the data dict */ +{ + while (!dict_stats_stop_bg(table)) { + DICT_STATS_BG_YIELD(trx); + } +} + +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread() +Must be called before dict_stats_thread() is started. */ +UNIV_INTERN +void +dict_stats_thread_init() +/*====================*/ +{ + ut_a(!srv_read_only_mode); + + dict_stats_event = os_event_create(); + + /* The recalc_pool_mutex is acquired from: + 1) the background stats gathering thread before any other latch + and released without latching anything else in between (thus + any level would do here) + 2) from row_update_statistics_if_needed() + and released without latching anything else in between. We know + that dict_sys->mutex (SYNC_DICT) is not acquired when + row_update_statistics_if_needed() is called and it may be acquired + inside that function (thus a level <=SYNC_DICT would do). + 3) from row_drop_table_for_mysql() after dict_sys->mutex (SYNC_DICT) + and dict_operation_lock (SYNC_DICT_OPERATION) have been locked + (thus a level <SYNC_DICT && <SYNC_DICT_OPERATION would do) + So we choose SYNC_STATS_AUTO_RECALC to be about below SYNC_DICT. */ + mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex, + SYNC_STATS_AUTO_RECALC); + + dict_stats_recalc_pool_init(); +} + +/*****************************************************************//** +Free resources allocated by dict_stats_thread_init(), must be called +after dict_stats_thread() has exited. */ +UNIV_INTERN +void +dict_stats_thread_deinit() +/*======================*/ +{ + ut_a(!srv_read_only_mode); + ut_ad(!srv_dict_stats_thread_active); + + dict_stats_recalc_pool_deinit(); + + mutex_free(&recalc_pool_mutex); + memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex)); + + os_event_free(dict_stats_event); + dict_stats_event = NULL; +} + +/*****************************************************************//** +Get the first table that has been added for auto recalc and eventually +update its stats. 
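+If the stats were recalculated less than MIN_RECALC_INTERVAL seconds ago,
+the table is put back into the pool and nothing is done.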
 */
+static
+void
+dict_stats_process_entry_from_recalc_pool()
+/*=======================================*/
+{
+	table_id_t	table_id;
+
+	ut_ad(!srv_read_only_mode);
+
+	/* pop the first table from the auto recalc pool */
+	if (!dict_stats_recalc_pool_get(&table_id)) {
+		/* no tables for auto recalc */
+		return;
+	}
+
+	dict_table_t*	table;
+
+	mutex_enter(&dict_sys->mutex);
+
+	table = dict_table_open_on_id(table_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+	if (table == NULL) {
+		/* table does not exist, must have been DROPped
+		after its id was enqueued */
+		mutex_exit(&dict_sys->mutex);
+		return;
+	}
+
+	/* Check whether table is corrupted */
+	if (table->corrupted) {
+		dict_table_close(table, TRUE, FALSE);
+		mutex_exit(&dict_sys->mutex);
+		return;
+	}
+
+	table->stats_bg_flag = BG_STAT_IN_PROGRESS;
+
+	mutex_exit(&dict_sys->mutex);
+
+	/* ut_time() could be expensive; the current function is called
+	once every time a table has been changed by more than 10%, and
+	on a system with lots of small tables this could become hot. If we
+	find out that this is a problem, then the check below could
+	eventually be replaced with something else, though a time interval
+	is the natural approach. */
+
+	if (ut_difftime(ut_time(), table->stats_last_recalc)
+	    < MIN_RECALC_INTERVAL) {
+
+		/* Stats were (re)calculated not long ago. To avoid
+		too frequent stats updates we put the table back on
+		the auto recalc list and do nothing. */
+
+		dict_stats_recalc_pool_add(table);
+
+	} else {
+
+		dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+	}
+
+	mutex_enter(&dict_sys->mutex);
+
+	table->stats_bg_flag = BG_STAT_NONE;
+
+	dict_table_close(table, TRUE, FALSE);
+
+	mutex_exit(&dict_sys->mutex);
+}
+
+/*****************************************************************//**
+This is the thread for background stats gathering. It pops tables from
+the auto recalc list and processes them, eventually recalculating their
+statistics.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(dict_stats_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	ut_a(!srv_read_only_mode);
+
+	srv_dict_stats_thread_active = TRUE;
+
+	while (!SHUTTING_DOWN()) {
+
+		/* Wake up periodically even if not signaled. This is
+		because we may lose an event - if the below call to
+		dict_stats_process_entry_from_recalc_pool() puts the entry
+		back in the list, the os_event_set() will be lost by the
+		subsequent os_event_reset(). */
+		os_event_wait_time(
+			dict_stats_event, MIN_RECALC_INTERVAL * 1000000);
+
+		if (SHUTTING_DOWN()) {
+			break;
+		}
+
+		dict_stats_process_entry_from_recalc_pool();
+
+		os_event_reset(dict_stats_event);
+	}
+
+	srv_dict_stats_thread_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit instead of return(). */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/dyn/dyn0dyn.cc b/storage/xtradb/dyn/dyn0dyn.cc
new file mode 100644
index 00000000000..3ef5297a7c9
--- /dev/null
+++ b/storage/xtradb/dyn/dyn0dyn.cc
@@ -0,0 +1,66 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dyn/dyn0dyn.cc +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +#include "dyn0dyn.h" +#ifdef UNIV_NONINL +#include "dyn0dyn.ic" +#endif + +/************************************************************//** +Adds a new block to a dyn array. +@return created block */ +UNIV_INTERN +dyn_block_t* +dyn_array_add_block( +/*================*/ + dyn_array_t* arr) /*!< in/out: dyn array */ +{ + mem_heap_t* heap; + dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + if (arr->heap == NULL) { + UT_LIST_INIT(arr->base); + UT_LIST_ADD_FIRST(list, arr->base, arr); + + arr->heap = mem_heap_create(sizeof(dyn_block_t)); + } + + block = dyn_array_get_last_block(arr); + block->used = block->used | DYN_BLOCK_FULL_FLAG; + + heap = arr->heap; + + block = static_cast<dyn_block_t*>( + mem_heap_alloc(heap, sizeof(dyn_block_t))); + + block->used = 0; + + UT_LIST_ADD_LAST(list, arr->base, block); + + return(block); +} diff --git a/storage/xtradb/eval/eval0eval.cc b/storage/xtradb/eval/eval0eval.cc new file mode 100644 index 00000000000..ccc54781102 --- /dev/null +++ b/storage/xtradb/eval/eval0eval.cc @@ -0,0 +1,950 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0eval.cc
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+
+#ifdef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#include "data0data.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+
+/** The RND function seed */
+static ulint	eval_rnd	= 128367121;
+
+/** Dummy address used when we should allocate a buffer of size 0 in
+eval_node_alloc_val_buf */
+
+static byte	eval_dummy;
+
+/*************************************************************************
+Gets the like node from the node */
+UNIV_INLINE
+que_node_t*
+que_node_get_like_node(
+/*===================*/
+				/* out: next node in a list of nodes */
+	que_node_t*	node)	/* in: node in a list */
+{
+	return(((sym_node_t*) node)->like_node);
+}
+
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size)	/*!< in: buffer size */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (data && data != &eval_dummy) {
+		mem_free(data);
+	}
+
+	if (size == 0) {
+		data = &eval_dummy;
+	} else {
+		data = static_cast<byte*>(mem_alloc(size));
+	}
+
+	que_node_set_val_buf_size(node, size);
+
+	dfield_set_data(dfield, data, size);
+
+	return(data);
+}
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (que_node_get_val_buf_size(node) > 0) {
+		ut_a(data);
+
+		mem_free(data);
+	}
+}
+
+/*********************************************************************
+Evaluates a LIKE comparison node.
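+The like node attached to the right operand carries the comparison type
+(an ib_like_t); for the PREFIX, SUFFIX and SUBSTR variants the extracted
+pattern follows it in the node list.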
+@return the result of the comparison */ +UNIV_INLINE +ibool +eval_cmp_like( +/*==========*/ + que_node_t* arg1, /* !< in: left operand */ + que_node_t* arg2) /* !< in: right operand */ +{ + ib_like_t op; + int res; + que_node_t* arg3; + que_node_t* arg4; + dfield_t* dfield; + dtype_t* dtype; + ibool val = TRUE; + + arg3 = que_node_get_like_node(arg2); + + /* Get the comparison type operator */ + ut_a(arg3); + + dfield = que_node_get_val(arg3); + dtype = dfield_get_type(dfield); + + ut_a(dtype_get_mtype(dtype) == DATA_INT); + op = static_cast<ib_like_t>(mach_read_from_4(static_cast<const unsigned char*>(dfield_get_data(dfield)))); + + switch (op) { + case IB_LIKE_PREFIX: + + arg4 = que_node_get_next(arg3); + res = cmp_dfield_dfield_like_prefix( + que_node_get_val(arg1), + que_node_get_val(arg4)); + break; + + case IB_LIKE_SUFFIX: + + arg4 = que_node_get_next(arg3); + res = cmp_dfield_dfield_like_suffix( + que_node_get_val(arg1), + que_node_get_val(arg4)); + break; + + case IB_LIKE_SUBSTR: + + arg4 = que_node_get_next(arg3); + res = cmp_dfield_dfield_like_substr( + que_node_get_val(arg1), + que_node_get_val(arg4)); + break; + + case IB_LIKE_EXACT: + res = cmp_dfield_dfield( + que_node_get_val(arg1), + que_node_get_val(arg2)); + break; + + default: + ut_error; + } + + if (res != 0) { + val = FALSE; + } + + return(val); +} + +/********************************************************************* +Evaluates a comparison node. +@return the result of the comparison */ +ibool +eval_cmp( +/*=====*/ + func_node_t* cmp_node) /*!< in: comparison node */ +{ + que_node_t* arg1; + que_node_t* arg2; + int res; + int func; + ibool val = TRUE; + + ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC); + + arg1 = cmp_node->args; + arg2 = que_node_get_next(arg1); + + func = cmp_node->func; + + if (func == PARS_LIKE_TOKEN_EXACT + || func == PARS_LIKE_TOKEN_PREFIX + || func == PARS_LIKE_TOKEN_SUFFIX + || func == PARS_LIKE_TOKEN_SUBSTR) { + + val = eval_cmp_like(arg1, arg2); + } else { + res = cmp_dfield_dfield( + que_node_get_val(arg1), que_node_get_val(arg2)); + + if (func == '=') { + if (res != 0) { + val = FALSE; + } + } else if (func == '<') { + if (res != -1) { + val = FALSE; + } + } else if (func == PARS_LE_TOKEN) { + if (res == 1) { + val = FALSE; + } + } else if (func == PARS_NE_TOKEN) { + if (res == 0) { + val = FALSE; + } + } else if (func == PARS_GE_TOKEN) { + if (res == -1) { + val = FALSE; + } + } else { + ut_ad(func == '>'); + + if (res != 1) { + val = FALSE; + } + } + } + + eval_node_set_ibool_val(cmp_node, val); + + return(val); +} + +/*****************************************************************//** +Evaluates a logical operation node. 
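+AND and OR are computed bitwise on the ibool operands and NOT is computed
+as TRUE - val, so the operand values are assumed to be exactly 0 or 1.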
*/ +UNIV_INLINE +void +eval_logical( +/*=========*/ + func_node_t* logical_node) /*!< in: logical operation node */ +{ + que_node_t* arg1; + que_node_t* arg2; + ibool val1; + ibool val2 = 0; /* remove warning */ + ibool val = 0; /* remove warning */ + int func; + + ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC); + + arg1 = logical_node->args; + arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */ + + val1 = eval_node_get_ibool_val(arg1); + + if (arg2) { + val2 = eval_node_get_ibool_val(arg2); + } + + func = logical_node->func; + + if (func == PARS_AND_TOKEN) { + val = val1 & val2; + } else if (func == PARS_OR_TOKEN) { + val = val1 | val2; + } else if (func == PARS_NOT_TOKEN) { + val = TRUE - val1; + } else { + ut_error; + } + + eval_node_set_ibool_val(logical_node, val); +} + +/*****************************************************************//** +Evaluates an arithmetic operation node. */ +UNIV_INLINE +void +eval_arith( +/*=======*/ + func_node_t* arith_node) /*!< in: arithmetic operation node */ +{ + que_node_t* arg1; + que_node_t* arg2; + lint val1; + lint val2 = 0; /* remove warning */ + lint val; + int func; + + ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC); + + arg1 = arith_node->args; + arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */ + + val1 = eval_node_get_int_val(arg1); + + if (arg2) { + val2 = eval_node_get_int_val(arg2); + } + + func = arith_node->func; + + if (func == '+') { + val = val1 + val2; + } else if ((func == '-') && arg2) { + val = val1 - val2; + } else if (func == '-') { + val = -val1; + } else if (func == '*') { + val = val1 * val2; + } else { + ut_ad(func == '/'); + val = val1 / val2; + } + + eval_node_set_int_val(arith_node, val); +} + +/*****************************************************************//** +Evaluates an aggregate operation node. */ +UNIV_INLINE +void +eval_aggregate( +/*===========*/ + func_node_t* node) /*!< in: aggregate operation node */ +{ + que_node_t* arg; + lint val; + lint arg_val; + int func; + + ut_ad(que_node_get_type(node) == QUE_NODE_FUNC); + + val = eval_node_get_int_val(node); + + func = node->func; + + if (func == PARS_COUNT_TOKEN) { + + val = val + 1; + } else { + ut_ad(func == PARS_SUM_TOKEN); + + arg = node->args; + arg_val = eval_node_get_int_val(arg); + + val = val + arg_val; + } + + eval_node_set_int_val(node, val); +} + +/*****************************************************************//** +Evaluates a predefined function node where the function is not relevant +in benchmarks. */ +static +void +eval_predefined_2( +/*==============*/ + func_node_t* func_node) /*!< in: predefined function node */ +{ + que_node_t* arg; + que_node_t* arg1; + que_node_t* arg2 = 0; /* remove warning (??? bug ???) 
*/ + lint int_val; + byte* data; + ulint len1; + ulint len2; + int func; + ulint i; + + ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC); + + arg1 = func_node->args; + + if (arg1) { + arg2 = que_node_get_next(arg1); + } + + func = func_node->func; + + if (func == PARS_PRINTF_TOKEN) { + + arg = arg1; + + while (arg) { + dfield_print(que_node_get_val(arg)); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + } else if (func == PARS_ASSERT_TOKEN) { + + if (!eval_node_get_ibool_val(arg1)) { + fputs("SQL assertion fails in a stored procedure!\n", + stderr); + } + + ut_a(eval_node_get_ibool_val(arg1)); + + /* This function, or more precisely, a debug procedure, + returns no value */ + + } else if (func == PARS_RND_TOKEN) { + + len1 = (ulint) eval_node_get_int_val(arg1); + len2 = (ulint) eval_node_get_int_val(arg2); + + ut_ad(len2 >= len1); + + if (len2 > len1) { + int_val = (lint) (len1 + + (eval_rnd % (len2 - len1 + 1))); + } else { + int_val = (lint) len1; + } + + eval_rnd = ut_rnd_gen_next_ulint(eval_rnd); + + eval_node_set_int_val(func_node, int_val); + + } else if (func == PARS_RND_STR_TOKEN) { + + len1 = (ulint) eval_node_get_int_val(arg1); + + data = eval_node_ensure_val_buf(func_node, len1); + + for (i = 0; i < len1; i++) { + data[i] = (byte)(97 + (eval_rnd % 3)); + + eval_rnd = ut_rnd_gen_next_ulint(eval_rnd); + } + } else { + ut_error; + } +} + +/*****************************************************************//** +Evaluates a notfound-function node. */ +UNIV_INLINE +void +eval_notfound( +/*==========*/ + func_node_t* func_node) /*!< in: function node */ +{ + sym_node_t* cursor; + sel_node_t* sel_node; + ibool ibool_val; + + ut_ad(func_node->func == PARS_NOTFOUND_TOKEN); + + cursor = static_cast<sym_node_t*>(func_node->args); + + ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL); + + if (cursor->token_type == SYM_LIT) { + + ut_ad(ut_memcmp(dfield_get_data(que_node_get_val(cursor)), + "SQL", 3) == 0); + + sel_node = cursor->sym_table->query_graph->last_sel_node; + } else { + sel_node = cursor->alias->cursor_def; + } + + if (sel_node->state == SEL_NODE_NO_MORE_ROWS) { + ibool_val = TRUE; + } else { + ibool_val = FALSE; + } + + eval_node_set_ibool_val(func_node, ibool_val); +} + +/*****************************************************************//** +Evaluates a substr-function node. */ +UNIV_INLINE +void +eval_substr( +/*========*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + que_node_t* arg3; + dfield_t* dfield; + byte* str1; + ulint len1; + ulint len2; + + arg1 = func_node->args; + arg2 = que_node_get_next(arg1); + + ut_ad(func_node->func == PARS_SUBSTR_TOKEN); + + arg3 = que_node_get_next(arg2); + + str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1))); + + len1 = (ulint) eval_node_get_int_val(arg2); + len2 = (ulint) eval_node_get_int_val(arg3); + + dfield = que_node_get_val(func_node); + + dfield_set_data(dfield, str1 + len1, len2); +} + +/*****************************************************************//** +Evaluates a replstr-procedure node. 
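+Copies len2 bytes of the second argument over the first argument starting
+at offset len1, i.e. an in-place substring replacement.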
*/ +static +void +eval_replstr( +/*=========*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + que_node_t* arg3; + que_node_t* arg4; + byte* str1; + byte* str2; + ulint len1; + ulint len2; + + arg1 = func_node->args; + arg2 = que_node_get_next(arg1); + + ut_ad(que_node_get_type(arg1) == QUE_NODE_SYMBOL); + + arg3 = que_node_get_next(arg2); + arg4 = que_node_get_next(arg3); + + str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1))); + str2 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg2))); + + len1 = (ulint) eval_node_get_int_val(arg3); + len2 = (ulint) eval_node_get_int_val(arg4); + + if ((dfield_get_len(que_node_get_val(arg1)) < len1 + len2) + || (dfield_get_len(que_node_get_val(arg2)) < len2)) { + + ut_error; + } + + ut_memcpy(str1 + len1, str2, len2); +} + +/*****************************************************************//** +Evaluates an instr-function node. */ +static +void +eval_instr( +/*=======*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + dfield_t* dfield1; + dfield_t* dfield2; + lint int_val; + byte* str1; + byte* str2; + byte match_char; + ulint len1; + ulint len2; + ulint i; + ulint j; + + arg1 = func_node->args; + arg2 = que_node_get_next(arg1); + + dfield1 = que_node_get_val(arg1); + dfield2 = que_node_get_val(arg2); + + str1 = static_cast<byte*>(dfield_get_data(dfield1)); + str2 = static_cast<byte*>(dfield_get_data(dfield2)); + + len1 = dfield_get_len(dfield1); + len2 = dfield_get_len(dfield2); + + if (len2 == 0) { + ut_error; + } + + match_char = str2[0]; + + for (i = 0; i < len1; i++) { + /* In this outer loop, the number of matched characters is 0 */ + + if (str1[i] == match_char) { + + if (i + len2 > len1) { + + break; + } + + for (j = 1;; j++) { + /* We have already matched j characters */ + + if (j == len2) { + int_val = i + 1; + + goto match_found; + } + + if (str1[i + j] != str2[j]) { + + break; + } + } + } + } + + int_val = 0; + +match_found: + eval_node_set_int_val(func_node, int_val); +} + +/*****************************************************************//** +Evaluates a predefined function node. */ +UNIV_INLINE +void +eval_binary_to_number( +/*==================*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg1; + dfield_t* dfield; + byte* str1; + byte* str2; + ulint len1; + ulint int_val; + + arg1 = func_node->args; + + dfield = que_node_get_val(arg1); + + str1 = static_cast<byte*>(dfield_get_data(dfield)); + len1 = dfield_get_len(dfield); + + if (len1 > 4) { + ut_error; + } + + if (len1 == 4) { + str2 = str1; + } else { + int_val = 0; + str2 = (byte*) &int_val; + + ut_memcpy(str2 + (4 - len1), str1, len1); + } + + eval_node_copy_and_alloc_val(func_node, str2, 4); +} + +/*****************************************************************//** +Evaluates a predefined function node. 
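+Concatenates the values of the argument list into a single buffer that
+becomes the value of the function node (CONCAT).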
*/ +static +void +eval_concat( +/*========*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg; + dfield_t* dfield; + byte* data; + ulint len; + ulint len1; + + arg = func_node->args; + len = 0; + + while (arg) { + len1 = dfield_get_len(que_node_get_val(arg)); + + len += len1; + + arg = que_node_get_next(arg); + } + + data = eval_node_ensure_val_buf(func_node, len); + + arg = func_node->args; + len = 0; + + while (arg) { + dfield = que_node_get_val(arg); + len1 = dfield_get_len(dfield); + + ut_memcpy(data + len, dfield_get_data(dfield), len1); + + len += len1; + + arg = que_node_get_next(arg); + } +} + +/*****************************************************************//** +Evaluates a predefined function node. If the first argument is an integer, +this function looks at the second argument which is the integer length in +bytes, and converts the integer to a VARCHAR. +If the first argument is of some other type, this function converts it to +BINARY. */ +UNIV_INLINE +void +eval_to_binary( +/*===========*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + dfield_t* dfield; + byte* str1; + ulint len; + ulint len1; + + arg1 = func_node->args; + + str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1))); + + if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) { + + len = dfield_get_len(que_node_get_val(arg1)); + + dfield = que_node_get_val(func_node); + + dfield_set_data(dfield, str1, len); + + return; + } + + arg2 = que_node_get_next(arg1); + + len1 = (ulint) eval_node_get_int_val(arg2); + + if (len1 > 4) { + + ut_error; + } + + dfield = que_node_get_val(func_node); + + dfield_set_data(dfield, str1 + (4 - len1), len1); +} + +/*****************************************************************//** +Evaluates a predefined function node. */ +UNIV_INLINE +void +eval_predefined( +/*============*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg1; + lint int_val; + byte* data; + int func; + + func = func_node->func; + + arg1 = func_node->args; + + if (func == PARS_LENGTH_TOKEN) { + + int_val = (lint) dfield_get_len(que_node_get_val(arg1)); + + } else if (func == PARS_TO_CHAR_TOKEN) { + + /* Convert number to character string as a + signed decimal integer. */ + + ulint uint_val; + int int_len; + + int_val = eval_node_get_int_val(arg1); + + /* Determine the length of the string. 
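+	For example, -123 yields int_len = 4: one byte for the
+	minus sign and one per decimal digit.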
*/ + + if (int_val == 0) { + int_len = 1; /* the number 0 occupies 1 byte */ + } else { + int_len = 0; + if (int_val < 0) { + uint_val = ((ulint) -int_val - 1) + 1; + int_len++; /* reserve space for minus sign */ + } else { + uint_val = (ulint) int_val; + } + for (; uint_val > 0; int_len++) { + uint_val /= 10; + } + } + + /* allocate the string */ + data = eval_node_ensure_val_buf(func_node, int_len + 1); + + /* add terminating NUL character */ + data[int_len] = 0; + + /* convert the number */ + + if (int_val == 0) { + data[0] = '0'; + } else { + int tmp; + if (int_val < 0) { + data[0] = '-'; /* preceding minus sign */ + uint_val = ((ulint) -int_val - 1) + 1; + } else { + uint_val = (ulint) int_val; + } + for (tmp = int_len; uint_val > 0; uint_val /= 10) { + data[--tmp] = (byte) + ('0' + (byte)(uint_val % 10)); + } + } + + dfield_set_len(que_node_get_val(func_node), int_len); + + return; + + } else if (func == PARS_TO_NUMBER_TOKEN) { + + int_val = atoi((char*) + dfield_get_data(que_node_get_val(arg1))); + + } else if (func == PARS_SYSDATE_TOKEN) { + int_val = (lint) ut_time(); + } else { + eval_predefined_2(func_node); + + return; + } + + eval_node_set_int_val(func_node, int_val); +} + +/*****************************************************************//** +Evaluates a function node. */ +UNIV_INTERN +void +eval_func( +/*======*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg; + ulint fclass; + ulint func; + + ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC); + + fclass = func_node->fclass; + func = func_node->func; + + arg = func_node->args; + + /* Evaluate first the argument list */ + while (arg) { + eval_exp(arg); + + /* The functions are not defined for SQL null argument + values, except for eval_cmp and notfound */ + + if (dfield_is_null(que_node_get_val(arg)) + && (fclass != PARS_FUNC_CMP) + && (func != PARS_NOTFOUND_TOKEN) + && (func != PARS_PRINTF_TOKEN)) { + ut_error; + } + + arg = que_node_get_next(arg); + } + + switch (fclass) { + case PARS_FUNC_CMP: + eval_cmp(func_node); + return; + case PARS_FUNC_ARITH: + eval_arith(func_node); + return; + case PARS_FUNC_AGGREGATE: + eval_aggregate(func_node); + return; + case PARS_FUNC_PREDEFINED: + switch (func) { + case PARS_NOTFOUND_TOKEN: + eval_notfound(func_node); + return; + case PARS_SUBSTR_TOKEN: + eval_substr(func_node); + return; + case PARS_REPLSTR_TOKEN: + eval_replstr(func_node); + return; + case PARS_INSTR_TOKEN: + eval_instr(func_node); + return; + case PARS_BINARY_TO_NUMBER_TOKEN: + eval_binary_to_number(func_node); + return; + case PARS_CONCAT_TOKEN: + eval_concat(func_node); + return; + case PARS_TO_BINARY_TOKEN: + eval_to_binary(func_node); + return; + default: + eval_predefined(func_node); + return; + } + case PARS_FUNC_LOGICAL: + eval_logical(func_node); + return; + } + + ut_error; +} diff --git a/storage/xtradb/eval/eval0proc.cc b/storage/xtradb/eval/eval0proc.cc new file mode 100644 index 00000000000..e6f3a32cd48 --- /dev/null +++ b/storage/xtradb/eval/eval0proc.cc @@ -0,0 +1,296 @@ +/***************************************************************************** + +Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file eval/eval0proc.cc +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "eval0proc.h" + +#ifdef UNIV_NONINL +#include "eval0proc.ic" +#endif + +/**********************************************************************//** +Performs an execution step of an if-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +if_step( +/*====*/ + que_thr_t* thr) /*!< in: query thread */ +{ + if_node_t* node; + elsif_node_t* elsif_node; + + ut_ad(thr); + + node = static_cast<if_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_IF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Evaluate the condition */ + + eval_exp(node->cond); + + if (eval_node_get_ibool_val(node->cond)) { + + /* The condition evaluated to TRUE: start execution + from the first statement in the statement list */ + + thr->run_node = node->stat_list; + + } else if (node->else_part) { + thr->run_node = node->else_part; + + } else if (node->elsif_list) { + elsif_node = node->elsif_list; + + for (;;) { + eval_exp(elsif_node->cond); + + if (eval_node_get_ibool_val( + elsif_node->cond)) { + + /* The condition evaluated to TRUE: + start execution from the first + statement in the statement list */ + + thr->run_node = elsif_node->stat_list; + + break; + } + + elsif_node = static_cast<elsif_node_t*>( + que_node_get_next(elsif_node)); + + if (elsif_node == NULL) { + thr->run_node = NULL; + + break; + } + } + } else { + thr->run_node = NULL; + } + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a while-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +while_step( +/*=======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + while_node_t* node; + + ut_ad(thr); + + node = static_cast<while_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_WHILE); + + ut_ad((thr->prev_node == que_node_get_parent(node)) + || (que_node_get_next(thr->prev_node) == NULL)); + + /* Evaluate the condition */ + + eval_exp(node->cond); + + if (eval_node_get_ibool_val(node->cond)) { + + /* The condition evaluated to TRUE: start execution + from the first statement in the statement list */ + + thr->run_node = node->stat_list; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of an assignment statement node. 
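+Evaluates the right-hand side expression and copies the result into the
+node aliased by the variable on the left-hand side.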
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +assign_step( +/*========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + assign_node_t* node; + + ut_ad(thr); + + node = static_cast<assign_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT); + + /* Evaluate the value to assign */ + + eval_exp(node->val); + + eval_node_copy_val(node->var->alias, node->val); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a for-loop node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +for_step( +/*=====*/ + que_thr_t* thr) /*!< in: query thread */ +{ + for_node_t* node; + que_node_t* parent; + lint loop_var_value; + + ut_ad(thr); + + node = static_cast<for_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_FOR); + + parent = que_node_get_parent(node); + + if (thr->prev_node != parent) { + + /* Move to the next statement */ + thr->run_node = que_node_get_next(thr->prev_node); + + if (thr->run_node != NULL) { + + return(thr); + } + + /* Increment the value of loop_var */ + + loop_var_value = 1 + eval_node_get_int_val(node->loop_var); + } else { + /* Initialize the loop */ + + eval_exp(node->loop_start_limit); + eval_exp(node->loop_end_limit); + + loop_var_value = eval_node_get_int_val(node->loop_start_limit); + + node->loop_end_value + = (int) eval_node_get_int_val(node->loop_end_limit); + } + + /* Check if we should do another loop */ + + if (loop_var_value > node->loop_end_value) { + + /* Enough loops done */ + + thr->run_node = parent; + } else { + eval_node_set_int_val(node->loop_var, loop_var_value); + + thr->run_node = node->stat_list; + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of an exit statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +exit_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + exit_node_t* node; + que_node_t* loop_node; + + ut_ad(thr); + + node = static_cast<exit_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_EXIT); + + /* Loops exit by setting thr->run_node as the loop node's parent, so + find our containing loop node and get its parent. */ + + loop_node = que_node_get_containing_loop_node(node); + + /* If someone uses an EXIT statement outside of a loop, this will + trigger. */ + ut_a(loop_node); + + thr->run_node = que_node_get_parent(loop_node); + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a return-statement node. 
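+Walks up the query graph to the enclosing QUE_NODE_PROC node and resumes
+execution at that node's parent.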
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +return_step( +/*========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + return_node_t* node; + que_node_t* parent; + + ut_ad(thr); + + node = static_cast<return_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_RETURN); + + parent = node; + + while (que_node_get_type(parent) != QUE_NODE_PROC) { + + parent = que_node_get_parent(parent); + } + + ut_a(parent); + + thr->run_node = que_node_get_parent(parent); + + return(thr); +} diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc new file mode 100644 index 00000000000..1a23d844522 --- /dev/null +++ b/storage/xtradb/fil/fil0fil.cc @@ -0,0 +1,6699 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fil/fil0fil.cc +The tablespace memory cache + +Created 10/25/1995 Heikki Tuuri +*******************************************************/ + +#include "fil0fil.h" + +#include <debug_sync.h> +#include <my_dbug.h> + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ + +/* + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= + +The tablespace cache is responsible for providing fast read/write access to +tablespaces and logs of the database. File creation and deletion is done +in other modules which know more of the logic of the operation, however. + +A tablespace consists of a chain of files. The size of the files does not +have to be divisible by the database block size, because we may just leave +the last incomplete block unused. When a new file is appended to the +tablespace, the maximum size of the file is also specified. At the moment, +we think that it is best to extend the file to its maximum size already at +the creation of the file, because then we can avoid dynamically extending +the file when more space is needed for the tablespace. + +A block's position in the tablespace is specified with a 32-bit unsigned +integer. 
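+(With the default 16 kB page size this can address 2^32 pages, i.e. 64 TB
+per tablespace.)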
The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to that of Oracle.
+
+To achieve faster disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are distributed in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database, and RAID disks
+implement striping in hardware anyway; we conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files with
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using a raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. On a 100 MHz Pentium with NT, an NTFS file system
+and an EIDE Conner disk we measured only a negligible difference in speed
+between reading from a file and reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+into a hash table. Each tablespace and log file is given a unique 32-bit
+identifier.
+
+Some operating systems do not support many open files at the same time,
+though NT seems to tolerate at least 900 open files. Therefore, we put the
+open files in an LRU-list. If we need to open another file, we may close the
+file at the end of the LRU-list. When an i/o-operation is pending on a file,
+the file cannot be closed. We take the file nodes with pending i/o-operations
+out of the LRU-list and keep a count of pending operations. When an operation
+completes, we decrement the count and return the file node to the LRU-list if
+the count drops to zero. */
+
+/** When mysqld is run, the default directory "."
is the mysqld datadir, +but in the MySQL Embedded Server Library and mysqlbackup it is not the default +directory, and we must set the base file path explicitly */ +UNIV_INTERN const char* fil_path_to_mysql_datadir = "."; + +/** The number of fsyncs done to the log */ +UNIV_INTERN ulint fil_n_log_flushes = 0; + +/** Number of pending redo log flushes */ +UNIV_INTERN ulint fil_n_pending_log_flushes = 0; +/** Number of pending tablespace flushes */ +UNIV_INTERN ulint fil_n_pending_tablespace_flushes = 0; + +/** Number of files currently open */ +UNIV_INTERN ulint fil_n_file_opened = 0; + +/** The null file address */ +UNIV_INTERN fil_addr_t fil_addr_null = {FIL_NULL, 0}; + +#ifdef UNIV_PFS_MUTEX +/* Key to register fil_system_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t fil_system_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_PFS_RWLOCK +/* Key to register file space latch with performance schema */ +UNIV_INTERN mysql_pfs_key_t fil_space_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +/** File node of a tablespace or the log data space */ +struct fil_node_t { + fil_space_t* space; /*!< backpointer to the space where this node + belongs */ + char* name; /*!< path to the file */ + ibool open; /*!< TRUE if file open */ + os_file_t handle; /*!< OS handle to the file, if file open */ + os_event_t sync_event;/*!< Condition event to group and + serialize calls to fsync */ + ibool is_raw_disk;/*!< TRUE if the 'file' is actually a raw + device or a raw disk partition */ + ulint size; /*!< size of the file in database pages, 0 if + not known yet; the possible last incomplete + megabyte may be ignored if space == 0 */ + ulint n_pending; + /*!< count of pending i/o's on this file; + closing of the file is not allowed if + this is > 0 */ + ulint n_pending_flushes; + /*!< count of pending flushes on this file; + closing of the file is not allowed if + this is > 0 */ + ibool being_extended; + /*!< TRUE if the node is currently + being extended. */ + ib_int64_t modification_counter;/*!< when we write to the file we + increment this by one */ + ib_int64_t flush_counter;/*!< up to what + modification_counter value we have + flushed the modifications to disk */ + UT_LIST_NODE_T(fil_node_t) chain; + /*!< link field for the file chain */ + UT_LIST_NODE_T(fil_node_t) LRU; + /*!< link field for the LRU list */ + ulint magic_n;/*!< FIL_NODE_MAGIC_N */ +}; + +/** Value of fil_node_t::magic_n */ +#define FIL_NODE_MAGIC_N 89389 + +/** Tablespace or log data space: let us call them by a common name space */ +struct fil_space_t { + char* name; /*!< space name = the path to the first file in + it */ + ulint id; /*!< space id */ + ib_int64_t tablespace_version; + /*!< in DISCARD/IMPORT this timestamp + is used to check if we should ignore + an insert buffer merge request for a + page because it actually was for the + previous incarnation of the space */ + ibool mark; /*!< this is set to TRUE at database startup if + the space corresponds to a table in the InnoDB + data dictionary; so we can print a warning of + orphaned tablespaces */ + ibool stop_ios;/*!< TRUE if we want to rename the + .ibd file of tablespace and want to + stop temporarily posting of new i/o + requests on the file */ + ibool stop_new_ops; + /*!< we set this TRUE when we start + deleting a single-table tablespace. 
+ When this is set following new ops + are not allowed: + * read IO request + * ibuf merge + * file flush + Note that we can still possibly have + new write operations because we don't + check this flag when doing flush + batches. */ + ulint purpose;/*!< FIL_TABLESPACE, FIL_LOG, or + FIL_ARCH_LOG */ + UT_LIST_BASE_NODE_T(fil_node_t) chain; + /*!< base node for the file chain */ + ulint size; /*!< space size in pages; 0 if a single-table + tablespace whose size we do not know yet; + last incomplete megabytes in data files may be + ignored if space == 0 */ + ulint flags; /*!< tablespace flags; see + fsp_flags_is_valid(), + fsp_flags_get_zip_size() */ + ulint n_reserved_extents; + /*!< number of reserved free extents for + ongoing operations like B-tree page split */ + ulint n_pending_flushes; /*!< this is positive when flushing + the tablespace to disk; dropping of the + tablespace is forbidden if this is positive */ + ulint n_pending_ops;/*!< this is positive when we + have pending operations against this + tablespace. The pending operations can + be ibuf merges or lock validation code + trying to read a block. + Dropping of the tablespace is forbidden + if this is positive */ + hash_node_t hash; /*!< hash chain node */ + hash_node_t name_hash;/*!< hash chain the name_hash table */ +#ifndef UNIV_HOTBACKUP + prio_rw_lock_t latch; /*!< latch protecting the file space storage + allocation */ +#endif /* !UNIV_HOTBACKUP */ + UT_LIST_NODE_T(fil_space_t) unflushed_spaces; + /*!< list of spaces with at least one unflushed + file we have written to */ + bool is_in_unflushed_spaces; + /*!< true if this space is currently in + unflushed_spaces */ + ibool is_corrupt; + UT_LIST_NODE_T(fil_space_t) space_list; + /*!< list of all spaces */ + ulint magic_n;/*!< FIL_SPACE_MAGIC_N */ +}; + +/** Value of fil_space_t::magic_n */ +#define FIL_SPACE_MAGIC_N 89472 + +/** The tablespace memory cache; also the totality of logs (the log +data space) is stored here; below we talk about tablespaces, but also +the ib_logfiles form a 'space' and it is handled here */ +struct fil_system_t { +#ifndef UNIV_HOTBACKUP + ib_mutex_t mutex; /*!< The mutex protecting the cache */ +#endif /* !UNIV_HOTBACKUP */ + hash_table_t* spaces; /*!< The hash table of spaces in the + system; they are hashed on the space + id */ + hash_table_t* name_hash; /*!< hash table based on the space + name */ + UT_LIST_BASE_NODE_T(fil_node_t) LRU; + /*!< base node for the LRU list of the + most recently used open files with no + pending i/o's; if we start an i/o on + the file, we first remove it from this + list, and return it to the start of + the list when the i/o ends; + log files and the system tablespace are + not put to this list: they are opened + after the startup, and kept open until + shutdown */ + UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces; + /*!< base node for the list of those + tablespaces whose files contain + unflushed writes; those spaces have + at least one file node where + modification_counter > flush_counter */ + ulint n_open; /*!< number of files currently open */ + ulint max_n_open; /*!< n_open is not allowed to exceed + this */ + ib_int64_t modification_counter;/*!< when we write to a file we + increment this by one */ + ulint max_assigned_id;/*!< maximum space id in the existing + tables, or assigned during the time + mysqld has been up; at an InnoDB + startup we scan the data dictionary + and set here the maximum of the + space id's of the tables there */ + ib_int64_t tablespace_version; + /*!< a counter which is 
incremented for + every space object memory creation; + every space mem object gets a + 'timestamp' from this; in DISCARD/ + IMPORT this is used to check if we + should ignore an insert buffer merge + request */ + UT_LIST_BASE_NODE_T(fil_space_t) space_list; + /*!< list of all file spaces */ + ibool space_id_reuse_warned; + /* !< TRUE if fil_space_create() + has issued a warning about + potential space_id reuse */ +}; + +/** The tablespace memory cache. This variable is NULL before the module is +initialized. */ +static fil_system_t* fil_system = NULL; + +/** Determine if (i) is a user tablespace id or not. */ +# define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces_open) + +/** Determine if user has explicitly disabled fsync(). */ +#ifndef __WIN__ +# define fil_buffering_disabled(s) \ + (((s)->purpose == FIL_TABLESPACE \ + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)\ + || ((s)->purpose == FIL_LOG \ + && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)) + +#else /* __WIN__ */ +# define fil_buffering_disabled(s) (0) +#endif /* __WIN__ */ + +#ifdef UNIV_DEBUG +/** Try fil_validate() every this many times */ +# define FIL_VALIDATE_SKIP 17 + +/******************************************************************//** +Checks the consistency of the tablespace cache some of the time. +@return TRUE if ok or the check was skipped */ +static +ibool +fil_validate_skip(void) +/*===================*/ +{ + /** The fil_validate() call skip counter. Use a signed type + because of the race condition below. */ + static int fil_validate_count = FIL_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly fil_validate() check + in debug builds. */ + if (--fil_validate_count > 0) { + return(TRUE); + } + + fil_validate_count = FIL_VALIDATE_SKIP; + return(fil_validate()); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Determines if a file node belongs to the least-recently-used list. +@return TRUE if the file belongs to fil_system->LRU mutex. */ +UNIV_INLINE +ibool +fil_space_belongs_in_lru( +/*=====================*/ + const fil_space_t* space) /*!< in: file space */ +{ + return(space->purpose == FIL_TABLESPACE + && fil_is_user_tablespace_id(space->id)); +} + +/********************************************************************//** +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. +@return false if the file can't be opened, otherwise true */ +static +bool +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space); /*!< in: space */ +/********************************************************************//** +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. 
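+For a write it also advances the node's modification_counter, which keeps
+the space on the unflushed_spaces list until a flush brings flush_counter
+up to date.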
*/ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + ulint type); /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ +/*******************************************************************//** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. +@return TRUE on success */ +static +ibool +fil_space_free( +/*===========*/ + ulint id, /* in: space id */ + ibool x_latched); /* in: TRUE if caller has space->latch + in X mode */ +/********************************************************************//** +Reads data from a space to a buffer. Remember that the possible incomplete +blocks at the end of file are ignored: they are not taken into account when +calculating the byte offset within a space. +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +UNIV_INLINE +dberr_t +fil_read( +/*=====*/ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in aio + this must be divisible by the OS block size */ + ulint len, /*!< in: how many bytes to read; this must not + cross a file boundary; in aio this must be a + block size multiple */ + void* buf, /*!< in/out: buffer where to store data read; + in aio this must be appropriately aligned */ + void* message) /*!< in: message for aio handler if non-sync + aio used, else ignored */ +{ + return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, + byte_offset, len, buf, message)); +} + +/********************************************************************//** +Writes data to a space from a buffer. Remember that the possible incomplete +blocks at the end of file are ignored: they are not taken into account when +calculating the byte offset within a space. +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +UNIV_INLINE +dberr_t +fil_write( +/*======*/ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in aio + this must be divisible by the OS block size */ + ulint len, /*!< in: how many bytes to write; this must + not cross a file boundary; in aio this must + be a block size multiple */ + void* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + void* message) /*!< in: message for aio handler if non-sync + aio used, else ignored */ +{ + ut_ad(!srv_read_only_mode); + + return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, + byte_offset, len, buf, message)); +} + +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. 
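+A minimal lookup sketch (illustrative; the caller must hold
+fil_system->mutex, as the assertion below enforces):
+
+	mutex_enter(&fil_system->mutex);
+	space = fil_space_get_by_id(space_id);
+	mutex_exit(&fil_system->mutex);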
*/ +UNIV_INLINE +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + ut_ad(mutex_own(&fil_system->mutex)); + + HASH_SEARCH(hash, fil_system->spaces, id, + fil_space_t*, space, + ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), + space->id == id); + + return(space); +} + +/*******************************************************************//** +Returns the table space by a given name, NULL if not found. */ +UNIV_INLINE +fil_space_t* +fil_space_get_by_name( +/*==================*/ + const char* name) /*!< in: space name */ +{ + fil_space_t* space; + ulint fold; + + ut_ad(mutex_own(&fil_system->mutex)); + + fold = ut_fold_string(name); + + HASH_SEARCH(name_hash, fil_system->name_hash, fold, + fil_space_t*, space, + ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), + !strcmp(name, space->name)); + + return(space); +} + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Returns the version number of a tablespace, -1 if not found. +@return version number, -1 if the tablespace does not exist in the +memory cache */ +UNIV_INTERN +ib_int64_t +fil_space_get_version( +/*==================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ib_int64_t version = -1; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space) { + version = space->tablespace_version; + } + + mutex_exit(&fil_system->mutex); + + return(version); +} + +/*******************************************************************//** +Returns the latch of a file space. +@return latch protecting storage allocation */ +UNIV_INTERN +prio_rw_lock_t* +fil_space_get_latch( +/*================*/ + ulint id, /*!< in: space id */ + ulint* flags) /*!< out: tablespace flags */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + if (flags) { + *flags = space->flags; + } + + mutex_exit(&fil_system->mutex); + + return(&(space->latch)); +} + +/*******************************************************************//** +Returns the type of a file space. +@return FIL_TABLESPACE or FIL_LOG */ +UNIV_INTERN +ulint +fil_space_get_type( +/*===============*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + mutex_exit(&fil_system->mutex); + + return(space->purpose); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Checks if all the file nodes in a space are flushed. The caller must hold +the fil_system mutex. +@return true if all are flushed */ +static +bool +fil_space_is_flushed( +/*=================*/ + fil_space_t* space) /*!< in: space */ +{ + fil_node_t* node; + + ut_ad(mutex_own(&fil_system->mutex)); + + node = UT_LIST_GET_FIRST(space->chain); + + while (node) { + if (node->modification_counter > node->flush_counter) { + + ut_ad(!fil_buffering_disabled(space)); + return(false); + } + + node = UT_LIST_GET_NEXT(chain, node); + } + + return(true); +} + +/*******************************************************************//** +Appends a new file to the chain of files of a space. File must be closed. 
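+
+A minimal registration sketch (illustrative names; the space must have
+been created first with fil_space_create()):
+
+	fil_space_create("test/t1", space_id, flags, FIL_TABLESPACE);
+	fil_node_create("./test/t1.ibd", FIL_IBD_FILE_INITIAL_SIZE,
+			space_id, FALSE);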
+@return pointer to the file name, or NULL on error */ +UNIV_INTERN +char* +fil_node_create( +/*============*/ + const char* name, /*!< in: file name (file must be closed) */ + ulint size, /*!< in: file size in database blocks, rounded + downwards to an integer */ + ulint id, /*!< in: space id where to append */ + ibool is_raw) /*!< in: TRUE if a raw device or + a raw disk partition */ +{ + fil_node_t* node; + fil_space_t* space; + + ut_a(fil_system); + ut_a(name); + + mutex_enter(&fil_system->mutex); + + node = static_cast<fil_node_t*>(mem_zalloc(sizeof(fil_node_t))); + + node->name = mem_strdup(name); + + ut_a(!is_raw || srv_start_raw_disk_in_use); + + node->sync_event = os_event_create(); + node->is_raw_disk = is_raw; + node->size = size; + node->magic_n = FIL_NODE_MAGIC_N; + + space = fil_space_get_by_id(id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Could not find tablespace %lu for\n" + "InnoDB: file ", (ulong) id); + ut_print_filename(stderr, name); + fputs(" in the tablespace memory cache.\n", stderr); + mem_free(node->name); + + mem_free(node); + + mutex_exit(&fil_system->mutex); + + return(NULL); + } + + space->size += size; + + node->space = space; + + UT_LIST_ADD_LAST(chain, space->chain, node); + + if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) { + + fil_system->max_assigned_id = id; + } + + mutex_exit(&fil_system->mutex); + + return(node->name); +} + +/********************************************************************//** +Opens a file of a node of a tablespace. The caller must own the fil_system +mutex. +@return false if the file can't be opened, otherwise true */ +static +bool +fil_node_open_file( +/*===============*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space) /*!< in: space */ +{ + os_offset_t size_bytes; + ibool ret; + ibool success; + byte* buf2; + byte* page; + ulint space_id; + ulint flags; + ulint page_size; + + ut_ad(mutex_own(&(system->mutex))); + ut_a(node->n_pending == 0); + ut_a(node->open == FALSE); + + if (node->size == 0) { + /* It must be a single-table tablespace and we do not know the + size of the file yet. First we open the file in the normal + mode, no async I/O here, for simplicity. Then do some checks, + and close the file again. + NOTE that we could not use the simple file read function + os_file_read() in Windows to read from a file opened for + async I/O! */ + + node->handle = os_file_create_simple_no_error_handling( + innodb_file_data_key, node->name, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + ib_logf(IB_LOG_LEVEL_WARN, "InnoDB: Error: cannot " + "open %s\n. 
InnoDB: Have you deleted .ibd " + "files under a running mysqld server?\n", + node->name); + + return(false); + } + + size_bytes = os_file_get_size(node->handle); + ut_a(size_bytes != (os_offset_t) -1); +#ifdef UNIV_HOTBACKUP + if (space->id == 0) { + node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + os_file_close(node->handle); + goto add_size; + } +#endif /* UNIV_HOTBACKUP */ + ut_a(space->purpose != FIL_LOG); + ut_a(fil_is_user_tablespace_id(space->id)); + + if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Error: the size of single-table" + " tablespace file %s\n" + "InnoDB: is only " UINT64PF "," + " should be at least %lu!\n", + node->name, + size_bytes, + (ulong) (FIL_IBD_FILE_INITIAL_SIZE + * UNIV_PAGE_SIZE)); + + ut_a(0); + } + + /* Read the first page of the tablespace */ + + buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + /* Align the memory for file i/o if we might have O_DIRECT + set */ + page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + + success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE); + space_id = fsp_header_get_space_id(page); + flags = fsp_header_get_flags(page); + page_size = fsp_flags_get_page_size(flags); + + ut_free(buf2); + + /* Close the file now that we have read the space id from it */ + + os_file_close(node->handle); + + if (UNIV_UNLIKELY(space_id != space->id)) { + fprintf(stderr, + "InnoDB: Error: tablespace id is %lu" + " in the data dictionary\n" + "InnoDB: but in file %s it is %lu!\n", + space->id, node->name, space_id); + + ut_error; + } + + if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED + || space_id == 0)) { + fprintf(stderr, + "InnoDB: Error: tablespace id %lu" + " in file %s is not sensible\n", + (ulong) space_id, node->name); + + ut_error; + } + + if (UNIV_UNLIKELY(fsp_flags_get_page_size(space->flags) + != page_size)) { + fprintf(stderr, + "InnoDB: Error: tablespace file %s" + " has page size 0x%lx\n" + "InnoDB: but the data dictionary" + " expects page size 0x%lx!\n", + node->name, flags, + fsp_flags_get_page_size(space->flags)); + + ut_error; + } + + if (UNIV_UNLIKELY(space->flags != flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + + ut_error; + } + + if (size_bytes >= 1024 * 1024) { + /* Truncate the size to whole megabytes. */ + size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); + } + + if (!fsp_flags_is_compressed(flags)) { + node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + } else { + node->size = (ulint) + (size_bytes + / fsp_flags_get_zip_size(flags)); + } + +#ifdef UNIV_HOTBACKUP +add_size: +#endif /* UNIV_HOTBACKUP */ + space->size += node->size; + } + + /* printf("Opening file %s\n", node->name); */ + + /* Open the file for reading and writing, in Windows normally in the + unbuffered async I/O mode, though global variables may make + os_file_create() to fall back to the normal file I/O mode. 
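+
+The branch below picks the file type passed to os_file_create():
+OS_LOG_FILE for redo log files, OS_FILE_OPEN_RAW for raw devices or
+disk partitions, and a regular OS_DATA_FILE open otherwise.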
*/ + + if (space->purpose == FIL_LOG) { + node->handle = os_file_create(innodb_file_log_key, + node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_LOG_FILE, + &ret); + } else if (node->is_raw_disk) { + node->handle = os_file_create(innodb_file_data_key, + node->name, + OS_FILE_OPEN_RAW, + OS_FILE_AIO, OS_DATA_FILE, + &ret); + } else { + node->handle = os_file_create(innodb_file_data_key, + node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, + &ret); + } + + ut_a(ret); + + node->open = TRUE; + + system->n_open++; + fil_n_file_opened++; + + if (fil_space_belongs_in_lru(space)) { + + /* Put the node to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } + + return(true); +} + +/**********************************************************************//** +Closes a file. */ +static +void +fil_node_close_file( +/*================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system) /*!< in: tablespace memory cache */ +{ + ibool ret; + + ut_ad(node && system); + ut_ad(mutex_own(&(system->mutex))); + ut_a(node->open); + ut_a(node->n_pending == 0); + ut_a(node->n_pending_flushes == 0); + ut_a(!node->being_extended); +#ifndef UNIV_HOTBACKUP + ut_a(node->modification_counter == node->flush_counter + || srv_fast_shutdown == 2); +#endif /* !UNIV_HOTBACKUP */ + + ret = os_file_close(node->handle); + ut_a(ret); + + /* printf("Closing file %s\n", node->name); */ + + node->open = FALSE; + ut_a(system->n_open > 0); + system->n_open--; + fil_n_file_opened--; + + if (fil_space_belongs_in_lru(node->space)) { + + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); + + /* The node is in the LRU list, remove it */ + UT_LIST_REMOVE(LRU, system->LRU, node); + } +} + +/********************************************************************//** +Tries to close a file in the LRU list. The caller must hold the fil_sys +mutex. 
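+A node can only be closed when it has no unflushed modifications
+(modification_counter == flush_counter), no pending flushes, and is
+not being extended; the list is scanned from its least-recently-used
+end.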
+@return TRUE if success, FALSE if should retry later; since i/o's +generally complete in < 100 ms, and as InnoDB writes at most 128 pages +from the buffer pool in a batch, and then immediately flushes the +files, there is a good chance that the next time we find a suitable +node from the LRU list */ +static +ibool +fil_try_to_close_file_in_LRU( +/*=========================*/ + ibool print_info) /*!< in: if TRUE, prints information why it + cannot close a file */ +{ + fil_node_t* node; + + ut_ad(mutex_own(&fil_system->mutex)); + + if (print_info) { + fprintf(stderr, + "InnoDB: fil_sys open file LRU len %lu\n", + (ulong) UT_LIST_GET_LEN(fil_system->LRU)); + } + + for (node = UT_LIST_GET_LAST(fil_system->LRU); + node != NULL; + node = UT_LIST_GET_PREV(LRU, node)) { + + if (node->modification_counter == node->flush_counter + && node->n_pending_flushes == 0 + && !node->being_extended) { + + fil_node_close_file(node, fil_system); + + return(TRUE); + } + + if (!print_info) { + continue; + } + + if (node->n_pending_flushes > 0) { + fputs("InnoDB: cannot close file ", stderr); + ut_print_filename(stderr, node->name); + fprintf(stderr, ", because n_pending_flushes %lu\n", + (ulong) node->n_pending_flushes); + } + + if (node->modification_counter != node->flush_counter) { + fputs("InnoDB: cannot close file ", stderr); + ut_print_filename(stderr, node->name); + fprintf(stderr, + ", because mod_count %ld != fl_count %ld\n", + (long) node->modification_counter, + (long) node->flush_counter); + + } + + if (node->being_extended) { + fputs("InnoDB: cannot close file ", stderr); + ut_print_filename(stderr, node->name); + fprintf(stderr, ", because it is being extended\n"); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Reserves the fil_system mutex and tries to make sure we can open at least one +file while holding it. This should be called before calling +fil_node_prepare_for_io(), because that function may need to open a file. */ +static +void +fil_mutex_enter_and_prepare_for_io( +/*===============================*/ + ulint space_id) /*!< in: space id */ +{ + fil_space_t* space; + ibool success; + ibool print_info = FALSE; + ulint count = 0; + ulint count2 = 0; + +retry: + mutex_enter(&fil_system->mutex); + + if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) { + /* We keep log files and system tablespace files always open; + this is important in preventing deadlocks in this module, as + a page read completion often performs another read from the + insert buffer. The insert buffer is in tablespace 0, and we + cannot end up waiting in this function. */ + + return; + } + + space = fil_space_get_by_id(space_id); + + if (space != NULL && space->stop_ios) { + /* We are going to do a rename file and want to stop new i/o's + for a while */ + + if (count2 > 20000) { + fputs("InnoDB: Warning: tablespace ", stderr); + ut_print_filename(stderr, space->name); + fprintf(stderr, + " has i/o ops stopped for a long time %lu\n", + (ulong) count2); + } + + mutex_exit(&fil_system->mutex); + +#ifndef UNIV_HOTBACKUP + + /* Wake the i/o-handler threads to make sure pending + i/o's are performed */ + os_aio_simulated_wake_handler_threads(); + + /* The sleep here is just to give IO helper threads a + bit of time to do some work. It is not required that + all IO related to the tablespace being renamed must + be flushed here as we do fil_flush() in + fil_rename_tablespace() as well. 
*/ + os_thread_sleep(20000); + +#endif /* UNIV_HOTBACKUP */ + + /* Flush tablespaces so that we can close modified + files in the LRU list */ + fil_flush_file_spaces(FIL_TABLESPACE); + + os_thread_sleep(20000); + + count2++; + + goto retry; + } + + if (fil_system->n_open < fil_system->max_n_open) { + + return; + } + + /* If the file is already open, no need to do anything; if the space + does not exist, we handle the situation in the function which called + this function */ + + if (!space || UT_LIST_GET_FIRST(space->chain)->open) { + + return; + } + + if (count > 1) { + print_info = TRUE; + } + + /* Too many files are open, try to close some */ +close_more: + success = fil_try_to_close_file_in_LRU(print_info); + + if (success && fil_system->n_open >= fil_system->max_n_open) { + + goto close_more; + } + + if (fil_system->n_open < fil_system->max_n_open) { + /* Ok */ + + return; + } + + if (count >= 2) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: too many (%lu) files stay open" + " while the maximum\n" + "InnoDB: allowed value would be %lu.\n" + "InnoDB: You may need to raise the value of" + " innodb_open_files in\n" + "InnoDB: my.cnf.\n", + (ulong) fil_system->n_open, + (ulong) fil_system->max_n_open); + + return; + } + + mutex_exit(&fil_system->mutex); + +#ifndef UNIV_HOTBACKUP + /* Wake the i/o-handler threads to make sure pending i/o's are + performed */ + os_aio_simulated_wake_handler_threads(); + + os_thread_sleep(20000); +#endif + /* Flush tablespaces so that we can close modified files in the LRU + list */ + + fil_flush_file_spaces(FIL_TABLESPACE); + + count++; + + goto retry; +} + +/*******************************************************************//** +Frees a file node object from a tablespace memory cache. */ +static +void +fil_node_free( +/*==========*/ + fil_node_t* node, /*!< in, own: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space) /*!< in: space where the file node is chained */ +{ + ut_ad(node && system && space); + ut_ad(mutex_own(&(system->mutex))); + ut_a(node->magic_n == FIL_NODE_MAGIC_N); + ut_a(node->n_pending == 0); + ut_a(!node->being_extended); + + if (node->open) { + /* We fool the assertion in fil_node_close_file() to think + there are no unflushed modifications in the file */ + + node->modification_counter = node->flush_counter; + os_event_set(node->sync_event); + + if (fil_buffering_disabled(space)) { + + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(fil_space_is_flushed(space)); + + } else if (space->is_in_unflushed_spaces + && fil_space_is_flushed(space)) { + + space->is_in_unflushed_spaces = false; + + UT_LIST_REMOVE(unflushed_spaces, + system->unflushed_spaces, + space); + } + + fil_node_close_file(node, system); + } + + space->size -= node->size; + + UT_LIST_REMOVE(chain, space->chain, node); + + os_event_free(node->sync_event); + mem_free(node->name); + mem_free(node); +} + +#ifdef UNIV_LOG_ARCHIVE +/****************************************************************//** +Drops files from the start of a file space, so that its size is cut by +the amount given. 
*/
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+	ulint	id,		/*!< in: space id */
+	ulint	trunc_len)	/*!< in: truncate by this much; it is an error
+				if this does not equal the combined size of
+				some initial files in the space */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	while (trunc_len > 0) {
+		node = UT_LIST_GET_FIRST(space->chain);
+
+		ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len);
+
+		trunc_len -= node->size * UNIV_PAGE_SIZE;
+
+		fil_node_free(node, fil_system, space);
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/****************************************************************//**
+Checks whether there is a node with the given name in the file space. */
+UNIV_INTERN
+ibool
+fil_space_contains_node(
+/*====================*/
+	ulint	id,		/*!< in: space id */
+	char*	node_name)	/*!< in: node name */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	for (node = UT_LIST_GET_FIRST(space->chain); node != NULL;
+	     node = UT_LIST_GET_NEXT(chain, node)) {
+
+		if (ut_strcmp(node->name, node_name) == 0) {
+			mutex_exit(&fil_system->mutex);
+			return(TRUE);
+		}
+
+	}
+
+	mutex_exit(&fil_system->mutex);
+	return(FALSE);
+}
+
+#endif /* UNIV_LOG_ARCHIVE */
+
+/*******************************************************************//**
+Creates a space memory object and puts it into the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+	const char*	name,	/*!< in: space name */
+	ulint		id,	/*!< in: space id */
+	ulint		flags,	/*!< in: tablespace flags */
+	ulint		purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+	fil_space_t*	space;
+
+	DBUG_EXECUTE_IF("fil_space_create_failure", return(false););
+
+	ut_a(fil_system);
+	ut_a(fsp_flags_is_valid(flags));
+
+	/* Look for a matching tablespace and if found free it.
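+	(The do-while loop below re-checks under fil_system->mutex and
+	repeats until no cache entry with this name remains.)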
*/ + do { + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_name(name); + + if (space != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Tablespace '%s' exists in the cache " + "with id %lu != %lu", + name, (ulong) space->id, (ulong) id); + + if (id == 0 || purpose != FIL_TABLESPACE) { + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + ib_logf(IB_LOG_LEVEL_WARN, + "Freeing existing tablespace '%s' entry " + "from the cache with id %lu", + name, (ulong) id); + + ibool success = fil_space_free(space->id, FALSE); + ut_a(success); + + mutex_exit(&fil_system->mutex); + } + + } while (space != 0); + + space = fil_space_get_by_id(id); + + if (space != 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to add tablespace '%s' with id %lu " + "to the tablespace memory cache, but tablespace '%s' " + "with id %lu already exists in the cache!", + name, (ulong) id, space->name, (ulong) space->id); + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + space = static_cast<fil_space_t*>(mem_zalloc(sizeof(*space))); + + space->name = mem_strdup(name); + space->id = id; + + fil_system->tablespace_version++; + space->tablespace_version = fil_system->tablespace_version; + space->mark = FALSE; + + if (purpose == FIL_TABLESPACE && !recv_recovery_on + && id > fil_system->max_assigned_id) { + + if (!fil_system->space_id_reuse_warned) { + fil_system->space_id_reuse_warned = TRUE; + + ib_logf(IB_LOG_LEVEL_WARN, + "Allocated tablespace %lu, old maximum " + "was %lu", + (ulong) id, + (ulong) fil_system->max_assigned_id); + } + + fil_system->max_assigned_id = id; + } + + space->purpose = purpose; + space->flags = flags; + + space->magic_n = FIL_SPACE_MAGIC_N; + + rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP); + + HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space); + + HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, + ut_fold_string(name), space); + space->is_in_unflushed_spaces = false; + + space->is_corrupt = FALSE; + + UT_LIST_ADD_LAST(space_list, fil_system->space_list, space); + + mutex_exit(&fil_system->mutex); + + return(TRUE); +} + +/*******************************************************************//** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. 
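+
+A minimal usage sketch (illustrative; passing in 0 lets the current
+maximum decide the next id):
+
+	ulint	space_id = 0;
+	if (fil_assign_new_space_id(&space_id)) {
+		... space_id now holds a fresh, unused id ...
+	}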
+@return TRUE if assigned, FALSE if not */ +UNIV_INTERN +ibool +fil_assign_new_space_id( +/*====================*/ + ulint* space_id) /*!< in/out: space id */ +{ + ulint id; + ibool success; + + mutex_enter(&fil_system->mutex); + + id = *space_id; + + if (id < fil_system->max_assigned_id) { + id = fil_system->max_assigned_id; + } + + id++; + + if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: Warning: you are running out of new" + " single-table tablespace id's.\n" + "InnoDB: Current counter is %lu and it" + " must not exceed %lu!\n" + "InnoDB: To reset the counter to zero" + " you have to dump all your tables and\n" + "InnoDB: recreate the whole InnoDB installation.\n", + (ulong) id, + (ulong) SRV_LOG_SPACE_FIRST_ID); + } + + success = (id < SRV_LOG_SPACE_FIRST_ID); + + if (success) { + *space_id = fil_system->max_assigned_id = id; + } else { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: You have run out of single-table" + " tablespace id's!\n" + "InnoDB: Current counter is %lu.\n" + "InnoDB: To reset the counter to zero you" + " have to dump all your tables and\n" + "InnoDB: recreate the whole InnoDB installation.\n", + (ulong) id); + *space_id = ULINT_UNDEFINED; + } + + mutex_exit(&fil_system->mutex); + + return(success); +} + +/*******************************************************************//** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. +@return TRUE if success */ +static +ibool +fil_space_free( +/*===========*/ + /* out: TRUE if success */ + ulint id, /* in: space id */ + ibool x_latched) /* in: TRUE if caller has space->latch + in X mode */ +{ + fil_space_t* space; + fil_space_t* fnamespace; + + ut_ad(mutex_own(&fil_system->mutex)); + + space = fil_space_get_by_id(id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: trying to remove tablespace %lu" + " from the cache but\n" + "InnoDB: it is not there.\n", (ulong) id); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space); + + fnamespace = fil_space_get_by_name(space->name); + ut_a(fnamespace); + ut_a(space == fnamespace); + + HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash, + ut_fold_string(space->name), space); + + if (space->is_in_unflushed_spaces) { + + ut_ad(!fil_buffering_disabled(space)); + space->is_in_unflushed_spaces = false; + + UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces, + space); + } + + UT_LIST_REMOVE(space_list, fil_system->space_list, space); + + ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + ut_a(0 == space->n_pending_flushes); + + for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain); + fil_node != NULL; + fil_node = UT_LIST_GET_FIRST(space->chain)) { + + fil_node_free(fil_node, fil_system, space); + } + + ut_a(0 == UT_LIST_GET_LEN(space->chain)); + + if (x_latched) { + rw_lock_x_unlock(&space->latch); + } + + rw_lock_free(&(space->latch)); + + mem_free(space->name); + mem_free(space); + + return(TRUE); +} + +/*******************************************************************//** +Returns a pointer to the file_space_t that is in the memory cache +associated with a space id. The caller must lock fil_system->mutex. 
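+Note that if the space size is not yet known, the function temporarily
+releases fil_system->mutex while opening the file to determine the
+size, and re-acquires it before returning; the space may have been
+evicted in the meantime, in which case NULL is returned.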
+@return file_space_t pointer, NULL if space not found */ +UNIV_INLINE +fil_space_t* +fil_space_get_space( +/*================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + fil_node_t* node; + + ut_ad(fil_system); + + space = fil_space_get_by_id(id); + if (space == NULL) { + return(NULL); + } + + if (space->size == 0 && space->purpose == FIL_TABLESPACE) { + ut_a(id != 0); + + mutex_exit(&fil_system->mutex); + + /* It is possible that the space gets evicted at this point + before the fil_mutex_enter_and_prepare_for_io() acquires + the fil_system->mutex. Check for this after completing the + call to fil_mutex_enter_and_prepare_for_io(). */ + fil_mutex_enter_and_prepare_for_io(id); + + /* We are still holding the fil_system->mutex. Check if + the space is still in memory cache. */ + space = fil_space_get_by_id(id); + if (space == NULL) { + return(NULL); + } + + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. */ + ut_a(1 == UT_LIST_GET_LEN(space->chain)); + + node = UT_LIST_GET_FIRST(space->chain); + + /* It must be a single-table tablespace and we have not opened + the file yet; the following calls will open it and update the + size fields */ + + if (!fil_node_prepare_for_io(node, fil_system, space)) { + /* The single-table tablespace can't be opened, + because the ibd file is missing. */ + return(NULL); + } + fil_node_complete_io(node, fil_system, OS_FILE_READ); + } + + return(space); +} + +/*******************************************************************//** +Returns the path from the first fil_node_t found for the space ID sent. +The caller is responsible for freeing the memory allocated here for the +value returned. +@return own: A copy of fil_node_t::path, NULL if space ID is zero +or not found. */ +UNIV_INTERN +char* +fil_space_get_first_path( +/*=====================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + fil_node_t* node; + char* path; + + ut_ad(fil_system); + ut_a(id); + + fil_mutex_enter_and_prepare_for_io(id); + + space = fil_space_get_space(id); + + if (space == NULL) { + mutex_exit(&fil_system->mutex); + + return(NULL); + } + + ut_ad(mutex_own(&fil_system->mutex)); + + node = UT_LIST_GET_FIRST(space->chain); + + path = mem_strdup(node->name); + + mutex_exit(&fil_system->mutex); + + return(path); +} + +/*******************************************************************//** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. +@return space size, 0 if space not found */ +UNIV_INTERN +ulint +fil_space_get_size( +/*===============*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ulint size; + + ut_ad(fil_system); + mutex_enter(&fil_system->mutex); + + space = fil_space_get_space(id); + + size = space ? space->size : 0; + + mutex_exit(&fil_system->mutex); + + return(size); +} + +/*******************************************************************//** +Returns the flags of the space. The tablespace must be cached +in the memory cache. 
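+For space id 0 the flags are returned as 0 without a cache lookup.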
+@return flags, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_flags( +/*================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ulint flags; + + ut_ad(fil_system); + + if (!id) { + return(0); + } + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_space(id); + + if (space == NULL) { + mutex_exit(&fil_system->mutex); + + return(ULINT_UNDEFINED); + } + + flags = space->flags; + + mutex_exit(&fil_system->mutex); + + return(flags); +} + +/*******************************************************************//** +Returns the compressed page size of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return compressed page size, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_zip_size( +/*===================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_zip_size(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. +@return TRUE if the address is meaningful */ +UNIV_INTERN +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint page_no)/*!< in: page number */ +{ + if (fil_space_get_size(id) > page_no) { + + return(TRUE); + } + + return(FALSE); +} + +/****************************************************************//** +Initializes the tablespace memory cache. */ +UNIV_INTERN +void +fil_init( +/*=====*/ + ulint hash_size, /*!< in: hash table size */ + ulint max_n_open) /*!< in: max number of open files */ +{ + ut_a(fil_system == NULL); + + ut_a(hash_size > 0); + ut_a(max_n_open > 0); + + fil_system = static_cast<fil_system_t*>( + mem_zalloc(sizeof(fil_system_t))); + + mutex_create(fil_system_mutex_key, + &fil_system->mutex, SYNC_ANY_LATCH); + + fil_system->spaces = hash_create(hash_size); + fil_system->name_hash = hash_create(hash_size); + + UT_LIST_INIT(fil_system->LRU); + + fil_system->max_n_open = max_n_open; +} + +/*******************************************************************//** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. */ +UNIV_INTERN +void +fil_open_log_and_system_tablespace_files(void) +/*==========================================*/ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + for (space = UT_LIST_GET_FIRST(fil_system->space_list); + space != NULL; + space = UT_LIST_GET_NEXT(space_list, space)) { + + fil_node_t* node; + + if (fil_space_belongs_in_lru(space)) { + + continue; + } + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + if (!node->open) { + if (!fil_node_open_file(node, fil_system, + space)) { + /* This func is called during server's + startup. If some file of log or system + tablespace is missing, the server + can't start successfully. So we should + assert for it. 
*/ + ut_a(0); + } + } + + if (fil_system->max_n_open < 10 + fil_system->n_open) { + + fprintf(stderr, + "InnoDB: Warning: you must" + " raise the value of" + " innodb_open_files in\n" + "InnoDB: my.cnf! Remember that" + " InnoDB keeps all log files" + " and all system\n" + "InnoDB: tablespace files open" + " for the whole time mysqld is" + " running, and\n" + "InnoDB: needs to open also" + " some .ibd files if the" + " file-per-table storage\n" + "InnoDB: model is used." + " Current open files %lu," + " max allowed" + " open files %lu.\n", + (ulong) fil_system->n_open, + (ulong) fil_system->max_n_open); + } + } + } + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. */ +UNIV_INTERN +void +fil_close_all_files(void) +/*=====================*/ +{ + fil_space_t* space; + + if (srv_track_changed_pages && srv_redo_log_thread_started) + os_event_wait(srv_redo_log_tracked_event); + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space != NULL) { + fil_node_t* node; + fil_space_t* prev_space = space; + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + if (node->open) { + fil_node_close_file(node, fil_system); + } + } + + space = UT_LIST_GET_NEXT(space_list, space); + + fil_space_free(prev_space->id, FALSE); + } + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Closes the redo log files. There must not be any pending i/o's or not +flushed modifications in the files. */ +UNIV_INTERN +void +fil_close_log_files( +/*================*/ + bool free) /*!< in: whether to free the memory object */ +{ + fil_space_t* space; + + if (srv_track_changed_pages && srv_redo_log_thread_started) + os_event_wait(srv_redo_log_tracked_event); + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space != NULL) { + fil_node_t* node; + fil_space_t* prev_space = space; + + if (space->purpose != FIL_LOG) { + space = UT_LIST_GET_NEXT(space_list, space); + continue; + } + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + if (node->open) { + fil_node_close_file(node, fil_system); + } + } + + space = UT_LIST_GET_NEXT(space_list, space); + + if (free) { + fil_space_free(prev_space->id, FALSE); + } + } + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ +UNIV_INTERN +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id) /*!< in: maximum known id */ +{ + if (max_id >= SRV_LOG_SPACE_FIRST_ID) { + fprintf(stderr, + "InnoDB: Fatal error: max tablespace id" + " is too high, %lu\n", (ulong) max_id); + ut_error; + } + + mutex_enter(&fil_system->mutex); + + if (fil_system->max_assigned_id < max_id) { + + fil_system->max_assigned_id = max_id; + } + + mutex_exit(&fil_system->mutex); +} + +/****************************************************************//** +Writes the flushed lsn and the latest archived log number to the page header +of the first page of a data file of the system tablespace (space 0), +which is uncompressed. 
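+The update is a read-modify-write of the whole first page: the page is
+read with fil_read(), the lsn is stored at byte offset
+FIL_PAGE_FILE_FLUSH_LSN, and the page is written back with fil_write().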
*/ +static __attribute__((warn_unused_result)) +dberr_t +fil_write_lsn_and_arch_no_to_file( +/*==============================*/ + ulint space, /*!< in: space to write to */ + ulint sum_of_sizes, /*!< in: combined size of previous files + in space, in database pages */ + lsn_t lsn, /*!< in: lsn to write */ + ulint arch_log_no __attribute__((unused))) + /*!< in: archived log number to write */ +{ + byte* buf1; + byte* buf; + dberr_t err; + + buf1 = static_cast<byte*>(mem_alloc(2 * UNIV_PAGE_SIZE)); + buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE)); + + err = fil_read(TRUE, space, 0, sum_of_sizes, 0, + UNIV_PAGE_SIZE, buf, NULL); + if (err == DB_SUCCESS) { + mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); + + err = fil_write(TRUE, space, 0, sum_of_sizes, 0, + UNIV_PAGE_SIZE, buf, NULL); + } + + mem_free(buf1); + + return(err); +} + +/****************************************************************//** +Writes the flushed lsn and the latest archived log number to the page +header of the first page of each data file in the system tablespace. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fil_write_flushed_lsn_to_data_files( +/*================================*/ + lsn_t lsn, /*!< in: lsn to write */ + ulint arch_log_no) /*!< in: latest archived log file number */ +{ + fil_space_t* space; + fil_node_t* node; + dberr_t err; + + mutex_enter(&fil_system->mutex); + + for (space = UT_LIST_GET_FIRST(fil_system->space_list); + space != NULL; + space = UT_LIST_GET_NEXT(space_list, space)) { + + /* We only write the lsn to all existing data files which have + been open during the lifetime of the mysqld process; they are + represented by the space objects in the tablespace memory + cache. Note that all data files in the system tablespace 0 + and the UNDO log tablespaces (if separate) are always open. */ + + if (space->purpose == FIL_TABLESPACE + && !fil_is_user_tablespace_id(space->id)) { + ulint sum_of_sizes = 0; + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + mutex_exit(&fil_system->mutex); + + err = fil_write_lsn_and_arch_no_to_file( + space->id, sum_of_sizes, lsn, + arch_log_no); + + if (err != DB_SUCCESS) { + + return(err); + } + + mutex_enter(&fil_system->mutex); + + sum_of_sizes += node->size; + } + } + } + + mutex_exit(&fil_system->mutex); + + return(DB_SUCCESS); +} + +/*******************************************************************//** +Checks the consistency of the first data page of a tablespace +at database startup. 
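+The checks, in order: the page size stored in the FSP header flags
+must match UNIV_PAGE_SIZE; a first page consisting entirely of zero
+bytes is rejected; the page checksum must verify; and the page header
+must carry page number 0 and the matching space id.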
+@retval NULL on success, or if innodb_force_recovery is set +@return pointer to an error message string */ +static __attribute__((warn_unused_result)) +const char* +fil_check_first_page( +/*=================*/ + const page_t* page) /*!< in: data page */ +{ + ulint space_id; + ulint flags; + + if (srv_force_recovery >= SRV_FORCE_IGNORE_CORRUPT) { + return(NULL); + } + + space_id = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page); + flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + + if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) { + return("innodb-page-size mismatch"); + } + + if (!space_id && !flags) { + ulint nonzero_bytes = UNIV_PAGE_SIZE; + const byte* b = page; + + while (!*b && --nonzero_bytes) { + b++; + } + + if (!nonzero_bytes) { + return("space header page consists of zero bytes"); + } + } + + if (buf_page_is_corrupted( + false, page, fsp_flags_get_zip_size(flags))) { + return("checksum mismatch"); + } + + if (page_get_space_id(page) == space_id + && page_get_page_no(page) == 0) { + return(NULL); + } + + return("inconsistent data in space header"); +} + +/*******************************************************************//** +Reads the flushed lsn, arch no, space_id and tablespace flag fields from +the first page of a data file at database startup. +@retval NULL on success, or if innodb_force_recovery is set +@return pointer to an error message string */ +UNIV_INTERN +const char* +fil_read_first_page( +/*================*/ + os_file_t data_file, /*!< in: open data file */ + ibool one_read_already, /*!< in: TRUE if min and max + parameters below already + contain sensible data */ + ulint* flags, /*!< out: tablespace flags */ + ulint* space_id, /*!< out: tablespace ID */ + lsn_t* min_flushed_lsn, /*!< out: min of flushed + lsn values in data files */ + lsn_t* max_flushed_lsn) /*!< out: max of flushed + lsn values in data files */ +{ + byte* buf; + byte* page; + lsn_t flushed_lsn; + const char* check_msg = NULL; + + buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + + /* Align the memory for a possible read from a raw device */ + + page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + + os_file_read(data_file, page, 0, UNIV_PAGE_SIZE); + + /* The FSP_HEADER on page 0 is only valid for the first file + in a tablespace. So if this is not the first datafile, leave + *flags and *space_id as they were read from the first file and + do not validate the first page. */ + if (!one_read_already) { + *flags = fsp_header_get_flags(page); + *space_id = fsp_header_get_space_id(page); + + check_msg = fil_check_first_page(page); + } + + flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN); + + ut_free(buf); + + if (check_msg) { + return(check_msg); + } + + if (!one_read_already) { + *min_flushed_lsn = flushed_lsn; + *max_flushed_lsn = flushed_lsn; + + return(NULL); + } + + if (*min_flushed_lsn > flushed_lsn) { + *min_flushed_lsn = flushed_lsn; + } + if (*max_flushed_lsn < flushed_lsn) { + *max_flushed_lsn = flushed_lsn; + } + + return(NULL); +} + +/*================ SINGLE-TABLE TABLESPACES ==========================*/ + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Increments the count of pending operation, if space is not being deleted. 
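+
+A typical guarded operation, as a sketch (illustrative):
+
+	if (!fil_inc_pending_ops(space_id, TRUE)) {
+		... perform the ibuf merge or the read ...
+		fil_decr_pending_ops(space_id);
+	}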
+@return TRUE if being deleted, and operation should be skipped */ +UNIV_INTERN +ibool +fil_inc_pending_ops( +/*================*/ + ulint id, /*!< in: space id */ + ibool print_err) /*!< in: need to print error or not */ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + if (print_err) { + fprintf(stderr, + "InnoDB: Error: trying to do an operation on a" + " dropped tablespace %lu\n", + (ulong) id); + } + } + + if (space == NULL || space->stop_new_ops) { + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + space->n_pending_ops++; + + mutex_exit(&fil_system->mutex); + + return(FALSE); +} + +/*******************************************************************//** +Decrements the count of pending operations. */ +UNIV_INTERN +void +fil_decr_pending_ops( +/*=================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + fprintf(stderr, + "InnoDB: Error: decrementing pending operation" + " of a dropped tablespace %lu\n", + (ulong) id); + } + + if (space != NULL) { + space->n_pending_ops--; + } + + mutex_exit(&fil_system->mutex); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Creates the database directory for a table if it does not exist yet. */ +static +void +fil_create_directory_for_tablename( +/*===============================*/ + const char* name) /*!< in: name in the standard + 'databasename/tablename' format */ +{ + const char* namend; + char* path; + ulint len; + + len = strlen(fil_path_to_mysql_datadir); + namend = strchr(name, '/'); + ut_a(namend); + path = static_cast<char*>(mem_alloc(len + (namend - name) + 2)); + + memcpy(path, fil_path_to_mysql_datadir, len); + path[len] = '/'; + memcpy(path + len + 1, name, namend - name); + path[len + (namend - name) + 1] = 0; + + srv_normalize_path_for_win(path); + + ut_a(os_file_create_directory(path, FALSE)); + mem_free(path); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Writes a log record about an .ibd file create/rename/delete. 
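+
+The record body layout after the standard header is: an optional
+4-byte flags field (MLOG_FILE_CREATE2 only), a 2-byte name length, the
+null-terminated name, and, for MLOG_FILE_RENAME, a further 2-byte
+length followed by the null-terminated new name.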
*/ +static +void +fil_op_write_log( +/*=============*/ + ulint type, /*!< in: MLOG_FILE_CREATE, + MLOG_FILE_CREATE2, + MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id, /*!< in: space id */ + ulint log_flags, /*!< in: redo log flags (stored + in the page number field) */ + ulint flags, /*!< in: compressed page size + and file format + if type==MLOG_FILE_CREATE2, or 0 */ + const char* name, /*!< in: table name in the familiar + 'databasename/tablename' format, or + the file path in the case of + MLOG_FILE_DELETE */ + const char* new_name, /*!< in: if type is MLOG_FILE_RENAME, + the new table name in the + 'databasename/tablename' format */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + byte* log_ptr; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 2 + 1); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_for_file_op( + type, space_id, log_flags, log_ptr, mtr); + if (type == MLOG_FILE_CREATE2) { + mach_write_to_4(log_ptr, flags); + log_ptr += 4; + } + /* Let us store the strings as null-terminated for easier readability + and handling */ + + len = strlen(name) + 1; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, (byte*) name, len); + + if (type == MLOG_FILE_RENAME) { + len = strlen(new_name) + 1; + log_ptr = mlog_open(mtr, 2 + len); + ut_a(log_ptr); + mach_write_to_2(log_ptr, len); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, (byte*) new_name, len); + } +} +#endif + +/*******************************************************************//** +Parses the body of a log record written about an .ibd file operation. That is, +the log record part after the standard (type, space id, page no) header of the +log record. + +If desired, also replays the delete or rename operation if the .ibd file +exists and the space id in it matches. Replays the create operation if a file +at that path does not exist yet. If the database directory for the file to be +created does not exist, then we create the directory, too. + +Note that mysqlbackup --apply-log sets fil_path_to_mysql_datadir to point to +the datadir that we should use in replaying the file operations. + +InnoDB recovery does not replay these fully since it always sets the space id +to zero. But mysqlbackup does replay them. TODO: If remote tablespaces are +used, mysqlbackup will only create tables in the default directory since +MLOG_FILE_CREATE and MLOG_FILE_CREATE2 only know the tablename, not the path. 
+
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+	byte*	ptr,	/*!< in: buffer containing the log record body,
+			or an initial segment of it, if the record does
+			not fit completely between ptr and end_ptr */
+	byte*	end_ptr,	/*!< in: buffer end */
+	ulint	type,	/*!< in: the type of this log record */
+	ulint	space_id,	/*!< in: the space id of the tablespace in
+				question, or 0 if the log record should
+				only be parsed but not replayed */
+	ulint	log_flags)	/*!< in: redo log flags
+				(stored in the page number parameter) */
+{
+	ulint		name_len;
+	ulint		new_name_len;
+	const char*	name;
+	const char*	new_name	= NULL;
+	ulint		flags		= 0;
+
+	if (type == MLOG_FILE_CREATE2) {
+		if (end_ptr < ptr + 4) {
+
+			return(NULL);
+		}
+
+		flags = mach_read_from_4(ptr);
+		ptr += 4;
+	}
+
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	name_len = mach_read_from_2(ptr);
+
+	ptr += 2;
+
+	if (end_ptr < ptr + name_len) {
+
+		return(NULL);
+	}
+
+	name = (const char*) ptr;
+
+	ptr += name_len;
+
+	if (type == MLOG_FILE_RENAME) {
+		if (end_ptr < ptr + 2) {
+
+			return(NULL);
+		}
+
+		new_name_len = mach_read_from_2(ptr);
+
+		ptr += 2;
+
+		if (end_ptr < ptr + new_name_len) {
+
+			return(NULL);
+		}
+
+		new_name = (const char*) ptr;
+
+		ptr += new_name_len;
+	}
+
+	/* We managed to parse a full log record body */
+	/*
+	printf("Parsed log rec of type %lu space %lu\n"
+	"name %s\n", type, space_id, name);
+
+	if (type == MLOG_FILE_RENAME) {
+	printf("new name %s\n", new_name);
+	}
+	*/
+	if (!space_id) {
+		return(ptr);
+	} else {
+		/* Only replay file ops during recovery. This is a
+		release-build assert to minimize any data loss risk by a
+		misapplied file operation. */
+		ut_a(recv_recovery_is_on());
+	}
+
+	/* Let us try to perform the file operation, if sensible. Note that
+	mysqlbackup has at this stage already read in all space id info to the
+	fil0fil.cc data structures.
+
+	NOTE that our algorithm is not guaranteed to work correctly if there
+	were renames of tables during the backup. See mysqlbackup code for more
+	on the problem. */
+
+	switch (type) {
+	case MLOG_FILE_DELETE:
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			dberr_t	err = fil_delete_tablespace(
+				space_id, BUF_REMOVE_FLUSH_NO_WRITE);
+			ut_a(err == DB_SUCCESS);
+		}
+
+		break;
+
+	case MLOG_FILE_RENAME:
+		/* In order to replay the rename, the following must hold:
+		* The new name is not already used.
+		* A tablespace is open in memory with the old name.
+		* The space ID for that tablespace matches this log entry.
+		This will prevent unintended renames during recovery.
*/ + + if (fil_get_space_id_for_table(new_name) == ULINT_UNDEFINED + && space_id == fil_get_space_id_for_table(name)) { + /* Create the database directory for the new name, if + it does not exist yet */ + fil_create_directory_for_tablename(new_name); + + if (!fil_rename_tablespace(name, space_id, + new_name, NULL)) { + ut_error; + } + } + + break; + + case MLOG_FILE_CREATE: + case MLOG_FILE_CREATE2: + if (fil_tablespace_exists_in_mem(space_id)) { + /* Do nothing */ + } else if (fil_get_space_id_for_table(name) + != ULINT_UNDEFINED) { + /* Do nothing */ + } else if (log_flags & MLOG_FILE_FLAG_TEMP) { + /* Temporary table, do nothing */ + } else { + const char* path = NULL; + + /* Create the database directory for name, if it does + not exist yet */ + fil_create_directory_for_tablename(name); + + if (fil_create_new_single_table_tablespace( + space_id, name, path, flags, + DICT_TF2_USE_TABLESPACE, + FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) { + ut_error; + } + } + + break; + + default: + ut_error; + } + + return(ptr); +} + +/*******************************************************************//** +Allocates a file name for the EXPORT/IMPORT config file name. The +string must be freed by caller with mem_free(). +@return own: file name */ +static +char* +fil_make_cfg_name( +/*==============*/ + const char* filepath) /*!< in: .ibd file name */ +{ + char* cfg_name; + + /* Create a temporary file path by replacing the .ibd suffix + with .cfg. */ + + ut_ad(strlen(filepath) > 4); + + cfg_name = mem_strdup(filepath); + ut_snprintf(cfg_name + strlen(cfg_name) - 3, 4, "cfg"); + return(cfg_name); +} + +/*******************************************************************//** +Check for change buffer merges. +@return 0 if no merges else count + 1. */ +static +ulint +fil_ibuf_check_pending_ops( +/*=======================*/ + fil_space_t* space, /*!< in/out: Tablespace to check */ + ulint count) /*!< in: number of attempts so far */ +{ + ut_ad(mutex_own(&fil_system->mutex)); + + if (space != 0 && space->n_pending_ops != 0) { + + if (count > 5000) { + ib_logf(IB_LOG_LEVEL_WARN, + "Trying to close/delete tablespace " + "'%s' but there are %lu pending change " + "buffer merges on it.", + space->name, + (ulong) space->n_pending_ops); + } + + return(count + 1); + } + + return(0); +} + +/*******************************************************************//** +Check for pending IO. +@return 0 if no pending else count + 1. */ +static +ulint +fil_check_pending_io( +/*=================*/ + fil_space_t* space, /*!< in/out: Tablespace to check */ + fil_node_t** node, /*!< out: Node in space list */ + ulint count) /*!< in: number of attempts so far */ +{ + ut_ad(mutex_own(&fil_system->mutex)); + ut_a(space->n_pending_ops == 0); + + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. */ + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + + *node = UT_LIST_GET_FIRST(space->chain); + + if (space->n_pending_flushes > 0 || (*node)->n_pending > 0) { + + ut_a(!(*node)->being_extended); + + if (count > 1000) { + ib_logf(IB_LOG_LEVEL_WARN, + "Trying to close/delete tablespace '%s' " + "but there are %lu flushes " + " and %lu pending i/o's on it.", + space->name, + (ulong) space->n_pending_flushes, + (ulong) (*node)->n_pending); + } + + return(count + 1); + } + + return(0); +} + +/*******************************************************************//** +Check pending operations on a tablespace. +@return DB_SUCCESS or error failure. 
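+The function first marks the space with stop_new_ops, then waits in
+20 ms steps until pending change buffer merges have drained, and
+finally waits until pending flushes and i/o's on the single data file
+have drained.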
*/ +static +dberr_t +fil_check_pending_operations( +/*=========================*/ + ulint id, /*!< in: space id */ + fil_space_t** space, /*!< out: tablespace instance in memory */ + char** path) /*!< out/own: tablespace path */ +{ + ulint count = 0; + + ut_a(id != TRX_SYS_SPACE); + ut_ad(space); + + *space = 0; + + mutex_enter(&fil_system->mutex); + fil_space_t* sp = fil_space_get_by_id(id); + if (sp) { + sp->stop_new_ops = TRUE; + } + mutex_exit(&fil_system->mutex); + + /* Check for pending change buffer merges. */ + + do { + mutex_enter(&fil_system->mutex); + + sp = fil_space_get_by_id(id); + + count = fil_ibuf_check_pending_ops(sp, count); + + mutex_exit(&fil_system->mutex); + + if (count > 0) { + os_thread_sleep(20000); + } + + } while (count > 0); + + /* Check for pending IO. */ + + *path = 0; + + do { + mutex_enter(&fil_system->mutex); + + sp = fil_space_get_by_id(id); + + if (sp == NULL) { + mutex_exit(&fil_system->mutex); + return(DB_TABLESPACE_NOT_FOUND); + } + + fil_node_t* node; + + count = fil_check_pending_io(sp, &node, count); + + if (count == 0) { + *path = mem_strdup(node->name); + } + + mutex_exit(&fil_system->mutex); + + if (count > 0) { + os_thread_sleep(20000); + } + + } while (count > 0); + + ut_ad(sp); + + *space = sp; + return(DB_SUCCESS); +} + +/*******************************************************************//** +Closes a single-table tablespace. The tablespace must be cached in the +memory cache. Free all pages used by the tablespace. +@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_close_tablespace( +/*=================*/ + trx_t* trx, /*!< in/out: Transaction covering the close */ + ulint id) /*!< in: space id */ +{ + char* path = 0; + fil_space_t* space = 0; + + ut_a(id != TRX_SYS_SPACE); + + dberr_t err = fil_check_pending_operations(id, &space, &path); + + if (err != DB_SUCCESS) { + return(err); + } + + ut_a(space); + ut_a(path != 0); + + rw_lock_x_lock(&space->latch); + +#ifndef UNIV_HOTBACKUP + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since we have set space->stop_new_ops = TRUE, readahead + or ibuf merge can no longer read more pages of this tablespace to the + buffer pool. Thus we can clean the tablespace out of the buffer pool + completely and permanently. The flag stop_new_ops also prevents + fil_flush() from being applied to this tablespace. */ + + buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx); +#endif + mutex_enter(&fil_system->mutex); + + /* If the free is successful, the X lock will be released before + the space memory data structure is freed. */ + + if (!fil_space_free(id, TRUE)) { + rw_lock_x_unlock(&space->latch); + err = DB_TABLESPACE_NOT_FOUND; + } else { + err = DB_SUCCESS; + } + + mutex_exit(&fil_system->mutex); + + /* If it is a delete then also delete any generated files, otherwise + when we drop the database the remove directory will fail. */ + + char* cfg_name = fil_make_cfg_name(path); + + os_file_delete_if_exists(innodb_file_data_key, cfg_name); + + mem_free(path); + mem_free(cfg_name); + + return(err); +} + +/*******************************************************************//** +Deletes a single-table tablespace. The tablespace must be cached in the +memory cache. 
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_delete_tablespace(
+/*==================*/
+	ulint		id,		/*!< in: space id */
+	buf_remove_t	buf_remove)	/*!< in: specify the action to take
+					on the table's pages in the buffer
+					pool */
+{
+	char*		path = 0;
+	fil_space_t*	space = 0;
+
+	ut_a(id != TRX_SYS_SPACE);
+
+	dberr_t	err = fil_check_pending_operations(id, &space, &path);
+
+	if (err != DB_SUCCESS) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot delete tablespace %lu because it is not "
+			"found in the tablespace memory cache.",
+			(ulong) id);
+
+		return(err);
+	}
+
+	ut_a(space);
+	ut_a(path != 0);
+
+	/* Important: We rely on the data dictionary mutex to ensure
+	that a race is not possible here. It should serialize the tablespace
+	drop/free. We acquire an X latch only to avoid a race condition
+	when accessing the tablespace instance via:
+
+	fsp_get_available_space_in_free_extents().
+
+	There our main motivation is to reduce the contention on the
+	dictionary mutex. */
+
+	rw_lock_x_lock(&space->latch);
+
+#ifndef UNIV_HOTBACKUP
+	/* IMPORTANT: Because we have set space::stop_new_ops there
+	can't be any new ibuf merges, reads or flushes. We are here
+	because node::n_pending was zero above. However, it is still
+	possible to have pending read and write requests:
+
+	A read request can happen because the reader thread has
+	gone through the ::stop_new_ops check in buf_page_init_for_read()
+	before the flag was set and has not yet incremented ::n_pending
+	when we checked it above.
+
+	A write request can be issued any time because we don't check
+	the ::stop_new_ops flag when queueing a block for write.
+
+	We deal with pending write requests in the following function
+	where we'd minimally evict all dirty pages belonging to this
+	space from the flush_list. Note that if a block is IO-fixed
+	we'll wait for IO to complete.
+
+	We deal with potential read requests by checking the
+	::stop_new_ops flag in fil_io(). */
+
+	buf_LRU_flush_or_remove_pages(id, buf_remove, 0);
+
+#endif /* !UNIV_HOTBACKUP */
+
+	/* If it is a delete then also delete any generated files, otherwise
+	when we drop the database the remove directory will fail. */
+	{
+		char*	cfg_name = fil_make_cfg_name(path);
+		os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+		mem_free(cfg_name);
+	}
+
+	/* Delete the link file pointing to the ibd file we are deleting. */
+	if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
+		fil_delete_link_file(space->name);
+	}
+
+	mutex_enter(&fil_system->mutex);
+
+	/* Double check the sanity of pending ops after reacquiring
+	the fil_system::mutex. */
+	if (fil_space_get_by_id(id)) {
+		ut_a(space->n_pending_ops == 0);
+		ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+		fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
+		ut_a(node->n_pending == 0);
+	}
+
+	if (!fil_space_free(id, TRUE)) {
+		err = DB_TABLESPACE_NOT_FOUND;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	if (err != DB_SUCCESS) {
+		rw_lock_x_unlock(&space->latch);
+	} else if (!os_file_delete(innodb_file_data_key, path)
+		   && !os_file_delete_if_exists(innodb_file_data_key, path)) {
+
+		/* Note: This is because we have removed the
+		tablespace instance from the cache. */
+
+		err = DB_IO_ERROR;
+	}
+
+	if (err == DB_SUCCESS) {
+#ifndef UNIV_HOTBACKUP
+		/* Write a log record about the deletion of the .ibd
+		file, so that mysqlbackup can replay it in the
+		--apply-log phase. We use a dummy mtr and the familiar
+		log write mechanism.
*/ + mtr_t mtr; + + /* When replaying the operation in mysqlbackup, do not try + to write any log record */ + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr); + mtr_commit(&mtr); +#endif + err = DB_SUCCESS; + } + + mem_free(path); + + return(err); +} + +/*******************************************************************//** +Returns TRUE if a single-table tablespace is being deleted. +@return TRUE if being deleted */ +UNIV_INTERN +ibool +fil_tablespace_is_being_deleted( +/*============================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ibool is_being_deleted; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space != NULL); + + is_being_deleted = space->stop_new_ops; + + mutex_exit(&fil_system->mutex); + + return(is_being_deleted); +} + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but + + 1. We do not drop the table from the data dictionary; + + 2. We remove all insert buffer entries for the tablespace immediately; + in DROP TABLE they are only removed gradually in the background; + + 3. Free all the pages in use by the tablespace. +@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_discard_tablespace( +/*===================*/ + ulint id) /*!< in: space id */ +{ + dberr_t err; + + switch (err = fil_delete_tablespace(id, BUF_REMOVE_ALL_NO_WRITE)) { + case DB_SUCCESS: + break; + + case DB_IO_ERROR: + ib_logf(IB_LOG_LEVEL_WARN, + "While deleting tablespace %lu in DISCARD TABLESPACE." + " File rename/delete failed: %s", + (ulong) id, ut_strerr(err)); + break; + + case DB_TABLESPACE_NOT_FOUND: + ib_logf(IB_LOG_LEVEL_WARN, + "Cannot delete tablespace %lu in DISCARD " + "TABLESPACE. %s", + (ulong) id, ut_strerr(err)); + break; + + default: + ut_error; + } + + /* Remove all insert buffer entries for the tablespace */ + + ibuf_delete_for_discarded_space(id); + + return(err); +} +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************************//** +Renames the memory cache structures of a single-table tablespace. 
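+
+Conceptually this is just a keyed rename in the name hash: remove the
+entry under the old name, swap in the new strings, re-insert under the
+new name. The same idea with a std::map<std::string, fil_space_t*>, for
+illustration only (the real code below uses InnoDB's HASH_DELETE and
+HASH_INSERT macros and mem_strdup()):
+
+	names.erase(space->name);
+	space->name = new_name;
+	names[space->name] = space;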
+@return TRUE if success */ +static +ibool +fil_rename_tablespace_in_mem( +/*=========================*/ + fil_space_t* space, /*!< in: tablespace memory object */ + fil_node_t* node, /*!< in: file node of that tablespace */ + const char* new_name, /*!< in: new name */ + const char* new_path) /*!< in: new file path */ +{ + fil_space_t* space2; + const char* old_name = space->name; + + ut_ad(mutex_own(&fil_system->mutex)); + + space2 = fil_space_get_by_name(old_name); + if (space != space2) { + fputs("InnoDB: Error: cannot find ", stderr); + ut_print_filename(stderr, old_name); + fputs(" in tablespace memory cache\n", stderr); + + return(FALSE); + } + + space2 = fil_space_get_by_name(new_name); + if (space2 != NULL) { + fputs("InnoDB: Error: ", stderr); + ut_print_filename(stderr, new_name); + fputs(" is already in tablespace memory cache\n", stderr); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash, + ut_fold_string(space->name), space); + mem_free(space->name); + mem_free(node->name); + + space->name = mem_strdup(new_name); + node->name = mem_strdup(new_path); + + HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, + ut_fold_string(new_name), space); + return(TRUE); +} + +/*******************************************************************//** +Allocates a file name for a single-table tablespace. The string must be freed +by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_ibd_name( +/*==============*/ + const char* name, /*!< in: table name or a dir path */ + bool is_full_path) /*!< in: TRUE if it is a dir path */ +{ + char* filename; + ulint namelen = strlen(name); + ulint dirlen = strlen(fil_path_to_mysql_datadir); + ulint pathlen = dirlen + namelen + sizeof "/.ibd"; + + filename = static_cast<char*>(mem_alloc(pathlen)); + + if (is_full_path) { + memcpy(filename, name, namelen); + memcpy(filename + namelen, ".ibd", sizeof ".ibd"); + } else { + ut_snprintf(filename, pathlen, "%s/%s.ibd", + fil_path_to_mysql_datadir, name); + + } + + srv_normalize_path_for_win(filename); + + return(filename); +} + +/*******************************************************************//** +Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link). +The string must be freed by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_isl_name( +/*==============*/ + const char* name) /*!< in: table name */ +{ + char* filename; + ulint namelen = strlen(name); + ulint dirlen = strlen(fil_path_to_mysql_datadir); + ulint pathlen = dirlen + namelen + sizeof "/.isl"; + + filename = static_cast<char*>(mem_alloc(pathlen)); + + ut_snprintf(filename, pathlen, "%s/%s.isl", + fil_path_to_mysql_datadir, name); + + srv_normalize_path_for_win(filename); + + return(filename); +} + +/*******************************************************************//** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. +@return TRUE if success */ +UNIV_INTERN +ibool +fil_rename_tablespace( +/*==================*/ + const char* old_name_in, /*!< in: old table name in the + standard databasename/tablename + format of InnoDB, or NULL if we + do the rename based on the space + id only */ + ulint id, /*!< in: space id */ + const char* new_name, /*!< in: new table name in the + standard databasename/tablename + format of InnoDB */ + const char* new_path_in) /*!< in: new full datafile path + if the tablespace is remotely + located, or NULL if it is located + in the normal data directory. 
*/ +{ + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char* new_path; + char* old_name; + char* old_path; + const char* not_given = "(name not specified)"; + + ut_a(id != 0); + +retry: + count++; + + if (!(count % 1000)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: problems renaming ", stderr); + ut_print_filename(stderr, + old_name_in ? old_name_in : not_given); + fputs(" to ", stderr); + ut_print_filename(stderr, new_name); + fprintf(stderr, ", %lu iterations\n", (ulong) count); + } + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_1", space = NULL; ); + + if (space == NULL) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot find space id %lu in the tablespace " + "memory cache, though the table '%s' in a " + "rename operation should have that id.", + (ulong) id, old_name_in ? old_name_in : not_given); + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + if (count > 25000) { + space->stop_ios = FALSE; + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + /* We temporarily close the .ibd file because we do not trust that + operating systems can rename an open file. For the closing we have to + wait until there are no pending i/o's or flushes on the file. */ + + space->stop_ios = TRUE; + + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. */ + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (node->n_pending > 0 + || node->n_pending_flushes > 0 + || node->being_extended) { + /* There are pending i/o's or flushes or the file is + currently being extended, sleep for a while and + retry */ + + mutex_exit(&fil_system->mutex); + + os_thread_sleep(20000); + + goto retry; + + } else if (node->modification_counter > node->flush_counter) { + /* Flush the space */ + + mutex_exit(&fil_system->mutex); + + os_thread_sleep(20000); + + fil_flush(id); + + goto retry; + + } else if (node->open) { + /* Close the file */ + + fil_node_close_file(node, fil_system); + } + + /* Check that the old name in the space is right */ + + if (old_name_in) { + old_name = mem_strdup(old_name_in); + ut_a(strcmp(space->name, old_name) == 0); + } else { + old_name = mem_strdup(space->name); + } + old_path = mem_strdup(node->name); + + /* Rename the tablespace and the node in the memory cache */ + new_path = new_path_in ? mem_strdup(new_path_in) + : fil_make_ibd_name(new_name, false); + + success = fil_rename_tablespace_in_mem( + space, node, new_name, new_path); + + if (success) { + + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2", + goto skip_second_rename; ); + + success = os_file_rename( + innodb_file_data_key, old_path, new_path); + + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2", +skip_second_rename: + success = FALSE; ); + + if (!success) { + /* We have to revert the changes we made + to the tablespace memory cache */ + + ut_a(fil_rename_tablespace_in_mem( + space, node, old_name, old_path)); + } + } + + space->stop_ios = FALSE; + + mutex_exit(&fil_system->mutex); + +#ifndef UNIV_HOTBACKUP + if (success && !recv_recovery_on) { + mtr_t mtr; + + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_RENAME, id, 0, 0, old_name, new_name, + &mtr); + mtr_commit(&mtr); + } +#endif /* !UNIV_HOTBACKUP */ + + mem_free(new_path); + mem_free(old_path); + mem_free(old_name); + + return(success); +} + +/*******************************************************************//** +Creates a new InnoDB Symbolic Link (ISL) file. 
It is always created +under the 'datadir' of MySQL. The datadir is the directory of a +running mysqld program. We can refer to it by simply using the path '.'. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_link_file( +/*=================*/ + const char* tablename, /*!< in: tablename */ + const char* filepath) /*!< in: pathname of tablespace */ +{ + os_file_t file; + ibool success; + dberr_t err = DB_SUCCESS; + char* link_filepath; + char* prev_filepath = fil_read_link_file(tablename); + + ut_ad(!srv_read_only_mode); + + if (prev_filepath) { + /* Truncate will call this with an existing + link file which contains the same filepath. */ + if (0 == strcmp(prev_filepath, filepath)) { + mem_free(prev_filepath); + return(DB_SUCCESS); + } + mem_free(prev_filepath); + } + + link_filepath = fil_make_isl_name(tablename); + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, link_filepath, + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + + if (!success) { + /* The following call will print an error message */ + ulint error = os_file_get_last_error(true); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot create file ", stderr); + ut_print_filename(stderr, link_filepath); + fputs(".\n", stderr); + + if (error == OS_FILE_ALREADY_EXISTS) { + fputs("InnoDB: The link file: ", stderr); + ut_print_filename(stderr, filepath); + fputs(" already exists.\n", stderr); + err = DB_TABLESPACE_EXISTS; + + } else if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + + } else { + err = DB_ERROR; + } + + /* file is not open, no need to close it. */ + mem_free(link_filepath); + return(err); + } + + if (!os_file_write(link_filepath, file, filepath, 0, + strlen(filepath))) { + err = DB_ERROR; + } + + /* Close the file, we only need it at startup */ + os_file_close(file); + + mem_free(link_filepath); + + return(err); +} + +/*******************************************************************//** +Deletes an InnoDB Symbolic Link (ISL) file. */ +UNIV_INTERN +void +fil_delete_link_file( +/*=================*/ + const char* tablename) /*!< in: name of table */ +{ + char* link_filepath = fil_make_isl_name(tablename); + + os_file_delete_if_exists(innodb_file_data_key, link_filepath); + + mem_free(link_filepath); +} + +/*******************************************************************//** +Reads an InnoDB Symbolic Link (ISL) file. +It is always created under the 'datadir' of MySQL. The name is of the +form {databasename}/{tablename}. and the isl file is expected to be in a +'{databasename}' directory called '{tablename}.isl'. The caller must free +the memory of the null-terminated path returned if it is not null. +@return own: filepath found in link file, NULL if not found. */ +UNIV_INTERN +char* +fil_read_link_file( +/*===============*/ + const char* name) /*!< in: tablespace name */ +{ + char* filepath = NULL; + char* link_filepath; + FILE* file = NULL; + + /* The .isl file is in the 'normal' tablespace location. 
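+
+	For example (hypothetical paths), for table test/t1 the link file
+	would be <datadir>/test/t1.isl, and its entire content would be a
+	single datafile path such as:
+
+		/some/other/disk/test/t1.ibd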
*/ + link_filepath = fil_make_isl_name(name); + + file = fopen(link_filepath, "r+b"); + + mem_free(link_filepath); + + if (file) { + filepath = static_cast<char*>(mem_alloc(OS_FILE_MAX_PATH)); + + os_file_read_string(file, filepath, OS_FILE_MAX_PATH); + fclose(file); + + if (strlen(filepath)) { + /* Trim whitespace from end of filepath */ + ulint lastch = strlen(filepath) - 1; + while (lastch > 4 && filepath[lastch] <= 0x20) { + filepath[lastch--] = 0x00; + } + srv_normalize_path_for_win(filepath); + } + } + + return(filepath); +} + +/*******************************************************************//** +Opens a handle to the file linked to in an InnoDB Symbolic Link file. +@return TRUE if remote linked tablespace file is found and opened. */ +UNIV_INTERN +ibool +fil_open_linked_file( +/*===============*/ + const char* tablename, /*!< in: database/tablename */ + char** remote_filepath,/*!< out: remote filepath */ + os_file_t* remote_file) /*!< out: remote file handle */ + +{ + ibool success; + + *remote_filepath = fil_read_link_file(tablename); + if (*remote_filepath == NULL) { + return(FALSE); + } + + /* The filepath provided is different from what was + found in the link file. */ + *remote_file = os_file_create_simple_no_error_handling( + innodb_file_data_key, *remote_filepath, + OS_FILE_OPEN, OS_FILE_READ_ONLY, + &success); + + if (!success) { + char* link_filepath = fil_make_isl_name(tablename); + + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "A link file was found named '%s' " + "but the linked tablespace '%s' " + "could not be opened.", + link_filepath, *remote_filepath); + + mem_free(link_filepath); + mem_free(*remote_filepath); + *remote_filepath = NULL; + } + + return(success); +} + +/*******************************************************************//** +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp +dir of the mysqld server. 
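+
+A minimal usage sketch (hypothetical table name; the crash recovery path
+in fil_op_log_parse_or_replay() above issues a call of exactly this
+shape):
+
+	err = fil_create_new_single_table_tablespace(
+		space_id, "test/t1", NULL, flags,
+		DICT_TF2_USE_TABLESPACE, FIL_IBD_FILE_INITIAL_SIZE);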
+ +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_new_single_table_tablespace( +/*===================================*/ + ulint space_id, /*!< in: space id */ + const char* tablename, /*!< in: the table name in the usual + databasename/tablename format + of InnoDB */ + const char* dir_path, /*!< in: NULL or a dir path */ + ulint flags, /*!< in: tablespace flags */ + ulint flags2, /*!< in: table flags2 */ + ulint size) /*!< in: the initial size of the + tablespace file in pages, + must be >= FIL_IBD_FILE_INITIAL_SIZE */ +{ + os_file_t file; + ibool ret; + dberr_t err; + byte* buf2; + byte* page; + char* path; + ibool success; + /* TRUE if a table is created with CREATE TEMPORARY TABLE */ + bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); + bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + + ut_a(space_id > 0); + ut_ad(!srv_read_only_mode); + ut_a(space_id < SRV_LOG_SPACE_FIRST_ID); + ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); + ut_a(fsp_flags_is_valid(flags)); + + if (is_temp) { + /* Temporary table filepath */ + ut_ad(dir_path); + path = fil_make_ibd_name(dir_path, true); + } else if (has_data_dir) { + ut_ad(dir_path); + path = os_file_make_remote_pathname(dir_path, tablename, "ibd"); + + /* Since this tablespace file will be created in a + remote directory, let's create the subdirectories + in the path, if they are not there already. */ + success = os_file_create_subdirs_if_needed(path); + if (!success) { + err = DB_ERROR; + goto error_exit_3; + } + } else { + path = fil_make_ibd_name(tablename, false); + } + + file = os_file_create( + innodb_file_data_key, path, + OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, + OS_DATA_FILE, + &ret); + + if (ret == FALSE) { + /* The following call will print an error message */ + ulint error = os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create file '%s'\n", path); + + if (error == OS_FILE_ALREADY_EXISTS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "The file '%s' already exists though the " + "corresponding table did not exist " + "in the InnoDB data dictionary. " + "Have you moved InnoDB .ibd files " + "around without using the SQL commands " + "DISCARD TABLESPACE and IMPORT TABLESPACE, " + "or did mysqld crash in the middle of " + "CREATE TABLE? " + "You can resolve the problem by removing " + "the file '%s' under the 'datadir' of MySQL.", + path, path); + + err = DB_TABLESPACE_EXISTS; + goto error_exit_3; + } + + if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + goto error_exit_3; + } + + err = DB_ERROR; + goto error_exit_3; + } + + ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE); + + if (!ret) { + err = DB_OUT_OF_FILE_SPACE; + goto error_exit_2; + } + + /* printf("Creating tablespace %s id %lu\n", path, space_id); */ + + /* We have to write the space id to the file immediately and flush the + file to disk. This is because in crash recovery we must be aware what + tablespaces exist and what are their space id's, so that we can apply + the log records to the right file. It may take quite a while until + buffer pool flush algorithms write anything to the file and flush it to + disk. If we would not write here anything, the file would be filled + with zeros from the call of os_file_set_size(), until a buffer pool + flush would write to it. 
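+
+	A note on the buffer sizing below (an editor's reading of the code,
+	not an authoritative rationale): ut_align() may advance the pointer
+	by up to one page to reach page alignment, and the compressed path
+	stores its output at page + UNIV_PAGE_SIZE, so two whole pages must
+	survive the alignment:
+
+		(UNIV_PAGE_SIZE - 1)	alignment slack
+		+ UNIV_PAGE_SIZE	the uncompressed page image
+		+ UNIV_PAGE_SIZE	page_zip.data
+		< 3 * UNIV_PAGE_SIZE	hence ut_malloc(3 * UNIV_PAGE_SIZE)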
*/ + + buf2 = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE)); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + + memset(page, '\0', UNIV_PAGE_SIZE); + + /* Add the UNIV_PAGE_SIZE to the table flags and write them to the + tablespace header. */ + flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); + fsp_header_init_fields(page, space_id, flags); + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + + if (!(fsp_flags_is_compressed(flags))) { + buf_flush_init_for_writing(page, NULL, 0); + ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE); + } else { + page_zip_des_t page_zip; + ulint zip_size; + + zip_size = fsp_flags_get_zip_size(flags); + + page_zip_set_size(&page_zip, zip_size); + page_zip.data = page + UNIV_PAGE_SIZE; +#ifdef UNIV_DEBUG + page_zip.m_start = +#endif /* UNIV_DEBUG */ + page_zip.m_end = page_zip.m_nonempty = + page_zip.n_blobs = 0; + buf_flush_init_for_writing(page, &page_zip, 0); + ret = os_file_write(path, file, page_zip.data, 0, zip_size); + } + + ut_free(buf2); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not write the first page to tablespace " + "'%s'", path); + + err = DB_ERROR; + goto error_exit_2; + } + + ret = os_file_flush(file); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File flush of tablespace '%s' failed", path); + err = DB_ERROR; + goto error_exit_2; + } + + if (has_data_dir) { + /* Now that the IBD file is created, make the ISL file. */ + err = fil_create_link_file(tablename, path); + if (err != DB_SUCCESS) { + goto error_exit_2; + } + } + + success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE); + if (!success || !fil_node_create(path, size, space_id, FALSE)) { + err = DB_ERROR; + goto error_exit_1; + } + +#ifndef UNIV_HOTBACKUP + { + mtr_t mtr; + ulint mlog_file_flag = 0; + + if (is_temp) { + mlog_file_flag |= MLOG_FILE_FLAG_TEMP; + } + + mtr_start(&mtr); + + fil_op_write_log(flags + ? MLOG_FILE_CREATE2 + : MLOG_FILE_CREATE, + space_id, mlog_file_flag, flags, + tablename, NULL, &mtr); + + mtr_commit(&mtr); + } +#endif + err = DB_SUCCESS; + + /* Error code is set. Cleanup the various variables used. + These labels reflect the order in which variables are assigned or + actions are done. */ +error_exit_1: + if (has_data_dir && err != DB_SUCCESS) { + fil_delete_link_file(tablename); + } +error_exit_2: + os_file_close(file); + if (err != DB_SUCCESS) { + os_file_delete(innodb_file_data_key, path); + } +error_exit_3: + mem_free(path); + + return(err); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Report information about a bad tablespace. */ +static +void +fil_report_bad_tablespace( +/*======================*/ + const char* filepath, /*!< in: filepath */ + const char* check_msg, /*!< in: fil_check_first_page() */ + ulint found_id, /*!< in: found space ID */ + ulint found_flags, /*!< in: found flags */ + ulint expected_id, /*!< in: expected space id */ + ulint expected_flags) /*!< in: expected flags */ +{ + if (check_msg) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Error %s in file '%s'," + "tablespace id=%lu, flags=%lu. " + "Please refer to " + REFMAN "innodb-troubleshooting-datadict.html " + "for how to resolve the issue.", + check_msg, filepath, + (ulong) expected_id, (ulong) expected_flags); + return; + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "In file '%s', tablespace id and flags are %lu and %lu, " + "but in the InnoDB data dictionary they are %lu and %lu. 
" + "Have you moved InnoDB .ibd files around without using the " + "commands DISCARD TABLESPACE and IMPORT TABLESPACE? " + "Please refer to " + REFMAN "innodb-troubleshooting-datadict.html " + "for how to resolve the issue.", + filepath, (ulong) found_id, (ulong) found_flags, + (ulong) expected_id, (ulong) expected_flags); +} + +/********************************************************************//** +Tries to open a single-table tablespace and optionally checks that the +space id in it is correct. If this does not succeed, print an error message +to the .err log. This function is used to open a tablespace when we start +mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE. + +NOTE that we assume this operation is used either at the database startup +or under the protection of the dictionary mutex, so that two users cannot +race here. This operation does not leave the file associated with the +tablespace open, but closes it after we have looked at the space id in it. + +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially FALSE, but if +a remote tablespace is found it will be changed to true. + +If the fix_dict boolean is set, then it is safe to use an internal SQL +statement to update the dictionary tables if they are incorrect. + +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_open_single_table_tablespace( +/*=============================*/ + bool validate, /*!< in: Do we validate tablespace? */ + bool fix_dict, /*!< in: Can we fix the dictionary? */ + ulint id, /*!< in: space id */ + ulint flags, /*!< in: tablespace flags */ + const char* tablename, /*!< in: table name in the + databasename/tablename format */ + const char* path_in) /*!< in: tablespace filepath */ +{ + dberr_t err = DB_SUCCESS; + bool dict_filepath_same_as_default = false; + bool link_file_found = false; + bool link_file_is_bad = false; + fsp_open_info def; + fsp_open_info dict; + fsp_open_info remote; + ulint tablespaces_found = 0; + ulint valid_tablespaces_found = 0; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex))); + + if (!fsp_flags_is_valid(flags)) { + return(DB_CORRUPTION); + } + + /* If the tablespace was relocated, we do not + compare the DATA_DIR flag */ + ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; + + memset(&def, 0, sizeof(def)); + memset(&dict, 0, sizeof(dict)); + memset(&remote, 0, sizeof(remote)); + + /* Discover the correct filepath. We will always look for an ibd + in the default location. If it is remote, it should not be here. */ + def.filepath = fil_make_ibd_name(tablename, false); + + /* The path_in was read from SYS_DATAFILES. */ + if (path_in) { + if (strcmp(def.filepath, path_in)) { + dict.filepath = mem_strdup(path_in); + /* possibility of multiple files. */ + validate = true; + } else { + dict_filepath_same_as_default = true; + } + } + + link_file_found = fil_open_linked_file( + tablename, &remote.filepath, &remote.file); + remote.success = link_file_found; + if (remote.success) { + /* possibility of multiple files. */ + validate = true; + tablespaces_found++; + + /* A link file was found. 
MySQL does not allow a DATA + DIRECTORY to be be the same as the default filepath. */ + ut_a(strcmp(def.filepath, remote.filepath)); + + /* If there was a filepath found in SYS_DATAFILES, + we hope it was the same as this remote.filepath found + in the ISL file. */ + if (dict.filepath + && (0 == strcmp(dict.filepath, remote.filepath))) { + remote.success = FALSE; + os_file_close(remote.file); + mem_free(remote.filepath); + remote.filepath = NULL; + tablespaces_found--; + } + } + + /* Attempt to open the tablespace at other possible filepaths. */ + if (dict.filepath) { + dict.file = os_file_create_simple_no_error_handling( + innodb_file_data_key, dict.filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &dict.success); + if (dict.success) { + /* possibility of multiple files. */ + validate = true; + tablespaces_found++; + } + } + + /* Always look for a file at the default location. */ + ut_a(def.filepath); + def.file = os_file_create_simple_no_error_handling( + innodb_file_data_key, def.filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &def.success); + if (def.success) { + tablespaces_found++; + } + + /* We have now checked all possible tablespace locations and + have a count of how many we found. If things are normal, we + only found 1. */ + if (!validate && tablespaces_found == 1) { + goto skip_validate; + } + + /* Read the first page of the datadir tablespace, if found. */ + if (def.success) { + def.check_msg = fil_read_first_page( + def.file, FALSE, &def.flags, &def.id, + &def.lsn, &def.lsn); + def.valid = !def.check_msg; + + /* Validate this single-table-tablespace with SYS_TABLES, + but do not compare the DATA_DIR flag, in case the + tablespace was relocated. */ + if (def.valid && def.id == id + && (def.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) { + valid_tablespaces_found++; + } else { + def.valid = false; + /* Do not use this tablespace. */ + fil_report_bad_tablespace( + def.filepath, def.check_msg, def.id, + def.flags, id, flags); + } + } + + /* Read the first page of the remote tablespace */ + if (remote.success) { + remote.check_msg = fil_read_first_page( + remote.file, FALSE, &remote.flags, &remote.id, + &remote.lsn, &remote.lsn); + remote.valid = !remote.check_msg; + + /* Validate this single-table-tablespace with SYS_TABLES, + but do not compare the DATA_DIR flag, in case the + tablespace was relocated. */ + if (remote.valid && remote.id == id + && (remote.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) { + valid_tablespaces_found++; + } else { + remote.valid = false; + /* Do not use this linked tablespace. */ + fil_report_bad_tablespace( + remote.filepath, remote.check_msg, remote.id, + remote.flags, id, flags); + link_file_is_bad = true; + } + } + + /* Read the first page of the datadir tablespace, if found. */ + if (dict.success) { + dict.check_msg = fil_read_first_page( + dict.file, FALSE, &dict.flags, &dict.id, + &dict.lsn, &dict.lsn); + dict.valid = !dict.check_msg; + + /* Validate this single-table-tablespace with SYS_TABLES, + but do not compare the DATA_DIR flag, in case the + tablespace was relocated. */ + if (dict.valid && dict.id == id + && (dict.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) { + valid_tablespaces_found++; + } else { + dict.valid = false; + /* Do not use this tablespace. */ + fil_report_bad_tablespace( + dict.filepath, dict.check_msg, dict.id, + dict.flags, id, flags); + } + } + + /* Make sense of these three possible locations. + First, bail out if no tablespace files were found. 
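+
+The decision logic that follows, summarized (a paraphrase of the code
+below, not normative):
+
+	valid_tablespaces_found == 0	-> fail with DB_CORRUPTION
+	tablespaces_found > 1 and
+	  (valid > 1 or force recovery)	-> refuse to open (DB_ERROR, or
+					   DB_CORRUPTION if a file failed
+					   validation)
+	tablespaces_found > 1, valid==1	-> close and forget the invalid
+					   candidates, keep the valid one
+	exactly one file found		-> use it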
*/ + if (valid_tablespaces_found == 0) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not find a valid tablespace file for '%s'. " + "See " REFMAN "innodb-troubleshooting-datadict.html " + "for how to resolve the issue.", + tablename); + + err = DB_CORRUPTION; + + goto cleanup_and_exit; + } + + /* Do not open any tablespaces if more than one tablespace with + the correct space ID and flags were found. */ + if (tablespaces_found > 1) { + ib_logf(IB_LOG_LEVEL_ERROR, + "A tablespace for %s has been found in " + "multiple places;", tablename); + if (def.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Default location; %s, LSN=" LSN_PF + ", Space ID=%lu, Flags=%lu", + def.filepath, def.lsn, + (ulong) def.id, (ulong) def.flags); + } + if (remote.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Remote location; %s, LSN=" LSN_PF + ", Space ID=%lu, Flags=%lu", + remote.filepath, remote.lsn, + (ulong) remote.id, (ulong) remote.flags); + } + if (dict.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Dictionary location; %s, LSN=" LSN_PF + ", Space ID=%lu, Flags=%lu", + dict.filepath, dict.lsn, + (ulong) dict.id, (ulong) dict.flags); + } + + /* Force-recovery will allow some tablespaces to be + skipped by REDO if there was more than one file found. + Unlike during the REDO phase of recovery, we now know + if the tablespace is valid according to the dictionary, + which was not available then. So if we did not force + recovery and there is only one good tablespace, ignore + any bad tablespaces. */ + if (valid_tablespaces_found > 1 || srv_force_recovery > 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Will not open the tablespace for '%s'", + tablename); + + if (def.success != def.valid + || dict.success != dict.valid + || remote.success != remote.valid) { + err = DB_CORRUPTION; + } else { + err = DB_ERROR; + } + goto cleanup_and_exit; + } + + /* There is only one valid tablespace found and we did + not use srv_force_recovery during REDO. Use this one + tablespace and clean up invalid tablespace pointers */ + if (def.success && !def.valid) { + def.success = false; + os_file_close(def.file); + tablespaces_found--; + } + if (dict.success && !dict.valid) { + dict.success = false; + os_file_close(dict.file); + /* Leave dict.filepath so that SYS_DATAFILES + can be corrected below. */ + tablespaces_found--; + } + if (remote.success && !remote.valid) { + remote.success = false; + os_file_close(remote.file); + mem_free(remote.filepath); + remote.filepath = NULL; + tablespaces_found--; + } + } + + /* At this point, there should be only one filepath. */ + ut_a(tablespaces_found == 1); + ut_a(valid_tablespaces_found == 1); + + /* Only fix the dictionary at startup when there is only one thread. + Calls to dict_load_table() can be done while holding other latches. */ + if (!fix_dict) { + goto skip_validate; + } + + /* We may need to change what is stored in SYS_DATAFILES or + SYS_TABLESPACES or adjust the link file. + Since a failure to update SYS_TABLESPACES or SYS_DATAFILES does + not prevent opening and using the single_table_tablespace either + this time or the next, we do not check the return code or fail + to open the tablespace. But dict_update_filepath() will issue a + warning to the log. 
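+
+In outline, the fix-ups below are (again a paraphrase, not normative):
+
+	SYS_DATAFILES path known, remote file won	-> repoint the
+		dictionary at remote.filepath
+	SYS_DATAFILES path known, default file won	-> repoint the
+		dictionary at def.filepath, drop a bad link file
+	dictionary file won, link file bad or absent	-> recreate the
+		link file from dict.filepath
+	remote file won, no usable SYS_DATAFILES row	-> update or
+		insert the dictionary record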
*/ + if (dict.filepath) { + if (remote.success) { + dict_update_filepath(id, remote.filepath); + } else if (def.success) { + dict_update_filepath(id, def.filepath); + if (link_file_is_bad) { + fil_delete_link_file(tablename); + } + } else if (!link_file_found || link_file_is_bad) { + ut_ad(dict.success); + /* Fix the link file if we got our filepath + from the dictionary but a link file did not + exist or it did not point to a valid file. */ + fil_delete_link_file(tablename); + fil_create_link_file(tablename, dict.filepath); + } + + } else if (remote.success && dict_filepath_same_as_default) { + dict_update_filepath(id, remote.filepath); + + } else if (remote.success && path_in == NULL) { + /* SYS_DATAFILES record for this space ID was not found. */ + dict_insert_tablespace_and_filepath( + id, tablename, remote.filepath, flags); + } + +skip_validate: + if (err != DB_SUCCESS) { + ; // Don't load the tablespace into the cache + } else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE)) { + err = DB_ERROR; + } else { + /* We do not measure the size of the file, that is why + we pass the 0 below */ + + if (!fil_node_create(remote.success ? remote.filepath : + dict.success ? dict.filepath : + def.filepath, 0, id, FALSE)) { + err = DB_ERROR; + } + } + +cleanup_and_exit: + if (remote.success) { + os_file_close(remote.file); + } + if (remote.filepath) { + mem_free(remote.filepath); + } + if (dict.success) { + os_file_close(dict.file); + } + if (dict.filepath) { + mem_free(dict.filepath); + } + if (def.success) { + os_file_close(def.file); + } + mem_free(def.filepath); + + return(err); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_HOTBACKUP +/*******************************************************************//** +Allocates a file name for an old version of a single-table tablespace. +The string must be freed by caller with mem_free()! +@return own: file name */ +static +char* +fil_make_ibbackup_old_name( +/*=======================*/ + const char* name) /*!< in: original file name */ +{ + static const char suffix[] = "_ibbackup_old_vers_"; + char* path; + ulint len = strlen(name); + + path = static_cast<char*>(mem_alloc(len + (15 + sizeof suffix))); + + memcpy(path, name, len); + memcpy(path + len, suffix, (sizeof suffix) - 1); + ut_sprintf_timestamp_without_extra_chars( + path + len + ((sizeof suffix) - 1)); + return(path); +} +#endif /* UNIV_HOTBACKUP */ + + +/*******************************************************************//** +Determine the space id of the given file descriptor by reading a few +pages from the beginning of the .ibd file. +@return true if space id was successfully identified, or false. */ +static +bool +fil_user_tablespace_find_space_id( +/*==============================*/ + fsp_open_info* fsp) /* in/out: contains file descriptor, which is + used as input. contains space_id, which is + the output */ +{ + bool st; + os_offset_t file_size; + + file_size = os_file_get_size(fsp->file); + + if (file_size == (os_offset_t) -1) { + ib_logf(IB_LOG_LEVEL_ERROR, "Could not get file size: %s", + fsp->filepath); + return(false); + } + + /* Assuming a page size, read the space_id from each page and store it + in a map. Find out which space_id is agreed on by majority of the + pages. Choose that space_id. 
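+
+	A worked example with made-up numbers: suppose that at one of the
+	candidate page sizes 64 pages are read, valid_pages ends up as 62,
+	and 60 of those agree on space_id 123. The search below tolerates
+	up to pages_corrupted = 3 dissenting pages, so id 123 is accepted
+	on the pass where missed = 2, because 60 == 62 - 2.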
*/ + for (ulint page_size = UNIV_ZIP_SIZE_MIN; + page_size <= UNIV_PAGE_SIZE_MAX; page_size <<= 1) { + + /* map[space_id] = count of pages */ + std::map<ulint, ulint> verify; + + ulint page_count = 64; + ulint valid_pages = 0; + + /* Adjust the number of pages to analyze based on file size */ + while ((page_count * page_size) > file_size) { + --page_count; + } + + ib_logf(IB_LOG_LEVEL_INFO, "Page size:%lu Pages to analyze:" + "%lu", page_size, page_count); + + byte* buf = static_cast<byte*>(ut_malloc(2*page_size)); + byte* page = static_cast<byte*>(ut_align(buf, page_size)); + + for (ulint j = 0; j < page_count; ++j) { + + st = os_file_read(fsp->file, page, (j* page_size), page_size); + + if (!st) { + ib_logf(IB_LOG_LEVEL_INFO, + "READ FAIL: page_no:%lu", j); + continue; + } + + bool uncompressed_ok = false; + + /* For uncompressed pages, the page size must be equal + to UNIV_PAGE_SIZE. */ + if (page_size == UNIV_PAGE_SIZE) { + uncompressed_ok = !buf_page_is_corrupted( + false, page, 0); + } + + bool compressed_ok = !buf_page_is_corrupted( + false, page, page_size); + + if (uncompressed_ok || compressed_ok) { + + ulint space_id = mach_read_from_4(page + + FIL_PAGE_SPACE_ID); + + if (space_id > 0) { + ib_logf(IB_LOG_LEVEL_INFO, + "VALID: space:%lu " + "page_no:%lu page_size:%lu", + space_id, j, page_size); + verify[space_id]++; + ++valid_pages; + } + } + } + + ut_free(buf); + + ib_logf(IB_LOG_LEVEL_INFO, "Page size: %lu, Possible space_id " + "count:%lu", page_size, (ulint) verify.size()); + + const ulint pages_corrupted = 3; + for (ulint missed = 0; missed <= pages_corrupted; ++missed) { + + for (std::map<ulint, ulint>::iterator + m = verify.begin(); m != verify.end(); ++m ) { + + ib_logf(IB_LOG_LEVEL_INFO, "space_id:%lu, " + "Number of pages matched: %lu/%lu " + "(%lu)", m->first, m->second, + valid_pages, page_size); + + if (m->second == (valid_pages - missed)) { + + ib_logf(IB_LOG_LEVEL_INFO, + "Chosen space:%lu\n", m->first); + + fsp->id = m->first; + return(true); + } + } + + } + } + + return(false); +} + +/*******************************************************************//** +Finds the given page_no of the given space id from the double write buffer, +and copies it to the corresponding .ibd file. +@return true if copy was successful, or false. */ +bool +fil_user_tablespace_restore_page( +/*==============================*/ + fsp_open_info* fsp, /* in: contains space id and .ibd + file information */ + ulint page_no) /* in: page_no to obtain from double + write buffer */ +{ + bool err; + ulint flags; + ulint zip_size; + ulint page_size; + ulint buflen; + byte* page; + + ib_logf(IB_LOG_LEVEL_INFO, "Restoring page %lu of tablespace %lu", + page_no, fsp->id); + + // find if double write buffer has page_no of given space id + page = recv_sys->dblwr.find_page(fsp->id, page_no); + + if (!page) { + ib_logf(IB_LOG_LEVEL_WARN, "Doublewrite does not have " + "page_no=%lu of space: %lu", page_no, fsp->id); + err = false; + goto out; + } + + flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + zip_size = fsp_flags_get_zip_size(flags); + page_size = fsp_flags_get_page_size(flags); + + ut_ad(page_no == page_get_page_no(page)); + + buflen = zip_size ? zip_size: page_size; + + ib_logf(IB_LOG_LEVEL_INFO, "Writing %lu bytes into file: %s", + buflen, fsp->filepath); + + err = os_file_write(fsp->filepath, fsp->file, page, + (zip_size ? 
zip_size : page_size) * page_no, + buflen); + + os_file_flush(fsp->file); +out: + return(err); +} + +/********************************************************************//** +Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.cc data structures. +Set fsp->success to TRUE if tablespace is valid, FALSE if not. */ +static +void +fil_validate_single_table_tablespace( +/*=================================*/ + const char* tablename, /*!< in: database/tablename */ + fsp_open_info* fsp) /*!< in/out: tablespace info */ +{ + bool restore_attempted = false; + +check_first_page: + fsp->success = TRUE; + if (const char* check_msg = fil_read_first_page( + fsp->file, FALSE, &fsp->flags, &fsp->id, + &fsp->lsn, &fsp->lsn)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "%s in tablespace %s (table %s)", + check_msg, fsp->filepath, tablename); + fsp->success = FALSE; + } + + if (!fsp->success) { + if (!restore_attempted) { + if (!fil_user_tablespace_find_space_id(fsp)) { + return; + } + restore_attempted = true; + + if (fsp->id > 0 + && !fil_user_tablespace_restore_page(fsp, 0)) { + return; + } + goto check_first_page; + } + return; + } + + if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespace is not sensible;" + " Table: %s Space ID: %lu Filepath: %s\n", + tablename, (ulong) fsp->id, fsp->filepath); + fsp->success = FALSE; + return; + } + + mutex_enter(&fil_system->mutex); + fil_space_t* space = fil_space_get_by_id(fsp->id); + mutex_exit(&fil_system->mutex); + if (space != NULL) { + char* prev_filepath = fil_space_get_first_path(fsp->id); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Attempted to open a previously opened tablespace. " + "Previous tablespace %s uses space ID: %lu at " + "filepath: %s. Cannot open tablespace %s which uses " + "space ID: %lu at filepath: %s", + space->name, (ulong) space->id, prev_filepath, + tablename, (ulong) fsp->id, fsp->filepath); + + mem_free(prev_filepath); + fsp->success = FALSE; + return; + } + + fsp->success = TRUE; +} + + +/********************************************************************//** +Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.cc data structures. */ +static +void +fil_load_single_table_tablespace( +/*=============================*/ + const char* dbname, /*!< in: database name */ + const char* filename) /*!< in: file name (not a path), + including the .ibd or .isl extension */ +{ + char* tablename; + ulint tablename_len; + ulint dbname_len = strlen(dbname); + ulint filename_len = strlen(filename); + fsp_open_info def; + fsp_open_info remote; + os_offset_t size; +#ifdef UNIV_HOTBACKUP + fil_space_t* space; +#endif + + memset(&def, 0, sizeof(def)); + memset(&remote, 0, sizeof(remote)); + + /* The caller assured that the extension is ".ibd" or ".isl". */ + ut_ad(0 == memcmp(filename + filename_len - 4, ".ibd", 4) + || 0 == memcmp(filename + filename_len - 4, ".isl", 4)); + + /* Build up the tablename in the standard form database/table. 
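+
+	For example (hypothetical names): dbname "mydb" and filename
+	"t1.ibd" yield the tablename "mydb/t1"; the code below joins the
+	two parts with '/' and then strips the 4-character extension.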
*/
+	tablename = static_cast<char*>(
+		mem_alloc(dbname_len + filename_len + 2));
+
+	/* When lower_case_table_names = 2, it is possible that the
+	dbname is in upper case, but when storing it in fil_space_t
+	we must convert it to lower case */
+	sprintf(tablename, "%s", dbname);
+	tablename[dbname_len] = '\0';
+
+	if (lower_case_file_system) {
+		dict_casedn_str(tablename);
+	}
+
+	sprintf(tablename + dbname_len, "/%s", filename);
+	tablename_len = strlen(tablename) - strlen(".ibd");
+	tablename[tablename_len] = '\0';
+
+	/* There may be both .ibd and .isl file in the directory.
+	And it is possible that the .isl file refers to a different
+	.ibd file. If so, we open and compare them the first time
+	one of them is sent to this function. So if this table has
+	already been loaded, there is nothing to do. */
+	mutex_enter(&fil_system->mutex);
+	if (fil_space_get_by_name(tablename)) {
+		mem_free(tablename);
+		mutex_exit(&fil_system->mutex);
+		return;
+	}
+	mutex_exit(&fil_system->mutex);
+
+	/* Build up the filepath of the .ibd tablespace in the datadir.
+	This must be freed independent of def.success. */
+	def.filepath = fil_make_ibd_name(tablename, false);
+
+#ifdef __WIN__
+# ifndef UNIV_HOTBACKUP
+	/* If lower_case_table_names is 0 or 2, then MySQL allows database
+	directory names with upper case letters. On Windows, all table and
+	database names in InnoDB are internally always in lower case. Put the
+	file path to lower case, so that we are consistent with InnoDB's
+	internal data dictionary. */
+
+	dict_casedn_str(def.filepath);
+# endif /* !UNIV_HOTBACKUP */
+#endif
+
+	/* Check for a link file which locates a remote tablespace. */
+	remote.success = fil_open_linked_file(
+		tablename, &remote.filepath, &remote.file);
+
+	/* Read the first page of the remote tablespace */
+	if (remote.success) {
+		fil_validate_single_table_tablespace(tablename, &remote);
+		if (!remote.success) {
+			os_file_close(remote.file);
+			mem_free(remote.filepath);
+		}
+	}
+
+	/* Try to open the tablespace in the datadir. */
+	def.file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key, def.filepath, OS_FILE_OPEN,
+		OS_FILE_READ_WRITE, &def.success);
+
+	/* Read the first page of the datadir tablespace, if it was opened. */
+	if (def.success) {
+		fil_validate_single_table_tablespace(tablename, &def);
+		if (!def.success) {
+			os_file_close(def.file);
+		}
+	}
+
+	if (!def.success && !remote.success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+		fprintf(stderr,
+			"InnoDB: Error: could not open single-table"
+			" tablespace file %s\n", def.filepath);
+
+		if (!strncmp(filename,
+			     tmp_file_prefix, tmp_file_prefix_length)) {
+			/* Ignore errors for #sql tablespaces.
*/ + mem_free(tablename); + if (remote.filepath) { + mem_free(remote.filepath); + } + if (def.filepath) { + mem_free(def.filepath); + } + return; + } +no_good_file: + fprintf(stderr, + "InnoDB: We do not continue the crash recovery," + " because the table may become\n" + "InnoDB: corrupt if we cannot apply the log" + " records in the InnoDB log to it.\n" + "InnoDB: To fix the problem and start mysqld:\n" + "InnoDB: 1) If there is a permission problem" + " in the file and mysqld cannot\n" + "InnoDB: open the file, you should" + " modify the permissions.\n" + "InnoDB: 2) If the table is not needed, or you" + " can restore it from a backup,\n" + "InnoDB: then you can remove the .ibd file," + " and InnoDB will do a normal\n" + "InnoDB: crash recovery and ignore that table.\n" + "InnoDB: 3) If the file system or the" + " disk is broken, and you cannot remove\n" + "InnoDB: the .ibd file, you can set" + " innodb_force_recovery > 0 in my.cnf\n" + "InnoDB: and force InnoDB to continue crash" + " recovery here.\n"); +will_not_choose: + mem_free(tablename); + if (remote.filepath) { + mem_free(remote.filepath); + } + if (def.filepath) { + mem_free(def.filepath); + } + + if (srv_force_recovery > 0) { + ib_logf(IB_LOG_LEVEL_INFO, + "innodb_force_recovery was set to %lu. " + "Continuing crash recovery even though we " + "cannot access the .ibd file of this table.", + srv_force_recovery); + return; + } + + exit(1); + } + + if (def.success && remote.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespaces for %s have been found in two places;\n" + "Location 1: SpaceID: %lu LSN: %lu File: %s\n" + "Location 2: SpaceID: %lu LSN: %lu File: %s\n" + "You must delete one of them.", + tablename, (ulong) def.id, (ulong) def.lsn, + def.filepath, (ulong) remote.id, (ulong) remote.lsn, + remote.filepath); + + def.success = FALSE; + os_file_close(def.file); + os_file_close(remote.file); + goto will_not_choose; + } + + /* At this point, only one tablespace is open */ + ut_a(def.success == !remote.success); + + fsp_open_info* fsp = def.success ? &def : &remote; + + /* Get and test the file size. */ + size = os_file_get_size(fsp->file); + + if (size == (os_offset_t) -1) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "could not measure the size of single-table " + "tablespace file %s", fsp->filepath); + + os_file_close(fsp->file); + goto no_good_file; + } + + /* Every .ibd file is created >= 4 pages in size. Smaller files + cannot be ok. 
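+
+	With the default 16 kB page size the threshold below works out to
+	4 * 16384 = 65536 bytes, FIL_IBD_FILE_INITIAL_SIZE being the
+	4-page initial size referred to above.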
*/ + ulong minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE; + if (size < minimum_size) { +#ifndef UNIV_HOTBACKUP + ib_logf(IB_LOG_LEVEL_ERROR, + "The size of single-table tablespace file %s " + "is only " UINT64PF ", should be at least %lu!", + fsp->filepath, size, minimum_size); + os_file_close(fsp->file); + goto no_good_file; +#else + fsp->id = ULINT_UNDEFINED; + fsp->flags = 0; +#endif /* !UNIV_HOTBACKUP */ + } + +#ifdef UNIV_HOTBACKUP + if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) { + char* new_path; + + fprintf(stderr, + "InnoDB: Renaming tablespace %s of id %lu,\n" + "InnoDB: to %s_ibbackup_old_vers_<timestamp>\n" + "InnoDB: because its size %" PRId64 " is too small" + " (< 4 pages 16 kB each),\n" + "InnoDB: or the space id in the file header" + " is not sensible.\n" + "InnoDB: This can happen in an mysqlbackup run," + " and is not dangerous.\n", + fsp->filepath, fsp->id, fsp->filepath, size); + os_file_close(fsp->file); + + new_path = fil_make_ibbackup_old_name(fsp->filepath); + + bool success = os_file_rename( + innodb_file_data_key, fsp->filepath, new_path); + + ut_a(success); + + mem_free(new_path); + + goto func_exit_after_close; + } + + /* A backup may contain the same space several times, if the space got + renamed at a sensitive time. Since it is enough to have one version of + the space, we rename the file if a space with the same space id + already exists in the tablespace memory cache. We rather rename the + file than delete it, because if there is a bug, we do not want to + destroy valuable data. */ + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(fsp->id); + + if (space) { + char* new_path; + + fprintf(stderr, + "InnoDB: Renaming tablespace %s of id %lu,\n" + "InnoDB: to %s_ibbackup_old_vers_<timestamp>\n" + "InnoDB: because space %s with the same id\n" + "InnoDB: was scanned earlier. This can happen" + " if you have renamed tables\n" + "InnoDB: during an mysqlbackup run.\n", + fsp->filepath, fsp->id, fsp->filepath, + space->name); + os_file_close(fsp->file); + + new_path = fil_make_ibbackup_old_name(fsp->filepath); + + mutex_exit(&fil_system->mutex); + + bool success = os_file_rename( + innodb_file_data_key, fsp->filepath, new_path); + + ut_a(success); + + mem_free(new_path); + + goto func_exit_after_close; + } + mutex_exit(&fil_system->mutex); +#endif /* UNIV_HOTBACKUP */ + ibool file_space_create_success = fil_space_create( + tablename, fsp->id, fsp->flags, FIL_TABLESPACE); + + if (!file_space_create_success) { + if (srv_force_recovery > 0) { + fprintf(stderr, + "InnoDB: innodb_force_recovery was set" + " to %lu. Continuing crash recovery\n" + "InnoDB: even though the tablespace" + " creation of this table failed.\n", + srv_force_recovery); + goto func_exit; + } + + /* Exit here with a core dump, stack, etc. */ + ut_a(file_space_create_success); + } + + /* We do not use the size information we have about the file, because + the rounding formula for extents and pages is somewhat complex; we + let fil_node_open() do that task. */ + + if (!fil_node_create(fsp->filepath, 0, fsp->id, FALSE)) { + ut_error; + } + +func_exit: + os_file_close(fsp->file); + +#ifdef UNIV_HOTBACKUP +func_exit_after_close: +#else + ut_ad(!mutex_own(&fil_system->mutex)); +#endif + mem_free(tablename); + if (remote.success) { + mem_free(remote.filepath); + } + mem_free(def.filepath); +} + +/***********************************************************************//** +A fault-tolerant function that tries to read the next file name in the +directory. 
We retry 100 times if os_file_readdir_next_file() returns -1. The +idea is to read as much good data as we can and jump over bad data. +@return 0 if ok, -1 if error even after the retries, 1 if at the end +of the directory */ +static +int +fil_file_readdir_next_file( +/*=======================*/ + dberr_t* err, /*!< out: this is set to DB_ERROR if an error + was encountered, otherwise not changed */ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info) /*!< in/out: buffer where the + info is returned */ +{ + for (ulint i = 0; i < 100; i++) { + int ret = os_file_readdir_next_file(dirname, dir, info); + + if (ret != -1) { + + return(ret); + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "os_file_readdir_next_file() returned -1 in " + "directory %s, crash recovery may have failed " + "for some .ibd files!", dirname); + + *err = DB_ERROR; + } + + return(-1); +} + +/********************************************************************//** +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fil_load_single_table_tablespaces(void) +/*===================================*/ +{ + int ret; + char* dbpath = NULL; + ulint dbpath_len = 100; + os_file_dir_t dir; + os_file_dir_t dbdir; + os_file_stat_t dbinfo; + os_file_stat_t fileinfo; + dberr_t err = DB_SUCCESS; + + /* The datadir of MySQL is always the default directory of mysqld */ + + dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE); + + if (dir == NULL) { + + return(DB_ERROR); + } + + dbpath = static_cast<char*>(mem_alloc(dbpath_len)); + + /* Scan all directories under the datadir. They are the database + directories of MySQL. 
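+
+	The directory layout being scanned (hypothetical names):
+
+		<datadir>/
+			mydb/
+				t1.ibd	-> fil_load_single_table_tablespace(
+							"mydb", "t1.ibd")
+				t2.isl	-> fil_load_single_table_tablespace(
+							"mydb", "t2.isl")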
*/ + + ret = fil_file_readdir_next_file(&err, fil_path_to_mysql_datadir, dir, + &dbinfo); + while (ret == 0) { + ulint len; + /* printf("Looking at %s in datadir\n", dbinfo.name); */ + + if (dbinfo.type == OS_FILE_TYPE_FILE + || dbinfo.type == OS_FILE_TYPE_UNKNOWN) { + + goto next_datadir_item; + } + + /* We found a symlink or a directory; try opening it to see + if a symlink is a directory */ + + len = strlen(fil_path_to_mysql_datadir) + + strlen (dbinfo.name) + 2; + if (len > dbpath_len) { + dbpath_len = len; + + if (dbpath) { + mem_free(dbpath); + } + + dbpath = static_cast<char*>(mem_alloc(dbpath_len)); + } + ut_snprintf(dbpath, dbpath_len, + "%s/%s", fil_path_to_mysql_datadir, dbinfo.name); + srv_normalize_path_for_win(dbpath); + + dbdir = os_file_opendir(dbpath, FALSE); + + if (dbdir != NULL) { + + /* We found a database directory; loop through it, + looking for possible .ibd files in it */ + + ret = fil_file_readdir_next_file(&err, dbpath, dbdir, + &fileinfo); + while (ret == 0) { + + if (fileinfo.type == OS_FILE_TYPE_DIR) { + + goto next_file_item; + } + + /* We found a symlink or a file */ + if (strlen(fileinfo.name) > 4 + && (0 == strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".ibd") + || 0 == strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".isl"))) { + /* The name ends in .ibd or .isl; + try opening the file */ + fil_load_single_table_tablespace( + dbinfo.name, fileinfo.name); + } +next_file_item: + ret = fil_file_readdir_next_file(&err, + dbpath, dbdir, + &fileinfo); + } + + if (0 != os_file_closedir(dbdir)) { + fputs("InnoDB: Warning: could not" + " close database directory ", stderr); + ut_print_filename(stderr, dbpath); + putc('\n', stderr); + + err = DB_ERROR; + } + } + +next_datadir_item: + ret = fil_file_readdir_next_file(&err, + fil_path_to_mysql_datadir, + dir, &dbinfo); + } + + mem_free(dbpath); + + if (0 != os_file_closedir(dir)) { + fprintf(stderr, + "InnoDB: Error: could not close MySQL datadir\n"); + + return(DB_ERROR); + } + + return(err); +} + +/*******************************************************************//** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. +@return TRUE if does not exist or is being deleted */ +UNIV_INTERN +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + ulint id, /*!< in: space id */ + ib_int64_t version)/*!< in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space == NULL || space->stop_new_ops) { + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + if (version != ((ib_int64_t)-1) + && space->tablespace_version != version) { + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + mutex_exit(&fil_system->mutex); + + return(FALSE); +} + +/*******************************************************************//** +Returns TRUE if a single-table tablespace exists in the memory cache. 
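+
+A minimal usage sketch (the MLOG_FILE_CREATE replay branch in
+fil_op_log_parse_or_replay() above tests exactly this before deciding
+whether a file must be created):
+
+	if (fil_tablespace_exists_in_mem(space_id)) {
+		... the space is already loaded, nothing to create ...
+	}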
+@return TRUE if exists */ +UNIV_INTERN +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + mutex_exit(&fil_system->mutex); + + return(space != NULL); +} + +/*******************************************************************//** +Report that a tablespace for a table was not found. */ +static +void +fil_report_missing_tablespace( +/*===========================*/ + const char* name, /*!< in: table name */ + ulint space_id) /*!< in: table's space id */ +{ + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name(index_name, sizeof(index_name), name, TRUE); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Table %s in the InnoDB data dictionary has tablespace id %lu, " + "but tablespace with that id or name does not exist. Have " + "you deleted or moved .ibd files? This may also be a table " + "created with CREATE TEMPORARY TABLE whose .ibd and .frm " + "files MySQL automatically removed, but the table still " + "exists in the InnoDB internal data dictionary.", + name, space_id); +} + +/*******************************************************************//** +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. +@return TRUE if a matching tablespace exists in the memory cache */ +UNIV_INTERN +ibool +fil_space_for_table_exists_in_mem( +/*==============================*/ + ulint id, /*!< in: space id */ + const char* name, /*!< in: table name used in + fil_space_create(). Either the + standard 'dbname/tablename' format + or table->dir_path_of_temp_table */ + ibool mark_space, /*!< in: in crash recovery, at database + startup we mark all spaces which have + an associated table in the InnoDB + data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist, + /*!< in: print detailed error + information to the .err log if a + matching tablespace is not found from + memory */ + bool adjust_space, /*!< in: whether to adjust space id + when find table space mismatch */ + mem_heap_t* heap, /*!< in: heap memory */ + table_id_t table_id) /*!< in: table id */ +{ + fil_space_t* fnamespace; + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + /* Look if there is a space with the same id */ + + space = fil_space_get_by_id(id); + + /* Look if there is a space with the same name; the name is the + directory path from the datadir to the file */ + + fnamespace = fil_space_get_by_name(name); + if (space && space == fnamespace) { + /* Found */ + + if (mark_space) { + space->mark = TRUE; + } + + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + /* Info from "fnamespace" comes from the ibd file itself, it can + be different from data obtained from System tables since it is + not transactional. 
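+	(A typical case, stated here as an assumption: a crash during an
+	in-place ALTER TABLE can leave the .ibd file under a temporary
+	#sql name while the data dictionary already records the final
+	name.)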
If adjust_space is set, and the mismatching + space are between a user table and its temp table, we shall + adjust the ibd file name according to system table info */ + if (adjust_space + && space != NULL + && row_is_mysql_tmp_table_name(space->name) + && !row_is_mysql_tmp_table_name(name)) { + + mutex_exit(&fil_system->mutex); + + DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space", + DBUG_SUICIDE();); + + if (fnamespace) { + char* tmp_name; + + tmp_name = dict_mem_create_temporary_tablename( + heap, name, table_id); + + fil_rename_tablespace(fnamespace->name, fnamespace->id, + tmp_name, NULL); + } + + DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space", + DBUG_SUICIDE();); + + fil_rename_tablespace(space->name, id, name, NULL); + + DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space", + DBUG_SUICIDE();); + + mutex_enter(&fil_system->mutex); + fnamespace = fil_space_get_by_name(name); + ut_ad(space == fnamespace); + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + if (!print_error_if_does_not_exist) { + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + if (space == NULL) { + if (fnamespace == NULL) { + if (print_error_if_does_not_exist) { + fil_report_missing_tablespace(name, id); + } + } else { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary has" + " tablespace id %lu,\n" + "InnoDB: but a tablespace with that id" + " does not exist. There is\n" + "InnoDB: a tablespace of name %s and id %lu," + " though. Have\n" + "InnoDB: you deleted or moved .ibd files?\n", + (ulong) id, fnamespace->name, + (ulong) fnamespace->id); + } +error_exit: + fputs("InnoDB: Please refer to\n" + "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n" + "InnoDB: for how to resolve the issue.\n", stderr); + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + if (0 != strcmp(space->name, name)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary has" + " tablespace id %lu,\n" + "InnoDB: but the tablespace with that id" + " has name %s.\n" + "InnoDB: Have you deleted or moved .ibd files?\n", + (ulong) id, space->name); + + if (fnamespace != NULL) { + fputs("InnoDB: There is a tablespace" + " with the right name\n" + "InnoDB: ", stderr); + ut_print_filename(stderr, fnamespace->name); + fprintf(stderr, ", but its id is %lu.\n", + (ulong) fnamespace->id); + } + + goto error_exit; + } + + mutex_exit(&fil_system->mutex); + + return(FALSE); +} + +/*******************************************************************//** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. +@return space id, ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +fil_get_space_id_for_table( +/*=======================*/ + const char* tablename) /*!< in: table name in the standard + 'databasename/tablename' format */ +{ + fil_space_t* fnamespace; + ulint id = ULINT_UNDEFINED; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + /* Look if there is a space with the same name. */ + + fnamespace = fil_space_get_by_name(tablename); + + if (fnamespace) { + id = fnamespace->id; + } + + mutex_exit(&fil_system->mutex); + + return(id); +} + +/**********************************************************************//** +Tries to extend a data file so that it would accommodate the number of pages +given. 
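+(Sizes are counted in pages, so with 16KB pages a size_after_extend of
+6400 requests a data file of about 100 MB; the numbers are only an
+illustration.)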
The tablespace must be cached in the memory cache. If the space is big +enough already, does nothing. +@return TRUE if success */ +UNIV_INTERN +ibool +fil_extend_space_to_desired_size( +/*=============================*/ + ulint* actual_size, /*!< out: size of the space after extension; + if we ran out of disk space this may be lower + than the desired size */ + ulint space_id, /*!< in: space id */ + ulint size_after_extend)/*!< in: desired size in pages after the + extension; if the current space size is bigger + than this already, the function does nothing */ +{ + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ulint buf_size; + ulint start_page_no; + ulint file_start_page_no; + ulint page_size; + ulint pages_added; + ibool success; + + ut_ad(!srv_read_only_mode); + +retry: + pages_added = 0; + success = TRUE; + + fil_mutex_enter_and_prepare_for_io(space_id); + + space = fil_space_get_by_id(space_id); + ut_a(space); + + if (space->size >= size_after_extend) { + /* Space already big enough */ + + *actual_size = space->size; + + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + page_size = fsp_flags_get_zip_size(space->flags); + if (!page_size) { + page_size = UNIV_PAGE_SIZE; + } + + node = UT_LIST_GET_LAST(space->chain); + + if (!node->being_extended) { + /* Mark this node as undergoing extension. This flag + is used by other threads to wait for the extension + opereation to finish. */ + node->being_extended = TRUE; + } else { + /* Another thread is currently extending the file. Wait + for it to finish. + It'd have been better to use event driven mechanism but + the entire module is peppered with polling stuff. */ + mutex_exit(&fil_system->mutex); + os_thread_sleep(100000); + goto retry; + } + + if (!fil_node_prepare_for_io(node, fil_system, space)) { + /* The tablespace data file, such as .ibd file, is missing */ + node->being_extended = false; + mutex_exit(&fil_system->mutex); + + return(false); + } + + /* At this point it is safe to release fil_system mutex. No + other thread can rename, delete or close the file because + we have set the node->being_extended flag. */ + mutex_exit(&fil_system->mutex); + + start_page_no = space->size; + file_start_page_no = space->size - node->size; + +#ifdef HAVE_POSIX_FALLOCATE + if (srv_use_posix_fallocate) { + + os_offset_t start_offset = file_start_page_no * page_size; + os_offset_t end_offset + = (size_after_extend - file_start_page_no) * page_size; + + success = (posix_fallocate(node->handle, start_offset, + end_offset) == 0); + if (!success) + { + ib_logf(IB_LOG_LEVEL_ERROR, + "preallocating file space for file \'%s\' " + "failed. 
Current size " INT64PF + ", len " INT64PF ", desired size " INT64PF + "\n", node->name, start_offset, end_offset, + start_offset + end_offset); + } + mutex_enter(&fil_system->mutex); + if (success) { + node->size += (size_after_extend - start_page_no); + space->size += (size_after_extend - start_page_no); + os_has_said_disk_full = FALSE; + } + node->being_extended = FALSE; + fil_node_complete_io(node, fil_system, OS_FILE_READ); + goto complete_io; + } +#endif + + /* Extend at most 64 pages at a time */ + buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; + buf2 = static_cast<byte*>(mem_alloc(buf_size + page_size)); + buf = static_cast<byte*>(ut_align(buf2, page_size)); + + memset(buf, 0, buf_size); + + while (start_page_no < size_after_extend) { + ulint n_pages + = ut_min(buf_size / page_size, + size_after_extend - start_page_no); + + os_offset_t offset + = ((os_offset_t) (start_page_no - file_start_page_no)) + * page_size; +#ifdef UNIV_HOTBACKUP + success = os_file_write(node->name, node->handle, buf, + offset, page_size * n_pages); +#else + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + offset, page_size * n_pages, + NULL, NULL, space_id, NULL); +#endif /* UNIV_HOTBACKUP */ + if (success) { + os_has_said_disk_full = FALSE; + } else { + /* Let us measure the size of the file to determine + how much we were able to extend it */ + os_offset_t size; + + size = os_file_get_size(node->handle); + ut_a(size != (os_offset_t) -1); + + n_pages = ((ulint) (size / page_size)) + - node->size - pages_added; + + pages_added += n_pages; + break; + } + + start_page_no += n_pages; + pages_added += n_pages; + } + + mem_free(buf2); + + mutex_enter(&fil_system->mutex); + + ut_a(node->being_extended); + + space->size += pages_added; + node->size += pages_added; + node->being_extended = FALSE; + + fil_node_complete_io(node, fil_system, OS_FILE_WRITE); + +#ifdef HAVE_POSIX_FALLOCATE +complete_io: +#endif + + *actual_size = space->size; + +#ifndef UNIV_HOTBACKUP + if (space_id == 0) { + ulint pages_per_mb = (1024 * 1024) / page_size; + + /* Keep the last data file size info up to date, rounded to + full megabytes */ + + srv_data_file_sizes[srv_n_data_files - 1] + = (node->size / pages_per_mb) * pages_per_mb; + } +#endif /* !UNIV_HOTBACKUP */ + + /* + printf("Extended %s to %lu, actual size %lu pages\n", space->name, + size_after_extend, *actual_size); */ + mutex_exit(&fil_system->mutex); + + fil_flush(space_id); + + return(success); +} + +#ifdef UNIV_HOTBACKUP +/********************************************************************//** +Extends all tablespaces to the size stored in the space header. During the +mysqlbackup --apply-log phase we extended the spaces on-demand so that log +records could be applied, but that may have left spaces still too small +compared to the size stored in the space header. 
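+The stored size is read from the FSP_SIZE field on page 0 of each space
+(see fsp_get_size_low()) and passed to fil_extend_space_to_desired_size().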
*/ +UNIV_INTERN +void +fil_extend_tablespaces_to_stored_len(void) +/*======================================*/ +{ + fil_space_t* space; + byte* buf; + ulint actual_size; + ulint size_in_header; + dberr_t error; + ibool success; + + buf = mem_alloc(UNIV_PAGE_SIZE); + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space) { + ut_a(space->purpose == FIL_TABLESPACE); + + mutex_exit(&fil_system->mutex); /* no need to protect with a + mutex, because this is a + single-threaded operation */ + error = fil_read(TRUE, space->id, + fsp_flags_get_zip_size(space->flags), + 0, 0, UNIV_PAGE_SIZE, buf, NULL); + ut_a(error == DB_SUCCESS); + + size_in_header = fsp_get_size_low(buf); + + success = fil_extend_space_to_desired_size( + &actual_size, space->id, size_in_header); + if (!success) { + fprintf(stderr, + "InnoDB: Error: could not extend the" + " tablespace of %s\n" + "InnoDB: to the size stored in header," + " %lu pages;\n" + "InnoDB: size after extension %lu pages\n" + "InnoDB: Check that you have free disk space" + " and retry!\n", + space->name, size_in_header, actual_size); + ut_a(success); + } + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&fil_system->mutex); + + mem_free(buf); +} +#endif + +/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/ + +/*******************************************************************//** +Tries to reserve free extents in a file space. +@return TRUE if succeed */ +UNIV_INTERN +ibool +fil_space_reserve_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_free_now, /*!< in: number of free extents now */ + ulint n_to_reserve) /*!< in: how many one wants to reserve */ +{ + fil_space_t* space; + ibool success; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + if (space->n_reserved_extents + n_to_reserve > n_free_now) { + success = FALSE; + } else { + space->n_reserved_extents += n_to_reserve; + success = TRUE; + } + + mutex_exit(&fil_system->mutex); + + return(success); +} + +/*******************************************************************//** +Releases free extents in a file space. */ +UNIV_INTERN +void +fil_space_release_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_reserved) /*!< in: how many one reserved */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + ut_a(space->n_reserved_extents >= n_reserved); + + space->n_reserved_extents -= n_reserved; + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ +UNIV_INTERN +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ulint n; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + n = space->n_reserved_extents; + + mutex_exit(&fil_system->mutex); + + return(n); +} + +/*============================ FILE I/O ================================*/ + +/********************************************************************//** +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + +Prepares a file node for i/o. Opens the file if it is closed. 
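+(Opening may need a free slot in the table of open files, which is why
+the caller must first have called fil_mutex_enter_and_prepare_for_io().)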
Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. +@return false if the file can't be opened, otherwise true */ +static +bool +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space) /*!< in: space */ +{ + ut_ad(node && system && space); + ut_ad(mutex_own(&(system->mutex))); + + if (system->n_open > system->max_n_open + 5) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: open files %lu" + " exceeds the limit %lu\n", + (ulong) system->n_open, + (ulong) system->max_n_open); + } + + if (node->open == FALSE) { + /* File is closed: open it */ + ut_a(node->n_pending == 0); + + if (!fil_node_open_file(node, system, space)) { + return(false); + } + } + + if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) { + /* The node is in the LRU list, remove it */ + + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); + + UT_LIST_REMOVE(LRU, system->LRU, node); + } + + node->n_pending++; + + return(true); +} + +/********************************************************************//** +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. */ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + ulint type) /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ +{ + ut_ad(node); + ut_ad(system); + ut_ad(mutex_own(&(system->mutex))); + + ut_a(node->n_pending > 0); + + node->n_pending--; + + if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + system->modification_counter++; + node->modification_counter = system->modification_counter; + + if (fil_buffering_disabled(node->space)) { + + /* We don't need to keep track of unflushed + changes as user has explicitly disabled + buffering. */ + ut_ad(!node->space->is_in_unflushed_spaces); + node->flush_counter = node->modification_counter; + + } else if (!node->space->is_in_unflushed_spaces) { + + node->space->is_in_unflushed_spaces = true; + UT_LIST_ADD_FIRST(unflushed_spaces, + system->unflushed_spaces, + node->space); + } + } + + if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) { + + /* The node must be put back to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } +} + +/********************************************************************//** +Report information about an invalid page access. 
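+Both call sites follow this report with ut_error, so the invalid access
+is treated as fatal.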
*/ +static +void +fil_report_invalid_page_access( +/*===========================*/ + ulint block_offset, /*!< in: block offset */ + ulint space_id, /*!< in: space id */ + const char* space_name, /*!< in: space name */ + ulint byte_offset, /*!< in: byte offset */ + ulint len, /*!< in: I/O length */ + ulint type) /*!< in: I/O type */ +{ + fprintf(stderr, + "InnoDB: Error: trying to access page number %lu" + " in space %lu,\n" + "InnoDB: space name %s,\n" + "InnoDB: which is outside the tablespace bounds.\n" + "InnoDB: Byte offset %lu, len %lu, i/o type %lu.\n" + "InnoDB: If you get this error at mysqld startup," + " please check that\n" + "InnoDB: your my.cnf matches the ibdata files" + " that you have in the\n" + "InnoDB: MySQL server.\n", + (ulong) block_offset, (ulong) space_id, space_name, + (ulong) byte_offset, (ulong) len, (ulong) type); +} + +/********************************************************************//** +Reads or writes data. This operation is asynchronous (aio). +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +UNIV_INTERN +dberr_t +_fil_io( +/*===*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE, + ORed to OS_FILE_LOG, if a log i/o + and ORed to OS_AIO_SIMULATED_WAKE_LATER + if simulated aio and we want to post a + batch of i/os; NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! */ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in + aio this must be divisible by the OS block + size */ + ulint len, /*!< in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ + void* buf, /*!< in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + trx_t* trx) +{ + ulint mode; + fil_space_t* space; + fil_node_t* node; + ibool ret; + ulint is_log; + ulint wake_later; + os_offset_t offset; + ibool ignore_nonexistent_pages; + + is_log = type & OS_FILE_LOG; + type = type & ~OS_FILE_LOG; + + wake_later = type & OS_AIO_SIMULATED_WAKE_LATER; + type = type & ~OS_AIO_SIMULATED_WAKE_LATER; + + ignore_nonexistent_pages = type & BUF_READ_IGNORE_NONEXISTENT_PAGES; + type &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES; + + ut_ad(byte_offset < UNIV_PAGE_SIZE); + ut_ad(!zip_size || !byte_offset); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(buf); + ut_ad(len > 0); + ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT)); +#if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX +# error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX" +#endif +#if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN +# error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN" +#endif + ut_ad(fil_validate_skip()); +#ifndef UNIV_HOTBACKUP +# ifndef UNIV_LOG_DEBUG + /* ibuf bitmap pages must be read in the sync aio mode: */ + ut_ad(recv_no_ibuf_operations + || type == OS_FILE_WRITE + || !ibuf_bitmap_page(zip_size, block_offset) + || sync + || is_log); +# endif /* UNIV_LOG_DEBUG */ + if (sync) { + mode = OS_AIO_SYNC; + } else if (is_log) { + mode = OS_AIO_LOG; + } else 
if (type == OS_FILE_READ + && !recv_no_ibuf_operations + && ibuf_page(space_id, zip_size, block_offset, NULL)) { + mode = OS_AIO_IBUF; + } else { + mode = OS_AIO_NORMAL; + } +#else /* !UNIV_HOTBACKUP */ + ut_a(sync); + mode = OS_AIO_SYNC; +#endif /* !UNIV_HOTBACKUP */ + + if (type == OS_FILE_READ) { + srv_stats.data_read.add(len); + } else if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + srv_stats.data_written.add(len); + } + + /* Reserve the fil_system mutex and make sure that we can open at + least one file while holding it, if the file is not already open */ + + fil_mutex_enter_and_prepare_for_io(space_id); + + space = fil_space_get_by_id(space_id); + + /* If we are deleting a tablespace we don't allow any read + operations on that. However, we do allow write operations. */ + if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) { + mutex_exit(&fil_system->mutex); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to do i/o to a tablespace which does " + "not exist. i/o type %lu, space id %lu, " + "page no. %lu, i/o length %lu bytes", + (ulong) type, (ulong) space_id, (ulong) block_offset, + (ulong) len); + + return(DB_TABLESPACE_DELETED); + } + + ut_ad(mode != OS_AIO_IBUF || space->purpose == FIL_TABLESPACE); + + node = UT_LIST_GET_FIRST(space->chain); + + for (;;) { + if (node == NULL) { + if (ignore_nonexistent_pages) { + mutex_exit(&fil_system->mutex); + return(DB_ERROR); + } + + fil_report_invalid_page_access( + block_offset, space_id, space->name, + byte_offset, len, type); + + ut_error; + + } else if (fil_is_user_tablespace_id(space->id) + && node->size == 0) { + + /* We do not know the size of a single-table tablespace + before we open the file */ + break; + } else if (node->size > block_offset) { + /* Found! */ + break; + } else { + block_offset -= node->size; + node = UT_LIST_GET_NEXT(chain, node); + } + } + + /* Open file if closed */ + if (!fil_node_prepare_for_io(node, fil_system, space)) { + if (space->purpose == FIL_TABLESPACE + && fil_is_user_tablespace_id(space->id)) { + mutex_exit(&fil_system->mutex); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to do i/o to a tablespace which " + "exists without .ibd data file. " + "i/o type %lu, space id %lu, page no %lu, " + "i/o length %lu bytes", + (ulong) type, (ulong) space_id, + (ulong) block_offset, (ulong) len); + + return(DB_TABLESPACE_DELETED); + } + + /* The tablespace is for log. Currently, we just assert here + to prevent handling errors along the way fil_io returns. + Also, if the log files are missing, it would be hard to + promise the server can continue running. */ + ut_a(0); + } + + /* Check that at least the start offset is within the bounds of a + single-table tablespace, including rollback tablespaces. 
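+	The check is skipped for space id 0, whose logical space may span
+	several data files (the node chain walked above).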
*/ + if (UNIV_UNLIKELY(node->size <= block_offset) + && space->id != 0 && space->purpose == FIL_TABLESPACE) { + + fil_report_invalid_page_access( + block_offset, space_id, space->name, byte_offset, + len, type); + + ut_error; + } + + /* Now we have made the changes in the data structures of fil_system */ + mutex_exit(&fil_system->mutex); + + /* Calculate the low 32 bits and the high 32 bits of the file offset */ + + if (!zip_size) { + offset = ((os_offset_t) block_offset << UNIV_PAGE_SIZE_SHIFT) + + byte_offset; + + ut_a(node->size - block_offset + >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1)) + / UNIV_PAGE_SIZE)); + } else { + ulint zip_size_shift; + switch (zip_size) { + case 1024: zip_size_shift = 10; break; + case 2048: zip_size_shift = 11; break; + case 4096: zip_size_shift = 12; break; + case 8192: zip_size_shift = 13; break; + case 16384: zip_size_shift = 14; break; + default: ut_error; + } + offset = ((os_offset_t) block_offset << zip_size_shift) + + byte_offset; + ut_a(node->size - block_offset + >= (len + (zip_size - 1)) / zip_size); + } + + /* Do aio */ + + ut_a(byte_offset % OS_MIN_LOG_BLOCK_SIZE == 0); + ut_a((len % OS_MIN_LOG_BLOCK_SIZE) == 0); + +#ifndef UNIV_HOTBACKUP + if (UNIV_UNLIKELY(space->is_corrupt && srv_pass_corrupt_table)) { + + /* should ignore i/o for the crashed space */ + if (srv_pass_corrupt_table == 1 || + type == OS_FILE_WRITE) { + + mutex_enter(&fil_system->mutex); + fil_node_complete_io(node, fil_system, type); + mutex_exit(&fil_system->mutex); + if (mode == OS_AIO_NORMAL) { + ut_a(space->purpose == FIL_TABLESPACE); + buf_page_io_complete(static_cast<buf_page_t *> + (message)); + } + } + + if (srv_pass_corrupt_table == 1 && type == OS_FILE_READ) { + + return(DB_TABLESPACE_DELETED); + + } else if (type == OS_FILE_WRITE) { + + return(DB_SUCCESS); + } + } + + /* Queue the aio request */ + ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, + offset, len, node, message, space_id, trx); + +#else + /* In mysqlbackup do normal i/o, not aio */ + if (type == OS_FILE_READ) { + ret = os_file_read(node->handle, buf, offset, len); + } else { + ut_ad(!srv_read_only_mode); + ret = os_file_write(node->name, node->handle, buf, + offset, len); + } +#endif /* !UNIV_HOTBACKUP */ + ut_a(ret); + + if (mode == OS_AIO_SYNC) { + /* The i/o operation is already completed when we return from + os_aio: */ + + mutex_enter(&fil_system->mutex); + + fil_node_complete_io(node, fil_system, type); + + mutex_exit(&fil_system->mutex); + + ut_ad(fil_validate_skip()); + } + + return(DB_SUCCESS); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Waits for an aio operation to complete. This function is used to write the +handler for completed requests. The aio array of pending requests is divided +into segments (see os0file.cc for more info). The thread specifies which +segment it wants to wait for. 
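+When a request completes, the node's pending i/o count is updated and
+the completion handler is dispatched: buf_page_io_complete() for
+tablespace pages, log_io_complete() for log i/o.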
*/ +UNIV_INTERN +void +fil_aio_wait( +/*=========*/ + ulint segment) /*!< in: the number of the segment in the aio + array to wait for */ +{ + ibool ret; + fil_node_t* fil_node; + void* message; + ulint type; + ulint space_id = 0; + + ut_ad(fil_validate_skip()); + + if (srv_use_native_aio) { + srv_set_io_thread_op_info(segment, "native aio handle"); +#ifdef WIN_ASYNC_IO + ret = os_aio_windows_handle( + segment, 0, &fil_node, &message, &type, &space_id); +#elif defined(LINUX_NATIVE_AIO) + ret = os_aio_linux_handle( + segment, &fil_node, &message, &type, &space_id); +#else + ut_error; + ret = 0; /* Eliminate compiler warning */ +#endif /* WIN_ASYNC_IO */ + } else { + srv_set_io_thread_op_info(segment, "simulated aio handle"); + + ret = os_aio_simulated_handle( + segment, &fil_node, &message, &type, &space_id); + } + + ut_a(ret); + if (fil_node == NULL) { + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + return; + } + + srv_set_io_thread_op_info(segment, "complete io for fil node"); + + mutex_enter(&fil_system->mutex); + + fil_node_complete_io(fil_node, fil_system, type); + + mutex_exit(&fil_system->mutex); + + ut_ad(fil_validate_skip()); + + /* Do the i/o handling */ + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in tablespace 0, you have to be very careful not to introduce + deadlocks in the i/o system. We keep tablespace 0 data files always + open, and use a special i/o thread to serve insert buffer requests. */ + + if (fil_node->space->purpose == FIL_TABLESPACE) { + srv_set_io_thread_op_info(segment, "complete io for buf page"); + buf_page_io_complete(static_cast<buf_page_t*>(message)); + } else { + srv_set_io_thread_op_info(segment, "complete io for log"); + log_io_complete(static_cast<log_group_t*>(message)); + } +} +#endif /* UNIV_HOTBACKUP */ + +/**********************************************************************//** +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ +UNIV_INTERN +void +fil_flush( +/*======*/ + ulint space_id) /*!< in: file space id (this can be a group of + log files or a tablespace of the database) */ +{ + fil_space_t* space; + fil_node_t* node; + os_file_t file; + + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(space_id); + + if (!space || space->stop_new_ops) { + mutex_exit(&fil_system->mutex); + + return; + } + + if (fil_buffering_disabled(space)) { + + /* No need to flush. User has explicitly disabled + buffering. 
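+		(Presumably this covers setups such as O_DIRECT_NO_FSYNC,
+		where writes bypass the OS cache; see fil_buffering_disabled().)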
*/ + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(fil_space_is_flushed(space)); + ut_ad(space->n_pending_flushes == 0); + +#ifdef UNIV_DEBUG + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + ut_ad(node->modification_counter + == node->flush_counter); + ut_ad(node->n_pending_flushes == 0); + } +#endif /* UNIV_DEBUG */ + + mutex_exit(&fil_system->mutex); + return; + } + + space->n_pending_flushes++; /*!< prevent dropping of the space while + we are flushing */ + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + ib_int64_t old_mod_counter = node->modification_counter;; + + if (old_mod_counter <= node->flush_counter) { + continue; + } + + ut_a(node->open); + + if (space->purpose == FIL_TABLESPACE) { + fil_n_pending_tablespace_flushes++; + } else { + fil_n_pending_log_flushes++; + fil_n_log_flushes++; + } +#ifdef __WIN__ + if (node->is_raw_disk) { + + goto skip_flush; + } +#endif /* __WIN__ */ +retry: + if (node->n_pending_flushes > 0) { + /* We want to avoid calling os_file_flush() on + the file twice at the same time, because we do + not know what bugs OS's may contain in file + i/o */ + + ib_int64_t sig_count = + os_event_reset(node->sync_event); + + mutex_exit(&fil_system->mutex); + + os_event_wait_low(node->sync_event, sig_count); + + mutex_enter(&fil_system->mutex); + + if (node->flush_counter >= old_mod_counter) { + + goto skip_flush; + } + + goto retry; + } + + ut_a(node->open); + file = node->handle; + node->n_pending_flushes++; + + mutex_exit(&fil_system->mutex); + + os_file_flush(file); + + mutex_enter(&fil_system->mutex); + + os_event_set(node->sync_event); + + node->n_pending_flushes--; +skip_flush: + if (node->flush_counter < old_mod_counter) { + node->flush_counter = old_mod_counter; + + if (space->is_in_unflushed_spaces + && fil_space_is_flushed(space)) { + + space->is_in_unflushed_spaces = false; + + UT_LIST_REMOVE( + unflushed_spaces, + fil_system->unflushed_spaces, + space); + } + } + + if (space->purpose == FIL_TABLESPACE) { + fil_n_pending_tablespace_flushes--; + } else { + fil_n_pending_log_flushes--; + } + } + + space->n_pending_flushes--; + + mutex_exit(&fil_system->mutex); +} + +/**********************************************************************//** +Flushes to disk the writes in file spaces of the given type possibly cached by +the OS. */ +UNIV_INTERN +void +fil_flush_file_spaces( +/*==================*/ + ulint purpose) /*!< in: FIL_TABLESPACE, FIL_LOG */ +{ + fil_space_t* space; + ulint* space_ids; + ulint n_space_ids; + ulint i; + + mutex_enter(&fil_system->mutex); + + n_space_ids = UT_LIST_GET_LEN(fil_system->unflushed_spaces); + if (n_space_ids == 0) { + + mutex_exit(&fil_system->mutex); + return; + } + + /* Assemble a list of space ids to flush. Previously, we + traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT() + on a space that was just removed from the list by fil_flush(). + Thus, the space could be dropped and the memory overwritten. */ + space_ids = static_cast<ulint*>( + mem_alloc(n_space_ids * sizeof *space_ids)); + + n_space_ids = 0; + + for (space = UT_LIST_GET_FIRST(fil_system->unflushed_spaces); + space; + space = UT_LIST_GET_NEXT(unflushed_spaces, space)) { + + if (space->purpose == purpose && !space->stop_new_ops) { + + space_ids[n_space_ids++] = space->id; + } + } + + mutex_exit(&fil_system->mutex); + + /* Flush the spaces. It will not hurt to call fil_flush() on + a non-existing space id. 
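+	fil_flush() simply returns if the id is no longer in the cache, so
+	a space dropped after this id list was built is handled safely.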
*/ + for (i = 0; i < n_space_ids; i++) { + + fil_flush(space_ids[i]); + } + + mem_free(space_ids); +} + +/** Functor to validate the space list. */ +struct Check { + void operator()(const fil_node_t* elem) + { + ut_a(elem->open || !elem->n_pending); + } +}; + +/******************************************************************//** +Checks the consistency of the tablespace cache. +@return TRUE if ok */ +UNIV_INTERN +ibool +fil_validate(void) +/*==============*/ +{ + fil_space_t* space; + fil_node_t* fil_node; + ulint n_open = 0; + ulint i; + + mutex_enter(&fil_system->mutex); + + /* Look for spaces in the hash table */ + + for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) { + + for (space = static_cast<fil_space_t*>( + HASH_GET_FIRST(fil_system->spaces, i)); + space != 0; + space = static_cast<fil_space_t*>( + HASH_GET_NEXT(hash, space))) { + + UT_LIST_VALIDATE( + chain, fil_node_t, space->chain, Check()); + + for (fil_node = UT_LIST_GET_FIRST(space->chain); + fil_node != 0; + fil_node = UT_LIST_GET_NEXT(chain, fil_node)) { + + if (fil_node->n_pending > 0) { + ut_a(fil_node->open); + } + + if (fil_node->open) { + n_open++; + } + } + } + } + + ut_a(fil_system->n_open == n_open); + + UT_LIST_CHECK(LRU, fil_node_t, fil_system->LRU); + + for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU); + fil_node != 0; + fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) { + + ut_a(fil_node->n_pending == 0); + ut_a(!fil_node->being_extended); + ut_a(fil_node->open); + ut_a(fil_space_belongs_in_lru(fil_node->space)); + } + + mutex_exit(&fil_system->mutex); + + return(TRUE); +} + +/********************************************************************//** +Returns TRUE if file address is undefined. +@return TRUE if undefined */ +UNIV_INTERN +ibool +fil_addr_is_null( +/*=============*/ + fil_addr_t addr) /*!< in: address */ +{ + return(addr.page == FIL_NULL); +} + +/********************************************************************//** +Get the predecessor of a file page. +@return FIL_PAGE_PREV */ +UNIV_INTERN +ulint +fil_page_get_prev( +/*==============*/ + const byte* page) /*!< in: file page */ +{ + return(mach_read_from_4(page + FIL_PAGE_PREV)); +} + +/********************************************************************//** +Get the successor of a file page. +@return FIL_PAGE_NEXT */ +UNIV_INTERN +ulint +fil_page_get_next( +/*==============*/ + const byte* page) /*!< in: file page */ +{ + return(mach_read_from_4(page + FIL_PAGE_NEXT)); +} + +/*********************************************************************//** +Sets the file page type. */ +UNIV_INTERN +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type) /*!< in: type */ +{ + ut_ad(page); + + mach_write_to_2(page + FIL_PAGE_TYPE, type); +} + +/*********************************************************************//** +Gets the file page type. +@return type; NOTE that if the type has not been written to page, the +return value not defined */ +UNIV_INTERN +ulint +fil_page_get_type( +/*==============*/ + const byte* page) /*!< in: file page */ +{ + ut_ad(page); + + return(mach_read_from_2(page + FIL_PAGE_TYPE)); +} + +/****************************************************************//** +Closes the tablespace memory cache. */ +UNIV_INTERN +void +fil_close(void) +/*===========*/ +{ +#ifndef UNIV_HOTBACKUP + /* The mutex should already have been freed. 
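+	(A zero magic_n is assumed to mean that mutex_free() has already
+	run; the assertion below catches a missing shutdown step.)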
*/ + ut_ad(fil_system->mutex.magic_n == 0); +#endif /* !UNIV_HOTBACKUP */ + + hash_table_free(fil_system->spaces); + + hash_table_free(fil_system->name_hash); + + ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0); + ut_a(UT_LIST_GET_LEN(fil_system->unflushed_spaces) == 0); + ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0); + + mem_free(fil_system); + + fil_system = NULL; +} + +/********************************************************************//** +Initializes a buffer control block when the buf_pool is created. */ +static +void +fil_buf_block_init( +/*===============*/ + buf_block_t* block, /*!< in: pointer to control block */ + byte* frame) /*!< in: pointer to buffer frame */ +{ + UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE); + + block->frame = frame; + + block->page.io_fix = BUF_IO_NONE; + /* There are assertions that check for this. */ + block->page.buf_fix_count = 1; + block->page.state = BUF_BLOCK_READY_FOR_USE; + + page_zip_des_init(&block->page.zip); +} + +struct fil_iterator_t { + os_file_t file; /*!< File handle */ + const char* filepath; /*!< File path name */ + os_offset_t start; /*!< From where to start */ + os_offset_t end; /*!< Where to stop */ + os_offset_t file_size; /*!< File size in bytes */ + ulint page_size; /*!< Page size */ + ulint n_io_buffers; /*!< Number of pages to use + for IO */ + byte* io_buffer; /*!< Buffer to use for IO */ +}; + +/********************************************************************//** +TODO: This can be made parallel trivially by chunking up the file and creating +a callback per thread. . Main benefit will be to use multiple CPUs for +checksums and compressed tables. We have to do compressed tables block by +block right now. Secondly we need to decompress/compress and copy too much +of data. These are CPU intensive. + +Iterate over all the pages in the tablespace. +@param iter - Tablespace iterator +@param block - block to use for IO +@param callback - Callback to inspect and update page contents +@retval DB_SUCCESS or error code */ +static +dberr_t +fil_iterate( +/*========*/ + const fil_iterator_t& iter, + buf_block_t* block, + PageCallback& callback) +{ + os_offset_t offset; + ulint page_no = 0; + ulint space_id = callback.get_space_id(); + ulint n_bytes = iter.n_io_buffers * iter.page_size; + + ut_ad(!srv_read_only_mode); + + /* TODO: For compressed tables we do a lot of useless + copying for non-index pages. Unfortunately, it is + required by buf_zip_decompress() */ + + for (offset = iter.start; offset < iter.end; offset += n_bytes) { + + byte* io_buffer = iter.io_buffer; + + block->frame = io_buffer; + + if (callback.get_zip_size() > 0) { + page_zip_des_init(&block->page.zip); + page_zip_set_size(&block->page.zip, iter.page_size); + block->page.zip.data = block->frame + UNIV_PAGE_SIZE; + ut_d(block->page.zip.m_external = true); + ut_ad(iter.page_size == callback.get_zip_size()); + + /* Zip IO is done in the compressed page buffer. */ + io_buffer = block->page.zip.data; + } else { + io_buffer = iter.io_buffer; + } + + /* We have to read the exact number of bytes. Otherwise the + InnoDB IO functions croak on failed reads. 
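+		The last chunk of the file is usually shorter than
+		iter.n_io_buffers pages, hence the ut_min() below.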
*/ + + n_bytes = static_cast<ulint>( + ut_min(static_cast<os_offset_t>(n_bytes), + iter.end - offset)); + + ut_ad(n_bytes > 0); + ut_ad(!(n_bytes % iter.page_size)); + + if (!os_file_read(iter.file, io_buffer, offset, + (ulint) n_bytes)) { + + ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed"); + + return(DB_IO_ERROR); + } + + bool updated = false; + os_offset_t page_off = offset; + ulint n_pages_read = (ulint) n_bytes / iter.page_size; + + for (ulint i = 0; i < n_pages_read; ++i) { + + buf_block_set_file_page(block, space_id, page_no++); + + dberr_t err; + + if ((err = callback(page_off, block)) != DB_SUCCESS) { + + return(err); + + } else if (!updated) { + updated = buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE; + } + + buf_block_set_state(block, BUF_BLOCK_NOT_USED); + buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); + + page_off += iter.page_size; + block->frame += iter.page_size; + } + + /* A page was updated in the set, write back to disk. */ + if (updated + && !os_file_write( + iter.filepath, iter.file, io_buffer, + offset, (ulint) n_bytes)) { + + ib_logf(IB_LOG_LEVEL_ERROR, "os_file_write() failed"); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/********************************************************************//** +Iterate over all the pages in the tablespace. +@param table - the table definiton in the server +@param n_io_buffers - number of blocks to read and write together +@param callback - functor that will do the page updates +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_tablespace_iterate( +/*===================*/ + dict_table_t* table, + ulint n_io_buffers, + PageCallback& callback) +{ + dberr_t err; + os_file_t file; + char* filepath; + + ut_a(n_io_buffers > 0); + ut_ad(!srv_read_only_mode); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_1", + return(DB_CORRUPTION);); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, false); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + + { + ibool success; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, filepath, + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + + DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", + { + static bool once; + + if (!once || ut_rnd_interval(0, 10) == 5) { + once = true; + success = FALSE; + os_file_close(file); + } + }); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to import a tablespace, but could not " + "open the tablespace file %s", filepath); + + mem_free(filepath); + + return(DB_TABLESPACE_NOT_FOUND); + + } else { + err = DB_SUCCESS; + } + } + + callback.set_file(filepath, file); + + os_offset_t file_size = os_file_get_size(file); + ut_a(file_size != (os_offset_t) -1); + + /* The block we will use for every physical page */ + buf_block_t block; + + memset(&block, 0x0, sizeof(block)); + + /* Allocate a page to read in the tablespace header, so that we + can determine the page size and zip_size (if it is compressed). + We allocate an extra page in case it is a compressed table. One + page is to ensure alignement. */ + + void* page_ptr = mem_alloc(3 * UNIV_PAGE_SIZE); + byte* page = static_cast<byte*>(ut_align(page_ptr, UNIV_PAGE_SIZE)); + + fil_buf_block_init(&block, page); + + /* Read the first page and determine the page and zip size. 
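+	Both values are taken from the FSP header flags on page 0 (via
+	callback.init()); for a compressed table the zip size (a power of
+	two, at most UNIV_ZIP_SIZE_MAX) is used as the physical page size.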
*/ + + if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE)) { + + err = DB_IO_ERROR; + + } else if ((err = callback.init(file_size, &block)) == DB_SUCCESS) { + fil_iterator_t iter; + + iter.file = file; + iter.start = 0; + iter.end = file_size; + iter.filepath = filepath; + iter.file_size = file_size; + iter.n_io_buffers = n_io_buffers; + iter.page_size = callback.get_page_size(); + + /* Compressed pages can't be optimised for block IO for now. + We do the IMPORT page by page. */ + + if (callback.get_zip_size() > 0) { + iter.n_io_buffers = 1; + ut_a(iter.page_size == callback.get_zip_size()); + } + + /** Add an extra page for compressed page scratch area. */ + + void* io_buffer = mem_alloc( + (2 + iter.n_io_buffers) * UNIV_PAGE_SIZE); + + iter.io_buffer = static_cast<byte*>( + ut_align(io_buffer, UNIV_PAGE_SIZE)); + + err = fil_iterate(iter, &block, callback); + + mem_free(io_buffer); + } + + if (err == DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk"); + + if (!os_file_flush(file)) { + ib_logf(IB_LOG_LEVEL_INFO, "os_file_flush() failed!"); + err = DB_IO_ERROR; + } else { + ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk - done!"); + } + } + + os_file_close(file); + + mem_free(page_ptr); + mem_free(filepath); + + return(err); +} + +/** +Set the tablespace compressed table size. +@return DB_SUCCESS if it is valie or DB_CORRUPTION if not */ +dberr_t +PageCallback::set_zip_size(const buf_frame_t* page) UNIV_NOTHROW +{ + m_zip_size = fsp_header_get_zip_size(page); + + if (!ut_is_2pow(m_zip_size) || m_zip_size > UNIV_ZIP_SIZE_MAX) { + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); +} + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +UNIV_INTERN +void +fil_delete_file( +/*============*/ + const char* ibd_name) /*!< in: filepath of the ibd + tablespace */ +{ + /* Force a delete of any stale .ibd files that are lying around. */ + + ib_logf(IB_LOG_LEVEL_INFO, "Deleting %s", ibd_name); + + os_file_delete_if_exists(innodb_file_data_key, ibd_name); + + char* cfg_name = fil_make_cfg_name(ibd_name); + + os_file_delete_if_exists(innodb_file_data_key, cfg_name); + + mem_free(cfg_name); +} + +/************************************************************************* +Return local hash table informations. */ + +ulint +fil_system_hash_cells(void) +/*=======================*/ +{ + if (fil_system) { + return (fil_system->spaces->n_cells + + fil_system->name_hash->n_cells); + } else { + return 0; + } +} + +ulint +fil_system_hash_nodes(void) +/*=======================*/ +{ + if (fil_system) { + return (UT_LIST_GET_LEN(fil_system->space_list) + * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE)); + } else { + return 0; + } +} + +/** +Iterate over all the spaces in the space list and fetch the +tablespace names. It will return a copy of the name that must be +freed by the caller using: delete[]. +@return DB_SUCCESS if all OK. 
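+On allocation failure DB_OUT_OF_MEMORY is returned and the names
+collected so far stay in the list for the caller to free.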
*/ +UNIV_INTERN +dberr_t +fil_get_space_names( +/*================*/ + space_name_list_t& space_name_list) + /*!< in/out: List to append to */ +{ + fil_space_t* space; + dberr_t err = DB_SUCCESS; + + mutex_enter(&fil_system->mutex); + + for (space = UT_LIST_GET_FIRST(fil_system->space_list); + space != NULL; + space = UT_LIST_GET_NEXT(space_list, space)) { + + if (space->purpose == FIL_TABLESPACE) { + ulint len; + char* name; + + len = strlen(space->name); + name = new(std::nothrow) char[len + 1]; + + if (name == 0) { + /* Caller to free elements allocated so far. */ + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(name, space->name, len); + name[len] = 0; + + space_name_list.push_back(name); + } + } + + mutex_exit(&fil_system->mutex); + + return(err); +} + +/****************************************************************//** +Generate redo logs for swapping two .ibd files */ +UNIV_INTERN +void +fil_mtr_rename_log( +/*===============*/ + ulint old_space_id, /*!< in: tablespace id of the old + table. */ + const char* old_name, /*!< in: old table name */ + ulint new_space_id, /*!< in: tablespace id of the new + table */ + const char* new_name, /*!< in: new table name */ + const char* tmp_name, /*!< in: temp table name used while + swapping */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (old_space_id != TRX_SYS_SPACE) { + fil_op_write_log(MLOG_FILE_RENAME, old_space_id, + 0, 0, old_name, tmp_name, mtr); + } + + if (new_space_id != TRX_SYS_SPACE) { + fil_op_write_log(MLOG_FILE_RENAME, new_space_id, + 0, 0, new_name, old_name, mtr); + } +} + +/************************************************************************* +functions to access is_corrupt flag of fil_space_t*/ + +ibool +fil_space_is_corrupt( +/*=================*/ + ulint space_id) +{ + fil_space_t* space; + ibool ret = FALSE; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(space_id); + + if (UNIV_UNLIKELY(space && space->is_corrupt)) { + ret = TRUE; + } + + mutex_exit(&fil_system->mutex); + + return(ret); +} + +void +fil_space_set_corrupt( +/*==================*/ + ulint space_id) +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(space_id); + + if (space) { + space->is_corrupt = TRUE; + } + + mutex_exit(&fil_system->mutex); +} diff --git a/storage/xtradb/fsp/fsp0fsp.cc b/storage/xtradb/fsp/fsp0fsp.cc new file mode 100644 index 00000000000..1993cdf2b7c --- /dev/null +++ b/storage/xtradb/fsp/fsp0fsp.cc @@ -0,0 +1,4154 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fsp/fsp0fsp.cc +File space management + +Created 11/29/1995 Heikki Tuuri +***********************************************************************/ + +#include "fsp0fsp.h" + +#ifdef UNIV_NONINL +#include "fsp0fsp.ic" +#endif + +#include "buf0buf.h" +#include "fil0fil.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "page0page.h" +#include "page0zip.h" +#ifdef UNIV_HOTBACKUP +# include "fut0lst.h" +#else /* UNIV_HOTBACKUP */ +# include "sync0sync.h" +# include "fut0fut.h" +# include "srv0srv.h" +# include "ibuf0ibuf.h" +# include "btr0btr.h" +# include "btr0sea.h" +# include "dict0boot.h" +# include "log0log.h" +#endif /* UNIV_HOTBACKUP */ +#include "dict0mem.h" +#include "srv0start.h" + + +#ifndef UNIV_HOTBACKUP +/** Flag to indicate if we have printed the tablespace full error. */ +static ibool fsp_tbs_full_error_printed = FALSE; + +/**********************************************************************//** +Returns an extent to the free list of a space. */ +static +void +fsp_free_extent( +/*============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /*!< in: page offset in the extent */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Frees an extent of a segment to the space free list. */ +static +void +fseg_free_extent( +/*=============*/ + fseg_inode_t* seg_inode, /*!< in: segment inode */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /*!< in: page offset in the extent */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how +many pages are currently used. +@return number of reserved pages */ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + fseg_inode_t* header, /*!< in: segment inode */ + ulint* used, /*!< out: number of pages used (not + more than reserved) */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/********************************************************************//** +Marks a page used. The page must reside within the extents of the given +segment. */ +static __attribute__((nonnull)) +void +fseg_mark_page_used( +/*================*/ + fseg_inode_t* seg_inode,/*!< in: segment inode */ + ulint page, /*!< in: page offset */ + xdes_t* descr, /*!< in: extent descriptor */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Returns the first extent descriptor for a segment. We think of the extent +lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL +-> FSEG_FREE. 
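+Thus the head of FSEG_FULL is returned if that list is non-empty,
+otherwise the head of FSEG_NOT_FULL, otherwise the head of FSEG_FREE.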
+@return the first extent descriptor, or NULL if none */ +static +xdes_t* +fseg_get_first_extent( +/*==================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Puts new extents to the free list if +there are free extents above the free limit. If an extent happens +to contain an extent descriptor page, the extent is put to +the FSP_FREE_FRAG list with the page marked as used. */ +static +void +fsp_fill_free_list( +/*===============*/ + ibool init_space, /*!< in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /*!< in: space */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + UNIV_COLD __attribute__((nonnull)); +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@retval NULL if no page could be allocated +@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded +(init_mtr == mtr, or the page was not previously freed in mtr) +@retval block (not allocated or initialized) otherwise */ +static +buf_block_t* +fseg_alloc_free_page_low( +/*=====================*/ + ulint space, /*!< in: space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + fseg_inode_t* seg_inode, /*!< in/out: segment inode */ + ulint hint, /*!< in: hint of which page would be + desirable */ + byte direction, /*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. + If init_mtr!=mtr, but the page is already + latched in mtr, do not initialize the page. */ + __attribute__((warn_unused_result, nonnull)); +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Reads the file space size stored in the header page. +@return tablespace size stored in the space header */ +UNIV_INTERN +ulint +fsp_get_size_low( +/*=============*/ + page_t* page) /*!< in: header page (page 0 in the tablespace) */ +{ + return(mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE)); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Gets a pointer to the space header and x-locks its page. 
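+The header is located at FSP_HEADER_OFFSET on page 0 of the space, and
+the page is latched with RW_X_LATCH via buf_page_get().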
+@return pointer to the space header, page x-locked */ +UNIV_INLINE +fsp_header_t* +fsp_get_space_header( +/*=================*/ + ulint id, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block; + fsp_header_t* header; + + ut_ad(ut_is_2pow(zip_size)); + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + ut_ad(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN); + ut_ad(id || !zip_size); + + block = buf_page_get(id, zip_size, 0, RW_X_LATCH, mtr); + + SRV_CORRUPT_TABLE_CHECK(block, return(0);); + + header = FSP_HEADER_OFFSET + buf_block_get_frame(block); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + ut_ad(id == mach_read_from_4(FSP_SPACE_ID + header)); + ut_ad(zip_size == fsp_flags_get_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + header))); + return(header); +} + +/**********************************************************************//** +Gets a descriptor bit of a page. +@return TRUE if free */ +UNIV_INLINE +ibool +xdes_mtr_get_bit( +/*=============*/ + const xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset, /*!< in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + + return(xdes_get_bit(descr, bit, offset)); +} + +/**********************************************************************//** +Sets a descriptor bit of a page. */ +UNIV_INLINE +void +xdes_set_bit( +/*=========*/ + xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset, /*!< in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ + ibool val, /*!< in: bit value */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint index; + ulint byte_index; + ulint bit_index; + ulint descr_byte; + + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT)); + ut_ad(offset < FSP_EXTENT_SIZE); + + index = bit + XDES_BITS_PER_PAGE * offset; + + byte_index = index / 8; + bit_index = index % 8; + + descr_byte = mtr_read_ulint(descr + XDES_BITMAP + byte_index, + MLOG_1BYTE, mtr); + descr_byte = ut_bit_set_nth(descr_byte, bit_index, val); + + mlog_write_ulint(descr + XDES_BITMAP + byte_index, descr_byte, + MLOG_1BYTE, mtr); +} + +/**********************************************************************//** +Looks for a descriptor bit having the desired value. Starts from hint +and scans upward; at the end of the extent the search is wrapped to +the start of the extent. 
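+For example, with FSP_EXTENT_SIZE == 64 and hint == 60, positions
+60..63 are probed first, then 0..59.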
+@return bit index of the bit, ULINT_UNDEFINED if not found */ +UNIV_INLINE +ulint +xdes_find_bit( +/*==========*/ + xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ibool val, /*!< in: desired bit value */ + ulint hint, /*!< in: hint of which bit position would + be desirable */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint i; + + ut_ad(descr && mtr); + ut_ad(val <= TRUE); + ut_ad(hint < FSP_EXTENT_SIZE); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + for (i = hint; i < FSP_EXTENT_SIZE; i++) { + if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) { + + return(i); + } + } + + for (i = 0; i < hint; i++) { + if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Looks for a descriptor bit having the desired value. Scans the extent in +a direction opposite to xdes_find_bit. +@return bit index of the bit, ULINT_UNDEFINED if not found */ +UNIV_INLINE +ulint +xdes_find_bit_downward( +/*===================*/ + xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ibool val, /*!< in: desired bit value */ + ulint hint, /*!< in: hint of which bit position would + be desirable */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint i; + + ut_ad(descr && mtr); + ut_ad(val <= TRUE); + ut_ad(hint < FSP_EXTENT_SIZE); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + for (i = hint + 1; i > 0; i--) { + if (val == xdes_mtr_get_bit(descr, bit, i - 1, mtr)) { + + return(i - 1); + } + } + + for (i = FSP_EXTENT_SIZE - 1; i > hint; i--) { + if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Returns the number of used pages in a descriptor. +@return number of pages used */ +UNIV_INLINE +ulint +xdes_get_n_used( +/*============*/ + const xdes_t* descr, /*!< in: descriptor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint count = 0; + + ut_ad(descr && mtr); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + for (ulint i = 0; i < FSP_EXTENT_SIZE; ++i) { + if (FALSE == xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) { + count++; + } + } + + return(count); +} + +/**********************************************************************//** +Returns true if extent contains no used pages. +@return TRUE if totally free */ +UNIV_INLINE +ibool +xdes_is_free( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (0 == xdes_get_n_used(descr, mtr)) { + + return(TRUE); + } + + return(FALSE); +} + +/**********************************************************************//** +Returns true if extent contains no free pages. +@return TRUE if full */ +UNIV_INLINE +ibool +xdes_is_full( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) { + + return(TRUE); + } + + return(FALSE); +} + +/**********************************************************************//** +Sets the state of an xdes. 
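
xdes_find_bit() above performs a circular scan: upward from the hint to the end of the extent, then from slot 0 back up to the hint, so a good hint gives locality without ever missing a matching slot. The same control flow in miniature, with 64 standing in for FSP_EXTENT_SIZE (the per-extent page count with 16K pages):

#include <stdbool.h>

#define EXTENT_SIZE	64		/* assumed FSP_EXTENT_SIZE analog */
#define NOT_FOUND	((unsigned) -1)	/* ULINT_UNDEFINED analog */

/* Circular scan for the first slot equal to val, starting at hint. */
static unsigned
find_slot(const bool* slots, bool val, unsigned hint)
{
	unsigned	i;

	for (i = hint; i < EXTENT_SIZE; i++) {
		if (slots[i] == val) {
			return(i);
		}
	}

	for (i = 0; i < hint; i++) {
		if (slots[i] == val) {
			return(i);
		}
	}

	return(NOT_FOUND);
}
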
*/
+UNIV_INLINE
+void
+xdes_set_state(
+/*===========*/
+	xdes_t*	descr,	/*!< in/out: descriptor */
+	ulint	state,	/*!< in: state to set */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(descr && mtr);
+	ut_ad(state >= XDES_FREE);
+	ut_ad(state <= XDES_FSEG);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+	mlog_write_ulint(descr + XDES_STATE, state, MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes.
+@return state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	ulint	state;
+
+	ut_ad(descr && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+	state = mtr_read_ulint(descr + XDES_STATE, MLOG_4BYTES, mtr);
+	ut_ad(state - 1 < XDES_FSEG);
+	return(state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state. */
+UNIV_INLINE
+void
+xdes_init(
+/*======*/
+	xdes_t*	descr,	/*!< in: descriptor */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	ulint	i;
+
+	ut_ad(descr && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+	ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0);
+
+	for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) {
+		mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr);
+	}
+
+	xdes_set_state(descr, XDES_FREE, mtr);
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor of a page. The page where the extent
+descriptor resides is x-locked. This function no longer extends the data
+file.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset is >= the free limit */
+UNIV_INLINE __attribute__((nonnull, warn_unused_result))
+xdes_t*
+xdes_get_descriptor_with_space_hdr(
+/*===============================*/
+	fsp_header_t*	sp_header,	/*!< in/out: space header, x-latched
+					in mtr */
+	ulint		space,		/*!< in: space id */
+	ulint		offset,		/*!< in: page offset; if equal
+					to the free limit, we try to
+					add new extents to the space
+					free list */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	ulint	limit;
+	ulint	size;
+	ulint	zip_size;
+	ulint	descr_page_no;
+	page_t*	descr_page;
+
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET);
+	/* Read free limit and space size */
+	limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
+	size = mach_read_from_4(sp_header + FSP_SIZE);
+	zip_size = fsp_flags_get_zip_size(
+		mach_read_from_4(sp_header + FSP_SPACE_FLAGS));
+
+	if ((offset >= size) || (offset >= limit)) {
+		return(NULL);
+	}
+
+	descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+
+	if (descr_page_no == 0) {
+		/* It is on the space header page */
+
+		descr_page = page_align(sp_header);
+	} else {
+		buf_block_t*	block;
+
+		block = buf_page_get(space, zip_size, descr_page_no,
+				     RW_X_LATCH, mtr);
+		buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+		descr_page = buf_block_get_frame(block);
+	}
+
+	return(descr_page + XDES_ARR_OFFSET
+	       + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset));
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor of a page.
The page where the
+extent descriptor resides is x-locked. If the page offset is equal to
+the free limit of the space, adds new extents from above the free limit
+to the space free list, unless the free limit equals the space size.
+This adding is necessary to make the descriptors defined, as they are
+uninitialized above the free limit.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset exceeds the free limit */
+static __attribute__((nonnull, warn_unused_result))
+xdes_t*
+xdes_get_descriptor(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	offset,	/*!< in: page offset; if equal to the free limit,
+			we try to add new extents to the space free list */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	buf_block_t*	block;
+	fsp_header_t*	sp_header;
+
+	block = buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+
+	SRV_CORRUPT_TABLE_CHECK(block, return(0););
+
+	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+	sp_header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
+	return(xdes_get_descriptor_with_space_hdr(sp_header, space, offset,
+						  mtr));
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor if the file address
+of the descriptor list node is known. The page where the
+extent descriptor resides is x-locked.
+@return pointer to the extent descriptor */
+UNIV_INLINE
+xdes_t*
+xdes_lst_get_descriptor(
+/*====================*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fil_addr_t	lst_node,/*!< in: file address of the list node
+				contained in the descriptor */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	xdes_t*	descr;
+
+	ut_ad(mtr);
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+				MTR_MEMO_X_LOCK));
+	descr = fut_get_ptr(space, zip_size, lst_node, RW_X_LATCH, mtr)
+		- XDES_FLST_NODE;
+
+	return(descr);
+}
+
+/********************************************************************//**
+Returns page offset of the first page in extent described by a descriptor.
+@return offset of the first page in extent */
+UNIV_INLINE
+ulint
+xdes_get_offset(
+/*============*/
+	const xdes_t*	descr)	/*!< in: extent descriptor */
+{
+	ut_ad(descr);
+
+	return(page_get_page_no(page_align(descr))
+	       + ((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE)
+	       * FSP_EXTENT_SIZE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored.
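
xdes_get_descriptor_with_space_hdr() and xdes_get_offset() above are inverse mappings: descriptors are packed into an array on page 0 and on every later page whose number is a multiple of the page size, so both directions are pure arithmetic. A sketch for uncompressed 16K pages, with constants assumed as analogs of UNIV_PAGE_SIZE, FSP_EXTENT_SIZE, and XDES_SIZE:

#include <stdint.h>

#define PAGE_SIZE	16384	/* assumed UNIV_PAGE_SIZE analog */
#define EXTENT_PAGES	64	/* assumed FSP_EXTENT_SIZE analog */
#define XDES_BYTES	40	/* assumed XDES_SIZE analog */

/* Page that stores the descriptor of page_no: page_no rounded down
   to a multiple of the page size (page 0 is the space header page). */
static uint32_t
descriptor_page(uint32_t page_no)
{
	return(page_no - page_no % PAGE_SIZE);
}

/* Index of page_no's descriptor within that page's XDES array. */
static uint32_t
descriptor_index(uint32_t page_no)
{
	return((page_no % PAGE_SIZE) / EXTENT_PAGES);
}

/* Inverse, as in xdes_get_offset(): first page of the extent whose
   descriptor sits arr_offset bytes into the array on xdes_page_no. */
static uint32_t
extent_first_page(uint32_t xdes_page_no, uint32_t arr_offset)
{
	return(xdes_page_no + arr_offset / XDES_BYTES * EXTENT_PAGES);
}

For example, under these assumed sizes the descriptor 120 bytes into the array on XDES page 16384 describes the extent starting at page 16384 + (120 / 40) * 64 = 16576.
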
*/
+static
+void
+fsp_init_file_page_low(
+/*===================*/
+	buf_block_t*	block)	/*!< in: pointer to a page */
+{
+	page_t*		page = buf_block_get_frame(block);
+	page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
+
+#ifndef UNIV_HOTBACKUP
+	block->check_index_page_at_flush = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+	if (page_zip) {
+		memset(page, 0, UNIV_PAGE_SIZE);
+		memset(page_zip->data, 0, page_zip_get_size(page_zip));
+		mach_write_to_4(page + FIL_PAGE_OFFSET,
+				buf_block_get_page_no(block));
+		mach_write_to_4(page
+				+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+				buf_block_get_space(block));
+		memcpy(page_zip->data + FIL_PAGE_OFFSET,
+		       page + FIL_PAGE_OFFSET, 4);
+		memcpy(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+		       page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+		return;
+	}
+
+	memset(page, 0, UNIV_PAGE_SIZE);
+	mach_write_to_4(page + FIL_PAGE_OFFSET, buf_block_get_page_no(block));
+	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			buf_block_get_space(block));
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored. */
+static
+void
+fsp_init_file_page(
+/*===============*/
+	buf_block_t*	block,	/*!< in: pointer to a page */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	fsp_init_file_page_low(block);
+
+	mlog_write_initial_log_record(buf_block_get_frame(block),
+				      MLOG_INIT_FILE_PAGE, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of a file page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr __attribute__((unused)), /*!< in: buffer end */
+	buf_block_t*	block)	/*!< in: block or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	if (block) {
+		fsp_init_file_page_low(block);
+	}
+
+	return(ptr);
+}
+
+/**********************************************************************//**
+Initializes the fsp system. */
+UNIV_INTERN
+void
+fsp_init(void)
+/*==========*/
+{
+	/* UNIV_PAGE_SIZE, and the zip sizes, must be multiples of
+	FSP_EXTENT_SIZE */
+	ut_a(0 == (UNIV_PAGE_SIZE % FSP_EXTENT_SIZE));
+	ut_a(UNIV_PAGE_SIZE);
+
+#if UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX
+# error "UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX != 0"
+#endif
+#if UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN
+# error "UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN != 0"
+#endif
+
+	/* Does nothing at the moment */
+}
+
+/**********************************************************************//**
+Writes the space id and flags to a tablespace header. The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+	page_t*	page,		/*!< in/out: first page in the space */
+	ulint	space_id,	/*!< in: space id */
+	ulint	flags)		/*!< in: tablespace flags (FSP_SPACE_FLAGS) */
+{
+	ut_a(fsp_flags_is_valid(flags));
+
+	mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
+			space_id);
+	mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page,
+			flags);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Initializes the space header of a newly created space and also creates the
+insert buffer tree root if space == 0.
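
fsp_init_file_page_low() above wipes the frame and then restores only the page's identity: its page number and its space id (kept in the FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID field), mirroring both writes into the compressed frame when one exists. A sketch of the uncompressed path, with offsets assumed to match fil0fil.h:

#include <stdint.h>
#include <string.h>

#define PG_SIZE		16384	/* assumed UNIV_PAGE_SIZE analog */
#define OFF_PAGE_NO	4	/* assumed FIL_PAGE_OFFSET analog */
#define OFF_SPACE_ID	34	/* assumed FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID */

/* Big-endian 4-byte write, as mach_write_to_4() does. */
static void
be_write_4(unsigned char* b, uint32_t v)
{
	b[0] = (unsigned char) (v >> 24);
	b[1] = (unsigned char) (v >> 16);
	b[2] = (unsigned char) (v >> 8);
	b[3] = (unsigned char) v;
}

/* Wipe a page image and restore its identity fields. */
static void
init_file_page(unsigned char* page, uint32_t page_no, uint32_t space_id)
{
	memset(page, 0, PG_SIZE);
	be_write_4(page + OFF_PAGE_NO, page_no);
	be_write_4(page + OFF_SPACE_ID, space_id);
}
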
*/
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+	ulint	space,	/*!< in: space id */
+	ulint	size,	/*!< in: current size in blocks */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	fsp_header_t*	header;
+	buf_block_t*	block;
+	page_t*		page;
+	ulint		flags;
+	ulint		zip_size;
+
+	ut_ad(mtr);
+
+	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+	zip_size = fsp_flags_get_zip_size(flags);
+	block = buf_page_create(space, 0, zip_size, mtr);
+	buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+	/* The prior contents of the file page should be ignored */
+
+	fsp_init_file_page(block, mtr);
+	page = buf_block_get_frame(block);
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR,
+			 MLOG_2BYTES, mtr);
+
+	header = FSP_HEADER_OFFSET + page;
+
+	mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_NOT_USED, 0, MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_SPACE_FLAGS, flags,
+			 MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_FRAG_N_USED, 0, MLOG_4BYTES, mtr);
+
+	flst_init(header + FSP_FREE, mtr);
+	flst_init(header + FSP_FREE_FRAG, mtr);
+	flst_init(header + FSP_FULL_FRAG, mtr);
+	flst_init(header + FSP_SEG_INODES_FULL, mtr);
+	flst_init(header + FSP_SEG_INODES_FREE, mtr);
+
+	mlog_write_ull(header + FSP_SEG_ID, 1, mtr);
+	if (space == 0) {
+		fsp_fill_free_list(FALSE, space, header, mtr);
+		btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
+			   0, 0, DICT_IBUF_ID_MIN + space,
+			   dict_ind_redundant, mtr);
+	} else {
+		fsp_fill_free_list(TRUE, space, header, mtr);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return space id, ULINT_UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+	const page_t*	page)	/*!< in: first page of a tablespace */
+{
+	ulint	fsp_id;
+	ulint	id;
+
+	fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID);
+
+	id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+	DBUG_EXECUTE_IF("fsp_header_get_space_id_failure",
+			id = ULINT_UNDEFINED;);
+
+	if (id != fsp_id) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Space id in fsp header %lu, but in the page header "
+			"%lu", fsp_id, id);
+
+		return(ULINT_UNDEFINED);
+	}
+
+	return(id);
+}
+
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+	const page_t*	page)	/*!< in: first page of a tablespace */
+{
+	ut_ad(!page_offset(page));
+
+	return(mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page));
+}
+
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+	const page_t*	page)	/*!< in: first page of a tablespace */
+{
+	ulint	flags = fsp_header_get_flags(page);
+
+	return(fsp_flags_get_zip_size(flags));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Increases the space size field of a space.
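
fsp_header_get_space_id() above cross-checks the two stored copies of the space id, one in the FSP header and one in the FIL page header, and returns ULINT_UNDEFINED on any mismatch rather than guessing. The shape of that defensive check, in isolation:

#include <stdint.h>

#define ID_UNDEFINED	UINT32_MAX	/* ULINT_UNDEFINED analog */

/* Trust the space id only if both on-page copies agree; the real
   function also logs the conflicting values. */
static uint32_t
checked_space_id(uint32_t fsp_header_id, uint32_t fil_header_id)
{
	return(fsp_header_id == fil_header_id
	       ? fsp_header_id : ID_UNDEFINED);
}
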
*/ +UNIV_INTERN +void +fsp_header_inc_size( +/*================*/ + ulint space, /*!< in: space id */ + ulint size_inc, /*!< in: size increment in pages */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fsp_header_t* header; + ulint size; + ulint flags; + + ut_ad(mtr); + + mtr_x_lock(fil_space_get_latch(space, &flags), mtr); + + header = fsp_get_space_header(space, + fsp_flags_get_zip_size(flags), + mtr); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + + mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES, + mtr); +} + +/**********************************************************************//** +Gets the size of the system tablespace from the tablespace header. If +we do not have an auto-extending data file, this should be equal to +the size of the data files. If there is an auto-extending data file, +this can be smaller. +@return size in pages */ +UNIV_INTERN +ulint +fsp_header_get_tablespace_size(void) +/*================================*/ +{ + fsp_header_t* header; + ulint size; + mtr_t mtr; + + mtr_start(&mtr); + + mtr_x_lock(fil_space_get_latch(0, NULL), &mtr); + + header = fsp_get_space_header(0, 0, &mtr); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr); + + mtr_commit(&mtr); + + return(size); +} + +/***********************************************************************//** +Tries to extend a single-table tablespace so that a page would fit in the +data file. +@return TRUE if success */ +static UNIV_COLD __attribute__((nonnull, warn_unused_result)) +ibool +fsp_try_extend_data_file_with_pages( +/*================================*/ + ulint space, /*!< in: space */ + ulint page_no, /*!< in: page number */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ibool success; + ulint actual_size; + ulint size; + + ut_a(space != 0); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + + ut_a(page_no >= size); + + success = fil_extend_space_to_desired_size(&actual_size, space, + page_no + 1); + /* actual_size now has the space size in pages; it may be less than + we wanted if we ran out of disk space */ + + mlog_write_ulint(header + FSP_SIZE, actual_size, MLOG_4BYTES, mtr); + + return(success); +} + +/***********************************************************************//** +Tries to extend the last data file of a tablespace if it is auto-extending. +@return FALSE if not auto-extending */ +static UNIV_COLD __attribute__((nonnull)) +ibool +fsp_try_extend_data_file( +/*=====================*/ + ulint* actual_increase,/*!< out: actual increase in pages, where + we measure the tablespace size from + what the header field says; it may be + the actual file size rounded down to + megabyte */ + ulint space, /*!< in: space */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint size; + ulint zip_size; + ulint new_size; + ulint old_size; + ulint size_increase; + ulint actual_size; + ibool success; + + *actual_increase = 0; + + if (space == 0 && !srv_auto_extend_last_data_file) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to FALSE as dealing with this + error requires server restart. 
*/ + if (fsp_tbs_full_error_printed == FALSE) { + fprintf(stderr, + "InnoDB: Error: Data file(s) ran" + " out of space.\n" + "Please add another data file or" + " use \'autoextend\' for the last" + " data file.\n"); + fsp_tbs_full_error_printed = TRUE; + } + return(FALSE); + } + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + zip_size = fsp_flags_get_zip_size( + mach_read_from_4(header + FSP_SPACE_FLAGS)); + + old_size = size; + + if (space == 0) { + if (!srv_last_file_size_max) { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } else { + if (srv_last_file_size_max + < srv_data_file_sizes[srv_n_data_files - 1]) { + + fprintf(stderr, + "InnoDB: Error: Last data file size" + " is %lu, max size allowed %lu\n", + (ulong) srv_data_file_sizes[ + srv_n_data_files - 1], + (ulong) srv_last_file_size_max); + } + + size_increase = srv_last_file_size_max + - srv_data_file_sizes[srv_n_data_files - 1]; + if (size_increase > SRV_AUTO_EXTEND_INCREMENT) { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } + } + } else { + /* We extend single-table tablespaces first one extent + at a time, but for bigger tablespaces more. It is not + enough to extend always by one extent, because some + extents are frag page extents. */ + ulint extent_size; /*!< one megabyte, in pages */ + + if (!zip_size) { + extent_size = FSP_EXTENT_SIZE; + } else { + extent_size = FSP_EXTENT_SIZE + * UNIV_PAGE_SIZE / zip_size; + } + + if (size < extent_size) { + /* Let us first extend the file to extent_size */ + success = fsp_try_extend_data_file_with_pages( + space, extent_size - 1, header, mtr); + if (!success) { + new_size = mtr_read_ulint(header + FSP_SIZE, + MLOG_4BYTES, mtr); + + *actual_increase = new_size - old_size; + + return(FALSE); + } + + size = extent_size; + } + + if (size < 32 * extent_size) { + size_increase = extent_size; + } else { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + size_increase = FSP_FREE_ADD * extent_size; + } + } + + if (size_increase == 0) { + + return(TRUE); + } + + success = fil_extend_space_to_desired_size(&actual_size, space, + size + size_increase); + if (!success) { + + return(false); + } + + /* We ignore any fragments of a full megabyte when storing the size + to the space header */ + + if (!zip_size) { + new_size = ut_calc_align_down(actual_size, + (1024 * 1024) / UNIV_PAGE_SIZE); + } else { + new_size = ut_calc_align_down(actual_size, + (1024 * 1024) / zip_size); + } + mlog_write_ulint(header + FSP_SIZE, new_size, MLOG_4BYTES, mtr); + + *actual_increase = new_size - old_size; + + return(TRUE); +} + +/**********************************************************************//** +Puts new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. 
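
The sizing policy in fsp_try_extend_data_file() above: a small .ibd file is first grown to one megabyte (one extent, rescaled by the zip size for compressed spaces), then by one extent per call, and by FSP_FREE_ADD extents per call once it exceeds 32 extents. A sketch of just that computation, with constants assumed from the surrounding code:

#define EXTENT_PAGES	64	/* assumed FSP_EXTENT_SIZE analog */
#define FREE_ADD	4	/* assumed FSP_FREE_ADD analog */
#define PG_SIZE		16384	/* assumed UNIV_PAGE_SIZE analog */

/* Pages to add to a single-table tablespace currently holding
   current_size pages; zip_size is the compressed page size or 0. */
static unsigned
ibd_size_increase(unsigned current_size, unsigned zip_size)
{
	unsigned	extent = zip_size
		? EXTENT_PAGES * (PG_SIZE / zip_size)
		: EXTENT_PAGES;

	if (current_size < extent) {
		return(extent - current_size);	/* reach one megabyte */
	}

	return(current_size < 32 * extent ? extent : FREE_ADD * extent);
}

For an 8K compressed space, for instance, the extent unit becomes 64 * (16384 / 8192) = 128 pages, so each step still covers one megabyte of file space.
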
*/ +static +void +fsp_fill_free_list( +/*===============*/ + ibool init_space, /*!< in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /*!< in: space */ + fsp_header_t* header, /*!< in/out: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint limit; + ulint size; + ulint zip_size; + xdes_t* descr; + ulint count = 0; + ulint frag_n_used; + ulint actual_increase; + ulint i; + mtr_t ibuf_mtr; + + ut_ad(header && mtr); + ut_ad(page_offset(header) == FSP_HEADER_OFFSET); + + /* Check if we can fill free list from above the free list limit */ + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr); + + zip_size = fsp_flags_get_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + header)); + ut_a(ut_is_2pow(zip_size)); + ut_a(zip_size <= UNIV_ZIP_SIZE_MAX); + ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN); + + if (space == 0 && srv_auto_extend_last_data_file + && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { + + /* Try to increase the last data file size */ + fsp_try_extend_data_file(&actual_increase, space, header, mtr); + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + } + + if (space != 0 && !init_space + && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { + + /* Try to increase the .ibd file size */ + fsp_try_extend_data_file(&actual_increase, space, header, mtr); + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + } + + i = limit; + + while ((init_space && i < 1) + || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) { + + ibool init_xdes; + if (zip_size) { + init_xdes = ut_2pow_remainder(i, zip_size) == 0; + } else { + init_xdes = ut_2pow_remainder(i, UNIV_PAGE_SIZE) == 0; + } + + mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE, + MLOG_4BYTES, mtr); + + if (UNIV_UNLIKELY(init_xdes)) { + + buf_block_t* block; + + /* We are going to initialize a new descriptor page + and a new ibuf bitmap page: the prior contents of the + pages should be ignored. 
*/ + + if (i > 0) { + block = buf_page_create( + space, i, zip_size, mtr); + buf_page_get(space, zip_size, i, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, + SYNC_FSP_PAGE); + + fsp_init_file_page(block, mtr); + mlog_write_ulint(buf_block_get_frame(block) + + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_XDES, + MLOG_2BYTES, mtr); + } + + /* Initialize the ibuf bitmap page in a separate + mini-transaction because it is low in the latching + order, and we must be able to release its latch + before returning from the fsp routine */ + + mtr_start(&ibuf_mtr); + + block = buf_page_create(space, + i + FSP_IBUF_BITMAP_OFFSET, + zip_size, &ibuf_mtr); + buf_page_get(space, zip_size, + i + FSP_IBUF_BITMAP_OFFSET, + RW_X_LATCH, &ibuf_mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + fsp_init_file_page(block, &ibuf_mtr); + + ibuf_bitmap_page_init(block, &ibuf_mtr); + + mtr_commit(&ibuf_mtr); + } + + descr = xdes_get_descriptor_with_space_hdr(header, space, i, + mtr); + xdes_init(descr, mtr); + + if (UNIV_UNLIKELY(init_xdes)) { + + /* The first page in the extent is a descriptor page + and the second is an ibuf bitmap page: mark them + used */ + + xdes_set_bit(descr, XDES_FREE_BIT, 0, FALSE, mtr); + xdes_set_bit(descr, XDES_FREE_BIT, + FSP_IBUF_BITMAP_OFFSET, FALSE, mtr); + xdes_set_state(descr, XDES_FREE_FRAG, mtr); + + flst_add_last(header + FSP_FREE_FRAG, + descr + XDES_FLST_NODE, mtr); + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, + MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used + 2, MLOG_4BYTES, mtr); + } else { + flst_add_last(header + FSP_FREE, + descr + XDES_FLST_NODE, mtr); + count++; + } + + i += FSP_EXTENT_SIZE; + } +} + +/**********************************************************************//** +Allocates a new free extent. +@return extent descriptor, NULL if cannot be allocated */ +static +xdes_t* +fsp_alloc_free_extent( +/*==================*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint hint, /*!< in: hint of which extent would be desirable: any + page offset in the extent goes; the hint must not + be > FSP_FREE_LIMIT */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fsp_header_t* header; + fil_addr_t first; + xdes_t* descr; + + ut_ad(mtr); + + header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr); + + if (descr && (xdes_get_state(descr, mtr) == XDES_FREE)) { + /* Ok, we can take this extent */ + } else { + /* Take the first extent in the free list */ + first = flst_get_first(header + FSP_FREE, mtr); + + if (fil_addr_is_null(first)) { + fsp_fill_free_list(FALSE, space, header, mtr); + + first = flst_get_first(header + FSP_FREE, mtr); + } + + if (fil_addr_is_null(first)) { + + return(NULL); /* No free extents left */ + } + + descr = xdes_lst_get_descriptor(space, zip_size, first, mtr); + } + + flst_remove(header + FSP_FREE, descr + XDES_FLST_NODE, mtr); + + return(descr); +} + +/**********************************************************************//** +Allocates a single free page from a space. 
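
In the fill loop above, an extent whose first page number is a multiple of the page size hosts the XDES page and the ibuf bitmap page, so fsp_fill_free_list() routes it to FSP_FREE_FRAG with two pages pre-marked used, while ordinary extents go to FSP_FREE. The classification in isolation:

enum target { TO_FREE, TO_FREE_FRAG };

/* Where does the extent starting at page i belong, and how many of
   its pages are consumed up front?  page_size is UNIV_PAGE_SIZE or
   the zip size; both are powers of two in InnoDB. */
static enum target
classify_extent(unsigned i, unsigned page_size, unsigned* used_pages)
{
	if (i % page_size == 0) {
		*used_pages = 2;	/* XDES page + ibuf bitmap page */
		return(TO_FREE_FRAG);
	}

	*used_pages = 0;
	return(TO_FREE);
}
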
*/ +static __attribute__((nonnull)) +void +fsp_alloc_from_free_frag( +/*=====================*/ + fsp_header_t* header, /*!< in/out: tablespace header */ + xdes_t* descr, /*!< in/out: extent descriptor */ + ulint bit, /*!< in: slot to allocate in the extent */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint frag_n_used; + + ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG); + ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, bit, mtr)); + xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr); + + /* Update the FRAG_N_USED field */ + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + frag_n_used++; + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES, + mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES, + mtr); + } +} + +/**********************************************************************//** +Gets a buffer block for an allocated page. + +NOTE: If init_mtr != mtr, the block will only be initialized if it was +not previously x-latched. It is assumed that the block has been +x-latched only by mtr, and freed in mtr in that case. + +@return block, initialized if init_mtr==mtr +or rw_lock_x_lock_count(&block->lock) == 1 */ +static +buf_block_t* +fsp_page_create( +/*============*/ + ulint space, /*!< in: space id of the allocated page */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the allocated page */ + mtr_t* mtr, /*!< in: mini-transaction of the allocation */ + mtr_t* init_mtr) /*!< in: mini-transaction for initializing + the page */ +{ + buf_block_t* block + = buf_page_create(space, page_no, zip_size, init_mtr); +#ifdef UNIV_SYNC_DEBUG + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX) + == rw_lock_own(&block->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Mimic buf_page_get(), but avoid the buf_pool->page_hash lookup. */ + rw_lock_x_lock(&block->lock); + mutex_enter(&block->mutex); + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + mutex_exit(&block->mutex); + mtr_memo_push(init_mtr, block, MTR_MEMO_PAGE_X_FIX); + + if (init_mtr == mtr + || rw_lock_get_x_lock_count(&block->lock) == 1) { + + /* Initialize the page, unless it was already + X-latched in mtr. (In this case, we would want to + allocate another page that has not been freed in mtr.) */ + ut_ad(init_mtr == mtr + || !mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + fsp_init_file_page(block, init_mtr); + } + + return(block); +} + +/**********************************************************************//** +Allocates a single free page from a space. The page is marked as used. 
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+static __attribute__((nonnull, warn_unused_result))
+buf_block_t*
+fsp_alloc_free_page(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	hint,	/*!< in: hint of which page would be desirable */
+	mtr_t*	mtr,	/*!< in/out: mini-transaction */
+	mtr_t*	init_mtr)/*!< in/out: mini-transaction in which the
+			page should be initialized
+			(may be the same as mtr) */
+{
+	fsp_header_t*	header;
+	fil_addr_t	first;
+	xdes_t*		descr;
+	ulint		free;
+	ulint		page_no;
+	ulint		space_size;
+
+	ut_ad(mtr);
+	ut_ad(init_mtr);
+
+	header = fsp_get_space_header(space, zip_size, mtr);
+
+	/* Get the hinted descriptor */
+	descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+
+	if (descr && (xdes_get_state(descr, mtr) == XDES_FREE_FRAG)) {
+		/* Ok, we can take this extent */
+	} else {
+		/* Else take the first extent in free_frag list */
+		first = flst_get_first(header + FSP_FREE_FRAG, mtr);
+
+		if (fil_addr_is_null(first)) {
+			/* There are no partially full fragments: allocate
+			a free extent and add it to the FREE_FRAG list. NOTE
+			that the allocation may have as a side-effect that an
+			extent containing a descriptor page is added to the
+			FREE_FRAG list. But we will allocate our page from the
+			free extent anyway. */
+
+			descr = fsp_alloc_free_extent(space, zip_size,
+						      hint, mtr);
+
+			if (descr == NULL) {
+				/* No free space left */
+
+				return(NULL);
+			}
+
+			xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+			flst_add_last(header + FSP_FREE_FRAG,
+				      descr + XDES_FLST_NODE, mtr);
+		} else {
+			descr = xdes_lst_get_descriptor(space, zip_size,
+							first, mtr);
+		}
+
+		/* Reset the hint */
+		hint = 0;
+	}
+
+	/* Now we have in descr an extent with at least one free page. Look
+	for a free page in the extent. */
+
+	free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
+			     hint % FSP_EXTENT_SIZE, mtr);
+	if (free == ULINT_UNDEFINED) {
+
+		ut_print_buf(stderr, ((byte*) descr) - 500, 1000);
+		putc('\n', stderr);
+
+		ut_error;
+	}
+
+	page_no = xdes_get_offset(descr) + free;
+
+	space_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	if (space_size <= page_no) {
+		/* It must be that we are extending a single-table tablespace
+		whose size is still < 64 pages */
+
+		ut_a(space != 0);
+		if (page_no >= FSP_EXTENT_SIZE) {
+			fprintf(stderr,
+				"InnoDB: Error: trying to extend a"
+				" single-table tablespace %lu\n"
+				"InnoDB: by single page(s) past the"
+				" space size %lu. Page no %lu.\n",
+				(ulong) space, (ulong) space_size,
+				(ulong) page_no);
+			return(NULL);
+		}
+		if (!fsp_try_extend_data_file_with_pages(space, page_no,
+							 header, mtr)) {
+			/* No disk space left */
+			return(NULL);
+		}
+	}
+
+	fsp_alloc_from_free_frag(header, descr, free, mtr);
+	return(fsp_page_create(space, zip_size, page_no, mtr, init_mtr));
+}
+
+/**********************************************************************//**
+Frees a single page of a space. The page is marked as free and clean.
*/ +static +void +fsp_free_page( +/*==========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /*!< in: page offset */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fsp_header_t* header; + xdes_t* descr; + ulint state; + ulint frag_n_used; + + ut_ad(mtr); + + /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */ + + header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr); + + state = xdes_get_state(descr, mtr); + + if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) { + fprintf(stderr, + "InnoDB: Error: File space extent descriptor" + " of page %lu has state %lu\n", + (ulong) page, + (ulong) state); + fputs("InnoDB: Dump of descriptor: ", stderr); + ut_print_buf(stderr, ((byte*) descr) - 50, 200); + putc('\n', stderr); + /* Crash in debug version, so that we get a core dump + of this corruption. */ + ut_ad(0); + + if (state == XDES_FREE) { + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } + + ut_error; + } + + if (xdes_mtr_get_bit(descr, XDES_FREE_BIT, + page % FSP_EXTENT_SIZE, mtr)) { + + fprintf(stderr, + "InnoDB: Error: File space extent descriptor" + " of page %lu says it is free\n" + "InnoDB: Dump of descriptor: ", (ulong) page); + ut_print_buf(stderr, ((byte*) descr) - 50, 200); + putc('\n', stderr); + /* Crash in debug version, so that we get a core dump + of this corruption. */ + ut_ad(0); + + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } + + xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + if (state == XDES_FULL_FRAG) { + /* The fragment was full: move it to another list */ + flst_remove(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FREE_FRAG, mtr); + flst_add_last(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used + FSP_EXTENT_SIZE - 1, + MLOG_4BYTES, mtr); + } else { + ut_a(frag_n_used > 0); + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used - 1, + MLOG_4BYTES, mtr); + } + + if (xdes_is_free(descr, mtr)) { + /* The extent has become free: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + fsp_free_extent(space, zip_size, page, mtr); + } + + mtr->n_freed_pages++; +} + +/**********************************************************************//** +Returns an extent to the free list of a space. 
*/ +static +void +fsp_free_extent( +/*============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /*!< in: page offset in the extent */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fsp_header_t* header; + xdes_t* descr; + + ut_ad(mtr); + + header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr); + + if (xdes_get_state(descr, mtr) == XDES_FREE) { + + ut_print_buf(stderr, (byte*) descr - 500, 1000); + putc('\n', stderr); + + ut_error; + } + + xdes_init(descr, mtr); + + flst_add_last(header + FSP_FREE, descr + XDES_FLST_NODE, mtr); +} + +/**********************************************************************//** +Returns the nth inode slot on an inode page. +@return segment inode */ +UNIV_INLINE +fseg_inode_t* +fsp_seg_inode_page_get_nth_inode( +/*=============================*/ + page_t* page, /*!< in: segment inode page */ + ulint i, /*!< in: inode index on page */ + ulint zip_size __attribute__((unused)), + /*!< in: compressed page size, or 0 */ + mtr_t* mtr __attribute__((unused))) + /*!< in/out: mini-transaction */ +{ + ut_ad(i < FSP_SEG_INODES_PER_PAGE(zip_size)); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + + return(page + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i); +} + +/**********************************************************************//** +Looks for a used segment inode on a segment inode page. +@return segment inode index, or ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_used( +/*=========================*/ + page_t* page, /*!< in: segment inode page */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint i; + fseg_inode_t* inode; + + for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + + inode = fsp_seg_inode_page_get_nth_inode( + page, i, zip_size, mtr); + + if (mach_read_from_8(inode + FSEG_ID)) { + /* This is used */ + + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Looks for an unused segment inode on a segment inode page. +@return segment inode index, or ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_free( +/*=========================*/ + page_t* page, /*!< in: segment inode page */ + ulint i, /*!< in: search forward starting from this index */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + SRV_CORRUPT_TABLE_CHECK(page, return(ULINT_UNDEFINED);); + + for (; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + + fseg_inode_t* inode; + + inode = fsp_seg_inode_page_get_nth_inode( + page, i, zip_size, mtr); + + if (!mach_read_from_8(inode + FSEG_ID)) { + /* This is unused */ + return(i); + } + + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Allocates a new file segment inode page. 
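
fsp_seg_inode_page_get_nth_inode() above is plain array indexing: inodes start at FSEG_ARR_OFFSET, and FSP_SEG_INODES_PER_PAGE is simply how many FSEG_INODE_SIZE entries fit before the page trailer. With the 16K-page values assumed below (they mirror, but are not guaranteed to match, the real headers), that works out to 85 inodes per page:

#define INODE_SIZE	192	/* assumed FSEG_INODE_SIZE analog */
#define ARR_OFFSET	50	/* assumed FSEG_ARR_OFFSET analog */
#define TRAILER		10	/* bytes assumed reserved at page end */

/* How many segment inodes fit on one inode page. */
static unsigned
inodes_per_page(unsigned page_size)
{
	return((page_size - ARR_OFFSET - TRAILER) / INODE_SIZE);
}

/* Byte offset of the nth inode within the page. */
static unsigned
nth_inode_offset(unsigned n)
{
	return(ARR_OFFSET + n * INODE_SIZE);
}
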
+@return TRUE if could be allocated */ +static +ibool +fsp_alloc_seg_inode_page( +/*=====================*/ + fsp_header_t* space_header, /*!< in: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fseg_inode_t* inode; + buf_block_t* block; + page_t* page; + ulint space; + ulint zip_size; + + ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET); + + space = page_get_space_id(page_align(space_header)); + + zip_size = fsp_flags_get_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + space_header)); + + block = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr); + + if (block == NULL) { + + return(FALSE); + } + + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1); + + block->check_index_page_at_flush = FALSE; + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_INODE, + MLOG_2BYTES, mtr); + + for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + + inode = fsp_seg_inode_page_get_nth_inode( + page, i, zip_size, mtr); + + mlog_write_ull(inode + FSEG_ID, 0, mtr); + } + + flst_add_last( + space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + + return(TRUE); +} + +/**********************************************************************//** +Allocates a new file segment inode. +@return segment inode, or NULL if not enough space */ +static +fseg_inode_t* +fsp_alloc_seg_inode( +/*================*/ + fsp_header_t* space_header, /*!< in: space header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint page_no; + buf_block_t* block; + page_t* page; + fseg_inode_t* inode; + ibool success; + ulint zip_size; + ulint n; + + ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET); + + if (flst_get_len(space_header + FSP_SEG_INODES_FREE, mtr) == 0) { + /* Allocate a new segment inode page */ + + success = fsp_alloc_seg_inode_page(space_header, mtr); + + if (!success) { + + return(NULL); + } + } + + page_no = flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page; + + zip_size = fsp_flags_get_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + space_header)); + block = buf_page_get(page_get_space_id(page_align(space_header)), + zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + page = buf_block_get_frame(block); + + SRV_CORRUPT_TABLE_CHECK(page, return(0);); + + n = fsp_seg_inode_page_find_free(page, 0, zip_size, mtr); + + ut_a(n != ULINT_UNDEFINED); + + inode = fsp_seg_inode_page_get_nth_inode(page, n, zip_size, mtr); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, + zip_size, mtr)) { + /* There are no other unused headers left on the page: move it + to another list */ + + flst_remove(space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + + flst_add_last(space_header + FSP_SEG_INODES_FULL, + page + FSEG_INODE_PAGE_NODE, mtr); + } + + ut_ad(!mach_read_from_8(inode + FSEG_ID) + || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + return(inode); +} + +/**********************************************************************//** +Frees a file segment inode. 
*/ +static +void +fsp_free_seg_inode( +/*===============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + fseg_inode_t* inode, /*!< in: segment inode */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* page; + fsp_header_t* space_header; + + page = page_align(inode); + + space_header = fsp_get_space_header(space, zip_size, mtr); + + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + if (ULINT_UNDEFINED + == fsp_seg_inode_page_find_free(page, 0, zip_size, mtr)) { + + /* Move the page to another list */ + + flst_remove(space_header + FSP_SEG_INODES_FULL, + page + FSEG_INODE_PAGE_NODE, mtr); + + flst_add_last(space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + } + + mlog_write_ull(inode + FSEG_ID, 0, mtr); + mlog_write_ulint(inode + FSEG_MAGIC_N, 0xfa051ce3, MLOG_4BYTES, mtr); + + if (ULINT_UNDEFINED + == fsp_seg_inode_page_find_used(page, zip_size, mtr)) { + + /* There are no other used headers left on the page: free it */ + + flst_remove(space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + + fsp_free_page(space, zip_size, page_get_page_no(page), mtr); + } +} + +/**********************************************************************//** +Returns the file segment inode, page x-latched. +@return segment inode, page x-latched; NULL if the inode is free */ +static +fseg_inode_t* +fseg_inode_try_get( +/*===============*/ + fseg_header_t* header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fil_addr_t inode_addr; + fseg_inode_t* inode; + + inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO); + inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET); + ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE)); + + inode = fut_get_ptr(space, zip_size, inode_addr, RW_X_LATCH, mtr); + + SRV_CORRUPT_TABLE_CHECK(inode, return(0);); + + if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID))) { + + inode = NULL; + } else { + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + } + + return(inode); +} + +/**********************************************************************//** +Returns the file segment inode, page x-latched. +@return segment inode, page x-latched */ +static +fseg_inode_t* +fseg_inode_get( +/*===========*/ + fseg_header_t* header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fseg_inode_t* inode + = fseg_inode_try_get(header, space, zip_size, mtr); + SRV_CORRUPT_TABLE_CHECK(inode, ; /* do nothing */); + return(inode); +} + +/**********************************************************************//** +Gets the page number from the nth fragment page slot. 
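
fseg_inode_try_get() above resolves the (space, page number, byte offset) triple stored in a segment header into a pointer, then treats FSEG_ID == 0 as "this inode slot is free". The indirection in miniature, with an in-memory frame array standing in for the buffer pool and all latching and bounds checks omitted:

#include <stdint.h>

/* File address as stored in a segment header (fil_addr_t analog). */
struct file_addr {
	uint32_t	page;		/* page number */
	uint16_t	boffset;	/* byte offset within the page */
};

/* Resolve the address against in-memory page frames; frames[i] is
   the image of page i of the tablespace. */
static unsigned char*
resolve_addr(unsigned char** frames, struct file_addr addr)
{
	return(frames[addr.page] + addr.boffset);
}
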
+@return page number, FIL_NULL if not in use */ +UNIV_INLINE +ulint +fseg_get_nth_frag_page_no( +/*======================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + ulint n, /*!< in: slot index */ + mtr_t* mtr __attribute__((unused))) + /*!< in/out: mini-transaction */ +{ + ut_ad(inode && mtr); + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + return(mach_read_from_4(inode + FSEG_FRAG_ARR + + n * FSEG_FRAG_SLOT_SIZE)); +} + +/**********************************************************************//** +Sets the page number in the nth fragment page slot. */ +UNIV_INLINE +void +fseg_set_nth_frag_page_no( +/*======================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + ulint n, /*!< in: slot index */ + ulint page_no,/*!< in: page number to set */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(inode && mtr); + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + mlog_write_ulint(inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE, + page_no, MLOG_4BYTES, mtr); +} + +/**********************************************************************//** +Finds a fragment page slot which is free. +@return slot index; ULINT_UNDEFINED if none found */ +static +ulint +fseg_find_free_frag_page_slot( +/*==========================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint i; + ulint page_no; + + ut_ad(inode && mtr); + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no(inode, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Finds a fragment page slot which is used and last in the array. +@return slot index; ULINT_UNDEFINED if none found */ +static +ulint +fseg_find_last_used_frag_page_slot( +/*===============================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint i; + ulint page_no; + + ut_ad(inode && mtr); + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no( + inode, FSEG_FRAG_ARR_N_SLOTS - i - 1, mtr); + + if (page_no != FIL_NULL) { + + return(FSEG_FRAG_ARR_N_SLOTS - i - 1); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Calculates reserved fragment page slots. +@return number of fragment pages */ +static +ulint +fseg_get_n_frag_pages( +/*==================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint i; + ulint count = 0; + + ut_ad(inode && mtr); + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i, mtr)) { + count++; + } + } + + return(count); +} + +/**********************************************************************//** +Creates a new segment. 
+@return the block where the segment header is placed, x-latched, NULL +if could not create segment because of lack of space */ +UNIV_INTERN +buf_block_t* +fseg_create_general( +/*================*/ + ulint space, /*!< in: space id */ + ulint page, /*!< in: page where the segment header is placed: if + this is != 0, the page must belong to another segment, + if this is 0, a new page will be allocated and it + will belong to the created segment */ + ulint byte_offset, /*!< in: byte offset of the created segment header + on the page */ + ibool has_done_reservation, /*!< in: TRUE if the caller has already + done the reservation for the pages with + fsp_reserve_free_extents (at least 2 extents: one for + the inode and the other for the segment) then there is + no need to do the check for this individual + operation */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint flags; + ulint zip_size; + fsp_header_t* space_header; + fseg_inode_t* inode; + ib_id_t seg_id; + buf_block_t* block = 0; /* remove warning */ + fseg_header_t* header = 0; /* remove warning */ + prio_rw_lock_t* latch; + ibool success; + ulint n_reserved; + ulint i; + + ut_ad(mtr); + ut_ad(byte_offset + FSEG_HEADER_SIZE + <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + + latch = fil_space_get_latch(space, &flags); + zip_size = fsp_flags_get_zip_size(flags); + + if (page != 0) { + block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr); + header = byte_offset + buf_block_get_frame(block); + } + + mtr_x_lock(latch, mtr); + + if (rw_lock_get_x_lock_count(latch) == 1) { + /* This thread did not own the latch before this call: free + excess pages from the insert buffer free list */ + + if (space == IBUF_SPACE_ID) { + ibuf_free_excess_pages(); + } + } + + if (!has_done_reservation) { + success = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (!success) { + return(NULL); + } + } + + space_header = fsp_get_space_header(space, zip_size, mtr); + + inode = fsp_alloc_seg_inode(space_header, mtr); + + if (inode == NULL) { + + goto funct_exit; + } + + /* Read the next segment id from space header and increment the + value in space header */ + + seg_id = mach_read_from_8(space_header + FSP_SEG_ID); + + mlog_write_ull(space_header + FSP_SEG_ID, seg_id + 1, mtr); + + mlog_write_ull(inode + FSEG_ID, seg_id, mtr); + mlog_write_ulint(inode + FSEG_NOT_FULL_N_USED, 0, MLOG_4BYTES, mtr); + + flst_init(inode + FSEG_FREE, mtr); + flst_init(inode + FSEG_NOT_FULL, mtr); + flst_init(inode + FSEG_FULL, mtr); + + mlog_write_ulint(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE, + MLOG_4BYTES, mtr); + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + fseg_set_nth_frag_page_no(inode, i, FIL_NULL, mtr); + } + + if (page == 0) { + block = fseg_alloc_free_page_low(space, zip_size, + inode, 0, FSP_UP, mtr, mtr); + + if (block == NULL) { + + fsp_free_seg_inode(space, zip_size, inode, mtr); + + goto funct_exit; + } + + ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1); + + header = byte_offset + buf_block_get_frame(block); + mlog_write_ulint(buf_block_get_frame(block) + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_SYS, MLOG_2BYTES, mtr); + } + + mlog_write_ulint(header + FSEG_HDR_OFFSET, + page_offset(inode), MLOG_2BYTES, mtr); + + mlog_write_ulint(header + FSEG_HDR_PAGE_NO, + page_get_page_no(page_align(inode)), + MLOG_4BYTES, mtr); + + mlog_write_ulint(header + FSEG_HDR_SPACE, space, MLOG_4BYTES, mtr); + +funct_exit: + if (!has_done_reservation) { + + fil_space_release_free_extents(space, n_reserved); + } + + return(block); +} + 
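
For reference, a hedged sketch of how a caller can drive fseg_create_general() with its own reservation, so the function skips the per-call check. The calls mirror those visible in this file, but the sketch assumes the InnoDB internal headers are in scope and that space and byte_offset are declared by the caller:

	mtr_t	mtr;
	ulint	n_reserved;

	mtr_start(&mtr);

	if (fsp_reserve_free_extents(&n_reserved, space, 2,
				     FSP_NORMAL, &mtr)) {
		/* Page 0 means: allocate a new page for the header. */
		buf_block_t*	block = fseg_create_general(
			space, 0, byte_offset,
			TRUE /* reservation already done */, &mtr);

		if (block == NULL) {
			/* Out of space despite the reservation. */
		}

		fil_space_release_free_extents(space, n_reserved);
	}

	mtr_commit(&mtr);
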
+/**********************************************************************//** +Creates a new segment. +@return the block where the segment header is placed, x-latched, NULL +if could not create segment because of lack of space */ +UNIV_INTERN +buf_block_t* +fseg_create( +/*========*/ + ulint space, /*!< in: space id */ + ulint page, /*!< in: page where the segment header is placed: if + this is != 0, the page must belong to another segment, + if this is 0, a new page will be allocated and it + will belong to the created segment */ + ulint byte_offset, /*!< in: byte offset of the created segment header + on the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + return(fseg_create_general(space, page, byte_offset, FALSE, mtr)); +} + +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. +@return number of reserved pages */ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + ulint* used, /*!< out: number of pages used (not + more than reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint ret; + + ut_ad(inode && used && mtr); + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + + *used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr) + + fseg_get_n_frag_pages(inode, mtr); + + ret = fseg_get_n_frag_pages(inode, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr); + + return(ret); +} + +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. +@return number of reserved pages */ +UNIV_INTERN +ulint +fseg_n_reserved_pages( +/*==================*/ + fseg_header_t* header, /*!< in: segment header */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint ret; + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + prio_rw_lock_t* latch; + + space = page_get_space_id(page_align(header)); + latch = fil_space_get_latch(space, &flags); + zip_size = fsp_flags_get_zip_size(flags); + + mtr_x_lock(latch, mtr); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + ret = fseg_n_reserved_pages_low(inode, used, mtr); + + return(ret); +} + +/*********************************************************************//** +Tries to fill the free list of a segment with consecutive free extents. +This happens if the segment is big enough to allow extents in the free list, +the free list is empty, and the extents can be allocated consecutively from +the hint onward. 
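
The bookkeeping in fseg_n_reserved_pages_low() above reads as: reserved = frag pages + FSP_EXTENT_SIZE * (len(FSEG_FREE) + len(FSEG_NOT_FULL) + len(FSEG_FULL)), while used = FSEG_NOT_FULL_N_USED + FSP_EXTENT_SIZE * len(FSEG_FULL) + frag pages. As a worked example with 64-page extents: a segment with 3 fragment pages, 1 full extent, 2 not-full extents carrying 70 used pages between them, and 1 free extent reserves 3 + 64 * (1 + 2 + 1) = 259 pages, of which 70 + 64 + 3 = 137 are used.
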
*/
+static
+void
+fseg_fill_free_list(
+/*================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		hint,	/*!< in: hint which extent would be good as
+				the first extent */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	xdes_t*	descr;
+	ulint	i;
+	ib_id_t	seg_id;
+	ulint	reserved;
+	ulint	used;
+
+	ut_ad(inode && mtr);
+	ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+	reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+
+	if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) {
+
+		/* The segment is too small to allow extents in free list */
+
+		return;
+	}
+
+	if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+		/* Free list is not empty */
+
+		return;
+	}
+
+	for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) {
+		descr = xdes_get_descriptor(space, zip_size, hint, mtr);
+
+		if ((descr == NULL)
+		    || (XDES_FREE != xdes_get_state(descr, mtr))) {
+
+			/* We cannot allocate the desired extent: stop */
+
+			return;
+		}
+
+		descr = fsp_alloc_free_extent(space, zip_size, hint, mtr);
+
+		xdes_set_state(descr, XDES_FSEG, mtr);
+
+		seg_id = mach_read_from_8(inode + FSEG_ID);
+		ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+		      == FSEG_MAGIC_N_VALUE);
+		mlog_write_ull(descr + XDES_ID, seg_id, mtr);
+
+		flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+		hint += FSP_EXTENT_SIZE;
+	}
+}
+
+/*********************************************************************//**
+Allocates a free extent for the segment: looks first in the free list of the
+segment, then tries to allocate from the space free list. NOTE that the extent
+returned still resides in the segment free list, it is not yet taken off it!
+@return the allocated extent descriptor, still in the segment free list;
+NULL if no extent could be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+/*===================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	xdes_t*		descr;
+	ib_id_t		seg_id;
+	fil_addr_t	first;
+
+	ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+	if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+		/* Segment free list is not empty, allocate from it */
+
+		first = flst_get_first(inode + FSEG_FREE, mtr);
+
+		descr = xdes_lst_get_descriptor(space, zip_size, first, mtr);
+	} else {
+		/* Segment free list was empty, allocate from space */
+		descr = fsp_alloc_free_extent(space, zip_size, 0, mtr);
+
+		if (descr == NULL) {
+
+			return(NULL);
+		}
+
+		seg_id = mach_read_from_8(inode + FSEG_ID);
+
+		xdes_set_state(descr, XDES_FSEG, mtr);
+		mlog_write_ull(descr + XDES_ID, seg_id, mtr);
+		flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+
+		/* Try to fill the segment free list */
+		fseg_fill_free_list(inode, space, zip_size,
+				    xdes_get_offset(descr) + FSP_EXTENT_SIZE,
+				    mtr);
+	}
+
+	return(descr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated +@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded +(init_mtr == mtr, or the page was not previously freed in mtr) +@retval block (not allocated or initialized) otherwise */ +static +buf_block_t* +fseg_alloc_free_page_low( +/*=====================*/ + ulint space, /*!< in: space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + fseg_inode_t* seg_inode, /*!< in/out: segment inode */ + ulint hint, /*!< in: hint of which page would be + desirable */ + byte direction, /*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. + If init_mtr!=mtr, but the page is already + latched in mtr, do not initialize the page. */ +{ + fsp_header_t* space_header; + ulint space_size; + ib_id_t seg_id; + ulint used; + ulint reserved; + xdes_t* descr; /*!< extent of the hinted page */ + ulint ret_page; /*!< the allocated page offset, FIL_NULL + if could not be allocated */ + xdes_t* ret_descr; /*!< the extent of the allocated page */ + ibool success; + ulint n; + + ut_ad(mtr); + ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR)); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + seg_id = mach_read_from_8(seg_inode + FSEG_ID); + + ut_ad(seg_id); + + reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr); + + space_header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(space_header, space, + hint, mtr); + if (descr == NULL) { + /* Hint outside space or too high above free limit: reset + hint */ + /* The file space header page is always allocated. */ + hint = 0; + descr = xdes_get_descriptor(space, zip_size, hint, mtr); + } + + /* In the big if-else below we look for ret_page and ret_descr */ + /*-------------------------------------------------------------*/ + if ((xdes_get_state(descr, mtr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && (xdes_mtr_get_bit(descr, XDES_FREE_BIT, + hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { +take_hinted_page: + /* 1. We can take the hinted page + =================================*/ + ret_descr = descr; + ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; + /*-----------------------------------------------------------*/ + } else if (xdes_get_state(descr, mtr) == XDES_FREE + && reserved - used < reserved / FSEG_FILLFACTOR + && used >= FSEG_FRAG_LIMIT) { + + /* 2. 
We allocate the free extent from space and can take + ========================================================= + the hinted page + ===============*/ + ret_descr = fsp_alloc_free_extent(space, zip_size, hint, mtr); + + ut_a(ret_descr == descr); + + xdes_set_state(ret_descr, XDES_FSEG, mtr); + mlog_write_ull(ret_descr + XDES_ID, seg_id, mtr); + flst_add_last(seg_inode + FSEG_FREE, + ret_descr + XDES_FLST_NODE, mtr); + + /* Try to fill the segment free list */ + fseg_fill_free_list(seg_inode, space, zip_size, + hint + FSP_EXTENT_SIZE, mtr); + goto take_hinted_page; + /*-----------------------------------------------------------*/ + } else if ((direction != FSP_NO_DIR) + && ((reserved - used) < reserved / FSEG_FILLFACTOR) + && (used >= FSEG_FRAG_LIMIT) + && (!!(ret_descr + = fseg_alloc_free_extent(seg_inode, + space, zip_size, mtr)))) { + + /* 3. We take any free extent (which was already assigned above + =============================================================== + in the if-condition to ret_descr) and take the lowest or + ======================================================== + highest page in it, depending on the direction + ==============================================*/ + ret_page = xdes_get_offset(ret_descr); + + if (direction == FSP_DOWN) { + ret_page += FSP_EXTENT_SIZE - 1; + } + /*-----------------------------------------------------------*/ + } else if ((xdes_get_state(descr, mtr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && (!xdes_is_full(descr, mtr))) { + + /* 4. We can take the page from the same extent as the + ====================================================== + hinted page (and the extent already belongs to the + ================================================== + segment) + ========*/ + ret_descr = descr; + ret_page = xdes_get_offset(ret_descr) + + xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE, + hint % FSP_EXTENT_SIZE, mtr); + /*-----------------------------------------------------------*/ + } else if (reserved - used > 0) { + /* 5. We take any unused page from the segment + ==============================================*/ + fil_addr_t first; + + if (flst_get_len(seg_inode + FSEG_NOT_FULL, mtr) > 0) { + first = flst_get_first(seg_inode + FSEG_NOT_FULL, + mtr); + } else if (flst_get_len(seg_inode + FSEG_FREE, mtr) > 0) { + first = flst_get_first(seg_inode + FSEG_FREE, mtr); + } else { + ut_error; + return(NULL); + } + + ret_descr = xdes_lst_get_descriptor(space, zip_size, + first, mtr); + ret_page = xdes_get_offset(ret_descr) + + xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE, + 0, mtr); + /*-----------------------------------------------------------*/ + } else if (used < FSEG_FRAG_LIMIT) { + /* 6. We allocate an individual page from the space + ===================================================*/ + buf_block_t* block = fsp_alloc_free_page( + space, zip_size, hint, mtr, init_mtr); + + if (block != NULL) { + /* Put the page in the fragment page array of the + segment */ + n = fseg_find_free_frag_page_slot(seg_inode, mtr); + ut_a(n != ULINT_UNDEFINED); + + fseg_set_nth_frag_page_no( + seg_inode, n, buf_block_get_page_no(block), + mtr); + } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(block); + /*-----------------------------------------------------------*/ + } else { + /* 7. 
We allocate a new extent and take its first page + ======================================================*/ + ret_descr = fseg_alloc_free_extent(seg_inode, + space, zip_size, mtr); + + if (ret_descr == NULL) { + ret_page = FIL_NULL; + } else { + ret_page = xdes_get_offset(ret_descr); + } + } + + if (ret_page == FIL_NULL) { + /* Page could not be allocated */ + + return(NULL); + } + + if (space != 0) { + space_size = fil_space_get_size(space); + + if (space_size <= ret_page) { + /* It must be that we are extending a single-table + tablespace whose size is still < 64 pages */ + + if (ret_page >= FSP_EXTENT_SIZE) { + fprintf(stderr, + "InnoDB: Error (2): trying to extend" + " a single-table tablespace %lu\n" + "InnoDB: by single page(s) though" + " the space size %lu. Page no %lu.\n", + (ulong) space, (ulong) space_size, + (ulong) ret_page); + return(NULL); + } + + success = fsp_try_extend_data_file_with_pages( + space, ret_page, space_header, mtr); + if (!success) { + /* No disk space left */ + return(NULL); + } + } + } + +got_hinted_page: + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { + /* At this point we know the extent and the page offset. + The extent is still in the appropriate list (FSEG_NOT_FULL + or FSEG_FREE), and the page is not yet marked as used. */ + + ut_ad(xdes_get_descriptor(space, zip_size, ret_page, mtr) + == ret_descr); + + ut_ad(xdes_mtr_get_bit( + ret_descr, XDES_FREE_BIT, + ret_page % FSP_EXTENT_SIZE, mtr)); + + fseg_mark_page_used(seg_inode, ret_page, ret_descr, mtr); + } + + return(fsp_page_create( + space, fsp_flags_get_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + + space_header)), + ret_page, mtr, init_mtr)); +} + +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@retval NULL if no page could be allocated +@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded +(init_mtr == mtr, or the page was not previously freed in mtr) +@retval block (not allocated or initialized) otherwise */ +UNIV_INTERN +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + ulint hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + ibool has_done_reservation, /*!< in: TRUE if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. + If init_mtr!=mtr, but the page is already + latched in mtr, do not initialize the page. 
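+				(This happens when the page was freed
+				earlier within mtr and is being
+				reallocated; its initialization is then
+				left to the caller, in the
+				mini-transaction that actually
+				modifies it.)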
*/
+{
+	fseg_inode_t*	inode;
+	ulint		space;
+	ulint		flags;
+	ulint		zip_size;
+	prio_rw_lock_t*	latch;
+	buf_block_t*	block;
+	ulint		n_reserved;
+
+	space = page_get_space_id(page_align(seg_header));
+
+	latch = fil_space_get_latch(space, &flags);
+
+	zip_size = fsp_flags_get_zip_size(flags);
+
+	mtr_x_lock(latch, mtr);
+
+	if (rw_lock_get_x_lock_count(latch) == 1) {
+		/* This thread did not own the latch before this call: free
+		excess pages from the insert buffer free list */
+
+		if (space == IBUF_SPACE_ID) {
+			ibuf_free_excess_pages();
+		}
+	}
+
+	inode = fseg_inode_get(seg_header, space, zip_size, mtr);
+
+	if (!has_done_reservation
+	    && !fsp_reserve_free_extents(&n_reserved, space, 2,
+					 FSP_NORMAL, mtr)) {
+		return(NULL);
+	}
+
+	block = fseg_alloc_free_page_low(space, zip_size,
+					 inode, hint, direction,
+					 mtr, init_mtr);
+	if (!has_done_reservation) {
+		fil_space_release_free_extents(space, n_reserved);
+	}
+
+	return(block);
+}
+
+/**********************************************************************//**
+Checks that we have at least 2 frag pages free in the first extent of a
+single-table tablespace, and that they are also physically initialized to
+the data file. That is, we have already extended the data file so that those
+pages are inside the data file. If not, this function extends the tablespace
+with pages.
+@return TRUE if there were >= 2 free pages, or we were able to extend */
+static
+ibool
+fsp_reserve_free_pages(
+/*===================*/
+	ulint		space,		/*!< in: space id, must be != 0 */
+	fsp_header_t*	space_header,	/*!< in: header of that space,
+					x-latched */
+	ulint		size,		/*!< in: size of the tablespace in
+					pages, must be < FSP_EXTENT_SIZE */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	xdes_t*	descr;
+	ulint	n_used;
+
+	ut_a(space != 0);
+	ut_a(size < FSP_EXTENT_SIZE);
+
+	descr = xdes_get_descriptor_with_space_hdr(space_header, space, 0,
+						   mtr);
+	n_used = xdes_get_n_used(descr, mtr);
+
+	ut_a(n_used <= size);
+
+	if (size >= n_used + 2) {
+
+		return(TRUE);
+	}
+
+	return(fsp_try_extend_data_file_with_pages(space, n_used + 1,
+						   space_header, mtr));
+}
+
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is less than FSP_EXTENT_SIZE pages are a
+special case. In this function we would liberally reserve several whole
+extents for every page split or merge in a B-tree. But we do not want to
+waste disk space
+if the table only occupies less than FSP_EXTENT_SIZE pages. That is why we
+apply different rules in that special case, just ensuring that there are
+2 free pages available.
+@return TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+	ulint*	n_reserved,/*!< out: number of extents actually reserved; if we
+			return TRUE and the tablespace size is less than
+			FSP_EXTENT_SIZE pages, then this can be 0, otherwise
+			it is n_ext */
+	ulint	space,	/*!< in: space id */
+	ulint	n_ext,	/*!< in: number of extents to reserve */
+	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	fsp_header_t*	space_header;
+	prio_rw_lock_t*	latch;
+	ulint		n_free_list_ext;
+	ulint		free_limit;
+	ulint		size;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		n_free;
+	ulint		n_free_up;
+	ulint		reserve;
+	ibool		success;
+	ulint		n_pages_added;
+
+	ut_ad(mtr);
+	*n_reserved = n_ext;
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = fsp_flags_get_zip_size(flags);
+
+	mtr_x_lock(latch, mtr);
+
+	space_header = fsp_get_space_header(space, zip_size, mtr);
+try_again:
+	size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	if (size < FSP_EXTENT_SIZE) {
+		/* Use different rules for small single-table tablespaces */
+		*n_reserved = 0;
+		return(fsp_reserve_free_pages(space, space_header, size, mtr));
+	}
+
+	n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr);
+
+	free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+				    MLOG_4BYTES, mtr);
+
+	/* Below we play safe when counting free extents above the free limit:
+	some of them will contain extent descriptor pages, and therefore
+	will not be free extents */
+
+	ut_ad(size >= free_limit);
+	n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+	if (n_free_up > 0) {
+		n_free_up--;
+		if (!zip_size) {
+			n_free_up -= n_free_up
+				/ (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+		} else {
+			n_free_up -= n_free_up
+				/ (zip_size / FSP_EXTENT_SIZE);
+		}
+	}
+
+	n_free = n_free_list_ext + n_free_up;
+
+	if (alloc_type == FSP_NORMAL) {
+		/* We reserve 1 extent + 0.5 % of the space size to undo logs
+		and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+		code is duplicated in the function below! */
+
+		reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+
+		if (n_free <= reserve + n_ext) {
+
+			goto try_to_extend;
+		}
+	} else if (alloc_type == FSP_UNDO) {
+		/* We reserve 0.5 % of the space size to cleaning operations */
+
+		reserve = 1 + ((size / FSP_EXTENT_SIZE) * 1) / 200;
+
+		if (n_free <= reserve + n_ext) {
+
+			goto try_to_extend;
+		}
+	} else {
+		ut_a(alloc_type == FSP_CLEANING);
+	}
+
+	success = fil_space_reserve_free_extents(space, n_free, n_ext);
+
+	if (success) {
+		return(TRUE);
+	}
+try_to_extend:
+	success = fsp_try_extend_data_file(&n_pages_added, space,
+					   space_header, mtr);
+	if (success && n_pages_added > 0) {
+
+		goto try_again;
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+This function should be used to get information on how much we still
+will be able to insert new data to the database without running out of the
+tablespace. Only free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents.
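+
+As a sketch of the arithmetic performed below (a hypothetical standalone
+helper, assuming the default uncompressed case with UNIV_PAGE_SIZE = 16384
+and FSP_EXTENT_SIZE = 64, so that every 256th extent holds extent
+descriptor pages and each extent covers 1 MB; the real code below also
+handles compressed pages and tablespaces that are being dropped):
+
+	ulint
+	available_kb(ulint size, ulint free_limit, ulint n_free_list_ext)
+	{
+		ulint	n_free_up = (size - free_limit) / 64;
+		ulint	n_free;
+		ulint	reserve;
+
+		if (n_free_up > 0) {
+			n_free_up--;
+			n_free_up -= n_free_up / (16384 / 64);
+		}
+
+		n_free = n_free_list_ext + n_free_up;
+		reserve = 2 + ((size / 64) * 2) / 200;
+
+		if (reserve > n_free) {
+			return(0);
+		}
+
+		return((n_free - reserve) * 64 * (16384 / 1024));
+	}
+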
+@return available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+	ulint	space)	/*!< in: space id */
+{
+	fsp_header_t*	space_header;
+	ulint		n_free_list_ext;
+	ulint		free_limit;
+	ulint		size;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		n_free;
+	ulint		n_free_up;
+	ulint		reserve;
+	prio_rw_lock_t*	latch;
+	mtr_t		mtr;
+
+	/* The convoluted mutex acquire is to overcome latching order
+	issues: The problem is that the fil_mutex is at a lower level
+	than the tablespace latch and the buffer pool mutexes. We have to
+	first prevent any operations on the file system by acquiring the
+	dictionary mutex. Then acquire the tablespace latch to obey the
+	latching order and then release the dictionary mutex. That way we
+	ensure that the tablespace instance can't be freed while we are
+	examining its contents (see fil_space_free()).
+
+	However, there is one further complication: we release the fil_mutex
+	when we need to invalidate the pages in the buffer pool and we
+	reacquire the fil_mutex when deleting and freeing the tablespace
+	instance in fil0fil.cc. Here we need to account for that situation
+	too. */
+
+	mutex_enter(&dict_sys->mutex);
+
+	/* At this stage there is no guarantee that the tablespace even
+	exists in the cache. */
+
+	if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) {
+
+		mutex_exit(&dict_sys->mutex);
+
+		return(ULLINT_UNDEFINED);
+	}
+
+	mtr_start(&mtr);
+
+	latch = fil_space_get_latch(space, &flags);
+
+	/* This should ensure that the tablespace instance can't be freed
+	by another thread. However, the tablespace pages can still be freed
+	from the buffer pool. We need to check for that again. */
+
+	zip_size = fsp_flags_get_zip_size(flags);
+
+	mtr_x_lock(latch, &mtr);
+
+	mutex_exit(&dict_sys->mutex);
+
+	/* At this point it is possible for the tablespace to be deleted and
+	its pages removed from the buffer pool. We need to check for that
+	situation. However, the tablespace instance can't be deleted because
+	our latching above should ensure that. */
+
+	if (fil_tablespace_is_being_deleted(space)) {
+
+		mtr_commit(&mtr);
+
+		return(ULLINT_UNDEFINED);
+	}
+
+	/* From here on even if the user has dropped the tablespace, the
+	pages _must_ still exist in the buffer pool and the tablespace
+	instance _must_ be in the file system hash table. */
+
+	space_header = fsp_get_space_header(space, zip_size, &mtr);
+
+	size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+	n_free_list_ext = flst_get_len(space_header + FSP_FREE, &mtr);
+
+	free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+				    MLOG_4BYTES, &mtr);
+	mtr_commit(&mtr);
+
+	if (size < FSP_EXTENT_SIZE) {
+		ut_a(space != 0);	/* This must be a single-table
+					tablespace */
+
+		return(0);		/* TODO: count free frag pages and
+					return a value based on that */
+	}
+
+	/* Below we play safe when counting free extents above the free limit:
+	some of them will contain extent descriptor pages, and therefore
+	will not be free extents */
+
+	n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+	if (n_free_up > 0) {
+		n_free_up--;
+		if (!zip_size) {
+			n_free_up -= n_free_up
+				/ (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+		} else {
+			n_free_up -= n_free_up
+				/ (zip_size / FSP_EXTENT_SIZE);
+		}
+	}
+
+	n_free = n_free_list_ext + n_free_up;
+
+	/* We reserve 1 extent + 0.5 % of the space size to undo logs
+	and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+	code is duplicated in the function above!
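+	For example, a 1 GB tablespace with 16KB pages holds 65536 pages =
+	1024 extents, so reserve = 2 + (1024 * 2) / 200 = 12 extents, i.e.
+	12 MB are withheld from the reported free space.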
*/ + + reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200; + + if (reserve > n_free) { + return(0); + } + + if (!zip_size) { + return((ullint) (n_free - reserve) + * FSP_EXTENT_SIZE + * (UNIV_PAGE_SIZE / 1024)); + } else { + return((ullint) (n_free - reserve) + * FSP_EXTENT_SIZE + * (zip_size / 1024)); + } +} + +/********************************************************************//** +Marks a page used. The page must reside within the extents of the given +segment. */ +static __attribute__((nonnull)) +void +fseg_mark_page_used( +/*================*/ + fseg_inode_t* seg_inode,/*!< in: segment inode */ + ulint page, /*!< in: page offset */ + xdes_t* descr, /*!< in: extent descriptor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint not_full_n_used; + + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + + ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr) + == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr)); + + if (xdes_is_free(descr, mtr)) { + /* We move the extent from the free list to the + NOT_FULL list */ + flst_remove(seg_inode + FSEG_FREE, descr + XDES_FLST_NODE, + mtr); + flst_add_last(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + } + + ut_ad(xdes_mtr_get_bit( + descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)); + + /* We mark the page as used */ + xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr); + + not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr); + not_full_n_used++; + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, not_full_n_used, + MLOG_4BYTES, mtr); + if (xdes_is_full(descr, mtr)) { + /* We move the extent from the NOT_FULL list to the + FULL list */ + flst_remove(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + flst_add_last(seg_inode + FSEG_FULL, + descr + XDES_FLST_NODE, mtr); + + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - FSP_EXTENT_SIZE, + MLOG_4BYTES, mtr); + } +} + +/**********************************************************************//** +Frees a single page of a segment. */ +static +void +fseg_free_page_low( +/*===============*/ + fseg_inode_t* seg_inode, /*!< in: segment inode */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /*!< in: page offset */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + xdes_t* descr; + ulint not_full_n_used; + ulint state; + ib_id_t descr_id; + ib_id_t seg_id; + ulint i; + + ut_ad(seg_inode && mtr); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + + /* Drop search system page hash index if the page is found in + the pool and is hashed */ + + btr_search_drop_page_hash_when_freed(space, zip_size, page); + + descr = xdes_get_descriptor(space, zip_size, page, mtr); + + SRV_CORRUPT_TABLE_CHECK(descr, + { + /* The page may be corrupt. pass it. */ + return; + }); + + if (xdes_mtr_get_bit(descr, XDES_FREE_BIT, + page % FSP_EXTENT_SIZE, mtr)) { + fputs("InnoDB: Dump of the tablespace extent descriptor: ", + stderr); + ut_print_buf(stderr, descr, 40); + + fprintf(stderr, "\n" + "InnoDB: Serious error! 
InnoDB is trying to" + " free page %lu\n" + "InnoDB: though it is already marked as free" + " in the tablespace!\n" + "InnoDB: The tablespace free space info is corrupt.\n" + "InnoDB: You may need to dump your" + " InnoDB tables and recreate the whole\n" + "InnoDB: database!\n", (ulong) page); +crash: + fputs("InnoDB: Please refer to\n" + "InnoDB: " REFMAN "forcing-innodb-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); + ut_error; + } + + state = xdes_get_state(descr, mtr); + + if (state != XDES_FSEG) { + /* The page is in the fragment pages of the segment */ + + for (i = 0;; i++) { + if (fseg_get_nth_frag_page_no(seg_inode, i, mtr) + == page) { + + fseg_set_nth_frag_page_no(seg_inode, i, + FIL_NULL, mtr); + break; + } + } + + fsp_free_page(space, zip_size, page, mtr); + + return; + } + + /* If we get here, the page is in some extent of the segment */ + + descr_id = mach_read_from_8(descr + XDES_ID); + seg_id = mach_read_from_8(seg_inode + FSEG_ID); +#if 0 + fprintf(stderr, + "InnoDB: InnoDB is freeing space %lu page %lu,\n" + "InnoDB: which belongs to descr seg %llu\n" + "InnoDB: segment %llu.\n", + (ulong) space, (ulong) page, + (ullint) descr_id, + (ullint) seg_id); +#endif /* 0 */ + if (UNIV_UNLIKELY(descr_id != seg_id)) { + fputs("InnoDB: Dump of the tablespace extent descriptor: ", + stderr); + ut_print_buf(stderr, descr, 40); + fputs("\nInnoDB: Dump of the segment inode: ", stderr); + ut_print_buf(stderr, seg_inode, 40); + putc('\n', stderr); + + fprintf(stderr, + "InnoDB: Serious error: InnoDB is trying to" + " free space %lu page %lu,\n" + "InnoDB: which does not belong to" + " segment %llu but belongs\n" + "InnoDB: to segment %llu.\n", + (ulong) space, (ulong) page, + (ullint) descr_id, + (ullint) seg_id); + goto crash; + } + + not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(seg_inode + FSEG_FULL, + descr + XDES_FLST_NODE, mtr); + flst_add_last(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used + FSP_EXTENT_SIZE - 1, + MLOG_4BYTES, mtr); + } else { + ut_a(not_full_n_used > 0); + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - 1, MLOG_4BYTES, mtr); + } + + xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + + if (xdes_is_free(descr, mtr)) { + /* The extent has become free: free it to space */ + flst_remove(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + fsp_free_extent(space, zip_size, page, mtr); + } + + mtr->n_freed_pages++; +} + +/**********************************************************************//** +Frees a single page of a segment. 
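+This is a thin wrapper: it acquires the tablespace latch, looks up the
+segment inode and delegates to fseg_free_page_low() above, which also
+moves the extent between the FSEG_FULL, FSEG_NOT_FULL and FSEG_FREE
+lists and adjusts FSEG_NOT_FULL_N_USED as needed.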
*/ +UNIV_INTERN +void +fseg_free_page( +/*===========*/ + fseg_header_t* seg_header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint page, /*!< in: page offset */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint flags; + ulint zip_size; + fseg_inode_t* seg_inode; + prio_rw_lock_t* latch; + + latch = fil_space_get_latch(space, &flags); + zip_size = fsp_flags_get_zip_size(flags); + + mtr_x_lock(latch, mtr); + + seg_inode = fseg_inode_get(seg_header, space, zip_size, mtr); + + fseg_free_page_low(seg_inode, space, zip_size, page, mtr); + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + buf_page_set_file_page_was_freed(space, page); +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ +} + +/**********************************************************************//** +Checks if a single page of a segment is free. +@return true if free */ +UNIV_INTERN +bool +fseg_page_is_free( +/*==============*/ + fseg_header_t* seg_header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint page) /*!< in: page offset */ +{ + mtr_t mtr; + ibool is_free; + ulint flags; + prio_rw_lock_t* latch; + xdes_t* descr; + ulint zip_size; + fseg_inode_t* seg_inode; + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_tf_get_zip_size(flags); + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode = fseg_inode_get(seg_header, space, zip_size, &mtr); + + ut_a(seg_inode); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + + descr = xdes_get_descriptor(space, zip_size, page, &mtr); + ut_a(descr); + + is_free = xdes_mtr_get_bit( + descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, &mtr); + + mtr_commit(&mtr); + + return(is_free); +} + +/**********************************************************************//** +Frees an extent of a segment to the space free list. 
*/ +static +void +fseg_free_extent( +/*=============*/ + fseg_inode_t* seg_inode, /*!< in: segment inode */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /*!< in: a page in the extent */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint first_page_in_extent; + xdes_t* descr; + ulint not_full_n_used; + ulint descr_n_used; + ulint i; + + ut_ad(seg_inode && mtr); + + descr = xdes_get_descriptor(space, zip_size, page, mtr); + + ut_a(xdes_get_state(descr, mtr) == XDES_FSEG); + ut_a(!memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + + first_page_in_extent = page - (page % FSP_EXTENT_SIZE); + + for (i = 0; i < FSP_EXTENT_SIZE; i++) { + if (!xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) { + + /* Drop search system page hash index if the page is + found in the pool and is hashed */ + + btr_search_drop_page_hash_when_freed( + space, zip_size, first_page_in_extent + i); + } + } + + if (xdes_is_full(descr, mtr)) { + flst_remove(seg_inode + FSEG_FULL, + descr + XDES_FLST_NODE, mtr); + } else if (xdes_is_free(descr, mtr)) { + flst_remove(seg_inode + FSEG_FREE, + descr + XDES_FLST_NODE, mtr); + } else { + flst_remove(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + + not_full_n_used = mtr_read_ulint( + seg_inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr); + + descr_n_used = xdes_get_n_used(descr, mtr); + ut_a(not_full_n_used >= descr_n_used); + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - descr_n_used, + MLOG_4BYTES, mtr); + } + + fsp_free_extent(space, zip_size, page, mtr); + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + for (i = 0; i < FSP_EXTENT_SIZE; i++) { + + buf_page_set_file_page_was_freed(space, + first_page_in_extent + i); + } +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ +} + +/**********************************************************************//** +Frees part of a segment. This function can be used to free a segment by +repeatedly calling this function in different mini-transactions. Doing +the freeing in a single mini-transaction might result in too big a +mini-transaction. +@return TRUE if freeing completed */ +UNIV_INTERN +ibool +fseg_free_step( +/*===========*/ + fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header + resides on the first page of the frag list + of the segment, this pointer becomes obsolete + after the last freeing step */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint n; + ulint page; + xdes_t* descr; + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + ulint header_page; + prio_rw_lock_t* latch; + + space = page_get_space_id(page_align(header)); + header_page = page_get_page_no(page_align(header)); + + latch = fil_space_get_latch(space, &flags); + zip_size = fsp_flags_get_zip_size(flags); + + mtr_x_lock(latch, mtr); + + descr = xdes_get_descriptor(space, zip_size, header_page, mtr); + + SRV_CORRUPT_TABLE_CHECK(descr, + { + /* The page may be corrupt. pass it. 
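+	(SRV_CORRUPT_TABLE_CHECK is an XtraDB addition: when the server is
+	configured to tolerate corrupt tables, the supplied fallback block
+	runs, here reporting the freeing step as complete, instead of
+	hitting an assertion.)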
*/ + return(TRUE); + }); + + /* Check that the header resides on a page which has not been + freed yet */ + + ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, + header_page % FSP_EXTENT_SIZE, mtr) == FALSE); + + inode = fseg_inode_try_get(header, space, zip_size, mtr); + + if (UNIV_UNLIKELY(inode == NULL)) { + fprintf(stderr, "double free of inode from %u:%u\n", + (unsigned) space, (unsigned) header_page); + return(TRUE); + } + + descr = fseg_get_first_extent(inode, space, zip_size, mtr); + + if (descr != NULL) { + /* Free the extent held by the segment */ + page = xdes_get_offset(descr); + + fseg_free_extent(inode, space, zip_size, page, mtr); + + return(FALSE); + } + + /* Free a frag page */ + n = fseg_find_last_used_frag_page_slot(inode, mtr); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, zip_size, inode, mtr); + + return(TRUE); + } + + fseg_free_page_low(inode, space, zip_size, + fseg_get_nth_frag_page_no(inode, n, mtr), mtr); + + n = fseg_find_last_used_frag_page_slot(inode, mtr); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, zip_size, inode, mtr); + + return(TRUE); + } + + return(FALSE); +} + +/**********************************************************************//** +Frees part of a segment. Differs from fseg_free_step because this function +leaves the header page unfreed. +@return TRUE if freeing completed, except the header page */ +UNIV_INTERN +ibool +fseg_free_step_not_header( +/*======================*/ + fseg_header_t* header, /*!< in: segment header which must reside on + the first fragment page of the segment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint n; + ulint page; + xdes_t* descr; + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + ulint page_no; + prio_rw_lock_t* latch; + + space = page_get_space_id(page_align(header)); + + latch = fil_space_get_latch(space, &flags); + zip_size = fsp_flags_get_zip_size(flags); + + mtr_x_lock(latch, mtr); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + SRV_CORRUPT_TABLE_CHECK(inode, + { + /* ignore the corruption */ + return(TRUE); + }); + + descr = fseg_get_first_extent(inode, space, zip_size, mtr); + + if (descr != NULL) { + /* Free the extent held by the segment */ + page = xdes_get_offset(descr); + + fseg_free_extent(inode, space, zip_size, page, mtr); + + return(FALSE); + } + + /* Free a frag page */ + + n = fseg_find_last_used_frag_page_slot(inode, mtr); + + if (n == ULINT_UNDEFINED) { + ut_error; + } + + page_no = fseg_get_nth_frag_page_no(inode, n, mtr); + + if (page_no == page_get_page_no(page_align(header))) { + + return(TRUE); + } + + fseg_free_page_low(inode, space, zip_size, page_no, mtr); + + return(FALSE); +} + +/**********************************************************************//** +Returns the first extent descriptor for a segment. We think of the extent +lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL +-> FSEG_FREE. 
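+
+This ordering is what lets fseg_free_step() above release a segment one
+extent or fragment page at a time, full extents first. A typical caller
+frees a whole segment roughly like this (a sketch of the usage pattern,
+not a quote of any particular caller):
+
+	mtr_t	mtr;
+	ibool	finished;
+
+	do {
+		mtr_start(&mtr);
+		finished = fseg_free_step(header, &mtr);
+		mtr_commit(&mtr);
+	} while (!finished);
+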
+@return the first extent descriptor, or NULL if none */ +static +xdes_t* +fseg_get_first_extent( +/*==================*/ + fseg_inode_t* inode, /*!< in: segment inode */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fil_addr_t first; + xdes_t* descr; + + ut_ad(inode && mtr); + + ut_ad(space == page_get_space_id(page_align(inode))); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + first = fil_addr_null; + + if (flst_get_len(inode + FSEG_FULL, mtr) > 0) { + + first = flst_get_first(inode + FSEG_FULL, mtr); + + } else if (flst_get_len(inode + FSEG_NOT_FULL, mtr) > 0) { + + first = flst_get_first(inode + FSEG_NOT_FULL, mtr); + + } else if (flst_get_len(inode + FSEG_FREE, mtr) > 0) { + + first = flst_get_first(inode + FSEG_FREE, mtr); + } + + if (first.page == FIL_NULL) { + + return(NULL); + } + descr = xdes_lst_get_descriptor(space, zip_size, first, mtr); + + return(descr); +} + +/*******************************************************************//** +Validates a segment. +@return TRUE if ok */ +static +ibool +fseg_validate_low( +/*==============*/ + fseg_inode_t* inode, /*!< in: segment inode */ + mtr_t* mtr2) /*!< in/out: mini-transaction */ +{ + ulint space; + ib_id_t seg_id; + mtr_t mtr; + xdes_t* descr; + fil_addr_t node_addr; + ulint n_used = 0; + ulint n_used2 = 0; + + ut_ad(mtr_memo_contains_page(mtr2, inode, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + space = page_get_space_id(page_align(inode)); + + seg_id = mach_read_from_8(inode + FSEG_ID); + n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr2); + flst_validate(inode + FSEG_FREE, mtr2); + flst_validate(inode + FSEG_NOT_FULL, mtr2); + flst_validate(inode + FSEG_FULL, mtr2); + + /* Validate FSEG_FREE list */ + node_addr = flst_get_first(inode + FSEG_FREE, mtr2); + + while (!fil_addr_is_null(node_addr)) { + ulint flags; + ulint zip_size; + + mtr_start(&mtr); + mtr_x_lock(fil_space_get_latch(space, &flags), &mtr); + zip_size = fsp_flags_get_zip_size(flags); + + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == 0); + ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG); + ut_a(mach_read_from_8(descr + XDES_ID) == seg_id); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate FSEG_NOT_FULL list */ + + node_addr = flst_get_first(inode + FSEG_NOT_FULL, mtr2); + + while (!fil_addr_is_null(node_addr)) { + ulint flags; + ulint zip_size; + + mtr_start(&mtr); + mtr_x_lock(fil_space_get_latch(space, &flags), &mtr); + zip_size = fsp_flags_get_zip_size(flags); + + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) > 0); + ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG); + ut_a(mach_read_from_8(descr + XDES_ID) == seg_id); + + n_used2 += xdes_get_n_used(descr, &mtr); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate FSEG_FULL list */ + + node_addr = flst_get_first(inode + FSEG_FULL, mtr2); + + while (!fil_addr_is_null(node_addr)) { + ulint flags; + ulint zip_size; + + mtr_start(&mtr); + mtr_x_lock(fil_space_get_latch(space, &flags), &mtr); + zip_size = fsp_flags_get_zip_size(flags); + + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, 
&mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG); + ut_a(mach_read_from_8(descr + XDES_ID) == seg_id); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + ut_a(n_used == n_used2); + + return(TRUE); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Validates a segment. +@return TRUE if ok */ +UNIV_INTERN +ibool +fseg_validate( +/*==========*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fseg_inode_t* inode; + ibool ret; + ulint space; + ulint flags; + ulint zip_size; + + space = page_get_space_id(page_align(header)); + + mtr_x_lock(fil_space_get_latch(space, &flags), mtr); + zip_size = fsp_flags_get_zip_size(flags); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + ret = fseg_validate_low(inode, mtr); + + return(ret); +} +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Writes info of a segment. */ +static +void +fseg_print_low( +/*===========*/ + fseg_inode_t* inode, /*!< in: segment inode */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint space; + ulint n_used; + ulint n_frag; + ulint n_free; + ulint n_not_full; + ulint n_full; + ulint reserved; + ulint used; + ulint page_no; + ib_id_t seg_id; + + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + space = page_get_space_id(page_align(inode)); + page_no = page_get_page_no(page_align(inode)); + + reserved = fseg_n_reserved_pages_low(inode, &used, mtr); + + seg_id = mach_read_from_8(inode + FSEG_ID); + + n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr); + n_frag = fseg_get_n_frag_pages(inode, mtr); + n_free = flst_get_len(inode + FSEG_FREE, mtr); + n_not_full = flst_get_len(inode + FSEG_NOT_FULL, mtr); + n_full = flst_get_len(inode + FSEG_FULL, mtr); + + fprintf(stderr, + "SEGMENT id %llu space %lu; page %lu;" + " res %lu used %lu; full ext %lu\n" + "fragm pages %lu; free extents %lu;" + " not full extents %lu: pages %lu\n", + (ullint) seg_id, + (ulong) space, (ulong) page_no, + (ulong) reserved, (ulong) used, (ulong) n_full, + (ulong) n_frag, (ulong) n_free, (ulong) n_not_full, + (ulong) n_used); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); +} + +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. */ +UNIV_INTERN +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + + space = page_get_space_id(page_align(header)); + + mtr_x_lock(fil_space_get_latch(space, &flags), mtr); + zip_size = fsp_flags_get_zip_size(flags); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + fseg_print_low(inode, mtr); +} +#endif /* UNIV_BTR_PRINT */ + +/*******************************************************************//** +Validates the file space system and its segments. 
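+The central invariants checked below are that every extent under the free
+limit is accounted for by exactly one descriptor list, so that
+descr_count * FSP_EXTENT_SIZE == free_limit, and that the used-page counts
+derived from the fragment extent lists agree with FSP_FRAG_N_USED and with
+the per-segment fragment page arrays.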
+@return TRUE if ok */ +UNIV_INTERN +ibool +fsp_validate( +/*=========*/ + ulint space) /*!< in: space id */ +{ + fsp_header_t* header; + fseg_inode_t* seg_inode; + page_t* seg_inode_page; + prio_rw_lock_t* latch; + ulint size; + ulint flags; + ulint zip_size; + ulint free_limit; + ulint frag_n_used; + mtr_t mtr; + mtr_t mtr2; + xdes_t* descr; + fil_addr_t node_addr; + fil_addr_t next_node_addr; + ulint descr_count = 0; + ulint n_used = 0; + ulint n_used2 = 0; + ulint n_full_frag_pages; + ulint n; + ulint seg_inode_len_free; + ulint seg_inode_len_full; + + latch = fil_space_get_latch(space, &flags); + zip_size = fsp_flags_get_zip_size(flags); + ut_a(ut_is_2pow(zip_size)); + ut_a(zip_size <= UNIV_ZIP_SIZE_MAX); + ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN); + + /* Start first a mini-transaction mtr2 to lock out all other threads + from the fsp system */ + mtr_start(&mtr2); + mtr_x_lock(latch, &mtr2); + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr); + free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT, + MLOG_4BYTES, &mtr); + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, + MLOG_4BYTES, &mtr); + + n_full_frag_pages = FSP_EXTENT_SIZE + * flst_get_len(header + FSP_FULL_FRAG, &mtr); + + if (UNIV_UNLIKELY(free_limit > size)) { + + ut_a(space != 0); + ut_a(size < FSP_EXTENT_SIZE); + } + + flst_validate(header + FSP_FREE, &mtr); + flst_validate(header + FSP_FREE_FRAG, &mtr); + flst_validate(header + FSP_FULL_FRAG, &mtr); + + mtr_commit(&mtr); + + /* Validate FSP_FREE list */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + node_addr = flst_get_first(header + FSP_FREE, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + descr_count++; + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == 0); + ut_a(xdes_get_state(descr, &mtr) == XDES_FREE); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate FSP_FREE_FRAG list */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + node_addr = flst_get_first(header + FSP_FREE_FRAG, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + descr_count++; + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) > 0); + ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FREE_FRAG); + + n_used += xdes_get_n_used(descr, &mtr); + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + + mtr_commit(&mtr); + } + + /* Validate FSP_FULL_FRAG list */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + node_addr = flst_get_first(header + FSP_FULL_FRAG, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + descr_count++; + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FULL_FRAG); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate segments */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + 
header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr); + + seg_inode_len_full = flst_get_len(header + FSP_SEG_INODES_FULL, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + do { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + ut_a(mach_read_from_8(seg_inode + FSEG_ID) != 0); + fseg_validate_low(seg_inode, &mtr); + + descr_count += flst_get_len(seg_inode + FSEG_FREE, + &mtr); + descr_count += flst_get_len(seg_inode + FSEG_FULL, + &mtr); + descr_count += flst_get_len(seg_inode + FSEG_NOT_FULL, + &mtr); + + n_used2 += fseg_get_n_frag_pages(seg_inode, &mtr); + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr); + + seg_inode_len_free = flst_get_len(header + FSP_SEG_INODES_FREE, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + + do { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + if (mach_read_from_8(seg_inode + FSEG_ID)) { + fseg_validate_low(seg_inode, &mtr); + + descr_count += flst_get_len( + seg_inode + FSEG_FREE, &mtr); + descr_count += flst_get_len( + seg_inode + FSEG_FULL, &mtr); + descr_count += flst_get_len( + seg_inode + FSEG_NOT_FULL, &mtr); + n_used2 += fseg_get_n_frag_pages( + seg_inode, &mtr); + } + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + ut_a(descr_count * FSP_EXTENT_SIZE == free_limit); + if (!zip_size) { + ut_a(n_used + n_full_frag_pages + == n_used2 + 2 * ((free_limit + (UNIV_PAGE_SIZE - 1)) + / UNIV_PAGE_SIZE) + + seg_inode_len_full + seg_inode_len_free); + } else { + ut_a(n_used + n_full_frag_pages + == n_used2 + 2 * ((free_limit + (zip_size - 1)) + / zip_size) + + seg_inode_len_full + seg_inode_len_free); + } + ut_a(frag_n_used == n_used); + + mtr_commit(&mtr2); + + return(TRUE); +} + +/*******************************************************************//** +Prints info of a file space. 
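+The output goes to stderr: one header block with the space size, free
+limit and list lengths, then a short summary per segment inode (see
+fseg_print_low() above), and finally the total number of file segments.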
*/ +UNIV_INTERN +void +fsp_print( +/*======*/ + ulint space) /*!< in: space id */ +{ + fsp_header_t* header; + fseg_inode_t* seg_inode; + page_t* seg_inode_page; + prio_rw_lock_t* latch; + ulint flags; + ulint zip_size; + ulint size; + ulint free_limit; + ulint frag_n_used; + fil_addr_t node_addr; + fil_addr_t next_node_addr; + ulint n_free; + ulint n_free_frag; + ulint n_full_frag; + ib_id_t seg_id; + ulint n; + ulint n_segs = 0; + mtr_t mtr; + mtr_t mtr2; + + latch = fil_space_get_latch(space, &flags); + zip_size = fsp_flags_get_zip_size(flags); + + /* Start first a mini-transaction mtr2 to lock out all other threads + from the fsp system */ + + mtr_start(&mtr2); + + mtr_x_lock(latch, &mtr2); + + mtr_start(&mtr); + + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr); + + free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, + &mtr); + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + &mtr); + n_free = flst_get_len(header + FSP_FREE, &mtr); + n_free_frag = flst_get_len(header + FSP_FREE_FRAG, &mtr); + n_full_frag = flst_get_len(header + FSP_FULL_FRAG, &mtr); + + seg_id = mach_read_from_8(header + FSP_SEG_ID); + + fprintf(stderr, + "FILE SPACE INFO: id %lu\n" + "size %lu, free limit %lu, free extents %lu\n" + "not full frag extents %lu: used pages %lu," + " full frag extents %lu\n" + "first seg id not used %llu\n", + (ulong) space, + (ulong) size, (ulong) free_limit, (ulong) n_free, + (ulong) n_free_frag, (ulong) frag_n_used, (ulong) n_full_frag, + (ullint) seg_id); + + mtr_commit(&mtr); + + /* Print segments */ + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + + do { + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + ut_a(mach_read_from_8(seg_inode + FSEG_ID) != 0); + fseg_print_low(seg_inode, &mtr); + + n_segs++; + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + + do { + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + if (mach_read_from_8(seg_inode + FSEG_ID)) { + + fseg_print_low(seg_inode, &mtr); + n_segs++; + } + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + mtr_commit(&mtr2); + + fprintf(stderr, "NUMBER of file segments: %lu\n", (ulong) n_segs); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/fts/Makefile.query b/storage/xtradb/fts/Makefile.query new file mode 100644 index 00000000000..12dcd833064 --- /dev/null +++ 
b/storage/xtradb/fts/Makefile.query
@@ -0,0 +1,16 @@
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all: fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0pars.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+.l.cc:
+	$(LEX) -P$(subst lex,,$*) -o $*.cc --header-file=../include/$*.h $<
+
+.y.cc:
+	$(YACC) -p $(PREFIX) -o $*.cc -d $<
+	mv $*.h ../include
diff --git a/storage/xtradb/fts/fts0ast.cc b/storage/xtradb/fts/fts0ast.cc
new file mode 100644
index 00000000000..dd48ffee14d
--- /dev/null
+++ b/storage/xtradb/fts/fts0ast.cc
@@ -0,0 +1,696 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0fts.h"
+
+/* The FTS ast visit pass. */
+enum fts_ast_visit_pass_t {
+	FTS_PASS_FIRST,		/*!< First visit pass,
+				process operators excluding
+				FTS_EXIST and FTS_IGNORE */
+	FTS_PASS_EXIST,		/*!< Exist visit pass,
+				process operator FTS_EXIST */
+	FTS_PASS_IGNORE		/*!< Ignore visit pass,
+				process operator FTS_IGNORE */
+};
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
+@return a new node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+	fts_ast_node_t*	node;
+
+	node = (fts_ast_node_t*) ut_malloc(sizeof(*node));
+	memset(node, 0x0, sizeof(*node));
+
+	return(node);
+}
+
+/******************************************************************//**
+Create an operator fts_ast_node_t.
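+(An operator node represents a boolean-mode prefix in the parsed query,
+e.g. FTS_EXIST for '+word' or FTS_IGNORE for '-word'.)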
+@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_oper( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_oper_t oper) /*!< in: ast operator */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_OPER; + node->oper = oper; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it +@return new node or a node list with tokenized words */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_term( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const fts_ast_string_t* ptr) /*!< in: ast term string */ +{ + fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg); + ulint len = ptr->len; + ulint cur_pos = 0; + fts_ast_node_t* node = NULL; + fts_ast_node_t* node_list = NULL; + fts_ast_node_t* first_node = NULL; + + /* Scan the incoming string and filter out any "non-word" characters */ + while (cur_pos < len) { + fts_string_t str; + ulint offset; + ulint cur_len; + + cur_len = innobase_mysql_fts_get_token( + state->charset, + reinterpret_cast<const byte*>(ptr->str) + cur_pos, + reinterpret_cast<const byte*>(ptr->str) + len, + &str, &offset); + + if (cur_len == 0) { + break; + } + + cur_pos += cur_len; + + if (str.f_n_char > 0) { + /* If the subsequent term (after the first one)'s size + is less than fts_min_token_size or the term is greater + than fts_max_token_size, we shall ignore that. This is + to make consistent with MyISAM behavior */ + if ((first_node && (str.f_n_char < fts_min_token_size)) + || str.f_n_char > fts_max_token_size) { + continue; + } + + node = fts_ast_node_create(); + + node->type = FTS_AST_TERM; + + node->term.ptr = fts_ast_string_create( + str.f_str, str.f_len); + + fts_ast_state_add_node( + static_cast<fts_ast_state_t*>(arg), node); + + if (first_node) { + /* There is more than one word, create + a list to organize them */ + if (!node_list) { + node_list = fts_ast_create_node_list( + static_cast<fts_ast_state_t*>( + arg), + first_node); + } + + fts_ast_add_node(node_list, node); + } else { + first_node = node; + } + } + } + + return((node_list != NULL) ? node_list : first_node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it. +@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const fts_ast_string_t* ptr) /*!< in: ast text string */ +{ + ulint len = ptr->len; + fts_ast_node_t* node = NULL; + + /* Once we come here, the string must have at least 2 quotes "" + around the query string, which could be empty. Also the query + string may contain 0x00 in it, we don't treat it as null-terminated. */ + ut_ad(len >= 2); + ut_ad(ptr->str[0] == '\"' && ptr->str[len - 1] == '\"'); + + if (len == 2) { + /* If the query string contains nothing except quotes, + it's obviously an invalid query. 
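+	For example, the phrase query "ab cd" arrives here as the full
+	7-byte string including both quote characters, while the
+	degenerate query "" has len == 2 and is rejected.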
*/ + return(NULL); + } + + node = fts_ast_node_create(); + + /*!< We ignore the actual quotes "" */ + len -= 2; + + node->type = FTS_AST_TEXT; + /*!< Skip copying the first quote */ + node->text.ptr = fts_ast_string_create( + reinterpret_cast<const byte*>(ptr->str + 1), len); + node->text.distance = ULINT_UNDEFINED; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the expr and is responsible +for free'ing it. +@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. +@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_SUBEXP_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Free an expr list node elements. */ +static +void +fts_ast_free_list( +/*==============*/ + fts_ast_node_t* node) /*!< in: ast node to free */ +{ + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST); + + for (node = node->list.head; + node != NULL; + node = fts_ast_free_node(node)) { + + /*!< No op */ + } +} + +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node) /*!< in: the node to free */ +{ + fts_ast_node_t* next_node; + + switch (node->type) { + case FTS_AST_TEXT: + if (node->text.ptr) { + fts_ast_string_free(node->text.ptr); + node->text.ptr = NULL; + } + break; + + case FTS_AST_TERM: + if (node->term.ptr) { + fts_ast_string_free(node->term.ptr); + node->term.ptr = NULL; + } + break; + + case FTS_AST_LIST: + case FTS_AST_SUBEXP_LIST: + fts_ast_free_list(node); + node->list.head = node->list.tail = NULL; + break; + + case FTS_AST_OPER: + break; + + default: + ut_error; + } + + /*!< Get next node before freeing the node itself */ + next_node = node->next; + + ut_free(node); + + return(next_node); +} + +/******************************************************************//** +This AST takes ownership of the expr and is responsible +for free'ing it. 
+/******************************************************************//**
+The list node takes ownership of elem and is responsible
+for freeing it.
+@return the list node passed in as "node" */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+        fts_ast_node_t* node,   /*!< in: list instance */
+        fts_ast_node_t* elem)   /*!< in: node to add to list */
+{
+        if (!elem) {
+                return(NULL);
+        }
+
+        ut_a(!elem->next);
+        ut_a(node->type == FTS_AST_LIST
+             || node->type == FTS_AST_SUBEXP_LIST);
+
+        if (!node->list.head) {
+                ut_a(!node->list.tail);
+
+                node->list.head = node->list.tail = elem;
+        } else {
+                ut_a(node->list.tail);
+
+                node->list.tail->next = elem;
+                node->list.tail = elem;
+        }
+
+        return(node);
+}
+
+/******************************************************************//**
+Track the node allocation, so that it can be freed if an error
+occurs during parsing. */
+UNIV_INTERN
+void
+fts_ast_state_add_node(
+/*===================*/
+        fts_ast_state_t*state,  /*!< in: ast instance */
+        fts_ast_node_t* node)   /*!< in: node to add to ast */
+{
+        if (!state->list.head) {
+                ut_a(!state->list.tail);
+
+                state->list.head = state->list.tail = node;
+        } else {
+                state->list.tail->next_alloc = node;
+                state->list.tail = node;
+        }
+}
+
+/******************************************************************//**
+Set the wildcard attribute of a term. */
+UNIV_INTERN
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+        fts_ast_node_t* node)   /*!< in/out: set attribute of
+                                a term node */
+{
+        if (!node) {
+                return;
+        }
+
+        /* If it's a node list, the wildcard is set on the tail node. */
+        if (node->type == FTS_AST_LIST) {
+                ut_ad(node->list.tail != NULL);
+                node = node->list.tail;
+        }
+
+        ut_a(node->type == FTS_AST_TERM);
+        ut_a(!node->term.wildcard);
+
+        node->term.wildcard = TRUE;
+}
+
+/******************************************************************//**
+Set the proximity attribute of a text node. */
+UNIV_INTERN
+void
+fts_ast_term_set_distance(
+/*======================*/
+        fts_ast_node_t* node,           /*!< in/out: text node */
+        ulint           distance)       /*!< in: the text proximity
+                                        distance */
+{
+        if (node == NULL) {
+                return;
+        }
+
+        ut_a(node->type == FTS_AST_TEXT);
+        ut_a(node->text.distance == ULINT_UNDEFINED);
+
+        node->text.distance = distance;
+}
+
+/******************************************************************//**
+Free node and expr allocations. */
+UNIV_INTERN
+void
+fts_ast_state_free(
+/*===============*/
+        fts_ast_state_t*state)  /*!< in: ast state to free */
+{
+        fts_ast_node_t* node = state->list.head;
+
+        /* Free the nodes that were allocated during parsing. */
+        while (node) {
+                fts_ast_node_t* next = node->next_alloc;
+
+                if (node->type == FTS_AST_TEXT && node->text.ptr) {
+                        fts_ast_string_free(node->text.ptr);
+                        node->text.ptr = NULL;
+                } else if (node->type == FTS_AST_TERM && node->term.ptr) {
+                        fts_ast_string_free(node->term.ptr);
+                        node->term.ptr = NULL;
+                }
+
+                ut_free(node);
+                node = next;
+        }
+
+        state->root = state->list.head = state->list.tail = NULL;
+}
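fts_ast_state_add_node() above threads every node onto a second chain (next_alloc) that is independent of the tree structure, which is what lets fts_ast_state_free() reclaim everything even when parsing aborts before the nodes are linked into a tree. A self-contained sketch of that two-chain scheme, with hypothetical names and plain calloc/free in place of the InnoDB allocators:

#include <stdlib.h>

typedef struct demo_node_t {
        struct demo_node_t*     next;           /* list/tree structure */
        struct demo_node_t*     next_alloc;     /* every allocation, in order */
} demo_node_t;

typedef struct {
        demo_node_t*    head;
        demo_node_t*    tail;
} demo_state_t;

static demo_node_t*
demo_node_create(demo_state_t* state)
{
        demo_node_t*    node = (demo_node_t*) calloc(1, sizeof(*node));

        /* Track the allocation regardless of where the node ends up. */
        if (!state->head) {
                state->head = state->tail = node;
        } else {
                state->tail->next_alloc = node;
                state->tail = node;
        }

        return(node);
}

static void
demo_state_free(demo_state_t* state)
{
        /* Walk the allocation chain, not the tree: nothing leaks. */
        for (demo_node_t* node = state->head; node != NULL; ) {
                demo_node_t*    next = node->next_alloc;

                free(node);
                node = next;
        }

        state->head = state->tail = NULL;
}

int main(void)
{
        demo_state_t    state = { NULL, NULL };

        demo_node_create(&state);
        demo_node_create(&state);       /* never linked via 'next' */
        demo_state_free(&state);        /* still freed */
        return(0);
}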
+/******************************************************************//**
+Print an ast node. */
+UNIV_INTERN
+void
+fts_ast_node_print(
+/*===============*/
+        fts_ast_node_t* node)   /*!< in: ast node to print */
+{
+        switch (node->type) {
+        case FTS_AST_TEXT:
+                printf("TEXT: ");
+                fts_ast_string_print(node->text.ptr);
+                break;
+
+        case FTS_AST_TERM:
+                printf("TERM: ");
+                fts_ast_string_print(node->term.ptr);
+                break;
+
+        case FTS_AST_LIST:
+                printf("LIST: ");
+                node = node->list.head;
+
+                while (node) {
+                        fts_ast_node_print(node);
+                        node = node->next;
+                }
+                break;
+
+        case FTS_AST_SUBEXP_LIST:
+                printf("SUBEXP_LIST: ");
+                node = node->list.head;
+
+                while (node) {
+                        fts_ast_node_print(node);
+                        node = node->next;
+                }
+                break;
+
+        case FTS_AST_OPER:
+                printf("OPER: %d\n", node->oper);
+                break;
+
+        default:
+                ut_error;
+        }
+}
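The fts_ast_visit() function that follows processes one level of the tree in up to three passes, and its header comment walks through the query 'a +b -c d +e -f'. The standalone sketch below simply replays that pass ordering over flat tokens so the union/intersect/subtract sequence is visible; it is illustrative only and shares no code with the real traversal.

#include <stdio.h>

int main(void)
{
        const char*     tokens[] = { "a", "+b", "-c", "d", "+e", "-f" };
        const char      pass_op[] = { ' ', '+', '-' };
        const char*     pass_name[] = { "union", "intersect", "subtract" };

        /* First pass, then exist pass, then ignore pass: any other order
        gives wrong results, as the header comment below explains. */
        for (int pass = 0; pass < 3; pass++) {
                for (int i = 0; i < 6; i++) {
                        char    op = (tokens[i][0] == '+'
                                      || tokens[i][0] == '-')
                                ? tokens[i][0] : ' ';

                        if (op == pass_op[pass]) {
                                printf("pass %d (%s): %s\n",
                                       pass + 1, pass_name[pass], tokens[i]);
                        }
                }
        }

        return(0);
}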
+/******************************************************************//**
+Traverse the AST - in-order traversal, except for the FTS_EXIST and FTS_IGNORE
+nodes, which will be ignored in the first pass of each level, and visited in a
+second and third pass after all other nodes in the same level are visited.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit(
+/*==========*/
+        fts_ast_oper_t          oper,           /*!< in: current operator */
+        fts_ast_node_t*         node,           /*!< in: current root node */
+        fts_ast_callback        visitor,        /*!< in: callback function */
+        void*                   arg,            /*!< in: arg for callback */
+        bool*                   has_ignore)     /*!< out: true, if the operator
+                                                was ignored during processing,
+                                                currently we ignore FTS_EXIST
+                                                and FTS_IGNORE operators */
+{
+        dberr_t                 error = DB_SUCCESS;
+        fts_ast_node_t*         oper_node = NULL;
+        fts_ast_node_t*         start_node;
+        bool                    revisit = false;
+        bool                    will_be_ignored = false;
+        fts_ast_visit_pass_t    visit_pass = FTS_PASS_FIRST;
+
+        start_node = node->list.head;
+
+        ut_a(node->type == FTS_AST_LIST
+             || node->type == FTS_AST_SUBEXP_LIST);
+
+        if (oper == FTS_EXIST_SKIP) {
+                visit_pass = FTS_PASS_EXIST;
+        } else if (oper == FTS_IGNORE_SKIP) {
+                visit_pass = FTS_PASS_IGNORE;
+        }
+
+        /* In the first pass over the tree, FTS_EXIST and FTS_IGNORE
+        operations are ignored at the leaf level; they are handled again
+        at the level above the leaves.
+
+        The basic idea is that when we encounter FTS_EXIST or FTS_IGNORE,
+        we change the operator node into FTS_EXIST_SKIP or FTS_IGNORE_SKIP,
+        and the term and text nodes under those operators are skipped in
+        the first pass. The revisit then happens in two passes: nodes with
+        FTS_EXIST_SKIP are processed in the exist pass, and nodes with
+        FTS_IGNORE_SKIP in the ignore pass.
+
+        This order must be strictly followed, or we will get wrong results.
+        For example, for the query 'a +b -c d +e -f':
+        first pass: process 'a' and 'd' by union;
+        exist pass: process '+b' and '+e' by intersection;
+        ignore pass: process '-c' and '-f' by difference. */
+
+        for (node = node->list.head;
+             node && (error == DB_SUCCESS);
+             node = node->next) {
+
+                switch (node->type) {
+                case FTS_AST_LIST:
+                        if (visit_pass != FTS_PASS_FIRST) {
+                                break;
+                        }
+
+                        error = fts_ast_visit(oper, node, visitor,
+                                              arg, &will_be_ignored);
+
+                        /* If will_be_ignored is set to true, then
+                        we encountered and ignored a FTS_EXIST or FTS_IGNORE
+                        operator. */
+                        if (will_be_ignored) {
+                                revisit = true;
+                                /* Remember the operator for this list: for
+                                a query like '-abc&def', the ignored operator
+                                comes from the node preceding the list. */
+                                node->oper = oper;
+                        }
+
+                        break;
+
+                case FTS_AST_OPER:
+                        oper = node->oper;
+                        oper_node = node;
+
+                        /* Change the operator for the revisit. */
+                        if (oper == FTS_EXIST) {
+                                oper_node->oper = FTS_EXIST_SKIP;
+                        } else if (oper == FTS_IGNORE) {
+                                oper_node->oper = FTS_IGNORE_SKIP;
+                        }
+
+                        break;
+
+                default:
+                        if (node->visited) {
+                                continue;
+                        }
+
+                        ut_a(oper == FTS_NONE || !oper_node
+                             || oper_node->oper == oper
+                             || oper_node->oper == FTS_EXIST_SKIP
+                             || oper_node->oper == FTS_IGNORE_SKIP);
+
+                        if (oper == FTS_EXIST || oper == FTS_IGNORE) {
+                                *has_ignore = true;
+                                continue;
+                        }
+
+                        /* Process the leaf node according to its pass. */
+                        if (oper == FTS_EXIST_SKIP
+                            && visit_pass == FTS_PASS_EXIST) {
+                                error = visitor(FTS_EXIST, node, arg);
+                                node->visited = true;
+                        } else if (oper == FTS_IGNORE_SKIP
+                                   && visit_pass == FTS_PASS_IGNORE) {
+                                error = visitor(FTS_IGNORE, node, arg);
+                                node->visited = true;
+                        } else if (visit_pass == FTS_PASS_FIRST) {
+                                error = visitor(oper, node, arg);
+                                node->visited = true;
+                        }
+                }
+        }
+
+        if (revisit) {
+                /* The exist pass processes the skipped FTS_EXIST operations. */
+                for (node = start_node;
+                     node && error == DB_SUCCESS;
+                     node = node->next) {
+
+                        if (node->type == FTS_AST_LIST
+                            && node->oper != FTS_IGNORE) {
+                                error = fts_ast_visit(FTS_EXIST_SKIP, node,
+                                        visitor, arg, &will_be_ignored);
+                        }
+                }
+
+                /* The ignore pass processes the skipped FTS_IGNORE operations. */
+                for (node = start_node;
+                     node && error == DB_SUCCESS;
+                     node = node->next) {
+
+                        if (node->type == FTS_AST_LIST) {
+                                error = fts_ast_visit(FTS_IGNORE_SKIP, node,
+                                        visitor, arg, &will_be_ignored);
+                        }
+                }
+        }
+
+        return(error);
+}
+
+/**
+Create an ast string object with a NUL terminator, so the string
+occupies one more byte than len
+@param[in] str          pointer to string
+@param[in] len          length of the string
+@return ast string with NUL-terminator */
+UNIV_INTERN
+fts_ast_string_t*
+fts_ast_string_create(
+        const byte*     str,
+        ulint           len)
+{
+        fts_ast_string_t*       ast_str;
+
+        ut_ad(len > 0);
+
+        ast_str = static_cast<fts_ast_string_t*>
+                        (ut_malloc(sizeof(fts_ast_string_t)));
+        ast_str->str = static_cast<byte*>(ut_malloc(len + 1));
+
+        ast_str->len = len;
+        memcpy(ast_str->str, str, len);
+        ast_str->str[len] = '\0';
+
+        return(ast_str);
+}
+
+/**
+Free an ast string instance
+@param[in,out] ast_str  string to free */
+UNIV_INTERN
+void
+fts_ast_string_free(
+        fts_ast_string_t*       ast_str)
+{
+        if (ast_str != NULL) {
+                ut_free(ast_str->str);
+                ut_free(ast_str);
+        }
+}
+
+/**
+Translate an ast string of type FTS_AST_NUMB to an unsigned long
+with strtoul()
+@param[in] ast_str      string to translate
+@param[in] base         the base
+@return translated number */
+UNIV_INTERN
+ulint
+fts_ast_string_to_ul(
+        const fts_ast_string_t* ast_str,
+        int                     base)
+{
+        return(strtoul(reinterpret_cast<const char*>(ast_str->str),
+                       NULL, base));
+}
+
+/**
+Print the ast string
+@param[in] ast_str      string to print */
+UNIV_INTERN
+void
+fts_ast_string_print(
+        const fts_ast_string_t* ast_str)
+{
+        for (ulint i = 0; i < ast_str->len; ++i) {
+                printf("%c", ast_str->str[i]);
+        }
+
+        printf("\n");
+}
diff --git a/storage/xtradb/fts/fts0blex.cc b/storage/xtradb/fts/fts0blex.cc
new file mode 100644
index 00000000000..7d0acb00a3b
--- /dev/null
+++ b/storage/xtradb/fts/fts0blex.cc
@@ -0,0 +1,1957 @@
+#include "univ.i"
+#line 2 "fts0blex.cc"
+
+#line 4 "fts0blex.cc"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
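Everything from here to the end of fts0blex.cc is machine-generated by flex 2.5.35 from fts0blex.l (the grammar source appears further down in this file), using the fts0b symbol prefix and the reentrant model, so all scanner state lives behind an opaque yyscan_t handle instead of globals. A schematic driver for such a scanner is sketched below: the extern declarations are abridged stand-ins for prototypes declared later in this file, the entry point is fts_blexer() because fts0blex.l redefines YY_DECL, and the snippet only links when built together with the generated scanner and the fts0pars.h parser types.

#include <stdio.h>

#include "fts0ast.h"
#include "fts0pars.h"   /* YYSTYPE and the FTS_NUMB/FTS_TERM/FTS_TEXT codes */

typedef void*   yyscan_t;       /* opaque handle, as typedef'd below */

extern int      fts0blex_init(yyscan_t*);
extern int      fts0blex_destroy(yyscan_t);
extern void*    fts0b_scan_bytes(const char*, int, yyscan_t);
extern int      fts_blexer(YYSTYPE*, yyscan_t); /* YY_DECL from fts0blex.l */

int main(void)
{
        yyscan_t        scanner;
        YYSTYPE         val;
        int             token;

        if (fts0blex_init(&scanner) != 0) {
                return(1);
        }

        /* Scan an in-memory buffer; the bytes may legally contain NULs. */
        fts0b_scan_bytes("apple +juice", 12, scanner);

        /* 0 (YY_NULL) marks end of input; each call yields one token. */
        while ((token = fts_blexer(&val, scanner)) != 0) {
                printf("token code: %d\n", token);
        }

        fts0blex_destroy(scanner);
        return(0);
}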
+ +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. 
+ */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE fts0brestart(yyin ,yyscanner ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + +#define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via fts0brestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. 
+ */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void fts0brestart (FILE *input_file ,yyscan_t yyscanner ); +void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void fts0bpop_buffer_state (yyscan_t yyscanner ); + +static void fts0bensure_buffer_stack (yyscan_t yyscanner ); +static void fts0b_load_buffer_state (yyscan_t yyscanner ); +static void fts0b_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner ); + +#define YY_FLUSH_BUFFER fts0b_flush_buffer(YY_CURRENT_BUFFER ,yyscanner) + +YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); + +void *fts0balloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); +void *fts0brealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); +void fts0bfree (void * , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); + +#define yy_new_buffer fts0b_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + fts0bensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + fts0bensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define fts0bwrap(n) 1 +#define YY_SKIP_YYWRAP + +typedef unsigned char YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state (yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ,yyscan_t yyscanner); +static int yy_get_next_buffer (yyscan_t yyscanner ); +static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = static_cast<int>(yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; + +#define YY_NUM_RULES 7 +#define YY_END_OF_BUFFER 8 +/* This struct is not used in this scanner, + but its presence is necessary. */ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[19] = + { 0, + 4, 4, 8, 4, 1, 6, 1, 7, 7, 2, + 3, 4, 1, 1, 0, 5, 3, 0 + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 7, + 7, 7, 7, 1, 7, 1, 1, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 1, 1, 7, + 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[9] = + { 0, + 1, 2, 3, 4, 5, 5, 5, 1 + } ; + +static yyconst flex_int16_t yy_base[22] = + { 0, + 0, 0, 22, 0, 7, 23, 0, 14, 23, 23, + 7, 0, 0, 0, 5, 23, 0, 23, 11, 12, + 16 + } ; + +static yyconst flex_int16_t yy_def[22] = + { 0, + 18, 1, 18, 19, 19, 18, 20, 21, 18, 18, + 19, 19, 5, 20, 21, 18, 11, 0, 18, 18, + 18 + } ; + +static yyconst flex_int16_t yy_nxt[32] = + { 0, + 4, 5, 6, 7, 8, 9, 10, 11, 13, 16, + 14, 12, 12, 14, 17, 14, 15, 15, 16, 15, + 15, 18, 3, 18, 18, 18, 18, 18, 18, 18, + 18 + } ; + +static yyconst flex_int16_t yy_chk[32] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 5, 15, + 5, 19, 19, 20, 11, 20, 21, 21, 8, 21, + 21, 3, 18, 18, 18, 18, 18, 18, 18, 18, + 18 + } ; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. 
+ */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "fts0blex.l" +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/** + * @file fts/fts0blex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ +#line 27 "fts0blex.l" + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner) + +#define YY_NO_INPUT 1 +#line 484 "fts0blex.cc" + +#define INITIAL 0 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t +{ + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + int yy_n_chars; + int yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + +}; /* end struct yyguts_t */ + +static int yy_init_globals (yyscan_t yyscanner ); + +int fts0blex_init (yyscan_t* scanner); + +int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. 
*/ + +int fts0blex_destroy (yyscan_t yyscanner ); + +int fts0bget_debug (yyscan_t yyscanner ); + +void fts0bset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner ); + +void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *fts0bget_in (yyscan_t yyscanner ); + +void fts0bset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *fts0bget_out (yyscan_t yyscanner ); + +void fts0bset_out (FILE * out_str ,yyscan_t yyscanner ); + +int fts0bget_leng (yyscan_t yyscanner ); + +char *fts0bget_text (yyscan_t yyscanner ); + +int fts0bget_lineno (yyscan_t yyscanner ); + +void fts0bset_lineno (int line_number ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int fts0bwrap (yyscan_t yyscanner ); +#else +extern int fts0bwrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (yyscan_t yyscanner ); +#else +static int input (yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < static_cast<int>(max_size) && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = static_cast<int>(fread(buf, 1, max_size, yyin))) \ + == 0 && ferror(yyin) ) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. 
*/ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int fts0blex (yyscan_t yyscanner); + +#define YY_DECL int fts0blex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + +#line 43 "fts0blex.l" + + +#line 712 "fts0blex.cc" + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + fts0bensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + fts0b_load_buffer_state(yyscanner ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_current_state != 18 ); + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. 
*/ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 45 "fts0blex.l" +/* Ignore whitespace */ ; + YY_BREAK +case 2: +YY_RULE_SETUP +#line 47 "fts0blex.l" +{ + val->oper = fts0bget_text(yyscanner)[0]; + + return(val->oper); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 53 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_NUMB); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 59 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TERM); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 65 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TEXT); +} + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 71 "fts0blex.l" + + YY_BREAK +case 7: +YY_RULE_SETUP +#line 73 "fts0blex.l" +ECHO; + YY_BREAK +#line 843 "fts0blex.cc" +case YY_STATE_EOF(INITIAL): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * fts0blex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. 
*/ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( fts0bwrap(yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of fts0blex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = yyg->yytext_ptr; + register int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read = static_cast<int>( + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1); + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. 
*/ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = static_cast<int>(b->yy_buf_size * 2); + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + fts0brealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = static_cast<int>( + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1); + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. */ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + fts0brestart(yyin ,yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0brealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ +static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + register int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. 
*/ + register char *yy_cp = yyg->yy_c_buf_p; + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 18); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + int offset = yyg->yy_c_buf_p - yyg->yytext_ptr; + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + fts0brestart(yyin ,yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( fts0bwrap(yyscanner ) ) + return EOF; + + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ +void fts0brestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! YY_CURRENT_BUFFER ){ + fts0bensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + fts0b_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner); + fts0b_load_buffer_state(yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * @param yyscanner The scanner object. + */ +void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * fts0bpop_buffer_state(); + * fts0bpush_buffer_state(new_buffer); + */ + fts0bensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. 
*/ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + fts0b_load_buffer_state(yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (fts0bwrap()) processing, but the only time this flag + * is looked at is after fts0bwrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void fts0b_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ +YY_BUFFER_STATE fts0b_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) fts0balloc(b->yy_buf_size + 2 ,yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + fts0b_init_buffer(b,file ,yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with fts0b_create_buffer() + * @param yyscanner The scanner object. + */ +void fts0b_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + fts0bfree((void *) b->yy_ch_buf ,yyscanner ); + + fts0bfree((void *) b ,yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a fts0brestart() or at EOF. + */ +static void fts0b_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + fts0b_flush_buffer(b ,yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then fts0b_init_buffer was _probably_ + * called from fts0brestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. + */ +void fts0b_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. 
The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + fts0b_load_buffer_state(yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + fts0bensure_buffer_stack(yyscanner); + + /* This block is copied from fts0b_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from fts0b_switch_to_buffer. */ + fts0b_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. + */ +void fts0bpop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + fts0b_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void fts0bensure_buffer_stack (yyscan_t yyscanner) +{ + int num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0balloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = static_cast<int>(yyg->yy_buffer_stack_max + grow_size); + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0brealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! 
yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE fts0b_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = 0; + b->yy_n_chars = static_cast<int>(b->yy_buf_size); + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + fts0b_switch_to_buffer(b ,yyscanner ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to fts0blex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * fts0b_scan_bytes() instead. + */ +YY_BUFFER_STATE fts0b_scan_string (yyconst char * yystr , yyscan_t yyscanner) +{ + return fts0b_scan_bytes(yystr,static_cast<int>(strlen(yystr)), yyscanner); +} + +/** Setup the input buffer state to scan the given bytes. The next call to fts0blex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE fts0b_scan_bytes (yyconst char * yybytes, int _yybytes_len , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = _yybytes_len + 2; + buf = (char *) fts0balloc(n ,yyscanner ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = fts0b_scan_buffer(buf,n ,yyscanner); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in fts0b_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. 
+ */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = yyg->yy_hold_char; \ + yyg->yy_c_buf_p = yytext + yyless_macro_arg; \ + yyg->yy_hold_char = *yyg->yy_c_buf_p; \ + *yyg->yy_c_buf_p = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the user-defined data for this scanner. + * @param yyscanner The scanner object. + */ +YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyextra; +} + +/** Get the current line number. + * @param yyscanner The scanner object. + */ +int fts0bget_lineno (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yylineno; +} + +/** Get the current column number. + * @param yyscanner The scanner object. + */ +int fts0bget_column (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yycolumn; +} + +/** Get the input stream. + * @param yyscanner The scanner object. + */ +FILE *fts0bget_in (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyin; +} + +/** Get the output stream. + * @param yyscanner The scanner object. + */ +FILE *fts0bget_out (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyout; +} + +/** Get the length of the current token. + * @param yyscanner The scanner object. + */ +int fts0bget_leng (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyleng; +} + +/** Get the current token. + * @param yyscanner The scanner object. + */ + +char *fts0bget_text (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yytext; +} + +/** Set the user-defined data. This data is never touched by the scanner. + * @param user_defined The data to be associated with this scanner. + * @param yyscanner The scanner object. + */ +void fts0bset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyextra = user_defined ; +} + +/** Set the current line number. + * @param line_number + * @param yyscanner The scanner object. + */ +void fts0bset_lineno (int line_number , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* lineno is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + yy_fatal_error( "fts0bset_lineno called with no buffer" , yyscanner); + + yylineno = line_number; +} + +/** Set the current column. + * @param line_number + * @param yyscanner The scanner object. 
+ */ +void fts0bset_column (int column_no , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* column is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + yy_fatal_error( "fts0bset_column called with no buffer" , yyscanner); + + yycolumn = column_no; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. + * @param yyscanner The scanner object. + * @see fts0b_switch_to_buffer + */ +void fts0bset_in (FILE * in_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyin = in_str ; +} + +void fts0bset_out (FILE * out_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyout = out_str ; +} + +int fts0bget_debug (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yy_flex_debug; +} + +void fts0bset_debug (int bdebug , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yy_flex_debug = bdebug ; +} + +/* Accessor methods for yylval and yylloc */ + +/* User-visible API */ + +/* fts0blex_init is special because it creates the scanner itself, so it is + * the ONLY reentrant function that doesn't take the scanner as the last argument. + * That's why we explicitly handle the declaration, instead of using our macros. + */ + +int fts0blex_init(yyscan_t* ptr_yy_globals) + +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* fts0blex_init_extra has the same functionality as fts0blex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to fts0balloc in + * the yyextra field. + */ + +int fts0blex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals ) + +{ + struct yyguts_t dummy_yyguts; + + fts0bset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + fts0bset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from fts0blex_destroy(), so don't allocate here. 
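The dummy_yyguts maneuver in fts0blex_init_extra() above deserves a note: fts0balloc() takes the scanner as an argument so that a redefined allocator could consult yyextra, yet the real yyguts_t does not exist when the scanner itself is being allocated, so a stack dummy carrying the user value stands in for it. A hypothetical allocator that would depend on this (my_arena and fts0balloc_ex are invented names; yy_size_t and yyscan_t are the types from this generated file):

#include <stdlib.h>

struct my_arena {
        void*   (*alloc)(size_t);       /* user-supplied allocation hook */
};

/* If fts0balloc() were replaced with something like this, passing
&dummy_yyguts above is exactly what lets the very first allocation,
the one creating the scanner struct, still find the arena. */
void*
fts0balloc_ex(yy_size_t size, yyscan_t yyscanner)
{
        struct my_arena*        arena;

        arena = (struct my_arena*) fts0bget_extra(yyscanner);

        return(arena != NULL ? arena->alloc(size) : malloc(size));
}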
+ */
+
+        yyg->yy_buffer_stack = 0;
+        yyg->yy_buffer_stack_top = 0;
+        yyg->yy_buffer_stack_max = 0;
+        yyg->yy_c_buf_p = (char *) 0;
+        yyg->yy_init = 0;
+        yyg->yy_start = 0;
+
+        yyg->yy_start_stack_ptr = 0;
+        yyg->yy_start_stack_depth = 0;
+        yyg->yy_start_stack = NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+        yyin = stdin;
+        yyout = stdout;
+#else
+        yyin = (FILE *) 0;
+        yyout = (FILE *) 0;
+#endif
+
+        /* For future reference: Set errno on error, since we are called by
+         * fts0blex_init()
+         */
+        return 0;
+}
+
+/* fts0blex_destroy is for both reentrant and non-reentrant scanners. */
+int fts0blex_destroy (yyscan_t yyscanner)
+{
+        struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+        /* Pop the buffer stack, destroying each element. */
+        while(YY_CURRENT_BUFFER){
+                fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner );
+                YY_CURRENT_BUFFER_LVALUE = NULL;
+                fts0bpop_buffer_state(yyscanner);
+        }
+
+        /* Destroy the stack itself. */
+        fts0bfree(yyg->yy_buffer_stack ,yyscanner);
+        yyg->yy_buffer_stack = NULL;
+
+        /* Destroy the start condition stack. */
+        fts0bfree(yyg->yy_start_stack ,yyscanner );
+        yyg->yy_start_stack = NULL;
+
+        /* Reset the globals. This is important in a non-reentrant scanner so the next time
+         * fts0blex() is called, initialization will occur. */
+        yy_init_globals( yyscanner);
+
+        /* Destroy the main struct (reentrant only). */
+        fts0bfree ( yyscanner , yyscanner );
+        yyscanner = NULL;
+        return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused)))
+{
+        register int i;
+        for ( i = 0; i < n; ++i )
+                s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused)))
+{
+        register int n;
+        for ( n = 0; s[n]; ++n )
+                ;
+
+        return n;
+}
+#endif
+
+void *fts0balloc (yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+        return (void *) malloc( size );
+}
+
+void *fts0brealloc (void * ptr, yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+        /* The cast to (char *) in the following accommodates both
+         * implementations that use char* generic pointers, and those
+         * that use void* generic pointers. It works with the latter
+         * because both ANSI C and C++ allow castless assignment from
+         * any pointer type to void*, and deal with argument conversions
+         * as though doing an assignment.
+         */
+        return (void *) realloc( (char *) ptr, size );
+}
+
+void fts0bfree (void * ptr , yyscan_t yyscanner __attribute__((unused)))
+{
+        free( (char *) ptr );   /* see fts0brealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 73 "fts0blex.l"
+
diff --git a/storage/xtradb/fts/fts0blex.l b/storage/xtradb/fts/fts0blex.l
new file mode 100644
index 00000000000..ae6e8ffaa48
--- /dev/null
+++ b/storage/xtradb/fts/fts0blex.l
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+%%
+
+[\t ]+  /* Ignore whitespace */ ;
+
+[*()+\-<>~@]    {
+        val->oper = fts0bget_text(yyscanner)[0];
+
+        return(val->oper);
+}
+
+[0-9]+  {
+        val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+        return(FTS_NUMB);
+}
+
+[^" \n*()+\-<>~@%]*     {
+        val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+        return(FTS_TERM);
+}
+
+\"[^\"\n]*\"    {
+        val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+        return(FTS_TEXT);
+}
+
+\n
+
+%%
diff --git a/storage/xtradb/fts/fts0config.cc b/storage/xtradb/fts/fts0config.cc
new file mode 100644
index 00000000000..5b4ae5c39f7
--- /dev/null
+++ b/storage/xtradb/fts/fts0config.cc
@@ -0,0 +1,564 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fts/fts0config.cc +Full Text Search configuration table. + +Created 2007/5/9 Sunny Bains +***********************************************************************/ + +#include "trx0roll.h" +#include "row0sel.h" + +#include "fts0priv.h" + +#ifndef UNIV_NONINL +#include "fts0types.ic" +#include "fts0vlc.ic" +#endif + +/******************************************************************//** +Callback function for fetching the config value. +@return always returns TRUE */ +static +ibool +fts_config_fetch_value( +/*===================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to + ib_vector_t */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_string_t* value = static_cast<fts_string_t*>(user_arg); + + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + void* data = dfield_get_data(dfield); + + ut_a(dtype_get_mtype(type) == DATA_VARCHAR); + + if (len != UNIV_SQL_NULL) { + ulint max_len = ut_min(value->f_len - 1, len); + + memcpy(value->f_str, data, max_len); + value->f_len = max_len; + value->f_str[value->f_len] = '\0'; + } + + return(TRUE); +} + +/******************************************************************//** +Get value from the config table. The caller must ensure that enough +space is allocated for value to hold the column contents. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_get_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + pars_info_t* info; + que_t* graph; + dberr_t error; + ulint name_len = strlen(name); + + info = pars_info_create(); + + *value->f_str = '\0'; + ut_a(value->f_len > 0); + + pars_info_bind_function(info, "my_func", fts_config_fetch_value, + value); + + /* The len field of value must be set to the max bytes that + it can hold. On a successful read, the len field will be set + to the actual number of bytes copied to value. */ + pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len); + + fts_table->suffix = "CONFIG"; + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS SELECT value FROM \"%s\"" + " WHERE key = :name;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + trx->op_info = "getting FTS config value"; + + error = fts_eval_sql(trx, graph); + + mutex_enter(&dict_sys->mutex); + que_graph_free(graph); + mutex_exit(&dict_sys->mutex); + + return(error); +} + +/*********************************************************************//** +Create the config table name for retrieving index specific value. 
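The calling convention of fts_config_get_value() above is easy to get wrong: value->f_len must hold the buffer capacity on entry, the fetch callback truncates to ut_min(value->f_len - 1, len) and NUL-terminates, and f_len carries the actual length on return. A sketch of a conforming caller, mirroring fts_config_get_ulint() later in this file ("cache_size_in_mb" is one of the real config keys; trx and fts_table are assumed to be set up as elsewhere here):

fts_string_t    value;
dberr_t         error;

value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));

error = fts_config_get_value(trx, fts_table, "cache_size_in_mb", &value);

if (error == DB_SUCCESS) {
        /* value.f_str is NUL-terminated; f_len excludes the NUL. */
        ulint   cache_size = strtoul((char*) value.f_str, NULL, 10);
}

ut_free(value.f_str);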
+@return index config parameter name */ +UNIV_INTERN +char* +fts_config_create_index_param_name( +/*===============================*/ + const char* param, /*!< in: base name of param */ + const dict_index_t* index) /*!< in: index for config */ +{ + ulint len; + char* name; + + /* The format of the config name is: name_<index_id>. */ + len = strlen(param); + + /* Caller is responsible for deleting name. */ + name = static_cast<char*>(ut_malloc( + len + FTS_AUX_MIN_TABLE_ID_LENGTH + 2)); + strcpy(name, param); + name[len] = '_'; + + fts_write_object_id(index->id, name + len + 1, + DICT_TF2_FLAG_IS_SET(index->table, + DICT_TF2_FTS_AUX_HEX_NAME)); + + return(name); +} + +/******************************************************************//** +Get value specific to an FTS index from the config table. The caller +must ensure that enough space is allocated for value to hold the +column contents. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_get_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + char* name; + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, + index->table); + + /* We are responsible for free'ing name. */ + name = fts_config_create_index_param_name(param, index); + + error = fts_config_get_value(trx, &fts_table, name, value); + + ut_free(name); + + return(error); +} + +/******************************************************************//** +Set the value in the config table for name. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_set_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + const fts_string_t* + value) /*!< in: value to update */ +{ + pars_info_t* info; + que_t* graph; + dberr_t error; + undo_no_t undo_no; + undo_no_t n_rows_updated; + ulint name_len = strlen(name); + + info = pars_info_create(); + + pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len); + pars_info_bind_varchar_literal(info, "value", + value->f_str, value->f_len); + + fts_table->suffix = "CONFIG"; + + graph = fts_parse_sql( + fts_table, info, + "BEGIN UPDATE \"%s\" SET value = :value WHERE key = :name;"); + + trx->op_info = "setting FTS config value"; + + undo_no = trx->undo_no; + + error = fts_eval_sql(trx, graph); + + fts_que_graph_free_check_lock(fts_table, NULL, graph); + + n_rows_updated = trx->undo_no - undo_no; + + /* Check if we need to do an insert. */ + if (n_rows_updated == 0) { + info = pars_info_create(); + + pars_info_bind_varchar_literal( + info, "name", (byte*) name, name_len); + + pars_info_bind_varchar_literal( + info, "value", value->f_str, value->f_len); + + graph = fts_parse_sql( + fts_table, info, + "BEGIN\n" + "INSERT INTO \"%s\" VALUES(:name, :value);"); + + trx->op_info = "inserting FTS config value"; + + error = fts_eval_sql(trx, graph); + + fts_que_graph_free_check_lock(fts_table, NULL, graph); + } + + return(error); +} + +/******************************************************************//** +Set the value specific to an FTS index in the config table. 
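fts_config_set_value() above gets by without a row-count API: every row modified through the InnoDB internal SQL layer appends one undo record, so the number of rows the UPDATE touched is simply the growth of trx->undo_no, and zero growth means the key was absent and an INSERT is needed. The upsert in outline (update_graph and insert_graph stand for the two parsed graphs built in the function):

undo_no = trx->undo_no;

error = fts_eval_sql(trx, update_graph);        /* UPDATE ... WHERE key = :name */

if (trx->undo_no - undo_no == 0) {
        /* No row matched: the parameter is new, insert it. */
        error = fts_eval_sql(trx, insert_graph);
}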
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_set_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + char* name; + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, + index->table); + + /* We are responsible for free'ing name. */ + name = fts_config_create_index_param_name(param, index); + + error = fts_config_set_value(trx, &fts_table, name, value); + + ut_free(name); + + return(error); +} + +/******************************************************************//** +Get an ulint value from the config table. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +fts_config_get_index_ulint( +/*=======================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* name, /*!< in: param name */ + ulint* int_value) /*!< out: value */ +{ + dberr_t error; + fts_string_t value; + + /* We set the length of value to the max bytes it can hold. This + information is used by the callback that reads the value.*/ + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1)); + + error = fts_config_get_index_value(trx, index, name, &value); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ut_print_timestamp(stderr); + + fprintf(stderr, " InnoDB: Error: (%s) reading `%s'\n", + ut_strerr(error), name); + } else { + *int_value = strtoul((char*) value.f_str, NULL, 10); + } + + ut_free(value.f_str); + + return(error); +} + +/******************************************************************//** +Set an ulint value in the config table. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +fts_config_set_index_ulint( +/*=======================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* name, /*!< in: param name */ + ulint int_value) /*!< in: value */ +{ + dberr_t error; + fts_string_t value; + + /* We set the length of value to the max bytes it can hold. This + information is used by the callback that reads the value.*/ + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1)); + + // FIXME: Get rid of snprintf + ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN); + + value.f_len = ut_snprintf( + (char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value); + + error = fts_config_set_index_value(trx, index, name, &value); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ut_print_timestamp(stderr); + + fprintf(stderr, " InnoDB: Error: (%s) writing `%s'\n", + ut_strerr(error), name); + } + + ut_free(value.f_str); + + return(error); +} + +/******************************************************************//** +Get an ulint value from the config table. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +fts_config_get_ulint( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: param name */ + ulint* int_value) /*!< out: value */ +{ + dberr_t error; + fts_string_t value; + + /* We set the length of value to the max bytes it can hold. 
This + information is used by the callback that reads the value.*/ + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1)); + + error = fts_config_get_value(trx, fts_table, name, &value); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ut_print_timestamp(stderr); + + fprintf(stderr, " InnoDB: Error: (%s) reading `%s'\n", + ut_strerr(error), name); + } else { + *int_value = strtoul((char*) value.f_str, NULL, 10); + } + + ut_free(value.f_str); + + return(error); +} + +/******************************************************************//** +Set an ulint value in the config table. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +fts_config_set_ulint( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: param name */ + ulint int_value) /*!< in: value */ +{ + dberr_t error; + fts_string_t value; + + /* We set the length of value to the max bytes it can hold. This + information is used by the callback that reads the value.*/ + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1)); + + // FIXME: Get rid of snprintf + ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN); + + value.f_len = snprintf( + (char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value); + + error = fts_config_set_value(trx, fts_table, name, &value); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ut_print_timestamp(stderr); + + fprintf(stderr, " InnoDB: Error: (%s) writing `%s'\n", + ut_strerr(error), name); + } + + ut_free(value.f_str); + + return(error); +} + +/******************************************************************//** +Increment the value in the config table for column name. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_increment_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: increment config value + for this parameter name */ + ulint delta) /*!< in: increment by this + much */ +{ + dberr_t error; + fts_string_t value; + que_t* graph = NULL; + ulint name_len = strlen(name); + pars_info_t* info = pars_info_create(); + + /* We set the length of value to the max bytes it can hold. 
This + information is used by the callback that reads the value.*/ + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1)); + + *value.f_str = '\0'; + + pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len); + + pars_info_bind_function( + info, "my_func", fts_config_fetch_value, &value); + + fts_table->suffix = "CONFIG"; + + graph = fts_parse_sql( + fts_table, info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS SELECT value FROM \"%s\"" + " WHERE key = :name FOR UPDATE;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + trx->op_info = "read FTS config value"; + + error = fts_eval_sql(trx, graph); + + fts_que_graph_free_check_lock(fts_table, NULL, graph); + + if (UNIV_UNLIKELY(error == DB_SUCCESS)) { + ulint int_value; + + int_value = strtoul((char*) value.f_str, NULL, 10); + + int_value += delta; + + ut_a(FTS_MAX_CONFIG_VALUE_LEN > FTS_MAX_INT_LEN); + + // FIXME: Get rid of snprintf + value.f_len = snprintf( + (char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value); + + fts_config_set_value(trx, fts_table, name, &value); + } + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + + ut_print_timestamp(stderr); + + fprintf(stderr, " InnoDB: Error: (%s) " + "while incrementing %s.\n", ut_strerr(error), name); + } + + ut_free(value.f_str); + + return(error); +} + +/******************************************************************//** +Increment the per index value in the config table for column name. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_increment_index_value( +/*=============================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* param, /*!< in: increment config value + for this parameter name */ + ulint delta) /*!< in: increment by this + much */ +{ + char* name; + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, + index->table); + + /* We are responsible for free'ing name. */ + name = fts_config_create_index_param_name(param, index); + + error = fts_config_increment_value(trx, &fts_table, name, delta); + + ut_free(name); + + return(error); +} + diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc new file mode 100644 index 00000000000..c2d3f154075 --- /dev/null +++ b/storage/xtradb/fts/fts0fts.cc @@ -0,0 +1,7550 @@ +/***************************************************************************** + +Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0fts.cc +Full Text Search interface +***********************************************************************/ + +#include "trx0roll.h" +#include "row0mysql.h" +#include "row0upd.h" +#include "dict0types.h" +#include "row0sel.h" + +#include "fts0fts.h" +#include "fts0priv.h" +#include "fts0types.h" + +#include "fts0types.ic" +#include "fts0vlc.ic" +#include "dict0priv.h" +#include "dict0stats.h" +#include "btr0pcur.h" +#include <vector> + +#include "ha_prototypes.h" + +#define FTS_MAX_ID_LEN 32 + +/** Column name from the FTS config table */ +#define FTS_MAX_CACHE_SIZE_IN_MB "cache_size_in_mb" + +/** Verify if a aux table name is a obsolete table +by looking up the key word in the obsolete table names */ +#define FTS_IS_OBSOLETE_AUX_TABLE(table_name) \ + (strstr((table_name), "DOC_ID") != NULL \ + || strstr((table_name), "ADDED") != NULL \ + || strstr((table_name), "STOPWORDS") != NULL) + +/** This is maximum FTS cache for each table and would be +a configurable variable */ +UNIV_INTERN ulong fts_max_cache_size; + +/** Whether the total memory used for FTS cache is exhausted, and we will +need a sync to free some memory */ +UNIV_INTERN bool fts_need_sync = false; + +/** Variable specifying the total memory allocated for FTS cache */ +UNIV_INTERN ulong fts_max_total_cache_size; + +/** This is FTS result cache limit for each query and would be +a configurable variable */ +UNIV_INTERN ulong fts_result_cache_limit; + +/** Variable specifying the maximum FTS max token size */ +UNIV_INTERN ulong fts_max_token_size; + +/** Variable specifying the minimum FTS max token size */ +UNIV_INTERN ulong fts_min_token_size; + + +// FIXME: testing +ib_time_t elapsed_time = 0; +ulint n_nodes = 0; + +/** Error condition reported by fts_utf8_decode() */ +const ulint UTF8_ERROR = 0xFFFFFFFF; + +/** The cache size permissible lower limit (1K) */ +static const ulint FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB = 1; + +/** The cache size permissible upper limit (1G) */ +static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024; + +/** Time to sleep after DEADLOCK error before retrying operation. */ +static const ulint FTS_DEADLOCK_RETRY_WAIT = 100000; + +#ifdef UNIV_PFS_RWLOCK +UNIV_INTERN mysql_pfs_key_t fts_cache_rw_lock_key; +UNIV_INTERN mysql_pfs_key_t fts_cache_init_rw_lock_key; +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t fts_delete_mutex_key; +UNIV_INTERN mysql_pfs_key_t fts_optimize_mutex_key; +UNIV_INTERN mysql_pfs_key_t fts_bg_threads_mutex_key; +UNIV_INTERN mysql_pfs_key_t fts_doc_id_mutex_key; +UNIV_INTERN mysql_pfs_key_t fts_pll_tokenize_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/** variable to record innodb_fts_internal_tbl_name for information +schema table INNODB_FTS_INSERTED etc. */ +UNIV_INTERN char* fts_internal_tbl_name = NULL; + +/** InnoDB default stopword list: +There are different versions of stopwords, the stop words listed +below comes from "Google Stopword" list. Reference: +http://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list. 
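FTS_IS_OBSOLETE_AUX_TABLE above classifies a table by substring match alone, so any name embedding one of the retired suffixes qualifies. A standalone equivalent, with hypothetical table names in the checks:

#include <assert.h>
#include <string.h>

/* Same test as the macro above, written as a plain function. */
static int
is_obsolete_aux_table(const char* name)
{
        return(strstr(name, "DOC_ID") != NULL
               || strstr(name, "ADDED") != NULL
               || strstr(name, "STOPWORDS") != NULL);
}

int
main(void)
{
        assert(is_obsolete_aux_table("test/FTS_0000017_ADDED"));
        assert(!is_obsolete_aux_table("test/FTS_0000017_CONFIG"));
        return(0);
}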
+The final version of InnoDB default stopword list is still pending +for decision */ +const char *fts_default_stopword[] = +{ + "a", + "about", + "an", + "are", + "as", + "at", + "be", + "by", + "com", + "de", + "en", + "for", + "from", + "how", + "i", + "in", + "is", + "it", + "la", + "of", + "on", + "or", + "that", + "the", + "this", + "to", + "was", + "what", + "when", + "where", + "who", + "will", + "with", + "und", + "the", + "www", + NULL +}; + +/** For storing table info when checking for orphaned tables. */ +struct fts_aux_table_t { + table_id_t id; /*!< Table id */ + table_id_t parent_id; /*!< Parent table id */ + table_id_t index_id; /*!< Table FT index id */ + char* name; /*!< Name of the table */ +}; + +/** SQL statements for creating the ancillary common FTS tables. */ +static const char* fts_create_common_tables_sql = { + "BEGIN\n" + "" + "CREATE TABLE \"%s_DELETED\" (\n" + " doc_id BIGINT UNSIGNED\n" + ") COMPACT;\n" + "CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_DELETED\"(doc_id);\n" + "" + "CREATE TABLE \"%s_DELETED_CACHE\" (\n" + " doc_id BIGINT UNSIGNED\n" + ") COMPACT;\n" + "CREATE UNIQUE CLUSTERED INDEX IND " + "ON \"%s_DELETED_CACHE\"(doc_id);\n" + "" + "CREATE TABLE \"%s_BEING_DELETED\" (\n" + " doc_id BIGINT UNSIGNED\n" + ") COMPACT;\n" + "CREATE UNIQUE CLUSTERED INDEX IND " + "ON \"%s_BEING_DELETED\"(doc_id);\n" + "" + "CREATE TABLE \"%s_BEING_DELETED_CACHE\" (\n" + " doc_id BIGINT UNSIGNED\n" + ") COMPACT;\n" + "CREATE UNIQUE CLUSTERED INDEX IND " + "ON \"%s_BEING_DELETED_CACHE\"(doc_id);\n" + "" + "CREATE TABLE \"%s_CONFIG\" (\n" + " key CHAR(50),\n" + " value CHAR(50) NOT NULL\n" + ") COMPACT;\n" + "CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_CONFIG\"(key);\n" +}; + +#ifdef FTS_DOC_STATS_DEBUG +/** Template for creating the FTS auxiliary index specific tables. This is +mainly designed for the statistics work in the future */ +static const char* fts_create_index_tables_sql = { + "BEGIN\n" + "" + "CREATE TABLE \"%s_DOC_ID\" (\n" + " doc_id BIGINT UNSIGNED,\n" + " word_count INTEGER UNSIGNED NOT NULL\n" + ") COMPACT;\n" + "CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_DOC_ID\"(doc_id);\n" +}; +#endif + +/** Template for creating the ancillary FTS tables word index tables. */ +static const char* fts_create_index_sql = { + "BEGIN\n" + "" + "CREATE UNIQUE CLUSTERED INDEX FTS_INDEX_TABLE_IND " + "ON \"%s\"(word, first_doc_id);\n" +}; + +/** FTS auxiliary table suffixes that are common to all FT indexes. */ +static const char* fts_common_tables[] = { + "BEING_DELETED", + "BEING_DELETED_CACHE", + "CONFIG", + "DELETED", + "DELETED_CACHE", + NULL +}; + +/** FTS auxiliary INDEX split intervals. */ +const fts_index_selector_t fts_index_selector[] = { + { 9, "INDEX_1" }, + { 65, "INDEX_2" }, + { 70, "INDEX_3" }, + { 75, "INDEX_4" }, + { 80, "INDEX_5" }, + { 85, "INDEX_6" }, + { 0 , NULL } +}; + +/** Default config values for FTS indexes on a table. 
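fts_load_default_stopword() below loads the fts_default_stopword array above into a red-black tree compared with fts_utf8_string_cmp(); a linear scan returns the same answers and shows the data shape. strcmp() here is a deliberate simplification of the charset-aware comparison:

#include <string.h>

extern const char*      fts_default_stopword[]; /* the array above */

static int
is_default_stopword(const char* word)
{
        int     i;

        for (i = 0; fts_default_stopword[i] != NULL; ++i) {
                if (strcmp(fts_default_stopword[i], word) == 0) {
                        return(1);
                }
        }

        return(0);
}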
*/ +static const char* fts_config_table_insert_values_sql = + "BEGIN\n" + "\n" + "INSERT INTO \"%s\" VALUES('" + FTS_MAX_CACHE_SIZE_IN_MB "', '256');\n" + "" + "INSERT INTO \"%s\" VALUES('" + FTS_OPTIMIZE_LIMIT_IN_SECS "', '180');\n" + "" + "INSERT INTO \"%s\" VALUES ('" + FTS_SYNCED_DOC_ID "', '0');\n" + "" + "INSERT INTO \"%s\" VALUES ('" + FTS_TOTAL_DELETED_COUNT "', '0');\n" + "" /* Note: 0 == FTS_TABLE_STATE_RUNNING */ + "INSERT INTO \"%s\" VALUES ('" + FTS_TABLE_STATE "', '0');\n"; + +/****************************************************************//** +Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. +@return DB_SUCCESS if all OK */ +static +dberr_t +fts_sync( +/*=====*/ + fts_sync_t* sync) /*!< in: sync state */ + __attribute__((nonnull)); + +/****************************************************************//** +Release all resources help by the words rb tree e.g., the node ilist. */ +static +void +fts_words_free( +/*===========*/ + ib_rbt_t* words) /*!< in: rb tree of words */ + __attribute__((nonnull)); +#ifdef FTS_CACHE_SIZE_DEBUG +/****************************************************************//** +Read the max cache size parameter from the config table. */ +static +void +fts_update_max_cache_size( +/*======================*/ + fts_sync_t* sync); /*!< in: sync state */ +#endif + +/*********************************************************************//** +This function fetches the document just inserted right before +we commit the transaction, and tokenize the inserted text data +and insert into FTS auxiliary table and its cache. +@return TRUE if successful */ +static +ulint +fts_add_doc_by_id( +/*==============*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* fts_indexes __attribute__((unused))); + /*!< in: affected fts indexes */ +#ifdef FTS_DOC_STATS_DEBUG +/****************************************************************//** +Check whether a particular word (term) exists in the FTS index. +@return DB_SUCCESS if all went fine */ +static +dberr_t +fts_is_word_in_index( +/*=================*/ + trx_t* trx, /*!< in: FTS query state */ + que_t** graph, /*!< out: Query graph */ + fts_table_t* fts_table, /*!< in: table instance */ + const fts_string_t* word, /*!< in: the word to check */ + ibool* found) /*!< out: TRUE if exists */ + __attribute__((nonnull, warn_unused_result)); +#endif /* FTS_DOC_STATS_DEBUG */ + +/******************************************************************//** +Update the last document id. This function could create a new +transaction to update the last document id. +@return DB_SUCCESS if OK */ +static +dberr_t +fts_update_sync_doc_id( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + const char* table_name, /*!< in: table name, or NULL */ + doc_id_t doc_id, /*!< in: last document id */ + trx_t* trx) /*!< in: update trx, or NULL */ + __attribute__((nonnull(1))); +/******************************************************************** +Check if we should stop. 
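fts_is_stop_signalled() just below reads one bit out of fts->fts_status, yet still takes bg_threads_mutex: the flags are written by other threads under that mutex, so even a single-bit test must synchronize with them. The same shape in plain pthreads (all names in this sketch are invented):

#include <pthread.h>

#define BG_STOP 0x1

static pthread_mutex_t  status_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned         status_flags = 0;       /* written by other threads */

static int
stop_requested(void)
{
        int     stop;

        pthread_mutex_lock(&status_mutex);
        stop = (status_flags & BG_STOP) != 0;
        pthread_mutex_unlock(&status_mutex);

        return(stop);
}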
*/ +UNIV_INLINE +ibool +fts_is_stop_signalled( +/*==================*/ + fts_t* fts) /*!< in: fts instance */ +{ + ibool stop_signalled = FALSE; + + mutex_enter(&fts->bg_threads_mutex); + + if (fts->fts_status & BG_THREAD_STOP) { + + stop_signalled = TRUE; + } + + mutex_exit(&fts->bg_threads_mutex); + + return(stop_signalled); +} + +/****************************************************************//** +This function loads the default InnoDB stopword list */ +static +void +fts_load_default_stopword( +/*======================*/ + fts_stopword_t* stopword_info) /*!< in: stopword info */ +{ + fts_string_t str; + mem_heap_t* heap; + ib_alloc_t* allocator; + ib_rbt_t* stop_words; + + allocator = stopword_info->heap; + heap = static_cast<mem_heap_t*>(allocator->arg); + + if (!stopword_info->cached_stopword) { + /* For default stopword, we always use fts_utf8_string_cmp() */ + stopword_info->cached_stopword = rbt_create( + sizeof(fts_tokenizer_word_t), fts_utf8_string_cmp); + } + + stop_words = stopword_info->cached_stopword; + + str.f_n_char = 0; + + for (ulint i = 0; fts_default_stopword[i]; ++i) { + char* word; + fts_tokenizer_word_t new_word; + + /* We are going to duplicate the value below. */ + word = const_cast<char*>(fts_default_stopword[i]); + + new_word.nodes = ib_vector_create( + allocator, sizeof(fts_node_t), 4); + + str.f_len = ut_strlen(word); + str.f_str = reinterpret_cast<byte*>(word); + + fts_utf8_string_dup(&new_word.text, &str, heap); + + rbt_insert(stop_words, &new_word, &new_word); + } + + stopword_info->status = STOPWORD_FROM_DEFAULT; +} + +/****************************************************************//** +Callback function to read a single stopword value. +@return Always return TRUE */ +static +ibool +fts_read_stopword( +/*==============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + ib_alloc_t* allocator; + fts_stopword_t* stopword_info; + sel_node_t* sel_node; + que_node_t* exp; + ib_rbt_t* stop_words; + dfield_t* dfield; + fts_string_t str; + mem_heap_t* heap; + ib_rbt_bound_t parent; + + sel_node = static_cast<sel_node_t*>(row); + stopword_info = static_cast<fts_stopword_t*>(user_arg); + + stop_words = stopword_info->cached_stopword; + allocator = static_cast<ib_alloc_t*>(stopword_info->heap); + heap = static_cast<mem_heap_t*>(allocator->arg); + + exp = sel_node->select_list; + + /* We only need to read the first column */ + dfield = que_node_get_val(exp); + + str.f_n_char = 0; + str.f_str = static_cast<byte*>(dfield_get_data(dfield)); + str.f_len = dfield_get_len(dfield); + + /* Only create new node if it is a value not already existed */ + if (str.f_len != UNIV_SQL_NULL + && rbt_search(stop_words, &parent, &str) != 0) { + + fts_tokenizer_word_t new_word; + + new_word.nodes = ib_vector_create( + allocator, sizeof(fts_node_t), 4); + + new_word.text.f_str = static_cast<byte*>( + mem_heap_alloc(heap, str.f_len + 1)); + + memcpy(new_word.text.f_str, str.f_str, str.f_len); + + new_word.text.f_n_char = 0; + new_word.text.f_len = str.f_len; + new_word.text.f_str[str.f_len] = 0; + + rbt_insert(stop_words, &new_word, &new_word); + } + + return(TRUE); +} + +/******************************************************************//** +Load user defined stopword from designated user table +@return TRUE if load operation is successful */ +static +ibool +fts_load_user_stopword( +/*===================*/ + fts_t* fts, /*!< in: FTS struct */ + const char* stopword_table_name, /*!< in: Stopword table + name */ + fts_stopword_t* 
stopword_info) /*!< in: Stopword info */ +{ + pars_info_t* info; + que_t* graph; + dberr_t error = DB_SUCCESS; + ibool ret = TRUE; + trx_t* trx; + ibool has_lock = fts->fts_status & TABLE_DICT_LOCKED; + + trx = trx_allocate_for_background(); + trx->op_info = "Load user stopword table into FTS cache"; + + if (!has_lock) { + mutex_enter(&dict_sys->mutex); + } + + /* Validate the user table existence and in the right + format */ + stopword_info->charset = fts_valid_stopword_table(stopword_table_name); + if (!stopword_info->charset) { + ret = FALSE; + goto cleanup; + } else if (!stopword_info->cached_stopword) { + /* Create the stopword RB tree with the stopword column + charset. All comparison will use this charset */ + stopword_info->cached_stopword = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + stopword_info->charset); + + } + + info = pars_info_create(); + + pars_info_bind_id(info, TRUE, "table_stopword", stopword_table_name); + + pars_info_bind_function(info, "my_func", fts_read_stopword, + stopword_info); + + graph = fts_parse_sql_no_dict_lock( + NULL, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT value " + " FROM $table_stopword;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + stopword_info->status = STOPWORD_USER_TABLE; + break; + } else { + + fts_sql_rollback(trx); + + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: lock wait " + "timeout reading user stopword table. " + "Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error '%s' " + "while reading user stopword table.\n", + ut_strerr(error)); + ret = FALSE; + break; + } + } + } + + que_graph_free(graph); + +cleanup: + if (!has_lock) { + mutex_exit(&dict_sys->mutex); + } + + trx_free_for_background(trx); + return(ret); +} + +/******************************************************************//** +Initialize the index cache. */ +static +void +fts_index_cache_init( +/*=================*/ + ib_alloc_t* allocator, /*!< in: the allocator to use */ + fts_index_cache_t* index_cache) /*!< in: index cache */ +{ + ulint i; + + ut_a(index_cache->words == NULL); + + index_cache->words = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + index_cache->charset); + + ut_a(index_cache->doc_stats == NULL); + + index_cache->doc_stats = ib_vector_create( + allocator, sizeof(fts_doc_stats_t), 4); + + for (i = 0; fts_index_selector[i].value; ++i) { + ut_a(index_cache->ins_graph[i] == NULL); + ut_a(index_cache->sel_graph[i] == NULL); + } +} + +/*********************************************************************//** +Initialize FTS cache. */ +UNIV_INTERN +void +fts_cache_init( +/*===========*/ + fts_cache_t* cache) /*!< in: cache to initialize */ +{ + ulint i; + + /* Just to make sure */ + ut_a(cache->sync_heap->arg == NULL); + + cache->sync_heap->arg = mem_heap_create(1024); + + cache->total_size = 0; + + mutex_enter((ib_mutex_t*) &cache->deleted_lock); + cache->deleted_doc_ids = ib_vector_create( + cache->sync_heap, sizeof(fts_update_t), 4); + mutex_exit((ib_mutex_t*) &cache->deleted_lock); + + /* Reset the cache data for all the FTS indexes. 
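The retry loop in fts_load_user_stopword() above, distilled: only DB_LOCK_WAIT_TIMEOUT is retried, the transaction is rolled back and its error state cleared by hand before re-running the graph, and any other error rolls back and gives up. Schematically:

for (;;) {
        error = fts_eval_sql(trx, graph);

        if (error == DB_SUCCESS) {
                fts_sql_commit(trx);
                break;
        }

        fts_sql_rollback(trx);

        if (error != DB_LOCK_WAIT_TIMEOUT) {
                break;                          /* permanent failure */
        }

        trx->error_state = DB_SUCCESS;          /* clear, then retry */
}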
*/ + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + fts_index_cache_t* index_cache; + + index_cache = static_cast<fts_index_cache_t*>( + ib_vector_get(cache->indexes, i)); + + fts_index_cache_init(cache->sync_heap, index_cache); + } +} + +/****************************************************************//** +Create a FTS cache. */ +UNIV_INTERN +fts_cache_t* +fts_cache_create( +/*=============*/ + dict_table_t* table) /*!< in: table owns the FTS cache */ +{ + mem_heap_t* heap; + fts_cache_t* cache; + + heap = static_cast<mem_heap_t*>(mem_heap_create(512)); + + cache = static_cast<fts_cache_t*>( + mem_heap_zalloc(heap, sizeof(*cache))); + + cache->cache_heap = heap; + + rw_lock_create(fts_cache_rw_lock_key, &cache->lock, SYNC_FTS_CACHE); + + rw_lock_create( + fts_cache_init_rw_lock_key, &cache->init_lock, + SYNC_FTS_CACHE_INIT); + + mutex_create( + fts_delete_mutex_key, &cache->deleted_lock, SYNC_FTS_OPTIMIZE); + + mutex_create( + fts_optimize_mutex_key, &cache->optimize_lock, + SYNC_FTS_OPTIMIZE); + + mutex_create( + fts_doc_id_mutex_key, &cache->doc_id_lock, SYNC_FTS_OPTIMIZE); + + /* This is the heap used to create the cache itself. */ + cache->self_heap = ib_heap_allocator_create(heap); + + /* This is a transient heap, used for storing sync data. */ + cache->sync_heap = ib_heap_allocator_create(heap); + cache->sync_heap->arg = NULL; + + fts_need_sync = false; + + cache->sync = static_cast<fts_sync_t*>( + mem_heap_zalloc(heap, sizeof(fts_sync_t))); + + cache->sync->table = table; + + /* Create the index cache vector that will hold the inverted indexes. */ + cache->indexes = ib_vector_create( + cache->self_heap, sizeof(fts_index_cache_t), 2); + + fts_cache_init(cache); + + cache->stopword_info.cached_stopword = NULL; + cache->stopword_info.charset = NULL; + + cache->stopword_info.heap = cache->self_heap; + + cache->stopword_info.status = STOPWORD_NOT_INIT; + + return(cache); +} + +/*******************************************************************//** +Add a newly create index into FTS cache */ +UNIV_INTERN +void +fts_add_index( +/*==========*/ + dict_index_t* index, /*!< FTS index to be added */ + dict_table_t* table) /*!< table */ +{ + fts_t* fts = table->fts; + fts_cache_t* cache; + fts_index_cache_t* index_cache; + + ut_ad(fts); + cache = table->fts->cache; + + rw_lock_x_lock(&cache->init_lock); + + ib_vector_push(fts->indexes, &index); + + index_cache = fts_find_index_cache(cache, index); + + if (!index_cache) { + /* Add new index cache structure */ + index_cache = fts_cache_index_cache_create(table, index); + } + + rw_lock_x_unlock(&cache->init_lock); +} + +/*******************************************************************//** +recalibrate get_doc structure after index_cache in cache->indexes changed */ +static +void +fts_reset_get_doc( +/*==============*/ + fts_cache_t* cache) /*!< in: FTS index cache */ +{ + fts_get_doc_t* get_doc; + ulint i; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX)); +#endif + ib_vector_reset(cache->get_docs); + + for (i = 0; i < ib_vector_size(cache->indexes); i++) { + fts_index_cache_t* ind_cache; + + ind_cache = static_cast<fts_index_cache_t*>( + ib_vector_get(cache->indexes, i)); + + get_doc = static_cast<fts_get_doc_t*>( + ib_vector_push(cache->get_docs, NULL)); + + memset(get_doc, 0x0, sizeof(*get_doc)); + + get_doc->index_cache = ind_cache; + } + + ut_ad(ib_vector_size(cache->get_docs) + == ib_vector_size(cache->indexes)); +} + +/*******************************************************************//** +Check 
an index is in the table->indexes list +@return TRUE if it exists */ +static +ibool +fts_in_dict_index( +/*==============*/ + dict_table_t* table, /*!< in: Table */ + dict_index_t* index_check) /*!< in: index to be checked */ +{ + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index == index_check) { + return(TRUE); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Check an index is in the fts->cache->indexes list +@return TRUE if it exists */ +static +ibool +fts_in_index_cache( +/*===============*/ + dict_table_t* table, /*!< in: Table */ + dict_index_t* index) /*!< in: index to be checked */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(table->fts->cache->indexes); i++) { + fts_index_cache_t* index_cache; + + index_cache = static_cast<fts_index_cache_t*>( + ib_vector_get(table->fts->cache->indexes, i)); + + if (index_cache->index == index) { + return(TRUE); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Check indexes in the fts->indexes is also present in index cache and +table->indexes list +@return TRUE if all indexes match */ +UNIV_INTERN +ibool +fts_check_cached_index( +/*===================*/ + dict_table_t* table) /*!< in: Table where indexes are dropped */ +{ + ulint i; + + if (!table->fts || !table->fts->cache) { + return(TRUE); + } + + ut_a(ib_vector_size(table->fts->indexes) + == ib_vector_size(table->fts->cache->indexes)); + + for (i = 0; i < ib_vector_size(table->fts->indexes); i++) { + dict_index_t* index; + + index = static_cast<dict_index_t*>( + ib_vector_getp(table->fts->indexes, i)); + + if (!fts_in_index_cache(table, index)) { + return(FALSE); + } + + if (!fts_in_dict_index(table, index)) { + return(FALSE); + } + } + + return(TRUE); +} + +/*******************************************************************//** +Drop auxiliary tables related to an FTS index +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fts_drop_index( +/*===========*/ + dict_table_t* table, /*!< in: Table where indexes are dropped */ + dict_index_t* index, /*!< in: Index to be dropped */ + trx_t* trx) /*!< in: Transaction for the drop */ +{ + ib_vector_t* indexes = table->fts->indexes; + dberr_t err = DB_SUCCESS; + + ut_a(indexes); + + if ((ib_vector_size(indexes) == 1 + && (index == static_cast<dict_index_t*>( + ib_vector_getp(table->fts->indexes, 0)))) + || ib_vector_is_empty(indexes)) { + doc_id_t current_doc_id; + doc_id_t first_doc_id; + + /* If we are dropping the only FTS index of the table, + remove it from optimize thread */ + fts_optimize_remove_table(table); + + DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS); + + /* If Doc ID column is not added internally by FTS index, + we can drop all FTS auxiliary tables. 
Otherwise, we will + need to keep some common table such as CONFIG table, so + as to keep track of incrementing Doc IDs */ + if (!DICT_TF2_FLAG_IS_SET( + table, DICT_TF2_FTS_HAS_DOC_ID)) { + + err = fts_drop_tables(trx, table); + + err = fts_drop_index_tables(trx, index); + + fts_free(table); + + return(err); + } + + current_doc_id = table->fts->cache->next_doc_id; + first_doc_id = table->fts->cache->first_doc_id; + fts_cache_clear(table->fts->cache); + fts_cache_destroy(table->fts->cache); + table->fts->cache = fts_cache_create(table); + table->fts->cache->next_doc_id = current_doc_id; + table->fts->cache->first_doc_id = first_doc_id; + } else { + fts_cache_t* cache = table->fts->cache; + fts_index_cache_t* index_cache; + + rw_lock_x_lock(&cache->init_lock); + + index_cache = fts_find_index_cache(cache, index); + + if (index_cache != NULL) { + if (index_cache->words) { + fts_words_free(index_cache->words); + rbt_free(index_cache->words); + } + + ib_vector_remove(cache->indexes, *(void**) index_cache); + } + + if (cache->get_docs) { + fts_reset_get_doc(cache); + } + + rw_lock_x_unlock(&cache->init_lock); + } + + err = fts_drop_index_tables(trx, index); + + ib_vector_remove(indexes, (const void*) index); + + return(err); +} + +/****************************************************************//** +Free the query graph but check whether dict_sys->mutex is already +held */ +UNIV_INTERN +void +fts_que_graph_free_check_lock( +/*==========================*/ + fts_table_t* fts_table, /*!< in: FTS table */ + const fts_index_cache_t*index_cache, /*!< in: FTS index cache */ + que_t* graph) /*!< in: query graph */ +{ + ibool has_dict = FALSE; + + if (fts_table && fts_table->table) { + ut_ad(fts_table->table->fts); + + has_dict = fts_table->table->fts->fts_status + & TABLE_DICT_LOCKED; + } else if (index_cache) { + ut_ad(index_cache->index->table->fts); + + has_dict = index_cache->index->table->fts->fts_status + & TABLE_DICT_LOCKED; + } + + if (!has_dict) { + mutex_enter(&dict_sys->mutex); + } + + ut_ad(mutex_own(&dict_sys->mutex)); + + que_graph_free(graph); + + if (!has_dict) { + mutex_exit(&dict_sys->mutex); + } +} + +/****************************************************************//** +Create an FTS index cache. */ +UNIV_INTERN +CHARSET_INFO* +fts_index_get_charset( +/*==================*/ + dict_index_t* index) /*!< in: FTS index */ +{ + CHARSET_INFO* charset = NULL; + dict_field_t* field; + ulint prtype; + + field = dict_index_get_nth_field(index, 0); + prtype = field->col->prtype; + + charset = innobase_get_fts_charset( + (int) (prtype & DATA_MYSQL_TYPE_MASK), + (uint) dtype_get_charset_coll(prtype)); + +#ifdef FTS_DEBUG + /* Set up charset info for this index. Please note all + field of the FTS index should have the same charset */ + for (i = 1; i < index->n_fields; i++) { + CHARSET_INFO* fld_charset; + + field = dict_index_get_nth_field(index, i); + prtype = field->col->prtype; + + fld_charset = innobase_get_fts_charset( + (int)(prtype & DATA_MYSQL_TYPE_MASK), + (uint) dtype_get_charset_coll(prtype)); + + /* All FTS columns should have the same charset */ + if (charset) { + ut_a(charset == fld_charset); + } else { + charset = fld_charset; + } + } +#endif + + return(charset); + +} +/****************************************************************//** +Create an FTS index cache. 
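fts_que_graph_free_check_lock() above is a conditional-locking wrapper: whether dict_sys->mutex is already held is not asked of the mutex itself but read from the TABLE_DICT_LOCKED bit in fts_status, and the lock is taken and released only when that bit is clear. Reduced to its shape:

ibool   has_dict = fts_status & TABLE_DICT_LOCKED;

if (!has_dict) {
        mutex_enter(&dict_sys->mutex);
}

que_graph_free(graph);          /* must run under dict_sys->mutex */

if (!has_dict) {
        mutex_exit(&dict_sys->mutex);
}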
+@return Index Cache */ +UNIV_INTERN +fts_index_cache_t* +fts_cache_index_cache_create( +/*=========================*/ + dict_table_t* table, /*!< in: table with FTS index */ + dict_index_t* index) /*!< in: FTS index */ +{ + ulint n_bytes; + fts_index_cache_t* index_cache; + fts_cache_t* cache = table->fts->cache; + + ut_a(cache != NULL); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX)); +#endif + + /* Must not already exist in the cache vector. */ + ut_a(fts_find_index_cache(cache, index) == NULL); + + index_cache = static_cast<fts_index_cache_t*>( + ib_vector_push(cache->indexes, NULL)); + + memset(index_cache, 0x0, sizeof(*index_cache)); + + index_cache->index = index; + + index_cache->charset = fts_index_get_charset(index); + + n_bytes = sizeof(que_t*) * sizeof(fts_index_selector); + + index_cache->ins_graph = static_cast<que_t**>( + mem_heap_zalloc(static_cast<mem_heap_t*>( + cache->self_heap->arg), n_bytes)); + + index_cache->sel_graph = static_cast<que_t**>( + mem_heap_zalloc(static_cast<mem_heap_t*>( + cache->self_heap->arg), n_bytes)); + + fts_index_cache_init(cache->sync_heap, index_cache); + + if (cache->get_docs) { + fts_reset_get_doc(cache); + } + + return(index_cache); +} + +/****************************************************************//** +Release all resources help by the words rb tree e.g., the node ilist. */ +static +void +fts_words_free( +/*===========*/ + ib_rbt_t* words) /*!< in: rb tree of words */ +{ + const ib_rbt_node_t* rbt_node; + + /* Free the resources held by a word. */ + for (rbt_node = rbt_first(words); + rbt_node != NULL; + rbt_node = rbt_first(words)) { + + ulint i; + fts_tokenizer_word_t* word; + + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + /* Free the ilists of this word. */ + for (i = 0; i < ib_vector_size(word->nodes); ++i) { + + fts_node_t* fts_node = static_cast<fts_node_t*>( + ib_vector_get(word->nodes, i)); + + ut_free(fts_node->ilist); + fts_node->ilist = NULL; + } + + /* NOTE: We are responsible for free'ing the node */ + ut_free(rbt_remove_node(words, rbt_node)); + } +} + +/*********************************************************************//** +Clear cache. */ +UNIV_INTERN +void +fts_cache_clear( +/*============*/ + fts_cache_t* cache) /*!< in: cache */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + ulint j; + fts_index_cache_t* index_cache; + + index_cache = static_cast<fts_index_cache_t*>( + ib_vector_get(cache->indexes, i)); + + fts_words_free(index_cache->words); + + rbt_free(index_cache->words); + + index_cache->words = NULL; + + for (j = 0; fts_index_selector[j].value; ++j) { + + if (index_cache->ins_graph[j] != NULL) { + + fts_que_graph_free_check_lock( + NULL, index_cache, + index_cache->ins_graph[j]); + + index_cache->ins_graph[j] = NULL; + } + + if (index_cache->sel_graph[j] != NULL) { + + fts_que_graph_free_check_lock( + NULL, index_cache, + index_cache->sel_graph[j]); + + index_cache->sel_graph[j] = NULL; + } + } + + index_cache->doc_stats = NULL; + } + + mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg)); + cache->sync_heap->arg = NULL; + + cache->total_size = 0; + + mutex_enter((ib_mutex_t*) &cache->deleted_lock); + cache->deleted_doc_ids = NULL; + mutex_exit((ib_mutex_t*) &cache->deleted_lock); +} + +/*********************************************************************//** +Search the index specific cache for a particular FTS index. 
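fts_words_free() above uses the drain idiom for an rb-tree whose values own heap memory: repeatedly take rbt_first(), release everything the value points to, then free the node returned by rbt_remove_node(), rather than iterating and deleting in a single pass. In outline:

while ((rbt_node = rbt_first(words)) != NULL) {
        word = rbt_value(fts_tokenizer_word_t, rbt_node);

        /* Release what the value owns (the ilists) first... */
        for (i = 0; i < ib_vector_size(word->nodes); ++i) {
                fts_node_t*     node = static_cast<fts_node_t*>(
                        ib_vector_get(word->nodes, i));

                ut_free(node->ilist);
        }

        /* ...then unlink the node and free it ourselves. */
        ut_free(rbt_remove_node(words, rbt_node));
}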
+@return the index cache else NULL */ +UNIV_INLINE +fts_index_cache_t* +fts_get_index_cache( +/*================*/ + fts_cache_t* cache, /*!< in: cache to search */ + const dict_index_t* index) /*!< in: index to search for */ +{ + ulint i; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX) + || rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX)); +#endif + + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + fts_index_cache_t* index_cache; + + index_cache = static_cast<fts_index_cache_t*>( + ib_vector_get(cache->indexes, i)); + + if (index_cache->index == index) { + + return(index_cache); + } + } + + return(NULL); +} + +#ifdef FTS_DEBUG +/*********************************************************************//** +Search the index cache for a get_doc structure. +@return the fts_get_doc_t item else NULL */ +static +fts_get_doc_t* +fts_get_index_get_doc( +/*==================*/ + fts_cache_t* cache, /*!< in: cache to search */ + const dict_index_t* index) /*!< in: index to search for */ +{ + ulint i; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX)); +#endif + + for (i = 0; i < ib_vector_size(cache->get_docs); ++i) { + fts_get_doc_t* get_doc; + + get_doc = static_cast<fts_get_doc_t*>( + ib_vector_get(cache->get_docs, i)); + + if (get_doc->index_cache->index == index) { + + return(get_doc); + } + } + + return(NULL); +} +#endif + +/**********************************************************************//** +Free the FTS cache. */ +UNIV_INTERN +void +fts_cache_destroy( +/*==============*/ + fts_cache_t* cache) /*!< in: cache*/ +{ + rw_lock_free(&cache->lock); + rw_lock_free(&cache->init_lock); + mutex_free(&cache->optimize_lock); + mutex_free(&cache->deleted_lock); + mutex_free(&cache->doc_id_lock); + + if (cache->stopword_info.cached_stopword) { + rbt_free(cache->stopword_info.cached_stopword); + } + + if (cache->sync_heap->arg) { + mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg)); + } + + mem_heap_free(cache->cache_heap); +} + +/**********************************************************************//** +Find an existing word, or if not found, create one and return it. +@return specified word token */ +static +fts_tokenizer_word_t* +fts_tokenizer_word_get( +/*===================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_index_cache_t* + index_cache, /*!< in: index cache */ + fts_string_t* text) /*!< in: node text */ +{ + fts_tokenizer_word_t* word; + ib_rbt_bound_t parent; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX)); +#endif + + /* If it is a stopword, do not index it */ + if (cache->stopword_info.cached_stopword != NULL + && rbt_search(cache->stopword_info.cached_stopword, + &parent, text) == 0) { + + return(NULL); + } + + /* Check if we found a match, if not then add word to tree. */ + if (rbt_search(index_cache->words, &parent, text) != 0) { + mem_heap_t* heap; + fts_tokenizer_word_t new_word; + + heap = static_cast<mem_heap_t*>(cache->sync_heap->arg); + + new_word.nodes = ib_vector_create( + cache->sync_heap, sizeof(fts_node_t), 4); + + fts_utf8_string_dup(&new_word.text, text, heap); + + parent.last = rbt_add_node( + index_cache->words, &parent, &new_word); + + /* Take into account the RB tree memory use and the vector. 
*/ + cache->total_size += sizeof(new_word) + + sizeof(ib_rbt_node_t) + + text->f_len + + (sizeof(fts_node_t) * 4) + + sizeof(*new_word.nodes); + + ut_ad(rbt_validate(index_cache->words)); + } + + word = rbt_value(fts_tokenizer_word_t, parent.last); + + return(word); +} + +/**********************************************************************//** +Add the given doc_id/word positions to the given node's ilist. */ +UNIV_INTERN +void +fts_cache_node_add_positions( +/*=========================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_node_t* node, /*!< in: word node */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ +{ + ulint i; + byte* ptr; + byte* ilist; + ulint enc_len; + ulint last_pos; + byte* ptr_start; + ulint doc_id_delta; + +#ifdef UNIV_SYNC_DEBUG + if (cache) { + ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX)); + } +#endif + ut_ad(doc_id >= node->last_doc_id); + + /* Calculate the space required to store the ilist. */ + doc_id_delta = (ulint)(doc_id - node->last_doc_id); + enc_len = fts_get_encoded_len(doc_id_delta); + + last_pos = 0; + for (i = 0; i < ib_vector_size(positions); i++) { + ulint pos = *(static_cast<ulint*>( + ib_vector_get(positions, i))); + + ut_ad(last_pos == 0 || pos > last_pos); + + enc_len += fts_get_encoded_len(pos - last_pos); + last_pos = pos; + } + + /* The 0x00 byte at the end of the token positions list. */ + enc_len++; + + if ((node->ilist_size_alloc - node->ilist_size) >= enc_len) { + /* No need to allocate more space, we can fit in the new + data at the end of the old one. */ + ilist = NULL; + ptr = node->ilist + node->ilist_size; + } else { + ulint new_size = node->ilist_size + enc_len; + + /* Over-reserve space by a fixed size for small lengths and + by 20% for lengths >= 48 bytes. */ + if (new_size < 16) { + new_size = 16; + } else if (new_size < 32) { + new_size = 32; + } else if (new_size < 48) { + new_size = 48; + } else { + new_size = (ulint)(1.2 * new_size); + } + + ilist = static_cast<byte*>(ut_malloc(new_size)); + ptr = ilist + node->ilist_size; + + node->ilist_size_alloc = new_size; + } + + ptr_start = ptr; + + /* Encode the new fragment. */ + ptr += fts_encode_int(doc_id_delta, ptr); + + last_pos = 0; + for (i = 0; i < ib_vector_size(positions); i++) { + ulint pos = *(static_cast<ulint*>( + ib_vector_get(positions, i))); + + ptr += fts_encode_int(pos - last_pos, ptr); + last_pos = pos; + } + + *ptr++ = 0; + + ut_a(enc_len == (ulint)(ptr - ptr_start)); + + if (ilist) { + /* Copy old ilist to the start of the new one and switch the + new one into place in the node. */ + if (node->ilist_size > 0) { + memcpy(ilist, node->ilist, node->ilist_size); + ut_free(node->ilist); + } + + node->ilist = ilist; + } + + node->ilist_size += enc_len; + + if (cache) { + cache->total_size += enc_len; + } + + if (node->first_doc_id == FTS_NULL_DOC_ID) { + node->first_doc_id = doc_id; + } + + node->last_doc_id = doc_id; + ++node->doc_count; +} + +/**********************************************************************//** +Add document to the cache. 
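+Each token's position list is appended to its word's current fts_node
+ilist via fts_cache_node_add_positions() above. As a worked example of
+that encoding: a doc id delta of 3 with positions 5, 12 and 30 is stored
+as the variable-length integers enc(3) enc(5) enc(7) enc(18) followed by
+a single 0x00 terminator byte; positions are delta-encoded against their
+predecessor, starting from 0.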
*/
+static
+void
+fts_cache_add_doc(
+/*==============*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_index_cache_t*
+ index_cache, /*!< in: index cache */
+ doc_id_t doc_id, /*!< in: doc id to add */
+ ib_rbt_t* tokens) /*!< in: document tokens */
+{
+ const ib_rbt_node_t* node;
+ ulint n_words;
+ fts_doc_stats_t* doc_stats;
+
+ if (!tokens) {
+ return;
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+#endif
+
+ n_words = rbt_size(tokens);
+
+ for (node = rbt_first(tokens); node; node = rbt_first(tokens)) {
+
+ fts_tokenizer_word_t* word;
+ fts_node_t* fts_node = NULL;
+ fts_token_t* token = rbt_value(fts_token_t, node);
+
+ /* Find and/or add token to the cache. */
+ word = fts_tokenizer_word_get(
+ cache, index_cache, &token->text);
+
+ if (!word) {
+ ut_free(rbt_remove_node(tokens, node));
+ continue;
+ }
+
+ if (ib_vector_size(word->nodes) > 0) {
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_last(word->nodes));
+ }
+
+ if (fts_node == NULL
+ || fts_node->ilist_size > FTS_ILIST_MAX_SIZE
+ || doc_id < fts_node->last_doc_id) {
+
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ memset(fts_node, 0x0, sizeof(*fts_node));
+
+ cache->total_size += sizeof(*fts_node);
+ }
+
+ fts_cache_node_add_positions(
+ cache, fts_node, doc_id, token->positions);
+
+ ut_free(rbt_remove_node(tokens, node));
+ }
+
+ ut_a(rbt_empty(tokens));
+
+ /* Add to doc ids processed so far. */
+ doc_stats = static_cast<fts_doc_stats_t*>(
+ ib_vector_push(index_cache->doc_stats, NULL));
+
+ doc_stats->doc_id = doc_id;
+ doc_stats->word_count = n_words;
+
+ /* Add the doc stats memory usage too. */
+ cache->total_size += sizeof(*doc_stats);
+
+ if (doc_id > cache->sync->max_doc_id) {
+ cache->sync->max_doc_id = doc_id;
+ }
+}
+
+/****************************************************************//**
+Drops a table. If the table can't be found we return DB_FAIL, which
+callers treat as non-fatal.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_drop_table(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ const char* table_name) /*!< in: table to drop */
+{
+ dict_table_t* table;
+ dberr_t error = DB_SUCCESS;
+
+ /* Check that the table exists in our data dictionary.
+ Similar to the regular drop table case, we will open the table
+ with the DICT_ERR_IGNORE_INDEX_ROOT and DICT_ERR_IGNORE_CORRUPT
+ options */
+ table = dict_table_open_on_name(
+ table_name, TRUE, FALSE,
+ static_cast<dict_err_ignore_t>(
+ DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
+
+ if (table != 0) {
+
+ dict_table_close(table, TRUE, FALSE);
+
+ /* Pass nonatomic=false (don't allow data dict unlock),
+ because the transaction may hold locks on SYS_* tables from
+ previous calls to fts_drop_table(). */
+ error = row_drop_table_for_mysql(table_name, trx, true, false);
+
+ if (error != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to drop FTS index aux table %s: %s",
+ table_name, ut_strerr(error));
+ }
+ } else {
+ error = DB_FAIL;
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Rename a single auxiliary table due to a database name change.
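+Only the database prefix of the name changes; everything from the '/'
+separator onwards is copied verbatim. For example (with made-up ids), an
+aux table "olddb/FTS_00000000000000a7_0000000000000154_INDEX_1" would
+become "newdb/FTS_00000000000000a7_0000000000000154_INDEX_1".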
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_rename_one_aux_table(
+/*=====================*/
+ const char* new_name, /*!< in: new parent tbl name */
+ const char* fts_table_old_name, /*!< in: old aux tbl name */
+ trx_t* trx) /*!< in: transaction */
+{
+ char fts_table_new_name[MAX_TABLE_NAME_LEN];
+ ulint new_db_name_len = dict_get_db_name_len(new_name);
+ ulint old_db_name_len = dict_get_db_name_len(fts_table_old_name);
+ ulint table_new_name_len = strlen(fts_table_old_name)
+ + new_db_name_len - old_db_name_len;
+
+ /* The new and old database names must differ; the caller has
+ nothing to do if they are the same. */
+ ut_ad((new_db_name_len != old_db_name_len)
+ || strncmp(new_name, fts_table_old_name, old_db_name_len) != 0);
+
+ /* Get the database name from "new_name", and the table name
+ from fts_table_old_name */
+ strncpy(fts_table_new_name, new_name, new_db_name_len);
+ strncpy(fts_table_new_name + new_db_name_len,
+ strchr(fts_table_old_name, '/'),
+ table_new_name_len - new_db_name_len);
+ fts_table_new_name[table_new_name_len] = 0;
+
+ return(row_rename_table_for_mysql(
+ fts_table_old_name, fts_table_new_name, trx, false));
+}
+
+/****************************************************************//**
+Rename the auxiliary tables of all FTS indexes of a table. This rename
+is due to a database name change.
+@return DB_SUCCESS or error code */
+
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+ dict_table_t* table, /*!< in: user table */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint i;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ /* Rename common auxiliary tables */
+ for (i = 0; fts_common_tables[i] != NULL; ++i) {
+ char* old_table_name;
+ dberr_t err = DB_SUCCESS;
+
+ fts_table.suffix = fts_common_tables[i];
+
+ old_table_name = fts_get_table_name(&fts_table);
+
+ err = fts_rename_one_aux_table(new_name, old_table_name, trx);
+
+ mem_free(old_table_name);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ fts_t* fts = table->fts;
+
+ /* Rename index specific auxiliary tables */
+ for (i = 0; fts->indexes != 0 && i < ib_vector_size(fts->indexes);
+ ++i) {
+ dict_index_t* index;
+
+ index = static_cast<dict_index_t*>(
+ ib_vector_getp(fts->indexes, i));
+
+ FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+ for (ulint j = 0; fts_index_selector[j].value; ++j) {
+ dberr_t err;
+ char* old_table_name;
+
+ fts_table.suffix = fts_get_suffix(j);
+
+ old_table_name = fts_get_table_name(&fts_table);
+
+ err = fts_rename_one_aux_table(
+ new_name, old_table_name, trx);
+
+ DBUG_EXECUTE_IF("fts_rename_failure",
+ err = DB_DEADLOCK;
+ fts_sql_rollback(trx););
+
+ mem_free(old_table_name);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Drops the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
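+The common tables are the per-table (not per-index) auxiliaries listed
+in fts_common_tables[], e.g. CONFIG and DELETED.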
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_drop_common_tables( +/*===================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table) /*!< in: table with an FTS + index */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + + for (i = 0; fts_common_tables[i] != NULL; ++i) { + dberr_t err; + char* table_name; + + fts_table->suffix = fts_common_tables[i]; + + table_name = fts_get_table_name(fts_table); + + err = fts_drop_table(trx, table_name); + + /* We only return the status of the last error. */ + if (err != DB_SUCCESS && err != DB_FAIL) { + error = err; + } + + mem_free(table_name); + } + + return(error); +} + +/****************************************************************//** +Since we do a horizontal split on the index table, we need to drop +all the split tables. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_drop_index_split_tables( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index) /*!< in: fts instance */ + +{ + ulint i; + fts_table_t fts_table; + dberr_t error = DB_SUCCESS; + + FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index); + + for (i = 0; fts_index_selector[i].value; ++i) { + dberr_t err; + char* table_name; + + fts_table.suffix = fts_get_suffix(i); + + table_name = fts_get_table_name(&fts_table); + + err = fts_drop_table(trx, table_name); + + /* We only return the status of the last error. */ + if (err != DB_SUCCESS && err != DB_FAIL) { + error = err; + } + + mem_free(table_name); + } + + return(error); +} + +/****************************************************************//** +Drops FTS auxiliary tables for an FTS index +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_drop_index_tables( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index) /*!< in: Index to drop */ +{ + dberr_t error = DB_SUCCESS; + +#ifdef FTS_DOC_STATS_DEBUG + fts_table_t fts_table; + static const char* index_tables[] = { + "DOC_ID", + NULL + }; +#endif /* FTS_DOC_STATS_DEBUG */ + + dberr_t err = fts_drop_index_split_tables(trx, index); + + /* We only return the status of the last error. */ + if (err != DB_SUCCESS) { + error = err; + } + +#ifdef FTS_DOC_STATS_DEBUG + FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index); + + for (ulint i = 0; index_tables[i] != NULL; ++i) { + char* table_name; + + fts_table.suffix = index_tables[i]; + + table_name = fts_get_table_name(&fts_table); + + err = fts_drop_table(trx, table_name); + + /* We only return the status of the last error. */ + if (err != DB_SUCCESS && err != DB_FAIL) { + error = err; + } + + mem_free(table_name); + } +#endif /* FTS_DOC_STATS_DEBUG */ + + return(error); +} + +/****************************************************************//** +Drops FTS ancillary tables needed for supporting an FTS index +on the given table. row_mysql_lock_data_dictionary must have been called +before this. 
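+Each FTS index contributes one horizontally split set of INDEX_* tables
+(one table per fts_index_selector[] entry); all of them are dropped.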
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_drop_all_index_tables( +/*======================*/ + trx_t* trx, /*!< in: transaction */ + fts_t* fts) /*!< in: fts instance */ +{ + dberr_t error = DB_SUCCESS; + + for (ulint i = 0; + fts->indexes != 0 && i < ib_vector_size(fts->indexes); + ++i) { + + dberr_t err; + dict_index_t* index; + + index = static_cast<dict_index_t*>( + ib_vector_getp(fts->indexes, i)); + + err = fts_drop_index_tables(trx, index); + + if (err != DB_SUCCESS) { + error = err; + } + } + + return(error); +} + +/*********************************************************************//** +Drops the ancillary tables needed for supporting an FTS index on a +given table. row_mysql_lock_data_dictionary must have been called before +this. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_drop_tables( +/*============*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table has the FTS index */ +{ + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + + /* TODO: This is not atomic and can cause problems during recovery. */ + + error = fts_drop_common_tables(trx, &fts_table); + + if (error == DB_SUCCESS) { + error = fts_drop_all_index_tables(trx, table->fts); + } + + return(error); +} + +/*********************************************************************//** +Prepare the SQL, so that all '%s' are replaced by the common prefix. +@return sql string, use mem_free() to free the memory */ +static +char* +fts_prepare_sql( +/*============*/ + fts_table_t* fts_table, /*!< in: table name info */ + const char* my_template) /*!< in: sql template */ +{ + char* sql; + char* name_prefix; + + name_prefix = fts_get_table_name_prefix(fts_table); + sql = ut_strreplace(my_template, "%s", name_prefix); + mem_free(name_prefix); + + return(sql); +} + +/*********************************************************************//** +Creates the common ancillary tables needed for supporting an FTS index +on the given table. row_mysql_lock_data_dictionary must have been called +before this. +@return DB_SUCCESS if succeed */ +UNIV_INTERN +dberr_t +fts_create_common_tables( +/*=====================*/ + trx_t* trx, /*!< in: transaction */ + const dict_table_t* table, /*!< in: table with FTS index */ + const char* name, /*!< in: table name normalized.*/ + bool skip_doc_id_index)/*!< in: Skip index on doc id */ +{ + char* sql; + dberr_t error; + que_t* graph; + fts_table_t fts_table; + mem_heap_t* heap = mem_heap_create(1024); + pars_info_t* info; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + + error = fts_drop_common_tables(trx, &fts_table); + + if (error != DB_SUCCESS) { + + goto func_exit; + } + + /* Create the FTS tables that are common to an FTS index. */ + sql = fts_prepare_sql(&fts_table, fts_create_common_tables_sql); + graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql); + mem_free(sql); + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + if (error != DB_SUCCESS) { + + goto func_exit; + } + + /* Write the default settings to the config table. 
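+The CONFIG table stores key/value rows such as 'synced_doc_id' (see
+fts_cmp_set_sync_doc_id() below) and, optionally, the maximum cache
+size.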
*/
+ fts_table.suffix = "CONFIG";
+ graph = fts_parse_sql_no_dict_lock(
+ &fts_table, NULL, fts_config_table_insert_values_sql);
+
+ error = fts_eval_sql(trx, graph);
+
+ que_graph_free(graph);
+
+ if (error != DB_SUCCESS || skip_doc_id_index) {
+
+ goto func_exit;
+ }
+
+ info = pars_info_create();
+
+ pars_info_bind_id(info, TRUE, "table_name", name);
+ pars_info_bind_id(info, TRUE, "index_name", FTS_DOC_ID_INDEX_NAME);
+ pars_info_bind_id(info, TRUE, "doc_id_col_name", FTS_DOC_ID_COL_NAME);
+
+ /* Create the FTS DOC_ID index on the hidden column. Currently this
+ is common for any FT index created on the table. */
+ graph = fts_parse_sql_no_dict_lock(
+ NULL,
+ info,
+ mem_heap_printf(
+ heap,
+ "BEGIN\n"
+ ""
+ "CREATE UNIQUE INDEX $index_name ON $table_name("
+ "$doc_id_col_name);\n"));
+
+ error = fts_eval_sql(trx, graph);
+ que_graph_free(graph);
+
+func_exit:
+ if (error != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_rollback_to_savepoint(trx, NULL);
+
+ row_drop_table_for_mysql(table->name, trx, FALSE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ mem_heap_free(heap);
+
+ return(error);
+}
+
+/*************************************************************//**
+Create one FTS auxiliary INDEX table for an FTS index; called from
+fts_create_index_tables_low().
+@return new table instance, or NULL on failure */
+static
+dict_table_t*
+fts_create_one_index_table(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t*
+ index, /*!< in: the index instance */
+ fts_table_t* fts_table, /*!< in: fts_table structure */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ dict_field_t* field;
+ dict_table_t* new_table = NULL;
+ char* table_name = fts_get_table_name(fts_table);
+ dberr_t error;
+ CHARSET_INFO* charset;
+ ulint flags2 = 0;
+
+ ut_ad(index->type & DICT_FTS);
+
+ if (srv_file_per_table) {
+ flags2 = DICT_TF2_USE_TABLESPACE;
+ }
+
+ new_table = dict_mem_table_create(table_name, 0, 5, 1, flags2, false);
+
+ field = dict_index_get_nth_field(index, 0);
+ charset = innobase_get_fts_charset(
+ (int)(field->col->prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(field->col->prtype));
+
+ if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
+ dict_mem_table_add_col(new_table, heap, "word", DATA_VARCHAR,
+ field->col->prtype, FTS_MAX_WORD_LEN);
+ } else {
+ dict_mem_table_add_col(new_table, heap, "word", DATA_VARMYSQL,
+ field->col->prtype, FTS_MAX_WORD_LEN);
+ }
+
+ dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ sizeof(doc_id_t));
+
+ dict_mem_table_add_col(new_table, heap, "last_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ sizeof(doc_id_t));
+
+ dict_mem_table_add_col(new_table, heap, "doc_count", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 4);
+
+ dict_mem_table_add_col(new_table, heap, "ilist", DATA_BLOB,
+ 4130048, 0);
+
+ error = row_create_table_for_mysql(new_table, trx, false);
+
+ if (error != DB_SUCCESS) {
+ trx->error_state = error;
+ dict_mem_table_free(new_table);
+ new_table = NULL;
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to create FTS index table %s", table_name);
+ }
+
+ mem_free(table_name);
+
+ return(new_table);
+}
+
+/*************************************************************//**
+Create the auxiliary INDEX tables for an FTS index (one per
+fts_index_selector[] entry).
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables_low(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ const
dict_index_t* + index, /*!< in: the index instance */ + const char* table_name, /*!< in: the table name */ + table_id_t table_id) /*!< in: the table id */ + +{ + ulint i; + que_t* graph; + fts_table_t fts_table; + dberr_t error = DB_SUCCESS; + mem_heap_t* heap = mem_heap_create(1024); + + fts_table.type = FTS_INDEX_TABLE; + fts_table.index_id = index->id; + fts_table.table_id = table_id; + fts_table.parent = table_name; + fts_table.table = index->table; + +#ifdef FTS_DOC_STATS_DEBUG + char* sql; + + /* Create the FTS auxiliary tables that are specific + to an FTS index. */ + sql = fts_prepare_sql(&fts_table, fts_create_index_tables_sql); + + graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql); + mem_free(sql); + + error = fts_eval_sql(trx, graph); + que_graph_free(graph); +#endif /* FTS_DOC_STATS_DEBUG */ + + for (i = 0; fts_index_selector[i].value && error == DB_SUCCESS; ++i) { + dict_table_t* new_table; + + /* Create the FTS auxiliary tables that are specific + to an FTS index. We need to preserve the table_id %s + which fts_parse_sql_no_dict_lock() will fill in for us. */ + fts_table.suffix = fts_get_suffix(i); + + new_table = fts_create_one_index_table( + trx, index, &fts_table, heap); + + if (!new_table) { + error = DB_FAIL; + break; + } + + graph = fts_parse_sql_no_dict_lock( + &fts_table, NULL, fts_create_index_sql); + + error = fts_eval_sql(trx, graph); + que_graph_free(graph); + } + + if (error != DB_SUCCESS) { + /* We have special error handling here */ + + trx->error_state = DB_SUCCESS; + + trx_rollback_to_savepoint(trx, NULL); + + row_drop_table_for_mysql(table_name, trx, FALSE); + + trx->error_state = DB_SUCCESS; + } + + mem_heap_free(heap); + + return(error); +} + +/******************************************************************//** +Creates the column specific ancillary tables needed for supporting an +FTS index on the given table. row_mysql_lock_data_dictionary must have +been called before this. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_create_index_tables( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: the index instance */ +{ + dberr_t err; + dict_table_t* table; + + table = dict_table_get_low(index->table_name); + ut_a(table != NULL); + + err = fts_create_index_tables_low(trx, index, table->name, table->id); + + if (err == DB_SUCCESS) { + trx_commit(trx); + } + + return(err); +} +#if 0 +/******************************************************************//** +Return string representation of state. */ +static +const char* +fts_get_state_str( +/*==============*/ + /* out: string representation of state */ + fts_row_state state) /*!< in: state */ +{ + switch (state) { + case FTS_INSERT: + return("INSERT"); + + case FTS_MODIFY: + return("MODIFY"); + + case FTS_DELETE: + return("DELETE"); + + case FTS_NOTHING: + return("NOTHING"); + + case FTS_INVALID: + return("INVALID"); + + default: + return("UNKNOWN"); + } +} +#endif + +/******************************************************************//** +Calculate the new state of a row given the existing state and a new event. 
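+For example, a row that is INSERTed and later DELETEd within the same
+transaction collapses to FTS_NOTHING, so nothing at all is written to
+the FTS auxiliary tables for it at commit time.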
+@return new state of row */ +static +fts_row_state +fts_trx_row_get_new_state( +/*======================*/ + fts_row_state old_state, /*!< in: existing state of row */ + fts_row_state event) /*!< in: new event */ +{ + /* The rules for transforming states: + + I = inserted + M = modified + D = deleted + N = nothing + + M+D -> D: + + If the row existed before the transaction started and it is modified + during the transaction, followed by a deletion of the row, only the + deletion will be signaled. + + M+ -> M: + + If the row existed before the transaction started and it is modified + more than once during the transaction, only the last modification + will be signaled. + + IM*D -> N: + + If a new row is added during the transaction (and possibly modified + after its initial insertion) but it is deleted before the end of the + transaction, nothing will be signaled. + + IM* -> I: + + If a new row is added during the transaction and modified after its + initial insertion, only the addition will be signaled. + + M*DI -> M: + + If the row existed before the transaction started and it is deleted, + then re-inserted, only a modification will be signaled. Note that + this case is only possible if the table is using the row's primary + key for FTS row ids, since those can be re-inserted by the user, + which is not true for InnoDB generated row ids. + + It is easily seen that the above rules decompose such that we do not + need to store the row's entire history of events. Instead, we can + store just one state for the row and update that when new events + arrive. Then we can implement the above rules as a two-dimensional + look-up table, and get checking of invalid combinations "for free" + in the process. */ + + /* The lookup table for transforming states. old_state is the + Y-axis, event is the X-axis. */ + static const fts_row_state table[4][4] = { + /* I M D N */ + /* I */ { FTS_INVALID, FTS_INSERT, FTS_NOTHING, FTS_INVALID }, + /* M */ { FTS_INVALID, FTS_MODIFY, FTS_DELETE, FTS_INVALID }, + /* D */ { FTS_MODIFY, FTS_INVALID, FTS_INVALID, FTS_INVALID }, + /* N */ { FTS_INVALID, FTS_INVALID, FTS_INVALID, FTS_INVALID } + }; + + fts_row_state result; + + ut_a(old_state < FTS_INVALID); + ut_a(event < FTS_INVALID); + + result = table[(int) old_state][(int) event]; + ut_a(result != FTS_INVALID); + + return(result); +} + +/******************************************************************//** +Create a savepoint instance. +@return savepoint instance */ +static +fts_savepoint_t* +fts_savepoint_create( +/*=================*/ + ib_vector_t* savepoints, /*!< out: InnoDB transaction */ + const char* name, /*!< in: savepoint name */ + mem_heap_t* heap) /*!< in: heap */ +{ + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_push(savepoints, NULL)); + + memset(savepoint, 0x0, sizeof(*savepoint)); + + if (name) { + savepoint->name = mem_heap_strdup(heap, name); + } + + savepoint->tables = rbt_create( + sizeof(fts_trx_table_t*), fts_trx_table_cmp); + + return(savepoint); +} + +/******************************************************************//** +Create an FTS trx. 
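+The fts_trx_t shadows the InnoDB transaction: per-table row states are
+kept in savepoint-scoped rb trees, so that a rollback to a savepoint can
+discard exactly the FTS work recorded after that savepoint was taken.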
+@return FTS trx */ +static +fts_trx_t* +fts_trx_create( +/*===========*/ + trx_t* trx) /*!< in/out: InnoDB + transaction */ +{ + fts_trx_t* ftt; + ib_alloc_t* heap_alloc; + mem_heap_t* heap = mem_heap_create(1024); + trx_named_savept_t* savep; + + ut_a(trx->fts_trx == NULL); + + ftt = static_cast<fts_trx_t*>(mem_heap_alloc(heap, sizeof(fts_trx_t))); + ftt->trx = trx; + ftt->heap = heap; + + heap_alloc = ib_heap_allocator_create(heap); + + ftt->savepoints = static_cast<ib_vector_t*>(ib_vector_create( + heap_alloc, sizeof(fts_savepoint_t), 4)); + + ftt->last_stmt = static_cast<ib_vector_t*>(ib_vector_create( + heap_alloc, sizeof(fts_savepoint_t), 4)); + + /* Default instance has no name and no heap. */ + fts_savepoint_create(ftt->savepoints, NULL, NULL); + fts_savepoint_create(ftt->last_stmt, NULL, NULL); + + /* Copy savepoints that already set before. */ + for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + savep != NULL; + savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) { + + fts_savepoint_take(trx, ftt, savep->name); + } + + return(ftt); +} + +/******************************************************************//** +Create an FTS trx table. +@return FTS trx table */ +static +fts_trx_table_t* +fts_trx_table_create( +/*=================*/ + fts_trx_t* fts_trx, /*!< in: FTS trx */ + dict_table_t* table) /*!< in: table */ +{ + fts_trx_table_t* ftt; + + ftt = static_cast<fts_trx_table_t*>( + mem_heap_alloc(fts_trx->heap, sizeof(*ftt))); + + memset(ftt, 0x0, sizeof(*ftt)); + + ftt->table = table; + ftt->fts_trx = fts_trx; + + ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp); + + return(ftt); +} + +/******************************************************************//** +Clone an FTS trx table. +@return FTS trx table */ +static +fts_trx_table_t* +fts_trx_table_clone( +/*=================*/ + const fts_trx_table_t* ftt_src) /*!< in: FTS trx */ +{ + fts_trx_table_t* ftt; + + ftt = static_cast<fts_trx_table_t*>( + mem_heap_alloc(ftt_src->fts_trx->heap, sizeof(*ftt))); + + memset(ftt, 0x0, sizeof(*ftt)); + + ftt->table = ftt_src->table; + ftt->fts_trx = ftt_src->fts_trx; + + ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp); + + /* Copy the rb tree values to the new savepoint. */ + rbt_merge_uniq(ftt->rows, ftt_src->rows); + + /* These are only added on commit. At this stage we only have + the updated row state. */ + ut_a(ftt_src->added_doc_ids == NULL); + + return(ftt); +} + +/******************************************************************//** +Initialize the FTS trx instance. +@return FTS trx instance */ +static +fts_trx_table_t* +fts_trx_init( +/*=========*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: FTS table instance */ + ib_vector_t* savepoints) /*!< in: Savepoints */ +{ + fts_trx_table_t* ftt; + ib_rbt_bound_t parent; + ib_rbt_t* tables; + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints)); + + tables = savepoint->tables; + rbt_search_cmp(tables, &parent, &table->id, fts_trx_table_id_cmp, NULL); + + if (parent.result == 0) { + fts_trx_table_t** fttp; + + fttp = rbt_value(fts_trx_table_t*, parent.last); + ftt = *fttp; + } else { + ftt = fts_trx_table_create(trx->fts_trx, table); + rbt_add_node(tables, &parent, &ftt); + } + + ut_a(ftt->table == table); + + return(ftt); +} + +/******************************************************************//** +Notify the FTS system about an operation on an FTS-indexed table. 
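+Repeated operations on the same doc id within one transaction are folded
+into a single row state using fts_trx_row_get_new_state() above.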
*/
+static
+void
+fts_trx_table_add_op(
+/*=================*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ doc_id_t doc_id, /*!< in: doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes) /*!< in: FTS indexes affected */
+{
+ ib_rbt_t* rows;
+ ib_rbt_bound_t parent;
+
+ rows = ftt->rows;
+ rbt_search(rows, &parent, &doc_id);
+
+ /* Row id found: update the state, and if the new state is
+ FTS_NOTHING, delete the row from our tree. */
+ if (parent.result == 0) {
+ fts_trx_row_t* row = rbt_value(fts_trx_row_t, parent.last);
+
+ row->state = fts_trx_row_get_new_state(row->state, state);
+
+ if (row->state == FTS_NOTHING) {
+ if (row->fts_indexes) {
+ ib_vector_free(row->fts_indexes);
+ }
+
+ ut_free(rbt_remove_node(rows, parent.last));
+ row = NULL;
+ } else if (row->fts_indexes != NULL) {
+ ib_vector_free(row->fts_indexes);
+ row->fts_indexes = fts_indexes;
+ }
+
+ } else { /* Row id not found, create a new one. */
+ fts_trx_row_t row;
+
+ row.doc_id = doc_id;
+ row.state = state;
+ row.fts_indexes = fts_indexes;
+
+ rbt_add_node(rows, &parent, &row);
+ }
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+UNIV_INTERN
+void
+fts_trx_add_op(
+/*===========*/
+ trx_t* trx, /*!< in: InnoDB transaction */
+ dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: new doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes) /*!< in: FTS indexes affected
+ (NULL=all) */
+{
+ fts_trx_table_t* tran_ftt;
+ fts_trx_table_t* stmt_ftt;
+
+ if (!trx->fts_trx) {
+ trx->fts_trx = fts_trx_create(trx);
+ }
+
+ tran_ftt = fts_trx_init(trx, table, trx->fts_trx->savepoints);
+ stmt_ftt = fts_trx_init(trx, table, trx->fts_trx->last_stmt);
+
+ fts_trx_table_add_op(tran_ftt, doc_id, state, fts_indexes);
+ fts_trx_table_add_op(stmt_ftt, doc_id, state, fts_indexes);
+}
+
+/******************************************************************//**
+Fetch callback that converts a textual document id to a binary value and
+stores it in the given place.
+@return always returns FALSE */
+static
+ibool
+fts_fetch_store_doc_id(
+/*===================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: doc_id_t* to store
+ doc_id in */
+{
+ int n_parsed;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ doc_id_t* doc_id = static_cast<doc_id_t*>(user_arg);
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ char buf[32];
+
+ ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+ ut_a(len > 0 && len < sizeof(buf));
+
+ memcpy(buf, dfield_get_data(dfield), len);
+ buf[len] = '\0';
+
+ n_parsed = sscanf(buf, FTS_DOC_ID_FORMAT, doc_id);
+ ut_a(n_parsed == 1);
+
+ return(FALSE);
+}
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/******************************************************************//**
+Get the max cache size in bytes. If there is an error reading the
+value we simply print an error message here and return the default
+value to the caller.
+@return max cache size in bytes */
+static
+ulint
+fts_get_max_cache_size(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table) /*!< in: table instance */
+{
+ dberr_t error;
+ fts_string_t value;
+ ulint cache_size_in_mb;
+
+ /* Set to the default value. */
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+
+ /* We set the length of value to the max bytes it can hold.
This
+ information is used by the callback that reads the value. */
+ value.f_n_char = 0;
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = ut_malloc(value.f_len + 1);
+
+ error = fts_config_get_value(
+ trx, fts_table, FTS_MAX_CACHE_SIZE_IN_MB, &value);
+
+ if (error == DB_SUCCESS) {
+
+ value.f_str[value.f_len] = 0;
+ cache_size_in_mb = strtoul((char*) value.f_str, NULL, 10);
+
+ if (cache_size_in_mb > FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Warning: FTS max cache size "
+ " (%lu) out of range. Minimum value is "
+ "%luMB and the maximum value is %luMB, "
+ "setting cache size to upper limit\n",
+ cache_size_in_mb,
+ FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+ FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+ cache_size_in_mb = FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB;
+
+ } else if (cache_size_in_mb
+ < FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Warning: FTS max cache size "
+ " (%lu) out of range. Minimum value is "
+ "%luMB and the maximum value is %luMB, "
+ "setting cache size to lower limit\n",
+ cache_size_in_mb,
+ FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+ FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+ }
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, "InnoDB: Error: (%lu) reading max cache "
+ "config value from config table\n", error);
+ }
+
+ ut_free(value.f_str);
+
+ return(cache_size_in_mb * 1024 * 1024);
+}
+#endif
+
+#ifdef FTS_DOC_STATS_DEBUG
+/*********************************************************************//**
+Get the total number of words in the FTS for a particular FTS index.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_get_total_word_count(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: for this index */
+ ulint* total) /* out: total words */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ *total = 0;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value. */
+ value.f_n_char = 0;
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ error = fts_config_get_index_value(
+ trx, index, FTS_TOTAL_WORD_COUNT, &value);
+
+ if (error == DB_SUCCESS) {
+
+ value.f_str[value.f_len] = 0;
+ *total = strtoul((char*) value.f_str, NULL, 10);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: (%s) reading total words "
+ "value from config table\n", ut_strerr(error));
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). We do this after each FTS index build or table
+truncate */
+UNIV_INTERN
+void
+fts_update_next_doc_id(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const dict_table_t* table, /*!< in: table */
+ const char* table_name, /*!< in: table name, or NULL */
+ doc_id_t doc_id) /*!< in: DOC ID to set */
+{
+ table->fts->cache->synced_doc_id = doc_id;
+ table->fts->cache->next_doc_id = doc_id + 1;
+
+ table->fts->cache->first_doc_id = table->fts->cache->next_doc_id;
+
+ fts_update_sync_doc_id(
+ table, table_name, table->fts->cache->synced_doc_id, trx);
+
+}
+
+/*********************************************************************//**
+Get the next available document id.
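+If the table has a user-defined FTS_DOC_ID column (i.e. the hidden
+column flag DICT_TF2_FTS_HAS_DOC_ID is not set), FTS_NULL_DOC_ID is
+returned and the value supplied in the row is used instead; otherwise
+the cached next_doc_id is pre-incremented under doc_id_lock.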
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id) /*!< out: new document id */
+{
+ fts_cache_t* cache = table->fts->cache;
+
+ /* If the Doc ID system has not yet been initialized, we
+ will consult the CONFIG table and the user table to re-establish
+ the initial value of the Doc ID */
+
+ if (cache->first_doc_id != 0 || !fts_init_doc_id(table)) {
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ *doc_id = FTS_NULL_DOC_ID;
+ return(DB_SUCCESS);
+ }
+
+ /* Otherwise, simply increment the value in cache */
+ mutex_enter(&cache->doc_id_lock);
+ *doc_id = ++cache->next_doc_id;
+ mutex_exit(&cache->doc_id_lock);
+ } else {
+ mutex_enter(&cache->doc_id_lock);
+ *doc_id = cache->next_doc_id;
+ mutex_exit(&cache->doc_id_lock);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Fetch the Doc ID from the CONFIG table and compare it with the Doc ID
+supplied; store the larger of the two back to the CONFIG table.
+@return DB_SUCCESS if OK */
+static __attribute__((nonnull))
+dberr_t
+fts_cmp_set_sync_doc_id(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id_cmp, /*!< in: Doc ID to compare */
+ ibool read_only, /*!< in: TRUE if read the
+ synced_doc_id only */
+ doc_id_t* doc_id) /*!< out: larger document id
+ after comparing "doc_id_cmp"
+ to the one stored in CONFIG
+ table */
+{
+ trx_t* trx;
+ pars_info_t* info;
+ dberr_t error;
+ fts_table_t fts_table;
+ que_t* graph = NULL;
+ fts_cache_t* cache = table->fts->cache;
+retry:
+ ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+ fts_table.suffix = "CONFIG";
+ fts_table.table_id = table->id;
+ fts_table.type = FTS_COMMON_TABLE;
+ fts_table.table = table;
+
+ fts_table.parent = table->name;
+
+ trx = trx_allocate_for_background();
+
+ trx->op_info = "update the next FTS document id";
+
+ info = pars_info_create();
+
+ pars_info_bind_function(
+ info, "my_func", fts_fetch_store_doc_id, doc_id);
+
+ graph = fts_parse_sql(
+ &fts_table, info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS SELECT value FROM \"%s\""
+ " WHERE key = 'synced_doc_id' FOR UPDATE;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ *doc_id = 0;
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+ // FIXME: We need to retry deadlock errors
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (read_only) {
+ goto func_exit;
+ }
+
+ if (doc_id_cmp == 0 && *doc_id) {
+ cache->synced_doc_id = *doc_id - 1;
+ } else {
+ cache->synced_doc_id = ut_max(doc_id_cmp, *doc_id);
+ }
+
+ mutex_enter(&cache->doc_id_lock);
+ /* For each sync operation, we increment next_doc_id by 1
+ to mark the sync operation */
+ if (cache->next_doc_id < cache->synced_doc_id + 1) {
+ cache->next_doc_id = cache->synced_doc_id + 1;
+ }
+ mutex_exit(&cache->doc_id_lock);
+
+ if (doc_id_cmp > *doc_id) {
+ error = fts_update_sync_doc_id(
+ table, table->name, cache->synced_doc_id, trx);
+ }
+
+ *doc_id = cache->next_doc_id;
+
+func_exit:
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ } else {
+ *doc_id = 0;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while getting next doc id.\n", ut_strerr(error));
+
+ fts_sql_rollback(trx);
+
+ if (error == DB_DEADLOCK) {
+
os_thread_sleep(FTS_DEADLOCK_RETRY_WAIT); + goto retry; + } + } + + trx_free_for_background(trx); + + return(error); +} + +/*********************************************************************//** +Update the last document id. This function could create a new +transaction to update the last document id. +@return DB_SUCCESS if OK */ +static +dberr_t +fts_update_sync_doc_id( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + const char* table_name, /*!< in: table name, or NULL */ + doc_id_t doc_id, /*!< in: last document id */ + trx_t* trx) /*!< in: update trx, or NULL */ +{ + byte id[FTS_MAX_ID_LEN]; + pars_info_t* info; + fts_table_t fts_table; + ulint id_len; + que_t* graph = NULL; + dberr_t error; + ibool local_trx = FALSE; + fts_cache_t* cache = table->fts->cache; + + fts_table.suffix = "CONFIG"; + fts_table.table_id = table->id; + fts_table.type = FTS_COMMON_TABLE; + fts_table.table = table; + if (table_name) { + fts_table.parent = table_name; + } else { + fts_table.parent = table->name; + } + + if (!trx) { + trx = trx_allocate_for_background(); + + trx->op_info = "setting last FTS document id"; + local_trx = TRUE; + } + + info = pars_info_create(); + + id_len = ut_snprintf( + (char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1); + + pars_info_bind_varchar_literal(info, "doc_id", id, id_len); + + graph = fts_parse_sql( + &fts_table, info, + "BEGIN " + "UPDATE \"%s\" SET value = :doc_id" + " WHERE key = 'synced_doc_id';"); + + error = fts_eval_sql(trx, graph); + + fts_que_graph_free_check_lock(&fts_table, NULL, graph); + + if (local_trx) { + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + cache->synced_doc_id = doc_id; + } else { + + ib_logf(IB_LOG_LEVEL_ERROR, + "(%s) while updating last doc id.", + ut_strerr(error)); + + fts_sql_rollback(trx); + } + trx_free_for_background(trx); + } + + return(error); +} + +/*********************************************************************//** +Create a new fts_doc_ids_t. +@return new fts_doc_ids_t */ +UNIV_INTERN +fts_doc_ids_t* +fts_doc_ids_create(void) +/*====================*/ +{ + fts_doc_ids_t* fts_doc_ids; + mem_heap_t* heap = mem_heap_create(512); + + fts_doc_ids = static_cast<fts_doc_ids_t*>( + mem_heap_alloc(heap, sizeof(*fts_doc_ids))); + + fts_doc_ids->self_heap = ib_heap_allocator_create(heap); + + fts_doc_ids->doc_ids = static_cast<ib_vector_t*>(ib_vector_create( + fts_doc_ids->self_heap, sizeof(fts_update_t), 32)); + + return(fts_doc_ids); +} + +/*********************************************************************//** +Free a fts_doc_ids_t. */ + +void +fts_doc_ids_free( +/*=============*/ + fts_doc_ids_t* fts_doc_ids) +{ + mem_heap_t* heap = static_cast<mem_heap_t*>( + fts_doc_ids->self_heap->arg); + + memset(fts_doc_ids, 0, sizeof(*fts_doc_ids)); + + mem_heap_free(heap); +} + +/*********************************************************************//** +Do commit-phase steps necessary for the insertion of a new row. 
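+"Insertion" here means fetching the committed row by doc id and
+tokenizing it into the in-memory cache; the auxiliary INDEX tables are
+only written later, when the cache is synced.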
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_add( +/*====*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + dict_table_t* table = ftt->table; + dberr_t error = DB_SUCCESS; + doc_id_t doc_id = row->doc_id; + + ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY); + + fts_add_doc_by_id(ftt, doc_id, row->fts_indexes); + + if (error == DB_SUCCESS) { + mutex_enter(&table->fts->cache->deleted_lock); + ++table->fts->cache->added; + mutex_exit(&table->fts->cache->deleted_lock); + + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + && doc_id >= table->fts->cache->next_doc_id) { + table->fts->cache->next_doc_id = doc_id + 1; + } + } + + return(error); +} + +/*********************************************************************//** +Do commit-phase steps necessary for the deletion of a row. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_delete( +/*=======*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + que_t* graph; + fts_table_t fts_table; + dberr_t error = DB_SUCCESS; + doc_id_t write_doc_id; + dict_table_t* table = ftt->table; + doc_id_t doc_id = row->doc_id; + trx_t* trx = ftt->fts_trx->trx; + pars_info_t* info = pars_info_create(); + fts_cache_t* cache = table->fts->cache; + + /* we do not index Documents whose Doc ID value is 0 */ + if (doc_id == FTS_NULL_DOC_ID) { + ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)); + return(error); + } + + ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY); + + FTS_INIT_FTS_TABLE(&fts_table, "DELETED", FTS_COMMON_TABLE, table); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &write_doc_id); + + /* It is possible we update a record that has not yet been sync-ed + into cache from last crash (delete Doc will not initialize the + sync). Avoid any added counter accounting until the FTS cache + is re-established and sync-ed */ + if (table->fts->fts_status & ADDED_TABLE_SYNCED + && doc_id > cache->synced_doc_id) { + mutex_enter(&table->fts->cache->deleted_lock); + + /* The Doc ID could belong to those left in + ADDED table from last crash. So need to check + if it is less than first_doc_id when we initialize + the Doc ID system after reboot */ + if (doc_id >= table->fts->cache->first_doc_id + && table->fts->cache->added > 0) { + --table->fts->cache->added; + } + + mutex_exit(&table->fts->cache->deleted_lock); + + /* Only if the row was really deleted. */ + ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY); + } + + /* Note the deleted document for OPTIMIZE to purge. */ + if (error == DB_SUCCESS) { + + trx->op_info = "adding doc id to FTS DELETED"; + + info->graph_owns_us = TRUE; + + fts_table.suffix = "DELETED"; + + graph = fts_parse_sql( + &fts_table, + info, + "BEGIN INSERT INTO \"%s\" VALUES (:doc_id);"); + + error = fts_eval_sql(trx, graph); + + fts_que_graph_free(graph); + } else { + pars_info_free(info); + } + + /* Increment the total deleted count, this is used to calculate the + number of documents indexed. */ + if (error == DB_SUCCESS) { + mutex_enter(&table->fts->cache->deleted_lock); + + ++table->fts->cache->deleted; + + mutex_exit(&table->fts->cache->deleted_lock); + } + + return(error); +} + +/*********************************************************************//** +Do commit-phase steps necessary for the modification of a row. 
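+A modification is processed as a delete of the old entry followed by an
+add of the new one, as the body below shows.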
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_modify( +/*=======*/ + fts_trx_table_t* ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + dberr_t error; + + ut_a(row->state == FTS_MODIFY); + + error = fts_delete(ftt, row); + + if (error == DB_SUCCESS) { + error = fts_add(ftt, row); + } + + return(error); +} + +/*********************************************************************//** +Create a new document id. +@return DB_SUCCESS if all went well else error */ +UNIV_INTERN +dberr_t +fts_create_doc_id( +/*==============*/ + dict_table_t* table, /*!< in: row is of this table. */ + dtuple_t* row, /* in/out: add doc id value to this + row. This is the current row that is + being inserted. */ + mem_heap_t* heap) /*!< in: heap */ +{ + doc_id_t doc_id; + dberr_t error = DB_SUCCESS; + + ut_a(table->fts->doc_col != ULINT_UNDEFINED); + + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + if (table->fts->cache->first_doc_id == FTS_NULL_DOC_ID) { + error = fts_get_next_doc_id(table, &doc_id); + } + return(error); + } + + error = fts_get_next_doc_id(table, &doc_id); + + if (error == DB_SUCCESS) { + dfield_t* dfield; + doc_id_t* write_doc_id; + + ut_a(doc_id > 0); + + dfield = dtuple_get_nth_field(row, table->fts->doc_col); + write_doc_id = static_cast<doc_id_t*>( + mem_heap_alloc(heap, sizeof(*write_doc_id))); + + ut_a(doc_id != FTS_NULL_DOC_ID); + ut_a(sizeof(doc_id) == dfield->type.len); + fts_write_doc_id((byte*) write_doc_id, doc_id); + + dfield_set_data(dfield, write_doc_id, sizeof(*write_doc_id)); + } + + return(error); +} + +/*********************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_commit_table( +/*=============*/ + fts_trx_table_t* ftt) /*!< in: FTS table to commit*/ +{ + const ib_rbt_node_t* node; + ib_rbt_t* rows; + dberr_t error = DB_SUCCESS; + fts_cache_t* cache = ftt->table->fts->cache; + trx_t* trx = trx_allocate_for_background(); + + rows = ftt->rows; + + ftt->fts_trx->trx = trx; + + if (cache->get_docs == NULL) { + rw_lock_x_lock(&cache->init_lock); + if (cache->get_docs == NULL) { + cache->get_docs = fts_get_docs_create(cache); + } + rw_lock_x_unlock(&cache->init_lock); + } + + for (node = rbt_first(rows); + node != NULL && error == DB_SUCCESS; + node = rbt_next(rows, node)) { + + fts_trx_row_t* row = rbt_value(fts_trx_row_t, node); + + switch (row->state) { + case FTS_INSERT: + error = fts_add(ftt, row); + break; + + case FTS_MODIFY: + error = fts_modify(ftt, row); + break; + + case FTS_DELETE: + error = fts_delete(ftt, row); + break; + + default: + ut_error; + } + } + + fts_sql_commit(trx); + + trx_free_for_background(trx); + + return(error); +} + +/*********************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. 
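+This walks the tables recorded in the transaction's current savepoint
+and replays each row's final accumulated state via fts_commit_table().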
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_commit( +/*=======*/ + trx_t* trx) /*!< in: transaction */ +{ + const ib_rbt_node_t* node; + dberr_t error; + ib_rbt_t* tables; + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(trx->fts_trx->savepoints)); + tables = savepoint->tables; + + for (node = rbt_first(tables), error = DB_SUCCESS; + node != NULL && error == DB_SUCCESS; + node = rbt_next(tables, node)) { + + fts_trx_table_t** ftt; + + ftt = rbt_value(fts_trx_table_t*, node); + + error = fts_commit_table(*ftt); + } + + return(error); +} + +/*********************************************************************//** +Initialize a document. */ +UNIV_INTERN +void +fts_doc_init( +/*=========*/ + fts_doc_t* doc) /*!< in: doc to initialize */ +{ + mem_heap_t* heap = mem_heap_create(32); + + memset(doc, 0, sizeof(*doc)); + + doc->self_heap = ib_heap_allocator_create(heap); +} + +/*********************************************************************//** +Free document. */ +UNIV_INTERN +void +fts_doc_free( +/*=========*/ + fts_doc_t* doc) /*!< in: document */ +{ + mem_heap_t* heap = static_cast<mem_heap_t*>(doc->self_heap->arg); + + if (doc->tokens) { + rbt_free(doc->tokens); + } + +#ifdef UNIV_DEBUG + memset(doc, 0, sizeof(*doc)); +#endif /* UNIV_DEBUG */ + + mem_heap_free(heap); +} + +/*********************************************************************//** +Callback function for fetch that stores a row id to the location pointed. +The column's type must be DATA_FIXBINARY, DATA_BINARY_TYPE, length = 8. +@return always returns NULL */ +UNIV_INTERN +void* +fts_fetch_row_id( +/*=============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: data pointer */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_FIXBINARY); + ut_a(dtype_get_prtype(type) & DATA_BINARY_TYPE); + ut_a(len == 8); + + memcpy(user_arg, dfield_get_data(dfield), 8); + + return(NULL); +} + +/*********************************************************************//** +Callback function for fetch that stores the text of an FTS document, +converting each column to UTF-16. 
+@return always FALSE */ +UNIV_INTERN +ibool +fts_query_expansion_fetch_doc( +/*==========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + que_node_t* exp; + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_doc_t* result_doc = static_cast<fts_doc_t*>(user_arg); + dfield_t* dfield; + ulint len; + ulint doc_len; + fts_doc_t doc; + CHARSET_INFO* doc_charset = NULL; + ulint field_no = 0; + + len = 0; + + fts_doc_init(&doc); + doc.found = TRUE; + + exp = node->select_list; + doc_len = 0; + + doc_charset = result_doc->charset; + + /* Copy each indexed column content into doc->text.f_str */ + while (exp) { + dfield = que_node_get_val(exp); + len = dfield_get_len(dfield); + + /* NULL column */ + if (len == UNIV_SQL_NULL) { + exp = que_node_get_next(exp); + continue; + } + + if (!doc_charset) { + ulint prtype = dfield->type.prtype; + doc_charset = innobase_get_fts_charset( + (int)(prtype & DATA_MYSQL_TYPE_MASK), + (uint) dtype_get_charset_coll(prtype)); + } + + doc.charset = doc_charset; + + if (dfield_is_ext(dfield)) { + /* We ignore columns that are stored externally, this + could result in too many words to search */ + exp = que_node_get_next(exp); + continue; + } else { + doc.text.f_n_char = 0; + + doc.text.f_str = static_cast<byte*>( + dfield_get_data(dfield)); + + doc.text.f_len = len; + } + + if (field_no == 0) { + fts_tokenize_document(&doc, result_doc); + } else { + fts_tokenize_document_next(&doc, doc_len, result_doc); + } + + exp = que_node_get_next(exp); + + doc_len += (exp) ? len + 1 : len; + + field_no++; + } + + ut_ad(doc_charset); + + if (!result_doc->charset) { + result_doc->charset = doc_charset; + } + + fts_doc_free(&doc); + + return(FALSE); +} + +/*********************************************************************//** +fetch and tokenize the document. 
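+Each indexed column of the clustered index record is passed to
+fts_tokenize_document() or fts_tokenize_document_next(), with externally
+stored (BLOB) columns copied out of the record first.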
*/ +static +void +fts_fetch_doc_from_rec( +/*===================*/ + fts_get_doc_t* get_doc, /*!< in: FTS index's get_doc struct */ + dict_index_t* clust_index, /*!< in: cluster index */ + btr_pcur_t* pcur, /*!< in: cursor whose position + has been stored */ + ulint* offsets, /*!< in: offsets */ + fts_doc_t* doc) /*!< out: fts doc to hold parsed + documents */ +{ + dict_index_t* index; + dict_table_t* table; + const rec_t* clust_rec; + ulint num_field; + const dict_field_t* ifield; + const dict_col_t* col; + ulint clust_pos; + ulint i; + ulint doc_len = 0; + ulint processed_doc = 0; + + if (!get_doc) { + return; + } + + index = get_doc->index_cache->index; + table = get_doc->index_cache->index->table; + + clust_rec = btr_pcur_get_rec(pcur); + + num_field = dict_index_get_n_fields(index); + + for (i = 0; i < num_field; i++) { + ifield = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ifield); + clust_pos = dict_col_get_clust_pos(col, clust_index); + + if (!get_doc->index_cache->charset) { + ulint prtype = ifield->col->prtype; + + get_doc->index_cache->charset = + innobase_get_fts_charset( + (int) (prtype & DATA_MYSQL_TYPE_MASK), + (uint) dtype_get_charset_coll(prtype)); + } + + if (rec_offs_nth_extern(offsets, clust_pos)) { + doc->text.f_str = + btr_rec_copy_externally_stored_field( + clust_rec, offsets, + dict_table_zip_size(table), + clust_pos, &doc->text.f_len, + static_cast<mem_heap_t*>( + doc->self_heap->arg)); + } else { + doc->text.f_str = (byte*) rec_get_nth_field( + clust_rec, offsets, clust_pos, + &doc->text.f_len); + } + + doc->found = TRUE; + doc->charset = get_doc->index_cache->charset; + + /* Null Field */ + if (doc->text.f_len == UNIV_SQL_NULL) { + continue; + } + + if (processed_doc == 0) { + fts_tokenize_document(doc, NULL); + } else { + fts_tokenize_document_next(doc, doc_len, NULL); + } + + processed_doc++; + doc_len += doc->text.f_len + 1; + } +} + +/*********************************************************************//** +This function fetches the document inserted during the committing +transaction, and tokenize the inserted text data and insert into +FTS auxiliary table and its cache. +@return TRUE if successful */ +static +ulint +fts_add_doc_by_id( +/*==============*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* fts_indexes __attribute__((unused))) + /*!< in: affected fts indexes */ +{ + mtr_t mtr; + mem_heap_t* heap; + btr_pcur_t pcur; + dict_table_t* table; + dtuple_t* tuple; + dfield_t* dfield; + fts_get_doc_t* get_doc; + doc_id_t temp_doc_id; + dict_index_t* clust_index; + dict_index_t* fts_id_index; + ibool is_id_cluster; + fts_cache_t* cache = ftt->table->fts->cache; + + ut_ad(cache->get_docs); + + /* If Doc ID has been supplied by the user, then the table + might not yet be sync-ed */ + + if (!(ftt->table->fts->fts_status & ADDED_TABLE_SYNCED)) { + fts_init_index(ftt->table, FALSE); + } + + /* Get the first FTS index's get_doc */ + get_doc = static_cast<fts_get_doc_t*>( + ib_vector_get(cache->get_docs, 0)); + ut_ad(get_doc); + + table = get_doc->index_cache->index->table; + + heap = mem_heap_create(512); + + clust_index = dict_table_get_first_index(table); + fts_id_index = dict_table_get_index_on_name( + table, FTS_DOC_ID_INDEX_NAME); + + /* Check whether the index on FTS_DOC_ID is cluster index */ + is_id_cluster = (clust_index == fts_id_index); + + mtr_start(&mtr); + btr_pcur_init(&pcur); + + /* Search based on Doc ID. 
Here, we'll need to consider the case + when there is no primary index on Doc ID */ + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + dfield->type.mtype = DATA_INT; + dfield->type.prtype = DATA_NOT_NULL | DATA_UNSIGNED | DATA_BINARY_TYPE; + + mach_write_to_8((byte*) &temp_doc_id, doc_id); + dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id)); + + btr_pcur_open_with_no_init( + fts_id_index, tuple, PAGE_CUR_LE, BTR_SEARCH_LEAF, + &pcur, 0, &mtr); + + /* If we have a match, add the data to doc structure */ + if (btr_pcur_get_low_match(&pcur) == 1) { + const rec_t* rec; + btr_pcur_t* doc_pcur; + const rec_t* clust_rec; + btr_pcur_t clust_pcur; + ulint* offsets = NULL; + ulint num_idx = ib_vector_size(cache->get_docs); + + rec = btr_pcur_get_rec(&pcur); + + /* Doc could be deleted */ + if (page_rec_is_infimum(rec) + || rec_get_deleted_flag(rec, dict_table_is_comp(table))) { + + goto func_exit; + } + + if (is_id_cluster) { + clust_rec = rec; + doc_pcur = &pcur; + } else { + dtuple_t* clust_ref; + ulint n_fields; + + btr_pcur_init(&clust_pcur); + n_fields = dict_index_get_n_unique(clust_index); + + clust_ref = dtuple_create(heap, n_fields); + dict_index_copy_types(clust_ref, clust_index, n_fields); + + row_build_row_ref_in_tuple( + clust_ref, rec, fts_id_index, NULL, NULL); + + btr_pcur_open_with_no_init( + clust_index, clust_ref, PAGE_CUR_LE, + BTR_SEARCH_LEAF, &clust_pcur, 0, &mtr); + + doc_pcur = &clust_pcur; + clust_rec = btr_pcur_get_rec(&clust_pcur); + + } + + offsets = rec_get_offsets(clust_rec, clust_index, + NULL, ULINT_UNDEFINED, &heap); + + for (ulint i = 0; i < num_idx; ++i) { + fts_doc_t doc; + dict_table_t* table; + fts_get_doc_t* get_doc; + + get_doc = static_cast<fts_get_doc_t*>( + ib_vector_get(cache->get_docs, i)); + + table = get_doc->index_cache->index->table; + + fts_doc_init(&doc); + + fts_fetch_doc_from_rec( + get_doc, clust_index, doc_pcur, offsets, &doc); + + if (doc.found) { + ibool success __attribute__((unused)); + + btr_pcur_store_position(doc_pcur, &mtr); + mtr_commit(&mtr); + + rw_lock_x_lock(&table->fts->cache->lock); + + if (table->fts->cache->stopword_info.status + & STOPWORD_NOT_INIT) { + fts_load_stopword(table, NULL, NULL, + NULL, TRUE, TRUE); + } + + fts_cache_add_doc( + table->fts->cache, + get_doc->index_cache, + doc_id, doc.tokens); + + rw_lock_x_unlock(&table->fts->cache->lock); + + DBUG_EXECUTE_IF( + "fts_instrument_sync", + fts_sync(cache->sync); + ); + + if (cache->total_size > fts_max_cache_size + || fts_need_sync) { + fts_sync(cache->sync); + } + + mtr_start(&mtr); + + if (i < num_idx - 1) { + + success = btr_pcur_restore_position( + BTR_SEARCH_LEAF, doc_pcur, + &mtr); + + ut_ad(success); + } + } + + fts_doc_free(&doc); + } + + if (!is_id_cluster) { + btr_pcur_close(doc_pcur); + } + } +func_exit: + mtr_commit(&mtr); + + btr_pcur_close(&pcur); + + mem_heap_free(heap); + return(TRUE); +} + + +/*********************************************************************//** +Callback function to read a single ulint column. 
+return always returns TRUE */ +static +ibool +fts_read_ulint( +/*===========*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ulint */ +{ + sel_node_t* sel_node = static_cast<sel_node_t*>(row); + ulint* value = static_cast<ulint*>(user_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + + *value = static_cast<ulint>(mach_read_from_4( + static_cast<const byte*>(data))); + + return(TRUE); +} + +/*********************************************************************//** +Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists +@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */ +UNIV_INTERN +doc_id_t +fts_get_max_doc_id( +/*===============*/ + dict_table_t* table) /*!< in: user table */ +{ + dict_index_t* index; + dict_field_t* dfield __attribute__((unused)) = NULL; + doc_id_t doc_id = 0; + mtr_t mtr; + btr_pcur_t pcur; + + index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME); + + if (!index) { + return(0); + } + + dfield = dict_index_get_nth_field(index, 0); + +#if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. */ + ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0); +#endif + + mtr_start(&mtr); + + /* fetch the largest indexes value */ + btr_pcur_open_at_index_side( + false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); + + if (!page_is_empty(btr_pcur_get_page(&pcur))) { + const rec_t* rec = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + ulint len; + const void* data; + + rec_offs_init(offsets_); + + do { + rec = btr_pcur_get_rec(&pcur); + + if (page_rec_is_user_rec(rec)) { + break; + } + } while (btr_pcur_move_to_prev(&pcur, &mtr)); + + if (!rec) { + goto func_exit; + } + + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, &heap); + + data = rec_get_nth_field(rec, offsets, 0, &len); + + doc_id = static_cast<doc_id_t>(fts_read_doc_id( + static_cast<const byte*>(data))); + } + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + return(doc_id); +} + +/*********************************************************************//** +Fetch document with the given document id. +@return DB_SUCCESS if OK else error */ +UNIV_INTERN +dberr_t +fts_doc_fetch_by_doc_id( +/*====================*/ + fts_get_doc_t* get_doc, /*!< in: state */ + doc_id_t doc_id, /*!< in: id of document to + fetch */ + dict_index_t* index_to_use, /*!< in: caller supplied FTS index, + or NULL */ + ulint option, /*!< in: search option, if it is + greater than doc_id or equal */ + fts_sql_callback + callback, /*!< in: callback to read */ + void* arg) /*!< in: callback arg */ +{ + pars_info_t* info; + dberr_t error; + const char* select_str; + doc_id_t write_doc_id; + dict_index_t* index; + trx_t* trx = trx_allocate_for_background(); + que_t* graph; + + trx->op_info = "fetching indexed FTS document"; + + /* The FTS index can be supplied by caller directly with + "index_to_use", otherwise, get it from "get_doc" */ + index = (index_to_use) ? index_to_use : get_doc->index_cache->index; + + if (get_doc && get_doc->get_document_graph) { + info = get_doc->get_document_graph->info; + } else { + info = pars_info_create(); + } + + /* Convert to "storage" byte order. 
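
fts_get_max_doc_id() above relies on FTS_DOC_ID_INDEX keeping doc ids in key order, so the maximum is simply the last user record of the index. The same idea in a stand-alone sketch, with std::map standing in for the B-tree (the data is made up):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        // std::map models the FTS_DOC_ID_INDEX B-tree.
        std::map<uint64_t, std::string> doc_index = {
            {1, "first doc"}, {7, "latest doc"}, {3, "middle doc"}
        };
        uint64_t max_doc_id = 0;
        if (!doc_index.empty()) {
            max_doc_id = doc_index.rbegin()->first;  // read in from the end
        }
        std::printf("max doc id = %llu\n",
                    (unsigned long long) max_doc_id);
        return 0;
    }
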
*/ + fts_write_doc_id((byte*) &write_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &write_doc_id); + pars_info_bind_function(info, "my_func", callback, arg); + + select_str = fts_get_select_columns_str(index, info, info->heap); + pars_info_bind_id(info, TRUE, "table_name", index->table_name); + + if (!get_doc || !get_doc->get_document_graph) { + if (option == FTS_FETCH_DOC_BY_ID_EQUAL) { + graph = fts_parse_sql( + NULL, + info, + mem_heap_printf(info->heap, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT %s FROM $table_name" + " WHERE %s = :doc_id;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c %% NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;", + select_str, FTS_DOC_ID_COL_NAME)); + } else { + ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE); + + /* This is used for crash recovery of table with + hidden DOC ID or FTS indexes. We will scan the table + to re-processing user table rows whose DOC ID or + FTS indexed documents have not been sync-ed to disc + during recent crash. + In the case that all fulltext indexes are dropped + for a table, we will keep the "hidden" FTS_DOC_ID + column, and this scan is to retreive the largest + DOC ID being used in the table to determine the + appropriate next DOC ID. + In the case of there exists fulltext index(es), this + operation will re-tokenize any docs that have not + been sync-ed to the disk, and re-prime the FTS + cached */ + graph = fts_parse_sql( + NULL, + info, + mem_heap_printf(info->heap, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT %s, %s FROM $table_name" + " WHERE %s > :doc_id;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c %% NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;", + FTS_DOC_ID_COL_NAME, + select_str, FTS_DOC_ID_COL_NAME)); + } + if (get_doc) { + get_doc->get_document_graph = graph; + } + } else { + graph = get_doc->get_document_graph; + } + + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + } else { + fts_sql_rollback(trx); + } + + trx_free_for_background(trx); + + if (!get_doc) { + fts_que_graph_free(graph); + } + + return(error); +} + +/*********************************************************************//** +Write out a single word's data as new entry/entries in the INDEX table. +@return DB_SUCCESS if all OK. */ +UNIV_INTERN +dberr_t +fts_write_node( +/*===========*/ + trx_t* trx, /*!< in: transaction */ + que_t** graph, /*!< in: query graph */ + fts_table_t* fts_table, /*!< in: aux table */ + fts_string_t* word, /*!< in: word in UTF-8 */ + fts_node_t* node) /*!< in: node columns */ +{ + pars_info_t* info; + dberr_t error; + ib_uint32_t doc_count; + ib_time_t start_time; + doc_id_t last_doc_id; + doc_id_t first_doc_id; + + if (*graph) { + info = (*graph)->info; + } else { + info = pars_info_create(); + } + + pars_info_bind_varchar_literal(info, "token", word->f_str, word->f_len); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &first_doc_id, node->first_doc_id); + fts_bind_doc_id(info, "first_doc_id", &first_doc_id); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &last_doc_id, node->last_doc_id); + fts_bind_doc_id(info, "last_doc_id", &last_doc_id); + + ut_a(node->last_doc_id >= node->first_doc_id); + + /* Convert to "storage" byte order. 
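
The FTS_FETCH_DOC_BY_ID_LARGE branch above turns crash recovery into a range scan: every document whose id is greater than the last synced doc id is fetched again and re-tokenized. A stand-alone model of that scan, using std::map::upper_bound() in place of the B-tree cursor (hypothetical data):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        std::map<uint64_t, std::string> table = {   // hypothetical user table
            {1, "synced"}, {2, "synced"},
            {3, "lost from cache"}, {4, "lost from cache"}
        };
        uint64_t last_synced_doc_id = 2;

        // "WHERE doc_id > :doc_id" == everything past upper_bound().
        for (auto it = table.upper_bound(last_synced_doc_id);
             it != table.end(); ++it) {
            std::printf("re-tokenize doc %llu: %s\n",
                        (unsigned long long) it->first, it->second.c_str());
        }
        return 0;
    }
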
*/ + mach_write_to_4((byte*) &doc_count, node->doc_count); + pars_info_bind_int4_literal( + info, "doc_count", (const ib_uint32_t*) &doc_count); + + /* Set copy_name to FALSE since it's a static. */ + pars_info_bind_literal( + info, "ilist", node->ilist, node->ilist_size, + DATA_BLOB, DATA_BINARY_TYPE); + + if (!*graph) { + *graph = fts_parse_sql( + fts_table, + info, + "BEGIN\n" + "INSERT INTO \"%s\" VALUES " + "(:token, :first_doc_id," + " :last_doc_id, :doc_count, :ilist);"); + } + + start_time = ut_time(); + error = fts_eval_sql(trx, *graph); + elapsed_time += ut_time() - start_time; + ++n_nodes; + + return(error); +} + +/*********************************************************************//** +Add rows to the DELETED_CACHE table. +@return DB_SUCCESS if all went well else error code*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_sync_add_deleted_cache( +/*=======================*/ + fts_sync_t* sync, /*!< in: sync state */ + ib_vector_t* doc_ids) /*!< in: doc ids to add */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + fts_table_t fts_table; + doc_id_t dummy = 0; + dberr_t error = DB_SUCCESS; + ulint n_elems = ib_vector_size(doc_ids); + + ut_a(ib_vector_size(doc_ids) > 0); + + ib_vector_sort(doc_ids, fts_update_doc_id_cmp); + + info = pars_info_create(); + + fts_bind_doc_id(info, "doc_id", &dummy); + + FTS_INIT_FTS_TABLE( + &fts_table, "DELETED_CACHE", FTS_COMMON_TABLE, sync->table); + + graph = fts_parse_sql( + &fts_table, + info, + "BEGIN INSERT INTO \"%s\" VALUES (:doc_id);"); + + for (i = 0; i < n_elems && error == DB_SUCCESS; ++i) { + fts_update_t* update; + doc_id_t write_doc_id; + + update = static_cast<fts_update_t*>(ib_vector_get(doc_ids, i)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, update->doc_id); + fts_bind_doc_id(info, "doc_id", &write_doc_id); + + error = fts_eval_sql(sync->trx, graph); + } + + fts_que_graph_free(graph); + + return(error); +} + +/*********************************************************************//** +Write the words and ilist to disk. +@return DB_SUCCESS if all went well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_sync_write_words( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + fts_index_cache_t* + index_cache) /*!< in: index cache */ +{ + fts_table_t fts_table; + ulint n_nodes = 0; + ulint n_words = 0; + const ib_rbt_node_t* rbt_node; + dberr_t error = DB_SUCCESS; + ibool print_error = FALSE; +#ifdef FTS_DOC_STATS_DEBUG + dict_table_t* table = index_cache->index->table; + ulint n_new_words = 0; +#endif /* FTS_DOC_STATS_DEBUG */ + + FTS_INIT_INDEX_TABLE( + &fts_table, NULL, FTS_INDEX_TABLE, index_cache->index); + + n_words = rbt_size(index_cache->words); + + /* We iterate over the entire tree, even if there is an error, + since we want to free the memory used during caching. */ + for (rbt_node = rbt_first(index_cache->words); + rbt_node; + rbt_node = rbt_first(index_cache->words)) { + + ulint i; + ulint selected; + fts_tokenizer_word_t* word; + + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + selected = fts_select_index( + index_cache->charset, word->text.f_str, + word->text.f_len); + + fts_table.suffix = fts_get_suffix(selected); + +#ifdef FTS_DOC_STATS_DEBUG + /* Check if the word exists in the FTS index and if not + then we need to increment the total word count stats. 
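
In fts_sync_write_words() above, fts_select_index() routes every word to one of a fixed set of auxiliary index tables, so a given word's node rows always land in the same shard. A stand-alone sketch of such a stable routing function; the selector here keys on the first byte only, whereas the real one is charset-aware:

    #include <cstdio>
    #include <initializer_list>
    #include <string>

    static const char* fts_suffix[] = { "INDEX_1", "INDEX_2", "INDEX_3",
                                        "INDEX_4", "INDEX_5", "INDEX_6" };

    // Hypothetical selector: route by the word's first byte.
    static unsigned select_index(const std::string& word) {
        return word.empty() ? 0 : (unsigned char) word[0] % 6;
    }

    int main() {
        for (const char* w : { "apple", "banana", "apple" }) {
            std::printf("%-6s -> %s\n", w, fts_suffix[select_index(w)]);
        }
        return 0;   // "apple" maps to the same suffix both times
    }

The stability of the mapping is the point: lookups and writes for one word must agree on the shard, or postings would be split across tables.
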
*/ + if (error == DB_SUCCESS && fts_enable_diag_print) { + ibool found = FALSE; + + error = fts_is_word_in_index( + trx, + &index_cache->sel_graph[selected], + &fts_table, + &word->text, &found); + + if (error == DB_SUCCESS && !found) { + + ++n_new_words; + } + } +#endif /* FTS_DOC_STATS_DEBUG */ + + n_nodes += ib_vector_size(word->nodes); + + /* We iterate over all the nodes even if there was an error, + this is to free the memory of the fts_node_t elements. */ + for (i = 0; i < ib_vector_size(word->nodes); ++i) { + + fts_node_t* fts_node = static_cast<fts_node_t*>( + ib_vector_get(word->nodes, i)); + + if (error == DB_SUCCESS) { + + error = fts_write_node( + trx, + &index_cache->ins_graph[selected], + &fts_table, &word->text, fts_node); + } + + ut_free(fts_node->ilist); + fts_node->ilist = NULL; + } + + if (error != DB_SUCCESS && !print_error) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error (%s) writing " + "word node to FTS auxiliary index " + "table.\n", ut_strerr(error)); + + print_error = TRUE; + } + + /* NOTE: We are responsible for free'ing the node */ + ut_free(rbt_remove_node(index_cache->words, rbt_node)); + } + +#ifdef FTS_DOC_STATS_DEBUG + if (error == DB_SUCCESS && n_new_words > 0 && fts_enable_diag_print) { + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + + /* Increment the total number of words in the FTS index */ + error = fts_config_increment_index_value( + trx, index_cache->index, FTS_TOTAL_WORD_COUNT, + n_new_words); + } +#endif /* FTS_DOC_STATS_DEBUG */ + + if (fts_enable_diag_print) { + printf("Avg number of nodes: %lf\n", + (double) n_nodes / (double) (n_words > 1 ? n_words : 1)); + } + + return(error); +} + +#ifdef FTS_DOC_STATS_DEBUG +/*********************************************************************//** +Write a single documents statistics to disk. +@return DB_SUCCESS if all went well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_sync_write_doc_stat( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: index */ + que_t** graph, /* out: query graph */ + const fts_doc_stats_t* doc_stat) /*!< in: doc stats to write */ +{ + pars_info_t* info; + doc_id_t doc_id; + dberr_t error = DB_SUCCESS; + ib_uint32_t word_count; + + if (*graph) { + info = (*graph)->info; + } else { + info = pars_info_create(); + } + + /* Convert to "storage" byte order. */ + mach_write_to_4((byte*) &word_count, doc_stat->word_count); + pars_info_bind_int4_literal( + info, "count", (const ib_uint32_t*) &word_count); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &doc_id, doc_stat->doc_id); + fts_bind_doc_id(info, "doc_id", &doc_id); + + if (!*graph) { + fts_table_t fts_table; + + FTS_INIT_INDEX_TABLE( + &fts_table, "DOC_ID", FTS_INDEX_TABLE, index); + + *graph = fts_parse_sql( + &fts_table, + info, + "BEGIN INSERT INTO \"%s\" VALUES (:doc_id, :count);"); + } + + for (;;) { + error = fts_eval_sql(trx, *graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: lock wait " + "timeout writing to FTS doc_id. " + "Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: (%s) " + "while writing to FTS doc_id.\n", + ut_strerr(error)); + + break; /* Exit the loop. 
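
The for(;;) loop being closed here retries the statement whenever it fails with a lock wait timeout and gives up on any other error; the same shape recurs in fts_is_word_in_index() and fts_get_rows_count() below. A stand-alone model of the loop, where eval_sql_stub() is a hypothetical statement that times out twice before succeeding:

    #include <cstdio>

    enum db_err { DB_SUCCESS, DB_LOCK_WAIT_TIMEOUT, DB_ERROR };

    static db_err eval_sql_stub() {
        static int attempts = 0;
        return (++attempts < 3) ? DB_LOCK_WAIT_TIMEOUT : DB_SUCCESS;
    }

    int main() {
        db_err err;
        for (;;) {
            err = eval_sql_stub();
            if (err == DB_SUCCESS) {
                break;                          // done
            } else if (err == DB_LOCK_WAIT_TIMEOUT) {
                std::printf("lock wait timeout, retrying\n");
                // the real code resets trx->error_state to DB_SUCCESS here
            } else {
                std::printf("hard error, giving up\n");
                break;
            }
        }
        return err == DB_SUCCESS ? 0 : 1;
    }
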
*/
+ }
+ }
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Write document statistics to disk.
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+fts_sync_write_doc_stats(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ const fts_index_cache_t*index_cache) /*!< in: index cache */
+{
+ dberr_t error = DB_SUCCESS;
+ que_t* graph = NULL;
+ fts_doc_stats_t* doc_stat;
+
+ if (ib_vector_is_empty(index_cache->doc_stats)) {
+ return(DB_SUCCESS);
+ }
+
+ doc_stat = static_cast<fts_doc_stats_t*>(
+ ib_vector_pop(index_cache->doc_stats));
+
+ while (doc_stat) {
+ error = fts_sync_write_doc_stat(
+ trx, index_cache->index, &graph, doc_stat);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ if (ib_vector_is_empty(index_cache->doc_stats)) {
+ break;
+ }
+
+ doc_stat = static_cast<fts_doc_stats_t*>(
+ ib_vector_pop(index_cache->doc_stats));
+ }
+
+ if (graph != NULL) {
+ fts_que_graph_free_check_lock(NULL, index_cache, graph);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Callback to check the existence of a word.
+@return always returns FALSE */
+static
+ibool
+fts_lookup_word(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in/out: ibool*, set to TRUE if found */
+{
+
+ que_node_t* exp;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ ibool* found = static_cast<ibool*>(user_arg);
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ ulint len = dfield_get_len(dfield);
+
+ if (len != UNIV_SQL_NULL && len != 0) {
+ *found = TRUE;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Check whether a particular word (term) exists in the FTS index.
+@return DB_SUCCESS if all went well else error code */
+static
+dberr_t
+fts_is_word_in_index(
+/*=================*/
+ trx_t* trx, /*!< in: FTS query state */
+ que_t** graph, /* out: Query graph */
+ fts_table_t* fts_table, /*!< in: table instance */
+ const fts_string_t*
+ word, /*!< in: the word to check */
+ ibool* found) /* out: TRUE if exists */
+{
+ pars_info_t* info;
+ dberr_t error;
+
+ trx->op_info = "looking up word in FTS index";
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ pars_info_bind_function(info, "my_func", fts_lookup_word, found);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ if (*graph == NULL) {
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_count\n"
+ " FROM \"%s\"\n"
+ " WHERE word = :word "
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+ }
+
+ for (;;) {
+ error = fts_eval_sql(trx, *graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading FTS index. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while reading FTS index.\n",
+ ut_strerr(error));
+
+ break; /* Exit the loop.
*/ + } + } + } + + return(error); +} +#endif /* FTS_DOC_STATS_DEBUG */ + +/*********************************************************************//** +Begin Sync, create transaction, acquire locks, etc. */ +static +void +fts_sync_begin( +/*===========*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + fts_cache_t* cache = sync->table->fts->cache; + + n_nodes = 0; + elapsed_time = 0; + + sync->start_time = ut_time(); + + sync->trx = trx_allocate_for_background(); + + if (fts_enable_diag_print) { + ib_logf(IB_LOG_LEVEL_INFO, + "FTS SYNC for table %s, deleted count: %ld size: " + "%lu bytes", + sync->table->name, + ib_vector_size(cache->deleted_doc_ids), + cache->total_size); + } +} + +/*********************************************************************//** +Run SYNC on the table, i.e., write out data from the index specific +cache to the FTS aux INDEX table and FTS aux doc id stats table. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_sync_index( +/*===========*/ + fts_sync_t* sync, /*!< in: sync state */ + fts_index_cache_t* index_cache) /*!< in: index cache */ +{ + trx_t* trx = sync->trx; + dberr_t error = DB_SUCCESS; + + trx->op_info = "doing SYNC index"; + + if (fts_enable_diag_print) { + ib_logf(IB_LOG_LEVEL_INFO, + "SYNC words: %ld", rbt_size(index_cache->words)); + } + + ut_ad(rbt_validate(index_cache->words)); + + error = fts_sync_write_words(trx, index_cache); + +#ifdef FTS_DOC_STATS_DEBUG + /* FTS_RESOLVE: the word counter info in auxiliary table "DOC_ID" + is not used currently for ranking. We disable fts_sync_write_doc_stats() + for now */ + /* Write the per doc statistics that will be used for ranking. */ + if (error == DB_SUCCESS) { + + error = fts_sync_write_doc_stats(trx, index_cache); + } +#endif /* FTS_DOC_STATS_DEBUG */ + + return(error); +} + +/*********************************************************************//** +Commit the SYNC, change state of processed doc ids etc. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_sync_commit( +/*============*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + dberr_t error; + trx_t* trx = sync->trx; + fts_cache_t* cache = sync->table->fts->cache; + doc_id_t last_doc_id; + + trx->op_info = "doing SYNC commit"; + + /* After each Sync, update the CONFIG table about the max doc id + we just sync-ed to index table */ + error = fts_cmp_set_sync_doc_id(sync->table, sync->max_doc_id, FALSE, + &last_doc_id); + + /* Get the list of deleted documents that are either in the + cache or were headed there but were deleted before the add + thread got to them. */ + + if (error == DB_SUCCESS && ib_vector_size(cache->deleted_doc_ids) > 0) { + + error = fts_sync_add_deleted_cache( + sync, cache->deleted_doc_ids); + } + + /* We need to do this within the deleted lock since fts_delete() can + attempt to add a deleted doc id to the cache deleted id array. 
*/ + fts_cache_clear(cache); + DEBUG_SYNC_C("fts_deleted_doc_ids_clear"); + fts_cache_init(cache); + rw_lock_x_unlock(&cache->lock); + + if (error == DB_SUCCESS) { + + fts_sql_commit(trx); + + } else if (error != DB_SUCCESS) { + + fts_sql_rollback(trx); + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: (%s) during SYNC.\n", + ut_strerr(error)); + } + + if (fts_enable_diag_print && elapsed_time) { + ib_logf(IB_LOG_LEVEL_INFO, + "SYNC for table %s: SYNC time : %lu secs: " + "elapsed %lf ins/sec", + sync->table->name, + (ulong) (ut_time() - sync->start_time), + (double) n_nodes/ (double) elapsed_time); + } + + trx_free_for_background(trx); + + return(error); +} + +/*********************************************************************//** +Rollback a sync operation */ +static +void +fts_sync_rollback( +/*==============*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + trx_t* trx = sync->trx; + fts_cache_t* cache = sync->table->fts->cache; + + rw_lock_x_unlock(&cache->lock); + + fts_sql_rollback(trx); + trx_free_for_background(trx); +} + +/****************************************************************//** +Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. +@return DB_SUCCESS if all OK */ +static +dberr_t +fts_sync( +/*=====*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + fts_cache_t* cache = sync->table->fts->cache; + + rw_lock_x_lock(&cache->lock); + + fts_sync_begin(sync); + + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + fts_index_cache_t* index_cache; + + index_cache = static_cast<fts_index_cache_t*>( + ib_vector_get(cache->indexes, i)); + + if (index_cache->index->to_be_dropped) { + continue; + } + + error = fts_sync_index(sync, index_cache); + + if (error != DB_SUCCESS && !sync->interrupted) { + + break; + } + } + + DBUG_EXECUTE_IF("fts_instrument_sync_interrupted", + sync->interrupted = true; + error = DB_INTERRUPTED; + ); + + if (error == DB_SUCCESS && !sync->interrupted) { + error = fts_sync_commit(sync); + } else { + fts_sync_rollback(sync); + } + + /* We need to check whether an optimize is required, for that + we make copies of the two variables that control the trigger. These + variables can change behind our back and we don't want to hold the + lock for longer than is needed. */ + mutex_enter(&cache->deleted_lock); + + cache->added = 0; + cache->deleted = 0; + + mutex_exit(&cache->deleted_lock); + + return(error); +} + +/****************************************************************//** +Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. */ +UNIV_INTERN +dberr_t +fts_sync_table( +/*===========*/ + dict_table_t* table) /*!< in: table */ +{ + dberr_t err = DB_SUCCESS; + + ut_ad(table->fts); + + if (!dict_table_is_discarded(table) && table->fts->cache) { + err = fts_sync(table->fts->cache->sync); + } + + return(err); +} + +/******************************************************************** +Process next token from document starting at the given position, i.e., add +the token's start position to the token's list of positions. 
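
fts_process_token() below builds the in-memory inverted index: each distinct token owns a vector of the byte offsets at which it occurs in the document. A stand-alone model with a trivial whitespace tokenizer; the real tokenizer is charset-aware and enforces fts_min_token_size/fts_max_token_size:

    #include <cstdio>
    #include <map>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
        std::string text = "red fish blue fish";
        std::map<std::string, std::vector<size_t> > positions;

        std::istringstream in(text);
        std::string tok;
        for (size_t pos = 0; in >> tok; ) {
            pos = text.find(tok, pos);   // byte offset of this occurrence
            positions[tok].push_back(pos);
            pos += tok.size();
        }

        for (const auto& e : positions) {
            std::printf("%s:", e.first.c_str());
            for (size_t p : e.second) std::printf(" %zu", p);
            std::printf("\n");
        }
        return 0;    // fish occurs at offsets 4 and 14
    }
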
+@return number of characters handled in this call */ +static +ulint +fts_process_token( +/*==============*/ + fts_doc_t* doc, /* in/out: document to + tokenize */ + fts_doc_t* result, /* out: if provided, save + result here */ + ulint start_pos, /*!< in: start position in text */ + ulint add_pos) /*!< in: add this position to all + tokens from this tokenization */ +{ + ulint ret; + fts_string_t str; + ulint offset = 0; + fts_doc_t* result_doc; + + /* Determine where to save the result. */ + result_doc = (result) ? result : doc; + + /* The length of a string in characters is set here only. */ + ret = innobase_mysql_fts_get_token( + doc->charset, doc->text.f_str + start_pos, + doc->text.f_str + doc->text.f_len, &str, &offset); + + /* Ignore string whose character number is less than + "fts_min_token_size" or more than "fts_max_token_size" */ + + if (str.f_n_char >= fts_min_token_size + && str.f_n_char <= fts_max_token_size) { + + mem_heap_t* heap; + fts_string_t t_str; + fts_token_t* token; + ib_rbt_bound_t parent; + ulint newlen; + + heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg); + + t_str.f_n_char = str.f_n_char; + + t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1; + + t_str.f_str = static_cast<byte*>( + mem_heap_alloc(heap, t_str.f_len)); + + newlen = innobase_fts_casedn_str( + doc->charset, (char*) str.f_str, str.f_len, + (char*) t_str.f_str, t_str.f_len); + + t_str.f_len = newlen; + t_str.f_str[newlen] = 0; + + /* Add the word to the document statistics. If the word + hasn't been seen before we create a new entry for it. */ + if (rbt_search(result_doc->tokens, &parent, &t_str) != 0) { + fts_token_t new_token; + + new_token.text.f_len = newlen; + new_token.text.f_str = t_str.f_str; + new_token.text.f_n_char = t_str.f_n_char; + + new_token.positions = ib_vector_create( + result_doc->self_heap, sizeof(ulint), 32); + + ut_a(new_token.text.f_n_char >= fts_min_token_size); + ut_a(new_token.text.f_n_char <= fts_max_token_size); + + parent.last = rbt_add_node( + result_doc->tokens, &parent, &new_token); + + ut_ad(rbt_validate(result_doc->tokens)); + } + +#ifdef FTS_CHARSET_DEBUG + offset += start_pos + add_pos; +#endif /* FTS_CHARSET_DEBUG */ + + offset += start_pos + ret - str.f_len + add_pos; + + token = rbt_value(fts_token_t, parent.last); + ib_vector_push(token->positions, &offset); + } + + return(ret); +} + +/******************************************************************//** +Tokenize a document. */ +UNIV_INTERN +void +fts_tokenize_document( +/*==================*/ + fts_doc_t* doc, /* in/out: document to + tokenize */ + fts_doc_t* result) /* out: if provided, save + the result token here */ +{ + ulint inc; + + ut_a(!doc->tokens); + ut_a(doc->charset); + + doc->tokens = rbt_create_arg_cmp( + sizeof(fts_token_t), innobase_fts_text_cmp, doc->charset); + + for (ulint i = 0; i < doc->text.f_len; i += inc) { + inc = fts_process_token(doc, result, i, 0); + ut_a(inc > 0); + } +} + +/******************************************************************//** +Continue to tokenize a document. 
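
fts_tokenize_document_next() below exists so that a document spread over several indexed columns gets one continuous position space: later columns are tokenized with an add_pos offset (doc_len plus one separator byte, as accumulated in fts_fetch_doc_from_rec() above). A stand-alone sketch of that offset bookkeeping:

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<std::string, size_t> > token_list;

    // Toy whitespace tokenizer that records position + add_pos per token.
    static void tokenize(const std::string& text, size_t add_pos,
                         token_list& out) {
        size_t i = 0;
        while (i < text.size()) {
            size_t j = text.find(' ', i);
            if (j == std::string::npos) j = text.size();
            if (j > i) out.push_back(
                std::make_pair(text.substr(i, j - i), i + add_pos));
            i = j + 1;
        }
    }

    int main() {
        std::string col1 = "quick fox", col2 = "lazy dog";
        token_list tokens;
        tokenize(col1, 0, tokens);                 // first column: offset 0
        tokenize(col2, col1.size() + 1, tokens);   // next column: doc_len + 1
        for (const auto& t : tokens)
            std::printf("%-5s at %zu\n", t.first.c_str(), t.second);
        return 0;
    }
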
*/ +UNIV_INTERN +void +fts_tokenize_document_next( +/*=======================*/ + fts_doc_t* doc, /*!< in/out: document to + tokenize */ + ulint add_pos, /*!< in: add this position to all + tokens from this tokenization */ + fts_doc_t* result) /*!< out: if provided, save + the result token here */ +{ + ulint inc; + + ut_a(doc->tokens); + + for (ulint i = 0; i < doc->text.f_len; i += inc) { + inc = fts_process_token(doc, result, i, add_pos); + ut_a(inc > 0); + } +} + +/******************************************************************** +Create the vector of fts_get_doc_t instances. */ +UNIV_INTERN +ib_vector_t* +fts_get_docs_create( +/*================*/ + /* out: vector of + fts_get_doc_t instances */ + fts_cache_t* cache) /*!< in: fts cache */ +{ + ulint i; + ib_vector_t* get_docs; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX)); +#endif + /* We need one instance of fts_get_doc_t per index. */ + get_docs = ib_vector_create( + cache->self_heap, sizeof(fts_get_doc_t), 4); + + /* Create the get_doc instance, we need one of these + per FTS index. */ + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + + dict_index_t** index; + fts_get_doc_t* get_doc; + + index = static_cast<dict_index_t**>( + ib_vector_get(cache->indexes, i)); + + get_doc = static_cast<fts_get_doc_t*>( + ib_vector_push(get_docs, NULL)); + + memset(get_doc, 0x0, sizeof(*get_doc)); + + get_doc->index_cache = fts_get_index_cache(cache, *index); + get_doc->cache = cache; + + /* Must find the index cache. */ + ut_a(get_doc->index_cache != NULL); + } + + return(get_docs); +} + +/******************************************************************** +Release any resources held by the fts_get_doc_t instances. */ +static +void +fts_get_docs_clear( +/*===============*/ + ib_vector_t* get_docs) /*!< in: Doc retrieval vector */ +{ + ulint i; + + /* Release the get doc graphs if any. */ + for (i = 0; i < ib_vector_size(get_docs); ++i) { + + fts_get_doc_t* get_doc = static_cast<fts_get_doc_t*>( + ib_vector_get(get_docs, i)); + + if (get_doc->get_document_graph != NULL) { + + ut_a(get_doc->index_cache); + + fts_que_graph_free(get_doc->get_document_graph); + get_doc->get_document_graph = NULL; + } + } +} + +/*********************************************************************//** +Get the initial Doc ID by consulting the CONFIG table +@return initial Doc ID */ +UNIV_INTERN +doc_id_t +fts_init_doc_id( +/*============*/ + const dict_table_t* table) /*!< in: table */ +{ + doc_id_t max_doc_id = 0; + + rw_lock_x_lock(&table->fts->cache->lock); + + /* Return if the table is already initialized for DOC ID */ + if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) { + rw_lock_x_unlock(&table->fts->cache->lock); + return(0); + } + + DEBUG_SYNC_C("fts_initialize_doc_id"); + + /* Then compare this value with the ID value stored in the CONFIG + table. The larger one will be our new initial Doc ID */ + fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id); + + /* If DICT_TF2_FTS_ADD_DOC_ID is set, we are in the process of + creating index (and add doc id column. 
No need to recovery + documents */ + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + fts_init_index((dict_table_t*) table, TRUE); + } + + table->fts->fts_status |= ADDED_TABLE_SYNCED; + + table->fts->cache->first_doc_id = max_doc_id; + + rw_lock_x_unlock(&table->fts->cache->lock); + + ut_ad(max_doc_id > 0); + + return(max_doc_id); +} + +#ifdef FTS_MULT_INDEX +/*********************************************************************//** +Check if the index is in the affected set. +@return TRUE if index is updated */ +static +ibool +fts_is_index_updated( +/*=================*/ + const ib_vector_t* fts_indexes, /*!< in: affected FTS indexes */ + const fts_get_doc_t* get_doc) /*!< in: info for reading + document */ +{ + ulint i; + dict_index_t* index = get_doc->index_cache->index; + + for (i = 0; i < ib_vector_size(fts_indexes); ++i) { + const dict_index_t* updated_fts_index; + + updated_fts_index = static_cast<const dict_index_t*>( + ib_vector_getp_const(fts_indexes, i)); + + ut_a(updated_fts_index != NULL); + + if (updated_fts_index == index) { + return(TRUE); + } + } + + return(FALSE); +} +#endif + +/*********************************************************************//** +Fetch COUNT(*) from specified table. +@return the number of rows in the table */ +UNIV_INTERN +ulint +fts_get_rows_count( +/*===============*/ + fts_table_t* fts_table) /*!< in: fts table to read */ +{ + trx_t* trx; + pars_info_t* info; + que_t* graph; + dberr_t error; + ulint count = 0; + + trx = trx_allocate_for_background(); + + trx->op_info = "fetching FT table rows count"; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_read_ulint, &count); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT COUNT(*) " + " FROM \"%s\";\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + + break; /* Exit the loop. */ + } else { + fts_sql_rollback(trx); + + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: lock wait " + "timeout reading FTS table. " + "Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: (%s) " + "while reading FTS table.\n", + ut_strerr(error)); + + break; /* Exit the loop. */ + } + } + } + + fts_que_graph_free(graph); + + trx_free_for_background(trx); + + return(count); +} + +#ifdef FTS_CACHE_SIZE_DEBUG +/*********************************************************************//** +Read the max cache size parameter from the config table. */ +static +void +fts_update_max_cache_size( +/*======================*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + trx_t* trx; + fts_table_t fts_table; + + trx = trx_allocate_for_background(); + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table); + + /* The size returned is in bytes. */ + sync->max_cache_size = fts_get_max_cache_size(trx, &fts_table); + + fts_sql_commit(trx); + + trx_free_for_background(trx); +} +#endif /* FTS_CACHE_SIZE_DEBUG */ + +/*********************************************************************//** +Free the modified rows of a table. 
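
fts_trx_table_rows_free() below, like fts_sync_write_words() above, drains its red-black tree with an "always take the first node, then remove it" loop, which guarantees every node is freed exactly once even when processing has stopped caring about the results. The pattern in stand-alone form, with std::map in place of ib_rbt_t:

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        std::map<int, std::string> rows = { {1, "a"}, {2, "b"}, {3, "c"} };

        // "for (node = rbt_first(..); node; node = rbt_first(..))" pattern:
        while (!rows.empty()) {
            std::map<int, std::string>::iterator node = rows.begin();
            std::printf("freeing row %d\n", node->first);
            rows.erase(node);       // ut_free(rbt_remove_node(...))
        }
        return rows.empty() ? 0 : 1;
    }
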
*/ +UNIV_INLINE +void +fts_trx_table_rows_free( +/*====================*/ + ib_rbt_t* rows) /*!< in: rbt of rows to free */ +{ + const ib_rbt_node_t* node; + + for (node = rbt_first(rows); node; node = rbt_first(rows)) { + fts_trx_row_t* row; + + row = rbt_value(fts_trx_row_t, node); + + if (row->fts_indexes != NULL) { + /* This vector shouldn't be using the + heap allocator. */ + ut_a(row->fts_indexes->allocator->arg == NULL); + + ib_vector_free(row->fts_indexes); + row->fts_indexes = NULL; + } + + ut_free(rbt_remove_node(rows, node)); + } + + ut_a(rbt_empty(rows)); + rbt_free(rows); +} + +/*********************************************************************//** +Free an FTS savepoint instance. */ +UNIV_INLINE +void +fts_savepoint_free( +/*===============*/ + fts_savepoint_t* savepoint) /*!< in: savepoint instance */ +{ + const ib_rbt_node_t* node; + ib_rbt_t* tables = savepoint->tables; + + /* Nothing to free! */ + if (tables == NULL) { + return; + } + + for (node = rbt_first(tables); node; node = rbt_first(tables)) { + fts_trx_table_t* ftt; + fts_trx_table_t** fttp; + + fttp = rbt_value(fts_trx_table_t*, node); + ftt = *fttp; + + /* This can be NULL if a savepoint was released. */ + if (ftt->rows != NULL) { + fts_trx_table_rows_free(ftt->rows); + ftt->rows = NULL; + } + + /* This can be NULL if a savepoint was released. */ + if (ftt->added_doc_ids != NULL) { + fts_doc_ids_free(ftt->added_doc_ids); + ftt->added_doc_ids = NULL; + } + + /* The default savepoint name must be NULL. */ + if (ftt->docs_added_graph) { + fts_que_graph_free(ftt->docs_added_graph); + } + + /* NOTE: We are responsible for free'ing the node */ + ut_free(rbt_remove_node(tables, node)); + } + + ut_a(rbt_empty(tables)); + rbt_free(tables); + savepoint->tables = NULL; +} + +/*********************************************************************//** +Free an FTS trx. */ +UNIV_INTERN +void +fts_trx_free( +/*=========*/ + fts_trx_t* fts_trx) /* in, own: FTS trx */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(fts_trx->savepoints); ++i) { + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_get(fts_trx->savepoints, i)); + + /* The default savepoint name must be NULL. */ + if (i == 0) { + ut_a(savepoint->name == NULL); + } + + fts_savepoint_free(savepoint); + } + + for (i = 0; i < ib_vector_size(fts_trx->last_stmt); ++i) { + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_get(fts_trx->last_stmt, i)); + + /* The default savepoint name must be NULL. */ + if (i == 0) { + ut_a(savepoint->name == NULL); + } + + fts_savepoint_free(savepoint); + } + + if (fts_trx->heap) { + mem_heap_free(fts_trx->heap); + } +} + +/*********************************************************************//** +Extract the doc id from the FTS hidden column. +@return doc id that was extracted from rec */ +UNIV_INTERN +doc_id_t +fts_get_doc_id_from_row( +/*====================*/ + dict_table_t* table, /*!< in: table */ + dtuple_t* row) /*!< in: row whose FTS doc id we + want to extract.*/ +{ + dfield_t* field; + doc_id_t doc_id = 0; + + ut_a(table->fts->doc_col != ULINT_UNDEFINED); + + field = dtuple_get_nth_field(row, table->fts->doc_col); + + ut_a(dfield_get_len(field) == sizeof(doc_id)); + ut_a(dfield_get_type(field)->mtype == DATA_INT); + + doc_id = fts_read_doc_id( + static_cast<const byte*>(dfield_get_data(field))); + + return(doc_id); +} + +/*********************************************************************//** +Extract the doc id from the FTS hidden column. 
+@return doc id that was extracted from rec */ +UNIV_INTERN +doc_id_t +fts_get_doc_id_from_rec( +/*====================*/ + dict_table_t* table, /*!< in: table */ + const rec_t* rec, /*!< in: rec */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint len; + const byte* data; + ulint col_no; + doc_id_t doc_id = 0; + dict_index_t* clust_index; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* my_heap = heap; + + ut_a(table->fts->doc_col != ULINT_UNDEFINED); + + clust_index = dict_table_get_first_index(table); + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, clust_index, offsets, ULINT_UNDEFINED, &my_heap); + + col_no = dict_col_get_clust_pos( + &table->cols[table->fts->doc_col], clust_index); + ut_ad(col_no != ULINT_UNDEFINED); + + data = rec_get_nth_field(rec, offsets, col_no, &len); + + ut_a(len == 8); + ut_ad(8 == sizeof(doc_id)); + doc_id = static_cast<doc_id_t>(mach_read_from_8(data)); + + if (my_heap && !heap) { + mem_heap_free(my_heap); + } + + return(doc_id); +} + +/*********************************************************************//** +Search the index specific cache for a particular FTS index. +@return the index specific cache else NULL */ +UNIV_INTERN +fts_index_cache_t* +fts_find_index_cache( +/*=================*/ + const fts_cache_t* cache, /*!< in: cache to search */ + const dict_index_t* index) /*!< in: index to search for */ +{ + /* We cast away the const because our internal function, takes + non-const cache arg and returns a non-const pointer. */ + return(static_cast<fts_index_cache_t*>( + fts_get_index_cache((fts_cache_t*) cache, index))); +} + +/*********************************************************************//** +Search cache for word. +@return the word node vector if found else NULL */ +UNIV_INTERN +const ib_vector_t* +fts_cache_find_word( +/*================*/ + const fts_index_cache_t*index_cache, /*!< in: cache to search */ + const fts_string_t* text) /*!< in: word to search for */ +{ + ib_rbt_bound_t parent; + const ib_vector_t* nodes = NULL; +#ifdef UNIV_SYNC_DEBUG + dict_table_t* table = index_cache->index->table; + fts_cache_t* cache = table->fts->cache; + + ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX)); +#endif + + /* Lookup the word in the rb tree */ + if (rbt_search(index_cache->words, &parent, text) == 0) { + const fts_tokenizer_word_t* word; + + word = rbt_value(fts_tokenizer_word_t, parent.last); + + nodes = word->nodes; + } + + return(nodes); +} + +/*********************************************************************//** +Check cache for deleted doc id. +@return TRUE if deleted */ +UNIV_INTERN +ibool +fts_cache_is_deleted_doc_id( +/*========================*/ + const fts_cache_t* cache, /*!< in: cache ito search */ + doc_id_t doc_id) /*!< in: doc id to search for */ +{ + ulint i; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&cache->deleted_lock)); +#endif + + for (i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) { + const fts_update_t* update; + + update = static_cast<const fts_update_t*>( + ib_vector_get_const(cache->deleted_doc_ids, i)); + + if (doc_id == update->doc_id) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*********************************************************************//** +Append deleted doc ids to vector. 
*/ +UNIV_INTERN +void +fts_cache_append_deleted_doc_ids( +/*=============================*/ + const fts_cache_t* cache, /*!< in: cache to use */ + ib_vector_t* vector) /*!< in: append to this vector */ +{ + ulint i; + + mutex_enter((ib_mutex_t*) &cache->deleted_lock); + + if (cache->deleted_doc_ids == NULL) { + mutex_exit((ib_mutex_t*) &cache->deleted_lock); + return; + } + + + for (i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) { + fts_update_t* update; + + update = static_cast<fts_update_t*>( + ib_vector_get(cache->deleted_doc_ids, i)); + + ib_vector_push(vector, &update->doc_id); + } + + mutex_exit((ib_mutex_t*) &cache->deleted_lock); +} + +/*********************************************************************//** +Wait for the background thread to start. We poll to detect change +of state, which is acceptable, since the wait should happen only +once during startup. +@return true if the thread started else FALSE (i.e timed out) */ +UNIV_INTERN +ibool +fts_wait_for_background_thread_to_start( +/*====================================*/ + dict_table_t* table, /*!< in: table to which the thread + is attached */ + ulint max_wait) /*!< in: time in microseconds, if + set to 0 then it disables + timeout checking */ +{ + ulint count = 0; + ibool done = FALSE; + + ut_a(max_wait == 0 || max_wait >= FTS_MAX_BACKGROUND_THREAD_WAIT); + + for (;;) { + fts_t* fts = table->fts; + + mutex_enter(&fts->bg_threads_mutex); + + if (fts->fts_status & BG_THREAD_READY) { + + done = TRUE; + } + + mutex_exit(&fts->bg_threads_mutex); + + if (!done) { + os_thread_sleep(FTS_MAX_BACKGROUND_THREAD_WAIT); + + if (max_wait > 0) { + + max_wait -= FTS_MAX_BACKGROUND_THREAD_WAIT; + + /* We ignore the residual value. */ + if (max_wait < FTS_MAX_BACKGROUND_THREAD_WAIT) { + break; + } + } + + ++count; + } else { + break; + } + + if (count >= FTS_BACKGROUND_THREAD_WAIT_COUNT) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error the background thread " + "for the FTS table %s refuses to start\n", + table->name); + + count = 0; + } + } + + return(done); +} + +/*********************************************************************//** +Add the FTS document id hidden column. */ +UNIV_INTERN +void +fts_add_doc_id_column( +/*==================*/ + dict_table_t* table, /*!< in/out: Table with FTS index */ + mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */ +{ + dict_mem_table_add_col( + table, heap, + FTS_DOC_ID_COL_NAME, + DATA_INT, + dtype_form_prtype( + DATA_NOT_NULL | DATA_UNSIGNED + | DATA_BINARY_TYPE | DATA_FTS_DOC_ID, 0), + sizeof(doc_id_t)); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_HAS_DOC_ID); +} + +/*********************************************************************//** +Update the query graph with a new document id. +@return Doc ID used */ +UNIV_INTERN +doc_id_t +fts_update_doc_id( +/*==============*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* ufield, /*!< out: update node */ + doc_id_t* next_doc_id) /*!< in/out: buffer for writing */ +{ + doc_id_t doc_id; + dberr_t error = DB_SUCCESS; + + if (*next_doc_id) { + doc_id = *next_doc_id; + } else { + /* Get the new document id that will be added. 
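
fts_wait_for_background_thread_to_start() above is a plain polling wait: sleep a fixed quantum per round and, when a time budget was given, give up once the remainder drops below one quantum. A stand-alone model of that loop; thread_ready() is a hypothetical probe standing in for the fts_status check under bg_threads_mutex:

    #include <chrono>
    #include <cstdio>
    #include <thread>

    static const int WAIT_QUANTUM_US = 10000;   // one polling round

    static bool thread_ready(int round) { return round >= 3; }

    int main() {
        long max_wait_us = 100000;   // 0 would mean "wait forever"
        bool done = false;

        for (int round = 0; ; ++round) {
            if (thread_ready(round)) { done = true; break; }

            std::this_thread::sleep_for(
                std::chrono::microseconds(WAIT_QUANTUM_US));

            if (max_wait_us > 0) {
                max_wait_us -= WAIT_QUANTUM_US;
                if (max_wait_us < WAIT_QUANTUM_US) break;  // budget spent
            }
        }
        std::printf(done ? "started\n" : "timed out\n");
        return done ? 0 : 1;
    }
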
*/ + error = fts_get_next_doc_id(table, &doc_id); + } + + if (error == DB_SUCCESS) { + dict_index_t* clust_index; + + ufield->exp = NULL; + + ufield->new_val.len = sizeof(doc_id); + + clust_index = dict_table_get_first_index(table); + + ufield->field_no = dict_col_get_clust_pos( + &table->cols[table->fts->doc_col], clust_index); + + /* It is possible we update record that has + not yet be sync-ed from last crash. */ + + /* Convert to storage byte order. */ + ut_a(doc_id != FTS_NULL_DOC_ID); + fts_write_doc_id((byte*) next_doc_id, doc_id); + + ufield->new_val.data = next_doc_id; + } + + return(doc_id); +} + +/*********************************************************************//** +Check if the table has an FTS index. This is the non-inline version +of dict_table_has_fts_index(). +@return TRUE if table has an FTS index */ +UNIV_INTERN +ibool +fts_dict_table_has_fts_index( +/*=========================*/ + dict_table_t* table) /*!< in: table */ +{ + return(dict_table_has_fts_index(table)); +} + +/*********************************************************************//** +Create an instance of fts_t. +@return instance of fts_t */ +UNIV_INTERN +fts_t* +fts_create( +/*=======*/ + dict_table_t* table) /*!< in/out: table with FTS indexes */ +{ + fts_t* fts; + ib_alloc_t* heap_alloc; + mem_heap_t* heap; + + ut_a(!table->fts); + + heap = mem_heap_create(512); + + fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts))); + + memset(fts, 0x0, sizeof(*fts)); + + fts->fts_heap = heap; + + fts->doc_col = ULINT_UNDEFINED; + + mutex_create( + fts_bg_threads_mutex_key, &fts->bg_threads_mutex, + SYNC_FTS_BG_THREADS); + + heap_alloc = ib_heap_allocator_create(heap); + fts->indexes = ib_vector_create(heap_alloc, sizeof(dict_index_t*), 4); + dict_table_get_all_fts_indexes(table, fts->indexes); + + return(fts); +} + +/*********************************************************************//** +Free the FTS resources. */ +UNIV_INTERN +void +fts_free( +/*=====*/ + dict_table_t* table) /*!< in/out: table with FTS indexes */ +{ + fts_t* fts = table->fts; + + mutex_free(&fts->bg_threads_mutex); + + ut_ad(!fts->add_wq); + + if (fts->cache) { + fts_cache_clear(fts->cache); + fts_cache_destroy(fts->cache); + fts->cache = NULL; + } + + mem_heap_free(fts->fts_heap); + + table->fts = NULL; +} + +/*********************************************************************//** +Signal FTS threads to initiate shutdown. */ +UNIV_INTERN +void +fts_start_shutdown( +/*===============*/ + dict_table_t* table, /*!< in: table with FTS indexes */ + fts_t* fts) /*!< in: fts instance that needs + to be informed about shutdown */ +{ + mutex_enter(&fts->bg_threads_mutex); + + fts->fts_status |= BG_THREAD_STOP; + + mutex_exit(&fts->bg_threads_mutex); + +} + +/*********************************************************************//** +Wait for FTS threads to shutdown. */ +UNIV_INTERN +void +fts_shutdown( +/*=========*/ + dict_table_t* table, /*!< in: table with FTS indexes */ + fts_t* fts) /*!< in: fts instance to shutdown */ +{ + mutex_enter(&fts->bg_threads_mutex); + + ut_a(fts->fts_status & BG_THREAD_STOP); + + dict_table_wait_for_bg_threads_to_exit(table, 20000); + + mutex_exit(&fts->bg_threads_mutex); +} + +/*********************************************************************//** +Take a FTS savepoint. 
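
The savepoint code below maintains a stack whose bottom entry is the implied (unnamed) savepoint: taking a savepoint pushes a clone of the current top, and rolling back discards the named entry and everything above it, then re-takes it from the restored top (see fts_savepoint_rollback() further down). A stand-alone model of those semantics:

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    struct savepoint {
        std::string name;                    // "" models the implied NULL name
        std::map<std::string, int> tables;   // stand-in for per-table rb-trees
    };

    static void take(std::vector<savepoint>& st, const std::string& name) {
        savepoint sp = st.back();            // clone the current top
        sp.name = name;
        st.push_back(sp);
    }

    int main() {
        std::vector<savepoint> st(1);        // implied savepoint at the bottom

        st.back().tables["t1"] = 1;          // work before SAVEPOINT a
        take(st, "a");
        st.back().tables["t1"] = 2;          // work after SAVEPOINT a

        // ROLLBACK TO a: drop "a" and everything above it, then re-take it,
        // mirroring fts_savepoint_rollback() + fts_savepoint_take().
        size_t i = 0;
        while (st[i].name != "a") ++i;       // fts_savepoint_lookup()
        st.resize(i);
        take(st, "a");

        std::printf("t1 after rollback: %d\n", st.back().tables["t1"]);  // 1
        return 0;
    }
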
*/ +UNIV_INLINE +void +fts_savepoint_copy( +/*===============*/ + const fts_savepoint_t* src, /*!< in: source savepoint */ + fts_savepoint_t* dst) /*!< out: destination savepoint */ +{ + const ib_rbt_node_t* node; + const ib_rbt_t* tables; + + tables = src->tables; + + for (node = rbt_first(tables); node; node = rbt_next(tables, node)) { + + fts_trx_table_t* ftt_dst; + const fts_trx_table_t** ftt_src; + + ftt_src = rbt_value(const fts_trx_table_t*, node); + + ftt_dst = fts_trx_table_clone(*ftt_src); + + rbt_insert(dst->tables, &ftt_dst, &ftt_dst); + } +} + +/*********************************************************************//** +Take a FTS savepoint. */ +UNIV_INTERN +void +fts_savepoint_take( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + fts_trx_t* fts_trx, /*!< in: fts transaction */ + const char* name) /*!< in: savepoint name */ +{ + mem_heap_t* heap; + fts_savepoint_t* savepoint; + fts_savepoint_t* last_savepoint; + + ut_a(name != NULL); + + heap = fts_trx->heap; + + /* The implied savepoint must exist. */ + ut_a(ib_vector_size(fts_trx->savepoints) > 0); + + last_savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(fts_trx->savepoints)); + savepoint = fts_savepoint_create(fts_trx->savepoints, name, heap); + + if (last_savepoint->tables != NULL) { + fts_savepoint_copy(last_savepoint, savepoint); + } +} + +/*********************************************************************//** +Lookup a savepoint instance by name. +@return ULINT_UNDEFINED if not found */ +UNIV_INLINE +ulint +fts_savepoint_lookup( +/*==================*/ + ib_vector_t* savepoints, /*!< in: savepoints */ + const char* name) /*!< in: savepoint name */ +{ + ulint i; + + ut_a(ib_vector_size(savepoints) > 0); + + for (i = 1; i < ib_vector_size(savepoints); ++i) { + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_get(savepoints, i)); + + if (strcmp(name, savepoint->name) == 0) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/*********************************************************************//** +Release the savepoint data identified by name. All savepoints created +after the named savepoint are also released. +@return DB_SUCCESS or error code */ +UNIV_INTERN +void +fts_savepoint_release( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + const char* name) /*!< in: savepoint name */ +{ + ulint i; + ib_vector_t* savepoints; + ulint top_of_stack = 0; + + ut_a(name != NULL); + + savepoints = trx->fts_trx->savepoints; + + ut_a(ib_vector_size(savepoints) > 0); + + /* Skip the implied savepoint (first element). */ + for (i = 1; i < ib_vector_size(savepoints); ++i) { + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_get(savepoints, i)); + + /* Even though we release the resources that are part + of the savepoint, we don't (always) actually delete the + entry. We simply set the savepoint name to NULL. Therefore + we have to skip deleted/released entries. */ + if (savepoint->name != NULL + && strcmp(name, savepoint->name) == 0) { + break; + + /* Track the previous savepoint instance that will + be at the top of the stack after the release. */ + } else if (savepoint->name != NULL) { + /* We need to delete all entries + greater than this element. */ + top_of_stack = i; + } + } + + /* Only if we found and element to release. 
*/ + if (i < ib_vector_size(savepoints)) { + fts_savepoint_t* last_savepoint; + fts_savepoint_t* top_savepoint; + ib_rbt_t* tables; + + ut_a(top_of_stack < ib_vector_size(savepoints)); + + /* Exchange tables between last savepoint and top savepoint */ + last_savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(trx->fts_trx->savepoints)); + top_savepoint = static_cast<fts_savepoint_t*>( + ib_vector_get(savepoints, top_of_stack)); + tables = top_savepoint->tables; + top_savepoint->tables = last_savepoint->tables; + last_savepoint->tables = tables; + + /* Skip the implied savepoint. */ + for (i = ib_vector_size(savepoints) - 1; + i > top_of_stack; + --i) { + + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_get(savepoints, i)); + + /* Skip savepoints that were released earlier. */ + if (savepoint->name != NULL) { + savepoint->name = NULL; + fts_savepoint_free(savepoint); + } + + ib_vector_pop(savepoints); + } + + /* Make sure we don't delete the implied savepoint. */ + ut_a(ib_vector_size(savepoints) > 0); + + /* This must hold. */ + ut_a(ib_vector_size(savepoints) == (top_of_stack + 1)); + } +} + +/**********************************************************************//** +Refresh last statement savepoint. */ +UNIV_INTERN +void +fts_savepoint_laststmt_refresh( +/*===========================*/ + trx_t* trx) /*!< in: transaction */ +{ + + fts_trx_t* fts_trx; + fts_savepoint_t* savepoint; + + fts_trx = trx->fts_trx; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_pop(fts_trx->last_stmt)); + fts_savepoint_free(savepoint); + + ut_ad(ib_vector_is_empty(fts_trx->last_stmt)); + savepoint = fts_savepoint_create(fts_trx->last_stmt, NULL, NULL); +} + +/******************************************************************** +Undo the Doc ID add/delete operations in last stmt */ +static +void +fts_undo_last_stmt( +/*===============*/ + fts_trx_table_t* s_ftt, /*!< in: Transaction FTS table */ + fts_trx_table_t* l_ftt) /*!< in: last stmt FTS table */ +{ + ib_rbt_t* s_rows; + ib_rbt_t* l_rows; + const ib_rbt_node_t* node; + + l_rows = l_ftt->rows; + s_rows = s_ftt->rows; + + for (node = rbt_first(l_rows); + node; + node = rbt_next(l_rows, node)) { + fts_trx_row_t* l_row = rbt_value(fts_trx_row_t, node); + ib_rbt_bound_t parent; + + rbt_search(s_rows, &parent, &(l_row->doc_id)); + + if (parent.result == 0) { + fts_trx_row_t* s_row = rbt_value( + fts_trx_row_t, parent.last); + + switch (l_row->state) { + case FTS_INSERT: + ut_free(rbt_remove_node(s_rows, parent.last)); + break; + + case FTS_DELETE: + if (s_row->state == FTS_NOTHING) { + s_row->state = FTS_INSERT; + } else if (s_row->state == FTS_DELETE) { + ut_free(rbt_remove_node( + s_rows, parent.last)); + } + break; + + /* FIXME: Check if FTS_MODIFY need to be addressed */ + case FTS_MODIFY: + case FTS_NOTHING: + break; + default: + ut_error; + } + } + } +} + +/**********************************************************************//** +Rollback to savepoint indentified by name. 
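
fts_undo_last_stmt() above reverses only the last statement's Doc ID bookkeeping: a row the statement inserted is forgotten, and a row it deleted is either resurrected or, if it was already marked deleted at the savepoint, dropped. A stand-alone model of that state machine (the doc ids are made up):

    #include <cstdio>
    #include <map>

    enum row_state { FTS_NOTHING, FTS_INSERT, FTS_DELETE, FTS_MODIFY };

    int main() {
        std::map<unsigned long long, row_state> savepoint_rows = {
            {10, FTS_INSERT},   // inserted by the last statement
            {11, FTS_DELETE},   // deleted by the last statement
        };
        std::map<unsigned long long, row_state> last_stmt_rows = savepoint_rows;

        for (const auto& l : last_stmt_rows) {
            auto s = savepoint_rows.find(l.first);
            if (s == savepoint_rows.end()) continue;

            switch (l.second) {
            case FTS_INSERT:                 // forget the insert entirely
                savepoint_rows.erase(s);
                break;
            case FTS_DELETE:                 // undo the delete
                if (s->second == FTS_DELETE) savepoint_rows.erase(s);
                else if (s->second == FTS_NOTHING) s->second = FTS_INSERT;
                break;
            default:
                break;                       // FTS_MODIFY / FTS_NOTHING: no-op
            }
        }
        std::printf("rows left: %zu\n", savepoint_rows.size());  // 0
        return 0;
    }
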
+@return DB_SUCCESS or error code */ +UNIV_INTERN +void +fts_savepoint_rollback_last_stmt( +/*=============================*/ + trx_t* trx) /*!< in: transaction */ +{ + ib_vector_t* savepoints; + fts_savepoint_t* savepoint; + fts_savepoint_t* last_stmt; + fts_trx_t* fts_trx; + ib_rbt_bound_t parent; + const ib_rbt_node_t* node; + ib_rbt_t* l_tables; + ib_rbt_t* s_tables; + + fts_trx = trx->fts_trx; + savepoints = fts_trx->savepoints; + + savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints)); + last_stmt = static_cast<fts_savepoint_t*>( + ib_vector_last(fts_trx->last_stmt)); + + l_tables = last_stmt->tables; + s_tables = savepoint->tables; + + for (node = rbt_first(l_tables); + node; + node = rbt_next(l_tables, node)) { + + fts_trx_table_t** l_ftt; + + l_ftt = rbt_value(fts_trx_table_t*, node); + + rbt_search_cmp( + s_tables, &parent, &(*l_ftt)->table->id, + fts_trx_table_id_cmp, NULL); + + if (parent.result == 0) { + fts_trx_table_t** s_ftt; + + s_ftt = rbt_value(fts_trx_table_t*, parent.last); + + fts_undo_last_stmt(*s_ftt, *l_ftt); + } + } +} + +/**********************************************************************//** +Rollback to savepoint indentified by name. +@return DB_SUCCESS or error code */ +UNIV_INTERN +void +fts_savepoint_rollback( +/*===================*/ + trx_t* trx, /*!< in: transaction */ + const char* name) /*!< in: savepoint name */ +{ + ulint i; + ib_vector_t* savepoints; + + ut_a(name != NULL); + + savepoints = trx->fts_trx->savepoints; + + /* We pop all savepoints from the the top of the stack up to + and including the instance that was found. */ + i = fts_savepoint_lookup(savepoints, name); + + if (i != ULINT_UNDEFINED) { + fts_savepoint_t* savepoint; + + ut_a(i > 0); + + while (ib_vector_size(savepoints) > i) { + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_pop(savepoints)); + + if (savepoint->name != NULL) { + /* Since name was allocated on the heap, the + memory will be released when the transaction + completes. */ + savepoint->name = NULL; + + fts_savepoint_free(savepoint); + } + } + + /* Pop all a elements from the top of the stack that may + have been released. We have to be careful that we don't + delete the implied savepoint. */ + + for (savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(savepoints)); + ib_vector_size(savepoints) > 1 + && savepoint->name == NULL; + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(savepoints))) { + + ib_vector_pop(savepoints); + } + + /* Make sure we don't delete the implied savepoint. */ + ut_a(ib_vector_size(savepoints) > 0); + + /* Restore the savepoint. */ + fts_savepoint_take(trx, trx->fts_trx, name); + } +} + +/**********************************************************************//** +Check if a table is an FTS auxiliary table name. 
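
fts_is_aux_table_name() below recognizes names of the shape "db/FTS_<table_id>_<suffix>" for common tables and "db/FTS_<table_id>_<index_id>_<suffix>" for per-index tables. A stand-alone sketch of the prefix-and-id test; it assumes hex-formatted ids and skips the suffix comparison, both of which fts_read_object_id() and the suffix arrays handle in the real code:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    static bool is_fts_aux_name(const char* name) {
        const char* p = std::strchr(name, '/');
        p = p ? p + 1 : name;                  // match after the "db/" part

        if (std::strncmp(p, "FTS_", 4) != 0) return false;
        p += 4;

        char* end;
        std::strtoull(p, &end, 16);            // table id (hex assumed here)
        if (end == p || *end != '_') return false;   // must be "<id>_"

        return true;   // a real check now walks the suffix lists
    }

    int main() {
        std::printf("%d\n", is_fts_aux_name("test/FTS_00000000000001c9_CONFIG"));
        std::printf("%d\n", is_fts_aux_name("test/t1"));
        return 0;      // prints 1 then 0
    }
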
+@return TRUE if the name matches an auxiliary table name pattern */ +static +ibool +fts_is_aux_table_name( +/*==================*/ + fts_aux_table_t*table, /*!< out: table info */ + const char* name, /*!< in: table name */ + ulint len) /*!< in: length of table name */ +{ + const char* ptr; + char* end; + char my_name[MAX_FULL_NAME_LEN + 1]; + + ut_ad(len <= MAX_FULL_NAME_LEN); + ut_memcpy(my_name, name, len); + my_name[len] = 0; + end = my_name + len; + + ptr = static_cast<const char*>(memchr(my_name, '/', len)); + + if (ptr != NULL) { + /* We will start the match after the '/' */ + ++ptr; + len = end - ptr; + } + + /* All auxiliary tables are prefixed with "FTS_" and the name + length will be at the very least greater than 20 bytes. */ + if (ptr != NULL && len > 20 && strncmp(ptr, "FTS_", 4) == 0) { + ulint i; + + /* Skip the prefix. */ + ptr += 4; + len -= 4; + + /* Try and read the table id. */ + if (!fts_read_object_id(&table->parent_id, ptr)) { + return(FALSE); + } + + /* Skip the table id. */ + ptr = static_cast<const char*>(memchr(ptr, '_', len)); + + if (ptr == NULL) { + return(FALSE); + } + + /* Skip the underscore. */ + ++ptr; + ut_a(end > ptr); + len = end - ptr; + + /* First search the common table suffix array. */ + for (i = 0; fts_common_tables[i] != NULL; ++i) { + + if (strncmp(ptr, fts_common_tables[i], len) == 0) { + return(TRUE); + } + } + + /* Could be obsolete common tables. */ + if (strncmp(ptr, "ADDED", len) == 0 + || strncmp(ptr, "STOPWORDS", len) == 0) { + return(true); + } + + /* Try and read the index id. */ + if (!fts_read_object_id(&table->index_id, ptr)) { + return(FALSE); + } + + /* Skip the table id. */ + ptr = static_cast<const char*>(memchr(ptr, '_', len)); + + if (ptr == NULL) { + return(FALSE); + } + + /* Skip the underscore. */ + ++ptr; + ut_a(end > ptr); + len = end - ptr; + + /* Search the FT index specific array. */ + for (i = 0; fts_index_selector[i].value; ++i) { + + if (strncmp(ptr, fts_get_suffix(i), len) == 0) { + return(TRUE); + } + } + + /* Other FT index specific table(s). */ + if (strncmp(ptr, "DOC_ID", len) == 0) { + return(TRUE); + } + } + + return(FALSE); +} + +/**********************************************************************//** +Callback function to read a single table ID column. +@return Always return TRUE */ +static +ibool +fts_read_tables( +/*============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + int i; + fts_aux_table_t*table; + mem_heap_t* heap; + ibool done = FALSE; + ib_vector_t* tables = static_cast<ib_vector_t*>(user_arg); + sel_node_t* sel_node = static_cast<sel_node_t*>(row); + que_node_t* exp = sel_node->select_list; + + /* Must be a heap allocated vector. */ + ut_a(tables->allocator->arg != NULL); + + /* We will use this heap for allocating strings. */ + heap = static_cast<mem_heap_t*>(tables->allocator->arg); + table = static_cast<fts_aux_table_t*>(ib_vector_push(tables, NULL)); + + memset(table, 0x0, sizeof(*table)); + + /* Iterate over the columns and read the values. 
*/ + for (i = 0; exp && !done; exp = que_node_get_next(exp), ++i) { + + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(len != UNIV_SQL_NULL); + + /* Note: The column numbers below must match the SELECT */ + switch (i) { + case 0: /* NAME */ + + if (!fts_is_aux_table_name( + table, static_cast<const char*>(data), len)) { + ib_vector_pop(tables); + done = TRUE; + break; + } + + table->name = static_cast<char*>( + mem_heap_alloc(heap, len + 1)); + memcpy(table->name, data, len); + table->name[len] = 0; + break; + + case 1: /* ID */ + ut_a(len == 8); + table->id = mach_read_from_8( + static_cast<const byte*>(data)); + break; + + default: + ut_error; + } + } + + return(TRUE); +} + +/******************************************************************//** +Callback that sets a hex formatted FTS table's flags2 in +SYS_TABLES. The flags is stored in MIX_LEN column. +@return FALSE if all OK */ +static +ibool +fts_set_hex_format( +/*===============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: bool set/unset flag */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + dfield_t* dfield = que_node_get_val(node->select_list); + + ut_ad(dtype_get_mtype(dfield_get_type(dfield)) == DATA_INT); + ut_ad(dfield_get_len(dfield) == sizeof(ib_uint32_t)); + /* There should be at most one matching record. So the value + must be the default value. */ + ut_ad(mach_read_from_4(static_cast<byte*>(user_arg)) + == ULINT32_UNDEFINED); + + ulint flags2 = mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + + flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + + mach_write_to_4(static_cast<byte*>(user_arg), flags2); + + return(FALSE); +} + +/*****************************************************************//** +Update the DICT_TF2_FTS_AUX_HEX_NAME flag in SYS_TABLES. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +fts_update_hex_format_flag( +/*=======================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + table_id_t table_id, /*!< in: Table for which we want + to set the root table->flags2 */ + bool dict_locked) /*!< in: set to true if the + caller already owns the + dict_sys_t::mutex. */ +{ + pars_info_t* info; + ib_uint32_t flags2; + + static const char sql[] = + "PROCEDURE UPDATE_HEX_FORMAT_FLAG() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS\n" + " SELECT MIX_LEN " + " FROM SYS_TABLES " + " WHERE ID = :table_id FOR UPDATE;" + "\n" + "BEGIN\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_TABLES" + " SET MIX_LEN = :flags2" + " WHERE ID = :table_id;\n" + "CLOSE c;\n" + "END;\n"; + + flags2 = ULINT32_UNDEFINED; + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "table_id", table_id); + pars_info_bind_int4_literal(info, "flags2", &flags2); + + pars_info_bind_function( + info, "my_func", fts_set_hex_format, &flags2); + + if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) { + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + } + + dberr_t err = que_eval_sql(info, sql, !dict_locked, trx); + + ut_a(flags2 != ULINT32_UNDEFINED); + + return (err); +} + +/*********************************************************************//** +Rename an aux table to HEX format. It's called when "%016llu" is used +to format an object id in table name, which only happens in Windows. 
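As an illustration (made-up id): table id 291 is rendered as
"FTS_0000000000000291_..." by "%016llu" but as "FTS_0000000000000123_..." by
the hex format UINT64PFx, since 291 == 0x123; this function rebuilds the
hex-format name from the parsed table info.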
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_rename_one_aux_table_to_hex_format( +/*===================================*/ + trx_t* trx, /*!< in: transaction */ + const fts_aux_table_t* aux_table, /*!< in: table info */ + const dict_table_t* parent_table) /*!< in: parent table name */ +{ + const char* ptr; + fts_table_t fts_table; + char* new_name; + dberr_t error; + + ptr = strchr(aux_table->name, '/'); + ut_a(ptr != NULL); + ++ptr; + /* Skip "FTS_", table id and underscore */ + for (ulint i = 0; i < 2; ++i) { + ptr = strchr(ptr, '_'); + ut_a(ptr != NULL); + ++ptr; + } + + fts_table.suffix = NULL; + if (aux_table->index_id == 0) { + fts_table.type = FTS_COMMON_TABLE; + + for (ulint i = 0; fts_common_tables[i] != NULL; ++i) { + if (strcmp(ptr, fts_common_tables[i]) == 0) { + fts_table.suffix = fts_common_tables[i]; + break; + } + } + } else { + fts_table.type = FTS_INDEX_TABLE; + + /* Skip index id and underscore */ + ptr = strchr(ptr, '_'); + ut_a(ptr != NULL); + ++ptr; + + for (ulint i = 0; fts_index_selector[i].value; ++i) { + if (strcmp(ptr, fts_get_suffix(i)) == 0) { + fts_table.suffix = fts_get_suffix(i); + break; + } + } + } + + ut_a(fts_table.suffix != NULL); + + fts_table.parent = parent_table->name; + fts_table.table_id = aux_table->parent_id; + fts_table.index_id = aux_table->index_id; + fts_table.table = parent_table; + + new_name = fts_get_table_name(&fts_table); + ut_ad(strcmp(new_name, aux_table->name) != 0); + + if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) { + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + } + + error = row_rename_table_for_mysql(aux_table->name, new_name, trx, + FALSE); + + if (error != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to rename aux table \'%s\' to " + "new format \'%s\'. ", + aux_table->name, new_name); + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "Renamed aux table \'%s\' to \'%s\'.", + aux_table->name, new_name); + } + + mem_free(new_name); + + return (error); +} + +/**********************************************************************//** +Rename all aux tables of a parent table to HEX format. Also set aux tables' +flags2 and parent table's flags2 with DICT_TF2_FTS_AUX_HEX_NAME. +It's called when "%016llu" is used to format an object id in table name, +which only happens in Windows. +Note the ids in tables are correct but the names are old ambiguous ones. + +This function should make sure that either all the parent table and aux tables +are set DICT_TF2_FTS_AUX_HEX_NAME with flags2 or none of them are set */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_rename_aux_tables_to_hex_format_low( +/*====================================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* parent_table, /*!< in: parent table */ + ib_vector_t* tables) /*!< in: aux tables to rename. 
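				All entries are expected to belong to the
				same parent table; the caller batches them
				per parent before calling this function.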
 */
+{
+	dberr_t	error;
+	ulint	count;
+
+	ut_ad(!DICT_TF2_FLAG_IS_SET(parent_table, DICT_TF2_FTS_AUX_HEX_NAME));
+	ut_ad(!ib_vector_is_empty(tables));
+
+	error = fts_update_hex_format_flag(trx, parent_table->id, true);
+
+	if (error != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Setting parent table %s to hex format failed.",
+			parent_table->name);
+
+		fts_sql_rollback(trx);
+		return (error);
+	}
+
+	DICT_TF2_FLAG_SET(parent_table, DICT_TF2_FTS_AUX_HEX_NAME);
+
+	for (count = 0; count < ib_vector_size(tables); ++count) {
+		dict_table_t*		table;
+		fts_aux_table_t*	aux_table;
+
+		aux_table = static_cast<fts_aux_table_t*>(
+			ib_vector_get(tables, count));
+
+		table = dict_table_open_on_id(aux_table->id, TRUE,
+					      DICT_TABLE_OP_NORMAL);
+
+		ut_ad(table != NULL);
+		ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_AUX_HEX_NAME));
+
+		/* Set the HEX_NAME flag here to make sure we can get the
+		correct new table name in the following function */
+		DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME);
+		error = fts_rename_one_aux_table_to_hex_format(trx,
+				aux_table, parent_table);
+		/* We will roll back the trx if error != DB_SUCCESS,
+		so setting the flag here is the same as setting it in
+		row_rename_table_for_mysql */
+		DBUG_EXECUTE_IF("rename_aux_table_fail", error = DB_ERROR;);
+
+		if (error != DB_SUCCESS) {
+			dict_table_close(table, TRUE, FALSE);
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Failed to rename one aux table %s. "
+				"Will revert all successful rename "
+				"operations.", aux_table->name);
+
+			fts_sql_rollback(trx);
+			break;
+		}
+
+		error = fts_update_hex_format_flag(trx, aux_table->id, true);
+		dict_table_close(table, TRUE, FALSE);
+
+		if (error != DB_SUCCESS) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Setting aux table %s to hex format failed.",
+				aux_table->name);
+
+			fts_sql_rollback(trx);
+			break;
+		}
+	}
+
+	if (error != DB_SUCCESS) {
+		ut_ad(count != ib_vector_size(tables));
+		/* If a rename fails, the trx will have been rolled back,
+		so we can't use it any more; we'll start a new background
+		trx to do the reverting. */
+		ut_a(trx->state == TRX_STATE_NOT_STARTED);
+		bool not_rename = false;
+
+		/* Try to revert those successful rename operations
+		in order to revert the ibd file rename. */
+		for (ulint i = 0; i <= count; ++i) {
+			dict_table_t*		table;
+			fts_aux_table_t*	aux_table;
+			trx_t*			trx_bg;
+			dberr_t			err;
+
+			aux_table = static_cast<fts_aux_table_t*>(
+				ib_vector_get(tables, i));
+
+			table = dict_table_open_on_id(aux_table->id, TRUE,
+						      DICT_TABLE_OP_NORMAL);
+			ut_ad(table != NULL);
+
+			if (not_rename) {
+				DICT_TF2_FLAG_UNSET(table,
+						    DICT_TF2_FTS_AUX_HEX_NAME);
+			}
+
+			if (!DICT_TF2_FLAG_IS_SET(table,
+						  DICT_TF2_FTS_AUX_HEX_NAME)) {
+				dict_table_close(table, TRUE, FALSE);
+				continue;
+			}
+
+			trx_bg = trx_allocate_for_background();
+			trx_bg->op_info = "Revert half done rename";
+			trx_bg->dict_operation_lock_mode = RW_X_LATCH;
+			trx_start_for_ddl(trx_bg, TRX_DICT_OP_TABLE);
+
+			DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS_AUX_HEX_NAME);
+			err = row_rename_table_for_mysql(table->name,
+							 aux_table->name,
+							 trx_bg, FALSE);
+
+			trx_bg->dict_operation_lock_mode = 0;
+			dict_table_close(table, TRUE, FALSE);
+
+			if (err != DB_SUCCESS) {
+				ib_logf(IB_LOG_LEVEL_WARN, "Failed to revert "
+					"table %s. Please revert manually.",
+					table->name);
+				fts_sql_rollback(trx_bg);
+				trx_free_for_background(trx_bg);
+				/* Continue to clear aux tables' flags2 */
+				not_rename = true;
+				continue;
+			}
+
+			fts_sql_commit(trx_bg);
+			trx_free_for_background(trx_bg);
+		}
+
+		DICT_TF2_FLAG_UNSET(parent_table, DICT_TF2_FTS_AUX_HEX_NAME);
+	}
+
+	return (error);
+}
+
+/**********************************************************************//**
+Convert an id that is actually a decimal number, but was parsed from a
+string as if it were hex, back to its real value. */
+static
+ib_id_t
+fts_fake_hex_to_dec(
+/*================*/
+	ib_id_t	id)			/*!< in: number to convert */
+{
+	ib_id_t	dec_id = 0;
+	char	tmp_id[FTS_AUX_MIN_TABLE_ID_LENGTH];
+	int	ret;
+
+	ret = sprintf(tmp_id, UINT64PFx, id);
+	ut_ad(ret == 16);
+#ifdef _WIN32
+	ret = sscanf(tmp_id, "%016llu", &dec_id);
+#else
+	ret = sscanf(tmp_id, "%016"PRIu64, &dec_id);
+#endif /* _WIN32 */
+	ut_ad(ret == 1);
+
+	return dec_id;
+}
+
+/*********************************************************************//**
+Compare two fts_aux_table_t parent_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_check_aux_table_parent_id_cmp(
+/*==============================*/
+	const void*	p1,		/*!< in: first fts_aux_table_t */
+	const void*	p2)		/*!< in: second fts_aux_table_t */
+{
+	const fts_aux_table_t*	fa1 = static_cast<const fts_aux_table_t*>(p1);
+	const fts_aux_table_t*	fa2 = static_cast<const fts_aux_table_t*>(p2);
+
+	return static_cast<int>(fa1->parent_id - fa2->parent_id);
+}
+
+/** Mark all the fts indexes associated with the parent table as corrupted.
+@param[in]	trx		transaction
+@param[in, out]	parent_table	fts indexes associated with this parent table
+				will be marked as corrupted. */
+static
+void
+fts_parent_all_index_set_corrupt(
+	trx_t*		trx,
+	dict_table_t*	parent_table)
+{
+	fts_t*	fts = parent_table->fts;
+
+	if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+	}
+
+	for (ulint j = 0; j < ib_vector_size(fts->indexes); j++) {
+		dict_index_t*	index = static_cast<dict_index_t*>(
+			ib_vector_getp_const(fts->indexes, j));
+		dict_set_corrupted(index,
+				   trx, "DROP ORPHANED TABLE");
+	}
+}
+
+/** Mark the fts index whose index id matches the given id as corrupted.
+@param[in]	trx	transaction
+@param[in]	id	index id to search
+@param[in, out]	table	parent table whose fts indexes are checked. */
+static
+void
+fts_set_index_corrupt(
+	trx_t*		trx,
+	index_id_t	id,
+	dict_table_t*	table)
+{
+	fts_t*	fts = table->fts;
+
+	if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+	}
+
+	for (ulint j = 0; j < ib_vector_size(fts->indexes); j++) {
+		dict_index_t*	index = static_cast<dict_index_t*>(
+			ib_vector_getp_const(fts->indexes, j));
+		if (index->id == id) {
+			dict_set_corrupted(index, trx,
+					   "DROP ORPHANED TABLE");
+			break;
+		}
+	}
+}
+
+/** Check whether the index for the aux table is corrupted.
+@param[in] aux_table auxiliary table +@retval nonzero if index is corrupted, zero for valid index */ +static +ulint +fts_check_corrupt_index( + fts_aux_table_t* aux_table) +{ + dict_table_t* table; + dict_index_t* index; + table = dict_table_open_on_id( + aux_table->parent_id, TRUE, DICT_TABLE_OP_NORMAL); + + if (table == NULL) { + return(0); + } + + for (index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + if (index->id == aux_table->index_id) { + ut_ad(index->type & DICT_FTS); + dict_table_close(table, true, false); + return(dict_index_is_corrupted(index)); + } + } + + dict_table_close(table, true, false); + return(0); +} + +/** Check the validity of the parent table. +@param[in] aux_table auxiliary table +@return true if it is a valid table or false if it is not */ +static +bool +fts_valid_parent_table( + const fts_aux_table_t* aux_table) +{ + dict_table_t* parent_table; + bool valid = false; + + parent_table = dict_table_open_on_id( + aux_table->parent_id, TRUE, DICT_TABLE_OP_NORMAL); + + if (parent_table != NULL && parent_table->fts != NULL) { + if (aux_table->index_id == 0) { + valid = true; + } else { + index_id_t id = aux_table->index_id; + dict_index_t* index; + + /* Search for the FT index in the table's list. */ + for (index = UT_LIST_GET_FIRST(parent_table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + if (index->id == id) { + valid = true; + break; + } + + } + } + } + + if (parent_table) { + dict_table_close(parent_table, TRUE, FALSE); + } + + return(valid); +} + +/** Try to rename all aux tables of the specified parent table. +@param[in] aux_tables aux_tables to be renamed +@param[in] parent_table parent table of all aux + tables stored in tables. */ +static +void +fts_rename_aux_tables_to_hex_format( + ib_vector_t* aux_tables, + dict_table_t* parent_table) +{ + dberr_t err; + trx_t* trx_rename = trx_allocate_for_background(); + trx_rename->op_info = "Rename aux tables to hex format"; + trx_rename->dict_operation_lock_mode = RW_X_LATCH; + trx_start_for_ddl(trx_rename, TRX_DICT_OP_TABLE); + + err = fts_rename_aux_tables_to_hex_format_low(trx_rename, + parent_table, aux_tables); + + trx_rename->dict_operation_lock_mode = 0; + + if (err != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Rollback operations on all aux tables of table %s. " + "All the fts index associated with the table are " + "marked as corrupted. Please rebuild the " + "index again.", parent_table->name); + fts_sql_rollback(trx_rename); + + /* Corrupting the fts index related to parent table. */ + trx_t* trx_corrupt; + trx_corrupt = trx_allocate_for_background(); + trx_corrupt->dict_operation_lock_mode = RW_X_LATCH; + trx_start_for_ddl(trx_corrupt, TRX_DICT_OP_TABLE); + fts_parent_all_index_set_corrupt(trx_corrupt, parent_table); + trx_corrupt->dict_operation_lock_mode = 0; + fts_sql_commit(trx_corrupt); + trx_free_for_background(trx_corrupt); + } else { + fts_sql_commit(trx_rename); + } + + trx_free_for_background(trx_rename); + ib_vector_reset(aux_tables); +} + +/** Set the hex format flag for the parent table. +@param[in, out] parent_table parent table +@param[in] trx transaction */ +static +void +fts_set_parent_hex_format_flag( + dict_table_t* parent_table, + trx_t* trx) +{ + if (!DICT_TF2_FLAG_IS_SET(parent_table, + DICT_TF2_FTS_AUX_HEX_NAME)) { + DBUG_EXECUTE_IF("parent_table_flag_fail", + ib_logf(IB_LOG_LEVEL_FATAL, + "Setting parent table %s to hex format " + "failed. 
Please try to restart the server "
+				"again. If it doesn't work, the system "
+				"tables might be corrupted.",
+				parent_table->name);
+			return;);
+
+		dberr_t err = fts_update_hex_format_flag(
+				trx, parent_table->id, true);
+
+		if (err != DB_SUCCESS) {
+			ib_logf(IB_LOG_LEVEL_FATAL,
+				"Setting parent table %s to hex format "
+				"failed. Please try to restart the server "
+				"again. If it doesn't work, the system "
+				"tables might be corrupted.",
+				parent_table->name);
+		} else {
+			DICT_TF2_FLAG_SET(
+				parent_table, DICT_TF2_FTS_AUX_HEX_NAME);
+		}
+	}
+}
+
+/** Drop the obsolete auxiliary tables.
+@param[in]	tables	tables to be dropped. */
+static
+void
+fts_drop_obsolete_aux_table_from_vector(
+	ib_vector_t*	tables)
+{
+	dberr_t	err;
+
+	for (ulint count = 0; count < ib_vector_size(tables);
+	     ++count) {
+
+		fts_aux_table_t*	aux_drop_table;
+		aux_drop_table = static_cast<fts_aux_table_t*>(
+			ib_vector_get(tables, count));
+		trx_t*	trx_drop = trx_allocate_for_background();
+		trx_drop->op_info = "Drop obsolete aux tables";
+		trx_drop->dict_operation_lock_mode = RW_X_LATCH;
+		trx_start_for_ddl(trx_drop, TRX_DICT_OP_TABLE);
+
+		err = row_drop_table_for_mysql(
+			aux_drop_table->name, trx_drop, false, true);
+
+		trx_drop->dict_operation_lock_mode = 0;
+
+		if (err != DB_SUCCESS) {
+			/* We don't need to worry about this
+			failure, since the server will try to
+			drop the table on the next restart, even if
+			it is broken. */
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Failed to drop obsolete aux table '%s'. "
+				"This is harmless; it will be dropped on "
+				"the next restart.", aux_drop_table->name);
+			fts_sql_rollback(trx_drop);
+		} else {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Dropped obsolete aux table '%s'.",
+				aux_drop_table->name);
+
+			fts_sql_commit(trx_drop);
+		}
+
+		trx_free_for_background(trx_drop);
+	}
+}
+
+/** Drop all the auxiliary tables present in the vector.
+@param[in]	trx	transaction
+@param[in]	tables	tables to be dropped */
+static
+void
+fts_drop_aux_table_from_vector(
+	trx_t*		trx,
+	ib_vector_t*	tables)
+{
+	for (ulint count = 0; count < ib_vector_size(tables);
+	     ++count) {
+		fts_aux_table_t*	aux_drop_table;
+		aux_drop_table = static_cast<fts_aux_table_t*>(
+			ib_vector_get(tables, count));
+
+		/* Check for the validity of the parent table */
+		if (!fts_valid_parent_table(aux_drop_table)) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Parent table of FTS auxiliary table %s not "
+				"found.", aux_drop_table->name);
+			dberr_t err = fts_drop_table(trx, aux_drop_table->name);
+			if (err == DB_FAIL) {
+				char*	path = fil_make_ibd_name(
+					aux_drop_table->name, false);
+				os_file_delete_if_exists(innodb_file_data_key,
+							 path);
+				mem_free(path);
+			}
+		}
+	}
+}
+
+/**********************************************************************//**
+Check and drop all orphaned FTS auxiliary tables, those that don't have
+a parent table or FTS index defined on them. */
+static __attribute__((nonnull))
+void
+fts_check_and_drop_orphaned_tables(
+/*===============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	ib_vector_t*	tables)		/*!< in: tables to check */
+{
+	mem_heap_t*	heap;
+	ib_vector_t*	aux_tables_to_rename;
+	ib_vector_t*	invalid_aux_tables;
+	ib_vector_t*	valid_aux_tables;
+	ib_vector_t*	drop_aux_tables;
+	ib_vector_t*	obsolete_aux_tables;
+	ib_alloc_t*	heap_alloc;
+
+	heap = mem_heap_create(1024);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	/* We store all aux tables belonging to the same parent table here,
+	and rename all these tables in a batch mode.
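	Renaming per parent in one batch lets a single background transaction
	cover all of a parent's aux tables, so a mid-way failure can be rolled
	back or reverted as a group (see
	fts_rename_aux_tables_to_hex_format_low()).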
*/ + aux_tables_to_rename = ib_vector_create(heap_alloc, + sizeof(fts_aux_table_t), 128); + + /* We store all fake auxiliary table and orphaned table here. */ + invalid_aux_tables = ib_vector_create(heap_alloc, + sizeof(fts_aux_table_t), 128); + + /* We store all valid aux tables. We use this to filter the + fake auxiliary table from invalid auxiliary tables. */ + valid_aux_tables = ib_vector_create(heap_alloc, + sizeof(fts_aux_table_t), 128); + + /* We store all auxiliary tables to be dropped. */ + drop_aux_tables = ib_vector_create(heap_alloc, + sizeof(fts_aux_table_t), 128); + + /* We store all obsolete auxiliary tables to be dropped. */ + obsolete_aux_tables = ib_vector_create(heap_alloc, + sizeof(fts_aux_table_t), 128); + + /* Sort by parent_id first, in case rename will fail */ + ib_vector_sort(tables, fts_check_aux_table_parent_id_cmp); + + for (ulint i = 0; i < ib_vector_size(tables); ++i) { + dict_table_t* parent_table; + fts_aux_table_t* aux_table; + bool drop = false; + dict_table_t* table; + fts_aux_table_t* next_aux_table = NULL; + ib_id_t orig_parent_id = 0; + ib_id_t orig_index_id = 0; + bool rename = false; + + aux_table = static_cast<fts_aux_table_t*>( + ib_vector_get(tables, i)); + + table = dict_table_open_on_id( + aux_table->id, TRUE, DICT_TABLE_OP_NORMAL); + orig_parent_id = aux_table->parent_id; + orig_index_id = aux_table->index_id; + + if (table == NULL || strcmp(table->name, aux_table->name)) { + + bool fake_aux = false; + + if (table != NULL) { + dict_table_close(table, TRUE, FALSE); + } + + if (i + 1 < ib_vector_size(tables)) { + next_aux_table = static_cast<fts_aux_table_t*>( + ib_vector_get(tables, i + 1)); + } + + /* To know whether aux table is fake fts or + orphan fts table. */ + for (ulint count = 0; + count < ib_vector_size(valid_aux_tables); + count++) { + fts_aux_table_t* valid_aux; + valid_aux = static_cast<fts_aux_table_t*>( + ib_vector_get(valid_aux_tables, count)); + if (strcmp(valid_aux->name, + aux_table->name) == 0) { + fake_aux = true; + break; + } + } + + /* All aux tables of parent table, whose id is + last_parent_id, have been checked, try to rename + them if necessary. */ + if ((next_aux_table == NULL + || orig_parent_id != next_aux_table->parent_id) + && (!ib_vector_is_empty(aux_tables_to_rename))) { + + ulint parent_id = fts_fake_hex_to_dec( + aux_table->parent_id); + + parent_table = dict_table_open_on_id( + parent_id, TRUE, + DICT_TABLE_OP_NORMAL); + + fts_rename_aux_tables_to_hex_format( + aux_tables_to_rename, parent_table); + + dict_table_close(parent_table, TRUE, + FALSE); + } + + /* If the aux table is fake aux table. Skip it. */ + if (!fake_aux) { + ib_vector_push(invalid_aux_tables, aux_table); + } + + continue; + } else if (!DICT_TF2_FLAG_IS_SET(table, + DICT_TF2_FTS_AUX_HEX_NAME)) { + + aux_table->parent_id = fts_fake_hex_to_dec( + aux_table->parent_id); + + if (aux_table->index_id != 0) { + aux_table->index_id = fts_fake_hex_to_dec( + aux_table->index_id); + } + + ut_ad(aux_table->id > aux_table->parent_id); + + /* Check whether parent table id and index id + are stored as decimal format. 
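			If the ids converted by fts_fake_hex_to_dec() resolve
			to a real parent table, this aux table still carries
			an old decimal-format name and is a rename candidate.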
*/ + if (fts_valid_parent_table(aux_table)) { + + parent_table = dict_table_open_on_id( + aux_table->parent_id, true, + DICT_TABLE_OP_NORMAL); + + ut_ad(parent_table != NULL); + ut_ad(parent_table->fts != NULL); + + if (!DICT_TF2_FLAG_IS_SET( + parent_table, + DICT_TF2_FTS_AUX_HEX_NAME)) { + rename = true; + } + + dict_table_close(parent_table, TRUE, FALSE); + } + + if (!rename) { + /* Reassign the original value of + aux table if it is not in decimal format */ + aux_table->parent_id = orig_parent_id; + aux_table->index_id = orig_index_id; + } + } + + if (table != NULL) { + dict_table_close(table, true, false); + } + + if (!rename) { + /* Check the validity of the parent table. */ + if (!fts_valid_parent_table(aux_table)) { + drop = true; + } + } + + /* Filter out the fake aux table by comparing with the + current valid auxiliary table name . */ + for (ulint count = 0; + count < ib_vector_size(invalid_aux_tables); count++) { + fts_aux_table_t* invalid_aux; + invalid_aux = static_cast<fts_aux_table_t*>( + ib_vector_get(invalid_aux_tables, count)); + if (strcmp(invalid_aux->name, aux_table->name) == 0) { + ib_vector_remove( + invalid_aux_tables, + *reinterpret_cast<void**>(invalid_aux)); + break; + } + } + + ib_vector_push(valid_aux_tables, aux_table); + + /* If the index associated with aux table is corrupted, + skip it. */ + if (fts_check_corrupt_index(aux_table) > 0) { + + if (i + 1 < ib_vector_size(tables)) { + next_aux_table = static_cast<fts_aux_table_t*>( + ib_vector_get(tables, i + 1)); + } + + if (next_aux_table == NULL + || orig_parent_id != next_aux_table->parent_id) { + + parent_table = dict_table_open_on_id( + aux_table->parent_id, TRUE, + DICT_TABLE_OP_NORMAL); + + if (!ib_vector_is_empty(aux_tables_to_rename)) { + fts_rename_aux_tables_to_hex_format( + aux_tables_to_rename, parent_table); + + } else { + fts_set_parent_hex_format_flag( + parent_table, trx); + } + + dict_table_close(parent_table, TRUE, FALSE); + } + + continue; + } + + parent_table = dict_table_open_on_id( + aux_table->parent_id, TRUE, DICT_TABLE_OP_NORMAL); + + if (drop) { + ib_vector_push(drop_aux_tables, aux_table); + } else { + if (FTS_IS_OBSOLETE_AUX_TABLE(aux_table->name)) { + + /* Current table could be one of the three + obsolete tables, in this case, we should + always try to drop it but not rename it. + This could happen when we try to upgrade + from older server to later one, which doesn't + contain these obsolete tables. */ + ib_vector_push(obsolete_aux_tables, aux_table); + continue; + } + } + + /* If the aux table is in decimal format, we should + rename it, so push it to aux_tables_to_rename */ + if (!drop && rename) { + ib_vector_push(aux_tables_to_rename, aux_table); + } + + if (i + 1 < ib_vector_size(tables)) { + next_aux_table = static_cast<fts_aux_table_t*>( + ib_vector_get(tables, i + 1)); + } + + if ((next_aux_table == NULL + || orig_parent_id != next_aux_table->parent_id) + && !ib_vector_is_empty(aux_tables_to_rename)) { + /* All aux tables of parent table, whose id is + last_parent_id, have been checked, try to rename + them if necessary. We had better use a new background + trx to rename rather than the original trx, in case + any failure would cause a complete rollback. */ + ut_ad(rename); + ut_ad(!DICT_TF2_FLAG_IS_SET( + parent_table, DICT_TF2_FTS_AUX_HEX_NAME)); + + fts_rename_aux_tables_to_hex_format( + aux_tables_to_rename,parent_table); + } + + /* The IDs are already in correct hex format. 
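		Only the DICT_TF2_FTS_AUX_HEX_NAME flag may still be missing,
		so set it on the aux table and its parent below.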
*/ + if (!drop && !rename) { + dict_table_t* table; + + table = dict_table_open_on_id( + aux_table->id, TRUE, DICT_TABLE_OP_NORMAL); + if (table != NULL + && strcmp(table->name, aux_table->name)) { + dict_table_close(table, TRUE, FALSE); + table = NULL; + } + + if (table != NULL + && !DICT_TF2_FLAG_IS_SET( + table, + DICT_TF2_FTS_AUX_HEX_NAME)) { + + DBUG_EXECUTE_IF("aux_table_flag_fail", + ib_logf(IB_LOG_LEVEL_WARN, + "Setting aux table %s to hex " + "format failed.", table->name); + fts_set_index_corrupt( + trx, aux_table->index_id, + parent_table); + goto table_exit;); + + dberr_t err = fts_update_hex_format_flag( + trx, table->id, true); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_WARN, + "Setting aux table %s to hex " + "format failed.", table->name); + + fts_set_index_corrupt( + trx, aux_table->index_id, + parent_table); + } else { + DICT_TF2_FLAG_SET(table, + DICT_TF2_FTS_AUX_HEX_NAME); + } + } +#ifndef DBUG_OFF +table_exit: +#endif /* !DBUG_OFF */ + + if (table != NULL) { + dict_table_close(table, TRUE, FALSE); + } + + ut_ad(parent_table != NULL); + + fts_set_parent_hex_format_flag( + parent_table, trx); + } + + if (parent_table != NULL) { + dict_table_close(parent_table, TRUE, FALSE); + } + } + + fts_drop_aux_table_from_vector(trx, invalid_aux_tables); + fts_drop_aux_table_from_vector(trx, drop_aux_tables); + fts_sql_commit(trx); + + fts_drop_obsolete_aux_table_from_vector(obsolete_aux_tables); + + /* Free the memory allocated at the beginning */ + if (heap != NULL) { + mem_heap_free(heap); + } +} + +/**********************************************************************//** +Drop all orphaned FTS auxiliary tables, those that don't have a parent +table or FTS index defined on them. */ +UNIV_INTERN +void +fts_drop_orphaned_tables(void) +/*==========================*/ +{ + trx_t* trx; + pars_info_t* info; + mem_heap_t* heap; + que_t* graph; + ib_vector_t* tables; + ib_alloc_t* heap_alloc; + space_name_list_t space_name_list; + dberr_t error = DB_SUCCESS; + + /* Note: We have to free the memory after we are done with the list. */ + error = fil_get_space_names(space_name_list); + + if (error == DB_OUT_OF_MEMORY) { + ib_logf(IB_LOG_LEVEL_ERROR, "Out of memory"); + ut_error; + } + + heap = mem_heap_create(1024); + heap_alloc = ib_heap_allocator_create(heap); + + /* We store the table ids of all the FTS indexes that were found. */ + tables = ib_vector_create(heap_alloc, sizeof(fts_aux_table_t), 128); + + /* Get the list of all known .ibd files and check for orphaned + FTS auxiliary files in that list. We need to remove them because + users can't map them back to table names and this will create + unnecessary clutter. */ + + for (space_name_list_t::iterator it = space_name_list.begin(); + it != space_name_list.end(); + ++it) { + + fts_aux_table_t* fts_aux_table; + + fts_aux_table = static_cast<fts_aux_table_t*>( + ib_vector_push(tables, NULL)); + + memset(fts_aux_table, 0x0, sizeof(*fts_aux_table)); + + if (!fts_is_aux_table_name(fts_aux_table, *it, strlen(*it))) { + ib_vector_pop(tables); + } else { + ulint len = strlen(*it); + + fts_aux_table->id = fil_get_space_id_for_table(*it); + + /* We got this list from fil0fil.cc. The tablespace + with this name must exist. 
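			fil_get_space_id_for_table() should therefore never
			return ULINT_UNDEFINED here, which the assertion
			below enforces.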
*/ + ut_a(fts_aux_table->id != ULINT_UNDEFINED); + + fts_aux_table->name = static_cast<char*>( + mem_heap_dup(heap, *it, len + 1)); + + fts_aux_table->name[len] = 0; + } + } + + trx = trx_allocate_for_background(); + trx->op_info = "dropping orphaned FTS tables"; + row_mysql_lock_data_dictionary(trx); + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_read_tables, tables); + + graph = fts_parse_sql_no_dict_lock( + NULL, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT NAME, ID " + " FROM SYS_TABLES;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + fts_check_and_drop_orphaned_tables(trx, tables); + break; /* Exit the loop. */ + } else { + ib_vector_reset(tables); + + fts_sql_rollback(trx); + + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib_logf(IB_LOG_LEVEL_WARN, + "lock wait timeout reading SYS_TABLES. " + "Retrying!"); + + trx->error_state = DB_SUCCESS; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "(%s) while reading SYS_TABLES.", + ut_strerr(error)); + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_background(trx); + + if (heap != NULL) { + mem_heap_free(heap); + } + + /** Free the memory allocated to store the .ibd names. */ + for (space_name_list_t::iterator it = space_name_list.begin(); + it != space_name_list.end(); + ++it) { + + delete[] *it; + } +} + +/**********************************************************************//** +Check whether user supplied stopword table is of the right format. +Caller is responsible to hold dictionary locks. +@return the stopword column charset if qualifies */ +UNIV_INTERN +CHARSET_INFO* +fts_valid_stopword_table( +/*=====================*/ + const char* stopword_table_name) /*!< in: Stopword table + name */ +{ + dict_table_t* table; + dict_col_t* col = NULL; + + if (!stopword_table_name) { + return(NULL); + } + + table = dict_table_get_low(stopword_table_name); + + if (!table) { + fprintf(stderr, + "InnoDB: user stopword table %s does not exist.\n", + stopword_table_name); + + return(NULL); + } else { + const char* col_name; + + col_name = dict_table_get_col_name(table, 0); + + if (ut_strcmp(col_name, "value")) { + fprintf(stderr, + "InnoDB: invalid column name for stopword " + "table %s. Its first column must be named as " + "'value'.\n", stopword_table_name); + + return(NULL); + } + + col = dict_table_get_nth_col(table, 0); + + if (col->mtype != DATA_VARCHAR + && col->mtype != DATA_VARMYSQL) { + fprintf(stderr, + "InnoDB: invalid column type for stopword " + "table %s. Its first column must be of " + "varchar type\n", stopword_table_name); + + return(NULL); + } + } + + ut_ad(col); + + return(innobase_get_fts_charset( + static_cast<int>(col->prtype & DATA_MYSQL_TYPE_MASK), + static_cast<uint>(dtype_get_charset_coll(col->prtype)))); +} + +/**********************************************************************//** +This function loads the stopword into the FTS cache. It also +records/fetches stopword configuration to/from FTS configure +table, depending on whether we are creating or reloading the +FTS. 
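+A user stopword table accepted by fts_valid_stopword_table() above can be
+created, for example (illustrative name), as:
+  CREATE TABLE my_stopwords(value VARCHAR(30)) ENGINE=InnoDB;
+its first column must be named "value" and be of a VARCHAR type.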
+@return TRUE if load operation is successful */ +UNIV_INTERN +ibool +fts_load_stopword( +/*==============*/ + const dict_table_t* + table, /*!< in: Table with FTS */ + trx_t* trx, /*!< in: Transactions */ + const char* global_stopword_table, /*!< in: Global stopword table + name */ + const char* session_stopword_table, /*!< in: Session stopword table + name */ + ibool stopword_is_on, /*!< in: Whether stopword + option is turned on/off */ + ibool reload) /*!< in: Whether it is + for reloading FTS table */ +{ + fts_table_t fts_table; + fts_string_t str; + dberr_t error = DB_SUCCESS; + ulint use_stopword; + fts_cache_t* cache; + const char* stopword_to_use = NULL; + ibool new_trx = FALSE; + byte str_buffer[MAX_FULL_NAME_LEN + 1]; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table); + + cache = table->fts->cache; + + if (!reload && !(cache->stopword_info.status + & STOPWORD_NOT_INIT)) { + return(TRUE); + } + + if (!trx) { + trx = trx_allocate_for_background(); + trx->op_info = "upload FTS stopword"; + new_trx = TRUE; + } + + /* First check whether stopword filtering is turned off */ + if (reload) { + error = fts_config_get_ulint( + trx, &fts_table, FTS_USE_STOPWORD, &use_stopword); + } else { + use_stopword = (ulint) stopword_is_on; + + error = fts_config_set_ulint( + trx, &fts_table, FTS_USE_STOPWORD, use_stopword); + } + + if (error != DB_SUCCESS) { + goto cleanup; + } + + /* If stopword is turned off, no need to continue to load the + stopword into cache, but still need to do initialization */ + if (!use_stopword) { + cache->stopword_info.status = STOPWORD_OFF; + goto cleanup; + } + + if (reload) { + /* Fetch the stopword table name from FTS config + table */ + str.f_n_char = 0; + str.f_str = str_buffer; + str.f_len = sizeof(str_buffer) - 1; + + error = fts_config_get_value( + trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str); + + if (error != DB_SUCCESS) { + goto cleanup; + } + + if (strlen((char*) str.f_str) > 0) { + stopword_to_use = (const char*) str.f_str; + } + } else { + stopword_to_use = (session_stopword_table) + ? session_stopword_table : global_stopword_table; + } + + if (stopword_to_use + && fts_load_user_stopword(table->fts, stopword_to_use, + &cache->stopword_info)) { + /* Save the stopword table name to the configure + table */ + if (!reload) { + str.f_n_char = 0; + str.f_str = (byte*) stopword_to_use; + str.f_len = ut_strlen(stopword_to_use); + + error = fts_config_set_value( + trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str); + } + } else { + /* Load system default stopword list */ + fts_load_default_stopword(&cache->stopword_info); + } + +cleanup: + if (new_trx) { + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + } else { + fts_sql_rollback(trx); + } + + trx_free_for_background(trx); + } + + if (!cache->stopword_info.cached_stopword) { + cache->stopword_info.cached_stopword = rbt_create( + sizeof(fts_tokenizer_word_t), fts_utf8_string_cmp); + } + + return(error == DB_SUCCESS); +} + +/**********************************************************************//** +Callback function when we initialize the FTS at the start up +time. It recovers the maximum Doc IDs presented in the current table. 
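+It is used as the read callback of fts_doc_fetch_by_doc_id() when the table
+no longer has any FTS index and only the Doc ID counter needs restoring
+(see fts_init_index() below).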
+@return: always returns TRUE */ +static +ibool +fts_init_get_doc_id( +/*================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts cache */ +{ + doc_id_t doc_id = FTS_NULL_DOC_ID; + sel_node_t* node = static_cast<sel_node_t*>(row); + que_node_t* exp = node->select_list; + fts_cache_t* cache = static_cast<fts_cache_t*>(user_arg); + + ut_ad(ib_vector_is_empty(cache->get_docs)); + + /* Copy each indexed column content into doc->text.f_str */ + if (exp) { + dfield_t* dfield = que_node_get_val(exp); + dtype_t* type = dfield_get_type(dfield); + void* data = dfield_get_data(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + + doc_id = static_cast<doc_id_t>(mach_read_from_8( + static_cast<const byte*>(data))); + + if (doc_id >= cache->next_doc_id) { + cache->next_doc_id = doc_id + 1; + } + } + + return(TRUE); +} + +/**********************************************************************//** +Callback function when we initialize the FTS at the start up +time. It recovers Doc IDs that have not sync-ed to the auxiliary +table, and require to bring them back into FTS index. +@return: always returns TRUE */ +static +ibool +fts_init_recover_doc( +/*=================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts cache */ +{ + + fts_doc_t doc; + ulint doc_len = 0; + ulint field_no = 0; + fts_get_doc_t* get_doc = static_cast<fts_get_doc_t*>(user_arg); + doc_id_t doc_id = FTS_NULL_DOC_ID; + sel_node_t* node = static_cast<sel_node_t*>(row); + que_node_t* exp = node->select_list; + fts_cache_t* cache = get_doc->cache; + + fts_doc_init(&doc); + doc.found = TRUE; + + ut_ad(cache); + + /* Copy each indexed column content into doc->text.f_str */ + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + ulint len = dfield_get_len(dfield); + + if (field_no == 0) { + dtype_t* type = dfield_get_type(dfield); + void* data = dfield_get_data(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + + doc_id = static_cast<doc_id_t>(mach_read_from_8( + static_cast<const byte*>(data))); + + field_no++; + exp = que_node_get_next(exp); + continue; + } + + if (len == UNIV_SQL_NULL) { + exp = que_node_get_next(exp); + continue; + } + + ut_ad(get_doc); + + if (!get_doc->index_cache->charset) { + ulint prtype = dfield->type.prtype; + + get_doc->index_cache->charset = + innobase_get_fts_charset( + (int)(prtype & DATA_MYSQL_TYPE_MASK), + (uint) dtype_get_charset_coll(prtype)); + } + + doc.charset = get_doc->index_cache->charset; + + if (dfield_is_ext(dfield)) { + dict_table_t* table = cache->sync->table; + ulint zip_size = dict_table_zip_size(table); + + doc.text.f_str = btr_copy_externally_stored_field( + &doc.text.f_len, + static_cast<byte*>(dfield_get_data(dfield)), + zip_size, len, + static_cast<mem_heap_t*>(doc.self_heap->arg)); + } else { + doc.text.f_str = static_cast<byte*>( + dfield_get_data(dfield)); + + doc.text.f_len = len; + } + + if (field_no == 1) { + fts_tokenize_document(&doc, NULL); + } else { + fts_tokenize_document_next(&doc, doc_len, NULL); + } + + exp = que_node_get_next(exp); + + doc_len += (exp) ? len + 1 : len; + + field_no++; + } + + fts_cache_add_doc(cache, get_doc->index_cache, doc_id, doc.tokens); + + fts_doc_free(&doc); + + cache->added++; + + if (doc_id >= cache->next_doc_id) { + cache->next_doc_id = doc_id + 1; + } + + return(TRUE); +} + +/**********************************************************************//** +This function brings FTS index in sync when FTS index is first +used. 
There are documents that have not yet sync-ed to auxiliary +tables from last server abnormally shutdown, we will need to bring +such document into FTS cache before any further operations +@return TRUE if all OK */ +UNIV_INTERN +ibool +fts_init_index( +/*===========*/ + dict_table_t* table, /*!< in: Table with FTS */ + ibool has_cache_lock) /*!< in: Whether we already have + cache lock */ +{ + dict_index_t* index; + doc_id_t start_doc; + fts_get_doc_t* get_doc = NULL; + fts_cache_t* cache = table->fts->cache; + bool need_init = false; + + ut_ad(!mutex_own(&dict_sys->mutex)); + + /* First check cache->get_docs is initialized */ + if (!has_cache_lock) { + rw_lock_x_lock(&cache->lock); + } + + rw_lock_x_lock(&cache->init_lock); + if (cache->get_docs == NULL) { + cache->get_docs = fts_get_docs_create(cache); + } + rw_lock_x_unlock(&cache->init_lock); + + if (table->fts->fts_status & ADDED_TABLE_SYNCED) { + goto func_exit; + } + + need_init = true; + + start_doc = cache->synced_doc_id; + + if (!start_doc) { + fts_cmp_set_sync_doc_id(table, 0, TRUE, &start_doc); + cache->synced_doc_id = start_doc; + } + + /* No FTS index, this is the case when previous FTS index + dropped, and we re-initialize the Doc ID system for subsequent + insertion */ + if (ib_vector_is_empty(cache->get_docs)) { + index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME); + + ut_a(index); + + fts_doc_fetch_by_doc_id(NULL, start_doc, index, + FTS_FETCH_DOC_BY_ID_LARGE, + fts_init_get_doc_id, cache); + } else { + if (table->fts->cache->stopword_info.status + & STOPWORD_NOT_INIT) { + fts_load_stopword(table, NULL, NULL, NULL, TRUE, TRUE); + } + + for (ulint i = 0; i < ib_vector_size(cache->get_docs); ++i) { + get_doc = static_cast<fts_get_doc_t*>( + ib_vector_get(cache->get_docs, i)); + + index = get_doc->index_cache->index; + + fts_doc_fetch_by_doc_id(NULL, start_doc, index, + FTS_FETCH_DOC_BY_ID_LARGE, + fts_init_recover_doc, get_doc); + } + } + + table->fts->fts_status |= ADDED_TABLE_SYNCED; + + fts_get_docs_clear(cache->get_docs); + +func_exit: + if (!has_cache_lock) { + rw_lock_x_unlock(&cache->lock); + } + + if (need_init) { + mutex_enter(&dict_sys->mutex); + /* Register the table with the optimize thread. */ + fts_optimize_add_table(table); + mutex_exit(&dict_sys->mutex); + } + + return(TRUE); +} diff --git a/storage/xtradb/fts/fts0opt.cc b/storage/xtradb/fts/fts0opt.cc new file mode 100644 index 00000000000..2e2bd061d07 --- /dev/null +++ b/storage/xtradb/fts/fts0opt.cc @@ -0,0 +1,3203 @@ +/***************************************************************************** + +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0opt.cc
+Full Text Search optimize thread
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+
+***********************************************************************/
+
+#include "fts0fts.h"
+#include "row0sel.h"
+#include "que0types.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ut0wqueue.h"
+#include "srv0start.h"
+#include "zlib.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/** The FTS optimize thread's work queue. */
+static ib_wqueue_t* fts_optimize_wq;
+
+/** The number of document ids to delete in one statement. */
+static const ulint FTS_MAX_DELETE_DOC_IDS = 1000;
+
+/** Time to wait for a message. */
+static const ulint FTS_QUEUE_WAIT_IN_USECS = 5000000;
+
+/** Default optimize interval in secs. */
+static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
+
+/** Set when the server is shutting down, telling the optimize thread
+to exit */
+static bool fts_opt_start_shutdown = false;
+
+/** Initial size of nodes in fts_word_t. */
+static const ulint FTS_WORD_NODES_INIT_SIZE = 64;
+
+/** Last time we checked whether the system needs a sync */
+static ib_time_t	last_check_sync_time;
+
+#if 0
+/** Check each table in round robin to see whether they'd
+need to be "optimized" */
+static	ulint	fts_optimize_sync_iterator = 0;
+#endif
+
+/** State of a table within the optimization sub system. */
+enum fts_state_t {
+	FTS_STATE_LOADED,
+	FTS_STATE_RUNNING,
+	FTS_STATE_SUSPENDED,
+	FTS_STATE_DONE,
+	FTS_STATE_EMPTY
+};
+
+/** FTS optimize thread message types. */
+enum fts_msg_type_t {
+	FTS_MSG_START,		/*!< Start optimizing thread */
+
+	FTS_MSG_PAUSE,		/*!< Pause optimizing thread */
+
+	FTS_MSG_STOP,		/*!< Stop optimizing and exit thread */
+
+	FTS_MSG_ADD_TABLE,	/*!< Add table to the optimize thread's
+				work queue */
+
+	FTS_MSG_OPTIMIZE_TABLE,	/*!< Optimize a table */
+
+	FTS_MSG_DEL_TABLE,	/*!< Remove a table from the optimize
+				threads work queue */
+};
+
+/** Compressed list of words that have been read from the FTS INDEX
+and need to be optimized. */
+struct fts_zip_t {
+	lint		status;		/*!< Status of the (un)zip operation */
+
+	ulint		n_words;	/*!< Number of words compressed */
+
+	ulint		block_sz;	/*!< Size of a block in bytes */
+
+	ib_vector_t*	blocks;		/*!< Vector of compressed blocks */
+
+	ib_alloc_t*	heap_alloc;	/*!< Heap to use for allocations */
+
+	ulint		pos;		/*!< Offset into blocks */
+
+	ulint		last_big_block;	/*!< Offset of last block in the
+					blocks array that is of size
+					block_sz. Blocks beyond this offset
+					are of size FTS_MAX_WORD_LEN */
+
+	z_streamp	zp;		/*!< ZLib state */
+
+					/*!< The value of the last word read
+					from the FTS INDEX table. This is
+					used to discard duplicates */
+
+	fts_string_t	word;		/*!< UTF-8 string */
+
+	ulint		max_words;	/*!< maximum number of words to read
+					in one pass */
+};
+
+/** Prepared statements used during optimize */
+struct fts_optimize_graph_t {
+					/*!< Delete a word from FTS INDEX */
+	que_t*		delete_nodes_graph;
+					/*!< Insert a word into FTS INDEX */
+	que_t*		write_nodes_graph;
+					/*!< COMMIT a transaction */
+	que_t*		commit_graph;
+					/*!< Read the nodes from FTS_INDEX */
+	que_t*		read_nodes_graph;
+};
+
+/** Used by fts_optimize() to store state. */
+struct fts_optimize_t {
+	trx_t*		trx;		/*!< The transaction used for all SQL */
+
+	ib_alloc_t*	self_heap;	/*!< Heap to use for allocations */
+
+	char*		name_prefix;	/*!< FTS table name prefix */
+
+	fts_table_t	fts_index_table;/*!< Common table definition */
+
+					/*!< Common table definition */
+	fts_table_t	fts_common_table;
+
+	dict_table_t*	table;		/*!< Table that has to be queried */
+
+	dict_index_t*	index;		/*!< The FTS index to be optimized */
+
+	fts_doc_ids_t*	to_delete;	/*!< doc ids to delete, we check against
+					this vector and purge the matching
+					entries during the optimizing
+					process. The vector entries are
+					sorted on doc id */
+
+	ulint		del_pos;	/*!< Offset within to_delete vector,
+					this is used to keep track of where
+					we are up to in the vector */
+
+	ibool		done;		/*!< TRUE when optimize finishes */
+
+	ib_vector_t*	words;		/*!< Word + Nodes read from FTS_INDEX;
+					it contains instances of fts_word_t */
+
+	fts_zip_t*	zip;		/*!< Words read from the FTS_INDEX */
+
+	fts_optimize_graph_t
+			graph;		/*!< Prepared statements used during
+					optimize */
+
+	ulint		n_completed;	/*!< Number of FTS indexes that have
+					been optimized */
+	ibool		del_list_regenerated;
+					/*!< BEING_DELETED list regenerated */
+};
+
+/** Used by optimize to keep state while compacting nodes. */
+struct fts_encode_t {
+	doc_id_t	src_last_doc_id;/*!< Last doc id read from src node */
+	byte*		src_ilist_ptr;	/*!< Current ptr within src ilist */
+};
+
+/** We use this information to determine when to start the optimize
+cycle for a table. */
+struct fts_slot_t {
+	dict_table_t*	table;		/*!< Table to optimize */
+
+	table_id_t	table_id;	/*!< Table id */
+
+	fts_state_t	state;		/*!< State of this slot */
+
+	ulint		added;		/*!< Number of doc ids added since the
+					last time this table was optimized */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since the
+					last time this table was optimized */
+
+	ib_time_t	last_run;	/*!< Time last run completed */
+
+	ib_time_t	completed;	/*!< Optimize finish time */
+
+	ib_time_t	interval_time;	/*!< Minimum time to wait before
+					optimizing the table again. */
+};
+
+/** A table remove message for the FTS optimize thread. */
+struct fts_msg_del_t {
+	dict_table_t*	table;		/*!< The table to remove */
+
+	os_event_t	event;		/*!< Event to synchronize acknowledgement
+					of receipt and processing of this
+					message by the consumer */
+};
+
+/** An optimize table message for the FTS optimize thread. */
+struct fts_msg_optimize_t {
+	dict_table_t*	table;		/*!< Table to optimize */
+};
+
+/** The FTS optimize message work queue message type. */
+struct fts_msg_t {
+	fts_msg_type_t	type;		/*!< Message type */
+
+	void*		ptr;		/*!< The message contents */
+
+	mem_heap_t*	heap;		/*!< The heap used to allocate this
+					message; the message consumer will
+					free the heap. */
+};
+
+/** The number of words to read and optimize in a single pass.
*/ +UNIV_INTERN ulong fts_num_word_optimize; + +// FIXME +UNIV_INTERN char fts_enable_diag_print; + +/** ZLib compressed block size.*/ +static ulint FTS_ZIP_BLOCK_SIZE = 1024; + +/** The amount of time optimizing in a single pass, in milliseconds. */ +static ib_time_t fts_optimize_time_limit = 0; + +/** SQL Statement for changing state of rows to be deleted from FTS Index. */ +static const char* fts_init_delete_sql = + "BEGIN\n" + "\n" + "INSERT INTO \"%s_BEING_DELETED\"\n" + "SELECT doc_id FROM \"%s_DELETED\";\n" + "\n" + "INSERT INTO \"%s_BEING_DELETED_CACHE\"\n" + "SELECT doc_id FROM \"%s_DELETED_CACHE\";\n"; + +static const char* fts_delete_doc_ids_sql = + "BEGIN\n" + "\n" + "DELETE FROM \"%s_DELETED\" WHERE doc_id = :doc_id1;\n" + "DELETE FROM \"%s_DELETED_CACHE\" WHERE doc_id = :doc_id2;\n"; + +static const char* fts_end_delete_sql = + "BEGIN\n" + "\n" + "DELETE FROM \"%s_BEING_DELETED\";\n" + "DELETE FROM \"%s_BEING_DELETED_CACHE\";\n"; + +/**********************************************************************//** +Initialize fts_zip_t. */ +static +void +fts_zip_initialize( +/*===============*/ + fts_zip_t* zip) /*!< out: zip instance to initialize */ +{ + zip->pos = 0; + zip->n_words = 0; + + zip->status = Z_OK; + + zip->last_big_block = 0; + + zip->word.f_len = 0; + memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN); + + ib_vector_reset(zip->blocks); + + memset(zip->zp, 0, sizeof(*zip->zp)); +} + +/**********************************************************************//** +Create an instance of fts_zip_t. +@return a new instance of fts_zip_t */ +static +fts_zip_t* +fts_zip_create( +/*===========*/ + mem_heap_t* heap, /*!< in: heap */ + ulint block_sz, /*!< in: size of a zip block.*/ + ulint max_words) /*!< in: max words to read */ +{ + fts_zip_t* zip; + + zip = static_cast<fts_zip_t*>(mem_heap_zalloc(heap, sizeof(*zip))); + + zip->word.f_str = static_cast<byte*>( + mem_heap_zalloc(heap, FTS_MAX_WORD_LEN + 1)); + + zip->block_sz = block_sz; + + zip->heap_alloc = ib_heap_allocator_create(heap); + + zip->blocks = ib_vector_create(zip->heap_alloc, sizeof(void*), 128); + + zip->max_words = max_words; + + zip->zp = static_cast<z_stream*>( + mem_heap_zalloc(heap, sizeof(*zip->zp))); + + return(zip); +} + +/**********************************************************************//** +Initialize an instance of fts_zip_t. */ +static +void +fts_zip_init( +/*=========*/ + + fts_zip_t* zip) /*!< in: zip instance to init */ +{ + memset(zip->zp, 0, sizeof(*zip->zp)); + + zip->word.f_len = 0; + *zip->word.f_str = '\0'; +} + +/**********************************************************************//** +Create a fts_optimizer_word_t instance. +@return new instance */ +UNIV_INTERN +fts_word_t* +fts_word_init( +/*==========*/ + fts_word_t* word, /*!< in: word to initialize */ + byte* utf8, /*!< in: UTF-8 string */ + ulint len) /*!< in: length of string in bytes */ +{ + mem_heap_t* heap = mem_heap_create(sizeof(fts_node_t)); + + memset(word, 0, sizeof(*word)); + + word->text.f_len = len; + word->text.f_str = static_cast<byte*>(mem_heap_alloc(heap, len + 1)); + + /* Need to copy the NUL character too. */ + memcpy(word->text.f_str, utf8, word->text.f_len); + word->text.f_str[word->text.f_len] = 0; + + word->heap_alloc = ib_heap_allocator_create(heap); + + word->nodes = ib_vector_create( + word->heap_alloc, sizeof(fts_node_t), FTS_WORD_NODES_INIT_SIZE); + + return(word); +} + +/**********************************************************************//** +Read the FTS INDEX row. 
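+The SELECT columns are (word, doc_count, first_doc_id, last_doc_id, ilist);
+the caller has already consumed column 0 (word) before invoking this helper.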
+@return fts_node_t instance */
+static
+fts_node_t*
+fts_optimize_read_node(
+/*===================*/
+	fts_word_t*	word,		/*!< in: word whose node list is
+					being populated */
+	que_node_t*	exp)		/*!< in: SELECT expression list,
+					positioned past the word column */
+{
+	int		i;
+	fts_node_t*	node = static_cast<fts_node_t*>(
+		ib_vector_push(word->nodes, NULL));
+
+	/* Start from 1 since the first column has been read by the caller */
+	for (i = 1; exp; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+			dfield_get_data(dfield));
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT */
+		switch (i) {
+		case 1: /* DOC_COUNT */
+			node->doc_count = mach_read_from_4(data);
+			break;
+
+		case 2: /* FIRST_DOC_ID */
+			node->first_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 3: /* LAST_DOC_ID */
+			node->last_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 4: /* ILIST */
+			node->ilist_size_alloc = node->ilist_size = len;
+			node->ilist = static_cast<byte*>(ut_malloc(len));
+			memcpy(node->ilist, data, len);
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	/* Make sure all columns were read. */
+	ut_a(i == 5);
+
+	return(node);
+}
+
+/**********************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE to continue reading, FALSE once the result cache limit
+is exceeded */
+UNIV_INTERN
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ib_vector_t */
+{
+	fts_word_t*	word;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	ib_vector_t*	words = static_cast<ib_vector_t*>(fetch->read_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+	ulint		dfield_len = dfield_get_len(dfield);
+	fts_node_t*	node;
+	bool		is_word_init = false;
+
+	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+	if (ib_vector_size(words) == 0) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+		is_word_init = true;
+	}
+
+	word = static_cast<fts_word_t*>(ib_vector_last(words));
+
+	if (dfield_len != word->text.f_len
+	    || memcmp(word->text.f_str, data, dfield_len)) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+		is_word_init = true;
+	}
+
+	node = fts_optimize_read_node(word, que_node_get_next(exp));
+
+	fetch->total_memory += node->ilist_size;
+	if (is_word_init) {
+		fetch->total_memory += sizeof(fts_word_t)
+			+ sizeof(ib_alloc_t) + sizeof(ib_vector_t) + dfield_len
+			+ sizeof(fts_node_t) * FTS_WORD_NODES_INIT_SIZE;
+	} else if (ib_vector_size(words) > FTS_WORD_NODES_INIT_SIZE) {
+		fetch->total_memory += sizeof(fts_node_t);
+	}
+
+	if (fetch->total_memory >= fts_result_cache_limit) {
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_index_fetch_nodes( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + que_t** graph, /*!< in: prepared statement */ + fts_table_t* fts_table, /*!< in: table of the FTS INDEX */ + const fts_string_t* + word, /*!< in: the word to fetch */ + fts_fetch_t* fetch) /*!< in: fetch callback.*/ +{ + pars_info_t* info; + dberr_t error; + + trx->op_info = "fetching FTS index nodes"; + + if (*graph) { + info = (*graph)->info; + } else { + info = pars_info_create(); + } + + pars_info_bind_function(info, "my_func", fetch->read_record, fetch); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + if (!*graph) { + ulint selected; + + ut_a(fts_table->type == FTS_INDEX_TABLE); + + selected = fts_select_index(fts_table->charset, + word->f_str, word->f_len); + + fts_table->suffix = fts_get_suffix(selected); + + *graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT word, doc_count, first_doc_id, last_doc_id, " + "ilist\n" + " FROM \"%s\"\n" + " WHERE word LIKE :word\n" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + } + + for(;;) { + error = fts_eval_sql(trx, *graph); + + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + + break; /* Exit the loop. */ + } else { + fts_sql_rollback(trx); + + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: lock wait " + "timeout reading FTS index. " + "Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: (%s) " + "while reading FTS index.\n", + ut_strerr(error)); + + break; /* Exit the loop. */ + } + } + } + + return(error); +} + +/**********************************************************************//** +Read a word */ +static +byte* +fts_zip_read_word( +/*==============*/ + fts_zip_t* zip, /*!< in: Zip state + data */ + fts_string_t* word) /*!< out: uncompressed word */ +{ +#ifdef UNIV_DEBUG + ulint i; +#endif + byte len = 0; + void* null = NULL; + byte* ptr = word->f_str; + int flush = Z_NO_FLUSH; + + /* Either there was an error or we are at the Z_STREAM_END. */ + if (zip->status != Z_OK) { + return(NULL); + } + + zip->zp->next_out = &len; + zip->zp->avail_out = sizeof(len); + + while (zip->status == Z_OK && zip->zp->avail_out > 0) { + + /* Finished decompressing block. */ + if (zip->zp->avail_in == 0) { + + /* Free the block thats been decompressed. */ + if (zip->pos > 0) { + ulint prev = zip->pos - 1; + + ut_a(zip->pos < ib_vector_size(zip->blocks)); + + ut_free(ib_vector_getp(zip->blocks, prev)); + ib_vector_set(zip->blocks, prev, &null); + } + + /* Any more blocks to decompress. */ + if (zip->pos < ib_vector_size(zip->blocks)) { + + zip->zp->next_in = static_cast<byte*>( + ib_vector_getp( + zip->blocks, zip->pos)); + + if (zip->pos > zip->last_big_block) { + zip->zp->avail_in = + FTS_MAX_WORD_LEN; + } else { + zip->zp->avail_in = static_cast<uInt>(zip->block_sz); + } + + ++zip->pos; + } else { + flush = Z_FINISH; + } + } + + switch (zip->status = inflate(zip->zp, flush)) { + case Z_OK: + if (zip->zp->avail_out == 0 && len > 0) { + + ut_a(len <= FTS_MAX_WORD_LEN); + ptr[len] = 0; + + zip->zp->next_out = ptr; + zip->zp->avail_out = len; + + word->f_len = len; + len = 0; + } + break; + + case Z_BUF_ERROR: /* No progress possible. 
*/ + case Z_STREAM_END: + inflateEnd(zip->zp); + break; + + case Z_STREAM_ERROR: + default: + ut_error; + } + } + +#ifdef UNIV_DEBUG + /* All blocks must be freed at end of inflate. */ + if (zip->status != Z_OK) { + for (i = 0; i < ib_vector_size(zip->blocks); ++i) { + if (ib_vector_getp(zip->blocks, i)) { + ut_free(ib_vector_getp(zip->blocks, i)); + ib_vector_set(zip->blocks, i, &null); + } + } + } + + if (ptr != NULL) { + ut_ad(word->f_len == strlen((char*) ptr)); + } +#endif /* UNIV_DEBUG */ + + return(zip->status == Z_OK || zip->status == Z_STREAM_END ? ptr : NULL); +} + +/**********************************************************************//** +Callback function to fetch and compress the word in an FTS +INDEX record. +@return FALSE on EOF */ +static +ibool +fts_fetch_index_words( +/*==================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + sel_node_t* sel_node = static_cast<sel_node_t*>(row); + fts_zip_t* zip = static_cast<fts_zip_t*>(user_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + byte len = (byte) dfield_get_len(dfield); + void* data = dfield_get_data(dfield); + + /* Skip the duplicate words. */ + if (zip->word.f_len == len && !memcmp(zip->word.f_str, data, len)) { + + return(TRUE); + } + + ut_a(len <= FTS_MAX_WORD_LEN); + + memcpy(zip->word.f_str, data, len); + zip->word.f_len = len; + + ut_a(zip->zp->avail_in == 0); + ut_a(zip->zp->next_in == NULL); + + /* The string is prefixed by len. */ + zip->zp->next_in = &len; + zip->zp->avail_in = sizeof(len); + + /* Compress the word, create output blocks as necessary. */ + while (zip->zp->avail_in > 0) { + + /* No space left in output buffer, create a new one. */ + if (zip->zp->avail_out == 0) { + byte* block; + + block = static_cast<byte*>(ut_malloc(zip->block_sz)); + ib_vector_push(zip->blocks, &block); + + zip->zp->next_out = block; + zip->zp->avail_out = static_cast<uInt>(zip->block_sz); + } + + switch (zip->status = deflate(zip->zp, Z_NO_FLUSH)) { + case Z_OK: + if (zip->zp->avail_in == 0) { + zip->zp->next_in = static_cast<byte*>(data); + zip->zp->avail_in = len; + ut_a(len <= FTS_MAX_WORD_LEN); + len = 0; + } + break; + + case Z_STREAM_END: + case Z_BUF_ERROR: + case Z_STREAM_ERROR: + default: + ut_error; + break; + } + } + + /* All data should have been compressed. */ + ut_a(zip->zp->avail_in == 0); + zip->zp->next_in = NULL; + + ++zip->n_words; + + return(zip->n_words >= zip->max_words ? FALSE : TRUE); +} + +/**********************************************************************//** +Finish Zip deflate. */ +static +void +fts_zip_deflate_end( +/*================*/ + fts_zip_t* zip) /*!< in: instance that should be closed*/ +{ + ut_a(zip->zp->avail_in == 0); + ut_a(zip->zp->next_in == NULL); + + zip->status = deflate(zip->zp, Z_FINISH); + + ut_a(ib_vector_size(zip->blocks) > 0); + zip->last_big_block = ib_vector_size(zip->blocks) - 1; + + /* Allocate smaller block(s), since this is trailing data. */ + while (zip->status == Z_OK) { + byte* block; + + ut_a(zip->zp->avail_out == 0); + + block = static_cast<byte*>(ut_malloc(FTS_MAX_WORD_LEN + 1)); + ib_vector_push(zip->blocks, &block); + + zip->zp->next_out = block; + zip->zp->avail_out = FTS_MAX_WORD_LEN; + + zip->status = deflate(zip->zp, Z_FINISH); + } + + ut_a(zip->status == Z_STREAM_END); + + zip->status = deflateEnd(zip->zp); + ut_a(zip->status == Z_OK); + + /* Reset the ZLib data structure. 
*/ + memset(zip->zp, 0, sizeof(*zip->zp)); +} + +/**********************************************************************//** +Read the words from the FTS INDEX. +@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if no more indexes + to search else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_index_fetch_words( +/*==================*/ + fts_optimize_t* optim, /*!< in: optimize scratch pad */ + const fts_string_t* word, /*!< in: get words greater than this + word */ + ulint n_words)/*!< in: max words to read */ +{ + pars_info_t* info; + que_t* graph; + ulint selected; + fts_zip_t* zip = NULL; + dberr_t error = DB_SUCCESS; + mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg); + ibool inited = FALSE; + + optim->trx->op_info = "fetching FTS index words"; + + if (optim->zip == NULL) { + optim->zip = fts_zip_create(heap, FTS_ZIP_BLOCK_SIZE, n_words); + } else { + fts_zip_initialize(optim->zip); + } + + for (selected = fts_select_index( + optim->fts_index_table.charset, word->f_str, word->f_len); + fts_index_selector[selected].value; + selected++) { + + optim->fts_index_table.suffix = fts_get_suffix(selected); + + /* We've search all indexes. */ + if (optim->fts_index_table.suffix == NULL) { + return(DB_TABLE_NOT_FOUND); + } + + info = pars_info_create(); + + pars_info_bind_function( + info, "my_func", fts_fetch_index_words, optim->zip); + + pars_info_bind_varchar_literal( + info, "word", word->f_str, word->f_len); + + graph = fts_parse_sql( + &optim->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT word\n" + " FROM \"%s\"\n" + " WHERE word > :word\n" + " ORDER BY word;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + zip = optim->zip; + + for(;;) { + int err; + + if (!inited && ((err = deflateInit(zip->zp, 9)) + != Z_OK)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: ZLib deflateInit() " + "failed: %d\n", err); + + error = DB_ERROR; + break; + } else { + inited = TRUE; + error = fts_eval_sql(optim->trx, graph); + } + + if (error == DB_SUCCESS) { + //FIXME fts_sql_commit(optim->trx); + break; + } else { + //FIXME fts_sql_rollback(optim->trx); + + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: " + "Warning: lock wait " + "timeout reading document. " + "Retrying!\n"); + + /* We need to reset the ZLib state. */ + inited = FALSE; + deflateEnd(zip->zp); + fts_zip_init(zip); + + optim->trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: (%s) " + "while reading document.\n", + ut_strerr(error)); + + break; /* Exit the loop. */ + } + } + } + + fts_que_graph_free(graph); + + /* Check if max word to fetch is exceeded */ + if (optim->zip->n_words >= n_words) { + break; + } + } + + if (error == DB_SUCCESS && zip->status == Z_OK && zip->n_words > 0) { + + /* All data should have been read. */ + ut_a(zip->zp->avail_in == 0); + + fts_zip_deflate_end(zip); + } else { + deflateEnd(zip->zp); + } + + return(error); +} + +/**********************************************************************//** +Callback function to fetch the doc id from the record. 
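fts_fetch_index_words() above feeds length-prefixed words through zlib's deflate() into fixed-size blocks, and fts_zip_read_word() later replays those blocks through inflate(). The same state machine in miniature, as a self-contained round trip (build with -lz; a single Z_FINISH call suffices here only because the buffers are large enough to hold everything at once):

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int
main(void)
{
	static const char	src[] = "full text search";
	unsigned char		comp[128];
	unsigned char		plain[128];
	z_stream		strm;
	uInt			comp_len;

	/* Compress: mirrors the deflate() loop in
	fts_fetch_index_words(). */
	memset(&strm, 0, sizeof(strm));
	assert(deflateInit(&strm, 9) == Z_OK);

	strm.next_in = (unsigned char*) src;
	strm.avail_in = sizeof(src);
	strm.next_out = comp;
	strm.avail_out = sizeof(comp);

	assert(deflate(&strm, Z_FINISH) == Z_STREAM_END);
	comp_len = (uInt) (sizeof(comp) - strm.avail_out);
	assert(deflateEnd(&strm) == Z_OK);

	/* Decompress: mirrors the inflate() loop in
	fts_zip_read_word(). */
	memset(&strm, 0, sizeof(strm));
	assert(inflateInit(&strm) == Z_OK);

	strm.next_in = comp;
	strm.avail_in = comp_len;
	strm.next_out = plain;
	strm.avail_out = sizeof(plain);

	assert(inflate(&strm, Z_FINISH) == Z_STREAM_END);
	assert(inflateEnd(&strm) == Z_OK);

	printf("%s\n", (char*) plain);	/* prints the original string */
	return(0);
}

The production code cannot take the one-shot path: it decompresses word by word into a small output window (avail_out is one length byte, then the word), which is exactly what the avail_in/avail_out juggling above implements.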
+@return always returns TRUE */ +static +ibool +fts_fetch_doc_ids( +/*==============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + que_node_t* exp; + int i = 0; + sel_node_t* sel_node = static_cast<sel_node_t*>(row); + fts_doc_ids_t* fts_doc_ids = static_cast<fts_doc_ids_t*>(user_arg); + fts_update_t* update = static_cast<fts_update_t*>( + ib_vector_push(fts_doc_ids->doc_ids, NULL)); + + for (exp = sel_node->select_list; + exp; + exp = que_node_get_next(exp), ++i) { + + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(len != UNIV_SQL_NULL); + + /* Note: The column numbers below must match the SELECT. */ + switch (i) { + case 0: /* DOC_ID */ + update->fts_indexes = NULL; + update->doc_id = fts_read_doc_id( + static_cast<byte*>(data)); + break; + + default: + ut_error; + } + } + + return(TRUE); +} + +/**********************************************************************//** +Read the rows from a FTS common auxiliary table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_table_fetch_doc_ids( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: table */ + fts_doc_ids_t* doc_ids) /*!< in: For collecting doc ids */ +{ + dberr_t error; + que_t* graph; + pars_info_t* info = pars_info_create(); + ibool alloc_bk_trx = FALSE; + + ut_a(fts_table->suffix != NULL); + ut_a(fts_table->type == FTS_COMMON_TABLE); + + if (!trx) { + trx = trx_allocate_for_background(); + alloc_bk_trx = TRUE; + } + + trx->op_info = "fetching FTS doc ids"; + + pars_info_bind_function(info, "my_func", fts_fetch_doc_ids, doc_ids); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_id FROM \"%s\";\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + error = fts_eval_sql(trx, graph); + + mutex_enter(&dict_sys->mutex); + que_graph_free(graph); + mutex_exit(&dict_sys->mutex); + + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + + ib_vector_sort(doc_ids->doc_ids, fts_update_doc_id_cmp); + } else { + fts_sql_rollback(trx); + } + + if (alloc_bk_trx) { + trx_free_for_background(trx); + } + + return(error); +} + +/**********************************************************************//** +Do a binary search for a doc id in the array +@return +ve index if found -ve index where it should be inserted + if not found */ +UNIV_INTERN +int +fts_bsearch( +/*========*/ + fts_update_t* array, /*!< in: array to sort */ + int lower, /*!< in: the array lower bound */ + int upper, /*!< in: the array upper bound */ + doc_id_t doc_id) /*!< in: the doc id to search for */ +{ + int orig_size = upper; + + if (upper == 0) { + /* Nothing to search */ + return(-1); + } else { + while (lower < upper) { + int i = (lower + upper) >> 1; + + if (doc_id > array[i].doc_id) { + lower = i + 1; + } else if (doc_id < array[i].doc_id) { + upper = i - 1; + } else { + return(i); /* Found. */ + } + } + } + + if (lower == upper && lower < orig_size) { + if (doc_id == array[lower].doc_id) { + return(lower); + } else if (lower == 0) { + return(-1); + } + } + + /* Not found. */ + return( (lower == 0) ? 
-1 : -lower); +} + +/**********************************************************************//** +Search in the to delete array whether any of the doc ids within +the [first, last] range are to be deleted +@return +ve index if found -ve index where it should be inserted + if not found */ +static +int +fts_optimize_lookup( +/*================*/ + ib_vector_t* doc_ids, /*!< in: array to search */ + ulint lower, /*!< in: lower limit of array */ + doc_id_t first_doc_id, /*!< in: doc id to lookup */ + doc_id_t last_doc_id) /*!< in: doc id to lookup */ +{ + int pos; + int upper = static_cast<int>(ib_vector_size(doc_ids)); + fts_update_t* array = (fts_update_t*) doc_ids->data; + + pos = fts_bsearch(array, static_cast<int>(lower), upper, first_doc_id); + + ut_a(abs(pos) <= upper + 1); + + if (pos < 0) { + + int i = abs(pos); + + /* If i is 1, it could be first_doc_id is less than + either the first or second array item, do a + double check */ + if (i == 1 && array[0].doc_id <= last_doc_id + && first_doc_id < array[0].doc_id) { + pos = 0; + } else if (i < upper && array[i].doc_id <= last_doc_id) { + + /* Check if the "next" doc id is within the + first & last doc id of the node. */ + pos = i; + } + } + + return(pos); +} + +/**********************************************************************//** +Encode the word pos list into the node +@return DB_SUCCESS or error code*/ +static __attribute__((nonnull)) +dberr_t +fts_optimize_encode_node( +/*=====================*/ + fts_node_t* node, /*!< in: node to fill*/ + doc_id_t doc_id, /*!< in: doc id to encode */ + fts_encode_t* enc) /*!< in: encoding state.*/ +{ + byte* dst; + ulint enc_len; + ulint pos_enc_len; + doc_id_t doc_id_delta; + dberr_t error = DB_SUCCESS; + byte* src = enc->src_ilist_ptr; + + if (node->first_doc_id == 0) { + ut_a(node->last_doc_id == 0); + + node->first_doc_id = doc_id; + } + + /* Calculate the space required to store the ilist. */ + doc_id_delta = doc_id - node->last_doc_id; + enc_len = fts_get_encoded_len(static_cast<ulint>(doc_id_delta)); + + /* Calculate the size of the encoded pos array. */ + while (*src) { + fts_decode_vlc(&src); + } + + /* Skip the 0x00 byte at the end of the word positions list. */ + ++src; + + /* Number of encoded pos bytes to copy. */ + pos_enc_len = src - enc->src_ilist_ptr; + + /* Total number of bytes required for copy. */ + enc_len += pos_enc_len; + + /* Check we have enough space in the destination buffer for + copying the document word list. */ + if (!node->ilist) { + ulint new_size; + + ut_a(node->ilist_size == 0); + + new_size = enc_len > FTS_ILIST_MAX_SIZE + ? enc_len : FTS_ILIST_MAX_SIZE; + + node->ilist = static_cast<byte*>(ut_malloc(new_size)); + node->ilist_size_alloc = new_size; + + } else if ((node->ilist_size + enc_len) > node->ilist_size_alloc) { + ulint new_size = node->ilist_size + enc_len; + byte* ilist = static_cast<byte*>(ut_malloc(new_size)); + + memcpy(ilist, node->ilist, node->ilist_size); + + ut_free(node->ilist); + + node->ilist = ilist; + node->ilist_size_alloc = new_size; + } + + src = enc->src_ilist_ptr; + dst = node->ilist + node->ilist_size; + + /* Encode the doc id. Cast to ulint, the delta should be small and + therefore no loss of precision. */ + dst += fts_encode_int((ulint) doc_id_delta, dst); + + /* Copy the encoded pos array. */ + memcpy(dst, src, pos_enc_len); + + node->last_doc_id = doc_id; + + /* Data copied upto here. 
*/
+	node->ilist_size += enc_len;
+	enc->src_ilist_ptr += pos_enc_len;
+
+	ut_a(node->ilist_size <= node->ilist_size_alloc);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Optimize the data contained in a node.
+@return DB_SUCCESS or error code*/
+static __attribute__((nonnull))
+dberr_t
+fts_optimize_node(
+/*==============*/
+	ib_vector_t*	del_vec,	/*!< in: vector of doc ids to delete*/
+	int*		del_pos,	/*!< in: offset into above vector */
+	fts_node_t*	dst_node,	/*!< in: node to fill*/
+	fts_node_t*	src_node,	/*!< in: source node for data*/
+	fts_encode_t*	enc)		/*!< in: encoding state */
+{
+	ulint		copied;
+	dberr_t		error = DB_SUCCESS;
+	doc_id_t	doc_id = enc->src_last_doc_id;
+
+	if (!enc->src_ilist_ptr) {
+		enc->src_ilist_ptr = src_node->ilist;
+	}
+
+	copied = enc->src_ilist_ptr - src_node->ilist;
+
+	/* While there is data in the source node and space to copy
+	into in the destination node. */
+	while (copied < src_node->ilist_size
+	       && dst_node->ilist_size < FTS_ILIST_MAX_SIZE) {
+
+		doc_id_t	delta;
+		doc_id_t	del_doc_id = FTS_NULL_DOC_ID;
+
+		delta = fts_decode_vlc(&enc->src_ilist_ptr);
+
+test_again:
+		/* Check whether the doc id is in the delete list; if
+		so then we skip the entries, but we need to track the
+		delta for decoding the entries following this document's
+		entries. */
+		if (*del_pos >= 0 && *del_pos < (int) ib_vector_size(del_vec)) {
+			fts_update_t*	update;
+
+			update = (fts_update_t*) ib_vector_get(
+				del_vec, *del_pos);
+
+			del_doc_id = update->doc_id;
+		}
+
+		if (enc->src_ilist_ptr == src_node->ilist && doc_id == 0) {
+			ut_a(delta == src_node->first_doc_id);
+		}
+
+		doc_id += delta;
+
+		if (del_doc_id > 0 && doc_id == del_doc_id) {
+
+			++*del_pos;
+
+			/* Skip the entries for this document. */
+			while (*enc->src_ilist_ptr) {
+				fts_decode_vlc(&enc->src_ilist_ptr);
+			}
+
+			/* Skip the end of word position marker. */
+			++enc->src_ilist_ptr;
+
+		} else {
+
+			/* The doc id is already larger than
+			del_doc_id; check the next del_doc_id. */
+			if (del_doc_id > 0 && doc_id > del_doc_id) {
+				del_doc_id = 0;
+				++*del_pos;
+				delta = 0;
+				goto test_again;
+			}
+
+			/* Decode and copy the word positions into
+			the dest node. */
+			fts_optimize_encode_node(dst_node, doc_id, enc);
+
+			++dst_node->doc_count;
+
+			ut_a(dst_node->last_doc_id == doc_id);
+		}
+
+		/* Bytes copied so far from source. */
+		copied = enc->src_ilist_ptr - src_node->ilist;
+	}
+
+	if (copied >= src_node->ilist_size) {
+		ut_a(doc_id == src_node->last_doc_id);
+	}
+
+	enc->src_last_doc_id = doc_id;
+
+	return(error);
+}
+
+/**********************************************************************//**
+Determine the starting pos within the deleted doc id vector for a word.
+@return delete position */
+static __attribute__((nonnull, warn_unused_result))
+int
+fts_optimize_deleted_pos(
+/*=====================*/
+	fts_optimize_t*	optim,		/*!< in: optimize state data */
+	fts_word_t*	word)		/*!< in: the word data to check */
+{
+	int		del_pos;
+	ib_vector_t*	del_vec = optim->to_delete->doc_ids;
+
+	/* Get the first and last doc ids for the word; we will use
+	these values to determine which doc ids need to be removed
+	when we coalesce the nodes. This way we can reduce the number
+	of elements that need to be searched in the deleted doc ids
+	vector, and secondly we can remove the doc ids during the
+	coalescing phase.
*/ + if (ib_vector_size(del_vec) > 0) { + fts_node_t* node; + doc_id_t last_id; + doc_id_t first_id; + ulint size = ib_vector_size(word->nodes); + + node = (fts_node_t*) ib_vector_get(word->nodes, 0); + first_id = node->first_doc_id; + + node = (fts_node_t*) ib_vector_get(word->nodes, size - 1); + last_id = node->last_doc_id; + + ut_a(first_id <= last_id); + + del_pos = fts_optimize_lookup( + del_vec, optim->del_pos, first_id, last_id); + } else { + + del_pos = -1; /* Note that there is nothing to delete. */ + } + + return(del_pos); +} + +#define FTS_DEBUG_PRINT +/**********************************************************************//** +Compact the nodes for a word, we also remove any doc ids during the +compaction pass. +@return DB_SUCCESS or error code.*/ +static +ib_vector_t* +fts_optimize_word( +/*==============*/ + fts_optimize_t* optim, /*!< in: optimize state data */ + fts_word_t* word) /*!< in: the word to optimize */ +{ + fts_encode_t enc; + ib_vector_t* nodes; + ulint i = 0; + int del_pos; + fts_node_t* dst_node = NULL; + ib_vector_t* del_vec = optim->to_delete->doc_ids; + ulint size = ib_vector_size(word->nodes); + + del_pos = fts_optimize_deleted_pos(optim, word); + nodes = ib_vector_create(word->heap_alloc, sizeof(*dst_node), 128); + + enc.src_last_doc_id = 0; + enc.src_ilist_ptr = NULL; + + if (fts_enable_diag_print) { + word->text.f_str[word->text.f_len] = 0; + fprintf(stderr, "FTS_OPTIMIZE: optimize \"%s\"\n", + word->text.f_str); + } + + while (i < size) { + ulint copied; + fts_node_t* src_node; + + src_node = (fts_node_t*) ib_vector_get(word->nodes, i); + + if (!dst_node) { + + dst_node = static_cast<fts_node_t*>( + ib_vector_push(nodes, NULL)); + memset(dst_node, 0, sizeof(*dst_node)); + } + + /* Copy from the src to the dst node. */ + fts_optimize_node(del_vec, &del_pos, dst_node, src_node, &enc); + + ut_a(enc.src_ilist_ptr != NULL); + + /* Determine the numer of bytes copied to dst_node. */ + copied = enc.src_ilist_ptr - src_node->ilist; + + /* Can't copy more than whats in the vlc array. */ + ut_a(copied <= src_node->ilist_size); + + /* We are done with this node release the resources. */ + if (copied == src_node->ilist_size) { + + enc.src_last_doc_id = 0; + enc.src_ilist_ptr = NULL; + + ut_free(src_node->ilist); + + src_node->ilist = NULL; + src_node->ilist_size = src_node->ilist_size_alloc = 0; + + src_node = NULL; + + ++i; /* Get next source node to OPTIMIZE. */ + } + + if (dst_node->ilist_size >= FTS_ILIST_MAX_SIZE || i >= size) { + + dst_node = NULL; + } + } + + /* All dst nodes created should have been added to the vector. */ + ut_a(dst_node == NULL); + + /* Return the OPTIMIZED nodes. */ + return(nodes); +} + +/**********************************************************************//** +Update the FTS index table. This is a delete followed by an insert. 
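fts_optimize_encode_node() above writes each doc id as a variable-length-coded delta against the node's previous doc id, followed by the word's position list (fts_encode_int()/fts_decode_vlc()). The byte layout of InnoDB's VLC is internal and not reproduced here; the sketch below uses a generic base-128 varint purely to illustrate the delta-plus-varint idea:

#include <stdio.h>

/* Append v as a base-128 varint, least significant group first.
Illustrative only; fts_encode_int() need not use this layout. */
static unsigned char*
varint_put(unsigned char* p, unsigned long v)
{
	while (v >= 0x80) {
		*p++ = (unsigned char) (v | 0x80);
		v >>= 7;
	}
	*p++ = (unsigned char) v;
	return(p);
}

static const unsigned char*
varint_get(const unsigned char* p, unsigned long* v)
{
	unsigned long	shift = 0;

	*v = 0;
	while (*p & 0x80) {
		*v |= (unsigned long) (*p++ & 0x7f) << shift;
		shift += 7;
	}
	*v |= (unsigned long) *p++ << shift;
	return(p);
}

int
main(void)
{
	/* Delta-encode a sorted doc id list, as the ilist does. */
	const unsigned long	doc_ids[] = {7, 300, 301, 5000};
	unsigned char		buf[32];
	unsigned char*		end = buf;
	const unsigned char*	p;
	unsigned long		prev = 0;
	unsigned long		delta;
	int			i;

	for (i = 0; i < 4; i++) {
		end = varint_put(end, doc_ids[i] - prev);
		prev = doc_ids[i];
	}

	/* Decode: accumulate deltas back into absolute doc ids. */
	for (p = buf, prev = 0; p < end; ) {
		p = varint_get(p, &delta);
		prev += delta;
		printf("%lu\n", prev);
	}
	return(0);
}

Keeping the doc ids sorted is what makes this pay off: the deltas stay small, so most entries fit in one or two bytes, which is why fts_optimize_node() must track the running doc id even for documents it skips.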
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_write_word( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: table of FTS index */ + fts_string_t* word, /*!< in: word data to write */ + ib_vector_t* nodes) /*!< in: the nodes to write */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + ulint selected; + dberr_t error = DB_SUCCESS; + char* table_name = fts_get_table_name(fts_table); + + info = pars_info_create(); + + ut_ad(fts_table->charset); + + if (fts_enable_diag_print) { + fprintf(stderr, "FTS_OPTIMIZE: processed \"%s\"\n", + word->f_str); + } + + pars_info_bind_varchar_literal( + info, "word", word->f_str, word->f_len); + + selected = fts_select_index(fts_table->charset, + word->f_str, word->f_len); + + fts_table->suffix = fts_get_suffix(selected); + + graph = fts_parse_sql( + fts_table, + info, + "BEGIN DELETE FROM \"%s\" WHERE word = :word;"); + + error = fts_eval_sql(trx, graph); + + if (error != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: (%s) during optimize, " + "when deleting a word from the FTS index.\n", + ut_strerr(error)); + } + + fts_que_graph_free(graph); + graph = NULL; + + mem_free(table_name); + + /* Even if the operation needs to be rolled back and redone, + we iterate over the nodes in order to free the ilist. */ + for (i = 0; i < ib_vector_size(nodes); ++i) { + + fts_node_t* node = (fts_node_t*) ib_vector_get(nodes, i); + + if (error == DB_SUCCESS) { + error = fts_write_node( + trx, &graph, fts_table, word, node); + + if (error != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: (%s) " + "during optimize, while adding a " + "word to the FTS index.\n", + ut_strerr(error)); + } + } + + ut_free(node->ilist); + node->ilist = NULL; + node->ilist_size = node->ilist_size_alloc = 0; + } + + if (graph != NULL) { + fts_que_graph_free(graph); + } + + return(error); +} + +/**********************************************************************//** +Free fts_optimizer_word_t instanace.*/ +UNIV_INTERN +void +fts_word_free( +/*==========*/ + fts_word_t* word) /*!< in: instance to free.*/ +{ + mem_heap_t* heap = static_cast<mem_heap_t*>(word->heap_alloc->arg); + +#ifdef UNIV_DEBUG + memset(word, 0, sizeof(*word)); +#endif /* UNIV_DEBUG */ + + mem_heap_free(heap); +} + +/**********************************************************************//** +Optimize the word ilist and rewrite data to the FTS index. +@return status one of RESTART, EXIT, ERROR */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_compact( +/*=================*/ + fts_optimize_t* optim, /*!< in: optimize state data */ + dict_index_t* index, /*!< in: current FTS being optimized */ + ib_time_t start_time) /*!< in: optimize start time */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + ulint size = ib_vector_size(optim->words); + + for (i = 0; i < size && error == DB_SUCCESS && !optim->done; ++i) { + fts_word_t* word; + ib_vector_t* nodes; + trx_t* trx = optim->trx; + + word = (fts_word_t*) ib_vector_get(optim->words, i); + + /* nodes is allocated from the word heap and will be destroyed + when the word is freed. We however have to be careful about + the ilist, that needs to be freed explicitly. */ + nodes = fts_optimize_word(optim, word); + + /* Update the data on disk. 
*/ + error = fts_optimize_write_word( + trx, &optim->fts_index_table, &word->text, nodes); + + if (error == DB_SUCCESS) { + /* Write the last word optimized to the config table, + we use this value for restarting optimize. */ + error = fts_config_set_index_value( + optim->trx, index, + FTS_LAST_OPTIMIZED_WORD, &word->text); + } + + /* Free the word that was optimized. */ + fts_word_free(word); + + if (fts_optimize_time_limit > 0 + && (ut_time() - start_time) > fts_optimize_time_limit) { + + optim->done = TRUE; + } + } + + return(error); +} + +/**********************************************************************//** +Create an instance of fts_optimize_t. Also create a new +background transaction.*/ +static +fts_optimize_t* +fts_optimize_create( +/*================*/ + dict_table_t* table) /*!< in: table with FTS indexes */ +{ + fts_optimize_t* optim; + mem_heap_t* heap = mem_heap_create(128); + + optim = (fts_optimize_t*) mem_heap_zalloc(heap, sizeof(*optim)); + + optim->self_heap = ib_heap_allocator_create(heap); + + optim->to_delete = fts_doc_ids_create(); + + optim->words = ib_vector_create( + optim->self_heap, sizeof(fts_word_t), 256); + + optim->table = table; + + optim->trx = trx_allocate_for_background(); + + optim->fts_common_table.parent = table->name; + optim->fts_common_table.table_id = table->id; + optim->fts_common_table.type = FTS_COMMON_TABLE; + optim->fts_common_table.table = table; + + optim->fts_index_table.parent = table->name; + optim->fts_index_table.table_id = table->id; + optim->fts_index_table.type = FTS_INDEX_TABLE; + optim->fts_index_table.table = table; + + /* The common prefix for all this parent table's aux tables. */ + optim->name_prefix = fts_get_table_name_prefix( + &optim->fts_common_table); + + return(optim); +} + +#ifdef FTS_OPTIMIZE_DEBUG +/**********************************************************************//** +Get optimize start time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_get_index_start_time( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + ib_time_t* start_time) /*!< out: time in secs */ +{ + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint*) start_time)); +} + +/**********************************************************************//** +Set the optimize start time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_set_index_start_time( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + ib_time_t start_time) /*!< in: start time */ +{ + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint) start_time)); +} + +/**********************************************************************//** +Get optimize end time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_get_index_end_time( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + ib_time_t* end_time) /*!< out: time in secs */ +{ + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time)); +} + +/**********************************************************************//** +Set the optimize end time of an FTS index. 
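fts_optimize_compact() above bounds each pass by wall-clock time rather than by word count: after every word it compares ut_time() - start_time against fts_optimize_time_limit and sets optim->done once the budget is spent. (Note the units: fts_optimize_get_time_limit(), defined further down, scales the configured value to milliseconds, while ut_time() deltas are in seconds, so the cut-off appears far looser than intended.) The pattern in isolation, with process_one_word() as a hypothetical unit of work:

#include <stdio.h>
#include <time.h>

/* Hypothetical stand-in for optimizing one word; just burns CPU. */
static long
process_one_word(long i)
{
	volatile long	x = 0;
	long		j;

	for (j = 0; j < 100000; j++) {
		x += (i + j) % 7;
	}
	return(x);
}

int
main(void)
{
	time_t	start_time = time(NULL);
	time_t	time_limit = 2;		/* seconds; cf. fts_optimize_time_limit */
	long	i;
	int	done = 0;

	for (i = 0; !done; i++) {
		process_one_word(i);

		/* Stop at a word boundary once the budget is spent,
		the same shape as the optim->done check above. */
		if (time_limit > 0 && time(NULL) - start_time > time_limit) {
			done = 1;
		}
	}

	printf("optimized %ld words within the time budget\n", i);
	return(0);
}

Checking only between units of work keeps each word's delete-and-reinsert atomic; the pass never stops with a word half written.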
+@return DB_SUCCESS if all OK else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_set_index_end_time( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + ib_time_t end_time) /*!< in: end time */ +{ + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time)); +} +#endif + +/**********************************************************************//** +Free the optimize prepared statements.*/ +static +void +fts_optimize_graph_free( +/*====================*/ + fts_optimize_graph_t* graph) /*!< in/out: The graph instances + to free */ +{ + if (graph->commit_graph) { + que_graph_free(graph->commit_graph); + graph->commit_graph = NULL; + } + + if (graph->write_nodes_graph) { + que_graph_free(graph->write_nodes_graph); + graph->write_nodes_graph = NULL; + } + + if (graph->delete_nodes_graph) { + que_graph_free(graph->delete_nodes_graph); + graph->delete_nodes_graph = NULL; + } + + if (graph->read_nodes_graph) { + que_graph_free(graph->read_nodes_graph); + graph->read_nodes_graph = NULL; + } +} + +/**********************************************************************//** +Free all optimize resources. */ +static +void +fts_optimize_free( +/*==============*/ + fts_optimize_t* optim) /*!< in: table with on FTS index */ +{ + mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg); + + trx_free_for_background(optim->trx); + + fts_doc_ids_free(optim->to_delete); + fts_optimize_graph_free(&optim->graph); + + mem_free(optim->name_prefix); + + /* This will free the heap from which optim itself was allocated. */ + mem_heap_free(heap); +} + +/**********************************************************************//** +Get the max time optimize should run in millisecs. +@return max optimize time limit in millisecs. */ +static +ib_time_t +fts_optimize_get_time_limit( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table) /*!< in: aux table */ +{ + ib_time_t time_limit = 0; + + fts_config_get_ulint( + trx, fts_table, + FTS_OPTIMIZE_LIMIT_IN_SECS, (ulint*) &time_limit); + + return(time_limit * 1000); +} + + +/**********************************************************************//** +Run OPTIMIZE on the given table. Note: this can take a very long time +(hours). */ +static +void +fts_optimize_words( +/*===============*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index, /*!< in: current FTS being optimized */ + fts_string_t* word) /*!< in: the starting word to optimize */ +{ + fts_fetch_t fetch; + ib_time_t start_time; + que_t* graph = NULL; + CHARSET_INFO* charset = optim->fts_index_table.charset; + + ut_a(!optim->done); + + /* Get the time limit from the config table. */ + fts_optimize_time_limit = fts_optimize_get_time_limit( + optim->trx, &optim->fts_common_table); + + start_time = ut_time(); + + /* Setup the callback to use for fetching the word ilist etc. */ + fetch.read_arg = optim->words; + fetch.read_record = fts_optimize_index_fetch_node; + + fprintf(stderr, "%.*s\n", (int) word->f_len, word->f_str); + + while(!optim->done) { + dberr_t error; + trx_t* trx = optim->trx; + ulint selected; + + ut_a(ib_vector_size(optim->words) == 0); + + selected = fts_select_index(charset, word->f_str, word->f_len); + + /* Read the index records to optimize. 
*/ + fetch.total_memory = 0; + error = fts_index_fetch_nodes( + trx, &graph, &optim->fts_index_table, word, + &fetch); + ut_ad(fetch.total_memory < fts_result_cache_limit); + + if (error == DB_SUCCESS) { + /* There must be some nodes to read. */ + ut_a(ib_vector_size(optim->words) > 0); + + /* Optimize the nodes that were read and write + back to DB. */ + error = fts_optimize_compact(optim, index, start_time); + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + } + + ib_vector_reset(optim->words); + + if (error == DB_SUCCESS) { + if (!optim->done) { + if (!fts_zip_read_word(optim->zip, word)) { + optim->done = TRUE; + } else if (selected + != fts_select_index( + charset, word->f_str, + word->f_len) + && graph) { + fts_que_graph_free(graph); + graph = NULL; + } + } + } else if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, "InnoDB: Warning: lock wait timeout " + "during optimize. Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else if (error == DB_DEADLOCK) { + fprintf(stderr, "InnoDB: Warning: deadlock " + "during optimize. Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + optim->done = TRUE; /* Exit the loop. */ + } + } + + if (graph != NULL) { + fts_que_graph_free(graph); + } +} + +/**********************************************************************//** +Select the FTS index to search. +@return TRUE if last index */ +static +ibool +fts_optimize_set_next_word( +/*=======================*/ + CHARSET_INFO* charset, /*!< in: charset */ + fts_string_t* word) /*!< in: current last word */ +{ + ulint selected; + ibool last = FALSE; + + selected = fts_select_next_index(charset, word->f_str, word->f_len); + + /* If this was the last index then reset to start. */ + if (fts_index_selector[selected].value == 0) { + /* Reset the last optimized word to '' if no + more words could be read from the FTS index. */ + word->f_len = 0; + *word->f_str = 0; + + last = TRUE; + } else { + ulint value = fts_index_selector[selected].value; + + ut_a(value <= 0xff); + + /* Set to the first character of the next slot. */ + word->f_len = 1; + *word->f_str = (byte) value; + } + + return(last); +} + +/**********************************************************************//** +Optimize is complete. Set the completion time, and reset the optimize +start string for this FTS index to "". +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index_completed( +/*=========================*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index) /*!< in: table with one FTS index */ +{ + fts_string_t word; + dberr_t error; + byte buf[sizeof(ulint)]; +#ifdef FTS_OPTIMIZE_DEBUG + ib_time_t end_time = ut_time(); + + error = fts_optimize_set_index_end_time(optim->trx, index, end_time); +#endif + + /* If we've reached the end of the index then set the start + word to the empty string. */ + + word.f_len = 0; + word.f_str = buf; + *word.f_str = '\0'; + + error = fts_config_set_index_value( + optim->trx, index, FTS_LAST_OPTIMIZED_WORD, &word); + + if (error != DB_SUCCESS) { + + fprintf(stderr, "InnoDB: Error: (%s) while " + "updating last optimized word!\n", ut_strerr(error)); + } + + return(error); +} + + +/**********************************************************************//** +Read the list of words from the FTS auxiliary index that will be +optimized in this pass. 
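fts_optimize_index_read_words(), which follows, resumes from the word persisted under FTS_LAST_OPTIMIZED_WORD, and fts_optimize_index_completed() above resets that word to the empty string once a full sweep finishes. The checkpoint-and-resume idea in miniature (save_checkpoint() is a hypothetical stand-in for the config-table write):

#include <stdio.h>
#include <string.h>

static char	checkpoint[32] = "";	/* stands in for the CONFIG row */

/* Hypothetical stand-in for fts_config_set_index_value(). */
static void
save_checkpoint(const char* word)
{
	strncpy(checkpoint, word, sizeof(checkpoint) - 1);
}

int
main(void)
{
	/* A sorted "index" of words. */
	const char*	words[] = {"apple", "banana", "cherry", "durian"};
	int		i;
	int		budget = 2;	/* words per pass */

	/* Run passes until a scan comes back empty, then reset the
	checkpoint to "", as fts_optimize_index_completed() does. */
	for (;;) {
		int	n = 0;

		for (i = 0; i < 4 && n < budget; i++) {
			if (strcmp(words[i], checkpoint) > 0) {
				printf("optimize %s\n", words[i]);
				save_checkpoint(words[i]);
				n++;
			}
		}

		if (n == 0) {
			save_checkpoint("");
			break;
		}
	}
	return(0);
}

Because the checkpoint is committed together with each word's rewritten nodes, a crash or lock-wait rollback simply re-runs from the last durable word instead of restarting the whole index.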
+@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index_read_words( +/*==========================*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index, /*!< in: table with one FTS index */ + fts_string_t* word) /*!< in: buffer to use */ +{ + dberr_t error = DB_SUCCESS; + + if (optim->del_list_regenerated) { + word->f_len = 0; + } else { + + /* Get the last word that was optimized from + the config table. */ + error = fts_config_get_index_value( + optim->trx, index, FTS_LAST_OPTIMIZED_WORD, word); + } + + /* If record not found then we start from the top. */ + if (error == DB_RECORD_NOT_FOUND) { + word->f_len = 0; + error = DB_SUCCESS; + } + + while (error == DB_SUCCESS) { + + error = fts_index_fetch_words( + optim, word, fts_num_word_optimize); + + if (error == DB_SUCCESS) { + + /* If the search returned an empty set + try the next index in the horizontal split. */ + if (optim->zip->n_words > 0) { + break; + } else { + + fts_optimize_set_next_word( + optim->fts_index_table.charset, + word); + + if (word->f_len == 0) { + break; + } + } + } + } + + return(error); +} + +/**********************************************************************//** +Run OPTIMIZE on the given FTS index. Note: this can take a very long +time (hours). +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index( +/*===============*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index) /*!< in: table with one FTS index */ +{ + fts_string_t word; + dberr_t error; + byte str[FTS_MAX_WORD_LEN + 1]; + + /* Set the current index that we have to optimize. */ + optim->fts_index_table.index_id = index->id; + optim->fts_index_table.charset = fts_index_get_charset(index); + + optim->done = FALSE; /* Optimize until !done */ + + /* We need to read the last word optimized so that we start from + the next word. */ + word.f_str = str; + + /* We set the length of word to the size of str since we + need to pass the max len info to the fts_get_config_value() function. */ + word.f_len = sizeof(str) - 1; + + memset(word.f_str, 0x0, word.f_len); + + /* Read the words that will be optimized in this pass. */ + error = fts_optimize_index_read_words(optim, index, &word); + + if (error == DB_SUCCESS) { + int zip_error; + + ut_a(optim->zip->pos == 0); + ut_a(optim->zip->zp->total_in == 0); + ut_a(optim->zip->zp->total_out == 0); + + zip_error = inflateInit(optim->zip->zp); + ut_a(zip_error == Z_OK); + + word.f_len = 0; + word.f_str = str; + + /* Read the first word to optimize from the Zip buffer. */ + if (!fts_zip_read_word(optim->zip, &word)) { + + optim->done = TRUE; + } else { + fts_optimize_words(optim, index, &word); + } + + /* If we couldn't read any records then optimize is + complete. Increment the number of indexes that have + been optimized and set FTS index optimize state to + completed. */ + if (error == DB_SUCCESS && optim->zip->n_words == 0) { + + error = fts_optimize_index_completed(optim, index); + + if (error == DB_SUCCESS) { + ++optim->n_completed; + } + } + } + + return(error); +} + +/**********************************************************************//** +Delete the document ids in the delete, and delete cache tables. 
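The purge code that follows converts each doc id to "storage" byte order with fts_write_doc_id() before binding it to the delete statement. For an 8-byte doc id that is the big-endian mirror image of the 4-byte read sketched earlier (write_be64 is a name invented here):

#include <stdio.h>

typedef unsigned long long	u64;

/* Store a 64-bit value most significant byte first, the
counterpart of fts_write_doc_id()'s "storage" byte order. */
static void
write_be64(unsigned char* b, u64 v)
{
	int	i;

	for (i = 7; i >= 0; i--) {
		b[i] = (unsigned char) (v & 0xff);
		v >>= 8;
	}
}

int
main(void)
{
	unsigned char	buf[8];
	int		i;

	write_be64(buf, 258);

	for (i = 0; i < 8; i++) {
		printf("%02x ", buf[i]);	/* 00 00 00 00 00 00 01 02 */
	}
	printf("\n");
	return(0);
}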
+@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_deleted_doc_ids( +/*===============================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + fts_update_t* update; + char* sql_str; + doc_id_t write_doc_id; + dberr_t error = DB_SUCCESS; + + info = pars_info_create(); + + ut_a(ib_vector_size(optim->to_delete->doc_ids) > 0); + + update = static_cast<fts_update_t*>( + ib_vector_get(optim->to_delete->doc_ids, 0)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, update->doc_id); + + /* This is required for the SQL parser to work. It must be able + to find the following variables. So we do it twice. */ + fts_bind_doc_id(info, "doc_id1", &write_doc_id); + fts_bind_doc_id(info, "doc_id2", &write_doc_id); + + /* Since we only replace the table_id and don't construct the full + name, we do substitution ourselves. Remember to free sql_str. */ + sql_str = ut_strreplace( + fts_delete_doc_ids_sql, "%s", optim->name_prefix); + + graph = fts_parse_sql(NULL, info, sql_str); + + mem_free(sql_str); + + /* Delete the doc ids that were copied at the start. */ + for (i = 0; i < ib_vector_size(optim->to_delete->doc_ids); ++i) { + + update = static_cast<fts_update_t*>(ib_vector_get( + optim->to_delete->doc_ids, i)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, update->doc_id); + + fts_bind_doc_id(info, "doc_id1", &write_doc_id); + + fts_bind_doc_id(info, "doc_id2", &write_doc_id); + + error = fts_eval_sql(optim->trx, graph); + + // FIXME: Check whether delete actually succeeded! + if (error != DB_SUCCESS) { + + fts_sql_rollback(optim->trx); + break; + } + } + + fts_que_graph_free(graph); + + return(error); +} + +/**********************************************************************//** +Delete the document ids in the pending delete, and delete tables. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_deleted_doc_id_snapshot( +/*=======================================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + que_t* graph; + char* sql_str; + + /* Since we only replace the table_id and don't construct + the full name, we do the '%s' substitution ourselves. */ + sql_str = ut_strreplace(fts_end_delete_sql, "%s", optim->name_prefix); + + /* Delete the doc ids that were copied to delete pending state at + the start of optimize. */ + graph = fts_parse_sql(NULL, NULL, sql_str); + + mem_free(sql_str); + + error = fts_eval_sql(optim->trx, graph); + fts_que_graph_free(graph); + + return(error); +} + +/**********************************************************************//** +Copy the deleted doc ids that will be purged during this optimize run +to the being deleted FTS auxiliary tables. The transaction is committed +upon successfull copy and rolled back on DB_DUPLICATE_KEY error. +@return DB_SUCCESS if all OK */ +static +ulint +fts_optimize_being_deleted_count( +/*=============================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "BEING_DELETED", FTS_COMMON_TABLE, + optim->table); + + return(fts_get_rows_count(&fts_table)); +} + +/*********************************************************************//** +Copy the deleted doc ids that will be purged during this optimize run +to the being deleted FTS auxiliary tables. 
The transaction is committed +upon successfull copy and rolled back on DB_DUPLICATE_KEY error. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_create_deleted_doc_id_snapshot( +/*========================================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + que_t* graph; + char* sql_str; + + /* Since we only replace the table_id and don't construct the + full name, we do the substitution ourselves. */ + sql_str = ut_strreplace(fts_init_delete_sql, "%s", optim->name_prefix); + + /* Move doc_ids that are to be deleted to state being deleted. */ + graph = fts_parse_sql(NULL, NULL, sql_str); + + mem_free(sql_str); + + error = fts_eval_sql(optim->trx, graph); + + fts_que_graph_free(graph); + + if (error != DB_SUCCESS) { + fts_sql_rollback(optim->trx); + } else { + fts_sql_commit(optim->trx); + } + + optim->del_list_regenerated = TRUE; + + return(error); +} + +/*********************************************************************//** +Read in the document ids that are to be purged during optimize. The +transaction is committed upon successfully read. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_read_deleted_doc_id_snapshot( +/*======================================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + + optim->fts_common_table.suffix = "BEING_DELETED"; + + /* Read the doc_ids to delete. */ + error = fts_table_fetch_doc_ids( + optim->trx, &optim->fts_common_table, optim->to_delete); + + if (error == DB_SUCCESS) { + + optim->fts_common_table.suffix = "BEING_DELETED_CACHE"; + + /* Read additional doc_ids to delete. */ + error = fts_table_fetch_doc_ids( + optim->trx, &optim->fts_common_table, optim->to_delete); + } + + if (error != DB_SUCCESS) { + + fts_doc_ids_free(optim->to_delete); + optim->to_delete = NULL; + } + + return(error); +} + +/*********************************************************************//** +Optimze all the FTS indexes, skipping those that have already been +optimized, since the FTS auxiliary indexes are not guaranteed to be +of the same cardinality. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_indexes( +/*=================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + fts_t* fts = optim->table->fts; + + /* Optimize the FTS indexes. */ + for (i = 0; i < ib_vector_size(fts->indexes); ++i) { + dict_index_t* index; + +#ifdef FTS_OPTIMIZE_DEBUG + ib_time_t end_time; + ib_time_t start_time; + + /* Get the start and end optimize times for this index. */ + error = fts_optimize_get_index_start_time( + optim->trx, index, &start_time); + + if (error != DB_SUCCESS) { + break; + } + + error = fts_optimize_get_index_end_time( + optim->trx, index, &end_time); + + if (error != DB_SUCCESS) { + break; + } + + /* Start time will be 0 only for the first time or after + completing the optimization of all FTS indexes. */ + if (start_time == 0) { + start_time = ut_time(); + + error = fts_optimize_set_index_start_time( + optim->trx, index, start_time); + } + + /* Check if this index needs to be optimized or not. 
*/ + if (ut_difftime(end_time, start_time) < 0) { + error = fts_optimize_index(optim, index); + + if (error != DB_SUCCESS) { + break; + } + } else { + ++optim->n_completed; + } +#endif + index = static_cast<dict_index_t*>( + ib_vector_getp(fts->indexes, i)); + error = fts_optimize_index(optim, index); + } + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Cleanup the snapshot tables and the master deleted table. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_snapshot( +/*========================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + + /* Delete the doc ids from the master deleted tables, that were + in the snapshot that was taken at the start of optimize. */ + error = fts_optimize_purge_deleted_doc_ids(optim); + + if (error == DB_SUCCESS) { + /* Destroy the deleted doc id snapshot. */ + error = fts_optimize_purge_deleted_doc_id_snapshot(optim); + } + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Reset the start time to 0 so that a new optimize can be started. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_optimize_reset_start_time( +/*==========================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error = DB_SUCCESS; +#ifdef FTS_OPTIMIZE_DEBUG + fts_t* fts = optim->table->fts; + + /* Optimization should have been completed for all indexes. */ + ut_a(optim->n_completed == ib_vector_size(fts->indexes)); + + for (uint i = 0; i < ib_vector_size(fts->indexes); ++i) { + dict_index_t* index; + + ib_time_t start_time = 0; + + /* Reset the start time to 0 for this index. */ + error = fts_optimize_set_index_start_time( + optim->trx, index, start_time); + + index = static_cast<dict_index_t*>( + ib_vector_getp(fts->indexes, i)); + } +#endif + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Run OPTIMIZE on the given table by a background thread. +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull)) +dberr_t +fts_optimize_table_bk( +/*==================*/ + fts_slot_t* slot) /*!< in: table to optimiza */ +{ + dberr_t error; + dict_table_t* table = slot->table; + fts_t* fts = table->fts; + + /* Avoid optimizing tables that were optimized recently. */ + if (slot->last_run > 0 + && (ut_time() - slot->last_run) < slot->interval_time) { + + return(DB_SUCCESS); + + } else if (fts && fts->cache + && fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) { + + error = fts_optimize_table(table); + + if (error == DB_SUCCESS) { + slot->state = FTS_STATE_DONE; + slot->last_run = 0; + slot->completed = ut_time(); + } + } else { + error = DB_SUCCESS; + } + + /* Note time this run completed. */ + slot->last_run = ut_time(); + + return(error); +} +/*********************************************************************//** +Run OPTIMIZE on the given table. 
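fts_optimize_table_bk() above gates background work on two conditions: the table must not have been optimized within slot->interval_time, and the cache must have accumulated at least FTS_OPTIMIZE_THRESHOLD deleted documents. The predicate in isolation (the threshold value below is illustrative, not InnoDB's constant):

#include <stdio.h>
#include <time.h>

/* Illustrative only; the real FTS_OPTIMIZE_THRESHOLD is an
InnoDB constant. */
#define DELETED_THRESHOLD	100

static int
should_optimize(time_t now, time_t last_run, time_t interval,
		unsigned deleted)
{
	if (last_run > 0 && now - last_run < interval) {
		return(0);	/* optimized too recently */
	}

	return(deleted >= DELETED_THRESHOLD);
}

int
main(void)
{
	time_t	now = time(NULL);

	printf("%d\n", should_optimize(now, 0, 300, 150));	/* 1 */
	printf("%d\n", should_optimize(now, now - 10, 300, 150));	/* 0 */
	printf("%d\n", should_optimize(now, 0, 300, 3));	/* 0 */
	return(0);
}

Throttling on deletions rather than insertions makes sense here: OPTIMIZE exists to reclaim space and skip-cost from deleted doc ids, so a table with few deletions gains little from a pass.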
+@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +fts_optimize_table( +/*===============*/ + dict_table_t* table) /*!< in: table to optimiza */ +{ + dberr_t error = DB_SUCCESS; + fts_optimize_t* optim = NULL; + fts_t* fts = table->fts; + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: FTS start optimize %s\n", table->name); + + optim = fts_optimize_create(table); + + // FIXME: Call this only at the start of optimize, currently we + // rely on DB_DUPLICATE_KEY to handle corrupting the snapshot. + + /* Check whether there are still records in BEING_DELETED table */ + if (fts_optimize_being_deleted_count(optim) == 0) { + /* Take a snapshot of the deleted document ids, they are copied + to the BEING_ tables. */ + error = fts_optimize_create_deleted_doc_id_snapshot(optim); + } + + /* A duplicate error is OK, since we don't erase the + doc ids from the being deleted state until all FTS + indexes have been optimized. */ + if (error == DB_DUPLICATE_KEY) { + error = DB_SUCCESS; + } + + if (error == DB_SUCCESS) { + + /* These document ids will be filtered out during the + index optimization phase. They are in the snapshot that we + took above, at the start of the optimize. */ + error = fts_optimize_read_deleted_doc_id_snapshot(optim); + + if (error == DB_SUCCESS) { + + /* Commit the read of being deleted + doc ids transaction. */ + fts_sql_commit(optim->trx); + + /* We would do optimization only if there + are deleted records to be cleaned up */ + if (ib_vector_size(optim->to_delete->doc_ids) > 0) { + error = fts_optimize_indexes(optim); + } + + } else { + ut_a(optim->to_delete == NULL); + } + + /* Only after all indexes have been optimized can we + delete the (snapshot) doc ids in the pending delete, + and master deleted tables. */ + if (error == DB_SUCCESS + && optim->n_completed == ib_vector_size(fts->indexes)) { + + if (fts_enable_diag_print) { + fprintf(stderr, "FTS_OPTIMIZE: Completed " + "Optimize, cleanup DELETED " + "table\n"); + } + + if (ib_vector_size(optim->to_delete->doc_ids) > 0) { + + /* Purge the doc ids that were in the + snapshot from the snapshot tables and + the master deleted table. */ + error = fts_optimize_purge_snapshot(optim); + } + + if (error == DB_SUCCESS) { + /* Reset the start time of all the FTS indexes + so that optimize can be restarted. */ + error = fts_optimize_reset_start_time(optim); + } + } + } + + fts_optimize_free(optim); + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: FTS end optimize %s\n", table->name); + + return(error); +} + +/********************************************************************//** +Add the table to add to the OPTIMIZER's list. +@return new message instance */ +static +fts_msg_t* +fts_optimize_create_msg( +/*====================*/ + fts_msg_type_t type, /*!< in: type of message */ + void* ptr) /*!< in: message payload */ +{ + mem_heap_t* heap; + fts_msg_t* msg; + + heap = mem_heap_create(sizeof(*msg) + sizeof(ib_list_node_t) + 16); + msg = static_cast<fts_msg_t*>(mem_heap_alloc(heap, sizeof(*msg))); + + msg->ptr = ptr; + msg->type = type; + msg->heap = heap; + + return(msg); +} + +/**********************************************************************//** +Add the table to add to the OPTIMIZER's list. 
*/ +UNIV_INTERN +void +fts_optimize_add_table( +/*===================*/ + dict_table_t* table) /*!< in: table to add */ +{ + fts_msg_t* msg; + + if (!fts_optimize_wq) { + return; + } + + ut_ad(table->cached && table->fts != NULL); + + /* Make sure table with FTS index cannot be evicted */ + if (table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(table); + } + + msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table); + + ib_wqueue_add(fts_optimize_wq, msg, msg->heap); +} + +/**********************************************************************//** +Optimize a table. */ +UNIV_INTERN +void +fts_optimize_do_table( +/*==================*/ + dict_table_t* table) /*!< in: table to optimize */ +{ + fts_msg_t* msg; + + /* Optimizer thread could be shutdown */ + if (!fts_optimize_wq) { + return; + } + + msg = fts_optimize_create_msg(FTS_MSG_OPTIMIZE_TABLE, table); + + ib_wqueue_add(fts_optimize_wq, msg, msg->heap); +} + +/**********************************************************************//** +Remove the table from the OPTIMIZER's list. We do wait for +acknowledgement from the consumer of the message. */ +UNIV_INTERN +void +fts_optimize_remove_table( +/*======================*/ + dict_table_t* table) /*!< in: table to remove */ +{ + fts_msg_t* msg; + os_event_t event; + fts_msg_del_t* remove; + + /* if the optimize system not yet initialized, return */ + if (!fts_optimize_wq) { + return; + } + + /* FTS optimizer thread is already exited */ + if (fts_opt_start_shutdown) { + ib_logf(IB_LOG_LEVEL_INFO, + "Try to remove table %s after FTS optimize" + " thread exiting.", table->name); + return; + } + + msg = fts_optimize_create_msg(FTS_MSG_DEL_TABLE, NULL); + + /* We will wait on this event until signalled by the consumer. */ + event = os_event_create(); + + remove = static_cast<fts_msg_del_t*>( + mem_heap_alloc(msg->heap, sizeof(*remove))); + + remove->table = table; + remove->event = event; + msg->ptr = remove; + + ib_wqueue_add(fts_optimize_wq, msg, msg->heap); + + os_event_wait(event); + + os_event_free(event); +} + +/**********************************************************************//** +Find the slot for a particular table. +@return slot if found else NULL. */ +static +fts_slot_t* +fts_optimize_find_slot( +/*===================*/ + ib_vector_t* tables, /*!< in: vector of tables */ + const dict_table_t* table) /*!< in: table to add */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(tables); ++i) { + fts_slot_t* slot; + + slot = static_cast<fts_slot_t*>(ib_vector_get(tables, i)); + + if (slot->table->id == table->id) { + return(slot); + } + } + + return(NULL); +} + +/**********************************************************************//** +Start optimizing table. */ +static +void +fts_optimize_start_table( +/*=====================*/ + ib_vector_t* tables, /*!< in/out: vector of tables */ + dict_table_t* table) /*!< in: table to optimize */ +{ + fts_slot_t* slot; + + slot = fts_optimize_find_slot(tables, table); + + if (slot == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: table %s not registered " + "with the optimize thread.\n", table->name); + } else { + slot->last_run = 0; + slot->completed = 0; + } +} + +/**********************************************************************//** +Add the table to the vector if it doesn't already exist. 
*/ +static +ibool +fts_optimize_new_table( +/*===================*/ + ib_vector_t* tables, /*!< in/out: vector of tables */ + dict_table_t* table) /*!< in: table to add */ +{ + ulint i; + fts_slot_t* slot; + ulint empty_slot = ULINT_UNDEFINED; + + /* Search for duplicates, also find a free slot if one exists. */ + for (i = 0; i < ib_vector_size(tables); ++i) { + + slot = static_cast<fts_slot_t*>( + ib_vector_get(tables, i)); + + if (slot->state == FTS_STATE_EMPTY) { + empty_slot = i; + } else if (slot->table->id == table->id) { + /* Already exists in our optimize queue. */ + return(FALSE); + } + } + + /* Reuse old slot. */ + if (empty_slot != ULINT_UNDEFINED) { + + slot = static_cast<fts_slot_t*>( + ib_vector_get(tables, empty_slot)); + + ut_a(slot->state == FTS_STATE_EMPTY); + + } else { /* Create a new slot. */ + + slot = static_cast<fts_slot_t*>(ib_vector_push(tables, NULL)); + } + + memset(slot, 0x0, sizeof(*slot)); + + slot->table = table; + slot->table_id = table->id; + slot->state = FTS_STATE_LOADED; + slot->interval_time = FTS_OPTIMIZE_INTERVAL_IN_SECS; + + return(TRUE); +} + +/**********************************************************************//** +Remove the table from the vector if it exists. */ +static +ibool +fts_optimize_del_table( +/*===================*/ + ib_vector_t* tables, /*!< in/out: vector of tables */ + fts_msg_del_t* msg) /*!< in: table to delete */ +{ + ulint i; + dict_table_t* table = msg->table; + + for (i = 0; i < ib_vector_size(tables); ++i) { + fts_slot_t* slot; + + slot = static_cast<fts_slot_t*>(ib_vector_get(tables, i)); + + /* FIXME: Should we assert on this ? */ + if (slot->state != FTS_STATE_EMPTY + && slot->table->id == table->id) { + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: FTS Optimize Removing " + "table %s\n", table->name); + + slot->table = NULL; + slot->state = FTS_STATE_EMPTY; + + return(TRUE); + } + } + + return(FALSE); +} + +/**********************************************************************//** +Calculate how many of the registered tables need to be optimized. +@return no. of tables to optimize */ +static +ulint +fts_optimize_how_many( +/*==================*/ + const ib_vector_t* tables) /*!< in: registered tables + vector*/ +{ + ulint i; + ib_time_t delta; + ulint n_tables = 0; + ib_time_t current_time; + + current_time = ut_time(); + + for (i = 0; i < ib_vector_size(tables); ++i) { + const fts_slot_t* slot; + + slot = static_cast<const fts_slot_t*>( + ib_vector_get_const(tables, i)); + + switch (slot->state) { + case FTS_STATE_DONE: + case FTS_STATE_LOADED: + ut_a(slot->completed <= current_time); + + delta = current_time - slot->completed; + + /* Skip slots that have been optimized recently. */ + if (delta >= slot->interval_time) { + ++n_tables; + } + break; + + case FTS_STATE_RUNNING: + ut_a(slot->last_run <= current_time); + + delta = current_time - slot->last_run; + + if (delta > slot->interval_time) { + ++n_tables; + } + break; + + /* Slots in a state other than the above + are ignored. */ + case FTS_STATE_EMPTY: + case FTS_STATE_SUSPENDED: + break; + } + + } + + return(n_tables); +} + +/**********************************************************************//** +Check if the total memory used by all FTS table exceeds the maximum limit. 
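fts_is_sync_needed(), which follows, walks every registered slot, sums each table's FTS cache size, and reports that a sync is needed as soon as the running total exceeds fts_max_total_cache_size; it also rate-limits itself to one scan per five seconds. The accumulation with early-out reduces to:

#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	/* Per-table FTS cache sizes (bytes), as in
	slot->table->fts->cache->total_size. */
	const size_t	cache_size[] = {100, 250, 4000, 80};
	const size_t	limit = 4000;	/* cf. fts_max_total_cache_size */
	size_t		total = 0;
	size_t		i;

	for (i = 0; i < sizeof(cache_size) / sizeof(cache_size[0]); i++) {
		total += cache_size[i];

		if (total > limit) {
			/* Early out: no need to finish the scan. */
			printf("sync needed after %zu tables\n", i + 1);
			return(0);
		}
	}

	printf("no sync needed\n");
	return(0);
}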
+/**********************************************************************//**
+Check if the total memory used by all FTS tables exceeds the maximum limit.
+@return true if a sync is needed, false otherwise */
+static
+bool
+fts_is_sync_needed(
+/*===============*/
+	const ib_vector_t*	tables)	/*!< in: vector of registered tables */
+{
+	ulint	total_memory = 0;
+	double	time_diff = difftime(ut_time(), last_check_sync_time);
+
+	if (fts_need_sync || time_diff < 5) {
+		return(false);
+	}
+
+	last_check_sync_time = ut_time();
+
+	for (ulint i = 0; i < ib_vector_size(tables); ++i) {
+		const fts_slot_t*	slot;
+
+		slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(tables, i));
+
+		if (slot->state != FTS_STATE_EMPTY && slot->table
+		    && slot->table->fts) {
+			total_memory += slot->table->fts->cache->total_size;
+		}
+
+		if (total_memory > fts_max_total_cache_size) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+#if 0
+/*********************************************************************//**
+Check whether a table needs to be optimized. */
+static
+void
+fts_optimize_need_sync(
+/*===================*/
+	ib_vector_t*	tables)	/*!< in: list of tables */
+{
+	dict_table_t*	table = NULL;
+	fts_slot_t*	slot;
+	ulint		num_table = ib_vector_size(tables);
+
+	if (!num_table) {
+		return;
+	}
+
+	if (fts_optimize_sync_iterator >= num_table) {
+		fts_optimize_sync_iterator = 0;
+	}
+
+	slot = ib_vector_get(tables, fts_optimize_sync_iterator);
+	table = slot->table;
+
+	if (!table) {
+		return;
+	}
+
+	ut_ad(table->fts);
+
+	if (table->fts->cache) {
+		ulint	deleted = table->fts->cache->deleted;
+
+		if (table->fts->cache->added
+		    >= fts_optimize_add_threshold) {
+			fts_sync_table(table);
+		} else if (deleted >= fts_optimize_delete_threshold) {
+			fts_optimize_do_table(table);
+
+			mutex_enter(&table->fts->cache->deleted_lock);
+			table->fts->cache->deleted -= deleted;
+			mutex_exit(&table->fts->cache->deleted_lock);
+		}
+	}
+
+	fts_optimize_sync_iterator++;
+
+	return;
+}
+#endif
+
+/**********************************************************************//**
+Optimize all FTS tables.
+@return dummy return value */
+UNIV_INTERN
+os_thread_ret_t
+fts_optimize_thread(
+/*================*/
+	void*	arg)	/*!< in: work queue */
+{
+	mem_heap_t*	heap;
+	ib_vector_t*	tables;
+	ib_alloc_t*	heap_alloc;
+	ulint		current = 0;
+	ibool		done = FALSE;
+	ulint		n_tables = 0;
+	os_event_t	exit_event = 0;
+	ulint		n_optimize = 0;
+	ib_wqueue_t*	wq = (ib_wqueue_t*) arg;
+
+	ut_ad(!srv_read_only_mode);
+	my_thread_init();
+
+	heap = mem_heap_create(sizeof(dict_table_t*) * 64);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	tables = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
+
+	while (!done && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+		/* If there is no message in the queue and we have tables
+		to optimize, then optimize the tables. */
+
+		if (!done
+		    && ib_wqueue_is_empty(wq)
+		    && n_tables > 0
+		    && n_optimize > 0) {
+
+			fts_slot_t*	slot;
+
+			ut_a(ib_vector_size(tables) > 0);
+
+			slot = static_cast<fts_slot_t*>(
+				ib_vector_get(tables, current));
+
+			/* Handle the case of empty slots. */
+			if (slot->state != FTS_STATE_EMPTY) {
+
+				slot->state = FTS_STATE_RUNNING;
+
+				fts_optimize_table_bk(slot);
+			}
+
+			++current;
+
+			/* Wrap around the counter. */
+			if (current >= ib_vector_size(tables)) {
+				n_optimize = fts_optimize_how_many(tables);
+
+				current = 0;
+			}
+
+		} else if (n_optimize == 0 || !ib_wqueue_is_empty(wq)) {
+			fts_msg_t*	msg;
+
+			msg = static_cast<fts_msg_t*>(
+				ib_wqueue_timedwait(wq,
+						    FTS_QUEUE_WAIT_IN_USECS));
+
+			/* Timeout? */
+			if (msg == NULL) {
+				if (fts_is_sync_needed(tables)) {
+					fts_need_sync = true;
+				}
+
+				continue;
+			}
+
+			switch (msg->type) {
+			case FTS_MSG_START:
+				break;
+
+			case FTS_MSG_PAUSE:
+				break;
+
+			case FTS_MSG_STOP:
+				done = TRUE;
+				exit_event = (os_event_t) msg->ptr;
+				break;
+
+			case FTS_MSG_ADD_TABLE:
+				ut_a(!done);
+				if (fts_optimize_new_table(
+					    tables,
+					    static_cast<dict_table_t*>(
+						    msg->ptr))) {
+					++n_tables;
+				}
+				break;
+
+			case FTS_MSG_OPTIMIZE_TABLE:
+				if (!done) {
+					fts_optimize_start_table(
+						tables,
+						static_cast<dict_table_t*>(
+							msg->ptr));
+				}
+				break;
+
+			case FTS_MSG_DEL_TABLE:
+				if (fts_optimize_del_table(
+					    tables, static_cast<fts_msg_del_t*>(
+						    msg->ptr))) {
+					--n_tables;
+				}
+
+				/* Signal the producer that we have
+				removed the table. */
+				os_event_set(
+					((fts_msg_del_t*) msg->ptr)->event);
+				break;
+
+			default:
+				ut_error;
+			}
+
+			mem_heap_free(msg->heap);
+
+			if (!done) {
+				n_optimize = fts_optimize_how_many(tables);
+			} else {
+				n_optimize = 0;
+			}
+		}
+	}
+
+	/* The server is being shut down; sync the data from the FTS cache
+	to disk if needed. */
+	if (n_tables > 0) {
+		ulint	i;
+
+		for (i = 0; i < ib_vector_size(tables); i++) {
+			fts_slot_t*	slot;
+
+			slot = static_cast<fts_slot_t*>(
+				ib_vector_get(tables, i));
+
+			if (slot->state != FTS_STATE_EMPTY) {
+				dict_table_t*	table = NULL;
+
+				/* slot->table may already have been freed,
+				so we try to open the table by
+				slot->table_id. */
+				table = dict_table_open_on_id(
+					slot->table_id, FALSE,
+					DICT_TABLE_OP_NORMAL);
+
+				if (table) {
+
+					if (dict_table_has_fts_index(table)) {
+						fts_sync_table(table);
+					}
+
+					if (table->fts) {
+						fts_free(table);
+					}
+
+					dict_table_close(table, FALSE, FALSE);
+				}
+			}
+		}
+	}
+
+	ib_vector_free(tables);
+
+	ib_logf(IB_LOG_LEVEL_INFO, "FTS optimize thread exiting.");
+
+	os_event_set(exit_event);
+	my_thread_end();
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit instead of return(). */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/**********************************************************************//**
+Start the optimize thread and create the work queue. */
+UNIV_INTERN
+void
+fts_optimize_init(void)
+/*===================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	/* For now we only support one optimize thread. */
+	ut_a(fts_optimize_wq == NULL);
+
+	fts_optimize_wq = ib_wqueue_create();
+	ut_a(fts_optimize_wq != NULL);
+	last_check_sync_time = ut_time();
+
+	os_thread_create(fts_optimize_thread, fts_optimize_wq, NULL);
+}
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if the optimize queue is initialized */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void)
+/*======================*/
+{
+	return(fts_optimize_wq != NULL);
+}
+
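
The main loop of fts_optimize_thread() above multiplexes two duties: when messages are pending, or nothing is due for optimizing, it blocks on the work queue with a timeout; otherwise it performs one optimize step per iteration, round-robin over the slots, and recomputes the due count whenever the cursor wraps. A hedged sketch of that timed-wait event loop, with std::condition_variable::wait_for standing in for ib_wqueue_timedwait(); the Loop type, the integer message codes, and the 100 ms timeout are all illustrative:

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// A single-consumer loop that alternates queue draining with
// round-robin background work, in the style of fts_optimize_thread().
struct Loop {
    std::mutex mu;
    std::condition_variable cv;
    std::queue<int> msgs;        // 0 = STOP sentinel, otherwise a table id
    std::vector<int> slots;      // registered "tables"
    bool done = false;

    void run() {
        std::size_t current = 0; // round-robin cursor
        std::size_t n_due = 0;   // slots still due in this round
        while (!done) {
            std::unique_lock<std::mutex> lk(mu);
            if (msgs.empty() && n_due > 0 && !slots.empty()) {
                // Background work: one optimize step per iteration.
                std::cout << "optimizing table " << slots[current] << "\n";
                if (++current >= slots.size()) {
                    current = 0;
                    n_due = 0;   // demo: one pass per scheduling round
                }
                continue;
            }
            // Block for a message, but only up to a timeout, so the loop
            // regains control periodically (cf. FTS_QUEUE_WAIT_IN_USECS).
            if (!cv.wait_for(lk, std::chrono::milliseconds(100),
                             [this] { return !msgs.empty(); })) {
                continue;        // timed out; nothing arrived
            }
            int msg = msgs.front();
            msgs.pop();
            if (msg == 0) {
                done = true;     // cf. the FTS_MSG_STOP case
            } else {
                slots.push_back(msg);
                n_due = slots.size();  // schedule a fresh round
            }
        }
    }

    void post(int msg) {
        std::lock_guard<std::mutex> lk(mu);
        msgs.push(msg);
        cv.notify_one();
    }
};

int main() {
    Loop loop;
    std::thread t(&Loop::run, &loop);
    loop.post(1);                // register table 1
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    loop.post(0);                // request shutdown
    t.join();
}

The timed wait is what keeps shutdown responsive: the loop re-tests its exit condition on every wakeup, so it can leave even when no further message arrives (the real loop additionally checks srv_shutdown_state each iteration).
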
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. */
+UNIV_INTERN
+void
+fts_optimize_start_shutdown(void)
+/*=============================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	fts_msg_t*	msg;
+	os_event_t	event;
+
+	/* If there is ongoing activity on the dictionary, such as
+	srv_master_evict_from_table_cache(), wait for it. */
+	dict_mutex_enter_for_mysql();
+
+	/* Tell the FTS optimizer system that we are exiting from the
+	optimizer thread; messages sent after this point will not be
+	processed. */
+	fts_opt_start_shutdown = true;
+	dict_mutex_exit_for_mysql();
+
+	/* We tell the OPTIMIZE thread to switch to the done state, but we
+	can't delete the work queue here because the add thread needs to
+	deregister the FTS tables. */
+	event = os_event_create();
+
+	msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL);
+	msg->ptr = event;
+
+	ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+
+	os_event_wait(event);
+	os_event_free(event);
+
+	ib_wqueue_free(fts_optimize_wq);
+}
+
+/**********************************************************************//**
+Reset the work queue. */
+UNIV_INTERN
+void
+fts_optimize_end(void)
+/*==================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	// FIXME: Potential race condition here: we should wait for
+	// the optimize thread to confirm shutdown.
+	fts_optimize_wq = NULL;
+}
diff --git a/storage/xtradb/fts/fts0pars.cc b/storage/xtradb/fts/fts0pars.cc
new file mode 100644
index 00000000000..7f0ba4e0c1b
--- /dev/null
+++ b/storage/xtradb/fts/fts0pars.cc
@@ -0,0 +1,2010 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison implementation for Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output. */
+#define YYBISON 1
+
+/* Bison version.
*/ +#define YYBISON_VERSION "2.5" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 1 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + +/* Using locations. */ +#define YYLSP_NEEDED 0 + +/* Substitute the variable and function names. */ +#define yyparse ftsparse +#define yylex ftslex +#define yyerror ftserror +#define yylval ftslval +#define yychar ftschar +#define yydebug ftsdebug +#define yynerrs ftsnerrs + + +/* Copy the first part of user declarations. */ + +/* Line 268 of yacc.c */ +#line 26 "fts0pars.y" + + +#include "mem0mem.h" +#include "fts0ast.h" +#include "fts0blex.h" +#include "fts0tlex.h" +#include "fts0pars.h" + +extern int fts_lexer(YYSTYPE*, fts_lexer_t*); +extern int fts_blexer(YYSTYPE*, yyscan_t); +extern int fts_tlexer(YYSTYPE*, yyscan_t); + +typedef int (*fts_scan)(); + +extern int ftserror(const char* p); + +/* Required for reentrant parser */ +#define ftslex fts_lexer + +#define YYERROR_VERBOSE + +/* For passing an argument to yyparse() */ +#define YYPARSE_PARAM state +#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer + +#define YYTOKENFREE(token) fts_ast_string_free((token)) + +typedef int (*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner); +typedef int (*fts_scanner)(); + +struct fts_lexer_t { + fts_scanner scanner; + void* yyscanner; +}; + + + +/* Line 268 of yacc.c */ +#line 115 "fts0pars.cc" + +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif + +/* Enabling verbose error messages. */ +#ifdef YYERROR_VERBOSE +# undef YYERROR_VERBOSE +# define YYERROR_VERBOSE 1 +#else +# define YYERROR_VERBOSE 0 +#endif + +/* Enabling the token table. */ +#ifndef YYTOKEN_TABLE +# define YYTOKEN_TABLE 0 +#endif + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + FTS_OPER = 258, + FTS_TEXT = 259, + FTS_TERM = 260, + FTS_NUMB = 261 + }; +#endif + + + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ + +/* Line 293 of yacc.c */ +#line 61 "fts0pars.y" + + int oper; + fts_ast_string_t* token; + fts_ast_node_t* node; + + + +/* Line 293 of yacc.c */ +#line 165 "fts0pars.cc" +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + + +/* Copy the second part of user declarations. */ + + +/* Line 343 of yacc.c */ +#line 177 "fts0pars.cc" + +#ifdef short +# undef short +#endif + +#ifdef YYTYPE_UINT8 +typedef YYTYPE_UINT8 yytype_uint8; +#else +typedef unsigned char yytype_uint8; +#endif + +#ifdef YYTYPE_INT8 +typedef YYTYPE_INT8 yytype_int8; +#elif (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +typedef signed char yytype_int8; +#else +typedef short int yytype_int8; +#endif + +#ifdef YYTYPE_UINT16 +typedef YYTYPE_UINT16 yytype_uint16; +#else +typedef unsigned short int yytype_uint16; +#endif + +#ifdef YYTYPE_INT16 +typedef YYTYPE_INT16 yytype_int16; +#else +typedef short int yytype_int16; +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif ! 
defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include <libintl.h> /* INFRINGES ON USER NAME SPACE */ +# define YY_(msgid) dgettext ("bison-runtime", msgid) +# endif +# endif +# ifndef YY_ +# define YY_(msgid) msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(e) ((void) (e)) +#else +# define YYUSE(e) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(n) (n) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int yyi) +#else +static int +YYID (yyi) + int yyi; +#endif +{ + return yyi; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include <alloca.h> /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include <malloc.h> /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! 
defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yytype_int16 yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from FROM to TO. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(To, From, Count) \ + __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# else +# define YYCOPY(To, From, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (To)[yyi] = (From)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 3 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 52 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 16 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 8 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 24 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 33 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 261 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 12, 13, 14, 7, 2, 8, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 10, 2, 11, 2, 15, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint8 yyprhs[] = +{ + 0, 0, 3, 5, 6, 9, 12, 16, 21, 23, + 25, 28, 32, 36, 39, 44, 47, 49, 51, 53, + 55, 57, 59, 61, 64 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. 
*/ +static const yytype_int8 yyrhs[] = +{ + 17, 0, -1, 18, -1, -1, 18, 20, -1, 18, + 19, -1, 12, 18, 13, -1, 21, 12, 18, 13, + -1, 22, -1, 23, -1, 22, 14, -1, 23, 15, + 6, -1, 21, 22, 14, -1, 21, 22, -1, 21, + 23, 15, 6, -1, 21, 23, -1, 8, -1, 7, + -1, 9, -1, 10, -1, 11, -1, 5, -1, 6, + -1, 14, 22, -1, 4, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. */ +static const yytype_uint8 yyrline[] = +{ + 0, 79, 79, 85, 89, 99, 111, 119, 129, 133, + 137, 141, 146, 152, 157, 164, 170, 174, 178, 182, + 186, 191, 196, 202, 207 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "FTS_OPER", "FTS_TEXT", "FTS_TERM", + "FTS_NUMB", "'+'", "'-'", "'~'", "'<'", "'>'", "'('", "')'", "'*'", + "'@'", "$accept", "query", "expr_lst", "sub_expr", "expr", "prefix", + "term", "text", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 43, 45, 126, + 60, 62, 40, 41, 42, 64 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 16, 17, 18, 18, 18, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, + 21, 22, 22, 22, 23 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 1, 0, 2, 2, 3, 4, 1, 1, + 2, 3, 3, 2, 4, 2, 1, 1, 1, 1, + 1, 1, 1, 2, 1 +}; + +/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE doesn't specify something else to do. Zero + means the default is an error. */ +static const yytype_uint8 yydefact[] = +{ + 3, 0, 2, 1, 24, 21, 22, 17, 16, 18, + 19, 20, 3, 0, 5, 4, 0, 8, 9, 0, + 23, 3, 13, 15, 10, 0, 6, 0, 12, 0, + 11, 7, 14 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + -1, 1, 2, 14, 15, 16, 17, 18 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -5 +static const yytype_int8 yypact[] = +{ + -5, 38, 18, -5, -5, -5, -5, -5, -5, -5, + -5, -5, -5, 31, -5, -5, 29, 30, 32, -4, + -5, -5, 34, 35, -5, 40, -5, 7, -5, 43, + -5, -5, -5 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -5, -5, 19, -5, -5, -5, 26, 36 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If YYTABLE_NINF, syntax error. */ +#define YYTABLE_NINF -1 +static const yytype_uint8 yytable[] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 26, + 13, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 31, 13, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 19, 13, 4, 5, 6, 5, 6, 3, 20, + 27, 21, 22, 13, 24, 13, 30, 25, 28, 32, + 29, 0, 23 +}; + +#define yypact_value_is_default(yystate) \ + ((yystate) == (-5)) + +#define yytable_value_is_error(yytable_value) \ + YYID (0) + +static const yytype_int8 yycheck[] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 12, 14, 4, 5, 6, 5, 6, 0, 13, + 21, 12, 16, 14, 14, 14, 6, 15, 14, 6, + 15, -1, 16 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. 
*/ +static const yytype_uint8 yystos[] = +{ + 0, 17, 18, 0, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 14, 19, 20, 21, 22, 23, 18, + 22, 12, 22, 23, 14, 15, 13, 18, 14, 15, + 6, 13, 6 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. However, + YYFAIL appears to be in use. Nevertheless, it is formally deprecated + in Bison 2.4.2's NEWS entry, where a plan to phase it out is + discussed. */ + +#define YYFAIL goto yyerrlab +#if defined YYFAIL + /* This is here to suppress warnings from the GCC cpp's + -Wunused-macros. Normally we don't worry about that warning, but + some users do, and we want to make it easy for users to remove + YYFAIL uses, which will produce warnings from Bison 2.5. */ +#endif + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (1); \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + + +#define YYTERROR 1 +#define YYERRCODE 256 + +#define YYERRCLEANUP \ +do \ + switch (yylastchar) \ + { \ + case FTS_NUMB: \ + case FTS_TEXT: \ + case FTS_TERM: \ + YYTOKENFREE(yylval.token); \ + break; \ + default: \ + break; \ + } \ +while (YYID (0)) + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined). */ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (YYID (N)) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (YYID (0)) +#endif + + +/* This macro is provided for backward compatibility. */ + +#ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (&yylval, YYLEX_PARAM) +#else +# define YYLEX yylex (&yylval) +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include <stdio.h> /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. 
| +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + yy_symbol_value_print (yyoutput, yytype, yyvaluep); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop) +#else +static void +yy_stack_print (yybottom, yytop) + yytype_int16 *yybottom; + yytype_int16 *yytop; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, int yyrule) +#else +static void +yy_reduce_print (yyvsp, yyrule) + YYSTYPE *yyvsp; + int yyrule; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + ); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, Rule); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. 
*/ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message + about the unexpected token YYTOKEN for the state stack whose top is + YYSSP. + + Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is + not large enough to hold the message. In that case, also set + *YYMSG_ALLOC to the required number of bytes. Return 2 if the + required number of bytes is too large to store. */ +static int +yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg, + yytype_int16 *yyssp, int yytoken) +{ + YYSIZE_T yysize0 = yytnamerr (0, yytname[yytoken]); + YYSIZE_T yysize = yysize0; + YYSIZE_T yysize1; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + /* Internationalized format string. */ + const char *yyformat = 0; + /* Arguments of yyformat. */ + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + /* Number of reported tokens (one for the "unexpected", one per + "expected"). */ + int yycount = 0; + + /* There are many possibilities here to consider: + - Assume YYFAIL is not used. It's too flawed to consider. 
See + <http://lists.gnu.org/archive/html/bison-patches/2009-12/msg00024.html> + for details. YYERROR is fine as it does not invoke this + function. + - If this state is a consistent state with a default action, then + the only way this function was invoked is if the default action + is an error action. In that case, don't check for expected + tokens because there are none. + - The only way there can be no lookahead present (in yychar) is if + this state is a consistent state with a default action. Thus, + detecting the absence of a lookahead is sufficient to determine + that there is no unexpected or expected token to report. In that + case, just report a simple "syntax error". + - Don't assume there isn't a lookahead just because this state is a + consistent state with a default action. There might have been a + previous inconsistent state, consistent state with a non-default + action, or user semantic action that manipulated yychar. + - Of course, the expected token list depends on states to have + correct lookahead information, and it depends on the parser not + to perform extra reductions after fetching a lookahead from the + scanner and before detecting a syntax error. Thus, state merging + (from LALR or IELR) and default reductions corrupt the expected + token list. However, the list is correct for canonical LR with + one exception: it will still contain any token that will not be + accepted due to an error action in a later state. + */ + if (yytoken != YYEMPTY) + { + int yyn = yypact[*yyssp]; + yyarg[yycount++] = yytname[yytoken]; + if (!yypact_value_is_default (yyn)) + { + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. In other words, skip the first -YYN actions for + this state because they are default actions. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yyx; + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR + && !yytable_value_is_error (yytable[yyx + yyn])) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + break; + } + yyarg[yycount++] = yytname[yyx]; + yysize1 = yysize + yytnamerr (0, yytname[yyx]); + if (! (yysize <= yysize1 + && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + } + } + } + + switch (yycount) + { +# define YYCASE_(N, S) \ + case N: \ + yyformat = S; \ + break + YYCASE_(0, YY_("syntax error")); + YYCASE_(1, YY_("syntax error, unexpected %s")); + YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s")); + YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s")); + YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s")); + YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s")); +# undef YYCASE_ + } + + yysize1 = yysize + yystrlen (yyformat); + if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + + if (*yymsg_alloc < yysize) + { + *yymsg_alloc = 2 * yysize; + if (! (yysize <= *yymsg_alloc + && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM)) + *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM; + return 1; + } + + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. 
*/ + { + char *yyp = *yymsg; + int yyi = 0; + while ((*yyp = *yyformat) != '\0') + if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyformat += 2; + } + else + { + yyp++; + yyformat++; + } + } + return 0; +} +#endif /* YYERROR_VERBOSE */ + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yymsg, yytype, yyvaluep) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + YYUSE (yyvaluep); + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + + default: + break; + } +} + + +/* Prevent warnings from -Wmissing-prototypes. */ +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int yyparse (void *YYPARSE_PARAM); +#else +int yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int yyparse (void); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ +/* The lookahead symbol. */ +int yychar; +/* The backup of yychar when there is an error and we're in yyerrlab. */ +int yylastchar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; + + /* Number of syntax errors so far. */ + int yynerrs; + + int yystate; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + + /* The stacks and their tools: + `yyss': related to states. + `yyvs': related to semantic values. + + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss; + yytype_int16 *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs; + YYSTYPE *yyvsp; + + YYSIZE_T yystacksize; + + int yyn; + int yyresult; + /* Lookahead token as an internal (translated) token number. */ + int yytoken; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + yytoken = 0; + yyss = yyssa; + yyvs = yyvsa; + yystacksize = YYINITDEPTH; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. 
*/ + yyssp = yyss; + yyvsp = yyvs; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token. 
*/ + yychar = YYEMPTY; + + yystate = yyn; + *++yyvsp = yylval; + + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. */ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 2: + +/* Line 1806 of yacc.c */ +#line 79 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + ((fts_ast_state_t*) state)->root = (yyval.node); + } + break; + + case 3: + +/* Line 1806 of yacc.c */ +#line 85 "fts0pars.y" + { + (yyval.node) = NULL; + } + break; + + case 4: + +/* Line 1806 of yacc.c */ +#line 89 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (2)].node); + + if (!(yyval.node)) { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(2) - (2)].node)); + } else { + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + } + break; + + case 5: + +/* Line 1806 of yacc.c */ +#line 99 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (2)].node); + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + + if (!(yyval.node)) { + (yyval.node) = (yyvsp[(2) - (2)].node); + } else { + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + } + break; + + case 6: + +/* Line 1806 of yacc.c */ +#line 111 "fts0pars.y" + { + (yyval.node) = (yyvsp[(2) - (3)].node); + + if ((yyval.node)) { + (yyval.node) = fts_ast_create_node_subexp_list(state, (yyval.node)); + } + } + break; + + case 7: + +/* Line 1806 of yacc.c */ +#line 119 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node)); + + if ((yyvsp[(3) - (4)].node)) { + fts_ast_add_node((yyval.node), + fts_ast_create_node_subexp_list(state, (yyvsp[(3) - (4)].node))); + } + } + break; + + case 8: + +/* Line 1806 of yacc.c */ +#line 129 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + } + break; + + case 9: + +/* Line 1806 of yacc.c */ +#line 133 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + } + break; + + case 10: + +/* Line 1806 of yacc.c */ +#line 137 "fts0pars.y" + { + fts_ast_term_set_wildcard((yyvsp[(1) - (2)].node)); + } + break; + + case 11: + +/* Line 1806 of yacc.c */ +#line 141 "fts0pars.y" + { + fts_ast_term_set_distance((yyvsp[(1) - (3)].node), fts_ast_string_to_ul((yyvsp[(3) - (3)].token), 10)); + fts_ast_string_free((yyvsp[(3) - (3)].token)); + } + break; + + case 12: + +/* Line 1806 of yacc.c */ +#line 146 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (3)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (3)].node)); + fts_ast_term_set_wildcard((yyvsp[(2) - (3)].node)); + } + break; + + case 13: + +/* Line 1806 of yacc.c */ +#line 152 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + break; + + case 14: + +/* Line 
1806 of yacc.c */ +#line 157 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (4)].node)); + fts_ast_term_set_distance((yyvsp[(2) - (4)].node), fts_ast_string_to_ul((yyvsp[(4) - (4)].token), 10)); + fts_ast_string_free((yyvsp[(4) - (4)].token)); + } + break; + + case 15: + +/* Line 1806 of yacc.c */ +#line 164 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + break; + + case 16: + +/* Line 1806 of yacc.c */ +#line 170 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_IGNORE); + } + break; + + case 17: + +/* Line 1806 of yacc.c */ +#line 174 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_EXIST); + } + break; + + case 18: + +/* Line 1806 of yacc.c */ +#line 178 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_NEGATE); + } + break; + + case 19: + +/* Line 1806 of yacc.c */ +#line 182 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_DECR_RATING); + } + break; + + case 20: + +/* Line 1806 of yacc.c */ +#line 186 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_INCR_RATING); + } + break; + + case 21: + +/* Line 1806 of yacc.c */ +#line 191 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + case 22: + +/* Line 1806 of yacc.c */ +#line 196 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + case 23: + +/* Line 1806 of yacc.c */ +#line 202 "fts0pars.y" + { + (yyval.node) = (yyvsp[(2) - (2)].node); + } + break; + + case 24: + +/* Line 1806 of yacc.c */ +#line 207 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_text(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + + +/* Line 1806 of yacc.c */ +#line 1663 "fts0pars.cc" + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. 
+| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* Backup yychar, in case we would change it. */ + yylastchar = yychar; + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar); + + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! YYERROR_VERBOSE + yyerror (YY_("syntax error")); +#else +# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \ + yyssp, yytoken) + { + char const *yymsgp = YY_("syntax error"); + int yysyntax_error_status; + yysyntax_error_status = YYSYNTAX_ERROR; + if (yysyntax_error_status == 0) + yymsgp = yymsg; + else if (yysyntax_error_status == 1) + { + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc); + if (!yymsg) + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + yysyntax_error_status = 2; + } + else + { + yysyntax_error_status = YYSYNTAX_ERROR; + yymsgp = yymsg; + } + } + yyerror (yymsgp); + if (yysyntax_error_status == 2) + goto yyexhaustedlab; + } +# undef YYSYNTAX_ERROR +#endif + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + { + /* Since we don't need the token, we have to free it first. */ + YYERRCLEANUP; + YYABORT; + } + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + /* Do not reclaim the symbols of the rule which action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + { + /* Since we don't need the error token, we have to free it first. */ + YYERRCLEANUP; + YYABORT; + } + + + yydestruct ("Error: popping", + yystos[yystate], yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + *++yyvsp = yylval; + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. 
| +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + +#if !defined(yyoverflow) || YYERROR_VERBOSE +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. | +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (YY_("memory exhausted")); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: + if (yychar != YYEMPTY) + { + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = YYTRANSLATE (yychar); + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval); + } + /* Do not reclaim the symbols of the rule which action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + yystos[*yyssp], yyvsp); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif +#if YYERROR_VERBOSE + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); +#endif + /* Make sure YYID is used. */ + return YYID (yyresult); +} + + + +/* Line 2067 of yacc.c */ +#line 212 "fts0pars.y" + + +/******************************************************************** +*/ +int +ftserror( +/*=====*/ + const char* p) +{ + my_printf_error(ER_PARSE_ERROR, "%s", MYF(0), p); + return(0); +} + +/******************************************************************** +Create a fts_lexer_t instance.*/ + +fts_lexer_t* +fts_lexer_create( +/*=============*/ + ibool boolean_mode, + const byte* query, + ulint query_len) +{ + fts_lexer_t* fts_lexer = static_cast<fts_lexer_t*>( + ut_malloc(sizeof(fts_lexer_t))); + + if (boolean_mode) { + fts0blex_init(&fts_lexer->yyscanner); + fts0b_scan_bytes( + reinterpret_cast<const char*>(query), + static_cast<int>(query_len), + fts_lexer->yyscanner); + fts_lexer->scanner = reinterpret_cast<fts_scan>(fts_blexer); + /* FIXME: Debugging */ + /* fts0bset_debug(1 , fts_lexer->yyscanner); */ + } else { + fts0tlex_init(&fts_lexer->yyscanner); + fts0t_scan_bytes( + reinterpret_cast<const char*>(query), + static_cast<int>(query_len), + fts_lexer->yyscanner); + fts_lexer->scanner = reinterpret_cast<fts_scan>(fts_tlexer); + } + + return(fts_lexer); +} + +/******************************************************************** +Free an fts_lexer_t instance.*/ +void + +fts_lexer_free( +/*===========*/ + fts_lexer_t* fts_lexer) +{ + if (fts_lexer->scanner == (fts_scan) fts_blexer) { + fts0blex_destroy(fts_lexer->yyscanner); + } else { + fts0tlex_destroy(fts_lexer->yyscanner); + } + + ut_free(fts_lexer); +} + +/******************************************************************** +Call the appropaiate scanner.*/ + +int +fts_lexer( +/*======*/ + YYSTYPE* val, + fts_lexer_t* fts_lexer) +{ + fts_scanner_alt func_ptr; + + func_ptr = (fts_scanner_alt) fts_lexer->scanner; + + return(func_ptr(val, fts_lexer->yyscanner)); +} + +/******************************************************************** +Parse the query.*/ +int +fts_parse( +/*======*/ + fts_ast_state_t* state) +{ + return(ftsparse(state)); +} + diff --git a/storage/xtradb/fts/fts0pars.y b/storage/xtradb/fts/fts0pars.y new file mode 100644 index 00000000000..e48036e82fe --- /dev/null +++ b/storage/xtradb/fts/fts0pars.y @@ -0,0 +1,294 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** + * @file fts/fts0pars.y + * FTS parser: input file for the GNU Bison parser generator + * + * Created 2007/5/9 Sunny Bains + */ + +%{ + +#include "mem0mem.h" +#include "fts0ast.h" +#include "fts0blex.h" +#include "fts0tlex.h" +#include "fts0pars.h" + +extern int fts_lexer(YYSTYPE*, fts_lexer_t*); +extern int fts_blexer(YYSTYPE*, yyscan_t); +extern int fts_tlexer(YYSTYPE*, yyscan_t); + +typedef int (*fts_scan)(); + +extern int ftserror(const char* p); + +/* Required for reentrant parser */ +#define ftslex fts_lexer + +#define YYERROR_VERBOSE + +/* For passing an argument to yyparse() */ +#define YYPARSE_PARAM state +#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer + +typedef int (*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner); +typedef int (*fts_scanner)(); + +struct fts_lexer_struct { + fts_scanner scanner; + void* yyscanner; +}; + +%} + +%union { + int oper; + fts_ast_string_t* token; + fts_ast_node_t* node; +}; + +/* Enable re-entrant parser */ +%pure_parser + +%token<oper> FTS_OPER +%token<token> FTS_TEXT FTS_TERM FTS_NUMB + +%type<node> prefix term text expr sub_expr expr_lst query + +%nonassoc '+' '-' '~' '<' '>' + +%% + +query : expr_lst { + $$ = $1; + ((fts_ast_state_t*) state)->root = $$; + } + ; + +expr_lst: /* Empty */ { + $$ = NULL; + } + + | expr_lst expr { + $$ = $1; + + if (!$$) { + $$ = fts_ast_create_node_list(state, $2); + } else { + fts_ast_add_node($$, $2); + } + } + + | expr_lst sub_expr { + $$ = $1; + $$ = fts_ast_create_node_list(state, $1); + + if (!$$) { + $$ = $2; + } else { + fts_ast_add_node($$, $2); + } + } + ; + +sub_expr: '(' expr_lst ')' { + $$ = $2; + + if ($$) { + $$ = fts_ast_create_node_subexp_list(state, $$); + } + } + + | prefix '(' expr_lst ')' { + $$ = fts_ast_create_node_list(state, $1); + + if ($3) { + fts_ast_add_node($$, + fts_ast_create_node_subexp_list(state, $3)); + } + } + ; + +expr : term { + $$ = $1; + } + + | text { + $$ = $1; + } + + | term '*' { + fts_ast_term_set_wildcard($1); + } + + | text '@' FTS_NUMB { + fts_ast_term_set_distance($1, fts_ast_string_to_ul($3, 10)); + fts_ast_string_free($3); + } + + | prefix term '*' { + $$ = fts_ast_create_node_list(state, $1); + fts_ast_add_node($$, $2); + fts_ast_term_set_wildcard($2); + } + + | prefix term { + $$ = fts_ast_create_node_list(state, $1); + fts_ast_add_node($$, $2); + } + + | prefix text '@' FTS_NUMB { + $$ = fts_ast_create_node_list(state, $1); + fts_ast_add_node($$, $2); + fts_ast_term_set_distance($2, fts_ast_string_to_ul($4, 10)); + fts_ast_string_free($4); + } + + | prefix text { + $$ = fts_ast_create_node_list(state, $1); + fts_ast_add_node($$, $2); + } + ; + +prefix : '-' { + $$ = fts_ast_create_node_oper(state, FTS_IGNORE); + } + + | '+' { + $$ = fts_ast_create_node_oper(state, FTS_EXIST); + } + + | '~' { + $$ = fts_ast_create_node_oper(state, 
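+		/* Note: '~' does not exclude a document; FTS_NEGATE merely
+		lowers its rank (see fts_query_change_ranking() in
+		fts0que.cc). */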
FTS_NEGATE); + } + + | '<' { + $$ = fts_ast_create_node_oper(state, FTS_DECR_RATING); + } + + | '>' { + $$ = fts_ast_create_node_oper(state, FTS_INCR_RATING); + } + ; + +term : FTS_TERM { + $$ = fts_ast_create_node_term(state, $1); + fts_ast_string_free($1); + } + + | FTS_NUMB { + $$ = fts_ast_create_node_term(state, $1); + fts_ast_string_free($1); + } + + /* Ignore leading '*' */ + | '*' term { + $$ = $2; + } + ; + +text : FTS_TEXT { + $$ = fts_ast_create_node_text(state, $1); + fts_ast_string_free($1); + } + ; +%% + +/******************************************************************** +*/ +int +ftserror( +/*=====*/ + const char* p) +{ + fprintf(stderr, "%s\n", p); + return(0); +} + +/******************************************************************** +Create a fts_lexer_t instance.*/ + +fts_lexer_t* +fts_lexer_create( +/*=============*/ + ibool boolean_mode, + const byte* query, + ulint query_len) +{ + fts_lexer_t* fts_lexer = static_cast<fts_lexer_t*>( + ut_malloc(sizeof(fts_lexer_t))); + + if (boolean_mode) { + fts0blex_init(&fts_lexer->yyscanner); + fts0b_scan_bytes((char*) query, query_len, fts_lexer->yyscanner); + fts_lexer->scanner = (fts_scan) fts_blexer; + /* FIXME: Debugging */ + /* fts0bset_debug(1 , fts_lexer->yyscanner); */ + } else { + fts0tlex_init(&fts_lexer->yyscanner); + fts0t_scan_bytes((char*) query, query_len, fts_lexer->yyscanner); + fts_lexer->scanner = (fts_scan) fts_tlexer; + } + + return(fts_lexer); +} + +/******************************************************************** +Free an fts_lexer_t instance.*/ +void + +fts_lexer_free( +/*===========*/ + fts_lexer_t* fts_lexer) +{ + if (fts_lexer->scanner == (fts_scan) fts_blexer) { + fts0blex_destroy(fts_lexer->yyscanner); + } else { + fts0tlex_destroy(fts_lexer->yyscanner); + } + + ut_free(fts_lexer); +} + +/******************************************************************** +Call the appropaiate scanner.*/ + +int +fts_lexer( +/*======*/ + YYSTYPE* val, + fts_lexer_t* fts_lexer) +{ + fts_scanner_alt func_ptr; + + func_ptr = (fts_scanner_alt) fts_lexer->scanner; + + return(func_ptr(val, fts_lexer->yyscanner)); +} + +/******************************************************************** +Parse the query.*/ +int +fts_parse( +/*======*/ + fts_ast_state_t* state) +{ + return(ftsparse(state)); +} diff --git a/storage/xtradb/fts/fts0que.cc b/storage/xtradb/fts/fts0que.cc new file mode 100644 index 00000000000..816b52c1a67 --- /dev/null +++ b/storage/xtradb/fts/fts0que.cc @@ -0,0 +1,4463 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0que.cc +Full Text Search functionality. 
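+Implements boolean mode query evaluation over the FTS index: set union,
+intersection and difference of doc id sets, phrase and proximity matching,
+and relevance ranking.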
+ +Created 2007/03/27 Sunny Bains +Completed 2011/7/10 Sunny and Jimmy Yang +*******************************************************/ + +#include "dict0dict.h" /* dict_table_get_n_rows() */ +#include "ut0rbt.h" +#include "row0sel.h" +#include "fts0fts.h" +#include "fts0priv.h" +#include "fts0ast.h" +#include "fts0pars.h" +#include "fts0types.h" +#include "ha_prototypes.h" +#include <ctype.h> + +#ifndef UNIV_NONINL +#include "fts0types.ic" +#include "fts0vlc.ic" +#endif + +#include <vector> + +#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)]) + +#define RANK_DOWNGRADE (-1.0F) +#define RANK_UPGRADE (1.0F) + +/* Maximum number of words supported in a phrase or proximity search. */ +#define MAX_PROXIMITY_ITEM 128 + +/* Memory used by rbt itself for create and node add */ +#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2 +#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t) + +/*Initial byte length for 'words' in fts_ranking_t */ +#define RANKING_WORDS_INIT_LEN 4 + +/* Coeffecient to use for normalize relevance ranking. */ +static const double FTS_NORMALIZE_COEFF = 0.0115F; + +// FIXME: Need to have a generic iterator that traverses the ilist. + +typedef std::vector<fts_string_t> word_vector_t; + +struct fts_word_freq_t; + +/** State of an FTS query. */ +struct fts_query_t { + mem_heap_t* heap; /*!< Heap to use for allocations */ + + trx_t* trx; /*!< The query transaction */ + + dict_index_t* index; /*!< The FTS index to search */ + /*!< FTS auxiliary common table def */ + fts_table_t fts_common_table; + + fts_table_t fts_index_table;/*!< FTS auxiliary index table def */ + + ulint total_size; /*!< total memory size used by query */ + + fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be + filtered from the output */ + + fts_ast_node_t* root; /*!< Abstract syntax tree */ + + fts_ast_node_t* cur_node; /*!< Current tree node */ + + ib_rbt_t* word_map; /*!< Matched word map for + searching by word*/ + + word_vector_t* word_vector; /*!< Matched word vector for + searching by index */ + + ib_rbt_t* doc_ids; /*!< The current set of matching + doc ids, elements are of + type fts_ranking_t */ + + ib_rbt_t* intersection; /*!< The doc ids that were found in + doc_ids, this tree will become + the new doc_ids, elements are of type + fts_ranking_t */ + + /*!< Prepared statement to read the + nodes from the FTS INDEX */ + que_t* read_nodes_graph; + + fts_ast_oper_t oper; /*!< Current boolean mode operator */ + + /*!< TRUE if we want to collect the + word positions within the document */ + ibool collect_positions; + + ulint flags; /*!< Specify the full text search type, + such as boolean search, phrase + search, proximity search etc. */ + + ulint distance; /*!< The proximity distance of a + phrase search. 
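+					(a document qualifies only if all
+					the matched words fall inside a
+					window of at most this many words;
+					see fts_proximity_is_word_in_range())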
*/ + + /*!< These doc ids are used as a + boundary condition when searching the + FTS index rows */ + + doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */ + + doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */ + + bool boolean_mode; /*!< TRUE if boolean mode query */ + + ib_vector_t* matched; /*!< Array of matching documents + (fts_match_t) to search for a phrase */ + + ib_vector_t** match_array; /*!< Used for proximity search, contains + position info for each matched word + in the word list */ + + ib_uint64_t total_docs; /*!< The total number of documents */ + + ulint total_words; /*!< The total number of words */ + + dberr_t error; /*!< Error code if any, that is + encountered during query processing */ + + ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per + document, its elements are of type + fts_word_freq_t */ + + bool multi_exist; /*!< multiple FTS_EXIST oper */ +}; + +/** For phrase matching, first we collect the documents and the positions +then we match. */ +struct fts_match_t { + doc_id_t doc_id; /*!< Document id */ + + ulint start; /*!< Start the phrase match from + this offset within the positions + vector. */ + + ib_vector_t* positions; /*!< Offsets of a word in a + document */ +}; + +/** For matching tokens in a phrase search. We use this data structure in +the callback that determines whether a document should be accepted or +rejected for a phrase search. */ +struct fts_select_t { + doc_id_t doc_id; /*!< The document id to match */ + + ulint min_pos; /*!< For found to be TRUE at least + one position must be greater than + min_pos. */ + + ibool found; /*!< TRUE if found */ + + fts_word_freq_t* + word_freq; /*!< Word frequency instance of the + current word being looked up in + the FTS index */ +}; + +typedef std::vector<ulint> pos_vector_t; + +/** structure defines a set of ranges for original documents, each of which +has a minimum position and maximum position. Text in such range should +contain all words in the proximity search. We will need to count the +words in such range to make sure it is less than the specified distance +of the proximity search */ +struct fts_proximity_t { + ulint n_pos; /*!< number of position set, defines + a range (min to max) containing all + matching words */ + pos_vector_t min_pos; /*!< the minimum position (in bytes) + of the range */ + pos_vector_t max_pos; /*!< the maximum position (in bytes) + of the range */ +}; + +/** The match positions and tokesn to match */ +struct fts_phrase_t { + ibool found; /*!< Match result */ + + const fts_match_t* + match; /*!< Positions within text */ + + const ib_vector_t* + tokens; /*!< Tokens to match */ + + ulint distance; /*!< For matching on proximity + distance. Can be 0 for exact match */ + CHARSET_INFO* charset; /*!< Phrase match charset */ + mem_heap_t* heap; /*!< Heap for word processing */ + ulint zip_size; /*!< row zip size */ + fts_proximity_t*proximity_pos; /*!< position info for proximity + search verification. Records the min + and max position of words matched */ +}; + +/** For storing the frequncy of a word/term in a document */ +struct fts_doc_freq_t { + doc_id_t doc_id; /*!< Document id */ + ulint freq; /*!< Frequency of a word in a document */ +}; + +/** To determine the word frequency per document. */ +struct fts_word_freq_t { + fts_string_t word; /*!< Word for which we need the freq, + it's allocated on the query heap */ + + ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document + word frequencies. 
The elements are + of type fts_doc_freq_t */ + ib_uint64_t doc_count; /*!< Total number of documents that + contain this word */ + double idf; /*!< Inverse document frequency */ +}; + +/******************************************************************** +Callback function to fetch the rows in an FTS INDEX record. +@return always TRUE */ +static +ibool +fts_query_index_fetch_nodes( +/*========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg); /*!< in: pointer to ib_vector_t */ + +/******************************************************************** +Read and filter nodes. +@return fts_node_t instance */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count);/*!< in: whether to remember doc + count */ + +#if 0 +/*****************************************************************//*** +Find a doc_id in a word's ilist. +@return TRUE if found. */ +static +ibool +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: search the doc id selected, + update the frequency if found. */ + void* data, /*!< in: doc id ilist */ + ulint len); /*!< in: doc id ilist size */ +#endif + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. +@return DB_SUCCESS if success, otherwise the error code */ +static +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: query result, to be freed + by the client */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. +@return TRUE if documents are found, FALSE if otherwise */ +static +ibool +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance + query->doc_ids might be instantiated + with qualified doc IDs */ + ib_vector_t* tokens); /*!< in: Tokens contain words */ +/*************************************************************//** +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ +static +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos); /*!< out: the position info + records ranges containing + all matching words. 
*/ +#if 0 +/******************************************************************** +Get the total number of words in a documents. */ +static +ulint +fts_query_terms_in_document( +/*========================*/ + /*!< out: DB_SUCCESS if all go well + else error code */ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total); /*!< out: total words in document */ +#endif + +/******************************************************************** +Compare two fts_doc_freq_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_freq_doc_id_cmp( +/*================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1; + const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2; + + return((int) (fq1->doc_id - fq2->doc_id)); +} + +#if 0 +/*******************************************************************//** +Print the table used for calculating LCS. */ +static +void +fts_print_lcs_table( +/*================*/ + const ulint* table, /*!< in: array to print */ + ulint n_rows, /*!< in: total no. of rows */ + ulint n_cols) /*!< in: total no. of cols */ +{ + ulint i; + + for (i = 0; i < n_rows; ++i) { + ulint j; + + printf("\n"); + + for (j = 0; j < n_cols; ++j) { + + printf("%2lu ", FTS_ELEM(table, n_cols, i, j)); + } + } +} + +/******************************************************************** +Find the longest common subsequence between the query string and +the document. */ +static +ulint +fts_query_lcs( +/*==========*/ + /*!< out: LCS (length) between + two ilists */ + const ulint* p1, /*!< in: word positions of query */ + ulint len_p1, /*!< in: no. of elements in p1 */ + const ulint* p2, /*!< in: word positions within document */ + ulint len_p2) /*!< in: no. of elements in p2 */ +{ + int i; + ulint len = 0; + ulint r = len_p1; + ulint c = len_p2; + ulint size = (r + 1) * (c + 1) * sizeof(ulint); + ulint* table = (ulint*) ut_malloc(size); + + /* Traverse the table backwards, from the last row to the first and + also from the last column to the first. We compute the smaller + common subsequeces first, then use the caluclated values to determine + the longest common subsequence. The result will be in TABLE[0][0]. */ + for (i = r; i >= 0; --i) { + int j; + + for (j = c; j >= 0; --j) { + + if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) { + + FTS_ELEM(table, c, i, j) = 0; + + } else if (p1[i] == p2[j]) { + + FTS_ELEM(table, c, i, j) = FTS_ELEM( + table, c, i + 1, j + 1) + 1; + + } else { + + ulint value; + + value = ut_max( + FTS_ELEM(table, c, i + 1, j), + FTS_ELEM(table, c, i, j + 1)); + + FTS_ELEM(table, c, i, j) = value; + } + } + } + + len = FTS_ELEM(table, c, 0, 0); + + fts_print_lcs_table(table, r, c); + printf("\nLen=%lu\n", len); + + ut_free(table); + + return(len); +} +#endif + +/*******************************************************************//** +Compare two fts_ranking_t instance on their rank value and doc ids in +descending order on the rank and ascending order on doc id. 
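+For example, a document with rank 1.0 sorts ahead of one with rank 0.5;
+between equal ranks the smaller doc id sorts first.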
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
+static
+int
+fts_query_compare_rank(
+/*===================*/
+	const void*	p1,		/*!< in: pointer to elem */
+	const void*	p2)		/*!< in: pointer to elem */
+{
+	const fts_ranking_t*	r1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	r2 = (const fts_ranking_t*) p2;
+
+	if (r2->rank < r1->rank) {
+		return(-1);
+	} else if (r2->rank == r1->rank) {
+
+		if (r1->doc_id < r2->doc_id) {
+			return(-1);
+		} else if (r1->doc_id > r2->doc_id) {
+			return(1);
+		}
+
+		return(0);
+	}
+
+	return(1);
+}
+
+#ifdef FTS_UTF8_DEBUG
+/*******************************************************************//**
+Convert string to lowercase.
+@return lower case string, caller's responsibility to delete using
+ut_free() */
+static
+byte*
+fts_tolower(
+/*========*/
+	const byte*	src,	/*!< in: src string */
+	ulint		len)	/*!< in: src string length */
+{
+	fts_string_t	str;
+	byte*		lc_str = ut_malloc(len + 1);
+
+	str.f_len = len;
+	str.f_str = lc_str;
+
+	memcpy(str.f_str, src, len);
+
+	/* Make sure the last byte is NUL terminated */
+	str.f_str[len] = '\0';
+
+	fts_utf8_tolower(&str);
+
+	return(lc_str);
+}
+
+/*******************************************************************//**
+Do a case insensitive search. Doesn't check for a NUL byte end marker;
+relies only on len. Converts str2 to lower case before comparing.
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
+static
+int
+fts_utf8_strcmp(
+/*============*/
+	const fts_string_t*
+			str1,	/*!< in: should be lower case */
+
+	fts_string_t*	str2)	/*!< in: any case. We will use the length
+				of this string during compare as it
+				should be the min of the two strings */
+{
+	byte		b = str2->f_str[str2->f_len];
+
+	ut_a(str2->f_len <= str1->f_len);
+
+	/* We need to write a NUL byte at the end of the string because the
+	string is converted to lowercase by a MySQL function which doesn't
+	care about the length. */
+	str2->f_str[str2->f_len] = 0;
+
+	fts_utf8_tolower(str2);
+
+	/* Restore the value we replaced above. */
+	str2->f_str[str2->f_len] = b;
+
+	return(memcmp(str1->f_str, str2->f_str, str2->f_len));
+}
+#endif
+
+/*******************************************************************//**
+Create words in ranking */
+static
+void
+fts_ranking_words_create(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	fts_ranking_t*	ranking)	/*!< in: ranking instance */
+{
+	ranking->words = static_cast<byte*>(
+		mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
+	ranking->words_len = RANKING_WORDS_INIT_LEN;
+}
+
+/*
+The optimization here is using a char array (bitmap) to replace the words
+rb tree in fts_ranking_t.
+
+It can save lots of memory, except in some cases of QUERY EXPANSION.
+
+'word_map' is used as a word dictionary, in which the key is a word and the
+value is a number. In 'fts_ranking_words_add', we first check whether the
+word is in 'word_map'. If not, we add it to 'word_map' and give it a
+position (actually a number). Then we set the corresponding bit to '1' at
+that position in the char array 'words'.
+
+'word_vector' is a useful backup of 'word_map': it lets us get a word by its
+position more quickly than searching by value in 'word_map'. We use
+'word_vector' in 'fts_query_calculate_ranking' and 'fts_expand_query'. In
+those two functions we need to scan the bitmap 'words', get a word whenever
+a bit is '1', and then look up its word_freq by the word.
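+
+A small worked example (with hypothetical words): if 'word_map' assigned
+'apple' position 0 and 'pear' position 9, a ranking whose document matched
+both words has words[0] == 0x01 (position 0 -> byte 0, bit 0) and
+words[1] == 0x02 (position 9 -> byte 1, bit 1); word_vector[0] and
+word_vector[9] map those positions back to the word strings.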
+*/ + +/*******************************************************************//** +Add a word into ranking */ +static +void +fts_ranking_words_add( +/*==================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking, /*!< in: ranking instance */ + const fts_string_t* word) /*!< in: term/word to add */ +{ + ulint pos; + ulint byte_offset; + ulint bit_offset; + ib_rbt_bound_t parent; + + /* Note: we suppose the word map and vector are append-only. */ + ut_ad(query->word_vector->size() == rbt_size(query->word_map)); + + /* We use ib_rbt to simulate a map, f_n_char means position. */ + if (rbt_search(query->word_map, &parent, word) == 0) { + fts_string_t* result_word; + + result_word = rbt_value(fts_string_t, parent.last); + pos = result_word->f_n_char; + ut_ad(pos < rbt_size(query->word_map)); + } else { + /* Add the word to map. */ + fts_string_t new_word; + + pos = rbt_size(query->word_map); + + new_word.f_str = static_cast<byte*>(mem_heap_alloc(query->heap, + word->f_len + 1)); + memcpy(new_word.f_str, word->f_str, word->f_len); + new_word.f_str[word->f_len] = 0; + new_word.f_len = word->f_len; + new_word.f_n_char = pos; + + rbt_add_node(query->word_map, &parent, &new_word); + ut_ad(rbt_validate(query->word_map)); + query->word_vector->push_back(new_word); + } + + /* Check words len */ + byte_offset = pos / CHAR_BIT; + if (byte_offset >= ranking->words_len) { + byte* words = ranking->words; + ulint words_len = ranking->words_len; + + while (byte_offset >= words_len) { + words_len *= 2; + } + + ranking->words = static_cast<byte*>( + mem_heap_zalloc(query->heap, words_len)); + ut_memcpy(ranking->words, words, ranking->words_len); + ranking->words_len = words_len; + } + + /* Set ranking words */ + ut_ad(byte_offset < ranking->words_len); + bit_offset = pos % CHAR_BIT; + ranking->words[byte_offset] |= 1 << bit_offset; +} + +/*******************************************************************//** +Get a word from a ranking +@return true if it's successful */ +static +bool +fts_ranking_words_get_next( +/*=======================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking,/*!< in: ranking instance */ + ulint* pos, /*!< in/out: word start pos */ + fts_string_t* word) /*!< in/out: term/word to add */ +{ + bool ret = false; + ulint max_pos = ranking->words_len * CHAR_BIT; + + /* Search for next word */ + while (*pos < max_pos) { + ulint byte_offset = *pos / CHAR_BIT; + ulint bit_offset = *pos % CHAR_BIT; + + if (ranking->words[byte_offset] & (1 << bit_offset)) { + ret = true; + break; + } + + *pos += 1; + }; + + /* Get next word from word vector */ + if (ret) { + ut_ad(*pos < query->word_vector->size()); + *word = query->word_vector->at((size_t)*pos); + *pos += 1; + } + + return ret; +} + +/*******************************************************************//** +Add a word if it doesn't exist, to the term freq RB tree. We store +a pointer to the word that is passed in as the argument. +@return pointer to word */ +static +fts_word_freq_t* +fts_query_add_word_freq( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word) /*!< in: term/word to add */ +{ + ib_rbt_bound_t parent; + + /* Lookup the word in our rb tree and add if it doesn't exist. 
*/ + if (rbt_search(query->word_freqs, &parent, word) != 0) { + fts_word_freq_t word_freq; + + memset(&word_freq, 0, sizeof(word_freq)); + + word_freq.word.f_str = static_cast<byte*>( + mem_heap_alloc(query->heap, word->f_len + 1)); + memcpy(word_freq.word.f_str, word->f_str, word->f_len); + word_freq.word.f_str[word->f_len] = 0; + word_freq.word.f_len = word->f_len; + + word_freq.doc_count = 0; + + word_freq.doc_freqs = rbt_create( + sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp); + + parent.last = rbt_add_node( + query->word_freqs, &parent, &word_freq); + + query->total_size += word->f_len + + SIZEOF_RBT_CREATE + + SIZEOF_RBT_NODE_ADD + + sizeof(fts_word_freq_t); + } + + return(rbt_value(fts_word_freq_t, parent.last)); +} + +/*******************************************************************//** +Add a doc id if it doesn't exist, to the doc freq RB tree. +@return pointer to word */ +static +fts_doc_freq_t* +fts_query_add_doc_freq( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */ + doc_id_t doc_id) /*!< in: doc id to add */ +{ + ib_rbt_bound_t parent; + + /* Lookup the doc id in our rb tree and add if it doesn't exist. */ + if (rbt_search(doc_freqs, &parent, &doc_id) != 0) { + fts_doc_freq_t doc_freq; + + memset(&doc_freq, 0, sizeof(doc_freq)); + + doc_freq.freq = 0; + doc_freq.doc_id = doc_id; + + parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_doc_freq_t); + } + + return(rbt_value(fts_doc_freq_t, parent.last)); +} + +/*******************************************************************//** +Add the doc id to the query set only if it's not in the +deleted array. */ +static +void +fts_query_union_doc_id( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's not already in our set. */ + if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + + fts_ranking_t ranking; + + ranking.rank = rank; + ranking.doc_id = doc_id; + fts_ranking_words_create(query, &ranking); + + rbt_add_node(query->doc_ids, &parent, &ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN; + } +} + +/*******************************************************************//** +Remove the doc id from the query set only if it's not in the +deleted set. */ +static +void +fts_query_remove_doc_id( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id) /*!< in: the doc id to add */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's in our set. 
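+	(a negative return value from fts_bsearch() means the doc id is not
+	in the deleted array)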
*/
+	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+		ut_free(rbt_remove_node(query->doc_ids, parent.last));
+
+		ut_ad(query->total_size >=
+		      SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+		query->total_size -= SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_ranking_t);
+	}
+}
+
+/*******************************************************************//**
+Find the doc id in the query set but not in the deleted set, and
+artificially downgrade or upgrade its ranking by a value, putting it below
+or above its normal range of 0 to 1. This is used for Boolean Search
+operators such as the negation operator, which makes a word's contribution
+to the row's relevance negative */
+static
+void
+fts_query_change_ranking(
+/*====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: the doc id to add */
+	ibool		downgrade)	/*!< in: Whether to downgrade ranking */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Check that the doc id is not deleted and is in our set. */
+	if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+
+		ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
+
+		/* Allow at most 2 adjustments by RANK_DOWNGRADE (-1.0F)
+		and RANK_UPGRADE (1.0F); clamp the rank to [-1.0, 1.0] */
+		if (ranking->rank >= 1.0F) {
+			ranking->rank = 1.0F;
+		} else if (ranking->rank <= -1.0F) {
+			ranking->rank = -1.0F;
+		}
+	}
+}
+
+/*******************************************************************//**
+Check the doc id in the query set only if it's not in the
+deleted array. The doc ids that were found are stored in
+another rb tree (fts_query_t::intersection). */
+static
+void
+fts_query_intersect_doc_id(
+/*=======================*/
+	fts_query_t*	query,	/*!< in: query instance */
+	doc_id_t	doc_id,	/*!< in: the doc id to add */
+	fts_rank_t	rank)	/*!< in: if non-zero, it is the
+				rank associated with the doc_id */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+	fts_ranking_t*	ranking = NULL;
+
+	/* There are three types of intersect:
+	   1. '+a': doc_ids is empty; add a doc into intersect if it
+	      matches 'a'.
+	   2. 'a +b': docs matching 'a' are in doc_ids; add a doc into
+	      intersect if it matches 'b'. If the doc is also in doc_ids,
+	      then change the doc's rank and add 'a' to the doc's words.
+	   3. '+a +b': docs matching '+a' are in doc_ids; add a doc into
+	      intersect if it matches 'b' and it is in doc_ids
+	      (multi_exist = true).
*/ + + /* Check if the doc id is deleted and it's in our set */ + if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) { + fts_ranking_t new_ranking; + + if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + if (query->multi_exist) { + return; + } else { + new_ranking.words = NULL; + } + } else { + ranking = rbt_value(fts_ranking_t, parent.last); + + /* We've just checked the doc id before */ + if (ranking->words == NULL) { + ut_ad(rbt_search(query->intersection, &parent, + ranking) == 0); + return; + } + + /* Merge rank */ + rank += ranking->rank; + if (rank >= 1.0F) { + rank = 1.0F; + } else if (rank <= -1.0F) { + rank = -1.0F; + } + + /* Take words */ + new_ranking.words = ranking->words; + new_ranking.words_len = ranking->words_len; + } + + new_ranking.rank = rank; + new_ranking.doc_id = doc_id; + + if (rbt_search(query->intersection, &parent, + &new_ranking) != 0) { + if (new_ranking.words == NULL) { + fts_ranking_words_create(query, &new_ranking); + + query->total_size += RANKING_WORDS_INIT_LEN; + } else { + /* Note that the intersection has taken + ownership of the ranking data. */ + ranking->words = NULL; + } + + rbt_add_node(query->intersection, + &parent, &new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + } +} + +/*******************************************************************//** +Free the document ranking rb tree. */ +static +void +fts_query_free_doc_ids( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + ib_rbt_t* doc_ids) /*!< in: rb tree to free */ +{ + const ib_rbt_node_t* node; + + for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + if (ranking->words) { + ranking->words = NULL; + } + + ut_free(rbt_remove_node(doc_ids, node)); + + ut_ad(query->total_size >= + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + + rbt_free(doc_ids); + + ut_ad(query->total_size >= SIZEOF_RBT_CREATE); + query->total_size -= SIZEOF_RBT_CREATE; +} + +/*******************************************************************//** +Add the word to the documents "list" of matching words from +the query. We make a copy of the word from the query heap. */ +static +void +fts_query_add_word_to_document( +/*===========================*/ + fts_query_t* query, /*!< in: query to update */ + doc_id_t doc_id, /*!< in: the document to update */ + const fts_string_t* word) /*!< in: the token to add */ +{ + ib_rbt_bound_t parent; + fts_ranking_t* ranking = NULL; + + if (query->flags == FTS_OPT_RANKING) { + return; + } + + /* First we search the intersection RB tree as it could have + taken ownership of the words rb tree instance. */ + if (query->intersection + && rbt_search(query->intersection, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking == NULL + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking != NULL) { + fts_ranking_words_add(query, ranking, word); + } +} + +/*******************************************************************//** +Check the node ilist. */ +static +void +fts_query_check_node( +/*=================*/ + fts_query_t* query, /*!< in: query to update */ + const fts_string_t* token, /*!< in: the token to search */ + const fts_node_t* node) /*!< in: node to check */ +{ + /* Skip nodes whose doc ids are out range. 
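+	(that is, nodes whose [first_doc_id, last_doc_id] interval cannot
+	overlap the query's [lower_doc_id, upper_doc_id] bounds)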
*/ + if (query->oper == FTS_EXIST + && ((query->upper_doc_id > 0 + && node->first_doc_id > query->upper_doc_id) + || (query->lower_doc_id > 0 + && node->last_doc_id < query->lower_doc_id))) { + + /* Ignore */ + + } else { + int ret; + ib_rbt_bound_t parent; + ulint ilist_size = node->ilist_size; + fts_word_freq_t*word_freqs; + + /* The word must exist. */ + ret = rbt_search(query->word_freqs, &parent, token); + ut_a(ret == 0); + + word_freqs = rbt_value(fts_word_freq_t, parent.last); + + query->error = fts_query_filter_doc_ids( + query, token, word_freqs, node, + node->ilist, ilist_size, TRUE); + } +} + +/*****************************************************************//** +Search index cache for word with wildcard match. +@return number of words matched */ +static +ulint +fts_cache_find_wildcard( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_index_cache_t*index_cache, /*!< in: cache to search */ + const fts_string_t* token) /*!< in: token to search */ +{ + ib_rbt_bound_t parent; + const ib_vector_t* nodes = NULL; + fts_string_t srch_text; + byte term[FTS_MAX_WORD_LEN + 1]; + ulint num_word = 0; + + srch_text.f_len = (token->f_str[token->f_len - 1] == '%') + ? token->f_len - 1 + : token->f_len; + + strncpy((char*) term, (char*) token->f_str, srch_text.f_len); + term[srch_text.f_len] = '\0'; + srch_text.f_str = term; + + /* Lookup the word in the rb tree */ + if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL, + innobase_fts_text_cmp_prefix) == 0) { + const fts_tokenizer_word_t* word; + ulint i; + const ib_rbt_node_t* cur_node; + ibool forward = FALSE; + + word = rbt_value(fts_tokenizer_word_t, parent.last); + cur_node = parent.last; + + while (innobase_fts_text_cmp_prefix( + index_cache->charset, &srch_text, &word->text) == 0) { + + nodes = word->nodes; + + for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + int ret; + const fts_node_t* node; + ib_rbt_bound_t freq_parent; + fts_word_freq_t* word_freqs; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + ret = rbt_search(query->word_freqs, + &freq_parent, + &srch_text); + + ut_a(ret == 0); + + word_freqs = rbt_value( + fts_word_freq_t, + freq_parent.last); + + query->error = fts_query_filter_doc_ids( + query, &srch_text, + word_freqs, node, + node->ilist, node->ilist_size, TRUE); + + if (query->error != DB_SUCCESS) { + return(0); + } + } + + num_word++; + + if (!forward) { + cur_node = rbt_prev( + index_cache->words, cur_node); + } else { +cont_search: + cur_node = rbt_next( + index_cache->words, cur_node); + } + + if (!cur_node) { + break; + } + + word = rbt_value(fts_tokenizer_word_t, cur_node); + } + + if (!forward) { + forward = TRUE; + cur_node = parent.last; + goto cont_search; + } + } + + return(num_word); +} + +/*****************************************************************//** +Set difference. +@return DB_SUCCESS if all go well */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_difference( +/*=================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + ulint n_doc_ids= 0; + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_IGNORE); + +#ifdef FTS_INTERNAL_DIAG_PRINT + fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n", + (int) token->f_len, token->f_str); +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + /* There is nothing we can substract from an empty set. 
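+	(for example, for the query 'apple -pie' this step removes every
+	doc id matching 'pie' from the set already matched by 'apple')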
*/ + if (query->doc_ids && !rbt_empty(query->doc_ids)) { + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + rw_lock_x_lock(&cache->lock); + + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache */ + ut_a(index_cache != NULL); + + /* Search the cache for a matching word first. */ + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + rw_lock_x_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + fts_que_graph_free(graph); + } + + /* The size can't increase. */ + ut_a(rbt_size(query->doc_ids) <= n_doc_ids); + + return(query->error); +} + +/*****************************************************************//** +Intersect the token doc ids with the current set. +@return DB_SUCCESS if all go well */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_intersect( +/*================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: the token to search */ +{ + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_EXIST); + +#ifdef FTS_INTERNAL_DIAG_PRINT + fprintf(stderr, "INTERSECT: Searching: '%.*s'\n", + (int) token->f_len, token->f_str); +#endif + + /* If the words set is not empty and multi exist is true, + we know the intersection set is empty in advance. */ + if (!(rbt_empty(query->doc_ids) && query->multi_exist)) { + ulint n_doc_ids = 0; + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + ut_a(!query->intersection); + + n_doc_ids = rbt_size(query->doc_ids); + + /* Create the rb tree that will hold the doc ids of + the intersection. */ + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + /* This is to avoid decompressing the ilist if the + node's ilist doc ids are out of range. */ + if (!rbt_empty(query->doc_ids) && query->multi_exist) { + const ib_rbt_node_t* node; + doc_id_t* doc_id; + + node = rbt_first(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->lower_doc_id = *doc_id; + + node = rbt_last(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->upper_doc_id = *doc_id; + + } else { + query->lower_doc_id = 0; + query->upper_doc_id = 0; + } + + /* Search the cache for a matching word first. 
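+	(postings for a word can live both in the in-memory index cache and
+	in the on-disk FTS auxiliary index tables, so both are consulted
+	below)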
*/ + + rw_lock_x_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + rw_lock_x_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + fts_que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + /* Make the intesection (rb tree) the current doc id + set and free the old set. */ + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + + ut_a(!query->multi_exist || (query->multi_exist + && rbt_size(query->doc_ids) <= n_doc_ids)); + } + } + + return(query->error); +} + +/*****************************************************************//** +Query index cache. +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_query_cache( +/*============*/ + fts_query_t* query, /*!< in/out: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + const fts_index_cache_t*index_cache; + dict_table_t* table = query->index->table; + fts_cache_t* cache = table->fts->cache; + + /* Search the cache for a matching word first. */ + rw_lock_x_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + const ib_vector_t* nodes; + ulint i; + + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + rw_lock_x_unlock(&cache->lock); + + return(query->error); +} + +/*****************************************************************//** +Set union. 
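+Handles terms with no '+' or '-' prefix (FTS_NONE) as well as the rating
+operators '~', '<' and '>': every doc id matching the token is added to
+query->doc_ids.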
+@return DB_SUCCESS if all go well */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_union( +/*============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + fts_fetch_t fetch; + ulint n_doc_ids = 0; + trx_t* trx = query->trx; + que_t* graph = NULL; + dberr_t error; + + ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || + query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); + +#ifdef FTS_INTERNAL_DIAG_PRINT + fprintf(stderr, "UNION: Searching: '%.*s'\n", + (int) token->f_len, token->f_str); +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + if (token->f_len == 0) { + return(query->error); + } + + /* Single '%' would confuse parser in pars_like_rebind(). In addition, + our wildcard search only supports prefix search */ + ut_ad(*token->f_str != '%'); + + fts_query_cache(query, token); + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + /* Read the nodes from disk. */ + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + fts_que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + + /* The size can't decrease. */ + ut_a(rbt_size(query->doc_ids) >= n_doc_ids); + + /* Calulate the number of doc ids that were added to + the current doc id set. */ + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids; + } + } + + return(query->error); +} + +/*****************************************************************//** +Depending upon the current query operator process the doc id. +return DB_SUCCESS if all go well +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_process_doc_id( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: doc id to process */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + if (query->flags == FTS_OPT_RANKING) { + return(DB_SUCCESS); + } + + switch (query->oper) { + case FTS_NONE: + fts_query_union_doc_id(query, doc_id, rank); + break; + + case FTS_EXIST: + fts_query_intersect_doc_id(query, doc_id, rank); + break; + + case FTS_IGNORE: + fts_query_remove_doc_id(query, doc_id); + break; + + case FTS_NEGATE: + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_DECR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_INCR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, FALSE); + break; + + default: + ut_error; + } + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } +} + +/*****************************************************************//** +Merge two result sets. */ +static +dberr_t +fts_merge_doc_ids( +/*==============*/ + fts_query_t* query, /*!< in,out: query instance */ + const ib_rbt_t* doc_ids) /*!< in: result set to merge */ +{ + const ib_rbt_node_t* node; + + ut_a(!rbt_empty(doc_ids)); + ut_a(!query->intersection); + + /* To process FTS_EXIST operation (intersection), we need + to create a new result set for fts_query_intersect(). 
*/ + if (query->oper == FTS_EXIST) { + + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Merge the elements to the result set. */ + for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) { + fts_ranking_t* ranking; + ulint pos = 0; + fts_string_t word; + + ranking = rbt_value(fts_ranking_t, node); + + query->error = fts_query_process_doc_id( + query, ranking->doc_id, ranking->rank); + + if (query->error != DB_SUCCESS) { + return(query->error); + } + + /* Merge words. Don't need to take operator into account. */ + ut_a(ranking->words); + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + fts_query_add_word_to_document(query, ranking->doc_id, + &word); + } + } + + /* If it is an intersection operation, reset query->doc_ids + to query->intersection and free the old result list. */ + if (query->oper == FTS_EXIST && query->intersection != NULL) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Skip non-whitespace in a string. Move ptr to the next word boundary. +@return pointer to first whitespace character or end */ +UNIV_INLINE +byte* +fts_query_skip_word( +/*================*/ + byte* ptr, /*!< in: start of scan */ + const byte* end) /*!< in: pointer to end of string */ +{ + /* TODO: Does this have to be UTF-8 too ? */ + while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) { + ++ptr; + } + + return(ptr); +} + +/*****************************************************************//** +Check whether the remaining terms in the phrase match the text. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase_terms( +/*=========================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte** start, /*!< in/out: text to search, we can't + make this const becase we need to + first convert the string to + lowercase */ + const byte* end, /*!< in: pointer to the end of + the string to search */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint i; + byte* ptr = *start; + const ib_vector_t* tokens = phrase->tokens; + ulint distance = phrase->distance; + + /* We check only from the second term onwards, since the first + must have matched otherwise we wouldn't be here. */ + for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) { + fts_string_t match; + fts_string_t cmp_str; + const fts_string_t* token; + int result; + ulint ret; + ulint offset; + + ret = innobase_mysql_fts_get_token( + phrase->charset, ptr, (byte*) end, + &match, &offset); + + if (match.f_len > 0) { + /* Get next token to match. */ + token = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, i)); + + fts_utf8_string_dup(&cmp_str, &match, heap); + + result = innobase_fts_text_case_cmp( + phrase->charset, token, &cmp_str); + + /* Skip the rest of the tokens if this one doesn't + match and the proximity distance is exceeded. */ + if (result + && (distance == ULINT_UNDEFINED + || distance == 0)) { + + break; + } + + /* This token matched move to the next token. */ + if (result == 0) { + /* Advance the text to search by the length + of the last token. */ + ptr += ret; + + /* Advance to the next token. */ + ++i; + } else { + + ut_a(distance != ULINT_UNDEFINED); + + ptr = fts_query_skip_word(ptr, end); + } + + /* Distance can be 0 for exact matches. 
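+	(every token examined decrements the remaining distance; a plain
+	phrase search passes 0 or ULINT_UNDEFINED and gives up at the first
+	mismatch)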
*/ + if (distance != ULINT_UNDEFINED && distance > 0) { + --distance; + } + } else { + ptr += ret; + } + } + + *start = ptr; + + /* Can't be greater than the number of elements. */ + ut_a(i <= ib_vector_size(tokens)); + + /* This is the case for multiple words. */ + if (i == ib_vector_size(tokens)) { + phrase->found = TRUE; + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to count the number of words in position ranges, +and see whether the word count is in specified "phrase->distance" +@return true if the number of characters is less than the "distance" */ +static +bool +fts_proximity_is_word_in_range( +/*===========================*/ + const fts_phrase_t* + phrase, /*!< in: phrase with the search info */ + byte* start, /*!< in: text to search */ + ulint total_len) /*!< in: length of text */ +{ + fts_proximity_t* proximity_pos = phrase->proximity_pos; + + ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size()); + ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size()); + + /* Search each matched position pair (with min and max positions) + and count the number of words in the range */ + for (ulint i = 0; i < proximity_pos->n_pos; i++) { + ulint cur_pos = proximity_pos->min_pos[i]; + ulint n_word = 0; + + ut_ad(proximity_pos->max_pos[i] <= total_len); + + /* Walk through words in the range and count them */ + while (cur_pos <= proximity_pos->max_pos[i]) { + ulint len; + fts_string_t str; + ulint offset = 0; + + len = innobase_mysql_fts_get_token( + phrase->charset, + start + cur_pos, + start + total_len, &str, &offset); + + if (len == 0) { + break; + } + + /* Advances position with "len" bytes */ + cur_pos += len; + + /* Record the number of words */ + if (str.f_n_char > 0) { + n_word++; + } + + if (n_word > phrase->distance) { + break; + } + } + + /* Check if the number of words is less than specified + "distance" */ + if (n_word && n_word <= phrase->distance) { + return(true); + } + } + + return(false); +} + +/*****************************************************************//** +Callback function to fetch and search the document. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase( +/*===================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte* start, /*!< in: text to search, we can't make + this const becase we need to first + convert the string to lowercase */ + ulint cur_len, /*!< in: length of text */ + ulint prev_len, /*!< in: total length for searched + doc fields*/ + mem_heap_t* heap) /* heap */ +{ + ulint i; + const fts_string_t* first; + const byte* end = start + cur_len; + const ib_vector_t* tokens = phrase->tokens; + const ib_vector_t* positions = phrase->match->positions; + + ut_a(!phrase->found); + ut_a(phrase->match->doc_id > 0); + ut_a(ib_vector_size(tokens) > 0); + ut_a(ib_vector_size(positions) > 0); + + first = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, 0)); + + ut_a(phrase->match->start < ib_vector_size(positions)); + + for (i = phrase->match->start; i < ib_vector_size(positions); ++i) { + ulint pos; + fts_string_t match; + fts_string_t cmp_str; + byte* ptr = start; + ulint ret; + ulint offset; + + pos = *(ulint*) ib_vector_get_const(positions, i); + + if (pos == ULINT_UNDEFINED) { + break; + } + + if (pos < prev_len) { + continue; + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when search + phrases. 
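+	(pos is an offset within the concatenation of all searched fields,
+	so prev_len, the combined length of the fields already scanned, is
+	subtracted to obtain an offset within the current field)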
*/ + pos -= prev_len; + ptr = match.f_str = start + pos; + + /* Within limits ? */ + if (ptr >= end) { + break; + } + + ret = innobase_mysql_fts_get_token( + phrase->charset, start + pos, (byte*) end, + &match, &offset); + + if (match.f_len == 0) { + break; + } + + fts_utf8_string_dup(&cmp_str, &match, heap); + + if (innobase_fts_text_case_cmp( + phrase->charset, first, &cmp_str) == 0) { + + /* This is the case for the single word + in the phrase. */ + if (ib_vector_size(phrase->tokens) == 1) { + phrase->found = TRUE; + break; + } + + ptr += ret; + + /* Match the remaining terms in the phrase. */ + if (fts_query_match_phrase_terms(phrase, &ptr, + end, heap)) { + break; + } + } + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to fetch and search the document. +@return whether the phrase is found */ +static +ibool +fts_query_fetch_document( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + + que_node_t* exp; + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg); + ulint prev_len = 0; + ulint total_len = 0; + byte* document_text = NULL; + + exp = node->select_list; + + phrase->found = FALSE; + + /* For proximity search, we will need to get the whole document + from all fields, so first count the total length of the document + from all the fields */ + if (phrase->proximity_pos) { + while (exp) { + ulint field_len; + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + + if (dfield_is_ext(dfield)) { + ulint local_len = dfield_get_len(dfield); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + field_len = mach_read_from_4( + data + local_len + BTR_EXTERN_LEN + 4); + } else { + field_len = dfield_get_len(dfield); + } + + if (field_len != UNIV_SQL_NULL) { + total_len += field_len + 1; + } + + exp = que_node_get_next(exp); + } + + document_text = static_cast<byte*>(mem_heap_zalloc( + phrase->heap, total_len)); + + if (!document_text) { + return(FALSE); + } + } + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + ulint cur_len; + + if (dfield_is_ext(dfield)) { + data = btr_copy_externally_stored_field( + &cur_len, data, phrase->zip_size, + dfield_get_len(dfield), phrase->heap); + } else { + cur_len = dfield_get_len(dfield); + } + + if (cur_len != UNIV_SQL_NULL && cur_len != 0) { + if (phrase->proximity_pos) { + ut_ad(prev_len + cur_len <= total_len); + memcpy(document_text + prev_len, data, cur_len); + } else { + /* For phrase search */ + phrase->found = + fts_query_match_phrase( + phrase, + static_cast<byte*>(data), + cur_len, prev_len, + phrase->heap); + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when search + phrases. */ + prev_len += cur_len + 1; + } + + if (phrase->found) { + break; + } + + exp = que_node_get_next(exp); + } + + if (phrase->proximity_pos) { + ut_ad(prev_len <= total_len); + + phrase->found = fts_proximity_is_word_in_range( + phrase, document_text, total_len); + } + + return(phrase->found); +} + +#if 0 +/******************************************************************** +Callback function to check whether a record was found or not. 
*/ +static +ibool +fts_query_select( +/*=============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + int i; + que_node_t* exp; + sel_node_t* node = row; + fts_select_t* select = user_arg; + + ut_a(select->word_freq); + ut_a(select->word_freq->doc_freqs); + + exp = node->select_list; + + for (i = 0; exp && !select->found; ++i) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + switch (i) { + case 0: /* DOC_COUNT */ + if (len != UNIV_SQL_NULL && len != 0) { + + select->word_freq->doc_count += + mach_read_from_4(data); + } + break; + + case 1: /* ILIST */ + if (len != UNIV_SQL_NULL && len != 0) { + + fts_query_find_doc_id(select, data, len); + } + break; + + default: + ut_error; + } + + exp = que_node_get_next(exp); + } + + return(FALSE); +} + +/******************************************************************** +Read the rows from the FTS index, that match word and where the +doc id is between first and last doc id. +@return DB_SUCCESS if all go well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_find_term( +/*================*/ + fts_query_t* query, /*!< in: FTS query state */ + que_t** graph, /*!< in: prepared statement */ + const fts_string_t* word, /*!< in: the word to fetch */ + doc_id_t doc_id, /*!< in: doc id to match */ + ulint* min_pos,/*!< in/out: pos found must be + greater than this minimum value. */ + ibool* found) /*!< out: TRUE if found else FALSE */ +{ + pars_info_t* info; + dberr_t error; + fts_select_t select; + doc_id_t match_doc_id; + trx_t* trx = query->trx; + + trx->op_info = "fetching FTS index matching nodes"; + + if (*graph) { + info = (*graph)->info; + } else { + info = pars_info_create(); + } + + select.found = FALSE; + select.doc_id = doc_id; + select.min_pos = *min_pos; + select.word_freq = fts_query_add_word_freq(query, word->f_str); + + pars_info_bind_function(info, "my_func", fts_query_select, &select); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &match_doc_id, doc_id); + + fts_bind_doc_id(info, "min_doc_id", &match_doc_id); + + fts_bind_doc_id(info, "max_doc_id", &match_doc_id); + + if (!*graph) { + ulint selected; + + selected = fts_select_index(*word->f_str); + + query->fts_index_table.suffix = fts_get_suffix(selected); + + *graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count, ilist\n" + " FROM \"%s\"\n" + " WHERE word LIKE :word AND " + " first_doc_id <= :min_doc_id AND " + " last_doc_id >= :max_doc_id\n" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + } + + for(;;) { + error = fts_eval_sql(trx, *graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: lock wait " + "timeout reading FTS index. " + "Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: %lu " + "while reading FTS index.\n", error); + + break; /* Exit the loop. 
*/ + } + } + } + + /* Value to return */ + *found = select.found; + + if (*found) { + *min_pos = select.min_pos; + } + + return(error); +} + +/******************************************************************** +Callback aggregator for int columns. */ +static +ibool +fts_query_sum( +/*==========*/ + /*!< out: always returns TRUE */ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: ulint* */ +{ + + que_node_t* exp; + sel_node_t* node = row; + ulint* total = user_arg; + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + if (len != UNIV_SQL_NULL && len != 0) { + *total += mach_read_from_4(data); + } + + exp = que_node_get_next(exp); + } + + return(TRUE); +} + +/******************************************************************** +Calculate the total documents that contain a particular word (term). +@return DB_SUCCESS if all go well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_total_docs_containing_term( +/*=================================*/ + fts_query_t* query, /*!< in: FTS query state */ + const fts_string_t* word, /*!< in: the word to check */ + ulint* total) /*!< out: documents containing word */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + ulint selected; + trx_t* trx = query->trx; + + trx->op_info = "fetching FTS index document count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + selected = fts_select_index(*word->f_str); + + query->fts_index_table.suffix = fts_get_suffix(selected); + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count\n" + " FROM %s\n" + " WHERE word = :word " + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for(;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: lock wait " + "timeout reading FTS index. " + "Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: %lu " + "while reading FTS index.\n", error); + + break; /* Exit the loop. */ + } + } + } + + fts_que_graph_free(graph); + + return(error); +} + +/******************************************************************** +Get the total number of words in a documents. +@return DB_SUCCESS if all go well else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_terms_in_document( +/*========================*/ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total) /*!< out: total words in document */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + doc_id_t read_doc_id; + trx_t* trx = query->trx; + + trx->op_info = "fetching FTS document term count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + + /* Convert to "storage" byte order. 
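+	fts_write_doc_id() stores the id in the machine-independent
+	big-endian format used by the FTS auxiliary tables, so the value
+	can be bound directly to the :doc_id parameter.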
*/ + fts_write_doc_id((byte*) &read_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &read_doc_id); + + query->fts_index_table.suffix = "DOC_ID"; + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT count\n" + " FROM \"%s\"\n" + " WHERE doc_id = :doc_id " + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for(;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: lock wait " + "timeout reading FTS doc id table. " + "Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: %lu " + "while reading FTS doc id table.\n", + error); + + break; /* Exit the loop. */ + } + } + } + + fts_que_graph_free(graph); + + return(error); +} +#endif + +/*****************************************************************//** +Retrieve the document and match the phrase tokens. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_match_document( +/*=====================*/ + ib_vector_t* tokens, /*!< in: phrase tokens */ + fts_get_doc_t* get_doc, /*!< in: table and prepared statements */ + fts_match_t* match, /*!< in: doc id and positions */ + ulint distance, /*!< in: proximity distance */ + ibool* found) /*!< out: TRUE if phrase found */ +{ + dberr_t error; + fts_phrase_t phrase; + + memset(&phrase, 0x0, sizeof(phrase)); + + phrase.match = match; /* Positions to match */ + phrase.tokens = tokens; /* Tokens to match */ + phrase.distance = distance; + phrase.charset = get_doc->index_cache->charset; + phrase.zip_size = dict_table_zip_size( + get_doc->index_cache->index->table); + phrase.heap = mem_heap_create(512); + + *found = phrase.found = FALSE; + + error = fts_doc_fetch_by_doc_id( + get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (error != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, "InnoDB: Error: (%s) matching document.\n", + ut_strerr(error)); + } else { + *found = phrase.found; + } + + mem_heap_free(phrase.heap); + + return(error); +} + +/*****************************************************************//** +This function fetches the original documents and count the +words in between matching words to see that is in specified distance +@return DB_SUCCESS if all OK */ +static __attribute__((nonnull, warn_unused_result)) +bool +fts_query_is_in_proximity_range( +/*============================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_match_t** match, /*!< in: query instance */ + fts_proximity_t* qualified_pos) /*!< in: position info for + qualified ranges */ +{ + fts_get_doc_t get_doc; + fts_cache_t* cache = query->index->table->fts->cache; + dberr_t err; + fts_phrase_t phrase; + + memset(&get_doc, 0x0, sizeof(get_doc)); + memset(&phrase, 0x0, sizeof(phrase)); + + rw_lock_x_lock(&cache->lock); + get_doc.index_cache = fts_find_index_cache(cache, query->index); + rw_lock_x_unlock(&cache->lock); + ut_a(get_doc.index_cache != NULL); + + phrase.distance = query->distance; + phrase.charset = get_doc.index_cache->charset; + phrase.zip_size = dict_table_zip_size( + get_doc.index_cache->index->table); + phrase.heap = mem_heap_create(512); + phrase.proximity_pos = 
qualified_pos;
+	phrase.found = FALSE;
+
+	err = fts_doc_fetch_by_doc_id(
+		&get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+		fts_query_fetch_document, &phrase);
+
+	if (err != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Error: (%s) in verification phase of proximity "
+			"search", ut_strerr(err));
+	}
+
+	/* Free the prepared statement. */
+	if (get_doc.get_document_graph) {
+		fts_que_graph_free(get_doc.get_document_graph);
+		get_doc.get_document_graph = NULL;
+	}
+
+	mem_heap_free(phrase.heap);
+
+	return(err == DB_SUCCESS && phrase.found);
+}
+
+/*****************************************************************//**
+Iterate over the matched document ids and search for the
+actual phrase in the text.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_search_phrase(
+/*====================*/
+	fts_query_t*		query,		/*!< in: query instance */
+	ib_vector_t*		orig_tokens,	/*!< in: tokens to search,
+						with any stopwords in the
+						original phrase */
+	ib_vector_t*		tokens)		/*!< in: tokens that do
+						not include stopwords and
+						can be used to calculate
+						the ranking */
+{
+	ulint			i;
+	fts_get_doc_t		get_doc;
+	ulint			n_matched;
+	fts_cache_t*		cache = query->index->table->fts->cache;
+
+	n_matched = ib_vector_size(query->matched);
+
+	/* Setup the doc retrieval infrastructure. */
+	memset(&get_doc, 0x0, sizeof(get_doc));
+
+	rw_lock_x_lock(&cache->lock);
+
+	get_doc.index_cache = fts_find_index_cache(cache, query->index);
+
+	/* Must find the index cache */
+	ut_a(get_doc.index_cache != NULL);
+
+	rw_lock_x_unlock(&cache->lock);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " Start phrase search\n");
+#endif
+
+	/* Read the document from disk and do the actual
+	match, matching documents will be added to the current
+	doc id set. */
+	for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
+		fts_match_t*	match;
+		ibool		found = FALSE;
+
+		match = static_cast<fts_match_t*>(
+			ib_vector_get(query->matched, i));
+
+		/* Skip the document ids that were filtered out by
+		an earlier pass. */
+		if (match->doc_id != 0) {
+
+			query->error = fts_query_match_document(
+				orig_tokens, &get_doc,
+				match, query->distance, &found);
+
+			if (query->error == DB_SUCCESS && found) {
+				ulint	z;
+
+				query->error = fts_query_process_doc_id(
+					query, match->doc_id, 0);
+				if (query->error != DB_SUCCESS) {
+					goto func_exit;
+				}
+
+				for (z = 0; z < ib_vector_size(tokens); z++) {
+					fts_string_t*	token;
+					token = static_cast<fts_string_t*>(
+						ib_vector_get(tokens, z));
+					fts_query_add_word_to_document(
+						query, match->doc_id, token);
+				}
+			}
+		}
+	}
+
+func_exit:
+	/* Free the prepared statement. */
+	if (get_doc.get_document_graph) {
+		fts_que_graph_free(get_doc.get_document_graph);
+		get_doc.get_document_graph = NULL;
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Text/Phrase search. 
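+A phrase search requires the tokens to occur in consecutive positions,
+while a proximity search ("..." @n) only requires them to occur within
+a window of n words of each other.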
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_phrase_search(
+/*====================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	phrase)	/*!< in: token to search */
+{
+	ib_vector_t*		tokens;
+	ib_vector_t*		orig_tokens;
+	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
+	ulint			len = phrase->f_len;
+	ulint			cur_pos = 0;
+	ib_alloc_t*		heap_alloc;
+	ulint			num_token;
+	CHARSET_INFO*		charset;
+
+	charset = query->fts_index_table.charset;
+
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+
+	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
+		query->flags = FTS_PROXIMITY;
+	} else {
+		query->flags = FTS_PHRASE;
+	}
+
+	/* Split the phrase into tokens. */
+	while (cur_pos < len) {
+		fts_cache_t*	cache = query->index->table->fts->cache;
+		ib_rbt_bound_t	parent;
+		ulint		offset;
+		ulint		cur_len;
+		fts_string_t	result_str;
+
+		cur_len = innobase_mysql_fts_get_token(
+			charset,
+			reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
+			reinterpret_cast<const byte*>(phrase->f_str) + len,
+			&result_str, &offset);
+
+		if (cur_len == 0) {
+			break;
+		}
+
+		cur_pos += cur_len;
+
+		if (result_str.f_n_char == 0) {
+			continue;
+		}
+
+		fts_string_t*	token = static_cast<fts_string_t*>(
+			ib_vector_push(tokens, NULL));
+
+		token->f_str = static_cast<byte*>(
+			mem_heap_alloc(heap, result_str.f_len + 1));
+		ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);
+
+		token->f_len = result_str.f_len;
+		token->f_str[token->f_len] = 0;
+
+		if (cache->stopword_info.cached_stopword
+		    && rbt_search(cache->stopword_info.cached_stopword,
+				  &parent, token) != 0
+		    && result_str.f_n_char >= fts_min_token_size
+		    && result_str.f_n_char <= fts_max_token_size) {
+			/* Add the word to the RB tree so that we can
+			calculate its frequency within a document. */
+			fts_query_add_word_freq(query, token);
+		} else {
+			ib_vector_pop(tokens);
+		}
+
+		/* We store all words, including stopwords, in the
+		"orig_tokens" vector, but skip any leading words
+		that are stopwords */
+		if (!ib_vector_is_empty(tokens)) {
+			fts_string_t*	orig_token = static_cast<fts_string_t*>(
+				ib_vector_push(orig_tokens, NULL));
+
+			orig_token->f_str = token->f_str;
+			orig_token->f_len = token->f_len;
+		}
+	}
+
+	num_token = ib_vector_size(tokens);
+	if (num_token > MAX_PROXIMITY_ITEM) {
+		query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
+		goto func_exit;
+	}
+
+	ut_ad(ib_vector_size(orig_tokens) >= num_token);
+
+	/* Ignore empty strings. */
+	if (num_token > 0) {
+		fts_string_t*	token;
+		fts_fetch_t	fetch;
+		trx_t*		trx = query->trx;
+		fts_ast_oper_t	oper = query->oper;
+		que_t*		graph = NULL;
+		ulint		i;
+		dberr_t		error;
+
+		/* Create the vector for storing matching document ids
+		and the positions of the first token of the phrase. 
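+		For a proximity or multi-token phrase query one such
+		vector is created per token (the match_array below), so
+		that the per-token position lists can be merged later.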
*/ + if (!query->matched) { + ib_alloc_t* heap_alloc; + + heap_alloc = ib_heap_allocator_create(heap); + + if (!(query->flags & FTS_PROXIMITY) + && !(query->flags & FTS_PHRASE)) { + query->matched = ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } else { + ut_a(num_token <= MAX_PROXIMITY_ITEM); + query->match_array = + (ib_vector_t**) mem_heap_alloc( + heap, + num_token * + sizeof(query->matched)); + + for (i = 0; i < num_token; i++) { + query->match_array[i] = + ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } + + query->matched = query->match_array[0]; + } + } + + /* Setup the callback args for filtering and consolidating + the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + for (i = 0; i < num_token; i++) { + /* Search for the first word from the phrase. */ + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, i)); + + if (query->flags & FTS_PROXIMITY + || query->flags & FTS_PHRASE) { + query->matched = query->match_array[i]; + } + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, + token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + fts_que_graph_free(graph); + graph = NULL; + + fts_query_cache(query, token); + + if (!(query->flags & FTS_PHRASE) + && !(query->flags & FTS_PROXIMITY)) { + break; + } + + /* If any of the token can't be found, + no need to continue match */ + if (ib_vector_is_empty(query->match_array[i]) + || query->error != DB_SUCCESS) { + goto func_exit; + } + } + + /* Just a single word, no need to fetch the original + documents to do phrase matching */ + if (ib_vector_size(orig_tokens) == 1 + && !ib_vector_is_empty(query->match_array[0])) { + fts_match_t* match; + ulint n_matched; + + n_matched = ib_vector_size(query->match_array[0]); + + for (i = 0; i < n_matched; i++) { + match = static_cast<fts_match_t*>( + ib_vector_get( + query->match_array[0], i)); + + query->error = fts_query_process_doc_id( + query, match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + + fts_query_add_word_to_document( + query, match->doc_id, token); + } + query->oper = oper; + goto func_exit; + } + + /* If we are doing proximity search, verify the distance + between all words, and check they are in specified distance. */ + if (query->flags & FTS_PROXIMITY) { + fts_phrase_or_proximity_search(query, tokens); + } else { + ibool matched; + + /* Phrase Search case: + We filter out the doc ids that don't contain + all the tokens in the phrase. It's cheaper to + search the ilist than bringing the documents in + and then doing a search through the text. Isolated + testing shows this also helps in mitigating disruption + of the buffer cache. */ + matched = fts_phrase_or_proximity_search(query, tokens); + query->matched = query->match_array[0]; + + /* Read the actual text in and search for the phrase. */ + if (matched) { + ut_ad(query->error == DB_SUCCESS); + query->error = fts_query_search_phrase( + query, orig_tokens, tokens); + } + } + + /* Restore original operation. */ + query->oper = oper; + + if (query->error != DB_SUCCESS) { + goto func_exit; + } + } + +func_exit: + mem_heap_free(heap); + + /* Don't need it anymore. */ + query->matched = NULL; + + return(query->error); +} + +/*****************************************************************//** +Find the word and evaluate. 
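+FTS_NONE and the rating operators take the union of the matching doc
+ids, FTS_EXIST ('+') intersects them, and FTS_IGNORE ('-') subtracts
+them from the current result set.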
+@return DB_SUCCESS if all go well */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_query_execute( +/*==============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + switch (query->oper) { + case FTS_NONE: + case FTS_NEGATE: + case FTS_INCR_RATING: + case FTS_DECR_RATING: + query->error = fts_query_union(query, token); + break; + + case FTS_EXIST: + query->error = fts_query_intersect(query, token); + break; + + case FTS_IGNORE: + query->error = fts_query_difference(query, token); + break; + + default: + ut_error; + } + + return(query->error); +} + +/*****************************************************************//** +Create a wildcard string. It's the responsibility of the caller to +free the byte* pointer. It's allocated using ut_malloc(). +@return ptr to allocated memory */ +static +byte* +fts_query_get_token( +/*================*/ + fts_ast_node_t* node, /*!< in: the current sub tree */ + fts_string_t* token) /*!< in: token to create */ +{ + ulint str_len; + byte* new_ptr = NULL; + + str_len = node->term.ptr->len; + + ut_a(node->type == FTS_AST_TERM); + + token->f_len = str_len; + token->f_str = node->term.ptr->str; + + if (node->term.wildcard) { + + token->f_str = static_cast<byte*>(ut_malloc(str_len + 2)); + token->f_len = str_len + 1; + + memcpy(token->f_str, node->term.ptr->str, str_len); + + token->f_str[str_len] = '%'; + token->f_str[token->f_len] = 0; + + new_ptr = token->f_str; + } + + return(new_ptr); +} + +/*****************************************************************//** +Visit every node of the AST. */ +static +dberr_t +fts_query_visitor( +/*==============*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: The root of the current subtree*/ + void* arg) /*!< in: callback arg*/ +{ + byte* ptr; + fts_string_t token; + fts_query_t* query = static_cast<fts_query_t*>(arg); + + ut_a(node); + + token.f_n_char = 0; + + query->oper = oper; + + query->cur_node = node; + + switch (node->type) { + case FTS_AST_TEXT: + token.f_str = node->text.ptr->str; + token.f_len = node->text.ptr->len; + + if (query->oper == FTS_EXIST) { + ut_ad(query->intersection == NULL); + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Set the current proximity distance. */ + query->distance = node->text.distance; + + /* Force collection of doc ids and the positions. */ + query->collect_positions = TRUE; + + query->error = fts_query_phrase_search(query, &token); + + query->collect_positions = FALSE; + + if (query->oper == FTS_EXIST) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + break; + + case FTS_AST_TERM: + token.f_str = node->term.ptr->str; + token.f_len = node->term.ptr->len; + + /* Add the word to our RB tree that will be used to + calculate this terms per document frequency. 
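+		For a wildcard term the frequency is recorded under the
+		search pattern itself rather than under each matching
+		word (see fts_query_read_node()).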
*/ + fts_query_add_word_freq(query, &token); + + ptr = fts_query_get_token(node, &token); + query->error = fts_query_execute(query, &token); + + if (ptr) { + ut_free(ptr); + } + break; + + case FTS_AST_SUBEXP_LIST: + query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg); + break; + + default: + ut_error; + } + + if (query->oper == FTS_EXIST) { + query->multi_exist = true; + } + + return(query->error); +} + +/*****************************************************************//** +Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@return DB_SUCCESS if all well */ +UNIV_INTERN +dberr_t +fts_ast_visit_sub_exp( +/*==================*/ + fts_ast_node_t* node, /*!< in,out: current root node */ + fts_ast_callback visitor, /*!< in: callback function */ + void* arg) /*!< in,out: arg for callback */ +{ + fts_ast_oper_t cur_oper; + fts_query_t* query = static_cast<fts_query_t*>(arg); + ib_rbt_t* parent_doc_ids; + ib_rbt_t* subexpr_doc_ids; + dberr_t error = DB_SUCCESS; + bool will_be_ignored = false; + bool multi_exist; + + ut_a(node->type == FTS_AST_SUBEXP_LIST); + + cur_oper = query->oper; + + /* Save current result set */ + parent_doc_ids = query->doc_ids; + + /* Create new result set to store the sub-expression result. We + will merge this result set with the parent after processing. */ + query->doc_ids = rbt_create(sizeof(fts_ranking_t), + fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + multi_exist = query->multi_exist; + query->multi_exist = false; + /* Process nodes in current sub-expression and store its + result set in query->doc_ids we created above. */ + error = fts_ast_visit(FTS_NONE, node, visitor, + arg, &will_be_ignored); + + /* Reinstate parent node state */ + query->multi_exist = multi_exist; + query->oper = cur_oper; + + /* Merge the sub-expression result with the parent result set. */ + subexpr_doc_ids = query->doc_ids; + query->doc_ids = parent_doc_ids; + if (error == DB_SUCCESS && !rbt_empty(subexpr_doc_ids)) { + error = fts_merge_doc_ids(query, subexpr_doc_ids); + } + + /* Free current result set. Result already merged into parent. */ + fts_query_free_doc_ids(query, subexpr_doc_ids); + + return(error); +} + +#if 0 +/*****************************************************************//*** +Check if the doc id exists in the ilist. +@return TRUE if doc id found */ +static +ulint +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: contains the doc id to + find, we update the word freq if + document found */ + void* data, /*!< in: doc id ilist */ + ulint len) /*!< in: doc id ilist size */ +{ + byte* ptr = data; + doc_id_t doc_id = 0; + ulint decoded = 0; + + /* Decode the ilist and search for selected doc_id. We also + calculate the frequency of the word in the document if found. */ + while (decoded < len && !select->found) { + ulint freq = 0; + ulint min_pos = 0; + ulint last_pos = 0; + ulint pos = fts_decode_vlc(&ptr); + + /* Add the delta. */ + doc_id += pos; + + while (*ptr) { + ++freq; + last_pos += fts_decode_vlc(&ptr); + + /* Only if min_pos is not set and the current + term exists in a position greater than the + min_pos of the previous term. */ + if (min_pos == 0 && last_pos > select->min_pos) { + min_pos = last_pos; + } + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far. 
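+		The ilist is a sequence of <doc id delta><position
+		deltas><0> entries, all variable-length encoded; e.g.
+		doc ids 7 and 10 are stored as the deltas 7 and 3.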
*/ + decoded = ptr - (byte*) data; + + /* A word may exist in the document but we only consider a + match if it exists in a position that is greater than the + position of the previous term. */ + if (doc_id == select->doc_id && min_pos > 0) { + fts_doc_freq_t* doc_freq; + + /* Add the doc id to the doc freq rb tree, if + the doc id doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq( + select->word_freq->doc_freqs, doc_id); + + /* Avoid duplicating the frequency tally */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + select->found = TRUE; + select->min_pos = min_pos; + } + } + + return(select->found); +} +#endif + +/*****************************************************************//** +Read and filter nodes. +@return DB_SUCCESS if all go well, +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count) /*!< in: whether to remember doc count */ +{ + byte* ptr = static_cast<byte*>(data); + doc_id_t doc_id = 0; + ulint decoded = 0; + ib_rbt_t* doc_freqs = word_freq->doc_freqs; + + /* Decode the ilist and add the doc ids to the query doc_id set. */ + while (decoded < len) { + ulint freq = 0; + fts_doc_freq_t* doc_freq; + fts_match_t* match = NULL; + ulint last_pos = 0; + ulint pos = fts_decode_vlc(&ptr); + + /* Some sanity checks. */ + if (doc_id == 0) { + ut_a(pos == node->first_doc_id); + } + + /* Add the delta. */ + doc_id += pos; + + if (calc_doc_count) { + word_freq->doc_count++; + } + + /* We simply collect the matching instances here. */ + if (query->collect_positions) { + ib_alloc_t* heap_alloc; + + /* Create a new fts_match_t instance. */ + match = static_cast<fts_match_t*>( + ib_vector_push(query->matched, NULL)); + + match->start = 0; + match->doc_id = doc_id; + heap_alloc = ib_vector_allocator(query->matched); + + /* Allocate from the same heap as the + parent container. */ + match->positions = ib_vector_create( + heap_alloc, sizeof(ulint), 64); + + query->total_size += sizeof(fts_match_t) + + sizeof(ib_vector_t) + + sizeof(ulint) * 64; + } + + /* Unpack the positions within the document. */ + while (*ptr) { + last_pos += fts_decode_vlc(&ptr); + + /* Collect the matching word positions, for phrase + matching later. */ + if (query->collect_positions) { + ib_vector_push(match->positions, &last_pos); + } + + ++freq; + } + + /* End of list marker. */ + last_pos = (ulint) -1; + + if (query->collect_positions) { + ut_a(match != NULL); + ib_vector_push(match->positions, &last_pos); + } + + /* Add the doc id to the doc freq rb tree, if the doc id + doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id); + + /* Avoid duplicating frequency tally. */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far */ + decoded = ptr - (byte*) data; + + /* We simply collect the matching documents and the + positions here and match later. */ + if (!query->collect_positions) { + /* We ignore error here and will check it later */ + fts_query_process_doc_id(query, doc_id, 0); + + /* Add the word to the document's matched RB tree. 
*/
+			fts_query_add_word_to_document(query, doc_id, word);
+		}
+	}
+
+	/* Some sanity checks. */
+	ut_a(doc_id == node->last_doc_id);
+
+	if (query->total_size > fts_result_cache_limit) {
+		return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+	} else {
+		return(DB_SUCCESS);
+	}
+}
+
+/*****************************************************************//**
+Read the FTS INDEX row.
+@return DB_SUCCESS if all go well. */
+static
+dberr_t
+fts_query_read_node(
+/*================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	word,	/*!< in: current word */
+	que_node_t*		exp)	/*!< in: query graph node */
+{
+	int			i;
+	int			ret;
+	fts_node_t		node;
+	ib_rbt_bound_t		parent;
+	fts_word_freq_t*	word_freq;
+	ibool			skip = FALSE;
+	fts_string_t		term;
+	byte			buf[FTS_MAX_WORD_LEN + 1];
+	dberr_t			error = DB_SUCCESS;
+
+	ut_a(query->cur_node->type == FTS_AST_TERM ||
+	     query->cur_node->type == FTS_AST_TEXT);
+
+	memset(&node, 0, sizeof(node));
+	term.f_str = buf;
+
+	/* We need to consider the wildcard search case: the word
+	frequency is tracked for the search string, not the actual
+	matched word, so record the frequency on behalf of the
+	search string. */
+	if (query->cur_node->type == FTS_AST_TERM
+	    && query->cur_node->term.wildcard) {
+		term.f_len = query->cur_node->term.ptr->len;
+		ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
+		memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
+	} else {
+		term.f_len = word->f_len;
+		ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
+		memcpy(term.f_str, word->f_str, word->f_len);
+	}
+
+	/* Lookup the word in our rb tree, it must exist. */
+	ret = rbt_search(query->word_freqs, &parent, &term);
+
+	ut_a(ret == 0);
+
+	word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+	/* Start from 1 since the first column has been read by the caller.
+	Also, we rely on the order of the columns projected to filter
+	out ilists that are out of range, and we always want to read
+	the doc_count irrespective of the suitability of the row. */
+
+	for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+			dfield_get_data(dfield));
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT. */
+
+		switch (i) {
+		case 1: /* DOC_COUNT */
+			word_freq->doc_count += mach_read_from_4(data);
+			break;
+
+		case 2: /* FIRST_DOC_ID */
+			node.first_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out of range. */
+			if (query->oper == FTS_EXIST
+			    && query->upper_doc_id > 0
+			    && node.first_doc_id > query->upper_doc_id) {
+				skip = TRUE;
+			}
+			break;
+
+		case 3: /* LAST_DOC_ID */
+			node.last_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out of range. */
+			if (query->oper == FTS_EXIST
+			    && query->lower_doc_id > 0
+			    && node.last_doc_id < query->lower_doc_id) {
+				skip = TRUE;
+			}
+			break;
+
+		case 4: /* ILIST */
+
+			error = fts_query_filter_doc_ids(
+				query, &word_freq->word, word_freq,
+				&node, data, len, FALSE);
+
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	if (!skip) {
+		/* Make sure all columns were read. */
+
+		ut_a(i == 5);
+	}
+
+	return error;
+}
+
+/*****************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record. 
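+The select list is expected to project word, doc_count, first_doc_id,
+last_doc_id and ilist, in that order; the word is consumed here and
+the remaining columns in fts_query_read_node().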
+@return always returns TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
+{
+	fts_string_t	key;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	fts_query_t*	query = static_cast<fts_query_t*>(fetch->read_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+	ulint		dfield_len = dfield_get_len(dfield);
+
+	key.f_str = static_cast<byte*>(data);
+	key.f_len = dfield_len;
+
+	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+	/* Note: we pass the error out via 'query->error' */
+	query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
+
+	if (query->error != DB_SUCCESS) {
+		ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+		return(FALSE);
+	} else {
+		return(TRUE);
+	}
+}
+
+/*****************************************************************//**
+Calculate the inverse document frequency (IDF) for all the terms. */
+static
+void
+fts_query_calculate_idf(
+/*====================*/
+	fts_query_t*	query)	/*!< in: Query state */
+{
+	const ib_rbt_node_t*	node;
+	ib_uint64_t		total_docs = query->total_docs;
+
+	/* Iterate over all the terms and calculate their inverse
+	document frequency. */
+	for (node = rbt_first(query->word_freqs);
+	     node;
+	     node = rbt_next(query->word_freqs, node)) {
+
+		fts_word_freq_t*	word_freq;
+
+		word_freq = rbt_value(fts_word_freq_t, node);
+
+		if (word_freq->doc_count > 0) {
+			if (total_docs == word_freq->doc_count) {
+				/* The query processor assumes that
+				ranking > 0 when there is a match.
+				Since log10(1) = 0, we cannot let the
+				IDF be zero when a word appears in
+				every document, so make it an
+				arbitrary, very small number. */
+				word_freq->idf = log10(1.0001);
+			} else {
+				word_freq->idf = log10(
+					total_docs
+					/ (double) word_freq->doc_count);
+			}
+		}
+
+		if (fts_enable_diag_print) {
+			fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
+				" %6.5lf\n",
+				word_freq->word.f_str,
+				query->total_docs, word_freq->doc_count,
+				word_freq->idf);
+		}
+	}
+}
+
+/*****************************************************************//**
+Calculate the ranking of the document. */
+static
+void
+fts_query_calculate_ranking(
+/*========================*/
+	const fts_query_t*	query,		/*!< in: query state */
+	fts_ranking_t*		ranking)	/*!< in: Document to rank */
+{
+	ulint		pos = 0;
+	fts_string_t	word;
+
+	/* At this stage, ranking->rank should not exceed the 1.0
+	bound */
+	ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
+	ut_ad(rbt_size(query->word_map) == query->word_vector->size());
+
+	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+		int			ret;
+		ib_rbt_bound_t		parent;
+		double			weight;
+		fts_doc_freq_t*		doc_freq;
+		fts_word_freq_t*	word_freq;
+
+		ret = rbt_search(query->word_freqs, &parent, &word);
+
+		/* It must exist. */
+		ut_a(ret == 0);
+
+		word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+		ret = rbt_search(
+			word_freq->doc_freqs, &parent, &ranking->doc_id);
+
+		/* It must exist. */
+		ut_a(ret == 0);
+
+		doc_freq = rbt_value(fts_doc_freq_t, parent.last);
+
+		weight = (double) doc_freq->freq * word_freq->idf;
+
+		ranking->rank += (fts_rank_t) (weight * word_freq->idf);
+	}
+}
+
+/*****************************************************************//**
+Add ranking to the result set. 
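+If the document already has an entry its rank is simply accumulated;
+otherwise a new node is added to the ranking tree.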
*/ +static +void +fts_query_add_ranking( +/*==================*/ + fts_query_t* query, /*!< in: query state */ + ib_rbt_t* ranking_tree, /*!< in: ranking tree */ + const fts_ranking_t* new_ranking) /*!< in: ranking of a document */ +{ + ib_rbt_bound_t parent; + + /* Lookup the ranking in our rb tree and add if it doesn't exist. */ + if (rbt_search(ranking_tree, &parent, new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + ranking->rank += new_ranking->rank; + + ut_a(ranking->words == NULL); + } else { + rbt_add_node(ranking_tree, &parent, new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } +} + +/*****************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +@return the relevance ranking value, 0 if no ranking value +present. */ +float +fts_retrieve_ranking( +/*=================*/ + fts_result_t* result, /*!< in: FTS result structure */ + doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */ +{ + ib_rbt_bound_t parent; + fts_ranking_t new_ranking; + + if (!result || !result->rankings_by_id) { + return(0); + } + + new_ranking.doc_id = doc_id; + + /* Lookup the ranking in our rb tree */ + if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + return(ranking->rank); + } + + return(0); +} + +/*****************************************************************//** +Create the result and copy the data to it. */ +static +fts_result_t* +fts_query_prepare_result( +/*=====================*/ + fts_query_t* query, /*!< in: Query state */ + fts_result_t* result) /*!< in: result this can contain + data from a previous search on + another FTS index */ +{ + const ib_rbt_node_t* node; + bool result_is_null = false; + + if (result == NULL) { + result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result))); + + memset(result, 0x0, sizeof(*result)); + + result->rankings_by_id = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE; + result_is_null = true; + } + + if (query->flags == FTS_OPT_RANKING) { + fts_word_freq_t* word_freq; + ulint size = ib_vector_size(query->deleted->doc_ids); + fts_update_t* array = + (fts_update_t*) query->deleted->doc_ids->data; + + node = rbt_first(query->word_freqs); + ut_ad(node); + word_freq = rbt_value(fts_word_freq_t, node); + + for (node = rbt_first(word_freq->doc_freqs); + node; + node = rbt_next(word_freq->doc_freqs, node)) { + fts_doc_freq_t* doc_freq; + fts_ranking_t ranking; + + doc_freq = rbt_value(fts_doc_freq_t, node); + + /* Don't put deleted docs into result */ + if (fts_bsearch(array, 0, static_cast<int>(size), + doc_freq->doc_id) >= 0) { + /* one less matching doc count */ + --word_freq->doc_count; + continue; + } + + ranking.doc_id = doc_freq->doc_id; + ranking.rank = static_cast<fts_rank_t>(doc_freq->freq); + ranking.words = NULL; + + fts_query_add_ranking(query, result->rankings_by_id, + &ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + return(NULL); + } + } + + /* Calculate IDF only after we exclude the deleted items */ + fts_query_calculate_idf(query); + + node = rbt_first(query->word_freqs); + word_freq = rbt_value(fts_word_freq_t, node); + + /* Calculate the ranking for each doc */ + for (node = 
rbt_first(result->rankings_by_id); + node != NULL; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ranking->rank = static_cast<fts_rank_t>( + ranking->rank * word_freq->idf * word_freq->idf); + } + + return(result); + } + + ut_a(rbt_size(query->doc_ids) > 0); + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + fts_query_calculate_ranking(query, ranking); + + // FIXME: I think we may requre this information to improve the + // ranking of doc ids which have more word matches from + // different FTS indexes. + + /* We don't need these anymore free the resources. */ + ranking->words = NULL; + + if (!result_is_null) { + fts_query_add_ranking(query, result->rankings_by_id, ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + return(NULL); + } + } + } + + if (result_is_null) { + /* Use doc_ids directly */ + rbt_free(result->rankings_by_id); + result->rankings_by_id = query->doc_ids; + query->doc_ids = NULL; + } + + return(result); +} + +/*****************************************************************//** +Get the result of the query. Calculate the similarity coefficient. */ +static +fts_result_t* +fts_query_get_result( +/*=================*/ + fts_query_t* query, /*!< in: query instance */ + fts_result_t* result) /*!< in: result */ +{ + if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) { + /* Copy the doc ids to the result. */ + result = fts_query_prepare_result(query, result); + } else { + /* Create an empty result instance. */ + result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result))); + memset(result, 0, sizeof(*result)); + } + + return(result); +} + +/*****************************************************************//** +FTS Query free resources and reset. */ +static +void +fts_query_free( +/*===========*/ + fts_query_t* query) /*!< in: query instance to free*/ +{ + + if (query->read_nodes_graph) { + fts_que_graph_free(query->read_nodes_graph); + } + + if (query->root) { + fts_ast_free_node(query->root); + } + + if (query->deleted) { + fts_doc_ids_free(query->deleted); + } + + if (query->doc_ids) { + fts_query_free_doc_ids(query, query->doc_ids); + } + + if (query->word_freqs) { + const ib_rbt_node_t* node; + + /* We need to free any instances of fts_doc_freq_t that we + may have allocated. */ + for (node = rbt_first(query->word_freqs); + node; + node = rbt_next(query->word_freqs, node)) { + + fts_word_freq_t* word_freq; + + word_freq = rbt_value(fts_word_freq_t, node); + + /* We need to cast away the const. */ + rbt_free(word_freq->doc_freqs); + } + + rbt_free(query->word_freqs); + } + + ut_a(!query->intersection); + + if (query->word_map) { + rbt_free(query->word_map); + } + + if (query->word_vector) { + delete query->word_vector; + } + + if (query->heap) { + mem_heap_free(query->heap); + } + + memset(query, 0, sizeof(*query)); +} + +/*****************************************************************//** +Parse the query using flex/bison. 
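+The scanner is selected by the mode flag: natural language mode treats
+the string as a plain list of terms, while boolean mode also recognizes
+operators such as '+', '-', '~', '<', '>', '*' and quoted phrases.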
*/
+static
+fts_ast_node_t*
+fts_query_parse(
+/*============*/
+	fts_query_t*	query,		/*!< in: query instance */
+	byte*		query_str,	/*!< in: query string */
+	ulint		query_len)	/*!< in: query string length */
+{
+	int		error;
+	fts_ast_state_t state;
+	bool		mode = query->boolean_mode;
+
+	memset(&state, 0x0, sizeof(state));
+
+	/* Setup the scanner to use, this depends on the mode flag. */
+	state.lexer = fts_lexer_create(mode, query_str, query_len);
+	state.charset = query->fts_index_table.charset;
+	error = fts_parse(&state);
+	fts_lexer_free(state.lexer);
+	state.lexer = NULL;
+
+	/* Error during parsing? */
+	if (error) {
+		/* Free the nodes that were allocated during parsing. */
+		fts_ast_state_free(&state);
+	} else {
+		query->root = state.root;
+	}
+
+	return(state.root);
+}
+
+/*******************************************************************//**
+FTS Query optimization
+Set FTS_OPT_RANKING if it is a simple term query */
+static
+void
+fts_query_can_optimize(
+/*===================*/
+	fts_query_t*	query,		/*!< in/out: query instance */
+	uint		flags)		/*!< In: FTS search mode */
+{
+	fts_ast_node_t*	node = query->root;
+
+	if (flags & FTS_EXPAND) {
+		return;
+	}
+
+	/* Check if the query consists of a single term with no
+	operator */
+	ut_ad(node->type == FTS_AST_LIST);
+	node = node->list.head;
+	if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
+		query->flags = FTS_OPT_RANKING;
+	}
+}
+
+/*******************************************************************//**
+Pre-process the query string
+1) make it lower case
+2) in boolean mode, if there is a '-' or '+' that is immediately
+preceded and followed by a valid word, replace it with a space
+@return the processed string */
+static
+byte*
+fts_query_str_preprocess(
+/*=====================*/
+	const byte*	query_str,	/*!< in: FTS query */
+	ulint		query_len,	/*!< in: FTS query string len */
+	ulint		*result_len,	/*!< out: result string length */
+	CHARSET_INFO*	charset,	/*!< in: string charset */
+	bool		boolean_mode)	/*!< in: is boolean mode */
+{
+	ulint	cur_pos = 0;
+	ulint	str_len;
+	byte*	str_ptr;
+	bool	in_phrase = false;
+
+	/* Convert the query string to lower case before parsing. We own
+	the ut_malloc'ed result and so remember to free it before return. */
+
+	str_len = query_len * charset->casedn_multiply + 1;
+	str_ptr = static_cast<byte*>(ut_malloc(str_len));
+
+	*result_len = innobase_fts_casedn_str(
+		charset, const_cast<char*>(reinterpret_cast<const char*>(
+			query_str)), query_len,
+		reinterpret_cast<char*>(str_ptr), str_len);
+
+	ut_ad(*result_len < str_len);
+
+	str_ptr[*result_len] = 0;
+
+	/* If it is not boolean mode, there is no need to check
+	for '-/+' */
+	if (!boolean_mode) {
+		return(str_ptr);
+	}
+
+	/* Otherwise, we traverse the string to find any '-/+' that is
+	immediately preceded and followed by a valid search word.
+	NOTE: we should not do so for CJK languages, this should
+	be taken care of in our CJK implementation */
+	while (cur_pos < *result_len) {
+		fts_string_t	str;
+		ulint		offset;
+		ulint		cur_len;
+
+		cur_len = innobase_mysql_fts_get_token(
+			charset, str_ptr + cur_pos, str_ptr + *result_len,
+			&str, &offset);
+
+		if (cur_len == 0 || str.f_str == NULL) {
+			/* No valid word found */
+			break;
+		}
+
+		/* Check if we are inside a phrase; if so, there is no
+		need to replace '-/+'. 
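+	For example, in state-of-the-art the embedded '-' signs are
+	rewritten to spaces so the words are parsed as ordinary terms,
+	while a leading '-' as in '-apple' is kept as an operator.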
*/ + for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) { + if ((char) (*ptr) == '"' ) { + in_phrase = !in_phrase; + } + } + + /* Find those are not leading '-/+' and also not in a phrase */ + if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1 + && !in_phrase) { + char* last_op = reinterpret_cast<char*>( + str_ptr + cur_pos); + + if (*last_op == '-' || *last_op == '+') { + *last_op = ' '; + } + } + + cur_pos += cur_len; + } + + return(str_ptr); +} + +/*******************************************************************//** +FTS Query entry point. +@return DB_SUCCESS if successful otherwise error code */ +UNIV_INTERN +dberr_t +fts_query( +/*======*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: The FTS index to search */ + uint flags, /*!< in: FTS search mode */ + const byte* query_str, /*!< in: FTS query */ + ulint query_len, /*!< in: FTS query string len + in bytes */ + fts_result_t** result) /*!< in/out: result doc ids */ +{ + fts_query_t query; + dberr_t error = DB_SUCCESS; + byte* lc_query_str; + ulint result_len; + bool boolean_mode; + trx_t* query_trx; + CHARSET_INFO* charset; + ulint start_time_ms; + bool will_be_ignored = false; + + boolean_mode = flags & FTS_BOOL; + + *result = NULL; + memset(&query, 0x0, sizeof(query)); + query_trx = trx_allocate_for_background(); + query_trx->op_info = "FTS query"; + + start_time_ms = ut_time_ms(); + + query.trx = query_trx; + query.index = index; + query.boolean_mode = boolean_mode; + query.deleted = fts_doc_ids_create(); + query.cur_node = NULL; + + query.fts_common_table.type = FTS_COMMON_TABLE; + query.fts_common_table.table_id = index->table->id; + query.fts_common_table.parent = index->table->name; + query.fts_common_table.table = index->table; + + charset = fts_index_get_charset(index); + + query.fts_index_table.type = FTS_INDEX_TABLE; + query.fts_index_table.index_id = index->id; + query.fts_index_table.table_id = index->table->id; + query.fts_index_table.parent = index->table->name; + query.fts_index_table.charset = charset; + query.fts_index_table.table = index->table; + + query.word_map = rbt_create_arg_cmp( + sizeof(fts_string_t), innobase_fts_text_cmp, charset); + query.word_vector = new word_vector_t; + query.error = DB_SUCCESS; + + /* Setup the RB tree that will be used to collect per term + statistics. */ + query.word_freqs = rbt_create_arg_cmp( + sizeof(fts_word_freq_t), innobase_fts_text_cmp, charset); + + query.total_size += SIZEOF_RBT_CREATE; + + query.total_docs = dict_table_get_n_rows(index->table); + +#ifdef FTS_DOC_STATS_DEBUG + if (ft_enable_diag_print) { + error = fts_get_total_word_count( + trx, query.index, &query.total_words); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n", + query.total_docs, query.total_words); + } +#endif /* FTS_DOC_STATS_DEBUG */ + + query.fts_common_table.suffix = "DELETED"; + + /* Read the deleted doc_ids, we need these for filtering. */ + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + query.fts_common_table.suffix = "DELETED_CACHE"; + + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* Get the deleted doc ids that are in the cache. 
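+	These are appended to the ids already read from the DELETED and
+	DELETED_CACHE tables, so that a single sorted vector can be
+	binary searched when filtering the query results.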
*/ + fts_cache_append_deleted_doc_ids( + index->table->fts->cache, query.deleted->doc_ids); + DEBUG_SYNC_C("fts_deleted_doc_ids_append"); + + /* Sort the vector so that we can do a binary search over the ids. */ + ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp); + +#if 0 + /* Convert the query string to lower case before parsing. We own + the ut_malloc'ed result and so remember to free it before return. */ + + lc_query_str_len = query_len * charset->casedn_multiply + 1; + lc_query_str = static_cast<byte*>(ut_malloc(lc_query_str_len)); + + result_len = innobase_fts_casedn_str( + charset, (char*) query_str, query_len, + (char*) lc_query_str, lc_query_str_len); + + ut_ad(result_len < lc_query_str_len); + + lc_query_str[result_len] = 0; + +#endif + + lc_query_str = fts_query_str_preprocess( + query_str, query_len, &result_len, charset, boolean_mode); + + query.heap = mem_heap_create(128); + + /* Create the rb tree for the doc id (current) set. */ + query.doc_ids = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query.total_size += SIZEOF_RBT_CREATE; + + /* Parse the input query string. */ + if (fts_query_parse(&query, lc_query_str, result_len)) { + fts_ast_node_t* ast = query.root; + + /* Optimize query to check if it's a single term */ + fts_query_can_optimize(&query, flags); + + DBUG_EXECUTE_IF("fts_instrument_result_cache_limit", + fts_result_cache_limit = 2048; + ); + + /* Traverse the Abstract Syntax Tree (AST) and execute + the query. */ + query.error = fts_ast_visit( + FTS_NONE, ast, fts_query_visitor, + &query, &will_be_ignored); + + /* If query expansion is requested, extend the search + with first search pass result */ + if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) { + query.error = fts_expand_query(index, &query); + } + + /* Calculate the inverse document frequency of the terms. */ + if (query.error == DB_SUCCESS + && query.flags != FTS_OPT_RANKING) { + fts_query_calculate_idf(&query); + } + + /* Copy the result from the query state, so that we can + return it to the caller. */ + if (query.error == DB_SUCCESS) { + *result = fts_query_get_result(&query, *result); + } + + error = query.error; + } else { + /* still return an empty result set */ + *result = static_cast<fts_result_t*>( + ut_malloc(sizeof(**result))); + memset(*result, 0, sizeof(**result)); + } + + ut_free(lc_query_str); + + if (fts_enable_diag_print && (*result)) { + ulint diff_time = ut_time_ms() - start_time_ms; + fprintf(stderr, "FTS Search Processing time: %ld secs:" + " %ld millisec: row(s) %d \n", + diff_time / 1000, diff_time % 1000, + (*result)->rankings_by_id + ? (int) rbt_size((*result)->rankings_by_id) + : -1); + + /* Log memory consumption & result size */ + ib_logf(IB_LOG_LEVEL_INFO, + "Full Search Memory: " + "%lu (bytes), Row: %lu .", + query.total_size, + (*result)->rankings_by_id + ? rbt_size((*result)->rankings_by_id) + : 0); + } + +func_exit: + fts_query_free(&query); + + trx_free_for_background(query_trx); + + return(error); +} + +/*****************************************************************//** +FTS Query free result, returned by fts_query(). 
*/ + +void +fts_query_free_result( +/*==================*/ + fts_result_t* result) /*!< in: result instance to free.*/ +{ + if (result) { + if (result->rankings_by_id != NULL) { + rbt_free(result->rankings_by_id); + result->rankings_by_id = NULL; + } + if (result->rankings_by_rank != NULL) { + rbt_free(result->rankings_by_rank); + result->rankings_by_rank = NULL; + } + + ut_free(result); + result = NULL; + } +} + +/*****************************************************************//** +FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */ + +void +fts_query_sort_result_on_rank( +/*==========================*/ + fts_result_t* result) /*!< out: result instance to sort.*/ +{ + const ib_rbt_node_t* node; + ib_rbt_t* ranked; + + ut_a(result->rankings_by_id != NULL); + if (result->rankings_by_rank) { + rbt_free(result->rankings_by_rank); + } + + ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank); + + /* We need to free any instances of fts_doc_freq_t that we + may have allocated. */ + for (node = rbt_first(result->rankings_by_id); + node; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ut_a(ranking->words == NULL); + + rbt_insert(ranked, ranking, ranking); + } + + /* Reset the current node too. */ + result->current = NULL; + result->rankings_by_rank = ranked; +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +A debug function to print result doc_id set. */ +static +void +fts_print_doc_id( +/*=============*/ + fts_query_t* query) /*!< in : tree that stores doc_ids.*/ +{ + const ib_rbt_node_t* node; + + /* Iterate each member of the doc_id set */ + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + fts_ranking_t* ranking; + ranking = rbt_value(fts_ranking_t, node); + + ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, doc_id: %ld \n", + (ulint) ranking->doc_id); + + ulint pos = 0; + fts_string_t word; + + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, value: %s \n", word.f_str); + } + } +} +#endif + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. 
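+For example, a search for "database" may pull in co-occurring words
+such as "innodb" from the first-pass documents; the query is then
+re-run as a union over the enlarged word list.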
+@return DB_SUCCESS if success, otherwise the error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: FTS query instance */ +{ + const ib_rbt_node_t* node; + const ib_rbt_node_t* token_node; + fts_doc_t result_doc; + dberr_t error = DB_SUCCESS; + const fts_index_cache_t*index_cache; + + /* If no doc is found in first search pass, return */ + if (!rbt_size(query->doc_ids)) { + return(error); + } + + /* Init "result_doc", to hold words from the first search pass */ + fts_doc_init(&result_doc); + + rw_lock_x_lock(&index->table->fts->cache->lock); + index_cache = fts_find_index_cache(index->table->fts->cache, index); + rw_lock_x_unlock(&index->table->fts->cache->lock); + + ut_a(index_cache); + + result_doc.tokens = rbt_create_arg_cmp( + sizeof(fts_token_t), innobase_fts_text_cmp, + index_cache->charset); + + result_doc.charset = index_cache->charset; + + query->total_size += SIZEOF_RBT_CREATE; +#ifdef UNIV_DEBUG + fts_print_doc_id(query); +#endif + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + ulint pos; + fts_string_t word; + ulint prev_token_size; + ulint estimate_size; + + prev_token_size = rbt_size(result_doc.tokens); + + ranking = rbt_value(fts_ranking_t, node); + + /* Fetch the documents with the doc_id from the + result of first seach pass. Since we do not + store document-to-word mapping, we need to + fetch the original document and parse them. + Future optimization could be done here if we + support some forms of document-to-word mapping */ + fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index, + FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_expansion_fetch_doc, + &result_doc); + + /* Remove words that have already been searched in the + first pass */ + pos = 0; + while (fts_ranking_words_get_next(query, ranking, &pos, + &word)) { + ibool ret; + + ret = rbt_delete(result_doc.tokens, &word); + + /* The word must exist in the doc we found */ + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, "Did not " + "find word %s in doc %ld for query " + "expansion search.\n", word.f_str, + (ulint) ranking->doc_id); + } + } + + /* Estimate memory used, see fts_process_token and fts_token_t. + We ignore token size here. */ + estimate_size = (rbt_size(result_doc.tokens) - prev_token_size) + * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t) + + sizeof(ib_vector_t) + sizeof(ulint) * 32); + query->total_size += estimate_size; + + if (query->total_size > fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + goto func_exit; + } + } + + /* Search the table the second time with expanded search list */ + for (token_node = rbt_first(result_doc.tokens); + token_node; + token_node = rbt_next(result_doc.tokens, token_node)) { + fts_token_t* mytoken; + mytoken = rbt_value(fts_token_t, token_node); + + ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0); + fts_query_add_word_freq(query, &mytoken->text); + error = fts_query_union(query, &mytoken->text); + + if (error != DB_SUCCESS) { + break; + } + } + +func_exit: + fts_doc_free(&result_doc); + + return(error); +} +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. 
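+The per-token match lists are advanced in lockstep, similar to a
+multi-way merge, keeping only the doc ids that appear in every list.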
+@return TRUE if documents are found, FALSE if otherwise */
+static
+ibool
+fts_phrase_or_proximity_search(
+/*===========================*/
+	fts_query_t*	query,		/*!< in/out: query instance.
+					query->doc_ids might be instantiated
+					with qualified doc IDs */
+	ib_vector_t*	tokens)		/*!< in: Tokens contain words */
+{
+	ulint		n_matched;
+	ulint		i;
+	ibool		matched = FALSE;
+	ulint		num_token = ib_vector_size(tokens);
+	fts_match_t*	match[MAX_PROXIMITY_ITEM];
+	ibool		end_list = FALSE;
+
+	/* Number of matched documents for the first token */
+	n_matched = ib_vector_size(query->match_array[0]);
+
+	/* We have a match list for each word; walk through the
+	lists and find the common documents that contain all the
+	matching words. */
+	for (i = 0; i < n_matched; i++) {
+		ulint		j;
+		ulint		k = 0;
+		fts_proximity_t	qualified_pos;
+
+		match[0] = static_cast<fts_match_t*>(
+			ib_vector_get(query->match_array[0], i));
+
+		/* For the remaining tokens' (words') match lists, we
+		try to see if there is a document with the same
+		doc id */
+		for (j = 1; j < num_token; j++) {
+			match[j] = static_cast<fts_match_t*>(
+				ib_vector_get(query->match_array[j], k));
+
+			while (match[j]->doc_id < match[0]->doc_id
+			       && k < ib_vector_size(query->match_array[j])) {
+				match[j] = static_cast<fts_match_t*>(
+					ib_vector_get(
+						query->match_array[j], k));
+				k++;
+			}
+
+			if (match[j]->doc_id > match[0]->doc_id) {
+				/* no match */
+				if (query->flags & FTS_PHRASE) {
+					match[0]->doc_id = 0;
+				}
+				break;
+			}
+
+			if (k == ib_vector_size(query->match_array[j])) {
+				end_list = TRUE;
+
+				if (match[j]->doc_id != match[0]->doc_id) {
+					/* no match */
+					if (query->flags & FTS_PHRASE) {
+						ulint	s;
+
+						match[0]->doc_id = 0;
+
+						for (s = i + 1; s < n_matched;
+						     s++) {
+							match[0] = static_cast<
+							fts_match_t*>(
+							ib_vector_get(
+							query->match_array[0],
+							s));
+							match[0]->doc_id = 0;
+						}
+					}
+
+					goto func_exit;
+				}
+			}
+
+			/* FIXME: A better solution would be a counter array
+			that remembers each run's last position, so we do
+			not have to reset it here every time */
+			k = 0;
+		}
+
+		if (j != num_token) {
+			continue;
+		}
+
+		/* For this matching doc, we need to further
+		verify whether the words in the doc are close
+		to each other, and within the distance specified
+		in the proximity search */
+		if (query->flags & FTS_PHRASE) {
+			matched = TRUE;
+		} else if (fts_proximity_get_positions(
+				match, num_token, ULINT_MAX, &qualified_pos)) {
+
+			/* Fetch the original documents and count the
+			words between the matching words to see whether
+			they are within the specified distance */
+			if (fts_query_is_in_proximity_range(
+					query, match, &qualified_pos)) {
+				/* If so, mark that we found a matching doc */
+				query->error = fts_query_process_doc_id(
+					query, match[0]->doc_id, 0);
+				if (query->error != DB_SUCCESS) {
+					matched = FALSE;
+					goto func_exit;
+				}
+
+				matched = TRUE;
+				for (ulint z = 0; z < num_token; z++) {
+					fts_string_t*	token;
+					token = static_cast<fts_string_t*>(
+						ib_vector_get(tokens, z));
+					fts_query_add_word_to_document(
+						query, match[0]->doc_id, token);
+				}
+			}
+		}
+
+		if (end_list) {
+			break;
+		}
+	}
+
+func_exit:
+	return(matched);
+}
+
+/*************************************************************//**
+This function checks whether words in result documents are close to
+each other (within the proximity range specified by "distance").
+If "distance" is ULINT_MAX, it will find all combinations of
+positions of the matching words and store the min and max positions
+in "qualified_pos" for later verification.
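+
+A worked example (hypothetical positions, distance = 5): with word A at
+positions {3, 17} and word B at positions {6, 30}, the first window is
+(3, 6) with max - min = 3 <= 5, so it is recorded. The walk then always
+advances the list holding the smallest position, and the later windows
+(6, 17) and (17, 30) both exceed the distance, so exactly one qualifying
+range is returned.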
+@return true if words are close to each other, false if otherwise */
+static
+bool
+fts_proximity_get_positions(
+/*========================*/
+	fts_match_t**		match,		/*!< in: matching items to
+						check */
+	ulint			num_match,	/*!< in: number of matching
+						items */
+	ulint			distance,	/*!< in: distance value
+						for proximity search */
+	fts_proximity_t*	qualified_pos)	/*!< out: the position info
+						records ranges containing
+						all matching words. */
+{
+	ulint	i;
+	ulint	idx[MAX_PROXIMITY_ITEM];
+	ulint	num_pos[MAX_PROXIMITY_ITEM];
+	ulint	min_idx;
+
+	qualified_pos->n_pos = 0;
+
+	ut_a(num_match <= MAX_PROXIMITY_ITEM);
+
+	/* Each word can appear multiple times in a doc, so we
+	need to walk through each word's position list and find the
+	closest distance between different words to see if they
+	are within the proximity distance. */
+
+	/* Since each word's position list is sorted, we just walk
+	through all the words' lists, similar to the merge phase
+	of a merge sort */
+	for (i = 0; i < num_match; i++) {
+		/* idx is the current position we are checking
+		for a particular word */
+		idx[i] = 0;
+
+		/* Number of positions for this word */
+		num_pos[i] = ib_vector_size(match[i]->positions);
+	}
+
+	/* Start with the first word */
+	min_idx = 0;
+
+	while (idx[min_idx] < num_pos[min_idx]) {
+		ulint	position[MAX_PROXIMITY_ITEM];
+		ulint	min_pos = ULINT_MAX;
+		ulint	max_pos = 0;
+
+		/* Check the positions in each word's position list,
+		and record the max/min position */
+		for (i = 0; i < num_match; i++) {
+			position[i] = *(ulint*) ib_vector_get_const(
+				match[i]->positions, idx[i]);
+
+			if (position[i] == ULINT_UNDEFINED) {
+				break;
+			}
+
+			if (position[i] < min_pos) {
+				min_pos = position[i];
+				min_idx = i;
+			}
+
+			if (position[i] > max_pos) {
+				max_pos = position[i];
+			}
+		}
+
+		/* If the max and min positions are within range, we
+		have found a good match */
+		if (max_pos - min_pos <= distance
+		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
+			/* The charset may use a variable-length character
+			encoding, so only record min_pos and max_pos here;
+			the actual number of characters still needs to be
+			verified */
+			qualified_pos->min_pos.push_back(min_pos);
+			qualified_pos->max_pos.push_back(max_pos);
+			qualified_pos->n_pos++;
+		}
+
+		/* Otherwise, move to the next position in the
+		list for the word with the smallest position */
+		idx[min_idx]++;
+	}
+
+	return(qualified_pos->n_pos != 0);
+}
diff --git a/storage/xtradb/fts/fts0sql.cc b/storage/xtradb/fts/fts0sql.cc
new file mode 100644
index 00000000000..cb8eff3cacc
--- /dev/null
+++ b/storage/xtradb/fts/fts0sql.cc
@@ -0,0 +1,363 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0sql.cc +Full Text Search functionality. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#include "que0que.h" +#include "trx0roll.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "fts0types.h" +#include "fts0priv.h" + +#ifndef UNIV_NONINL +#include "fts0types.ic" +#include "fts0vlc.ic" +#endif + +/** SQL statements for creating the ancillary FTS tables. %s must be replaced +with the indexed table's id. */ + +/** Preamble to all SQL statements. */ +static const char* fts_sql_begin= + "PROCEDURE P() IS\n"; + +/** Postamble to non-committing SQL statements. */ +static const char* fts_sql_end= + "\n" + "END;\n"; + +/******************************************************************//** +Get the table id. +@return number of bytes written */ +UNIV_INTERN +int +fts_get_table_id( +/*=============*/ + const fts_table_t* + fts_table, /*!< in: FTS Auxiliary table */ + char* table_id) /*!< out: table id, must be at least + FTS_AUX_MIN_TABLE_ID_LENGTH bytes + long */ +{ + int len; + bool hex_name = DICT_TF2_FLAG_IS_SET(fts_table->table, + DICT_TF2_FTS_AUX_HEX_NAME); + + ut_a(fts_table->table != NULL); + + switch (fts_table->type) { + case FTS_COMMON_TABLE: + len = fts_write_object_id(fts_table->table_id, table_id, + hex_name); + break; + + case FTS_INDEX_TABLE: + + len = fts_write_object_id(fts_table->table_id, table_id, + hex_name); + + table_id[len] = '_'; + ++len; + table_id += len; + + len += fts_write_object_id(fts_table->index_id, table_id, + hex_name); + break; + + default: + ut_error; + } + + ut_a(len >= 16); + ut_a(len < FTS_AUX_MIN_TABLE_ID_LENGTH); + + return(len); +} + +/******************************************************************//** +Construct the prefix name of an FTS table. +@return own: table name, must be freed with mem_free() */ +UNIV_INTERN +char* +fts_get_table_name_prefix( +/*======================*/ + const fts_table_t* + fts_table) /*!< in: Auxiliary table type */ +{ + int len; + const char* slash; + char* prefix_name; + int dbname_len = 0; + int prefix_name_len; + char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH]; + + slash = static_cast<const char*>( + memchr(fts_table->parent, '/', strlen(fts_table->parent))); + + if (slash) { + /* Print up to and including the separator. */ + dbname_len = static_cast<int>(slash - fts_table->parent) + 1; + } + + len = fts_get_table_id(fts_table, table_id); + + prefix_name_len = dbname_len + 4 + len + 1; + + prefix_name = static_cast<char*>(mem_alloc(prefix_name_len)); + + len = sprintf(prefix_name, "%.*sFTS_%s", + dbname_len, fts_table->parent, table_id); + + ut_a(len > 0); + ut_a(len == prefix_name_len - 1); + + return(prefix_name); +} + +/******************************************************************//** +Construct the name of an ancillary FTS table. 
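+
+For example (the id digits below are illustrative), a common auxiliary
+table of a table in database "test" gets a name of the form
+
+	test/FTS_0000000000000147_DELETED
+
+while an index-specific auxiliary table also embeds the index id between
+the table id and the suffix, as written by fts_get_table_id() above.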
+@return own: table name, must be freed with mem_free() */ +UNIV_INTERN +char* +fts_get_table_name( +/*===============*/ + const fts_table_t* fts_table) + /*!< in: Auxiliary table type */ +{ + int len; + char* name; + int name_len; + char* prefix_name; + + prefix_name = fts_get_table_name_prefix(fts_table); + + name_len = static_cast<int>( + strlen(prefix_name) + 1 + strlen(fts_table->suffix) + 1); + + name = static_cast<char*>(mem_alloc(name_len)); + + len = sprintf(name, "%s_%s", prefix_name, fts_table->suffix); + + ut_a(len > 0); + ut_a(len == name_len - 1); + + mem_free(prefix_name); + + return(name); +} + +/******************************************************************//** +Parse an SQL string. %s is replaced with the table's id. +@return query graph */ +UNIV_INTERN +que_t* +fts_parse_sql( +/*==========*/ + fts_table_t* fts_table, /*!< in: FTS auxiliarry table info */ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql) /*!< in: SQL string to evaluate */ +{ + char* str; + que_t* graph; + char* str_tmp; + ibool dict_locked; + + if (fts_table != NULL) { + char* table_name; + + table_name = fts_get_table_name(fts_table); + str_tmp = ut_strreplace(sql, "%s", table_name); + mem_free(table_name); + } else { + ulint sql_len = strlen(sql) + 1; + + str_tmp = static_cast<char*>(mem_alloc(sql_len)); + strcpy(str_tmp, sql); + } + + str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end); + mem_free(str_tmp); + + dict_locked = (fts_table && fts_table->table->fts + && (fts_table->table->fts->fts_status + & TABLE_DICT_LOCKED)); + + if (!dict_locked) { + ut_ad(!mutex_own(&(dict_sys->mutex))); + + /* The InnoDB SQL parser is not re-entrant. */ + mutex_enter(&dict_sys->mutex); + } + + graph = pars_sql(info, str); + ut_a(graph); + + if (!dict_locked) { + mutex_exit(&dict_sys->mutex); + } + + mem_free(str); + + return(graph); +} + +/******************************************************************//** +Parse an SQL string. %s is replaced with the table's id. +@return query graph */ +UNIV_INTERN +que_t* +fts_parse_sql_no_dict_lock( +/*=======================*/ + fts_table_t* fts_table, /*!< in: FTS aux table info */ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql) /*!< in: SQL string to evaluate */ +{ + char* str; + que_t* graph; + char* str_tmp = NULL; + +#ifdef UNIV_DEBUG + ut_ad(mutex_own(&dict_sys->mutex)); +#endif + + if (fts_table != NULL) { + char* table_name; + + table_name = fts_get_table_name(fts_table); + str_tmp = ut_strreplace(sql, "%s", table_name); + mem_free(table_name); + } + + if (str_tmp != NULL) { + str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end); + mem_free(str_tmp); + } else { + str = ut_str3cat(fts_sql_begin, sql, fts_sql_end); + } + + //fprintf(stderr, "%s\n", str); + + graph = pars_sql(info, str); + ut_a(graph); + + mem_free(str); + + return(graph); +} + +/******************************************************************//** +Evaluate an SQL query graph. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_eval_sql( +/*=========*/ + trx_t* trx, /*!< in: transaction */ + que_t* graph) /*!< in: Query graph to evaluate */ +{ + que_thr_t* thr; + + graph->trx = trx; + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + return(trx->error_state); +} + +/******************************************************************//** +Construct the column specification part of the SQL string for selecting the +indexed FTS columns for the given table. 
Adds the necessary bound +ids to the given 'info' and returns the SQL string. Examples: + +One indexed column named "text": + + "$sel0", + info/ids: sel0 -> "text" + +Two indexed columns named "subject" and "content": + + "$sel0, $sel1", + info/ids: sel0 -> "subject", sel1 -> "content", +@return heap-allocated WHERE string */ +UNIV_INTERN +const char* +fts_get_select_columns_str( +/*=======================*/ + dict_index_t* index, /*!< in: index */ + pars_info_t* info, /*!< in/out: parser info */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint i; + const char* str = ""; + + for (i = 0; i < index->n_user_defined_cols; i++) { + char* sel_str; + + dict_field_t* field = dict_index_get_nth_field(index, i); + + sel_str = mem_heap_printf(heap, "sel%lu", (ulong) i); + + /* Set copy_name to TRUE since it's dynamic. */ + pars_info_bind_id(info, TRUE, sel_str, field->name); + + str = mem_heap_printf( + heap, "%s%s$%s", str, (*str) ? ", " : "", sel_str); + } + + return(str); +} + +/******************************************************************//** +Commit a transaction. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_sql_commit( +/*===========*/ + trx_t* trx) /*!< in: transaction */ +{ + dberr_t error; + + error = trx_commit_for_mysql(trx); + + /* Commit should always succeed */ + ut_a(error == DB_SUCCESS); + + return(DB_SUCCESS); +} + +/******************************************************************//** +Rollback a transaction. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_sql_rollback( +/*=============*/ + trx_t* trx) /*!< in: transaction */ +{ + return(trx_rollback_to_savepoint(trx, NULL)); +} diff --git a/storage/xtradb/fts/fts0tlex.cc b/storage/xtradb/fts/fts0tlex.cc new file mode 100644 index 00000000000..b744fbf0763 --- /dev/null +++ b/storage/xtradb/fts/fts0tlex.cc @@ -0,0 +1,1952 @@ +#include "univ.i" +#line 2 "fts0tlex.cc" + +#line 4 "fts0tlex.cc" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. 
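+   These fallback definitions only matter on pre-C99 compilers that lack
+   <stdint.h>; C99 systems take the <inttypes.h> branch above.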
*/ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE fts0trestart(yyin ,yyscanner ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. 
*/ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via fts0trestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. 
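+ * Concretely, it NUL-terminates the match in place, saving the
+ * overwritten character in yy_hold_char so that it can be restored
+ * before scanning resumes.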
+ */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void fts0trestart (FILE *input_file ,yyscan_t yyscanner ); +void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void fts0tpop_buffer_state (yyscan_t yyscanner ); + +static void fts0tensure_buffer_stack (yyscan_t yyscanner ); +static void fts0t_load_buffer_state (yyscan_t yyscanner ); +static void fts0t_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner ); + +#define YY_FLUSH_BUFFER fts0t_flush_buffer(YY_CURRENT_BUFFER ,yyscanner) + +YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); + +void *fts0talloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); +void *fts0trealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); +void fts0tfree (void * , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); + +#define yy_new_buffer fts0t_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + fts0tensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! YY_CURRENT_BUFFER ){\ + fts0tensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define fts0twrap(n) 1 +#define YY_SKIP_YYWRAP + +typedef unsigned char YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state (yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ,yyscan_t yyscanner); +static int yy_get_next_buffer (yyscan_t yyscanner ); +static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. 
+ */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = static_cast<int>(yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; + +#define YY_NUM_RULES 7 +#define YY_END_OF_BUFFER 8 +/* This struct is not used in this scanner, + but its presence is necessary. */ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[17] = + { 0, + 4, 4, 8, 4, 1, 6, 1, 5, 5, 2, + 4, 1, 1, 0, 3, 0 + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 1, + 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[8] = + { 0, + 1, 2, 3, 4, 5, 5, 1 + } ; + +static yyconst flex_int16_t yy_base[20] = + { 0, + 0, 0, 18, 0, 6, 21, 0, 9, 21, 0, + 0, 0, 0, 4, 21, 21, 10, 11, 15 + } ; + +static yyconst flex_int16_t yy_def[20] = + { 0, + 16, 1, 16, 17, 17, 16, 18, 19, 16, 17, + 17, 5, 18, 19, 16, 0, 16, 16, 16 + } ; + +static yyconst flex_int16_t yy_nxt[29] = + { 0, + 4, 5, 6, 7, 8, 9, 10, 12, 15, 13, + 11, 11, 13, 15, 13, 14, 14, 16, 14, 14, + 3, 16, 16, 16, 16, 16, 16, 16 + } ; + +static yyconst flex_int16_t yy_chk[29] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 5, 14, 5, + 17, 17, 18, 8, 18, 19, 19, 3, 19, 19, + 16, 16, 16, 16, 16, 16, 16, 16 + } ; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "fts0tlex.l" +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/** + * @file fts/fts0tlex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ +#line 27 "fts0tlex.l" + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner) + +#define YY_NO_INPUT 1 +#line 480 "fts0tlex.cc" + +#define INITIAL 0 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t + { + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + int yy_n_chars; + int yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + + }; /* end struct yyguts_t */ + +static int yy_init_globals (yyscan_t yyscanner ); + +int fts0tlex_init (yyscan_t* scanner); + +int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int fts0tlex_destroy (yyscan_t yyscanner ); + +int fts0tget_debug (yyscan_t yyscanner ); + +void fts0tset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner ); + +void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *fts0tget_in (yyscan_t yyscanner ); + +void fts0tset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *fts0tget_out (yyscan_t yyscanner ); + +void fts0tset_out (FILE * out_str ,yyscan_t yyscanner ); + +int fts0tget_leng (yyscan_t yyscanner ); + +char *fts0tget_text (yyscan_t yyscanner ); + +int fts0tget_lineno (yyscan_t yyscanner ); + +void fts0tset_lineno (int line_number ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. 
+ */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int fts0twrap (yyscan_t yyscanner ); +#else +extern int fts0twrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused)) __attribute__((unused))); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (yyscan_t yyscanner ); +#else +static int input (yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < static_cast<int>(max_size) && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = static_cast<int>(fread(buf, 1, max_size, yyin)))==0 \ + && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int fts0tlex (yyscan_t yyscanner); + +#define YY_DECL int fts0tlex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. 
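+ * By default this is a plain "break" (see YY_BREAK just below), which
+ * exits the per-rule action switch so the scanning loop can go on to
+ * match the next token.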
*/ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + +#line 44 "fts0tlex.l" + + +#line 707 "fts0tlex.cc" + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + fts0tensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + fts0t_load_buffer_state(yyscanner ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_current_state != 16 ); + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 46 "fts0tlex.l" +/* Ignore whitespace */ ; + YY_BREAK +case 2: +YY_RULE_SETUP +#line 48 "fts0tlex.l" +{ + val->oper = fts0tget_text(yyscanner)[0]; + + return(val->oper); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 54 "fts0tlex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TEXT); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 60 "fts0tlex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TERM); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 65 "fts0tlex.l" +; + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 66 "fts0tlex.l" + + YY_BREAK +case 7: +YY_RULE_SETUP +#line 68 "fts0tlex.l" +ECHO; + YY_BREAK +#line 834 "fts0tlex.cc" +case YY_STATE_EOF(INITIAL): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. 
It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * fts0tlex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( fts0twrap(yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of fts0tlex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = yyg->yytext_ptr; + register int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read =static_cast<int>( + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1); + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = static_cast<int>(b->yy_buf_size * 2); + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + fts0trealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = static_cast<int>( + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1); + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
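+		   YY_INPUT (defined near the top of this file) fills the
+		   tail of the buffer from yyin and stores the number of
+		   bytes read into yyg->yy_n_chars.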
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + fts0trestart(yyin ,yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0trealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + register int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ + register char *yy_cp = yyg->yy_c_buf_p; + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 16); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. 
+ * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + int offset = yyg->yy_c_buf_p - yyg->yytext_ptr; + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + fts0trestart(yyin ,yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( fts0twrap(yyscanner ) ) + return EOF; + + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ + void fts0trestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! YY_CURRENT_BUFFER ){ + fts0tensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); + } + + fts0t_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner); + fts0t_load_buffer_state(yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * @param yyscanner The scanner object. + */ + void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * fts0tpop_buffer_state(); + * fts0tpush_buffer_state(new_buffer); + */ + fts0tensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + fts0t_load_buffer_state(yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (fts0twrap()) processing, but the only time this flag + * is looked at is after fts0twrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void fts0t_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. 
When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ + YY_BUFFER_STATE fts0t_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) fts0talloc(b->yy_buf_size + 2 ,yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + fts0t_init_buffer(b,file ,yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with fts0t_create_buffer() + * @param yyscanner The scanner object. + */ + void fts0t_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + fts0tfree((void *) b->yy_ch_buf ,yyscanner ); + + fts0tfree((void *) b ,yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a fts0trestart() or at EOF. + */ + static void fts0t_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + fts0t_flush_buffer(b ,yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then fts0t_init_buffer was _probably_ + * called from fts0trestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. + */ + void fts0t_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + fts0t_load_buffer_state(yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + fts0tensure_buffer_stack(yyscanner); + + /* This block is copied from fts0t_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. 
*/ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from fts0t_switch_to_buffer. */ + fts0t_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. + */ +void fts0tpop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + fts0t_delete_buffer(YY_CURRENT_BUFFER ,yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + fts0t_load_buffer_state(yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void fts0tensure_buffer_stack (yyscan_t yyscanner) +{ + int num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0talloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = static_cast<int>(yyg->yy_buffer_stack_max + grow_size); + yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0trealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE fts0t_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner ); + if ( ! 
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_buffer()" );
+
+	b->yy_buf_size = size - 2;	/* "- 2" to take care of EOB's */
+	b->yy_buf_pos = b->yy_ch_buf = base;
+	b->yy_is_our_buffer = 0;
+	b->yy_input_file = 0;
+	b->yy_n_chars = static_cast<int>(b->yy_buf_size);
+	b->yy_is_interactive = 0;
+	b->yy_at_bol = 1;
+	b->yy_fill_buffer = 0;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	fts0t_switch_to_buffer(b ,yyscanner );
+
+	return b;
+}
+
+/** Setup the input buffer state to scan a string. The next call to fts0tlex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ *       fts0t_scan_bytes() instead.
+ */
+YY_BUFFER_STATE fts0t_scan_string (yyconst char * yystr , yyscan_t yyscanner)
+{
+
+	return fts0t_scan_bytes(yystr,static_cast<int>(strlen(yystr)) ,yyscanner);
+}
+
+/** Setup the input buffer state to scan the given bytes. The next call to fts0tlex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char * yybytes, int _yybytes_len , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+	char *buf;
+	yy_size_t n;
+	int i;
+
+	/* Get memory for full buffer, including space for trailing EOB's. */
+	n = _yybytes_len + 2;
+	buf = (char *) fts0talloc(n ,yyscanner );
+	if ( ! buf )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_bytes()" );
+
+	for ( i = 0; i < _yybytes_len; ++i )
+		buf[i] = yybytes[i];
+
+	buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+	b = fts0t_scan_buffer(buf,n ,yyscanner);
+	if ( ! b )
+		YY_FATAL_ERROR( "bad buffer in fts0t_scan_bytes()" );
+
+	/* It's okay to grow etc. this buffer, and we should throw it
+	 * away when we're done.
+	 */
+	b->yy_is_our_buffer = 1;
+
+	return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused)))
+{
+	(void) fprintf( stderr, "%s\n", msg );
+	exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+	do \
+	{ \
+		/* Undo effects of setting up yytext. */ \
+		int yyless_macro_arg = (n); \
+		YY_LESS_LINENO(yyless_macro_arg);\
+		yytext[yyleng] = yyg->yy_hold_char; \
+		yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+		yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+		*yyg->yy_c_buf_p = '\0'; \
+		yyleng = yyless_macro_arg; \
+	} \
+	while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_lineno (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if (! YY_CURRENT_BUFFER)
+		return 0;
+
+	return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_column (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	if (! YY_CURRENT_BUFFER)
+		return 0;
+
+	return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_in (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_out (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_leng (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+
+char *fts0tget_text (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param line_number
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_lineno (int line_number , yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	/* lineno is only valid if an input buffer exists. */
+	if (! YY_CURRENT_BUFFER )
+		yy_fatal_error( "fts0tset_lineno called with no buffer" , yyscanner);
+
+	yylineno = line_number;
+}
+
+/** Set the current column.
+ * @param column_no
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_column (int column_no , yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+	/* column is only valid if an input buffer exists. */
+	if (! YY_CURRENT_BUFFER )
+		yy_fatal_error( "fts0tset_column called with no buffer" , yyscanner);
+
+	yycolumn = column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see fts0t_switch_to_buffer
+ */
+void fts0tset_in (FILE * in_str , yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	yyin = in_str ;
+}
+
+void fts0tset_out (FILE * out_str , yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	yyout = out_str ;
+}
+
+int fts0tget_debug (yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	return yy_flex_debug;
+}
+
+void fts0tset_debug (int bdebug , yyscan_t yyscanner)
+{
+	struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+	yy_flex_debug = bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* fts0tlex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
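+ *
+ * As an editorial illustration only (not code from this file; the token
+ * loop assumes the YY_DECL in fts0tlex.l, which names the scan function
+ * fts_tlexer), a typical reentrant lifecycle is:
+ *
+ *	yyscan_t scanner;
+ *	YYSTYPE  val;
+ *
+ *	if (fts0tlex_init(&scanner) == 0) {
+ *		fts0t_scan_string("\"multi word\" term", scanner);
+ *		while (fts_tlexer(&val, scanner) != 0) {
+ *			;	(consume one token per iteration)
+ *		}
+ *		fts0tlex_destroy(scanner);
+ *	}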
+ */ + +int fts0tlex_init(yyscan_t* ptr_yy_globals) + +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* fts0tlex_init_extra has the same functionality as fts0tlex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to fts0talloc in + * the yyextra field. + */ + +int fts0tlex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals ) + +{ + struct yyguts_t dummy_yyguts; + + fts0tset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + fts0tset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from fts0tlex_destroy(), so don't allocate here. + */ + + yyg->yy_buffer_stack = 0; + yyg->yy_buffer_stack_top = 0; + yyg->yy_buffer_stack_max = 0; + yyg->yy_c_buf_p = (char *) 0; + yyg->yy_init = 0; + yyg->yy_start = 0; + + yyg->yy_start_stack_ptr = 0; + yyg->yy_start_stack_depth = 0; + yyg->yy_start_stack = NULL; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = (FILE *) 0; + yyout = (FILE *) 0; +#endif + + /* For future reference: Set errno on error, since we are called by + * fts0tlex_init() + */ + return 0; +} + +/* fts0tlex_destroy is for both reentrant and non-reentrant scanners. */ +int fts0tlex_destroy (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + fts0t_delete_buffer(YY_CURRENT_BUFFER ,yyscanner ); + YY_CURRENT_BUFFER_LVALUE = NULL; + fts0tpop_buffer_state(yyscanner); + } + + /* Destroy the stack itself. */ + fts0tfree(yyg->yy_buffer_stack ,yyscanner); + yyg->yy_buffer_stack = NULL; + + /* Destroy the start condition stack. */ + fts0tfree(yyg->yy_start_stack ,yyscanner ); + yyg->yy_start_stack = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * fts0tlex() is called, initialization will occur. */ + yy_init_globals( yyscanner); + + /* Destroy the main struct (reentrant only). */ + fts0tfree ( yyscanner , yyscanner ); + yyscanner = NULL; + return 0; +} + +/* + * Internal utility routines. 
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused)))
+{
+	register int i;
+	for ( i = 0; i < n; ++i )
+		s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused)))
+{
+	register int n;
+	for ( n = 0; s[n]; ++n )
+		;
+
+	return n;
+}
+#endif
+
+void *fts0talloc (yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+	return (void *) malloc( size );
+}
+
+void *fts0trealloc (void * ptr, yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+	/* The cast to (char *) in the following accommodates both
+	 * implementations that use char* generic pointers, and those
+	 * that use void* generic pointers.  It works with the latter
+	 * because both ANSI C and C++ allow castless assignment from
+	 * any pointer type to void*, and deal with argument conversions
+	 * as though doing an assignment.
+	 */
+	return (void *) realloc( (char *) ptr, size );
+}
+
+void fts0tfree (void * ptr , yyscan_t yyscanner __attribute__((unused)))
+{
+	free( (char *) ptr );	/* see fts0trealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 68 "fts0tlex.l"
+
+
+
diff --git a/storage/xtradb/fts/fts0tlex.l b/storage/xtradb/fts/fts0tlex.l
new file mode 100644
index 00000000000..4f55a83afe5
--- /dev/null
+++ b/storage/xtradb/fts/fts0tlex.l
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** + * @file fts/fts0tlex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ + +%{ + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner) + +%} + +%option noinput +%option nounput +%option noyywrap +%option nostdinit +%option reentrant +%option never-interactive + + +%% + +[\t ]+ /* Ignore whitespace */ ; + +[*] { + val->oper = fts0tget_text(yyscanner)[0]; + + return(val->oper); +} + +\"[^\"\n]*\" { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TEXT); +} + +[^" \n\%]* { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TERM); +} +. ; +\n + +%% diff --git a/storage/xtradb/fts/make_parser.sh b/storage/xtradb/fts/make_parser.sh new file mode 100755 index 00000000000..2c072914c8b --- /dev/null +++ b/storage/xtradb/fts/make_parser.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# +# Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + + +TMPF=t.$$ + +make -f Makefile.query + +echo '#include "univ.i"' > $TMPF + +# This is to avoid compiler warning about unused parameters. +# FIXME: gcc extension "__attribute__" causing compilation errors on windows +# platform. Quote them out for now. 
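+#
+# For illustration (an editorial example, not part of the original script):
+# the substitutions below rewrite a generated declaration such as
+#
+#   static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner)
+#
+# into
+#
+#   static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused)))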
+sed -e ' +s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/; +s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/; +s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/; +s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/; +s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/; +s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/; +' < fts0blex.cc >> $TMPF + +mv $TMPF fts0blex.cc + +echo '#include "univ.i"' > $TMPF + +sed -e ' +s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/; +s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/; +s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/; +s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/; +s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/; +s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/; +' < fts0tlex.cc >> $TMPF + +mv $TMPF fts0tlex.cc diff --git a/storage/xtradb/fut/fut0fut.cc b/storage/xtradb/fut/fut0fut.cc new file mode 100644 index 00000000000..9bb1c512182 --- /dev/null +++ b/storage/xtradb/fut/fut0fut.cc @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fut/fut0fut.cc +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0fut.h" + +#ifdef UNIV_NONINL +#include "fut0fut.ic" +#endif + diff --git a/storage/xtradb/fut/fut0lst.cc b/storage/xtradb/fut/fut0lst.cc new file mode 100644 index 00000000000..8f96a6426d2 --- /dev/null +++ b/storage/xtradb/fut/fut0lst.cc @@ -0,0 +1,530 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fut/fut0lst.cc +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0lst.h" + +#ifdef UNIV_NONINL +#include "fut0lst.ic" +#endif + +#include "buf0buf.h" +#include "page0page.h" + +/********************************************************************//** +Adds a node to an empty list. */ +static +void +flst_add_to_empty( +/*==============*/ + flst_base_node_t* base, /*!< in: pointer to base node of + empty list */ + flst_node_t* node, /*!< in: node to add */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node_addr; + ulint len; + + ut_ad(mtr && base && node); + ut_ad(base != node); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX)); + len = flst_get_len(base, mtr); + ut_a(len == 0); + + buf_ptr_get_fsp_addr(node, &space, &node_addr); + + /* Update first and last fields of base node */ + flst_write_addr(base + FLST_FIRST, node_addr, mtr); + flst_write_addr(base + FLST_LAST, node_addr, mtr); + + /* Set prev and next fields of node to add */ + flst_write_addr(node + FLST_PREV, fil_addr_null, mtr); + flst_write_addr(node + FLST_NEXT, fil_addr_null, mtr); + + /* Update len of base node */ + mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr); +} + +/********************************************************************//** +Adds a node as the last node in a list. */ +UNIV_INTERN +void +flst_add_last( +/*==========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node, /*!< in: node to add */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node_addr; + ulint len; + fil_addr_t last_addr; + flst_node_t* last_node; + + ut_ad(mtr && base && node); + ut_ad(base != node); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX)); + len = flst_get_len(base, mtr); + last_addr = flst_get_last(base, mtr); + + buf_ptr_get_fsp_addr(node, &space, &node_addr); + + /* If the list is not empty, call flst_insert_after */ + if (len != 0) { + if (last_addr.page == node_addr.page) { + last_node = page_align(node) + last_addr.boffset; + } else { + ulint zip_size = fil_space_get_zip_size(space); + + last_node = fut_get_ptr(space, zip_size, last_addr, + RW_X_LATCH, mtr); + } + + flst_insert_after(base, last_node, node, mtr); + } else { + /* else call flst_add_to_empty */ + flst_add_to_empty(base, node, mtr); + } +} + +/********************************************************************//** +Adds a node as the first node in a list. 
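+
+As an editorial sketch only (not code from this file), a caller adds a node
+while holding x-latches on the affected pages inside one mini-transaction:
+
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+	... x-latch the pages containing base and node ...
+	flst_add_first(base, node, &mtr);
+	mtr_commit(&mtr);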
*/ +UNIV_INTERN +void +flst_add_first( +/*===========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node, /*!< in: node to add */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node_addr; + ulint len; + fil_addr_t first_addr; + flst_node_t* first_node; + + ut_ad(mtr && base && node); + ut_ad(base != node); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX)); + len = flst_get_len(base, mtr); + first_addr = flst_get_first(base, mtr); + + buf_ptr_get_fsp_addr(node, &space, &node_addr); + + /* If the list is not empty, call flst_insert_before */ + if (len != 0) { + if (first_addr.page == node_addr.page) { + first_node = page_align(node) + first_addr.boffset; + } else { + ulint zip_size = fil_space_get_zip_size(space); + + first_node = fut_get_ptr(space, zip_size, first_addr, + RW_X_LATCH, mtr); + } + + flst_insert_before(base, node, first_node, mtr); + } else { + /* else call flst_add_to_empty */ + flst_add_to_empty(base, node, mtr); + } +} + +/********************************************************************//** +Inserts a node after another in a list. */ +UNIV_INTERN +void +flst_insert_after( +/*==============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node1, /*!< in: node to insert after */ + flst_node_t* node2, /*!< in: node to add */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + flst_node_t* node3; + fil_addr_t node3_addr; + ulint len; + + ut_ad(mtr && node1 && node2 && base); + ut_ad(base != node1); + ut_ad(base != node2); + ut_ad(node2 != node1); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node1, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + + buf_ptr_get_fsp_addr(node1, &space, &node1_addr); + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + + node3_addr = flst_get_next_addr(node1, mtr); + + /* Set prev and next fields of node2 */ + flst_write_addr(node2 + FLST_PREV, node1_addr, mtr); + flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr); + + if (!fil_addr_is_null(node3_addr)) { + /* Update prev field of node3 */ + ulint zip_size = fil_space_get_zip_size(space); + + node3 = fut_get_ptr(space, zip_size, + node3_addr, RW_X_LATCH, mtr); + flst_write_addr(node3 + FLST_PREV, node2_addr, mtr); + } else { + /* node1 was last in list: update last field in base */ + flst_write_addr(base + FLST_LAST, node2_addr, mtr); + } + + /* Set next field of node1 */ + flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr); +} + +/********************************************************************//** +Inserts a node before another in a list. 
*/ +UNIV_INTERN +void +flst_insert_before( +/*===============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: node to insert */ + flst_node_t* node3, /*!< in: node to insert before */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint space; + flst_node_t* node1; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + fil_addr_t node3_addr; + ulint len; + + ut_ad(mtr && node2 && node3 && base); + ut_ad(base != node2); + ut_ad(base != node3); + ut_ad(node2 != node3); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node3, MTR_MEMO_PAGE_X_FIX)); + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + buf_ptr_get_fsp_addr(node3, &space, &node3_addr); + + node1_addr = flst_get_prev_addr(node3, mtr); + + /* Set prev and next fields of node2 */ + flst_write_addr(node2 + FLST_PREV, node1_addr, mtr); + flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr); + + if (!fil_addr_is_null(node1_addr)) { + ulint zip_size = fil_space_get_zip_size(space); + /* Update next field of node1 */ + node1 = fut_get_ptr(space, zip_size, node1_addr, + RW_X_LATCH, mtr); + flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr); + } else { + /* node3 was first in list: update first field in base */ + flst_write_addr(base + FLST_FIRST, node2_addr, mtr); + } + + /* Set prev field of node3 */ + flst_write_addr(node3 + FLST_PREV, node2_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr); +} + +/********************************************************************//** +Removes a node. */ +UNIV_INTERN +void +flst_remove( +/*========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: node to remove */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint space; + ulint zip_size; + flst_node_t* node1; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + flst_node_t* node3; + fil_addr_t node3_addr; + ulint len; + + ut_ad(mtr && node2 && base); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + zip_size = fil_space_get_zip_size(space); + + node1_addr = flst_get_prev_addr(node2, mtr); + node3_addr = flst_get_next_addr(node2, mtr); + + if (!fil_addr_is_null(node1_addr)) { + + /* Update next field of node1 */ + + if (node1_addr.page == node2_addr.page) { + + node1 = page_align(node2) + node1_addr.boffset; + } else { + node1 = fut_get_ptr(space, zip_size, + node1_addr, RW_X_LATCH, mtr); + } + + ut_ad(node1 != node2); + + flst_write_addr(node1 + FLST_NEXT, node3_addr, mtr); + } else { + /* node2 was first in list: update first field in base */ + flst_write_addr(base + FLST_FIRST, node3_addr, mtr); + } + + if (!fil_addr_is_null(node3_addr)) { + /* Update prev field of node3 */ + + if (node3_addr.page == node2_addr.page) { + + node3 = page_align(node2) + node3_addr.boffset; + } else { + node3 = fut_get_ptr(space, zip_size, + node3_addr, RW_X_LATCH, mtr); + } + + ut_ad(node2 != node3); + + flst_write_addr(node3 + FLST_PREV, node1_addr, mtr); + } else { + /* node2 was last in list: update last field in base */ + flst_write_addr(base + FLST_LAST, node1_addr, mtr); + } + + /* Update len of base node */ + len = flst_get_len(base, mtr); + ut_ad(len > 0); + + mlog_write_ulint(base + FLST_LEN, len - 1, 
MLOG_4BYTES, mtr); +} + +/********************************************************************//** +Cuts off the tail of the list, including the node given. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_cut_end( +/*=========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: first node to remove */ + ulint n_nodes,/*!< in: number of nodes to remove, + must be >= 1 */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint space; + flst_node_t* node1; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + ulint len; + + ut_ad(mtr && node2 && base); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + ut_ad(n_nodes > 0); + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + + node1_addr = flst_get_prev_addr(node2, mtr); + + if (!fil_addr_is_null(node1_addr)) { + + /* Update next field of node1 */ + + if (node1_addr.page == node2_addr.page) { + + node1 = page_align(node2) + node1_addr.boffset; + } else { + node1 = fut_get_ptr(space, + fil_space_get_zip_size(space), + node1_addr, RW_X_LATCH, mtr); + } + + flst_write_addr(node1 + FLST_NEXT, fil_addr_null, mtr); + } else { + /* node2 was first in list: update the field in base */ + flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); + } + + flst_write_addr(base + FLST_LAST, node1_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + ut_ad(len >= n_nodes); + + mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr); +} + +/********************************************************************//** +Cuts off the tail of the list, not including the given node. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_truncate_end( +/*==============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: first node not to remove */ + ulint n_nodes,/*!< in: number of nodes to remove */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + fil_addr_t node2_addr; + ulint len; + ulint space; + + ut_ad(mtr && node2 && base); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + if (n_nodes == 0) { + + ut_ad(fil_addr_is_null(flst_get_next_addr(node2, mtr))); + + return; + } + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + + /* Update next field of node2 */ + flst_write_addr(node2 + FLST_NEXT, fil_addr_null, mtr); + + flst_write_addr(base + FLST_LAST, node2_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + ut_ad(len >= n_nodes); + + mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr); +} + +/********************************************************************//** +Validates a file-based list. 
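+Typically invoked from debug assertions, e.g. (an editorial sketch):
+	ut_ad(flst_validate(base, mtr));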
+@return TRUE if ok */ +UNIV_INTERN +ibool +flst_validate( +/*==========*/ + const flst_base_node_t* base, /*!< in: pointer to base node of list */ + mtr_t* mtr1) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + const flst_node_t* node; + fil_addr_t node_addr; + fil_addr_t base_addr; + ulint len; + ulint i; + mtr_t mtr2; + + ut_ad(base); + ut_ad(mtr_memo_contains_page(mtr1, base, MTR_MEMO_PAGE_X_FIX)); + + /* We use two mini-transaction handles: the first is used to + lock the base node, and prevent other threads from modifying the + list. The second is used to traverse the list. We cannot run the + second mtr without committing it at times, because if the list + is long, then the x-locked pages could fill the buffer resulting + in a deadlock. */ + + /* Find out the space id */ + buf_ptr_get_fsp_addr(base, &space, &base_addr); + zip_size = fil_space_get_zip_size(space); + + len = flst_get_len(base, mtr1); + node_addr = flst_get_first(base, mtr1); + + for (i = 0; i < len; i++) { + mtr_start(&mtr2); + + node = fut_get_ptr(space, zip_size, + node_addr, RW_X_LATCH, &mtr2); + node_addr = flst_get_next_addr(node, &mtr2); + + mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer + becoming full */ + } + + ut_a(fil_addr_is_null(node_addr)); + + node_addr = flst_get_last(base, mtr1); + + for (i = 0; i < len; i++) { + mtr_start(&mtr2); + + node = fut_get_ptr(space, zip_size, + node_addr, RW_X_LATCH, &mtr2); + node_addr = flst_get_prev_addr(node, &mtr2); + + mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer + becoming full */ + } + + ut_a(fil_addr_is_null(node_addr)); + + return(TRUE); +} + +/********************************************************************//** +Prints info of a file-based list. */ +UNIV_INTERN +void +flst_print( +/*=======*/ + const flst_base_node_t* base, /*!< in: pointer to base node of list */ + mtr_t* mtr) /*!< in: mtr */ +{ + const buf_frame_t* frame; + ulint len; + + ut_ad(base && mtr); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + frame = page_align((byte*) base); + + len = flst_get_len(base, mtr); + + fprintf(stderr, + "FILE-BASED LIST:\n" + "Base node in space %lu page %lu byte offset %lu; len %lu\n", + (ulong) page_get_space_id(frame), + (ulong) page_get_page_no(frame), + (ulong) page_offset(base), (ulong) len); +} diff --git a/storage/xtradb/ha/ha0ha.cc b/storage/xtradb/ha/ha0ha.cc new file mode 100644 index 00000000000..b79ae922045 --- /dev/null +++ b/storage/xtradb/ha/ha0ha.cc @@ -0,0 +1,524 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ha/ha0ha.cc
+The hash table with external chains
+
+Created 8/22/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ha0ha.h"
+#ifdef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+# include "buf0buf.h"
+#endif /* UNIV_DEBUG */
+# include "btr0sea.h"
+#include "page0page.h"
+
+/*************************************************************//**
+Creates a hash table with at least n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+	ulint	n,		/*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+	ulint	sync_level,	/*!< in: level of the mutexes or rw_locks
+				in the latching order: this is used in the
+				debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint	n_sync_obj,	/*!< in: number of mutexes or rw_locks
+				to protect the hash table: must be a
+				power of 2, or 0 */
+	ulint	type)		/*!< in: type of data structure for which
+				the memory heap is going to be used e.g.:
+				MEM_HEAP_FOR_BTR_SEARCH or
+				MEM_HEAP_FOR_PAGE_HASH */
+{
+	hash_table_t*	table;
+	ulint		i;
+
+	ut_a(type == MEM_HEAP_FOR_BTR_SEARCH
+	     || type == MEM_HEAP_FOR_PAGE_HASH);
+
+	ut_ad(ut_is_2pow(n_sync_obj));
+	table = hash_create(n);
+
+	/* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
+	but in practice it never should in this case, hence the asserts. */
+
+	if (n_sync_obj == 0) {
+		table->heap = mem_heap_create_typed(
+			ut_min(4096, MEM_MAX_ALLOC_IN_BUF), type);
+		ut_a(table->heap);
+
+		return(table);
+	}
+
+	if (type == MEM_HEAP_FOR_PAGE_HASH) {
+		/* We create a hash table protected by rw_locks for
+		buf_pool->page_hash. */
+		hash_create_sync_obj(table, HASH_TABLE_SYNC_RW_LOCK,
+				     n_sync_obj, sync_level);
+	} else {
+		hash_create_sync_obj(table, HASH_TABLE_SYNC_MUTEX,
+				     n_sync_obj, sync_level);
+	}
+
+	table->heaps = static_cast<mem_heap_t**>(
+		mem_alloc(n_sync_obj * sizeof(void*)));
+
+	for (i = 0; i < n_sync_obj; i++) {
+		table->heaps[i] = mem_heap_create_typed(4096, type);
+		ut_a(table->heaps[i]);
+	}
+
+	return(table);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/*************************************************************//**
+Verifies that the specified hash table is a part of adaptive hash index and
+that its corresponding latch is X-latched by the current thread. */
+static
+bool
+ha_assert_btr_x_locked(
+/*===================*/
+	const hash_table_t*	table)	/*!<in: hash table to check */
+{
+	ulint	i;
+
+	ut_ad(table->adaptive);
+
+	for (i = 0; i < btr_search_index_num; i++) {
+		if (btr_search_sys->hash_tables[i] == table) {
+			break;
+		}
+	}
+
+	ut_ad(i < btr_search_index_num);
+	ut_ad(rw_lock_own(&btr_search_latch_arr[i], RW_LOCK_EX));
+
+	return(true);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps.
*/ +UNIV_INTERN +void +ha_clear( +/*=====*/ + hash_table_t* table) /*!< in, own: hash table */ +{ + ulint i; + ulint n; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!table->adaptive || ha_assert_btr_x_locked(table)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Free the memory heaps. */ + n = table->n_sync_obj; + + for (i = 0; i < n; i++) { + mem_heap_free(table->heaps[i]); + } + + if (table->heaps) { + mem_free(table->heaps); + } + + switch (table->type) { + case HASH_TABLE_SYNC_MUTEX: + mem_free(table->sync_obj.mutexes); + table->sync_obj.mutexes = NULL; + break; + + case HASH_TABLE_SYNC_RW_LOCK: + mem_free(table->sync_obj.rw_locks); + table->sync_obj.rw_locks = NULL; + break; + + case HASH_TABLE_SYNC_NONE: + /* do nothing */ + break; + } + + table->n_sync_obj = 0; + table->type = HASH_TABLE_SYNC_NONE; + + + /* Clear the hash table. */ + n = hash_get_n_cells(table); + + for (i = 0; i < n; i++) { + hash_get_nth_cell(table, i)->node = NULL; + } +} + +/*************************************************************//** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. If btr_search_enabled is set to FALSE, we will only allow +updating existing nodes, but no new node is allowed to be added. +@return TRUE if succeed, FALSE if no more memory could be allocated */ +UNIV_INTERN +ibool +ha_insert_for_fold_func( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of data; if a node with + the same fold value already exists, it is + updated to point to the same data, and no new + node is created! */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data) /*!< in: data, must not be NULL */ +{ + hash_cell_t* cell; + ha_node_t* node; + ha_node_t* prev_node; + ulint hash; + + ut_ad(data); + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->frame == page_align(data)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + hash_assert_can_modify(table, fold); + ut_ad(btr_search_enabled); + + hash = hash_calc_hash(fold, table); + + cell = hash_get_nth_cell(table, hash); + + prev_node = static_cast<ha_node_t*>(cell->node); + + while (prev_node != NULL) { + if (prev_node->fold == fold) { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (table->adaptive) { + buf_block_t* prev_block = prev_node->block; + ut_a(prev_block->frame + == page_align(prev_node->data)); + ut_a(prev_block->n_pointers > 0); + prev_block->n_pointers--; + block->n_pointers++; + } + + prev_node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + prev_node->data = data; + + return(TRUE); + } + + prev_node = prev_node->next; + } + + /* We have to allocate a new chain node */ + + node = static_cast<ha_node_t*>( + mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t))); + + if (node == NULL) { + /* It was a btr search type memory heap and at the moment + no more memory could be allocated: return */ + + ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH); + + return(FALSE); + } + + ha_node_set_data(node, block, data); + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (table->adaptive) { + block->n_pointers++; + } +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + node->fold = fold; + + node->next = NULL; + + prev_node 
= static_cast<ha_node_t*>(cell->node);
+
+	if (prev_node == NULL) {
+
+		cell->node = node;
+
+		return(TRUE);
+	}
+
+	while (prev_node->next != NULL) {
+
+		prev_node = prev_node->next;
+	}
+
+	prev_node->next = node;
+
+	return(TRUE);
+}
+
+/***********************************************************//**
+Deletes a hash node. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	ha_node_t*	del_node)	/*!< in: node to be deleted */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(ha_assert_btr_x_locked(table));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(btr_search_enabled);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	if (table->adaptive) {
+		ut_a(del_node->block->frame == page_align(del_node->data));
+		ut_a(del_node->block->n_pointers > 0);
+		del_node->block->n_pointers--;
+	}
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+	HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data, and updates
+the pointer to data, if found.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+ha_search_and_update_if_found_func(
+/*===============================*/
+	hash_table_t*	table,	/*!< in/out: hash table */
+	ulint		fold,	/*!< in: folded value of the searched data */
+	const rec_t*	data,	/*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	const rec_t*	new_data)/*!< in: new pointer to the data */
+{
+	ha_node_t*	node;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	hash_assert_can_modify(table, fold);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	ut_a(new_block->frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(ha_assert_btr_x_locked(table));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!btr_search_enabled) {
+		return(FALSE);
+	}
+
+	node = ha_search_with_data(table, fold, data);
+
+	if (node) {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+		if (table->adaptive) {
+			ut_a(node->block->n_pointers > 0);
+			node->block->n_pointers--;
+			new_block->n_pointers++;
+		}
+
+		node->block = new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+		node->data = new_data;
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/*****************************************************************//**
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: fold value */
+	const page_t*	page)	/*!< in: buffer page */
+{
+	ha_node_t*	node;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	hash_assert_can_modify(table, fold);
+	ut_ad(btr_search_enabled);
+
+	node = ha_chain_get_first(table, fold);
+
+	while (node) {
+		if (page_align(ha_node_get_data(node)) == page) {
+
+			/* Remove the hash node */
+
+			ha_delete_hash_node(table, node);
+
+			/* Start again from the first node in the chain
+			because the deletion may compact the heap of
+			nodes and move other nodes!
*/ + + node = ha_chain_get_first(table, fold); + } else { + node = ha_chain_get_next(node); + } + } +#ifdef UNIV_DEBUG + /* Check that all nodes really got deleted */ + + node = ha_chain_get_first(table, fold); + + while (node) { + ut_a(page_align(ha_node_get_data(node)) != page); + + node = ha_chain_get_next(node); + } +#endif +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/*************************************************************//** +Validates a given range of the cells in hash table. +@return TRUE if ok */ +UNIV_INTERN +ibool +ha_validate( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint start_index, /*!< in: start index */ + ulint end_index) /*!< in: end index */ +{ + ibool ok = TRUE; + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_a(start_index <= end_index); + ut_a(start_index < hash_get_n_cells(table)); + ut_a(end_index < hash_get_n_cells(table)); + + for (i = start_index; i <= end_index; i++) { + ha_node_t* node; + hash_cell_t* cell; + + cell = hash_get_nth_cell(table, i); + + for (node = static_cast<ha_node_t*>(cell->node); + node != 0; + node = node->next) { + + if (hash_calc_hash(node->fold, table) != i) { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: Error: hash table node" + " fold value %lu does not\n" + "InnoDB: match the cell number %lu.\n", + (ulong) node->fold, (ulong) i); + + ok = FALSE; + } + } + } + + return(ok); +} +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ + +/*************************************************************//** +Prints info of a hash table. */ +UNIV_INTERN +void +ha_print_info( +/*==========*/ + FILE* file, /*!< in: file where to print */ + hash_table_t* table) /*!< in: hash table */ +{ +#ifdef UNIV_DEBUG +/* Some of the code here is disabled for performance reasons in production +builds, see http://bugs.mysql.com/36941 */ +#define PRINT_USED_CELLS +#endif /* UNIV_DEBUG */ + +#ifdef PRINT_USED_CELLS + hash_cell_t* cell; + ulint cells = 0; + ulint i; +#endif /* PRINT_USED_CELLS */ + ulint n_bufs; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); +#ifdef PRINT_USED_CELLS + for (i = 0; i < hash_get_n_cells(table); i++) { + + cell = hash_get_nth_cell(table, i); + + if (cell->node) { + + cells++; + } + } +#endif /* PRINT_USED_CELLS */ + + fprintf(file, "Hash table size %lu", + (ulong) hash_get_n_cells(table)); + +#ifdef PRINT_USED_CELLS + fprintf(file, ", used cells %lu", (ulong) cells); +#endif /* PRINT_USED_CELLS */ + + if (table->heaps == NULL && table->heap != NULL) { + + /* This calculation is intended for the adaptive hash + index: how many buffer frames we have reserved? */ + + n_bufs = UT_LIST_GET_LEN(table->heap->base) - 1; + + if (table->heap->free_block) { + n_bufs++; + } + + fprintf(file, ", node heap has %lu buffer(s)\n", + (ulong) n_bufs); + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/ha/ha0storage.cc b/storage/xtradb/ha/ha0storage.cc new file mode 100644 index 00000000000..6820591f316 --- /dev/null +++ b/storage/xtradb/ha/ha0storage.cc @@ -0,0 +1,184 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ha/ha0storage.cc +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "ha0storage.h" +#include "hash0hash.h" +#include "mem0mem.h" +#include "ut0rnd.h" + +#ifdef UNIV_NONINL +#include "ha0storage.ic" +#endif + +/*******************************************************************//** +Retrieves a data from a storage. If it is present, a pointer to the +stored copy of data is returned, otherwise NULL is returned. */ +static +const void* +ha_storage_get( +/*===========*/ + ha_storage_t* storage, /*!< in: hash storage */ + const void* data, /*!< in: data to check for */ + ulint data_len) /*!< in: data length */ +{ + ha_storage_node_t* node; + ulint fold; + + /* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH + macro */ + fold = ut_fold_binary(static_cast<const byte*>(data), data_len); + +#define IS_FOUND \ + node->data_len == data_len && memcmp(node->data, data, data_len) == 0 + + HASH_SEARCH( + next, /* node->"next" */ + storage->hash, /* the hash table */ + fold, /* key */ + ha_storage_node_t*, /* type of node->next */ + node, /* auxiliary variable */ + , /* assertion */ + IS_FOUND); /* search criteria */ + + if (node == NULL) { + + return(NULL); + } + /* else */ + + return(node->data); +} + +/*******************************************************************//** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". 
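+
+For example, as an editorial sketch (the deduplication behavior matches the
+test in test_ha_storage() below):
+
+	const void*	p1 = ha_storage_put_memlim(storage, "key", 3, 0);
+	const void*	p2 = ha_storage_put_memlim(storage, "key", 3, 0);
+	ut_a(p1 == p2);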
*/ +UNIV_INTERN +const void* +ha_storage_put_memlim( +/*==================*/ + ha_storage_t* storage, /*!< in/out: hash storage */ + const void* data, /*!< in: data to store */ + ulint data_len, /*!< in: data length */ + ulint memlim) /*!< in: memory limit to obey */ +{ + void* raw; + ha_storage_node_t* node; + const void* data_copy; + ulint fold; + + /* check if data chunk is already present */ + data_copy = ha_storage_get(storage, data, data_len); + if (data_copy != NULL) { + + return(data_copy); + } + + /* not present */ + + /* check if we are allowed to allocate data_len bytes */ + if (memlim > 0 + && ha_storage_get_size(storage) + data_len > memlim) { + + return(NULL); + } + + /* we put the auxiliary node struct and the data itself in one + continuous block */ + raw = mem_heap_alloc(storage->heap, + sizeof(ha_storage_node_t) + data_len); + + node = (ha_storage_node_t*) raw; + data_copy = (byte*) raw + sizeof(*node); + + memcpy((byte*) raw + sizeof(*node), data, data_len); + + node->data_len = data_len; + node->data = data_copy; + + /* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT + macro */ + fold = ut_fold_binary(static_cast<const byte*>(data), data_len); + + HASH_INSERT( + ha_storage_node_t, /* type used in the hash chain */ + next, /* node->"next" */ + storage->hash, /* the hash table */ + fold, /* key */ + node); /* add this data to the hash */ + + /* the output should not be changed because it will spoil the + hash table */ + return(data_copy); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +void +test_ha_storage() +{ + ha_storage_t* storage; + char buf[1024]; + int i; + const void* stored[256]; + const void* p; + + storage = ha_storage_create(0, 0); + + for (i = 0; i < 256; i++) { + + memset(buf, i, sizeof(buf)); + stored[i] = ha_storage_put(storage, buf, sizeof(buf)); + } + + //ha_storage_empty(&storage); + + for (i = 255; i >= 0; i--) { + + memset(buf, i, sizeof(buf)); + p = ha_storage_put(storage, buf, sizeof(buf)); + + if (p != stored[i]) { + + fprintf(stderr, "ha_storage_put() returned %p " + "instead of %p, i=%d\n", p, stored[i], i); + return; + } + } + + fprintf(stderr, "all ok\n"); + + ha_storage_free(storage); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/ha/hash0hash.cc b/storage/xtradb/ha/hash0hash.cc new file mode 100644 index 00000000000..6f5b98e5e98 --- /dev/null +++ b/storage/xtradb/ha/hash0hash.cc @@ -0,0 +1,403 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ha/hash0hash.cc +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#include "hash0hash.h" +#ifdef UNIV_NONINL +#include "hash0hash.ic" +#endif + +#include "mem0mem.h" + +#ifndef UNIV_HOTBACKUP + +# ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t hash_table_mutex_key; +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK +UNIV_INTERN mysql_pfs_key_t hash_table_rw_lock_key; +# endif /* UNIV_PFS_RWLOCK */ +/************************************************************//** +Reserves the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_enter( +/*=============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); + mutex_enter(hash_get_mutex(table, fold)); +} + +/************************************************************//** +Releases the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_exit( +/*============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); + mutex_exit(hash_get_mutex(table, fold)); +} + +/************************************************************//** +Reserves all the mutexes of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_mutex_enter_all( +/*=================*/ + hash_table_t* table) /*!< in: hash table */ +{ + ulint i; + + ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); + for (i = 0; i < table->n_sync_obj; i++) { + + mutex_enter(table->sync_obj.mutexes + i); + } +} + +/************************************************************//** +Releases all the mutexes of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all( +/*================*/ + hash_table_t* table) /*!< in: hash table */ +{ + ulint i; + + ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); + for (i = 0; i < table->n_sync_obj; i++) { + + mutex_exit(table->sync_obj.mutexes + i); + } +} + +/************************************************************//** +Releases all but the passed in mutex of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all_but( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ib_prio_mutex_t* keep_mutex) /*!< in: mutex to keep */ +{ + ulint i; + + ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); + for (i = 0; i < table->n_sync_obj; i++) { + + ib_prio_mutex_t* mutex = table->sync_obj.mutexes + i; + if (UNIV_LIKELY(keep_mutex != mutex)) { + mutex_exit(mutex); + } + } + + ut_ad(mutex_own(keep_mutex)); +} + +/************************************************************//** +s-lock a lock for a fold value in a hash table. 
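+A typical read-side pattern, as an editorial sketch only, pairs this with
+hash_unlock_s():
+
+	hash_lock_s(table, fold);
+	... look up the chain for this fold value ...
+	hash_unlock_s(table, fold);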
*/ +UNIV_INTERN +void +hash_lock_s( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + + prio_rw_lock_t* lock = hash_get_lock(table, fold); + + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(lock); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(lock); +} + +/************************************************************//** +x-lock a lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_lock_x( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + + prio_rw_lock_t* lock = hash_get_lock(table, fold); + + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(lock); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_x_lock(lock); +} + +/************************************************************//** +unlock an s-lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_unlock_s( +/*==========*/ + + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + + prio_rw_lock_t* lock = hash_get_lock(table, fold); + + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(lock); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_unlock(lock); +} + +/************************************************************//** +unlock x-lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_unlock_x( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + prio_rw_lock_t* lock = hash_get_lock(table, fold); + + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(lock); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_x_unlock(lock); +} + +/************************************************************//** +Reserves all the locks of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_lock_x_all( +/*============*/ + hash_table_t* table) /*!< in: hash table */ +{ + ulint i; + + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + for (i = 0; i < table->n_sync_obj; i++) { + + prio_rw_lock_t* lock = table->sync_obj.rw_locks + i; +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_x_lock(lock); + } +} + +/************************************************************//** +Releases all the locks of a hash table, in an ascending order. 
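The fixed ascending order in the lock-all and unlock-all pairs is what keeps them deadlock-free: if every thread that needs multiple stripes acquires them by increasing index, no cycle of waiters can form, because two lock-all callers serialize on the lowest-numbered stripe instead of each holding part of what the other needs. A hedged sketch of the discipline, again with std::mutex standing in for the real lock type:

#include <cstddef>
#include <mutex>
#include <vector>

/* Acquire every stripe in ascending index order; release order is free. */
static void lock_all(std::vector<std::mutex>& stripes) {
    for (std::size_t i = 0; i < stripes.size(); i++) {
        stripes[i].lock();
    }
}

/* Release everything except one stripe the caller wants to keep held,
mirroring hash_unlock_x_all_but below. */
static void unlock_all_but(std::vector<std::mutex>& stripes,
                           std::size_t keep) {
    for (std::size_t i = 0; i < stripes.size(); i++) {
        if (i != keep) {
            stripes[i].unlock();
        }
    }
}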
*/ +UNIV_INTERN +void +hash_unlock_x_all( +/*==============*/ + hash_table_t* table) /*!< in: hash table */ +{ + ulint i; + + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + for (i = 0; i < table->n_sync_obj; i++) { + + prio_rw_lock_t* lock = table->sync_obj.rw_locks + i; +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_x_unlock(lock); + } +} + +/************************************************************//** +Releases all but the passed-in lock of a hash table. */ +UNIV_INTERN +void +hash_unlock_x_all_but( +/*==================*/ + hash_table_t* table, /*!< in: hash table */ + prio_rw_lock_t* keep_lock) /*!< in: lock to keep */ +{ + ulint i; + + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + for (i = 0; i < table->n_sync_obj; i++) { + + prio_rw_lock_t* lock = table->sync_obj.rw_locks + i; +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (UNIV_LIKELY(keep_lock != lock)) { + rw_lock_x_unlock(lock); + } + } +} + +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Creates a hash table with >= n array cells. The actual number of cells is +chosen to be a prime number slightly bigger than n. +@return own: created table */ +UNIV_INTERN +hash_table_t* +hash_create( +/*========*/ + ulint n) /*!< in: number of array cells */ +{ + hash_cell_t* array; + ulint prime; + hash_table_t* table; + + prime = ut_find_prime(n); + + table = static_cast<hash_table_t*>(mem_alloc(sizeof(hash_table_t))); + + array = static_cast<hash_cell_t*>( + ut_malloc(sizeof(hash_cell_t) * prime)); + + /* The default type of hash_table is HASH_TABLE_SYNC_NONE i.e.: + the caller is responsible for access control to the table. */ + table->type = HASH_TABLE_SYNC_NONE; + table->array = array; + table->n_cells = prime; +#ifndef UNIV_HOTBACKUP +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + table->adaptive = FALSE; +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + table->n_sync_obj = 0; + table->sync_obj.mutexes = NULL; + table->heaps = NULL; +#endif /* !UNIV_HOTBACKUP */ + table->heap = NULL; + ut_d(table->magic_n = HASH_TABLE_MAGIC_N); + + /* Initialize the cell array */ + hash_table_clear(table); + + return(table); +} + +/*************************************************************//** +Frees a hash table. */ +UNIV_INTERN +void +hash_table_free( +/*============*/ + hash_table_t* table) /*!< in, own: hash table */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + + ut_free(table->array); + mem_free(table); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Creates a sync object array to protect a hash table. +::sync_obj can be mutexes or rw_locks depending on the type of +hash table.
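Separately, hash_create above delegates the cell count to ut_find_prime so that n_cells is a prime slightly above the request; with a prime cell count, fold values that share a common stride do not collapse onto a few cells. A standalone sketch of that sizing step (trial division is acceptable because it runs once, at create time; ut_find_prime itself applies extra heuristics, so treat this only as the shape of the computation):

#include <cstddef>

/* Smallest prime >= n, found by trial division. */
static std::size_t find_prime(std::size_t n) {
    for (std::size_t cand = (n < 2 ? 2 : n); ; cand++) {
        bool is_prime = true;
        for (std::size_t d = 2; d * d <= cand; d++) {
            if (cand % d == 0) {
                is_prime = false;
                break;
            }
        }
        if (is_prime) {
            return cand;
        }
    }
}

/* A cell lookup then reduces a fold roughly as
       cell = fold % n_cells;
(the real macros mix in a mask first, but the prime modulus is what
spreads strided folds). */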
*/ +UNIV_INTERN +void +hash_create_sync_obj_func( +/*======================*/ + hash_table_t* table, /*!< in: hash table */ + enum hash_table_sync_t type, /*!< in: HASH_TABLE_SYNC_MUTEX + or HASH_TABLE_SYNC_RW_LOCK */ +#ifdef UNIV_SYNC_DEBUG + ulint sync_level,/*!< in: latching order level + of the mutexes: used in the + debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_sync_obj)/*!< in: number of sync objects, + must be a power of 2 */ +{ + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_a(n_sync_obj > 0); + ut_a(ut_is_2pow(n_sync_obj)); + + table->type = type; + + switch (type) { + case HASH_TABLE_SYNC_MUTEX: + table->sync_obj.mutexes = static_cast<ib_prio_mutex_t*>( + mem_alloc(n_sync_obj * sizeof(ib_prio_mutex_t))); + + for (i = 0; i < n_sync_obj; i++) { + mutex_create(hash_table_mutex_key, + table->sync_obj.mutexes + i, sync_level); + } + + break; + + case HASH_TABLE_SYNC_RW_LOCK: + table->sync_obj.rw_locks = static_cast<prio_rw_lock_t*>( + mem_alloc(n_sync_obj * sizeof(prio_rw_lock_t))); + + for (i = 0; i < n_sync_obj; i++) { + rw_lock_create(hash_table_rw_lock_key, + table->sync_obj.rw_locks + i, sync_level); + } + + break; + + case HASH_TABLE_SYNC_NONE: + ut_error; + } + + table->n_sync_obj = n_sync_obj; +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/ha_innodb.def b/storage/xtradb/ha_innodb.def new file mode 100644 index 00000000000..e0faa62deb1 --- /dev/null +++ b/storage/xtradb/ha_innodb.def @@ -0,0 +1,4 @@ +EXPORTS + _mysql_plugin_interface_version_ + _mysql_sizeof_struct_st_plugin_ + _mysql_plugin_declarations_ diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc new file mode 100644 index 00000000000..be6bb651ae6 --- /dev/null +++ b/storage/xtradb/handler/ha_innodb.cc @@ -0,0 +1,18586 @@ +/***************************************************************************** + +Copyright (c) 2000, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, 2009 Google Inc. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2012, Facebook Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#define MYSQL_SERVER + +#include <sql_table.h> // explain_filename, nz2, EXPLAIN_PARTITIONS_AS_COMMENT, + // EXPLAIN_FILENAME_MAX_EXTRA_LENGTH + +#include <sql_acl.h> // PROCESS_ACL +#include <debug_sync.h> // DEBUG_SYNC +#include <my_base.h> // HA_OPTION_* +#include <mysys_err.h> +#include <mysql/innodb_priv.h> +#include <mysql/thread_pool_priv.h> +#include <my_check_opt.h> + +/** @file ha_innodb.cc */ + +/* Include necessary InnoDB headers */ +#include "univ.i" +#include "buf0dump.h" +#include "buf0lru.h" +#include "buf0flu.h" +#include "buf0dblwr.h" +#include "btr0sea.h" +#include "os0file.h" +#include "os0thread.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "trx0roll.h" +#include "trx0trx.h" + +#include "trx0sys.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "row0ins.h" +#include "row0mysql.h" +#include "row0sel.h" +#include "row0upd.h" +#include "log0log.h" +#include "log0online.h" +#include "lock0lock.h" +#include "dict0crea.h" +#include "btr0cur.h" +#include "btr0btr.h" +#include "fsp0fsp.h" +#include "sync0sync.h" +#include "fil0fil.h" +#include "trx0xa.h" +#include "row0merge.h" +#include "dict0boot.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "ha_prototypes.h" +#include "ut0mem.h" +#include "ibuf0ibuf.h" +#include "dict0dict.h" +#include "srv0mon.h" +#include "api0api.h" +#include "api0misc.h" +#include "pars0pars.h" +#include "fts0fts.h" +#include "fts0types.h" +#include "row0import.h" +#include "row0quiesce.h" +#ifdef UNIV_DEBUG +#include "trx0purge.h" +#endif /* UNIV_DEBUG */ +#include "fts0priv.h" +#include "page0zip.h" + +enum_tx_isolation thd_get_trx_isolation(const THD* thd); + +#include "ha_innodb.h" +#include "i_s.h" +#include "xtradb_i_s.h" + +# ifndef MYSQL_PLUGIN_IMPORT +# define MYSQL_PLUGIN_IMPORT /* nothing */ +# endif /* MYSQL_PLUGIN_IMPORT */ + +/** to protect innobase_open_files */ +static mysql_mutex_t innobase_share_mutex; +/** to force correct commit order in binlog */ +static ulong commit_threads = 0; +static mysql_cond_t commit_cond; +static mysql_mutex_t commit_cond_m; +static bool innodb_inited = 0; + +#define INSIDE_HA_INNOBASE_CC + +#define EQ_CURRENT_THD(thd) ((thd) == current_thd) + +static struct handlerton* innodb_hton_ptr; + +static const long AUTOINC_OLD_STYLE_LOCKING = 0; +static const long AUTOINC_NEW_STYLE_LOCKING = 1; +static const long AUTOINC_NO_LOCKING = 2; + +static long innobase_mirrored_log_groups; +static long innobase_log_buffer_size; +static long innobase_additional_mem_pool_size; +static long innobase_file_io_threads; +static long innobase_open_files; +static long innobase_autoinc_lock_mode; +static ulong innobase_commit_concurrency = 0; +static ulong innobase_read_io_threads; +static ulong innobase_write_io_threads; +static long innobase_buffer_pool_instances = 1; + +static ulong innobase_log_block_size; + +static long long innobase_buffer_pool_size, innobase_log_file_size; + +/** Percentage of the buffer pool to reserve for 'old' blocks. +Connected to buf_LRU_old_ratio. */ +static uint innobase_old_blocks_pct; + +/** Maximum on-disk size of change buffer in terms of percentage +of the buffer pool. 
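innobase_old_blocks_pct just above feeds InnoDB's midpoint LRU: a page read in for the first time enters at the head of the "old" tail section rather than the head of the whole list, so one large scan cannot flush the hot working set; only a subsequent access makes the page young. A simplified standalone model of that policy (the real buffer pool tracks the old/new boundary incrementally instead of recomputing it):

#include <cstddef>
#include <iterator>
#include <list>

struct midpoint_lru {
    std::list<int> pages;       /* page ids; front = most recently young */
    std::size_t    old_pct;     /* like innodb_old_blocks_pct, e.g. 37 */

    std::list<int>::iterator old_head() {
        std::size_t n_new = pages.size() * (100 - old_pct) / 100;
        std::list<int>::iterator it = pages.begin();
        std::advance(it, n_new);
        return it;
    }

    /* first read: enter the old sublist, not the true head */
    void read_in(int page_id) { pages.insert(old_head(), page_id); }

    /* second access: promote ("make young") to the true head */
    void make_young(std::list<int>::iterator it) {
        if (it != pages.begin()) {
            pages.splice(pages.begin(), pages, it);
        }
    }

    /* eviction always takes the coldest end */
    void evict() { if (!pages.empty()) pages.pop_back(); }
};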
*/ +static uint innobase_change_buffer_max_size = CHANGE_BUFFER_DEFAULT_SIZE; + +/* The default values for the following char* start-up parameters +are determined in innobase_init below: */ + +static char* innobase_data_home_dir = NULL; +static char* innobase_data_file_path = NULL; +static char* innobase_file_format_name = NULL; +static char* innobase_change_buffering = NULL; +static char* innobase_enable_monitor_counter = NULL; +static char* innobase_disable_monitor_counter = NULL; +static char* innobase_reset_monitor_counter = NULL; +static char* innobase_reset_all_monitor_counter = NULL; + +/* The highest file format being used in the database. The value can be +set by user, however, it will be adjusted to the newer file format if +a table of such format is created/opened. */ +static char* innobase_file_format_max = NULL; + +static char* innobase_file_flush_method = NULL; + +/* This variable can be set in the server configure file, specifying +stopword table to be used */ +static char* innobase_server_stopword_table = NULL; + +/* Below we have boolean-valued start-up parameters, and their default +values */ + +static ulong innobase_fast_shutdown = 1; +static my_bool innobase_file_format_check = TRUE; +#ifdef UNIV_LOG_ARCHIVE +static my_bool innobase_log_archive = FALSE; +static char* innobase_log_arch_dir = NULL; +#endif /* UNIV_LOG_ARCHIVE */ +static my_bool innobase_use_atomic_writes = FALSE; +static my_bool innobase_use_doublewrite = TRUE; +static my_bool innobase_use_checksums = TRUE; +static my_bool innobase_locks_unsafe_for_binlog = FALSE; +static my_bool innobase_rollback_on_timeout = FALSE; +static my_bool innobase_create_status_file = FALSE; +static my_bool innobase_stats_on_metadata = TRUE; +static my_bool innobase_large_prefix = FALSE; +static my_bool innodb_optimize_fulltext_only = FALSE; + +static char* internal_innobase_data_file_path = NULL; + +static char* innodb_version_str = (char*) INNODB_VERSION_STR; + +/** Possible values for system variable "innodb_stats_method". The values +are defined the same as its corresponding MyISAM system variable +"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ +static const char* innodb_stats_method_names[] = { + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; + +/** Used to define an enumerate type of the system variable innodb_stats_method. +This is the same as "myisam_stats_method_typelib" */ +static TYPELIB innodb_stats_method_typelib = { + array_elements(innodb_stats_method_names) - 1, + "innodb_stats_method_typelib", + innodb_stats_method_names, + NULL +}; + +/** Possible values for system variables "innodb_checksum_algorithm" and +"innodb_log_checksum_algorithm". */ +static const char* innodb_checksum_algorithm_names[] = { + "crc32", + "strict_crc32", + "innodb", + "strict_innodb", + "none", + "strict_none", + NullS +}; + +/** Used to define an enumerate type of the system variables +innodb_checksum_algorithm and innodb_log_checksum_algorithm. */ +static TYPELIB innodb_checksum_algorithm_typelib = { + array_elements(innodb_checksum_algorithm_names) - 1, + "innodb_checksum_algorithm_typelib", + innodb_checksum_algorithm_names, + NULL +}; + +/** Possible values for system variable "innodb_cleaner_lsn_age_factor". */ +static const char* innodb_cleaner_lsn_age_factor_names[] = { + "legacy", + "high_checkpoint", + NullS +}; + +/** Enumeration for innodb_cleaner_lsn_age_factor. 
*/ +static TYPELIB innodb_cleaner_lsn_age_factor_typelib = { + array_elements(innodb_cleaner_lsn_age_factor_names) - 1, + "innodb_cleaner_lsn_age_factor_typelib", + innodb_cleaner_lsn_age_factor_names, + NULL +}; + +/** Possible values for system variable "innodb_foreground_preflush". */ +static const char* innodb_foreground_preflush_names[] = { + "sync_preflush", + "exponential_backoff", + NullS +}; + +/* Enumeration for innodb_foreground_preflush. */ +static TYPELIB innodb_foreground_preflush_typelib = { + array_elements(innodb_foreground_preflush_names) - 1, + "innodb_foreground_preflush_typelib", + innodb_foreground_preflush_names, + NULL +}; + +/** Possible values for system variable "innodb_empty_free_list_algorithm". */ +static const char* innodb_empty_free_list_algorithm_names[] = { + "legacy", + "backoff", + NullS +}; + +/** Enumeration for innodb_empty_free_list_algorithm. */ +static TYPELIB innodb_empty_free_list_algorithm_typelib = { + array_elements(innodb_empty_free_list_algorithm_names) - 1, + "innodb_empty_free_list_algorithm_typelib", + innodb_empty_free_list_algorithm_names, + NULL +}; + +/* The following counter is used to convey information to InnoDB +about server activity: in case of normal DML ops it is not +sensible to call srv_active_wake_master_thread after each +operation, we only do it every INNOBASE_WAKE_INTERVAL'th step. */ + +#define INNOBASE_WAKE_INTERVAL 32 +static ulong innobase_active_counter = 0; + +static hash_table_t* innobase_open_tables; + +/** Allowed values of innodb_change_buffering */ +static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = { + "none", /* IBUF_USE_NONE */ + "inserts", /* IBUF_USE_INSERT */ + "deletes", /* IBUF_USE_DELETE_MARK */ + "changes", /* IBUF_USE_INSERT_DELETE_MARK */ + "purges", /* IBUF_USE_DELETE */ + "all" /* IBUF_USE_ALL */ +}; + +/* Call back function array defined by MySQL and used to +retrieve FTS results. 
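Tables like _ft_vft below are plain vtables: the server declares a struct of function pointers, the engine fills in its implementations, and slots it does not support stay NULL for the caller to check before invoking. The pattern in isolation, with hypothetical names:

#include <cstdio>

/* server-declared callback table (hypothetical fields) */
struct ft_vft_sketch {
    double (*find_ranking)(void* cursor);
    void   (*close_ranking)(void* cursor);
};

static double sketch_find_ranking(void* /* cursor */) { return 1.0; }

/* close_ranking left NULL: an unsupported slot */
static const ft_vft_sketch sketch_vft = { sketch_find_ranking, NULL };

int main() {
    if (sketch_vft.find_ranking != NULL) {
        std::printf("rank=%f\n", sketch_vft.find_ranking(NULL));
    }
    if (sketch_vft.close_ranking != NULL) {
        sketch_vft.close_ranking(NULL);
    }
    return 0;
}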
*/ +const struct _ft_vft ft_vft_result = {NULL, + innobase_fts_find_ranking, + innobase_fts_close_ranking, + innobase_fts_retrieve_ranking, + NULL}; + +const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version, + innobase_fts_flags, + innobase_fts_retrieve_docid, + innobase_fts_count_matches}; + +#ifdef HAVE_PSI_INTERFACE +/* Keys to register pthread mutexes/cond in the current file with +performance schema */ +static mysql_pfs_key_t innobase_share_mutex_key; +static mysql_pfs_key_t commit_cond_mutex_key; +static mysql_pfs_key_t commit_cond_key; + +static PSI_mutex_info all_pthread_mutexes[] = { + {&commit_cond_mutex_key, "commit_cond_mutex", 0}, + {&innobase_share_mutex_key, "innobase_share_mutex", 0} +}; + +static PSI_cond_info all_innodb_conds[] = { + {&commit_cond_key, "commit_cond", 0} +}; + +# ifdef UNIV_PFS_MUTEX +/* all_innodb_mutexes array contains mutexes that are +performance schema instrumented if "UNIV_PFS_MUTEX" +is defined */ +static PSI_mutex_info all_innodb_mutexes[] = { + {&autoinc_mutex_key, "autoinc_mutex", 0}, +# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK + {&buffer_block_mutex_key, "buffer_block_mutex", 0}, +# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */ + {&buf_pool_zip_mutex_key, "buf_pool_zip_mutex", 0}, + {&buf_pool_LRU_list_mutex_key, "buf_pool_LRU_list_mutex", 0}, + {&buf_pool_free_list_mutex_key, "buf_pool_free_list_mutex", 0}, + {&buf_pool_zip_free_mutex_key, "buf_pool_zip_free_mutex", 0}, + {&buf_pool_zip_hash_mutex_key, "buf_pool_zip_hash_mutex", 0}, + {&buf_pool_flush_state_mutex_key, "buf_pool_flush_state_mutex", 0}, + {&cache_last_read_mutex_key, "cache_last_read_mutex", 0}, + {&dict_foreign_err_mutex_key, "dict_foreign_err_mutex", 0}, + {&dict_sys_mutex_key, "dict_sys_mutex", 0}, + {&file_format_max_mutex_key, "file_format_max_mutex", 0}, + {&fil_system_mutex_key, "fil_system_mutex", 0}, + {&flush_list_mutex_key, "flush_list_mutex", 0}, + {&fts_bg_threads_mutex_key, "fts_bg_threads_mutex", 0}, + {&fts_delete_mutex_key, "fts_delete_mutex", 0}, + {&fts_optimize_mutex_key, "fts_optimize_mutex", 0}, + {&fts_doc_id_mutex_key, "fts_doc_id_mutex", 0}, + {&fts_pll_tokenize_mutex_key, "fts_pll_tokenize_mutex", 0}, + {&log_flush_order_mutex_key, "log_flush_order_mutex", 0}, + {&hash_table_mutex_key, "hash_table_mutex", 0}, + {&ibuf_bitmap_mutex_key, "ibuf_bitmap_mutex", 0}, + {&ibuf_mutex_key, "ibuf_mutex", 0}, + {&ibuf_pessimistic_insert_mutex_key, + "ibuf_pessimistic_insert_mutex", 0}, +# ifndef HAVE_ATOMIC_BUILTINS + {&server_mutex_key, "server_mutex", 0}, +# endif /* !HAVE_ATOMIC_BUILTINS */ + {&log_bmp_sys_mutex_key, "log_bmp_sys_mutex", 0}, + {&log_sys_mutex_key, "log_sys_mutex", 0}, +# ifdef UNIV_MEM_DEBUG + {&mem_hash_mutex_key, "mem_hash_mutex", 0}, +# endif /* UNIV_MEM_DEBUG */ + {&mem_pool_mutex_key, "mem_pool_mutex", 0}, + {&mutex_list_mutex_key, "mutex_list_mutex", 0}, + {&page_zip_stat_per_index_mutex_key, "page_zip_stat_per_index_mutex", 0}, + {&purge_sys_bh_mutex_key, "purge_sys_bh_mutex", 0}, + {&recv_sys_mutex_key, "recv_sys_mutex", 0}, + {&recv_writer_mutex_key, "recv_writer_mutex", 0}, + {&rseg_mutex_key, "rseg_mutex", 0}, +# ifdef UNIV_SYNC_DEBUG + {&rw_lock_debug_mutex_key, "rw_lock_debug_mutex", 0}, +# endif /* UNIV_SYNC_DEBUG */ + {&rw_lock_list_mutex_key, "rw_lock_list_mutex", 0}, + {&rw_lock_mutex_key, "rw_lock_mutex", 0}, + {&srv_dict_tmpfile_mutex_key, "srv_dict_tmpfile_mutex", 0}, + {&srv_innodb_monitor_mutex_key, "srv_innodb_monitor_mutex", 0}, + {&srv_misc_tmpfile_mutex_key, "srv_misc_tmpfile_mutex", 0}, + 
{&srv_monitor_file_mutex_key, "srv_monitor_file_mutex", 0}, +# ifdef UNIV_SYNC_DEBUG + {&sync_thread_mutex_key, "sync_thread_mutex", 0}, +# endif /* UNIV_SYNC_DEBUG */ + {&buf_dblwr_mutex_key, "buf_dblwr_mutex", 0}, + {&trx_undo_mutex_key, "trx_undo_mutex", 0}, + {&srv_sys_mutex_key, "srv_sys_mutex", 0}, + {&lock_sys_mutex_key, "lock_mutex", 0}, + {&lock_sys_wait_mutex_key, "lock_wait_mutex", 0}, + {&trx_mutex_key, "trx_mutex", 0}, + {&srv_sys_tasks_mutex_key, "srv_threads_mutex", 0}, + /* mutex with os_fast_mutex_ interfaces */ +# ifndef PFS_SKIP_EVENT_MUTEX + {&event_os_mutex_key, "event_os_mutex", 0}, +# endif /* PFS_SKIP_EVENT_MUTEX */ + {&os_mutex_key, "os_mutex", 0}, +#ifndef HAVE_ATOMIC_BUILTINS + {&srv_conc_mutex_key, "srv_conc_mutex", 0}, +#endif /* !HAVE_ATOMIC_BUILTINS */ +#ifndef HAVE_ATOMIC_BUILTINS_64 + {&monitor_mutex_key, "monitor_mutex", 0}, +#endif /* !HAVE_ATOMIC_BUILTINS_64 */ + {&ut_list_mutex_key, "ut_list_mutex", 0}, + {&trx_sys_mutex_key, "trx_sys_mutex", 0}, + {&zip_pad_mutex_key, "zip_pad_mutex", 0}, +}; +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK +/* all_innodb_rwlocks array contains rwlocks that are +performance schema instrumented if "UNIV_PFS_RWLOCK" +is defined */ +static PSI_rwlock_info all_innodb_rwlocks[] = { +# ifdef UNIV_LOG_ARCHIVE + {&archive_lock_key, "archive_lock", 0}, +# endif /* UNIV_LOG_ARCHIVE */ + {&btr_search_latch_key, "btr_search_latch", 0}, +# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK + {&buf_block_lock_key, "buf_block_lock", 0}, +# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */ +# ifdef UNIV_SYNC_DEBUG + {&buf_block_debug_latch_key, "buf_block_debug_latch", 0}, +# endif /* UNIV_SYNC_DEBUG */ + {&dict_operation_lock_key, "dict_operation_lock", 0}, + {&fil_space_latch_key, "fil_space_latch", 0}, + {&checkpoint_lock_key, "checkpoint_lock", 0}, + {&fts_cache_rw_lock_key, "fts_cache_rw_lock", 0}, + {&fts_cache_init_rw_lock_key, "fts_cache_init_rw_lock", 0}, + {&trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0}, + {&trx_purge_latch_key, "trx_purge_latch", 0}, + {&index_tree_rw_lock_key, "index_tree_rw_lock", 0}, + {&index_online_log_key, "index_online_log", 0}, + {&dict_table_stats_key, "dict_table_stats", 0}, + {&hash_table_rw_lock_key, "hash_table_locks", 0} +}; +# endif /* UNIV_PFS_RWLOCK */ + +# ifdef UNIV_PFS_THREAD +/* all_innodb_threads array contains threads that are +performance schema instrumented if "UNIV_PFS_THREAD" +is defined */ +static PSI_thread_info all_innodb_threads[] = { + {&trx_rollback_clean_thread_key, "trx_rollback_clean_thread", 0}, + {&io_handler_thread_key, "io_handler_thread", 0}, + {&srv_lock_timeout_thread_key, "srv_lock_timeout_thread", 0}, + {&srv_error_monitor_thread_key, "srv_error_monitor_thread", 0}, + {&srv_monitor_thread_key, "srv_monitor_thread", 0}, + {&srv_master_thread_key, "srv_master_thread", 0}, + {&srv_purge_thread_key, "srv_purge_thread", 0}, + {&buf_page_cleaner_thread_key, "page_cleaner_thread", 0}, + {&buf_lru_manager_thread_key, "lru_manager_thread", 0}, + {&recv_writer_thread_key, "recv_writer_thread", 0}, + {&srv_log_tracking_thread_key, "srv_redo_log_follow_thread", 0} +}; +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_IO +/* all_innodb_files array contains the type of files that are +performance schema instrumented if "UNIV_PFS_IO" is defined */ +static PSI_file_info all_innodb_files[] = { + {&innodb_file_data_key, "innodb_data_file", 0}, + {&innodb_file_log_key, "innodb_log_file", 0}, + {&innodb_file_temp_key, "innodb_temp_file", 0}, + {&innodb_file_bmp_key, "innodb_bmp_file", 0} +}; 
+# endif /* UNIV_PFS_IO */ +#endif /* HAVE_PSI_INTERFACE */ + +/** Always normalize table name to lower case on Windows */ +#ifdef __WIN__ +#define normalize_table_name(norm_name, name) \ + normalize_table_name_low(norm_name, name, TRUE) +#else +#define normalize_table_name(norm_name, name) \ + normalize_table_name_low(norm_name, name, FALSE) +#endif /* __WIN__ */ + +/** Set up InnoDB API callback function array */ +ib_cb_t innodb_api_cb[] = { + (ib_cb_t) ib_cursor_open_table, + (ib_cb_t) ib_cursor_read_row, + (ib_cb_t) ib_cursor_insert_row, + (ib_cb_t) ib_cursor_delete_row, + (ib_cb_t) ib_cursor_update_row, + (ib_cb_t) ib_cursor_moveto, + (ib_cb_t) ib_cursor_first, + (ib_cb_t) ib_cursor_next, + (ib_cb_t) ib_cursor_last, + (ib_cb_t) ib_cursor_set_match_mode, + (ib_cb_t) ib_sec_search_tuple_create, + (ib_cb_t) ib_clust_read_tuple_create, + (ib_cb_t) ib_tuple_delete, + (ib_cb_t) ib_tuple_copy, + (ib_cb_t) ib_tuple_read_u8, + (ib_cb_t) ib_tuple_write_u8, + (ib_cb_t) ib_tuple_read_u16, + (ib_cb_t) ib_tuple_write_u16, + (ib_cb_t) ib_tuple_read_u32, + (ib_cb_t) ib_tuple_write_u32, + (ib_cb_t) ib_tuple_read_u64, + (ib_cb_t) ib_tuple_write_u64, + (ib_cb_t) ib_tuple_read_i8, + (ib_cb_t) ib_tuple_write_i8, + (ib_cb_t) ib_tuple_read_i16, + (ib_cb_t) ib_tuple_write_i16, + (ib_cb_t) ib_tuple_read_i32, + (ib_cb_t) ib_tuple_write_i32, + (ib_cb_t) ib_tuple_read_i64, + (ib_cb_t) ib_tuple_write_i64, + (ib_cb_t) ib_tuple_get_n_cols, + (ib_cb_t) ib_col_set_value, + (ib_cb_t) ib_col_get_value, + (ib_cb_t) ib_col_get_meta, + (ib_cb_t) ib_trx_begin, + (ib_cb_t) ib_trx_commit, + (ib_cb_t) ib_trx_rollback, + (ib_cb_t) ib_trx_start, + (ib_cb_t) ib_trx_release, + (ib_cb_t) ib_trx_state, + (ib_cb_t) ib_cursor_lock, + (ib_cb_t) ib_cursor_close, + (ib_cb_t) ib_cursor_new_trx, + (ib_cb_t) ib_cursor_reset, + (ib_cb_t) ib_open_table_by_name, + (ib_cb_t) ib_col_get_name, + (ib_cb_t) ib_table_truncate, + (ib_cb_t) ib_cursor_open_index_using_name, + (ib_cb_t) ib_close_thd, + (ib_cb_t) ib_cfg_get_cfg, + (ib_cb_t) ib_cursor_set_memcached_sync, + (ib_cb_t) ib_cursor_set_cluster_access, + (ib_cb_t) ib_cursor_commit_trx, + (ib_cb_t) ib_cfg_trx_level, + (ib_cb_t) ib_tuple_get_n_user_cols, + (ib_cb_t) ib_cursor_set_lock_mode, + (ib_cb_t) ib_cursor_clear_trx, + (ib_cb_t) ib_get_idx_field_name, + (ib_cb_t) ib_trx_get_start_time, + (ib_cb_t) ib_cfg_bk_commit_interval, + (ib_cb_t) ib_cursor_stmt_begin +}; + +/*************************************************************//** +Check whether valid argument given to innodb_ft_*_stopword_table. +This function is registered as a callback with MySQL. +@return 0 for valid stopword table */ +static +int +innodb_stopword_table_validate( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default +system clustered index when there is no primary key. */ +const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX"; +/************************************************************//** +Synchronously read and parse the redo log up to the last +checkpoint to write the changed page bitmap. +@return 0 to indicate success. Current implementation cannot fail. 
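The normalize_table_name macro defined earlier in this file reduces a server-supplied path to the "db/table" form InnoDB keys on, lower-casing it on Windows where filenames are case-insensitive. A sketch under the assumption that inputs look like "./test/t1" (the exact input shape is the server's contract, not guaranteed here):

#include <cctype>
#include <string>

/* keep the last two path components, joined by '/':
"./test/T1" -> "test/t1" when set_lower_case is true */
static std::string normalize_table_name_sketch(const std::string& path,
                                               bool set_lower_case) {
    std::string::size_type last = path.rfind('/');
    std::string::size_type prev =
        (last == std::string::npos || last == 0)
        ? std::string::npos : path.rfind('/', last - 1);
    std::string norm =
        (prev == std::string::npos) ? path : path.substr(prev + 1);
    if (set_lower_case) {
        for (std::string::size_type i = 0; i < norm.size(); i++) {
            norm[i] = static_cast<char>(
                std::tolower(static_cast<unsigned char>(norm[i])));
        }
    }
    return norm;
}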
*/ +static +my_bool +innobase_flush_changed_page_bitmaps(); +/*==================================*/ +/************************************************************//** +Delete all the bitmap files for data less than the specified LSN. +If called with lsn == 0 (i.e. set by RESET request) or +IB_ULONGLONG_MAX, restart the bitmap file sequence, otherwise +continue it. +@return 0 to indicate success, 1 for failure. */ +static +my_bool +innobase_purge_changed_page_bitmaps( +/*================================*/ + ulonglong lsn); /*!< in: LSN to purge files up to */ + + +/*****************************************************************//** +Check whether this is a fake change transaction. +@return TRUE if a fake change transaction */ +static +my_bool +innobase_is_fake_change( +/*====================*/ + handlerton *hton, /*!< in: InnoDB handlerton */ + THD* thd); /*!< in: MySQL thread handle of the user for + whom the transaction is being committed */ + + +/******************************************************************//** +Maps a MySQL trx isolation level code to the InnoDB isolation level code. +@return InnoDB isolation level */ +static inline +ulint +innobase_map_isolation_level( +/*=========================*/ + enum_tx_isolation iso); /*!< in: MySQL isolation level code */ + +static const char innobase_hton_name[]= "InnoDB"; + +static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG, + "Enable InnoDB support for the XA two-phase commit", + /* check_func */ NULL, /* update_func */ NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG, + "Enable InnoDB locking in LOCK TABLES", + /* check_func */ NULL, /* update_func */ NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG, + "Use strict mode when evaluating create options.", + NULL, NULL, FALSE); + +static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG, + "Create FTS index with stopword.", + NULL, NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, + "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.", + NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0); + +static MYSQL_THDVAR_STR(ft_user_stopword_table, + PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC, + "User supplied stopword table name, effective at the session level.", + innodb_stopword_table_validate, NULL, NULL); + +static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG, + "Set to 0 (write and flush once per second)," + " 1 (write and flush at each commit)" + " or 2 (write at commit, flush once per second).", + NULL, NULL, 1, 0, 2, 0); + +static MYSQL_THDVAR_BOOL(fake_changes, PLUGIN_VAR_OPCMDARG, + "When enabled, UPDATE, INSERT and DELETE statements only move the cursor to the target records " + "and perform no other operations (no changes, no ibuf, no undo, no transaction log). " + "This is intended to cause replication prefetch IO. 
ATTENTION: the transaction started after enabled is affected.", + NULL, NULL, FALSE); + + +static SHOW_VAR innodb_status_variables[]= { + {"buffer_pool_dump_status", + (char*) &export_vars.innodb_buffer_pool_dump_status, SHOW_CHAR}, + {"buffer_pool_load_status", + (char*) &export_vars.innodb_buffer_pool_load_status, SHOW_CHAR}, + {"background_log_sync", + (char*) &export_vars.innodb_background_log_sync, SHOW_LONG}, + {"buffer_pool_pages_data", + (char*) &export_vars.innodb_buffer_pool_pages_data, SHOW_LONG}, + {"buffer_pool_bytes_data", + (char*) &export_vars.innodb_buffer_pool_bytes_data, SHOW_LONG}, + {"buffer_pool_pages_dirty", + (char*) &export_vars.innodb_buffer_pool_pages_dirty, SHOW_LONG}, + {"buffer_pool_bytes_dirty", + (char*) &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_LONG}, + {"buffer_pool_pages_flushed", + (char*) &export_vars.innodb_buffer_pool_pages_flushed, SHOW_LONG}, + {"buffer_pool_pages_LRU_flushed", + (char*) &export_vars.innodb_buffer_pool_pages_LRU_flushed, SHOW_LONG}, + {"buffer_pool_pages_free", + (char*) &export_vars.innodb_buffer_pool_pages_free, SHOW_LONG}, +#ifdef UNIV_DEBUG + {"buffer_pool_pages_latched", + (char*) &export_vars.innodb_buffer_pool_pages_latched, SHOW_LONG}, +#endif /* UNIV_DEBUG */ + {"buffer_pool_pages_made_not_young", + (char*) &export_vars.innodb_buffer_pool_pages_made_not_young, SHOW_LONG}, + {"buffer_pool_pages_made_young", + (char*) &export_vars.innodb_buffer_pool_pages_made_young, SHOW_LONG}, + {"buffer_pool_pages_misc", + (char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG}, + {"buffer_pool_pages_old", + (char*) &export_vars.innodb_buffer_pool_pages_old, SHOW_LONG}, + {"buffer_pool_pages_total", + (char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG}, + {"buffer_pool_read_ahead_rnd", + (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG}, + {"buffer_pool_read_ahead", + (char*) &export_vars.innodb_buffer_pool_read_ahead, SHOW_LONG}, + {"buffer_pool_read_ahead_evicted", + (char*) &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_LONG}, + {"buffer_pool_read_requests", + (char*) &export_vars.innodb_buffer_pool_read_requests, SHOW_LONG}, + {"buffer_pool_reads", + (char*) &export_vars.innodb_buffer_pool_reads, SHOW_LONG}, + {"buffer_pool_wait_free", + (char*) &export_vars.innodb_buffer_pool_wait_free, SHOW_LONG}, + {"buffer_pool_write_requests", + (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG}, + {"checkpoint_age", + (char*) &export_vars.innodb_checkpoint_age, SHOW_LONG}, + {"checkpoint_max_age", + (char*) &export_vars.innodb_checkpoint_max_age, SHOW_LONG}, + {"data_fsyncs", + (char*) &export_vars.innodb_data_fsyncs, SHOW_LONG}, + {"data_pending_fsyncs", + (char*) &export_vars.innodb_data_pending_fsyncs, SHOW_LONG}, + {"data_pending_reads", + (char*) &export_vars.innodb_data_pending_reads, SHOW_LONG}, + {"data_pending_writes", + (char*) &export_vars.innodb_data_pending_writes, SHOW_LONG}, + {"data_read", + (char*) &export_vars.innodb_data_read, SHOW_LONG}, + {"data_reads", + (char*) &export_vars.innodb_data_reads, SHOW_LONG}, + {"data_writes", + (char*) &export_vars.innodb_data_writes, SHOW_LONG}, + {"data_written", + (char*) &export_vars.innodb_data_written, SHOW_LONG}, + {"dblwr_pages_written", + (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG}, + {"dblwr_writes", + (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG}, + {"deadlocks", + (char*) &export_vars.innodb_deadlocks, SHOW_LONG}, + {"have_atomic_builtins", + (char*) &export_vars.innodb_have_atomic_builtins, 
SHOW_BOOL}, + {"history_list_length", + (char*) &export_vars.innodb_history_list_length, SHOW_LONG}, + {"ibuf_discarded_delete_marks", + (char*) &export_vars.innodb_ibuf_discarded_delete_marks, SHOW_LONG}, + {"ibuf_discarded_deletes", + (char*) &export_vars.innodb_ibuf_discarded_deletes, SHOW_LONG}, + {"ibuf_discarded_inserts", + (char*) &export_vars.innodb_ibuf_discarded_inserts, SHOW_LONG}, + {"ibuf_free_list", + (char*) &export_vars.innodb_ibuf_free_list, SHOW_LONG}, + {"ibuf_merged_delete_marks", + (char*) &export_vars.innodb_ibuf_merged_delete_marks, SHOW_LONG}, + {"ibuf_merged_deletes", + (char*) &export_vars.innodb_ibuf_merged_deletes, SHOW_LONG}, + {"ibuf_merged_inserts", + (char*) &export_vars.innodb_ibuf_merged_inserts, SHOW_LONG}, + {"ibuf_merges", + (char*) &export_vars.innodb_ibuf_merges, SHOW_LONG}, + {"ibuf_segment_size", + (char*) &export_vars.innodb_ibuf_segment_size, SHOW_LONG}, + {"ibuf_size", + (char*) &export_vars.innodb_ibuf_size, SHOW_LONG}, + {"log_waits", + (char*) &export_vars.innodb_log_waits, SHOW_LONG}, + {"log_write_requests", + (char*) &export_vars.innodb_log_write_requests, SHOW_LONG}, + {"log_writes", + (char*) &export_vars.innodb_log_writes, SHOW_LONG}, + {"lsn_current", + (char*) &export_vars.innodb_lsn_current, SHOW_LONGLONG}, + {"lsn_flushed", + (char*) &export_vars.innodb_lsn_flushed, SHOW_LONGLONG}, + {"lsn_last_checkpoint", + (char*) &export_vars.innodb_lsn_last_checkpoint, SHOW_LONGLONG}, + {"master_thread_active_loops", + (char*) &export_vars.innodb_master_thread_active_loops, SHOW_LONG}, + {"master_thread_idle_loops", + (char*) &export_vars.innodb_master_thread_idle_loops, SHOW_LONG}, + {"max_trx_id", + (char*) &export_vars.innodb_max_trx_id, SHOW_LONGLONG}, + {"mem_adaptive_hash", + (char*) &export_vars.innodb_mem_adaptive_hash, SHOW_LONG}, + {"mem_dictionary", + (char*) &export_vars.innodb_mem_dictionary, SHOW_LONG}, + {"mem_total", + (char*) &export_vars.innodb_mem_total, SHOW_LONG}, + {"mutex_os_waits", + (char*) &export_vars.innodb_mutex_os_waits, SHOW_LONGLONG}, + {"mutex_spin_rounds", + (char*) &export_vars.innodb_mutex_spin_rounds, SHOW_LONGLONG}, + {"mutex_spin_waits", + (char*) &export_vars.innodb_mutex_spin_waits, SHOW_LONGLONG}, + {"oldest_view_low_limit_trx_id", + (char*) &export_vars.innodb_oldest_view_low_limit_trx_id, SHOW_LONGLONG}, + {"os_log_fsyncs", + (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG}, + {"os_log_pending_fsyncs", + (char*) &export_vars.innodb_os_log_pending_fsyncs, SHOW_LONG}, + {"os_log_pending_writes", + (char*) &export_vars.innodb_os_log_pending_writes, SHOW_LONG}, + {"os_log_written", + (char*) &export_vars.innodb_os_log_written, SHOW_LONGLONG}, + {"page_size", + (char*) &export_vars.innodb_page_size, SHOW_LONG}, + {"pages_created", + (char*) &export_vars.innodb_pages_created, SHOW_LONG}, + {"pages_read", + (char*) &export_vars.innodb_pages_read, SHOW_LONG}, + {"pages_written", + (char*) &export_vars.innodb_pages_written, SHOW_LONG}, + {"purge_trx_id", + (char*) &export_vars.innodb_purge_trx_id, SHOW_LONGLONG}, + {"purge_undo_no", + (char*) &export_vars.innodb_purge_undo_no, SHOW_LONGLONG}, + {"row_lock_current_waits", + (char*) &export_vars.innodb_row_lock_current_waits, SHOW_LONG}, + {"current_row_locks", + (char*) &export_vars.innodb_current_row_locks, SHOW_LONG}, + {"row_lock_time", + (char*) &export_vars.innodb_row_lock_time, SHOW_LONGLONG}, + {"row_lock_time_avg", + (char*) &export_vars.innodb_row_lock_time_avg, SHOW_LONG}, + {"row_lock_time_max", + (char*) &export_vars.innodb_row_lock_time_max, 
SHOW_LONG}, + {"row_lock_waits", + (char*) &export_vars.innodb_row_lock_waits, SHOW_LONG}, + {"rows_deleted", + (char*) &export_vars.innodb_rows_deleted, SHOW_LONG}, + {"rows_inserted", + (char*) &export_vars.innodb_rows_inserted, SHOW_LONG}, + {"rows_read", + (char*) &export_vars.innodb_rows_read, SHOW_LONG}, + {"rows_updated", + (char*) &export_vars.innodb_rows_updated, SHOW_LONG}, + {"num_open_files", + (char*) &export_vars.innodb_num_open_files, SHOW_LONG}, + {"read_views_memory", + (char*) &export_vars.innodb_read_views_memory, SHOW_LONG}, + {"descriptors_memory", + (char*) &export_vars.innodb_descriptors_memory, SHOW_LONG}, + {"s_lock_os_waits", + (char*) &export_vars.innodb_s_lock_os_waits, SHOW_LONGLONG}, + {"s_lock_spin_rounds", + (char*) &export_vars.innodb_s_lock_spin_rounds, SHOW_LONGLONG}, + {"s_lock_spin_waits", + (char*) &export_vars.innodb_s_lock_spin_waits, SHOW_LONGLONG}, + {"truncated_status_writes", + (char*) &export_vars.innodb_truncated_status_writes, SHOW_LONG}, + {"available_undo_logs", + (char*) &export_vars.innodb_available_undo_logs, SHOW_LONG}, +#ifdef UNIV_DEBUG + {"purge_trx_id_age", + (char*) &export_vars.innodb_purge_trx_id_age, SHOW_LONG}, + {"purge_view_trx_id_age", + (char*) &export_vars.innodb_purge_view_trx_id_age, SHOW_LONG}, +#endif /* UNIV_DEBUG */ + {"x_lock_os_waits", + (char*) &export_vars.innodb_x_lock_os_waits, SHOW_LONGLONG}, + {"x_lock_spin_rounds", + (char*) &export_vars.innodb_x_lock_spin_rounds, SHOW_LONGLONG}, + {"x_lock_spin_waits", + (char*) &export_vars.innodb_x_lock_spin_waits, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} +}; + +/************************************************************************//** +Handling the shared INNOBASE_SHARE structure that is needed to provide table +locking. Register the table name if it doesn't exist in the hash table. */ +static +INNOBASE_SHARE* +get_share( +/*======*/ + const char* table_name); /*!< in: table to lookup */ + +/************************************************************************//** +Free the shared object that was registered with get_share(). */ +static +void +free_share( +/*=======*/ + INNOBASE_SHARE* share); /*!< in/own: share to free */ + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +static +int +innobase_close_connection( +/*======================*/ + handlerton* hton, /*!< in/out: Innodb handlerton */ + THD* thd); /*!< in: MySQL thread handle for + which to close the connection */ + +/*****************************************************************//** +Cancel any pending lock request associated with the current THD. */ +static +void +innobase_kill_connection( +/*======================*/ + handlerton* hton, /*!< in: innobase handlerton */ + THD* thd); /*!< in: handle to the MySQL thread being killed */ + +/*****************************************************************//** +Commits a transaction in an InnoDB database or marks an SQL statement +ended. +@return 0 */ +static +int +innobase_commit( +/*============*/ + handlerton* hton, /*!< in/out: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + bool commit_trx); /*!< in: true - commit transaction + false - the current SQL statement + ended */ + +/*****************************************************************//** +Rolls back a transaction to a savepoint. 
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback( +/*==============*/ + handlerton* hton, /*!< in/out: Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back */ + bool rollback_trx); /*!< in: TRUE - rollback entire + transaction FALSE - rollback the current + statement only */ + +/*****************************************************************//** +Rolls back a transaction to a savepoint. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback_to_savepoint( +/*===========================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ + void* savepoint); /*!< in: savepoint data */ + +/*****************************************************************//** +Check whether innodb state allows to safely release MDL locks after +rollback to savepoint. +@return true if it is safe, false if its not safe. */ +static +bool +innobase_rollback_to_savepoint_can_release_mdl( +/*===========================================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd); /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ + +/*****************************************************************//** +Sets a transaction savepoint. +@return always 0, that is, always succeeds */ +static +int +innobase_savepoint( +/*===============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user's XA transaction for which + we need to take a savepoint */ + void* savepoint); /*!< in: savepoint data */ + +/*****************************************************************//** +Release transaction savepoint name. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_release_savepoint( +/*=======================*/ + handlerton* hton, /*!< in/out: handlerton for Innodb */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction's + savepoint should be released */ + void* savepoint); /*!< in: savepoint data */ + +/************************************************************************//** +Function for constructing an InnoDB table handler instance. */ +static +handler* +innobase_create_handler( +/*====================*/ + handlerton* hton, /*!< in/out: handlerton for Innodb */ + TABLE_SHARE* table, + MEM_ROOT* mem_root); + +/** @brief Initialize the default value of innodb_commit_concurrency. + +Once InnoDB is running, the innodb_commit_concurrency must not change +from zero to nonzero. (Bug #42101) + +The initial default value is 0, and without this extra initialization, +SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter +to 0, even if it was initially set to nonzero at the command line +or configuration file. */ +static +void +innobase_commit_concurrency_init_default(); +/*=======================================*/ + +/** @brief Initialize the default and max value of innodb_undo_logs. + +Once InnoDB is running, the default value and the max value of +innodb_undo_logs must be equal to the available undo logs, +given by srv_available_undo_logs. 
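The savepoint entry points declared above ride on the same undo machinery: a savepoint is essentially a recorded position in the transaction's undo history; rolling back to it discards every change and every savepoint taken after that position, and releasing it merely forgets the marker. Schematically (not the InnoDB data structures):

#include <cstddef>
#include <string>
#include <vector>

struct savepoint_sketch {
    std::string name;
    std::size_t undo_pos;   /* length of the undo list when taken */
};

struct trx_sketch {
    std::vector<std::string>      undo;        /* one entry per change */
    std::vector<savepoint_sketch> savepoints;

    void set_savepoint(const std::string& name) {
        savepoint_sketch sp = { name, undo.size() };
        savepoints.push_back(sp);
    }

    /* false maps to HA_ERR_NO_SAVEPOINT in the real interface */
    bool rollback_to_savepoint(const std::string& name) {
        for (std::size_t i = savepoints.size(); i-- > 0; ) {
            if (savepoints[i].name == name) {
                undo.resize(savepoints[i].undo_pos);  /* drop later changes */
                savepoints.resize(i + 1);             /* and later savepoints */
                return true;
            }
        }
        return false;
    }
};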
*/ +static +void +innobase_undo_logs_init_default_max(); +/*==================================*/ + +/************************************************************//** +Validate the file format name and return its corresponding id. +@return valid file format id */ +static +uint +innobase_file_format_name_lookup( +/*=============================*/ + const char* format_name); /*!< in: pointer to file format + name */ +/************************************************************//** +Validate the file format check config parameters, as a side effect it +sets the srv_max_file_format_at_startup variable. +@return the format_id if valid config value, otherwise, return -1 */ +static +int +innobase_file_format_validate_and_set( +/*==================================*/ + const char* format_max); /*!< in: parameter value */ + +/*******************************************************************//** +This function is used to prepare an X/Open XA distributed transaction. +@return 0 or error number */ +static +int +innobase_xa_prepare( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be prepared */ + bool all); /*!< in: true - prepare transaction + false - the current SQL statement + ended */ +/*******************************************************************//** +This function is used to recover X/Open XA distributed transactions. +@return number of prepared transactions stored in xid_list */ +static +int +innobase_xa_recover( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid_list, /*!< in/out: prepared transactions */ + uint len); /*!< in: number of slots in xid_list */ +/*******************************************************************//** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_commit_by_xid( +/*===================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid); /*!< in: X/Open XA transaction + identification */ +/*******************************************************************//** +This function is used to rollback one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_rollback_by_xid( +/*=====================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid); /*!< in: X/Open XA transaction + identification */ +/*******************************************************************//** +Create a consistent view for a cursor based on current transaction +which is created if the corresponding MySQL thread still lacks one. +This consistent view is then used inside of MySQL when accessing records +using a cursor. +@return pointer to cursor view or NULL */ +static +void* +innobase_create_cursor_view( +/*========================*/ + handlerton* hton, /*!< in: innobase hton */ + THD* thd); /*!< in: user thread handle */ +/*******************************************************************//** +Set the given consistent cursor view to a transaction which is created +if the corresponding MySQL thread still lacks one. If the given +consistent cursor view is NULL global read view of a transaction is +restored to a transaction read view. 
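The X/Open XA entry points declared above are the engine's half of two-phase commit: prepare makes the transaction durable without deciding it, recover lists transactions left in the prepared state after a crash, and commit/rollback-by-xid finish them. The coordinator side, schematically:

#include <cstddef>
#include <vector>

/* minimal participant interface; in MySQL this role is played by the
handlerton's prepare/commit_by_xid/rollback_by_xid hooks */
struct xa_participant {
    virtual bool prepare(long xid) = 0;    /* phase 1: persist and vote */
    virtual void commit(long xid) = 0;     /* phase 2, decision: commit */
    virtual void rollback(long xid) = 0;   /* phase 2, decision: abort */
    virtual ~xa_participant() {}
};

/* Phase 1 collects votes; one failure aborts everyone. Only when all
participants are prepared is the decision broadcast, so a crash between
the phases leaves prepared transactions for recover() to finish. */
static bool two_phase_commit(std::vector<xa_participant*>& parts, long xid) {
    for (std::size_t i = 0; i < parts.size(); i++) {
        if (!parts[i]->prepare(xid)) {
            for (std::size_t j = 0; j < parts.size(); j++) {
                parts[j]->rollback(xid);
            }
            return false;
        }
    }
    for (std::size_t i = 0; i < parts.size(); i++) {
        parts[i]->commit(xid);
    }
    return true;
}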
*/ +static +void +innobase_set_cursor_view( +/*=====================*/ + handlerton* hton, /*!< in: handlerton of Innodb */ + THD* thd, /*!< in: user thread handle */ + void* curview); /*!< in: Consistent cursor view to + be set */ +/*******************************************************************//** +Close the given consistent cursor view of a transaction and restore +global read view to a transaction read view. Transaction is created if the +corresponding MySQL thread still lacks one. */ +static +void +innobase_close_cursor_view( +/*=======================*/ + handlerton* hton, /*!< in: handlerton of Innodb */ + THD* thd, /*!< in: user thread handle */ + void* curview); /*!< in: Consistent read view to be + closed */ +/*****************************************************************//** +Removes all tables in the named database inside InnoDB. */ +static +void +innobase_drop_database( +/*===================*/ + handlerton* hton, /*!< in: handlerton of Innodb */ + char* path); /*!< in: database path; inside InnoDB + the name of the last directory in + the path is used as the database name: + for example, in 'mysql/data/test' the + database name is 'test' */ +/*******************************************************************//** +Closes an InnoDB database. */ +static +int +innobase_end( +/*=========*/ + handlerton* hton, /* in: Innodb handlerton */ + ha_panic_function type); + +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. +@return 0 */ +static +int +innobase_start_trx_and_assign_read_view( +/*====================================*/ + handlerton* hton, /* in: Innodb handlerton */ + THD* thd); /* in: MySQL thread handle of the + user for whom the transaction should + be committed */ +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +clones snapshot for a consistent read from another session, if it has one. +@return 0 */ +static +int +innobase_start_trx_and_clone_read_view( +/*====================================*/ + handlerton* hton, /* in: Innodb handlerton */ + THD* thd, /* in: MySQL thread handle of the + user for whom the transaction should + be committed */ + THD* from_thd); /* in: MySQL thread handle of the + user session from which the consistent + read should be cloned */ +/****************************************************************//** +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes +the logs, and the name of this function should be innobase_checkpoint. +@return TRUE if error */ +static +bool +innobase_flush_logs( +/*================*/ + handlerton* hton); /*!< in: InnoDB handlerton */ + +/************************************************************************//** +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the +InnoDB Monitor to the client. 
+@return 0 on success */ +static +int +innodb_show_status( +/*===============*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print); +/************************************************************************//** +Return 0 on success and non-zero on failure. Note: the bool return type +seems to be abused here, should be an int. */ +static +bool +innobase_show_status( +/*=================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print, + enum ha_stat_type stat_type); + +/*****************************************************************//** +Commits a transaction in an InnoDB database. */ +static +void +innobase_commit_low( +/*================*/ + trx_t* trx); /*!< in: transaction handle */ + +/****************************************************************//** +Parse and enable InnoDB monitor counters during server startup. +User can enable monitor counters/groups by specifying +"loose-innodb_monitor_enable = monitor_name1;monitor_name2..." +in the server configuration file or at the command line. */ +static +void +innodb_enable_monitor_at_startup( +/*=============================*/ + char* str); /*!< in: monitor counter enable list */ + +/********************************************************************* +Normalizes a table name string. A normalized name consists of the +database name concatenated with '/' and the table name. An example: +test/mytable. On Windows, normalization always converts both the database +name and the table name to lower case if "set_lower_case" is set to TRUE. */ +static +void +normalize_table_name_low( +/*=====================*/ + char* norm_name, /* out: normalized name as a + null-terminated string */ + const char* name, /* in: table name string */ + ibool set_lower_case); /* in: TRUE if we want to set + name to lower case */ + +/*************************************************************//** +Removes old archived transaction log files. +@return true on error */ +static bool innobase_purge_archive_logs( + handlerton *hton, /*!< in: InnoDB handlerton */ + time_t before_date, /*!< in: all files modified + before this timestamp should be removed */ + const char* to_filename) /*!< in: this and earlier files + should be removed */ +{ + ulint err= DB_ERROR; + if (before_date > 0) { + err= purge_archived_logs(before_date, 0); + } else if (to_filename) { + if (is_prefix(to_filename, IB_ARCHIVED_LOGS_PREFIX)) { + unsigned long long log_file_lsn = strtoll(to_filename + + IB_ARCHIVED_LOGS_PREFIX_LEN, + NULL, 10); + if (log_file_lsn > 0 && log_file_lsn < ULLONG_MAX) { + err= purge_archived_logs(0, log_file_lsn); + } + } + } + return (err != DB_SUCCESS); +} + +/*************************************************************//** +Check for a valid value of innobase_commit_concurrency. +@return 0 for valid innodb_commit_concurrency */ +static +int +innobase_commit_concurrency_validate( +/*=================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + long long intbuf; + ulong commit_concurrency; + + DBUG_ENTER("innobase_commit_concurrency_validate"); + + if (value->val_int(value, &intbuf)) { + /* The value is NULL. That is invalid.
*/ + DBUG_RETURN(1); + } + + *reinterpret_cast<ulong*>(save) = commit_concurrency + = static_cast<ulong>(intbuf); + + /* Allow the value to be updated, as long as it remains zero + or nonzero. */ + DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency)); +} + +/*******************************************************************//** +Function for constructing an InnoDB table handler instance. */ +static +handler* +innobase_create_handler( +/*====================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + TABLE_SHARE* table, + MEM_ROOT* mem_root) +{ + return(new (mem_root) ha_innobase(hton, table)); +} + +/* General functions */ + +/*************************************************************//** +Check that a page_size is correct for InnoDB. If correct, set the +associated page_size_shift which is the power of 2 for this page size. +@return an associated page_size_shift if valid, 0 if invalid. */ +inline +int +innodb_page_size_validate( +/*======================*/ + ulong page_size) /*!< in: Page Size to evaluate */ +{ + ulong n; + + DBUG_ENTER("innodb_page_size_validate"); + + for (n = UNIV_PAGE_SIZE_SHIFT_MIN; + n <= UNIV_PAGE_SIZE_SHIFT_MAX; + n++) { + if (page_size == (ulong) (1 << n)) { + DBUG_RETURN(n); + } + } + + DBUG_RETURN(0); +} + +/******************************************************************//** +Returns true if the thread is the replication thread on the slave +server. Used in srv_conc_enter_innodb() to determine if the thread +should be allowed to enter InnoDB - the replication thread is treated +differently than other threads. Also used in +srv_conc_force_exit_innodb(). +@return true if thd is the replication thread */ +UNIV_INTERN +ibool +thd_is_replication_slave_thread( +/*============================*/ + THD* thd) /*!< in: thread handle */ +{ + return((ibool) thd_slave_thread(thd)); +} + +/******************************************************************//** +Gets information on the durability property requested by thread. +Used when writing either a prepare or commit record to the log +buffer. @return the durability property. */ +UNIV_INTERN +enum durability_properties +thd_requested_durability( +/*=====================*/ + const THD* thd) /*!< in: thread handle */ +{ + return(thd_get_durability_property(thd)); +} + +/******************************************************************//** +Returns true if transaction should be flagged as read-only. +@return true if the thd is marked as read-only */ +UNIV_INTERN +ibool +thd_trx_is_read_only( +/*=================*/ + THD* thd) /*!< in: thread handle */ +{ + return(thd != 0 && thd_tx_is_read_only(thd)); +} + +/******************************************************************//** +Check if the transaction is an auto-commit transaction. TRUE also +implies that it is a SELECT (read-only) transaction. +@return true if the transaction is an auto commit read-only transaction. */ +UNIV_INTERN +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd) /*!< in: thread handle, can be NULL */ +{ + return(thd != NULL + && !thd_test_options( + thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) + && thd_is_select(thd)); +} + +/******************************************************************//** +Save some CPU by testing the value of srv_thread_concurrency in inline +functions. 
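The saving comes from the ticket scheme visible in the function below: a thread that wins an InnoDB slot is handed a batch of "free tickets" and pays the synchronization cost again only after spending them all. The gate in isolation (batch size and names illustrative; the real code also special-cases replication threads, as shown below):

#include <condition_variable>
#include <mutex>

class conc_gate_sketch {
public:
    explicit conc_gate_sketch(int max_threads)
        : max_threads_(max_threads), active_(0) {}

    /* call before each row operation; 'tickets' lives in the trx */
    void enter(int& tickets) {
        if (tickets > 0) {
            --tickets;             /* fast path: no shared state touched */
            return;
        }
        std::unique_lock<std::mutex> lk(mutex_);
        while (active_ >= max_threads_) {
            cond_.wait(lk);
        }
        ++active_;
        tickets = BATCH - 1;       /* prepaid entries for future calls */
    }

    /* call after an operation once the batch is used up */
    void exit() {
        std::lock_guard<std::mutex> lk(mutex_);
        --active_;
        cond_.notify_one();
    }

private:
    static const int BATCH = 500;  /* cf. n_tickets_to_enter_innodb */
    int max_threads_;
    int active_;
    std::mutex mutex_;
    std::condition_variable cond_;
};

/* usage per operation:
       gate.enter(trx_tickets);
       ... do the row operation ...
       if (trx_tickets == 0) gate.exit();        */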
*/ +static inline +void +innobase_srv_conc_enter_innodb( +/*===========================*/ + trx_t* trx) /*!< in: transaction handle */ +{ + if (srv_thread_concurrency) { + if (trx->n_tickets_to_enter_innodb > 0) { + + /* If trx has 'free tickets' to enter the engine left, + then use one such ticket */ + + --trx->n_tickets_to_enter_innodb; + + } else if (trx->mysql_thd != NULL + && thd_is_replication_slave_thread(trx->mysql_thd)) { + + UT_WAIT_FOR( + srv_conc_get_active_threads() + < srv_thread_concurrency, + srv_replication_delay * 1000); + + } else { + srv_conc_enter_innodb(trx); + } + } +} + +/******************************************************************//** +Note that the thread wants to leave InnoDB only if it doesn't have +any spare tickets. */ +static inline +void +innobase_srv_conc_exit_innodb( +/*==========================*/ + trx_t* trx) /*!< in: transaction handle */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + + /* This is to avoid making an unnecessary function call. */ + if (trx->declared_to_be_inside_innodb + && trx->n_tickets_to_enter_innodb == 0) { + + srv_conc_force_exit_innodb(trx); + } +} + +/******************************************************************//** +Force a thread to leave InnoDB even if it has spare tickets. */ +static inline +void +innobase_srv_conc_force_exit_innodb( +/*================================*/ + trx_t* trx) /*!< in: transaction handle */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + + /* This is to avoid making an unnecessary function call. */ + if (trx->declared_to_be_inside_innodb) { + srv_conc_force_exit_innodb(trx); + } +} + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +UNIV_INTERN +const char* +server_get_hostname() +/*=================*/ +{ + return(glob_hostname); +} + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +UNIV_INTERN +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd) /*!< in: thread handle */ +{ + return((ibool) thd_non_transactional_update(thd)); +} + +/******************************************************************//** +Returns true if the thread is executing a SELECT statement. +@return true if thd is executing SELECT */ +UNIV_INTERN +ibool +thd_is_select( +/*==========*/ + const THD* thd) /*!< in: thread handle */ +{ + return(thd_sql_command(thd) == SQLCOM_SELECT); +} + +/******************************************************************//** +Returns true if the thread supports XA, +global value of innodb_supports_xa if thd is NULL. +@return true if thd has XA support */ +UNIV_INTERN +ibool +thd_supports_xa( +/*============*/ + THD* thd) /*!< in: thread handle, or NULL to query + the global innodb_supports_xa */ +{ + return(THDVAR(thd, support_xa)); +} + +/******************************************************************//** +Returns the lock wait timeout for the current connection. 
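+For example, after SET SESSION innodb_lock_wait_timeout = 120 this
+function returns 120 for that connection, while other connections still
+see the global value.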
+@return the lock wait timeout, in seconds */
+UNIV_INTERN
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ THD* thd) /*!< in: thread handle, or NULL to query
+ the global innodb_lock_wait_timeout */
+{
+ /* According to <mysql/plugin.h>, passing thd == NULL
+ returns the global value of the session variable. */
+ return(THDVAR(thd, lock_wait_timeout));
+}
+
+/******************************************************************//**
+Set the time waited for the lock for the current query. */
+UNIV_INTERN
+void
+thd_set_lock_wait_time(
+/*===================*/
+ THD* thd, /*!< in/out: thread handle */
+ ulint value) /*!< in: time waited for the lock */
+{
+ if (thd) {
+ thd_storage_lock_wait(thd, value);
+ }
+}
+
+/******************************************************************//**
+Returns the session value of innodb_flush_log_at_trx_commit for the
+given MySQL thread.
+@return the flush_log_at_trx_commit setting for that session */
+UNIV_INTERN
+ulong
+thd_flush_log_at_trx_commit(
+/*================================*/
+ void* thd)
+{
+ return(THDVAR((THD*) thd, flush_log_at_trx_commit));
+}
+
+/******************************************************************//**
+Returns true if expand_fast_index_creation is enabled for the current
+session.
+@return the value of the server's expand_fast_index_creation variable */
+UNIV_INTERN
+ibool
+thd_expand_fast_index_creation(
+/*================================*/
+ void* thd)
+{
+ return((ibool) (((THD*) thd)->variables.expand_fast_index_creation));
+}
+
+/********************************************************************//**
+Obtain the InnoDB transaction of a MySQL thread.
+@return reference to transaction pointer */
+__attribute__((warn_unused_result, nonnull))
+static inline
+trx_t*&
+thd_to_trx(
+/*=======*/
+ THD* thd) /*!< in: MySQL thread */
+{
+ return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
+}
+
+my_bool
+ha_innobase::is_fake_change_enabled(THD* thd)
+{
+ trx_t* trx = thd_to_trx(thd);
+ return(trx && UNIV_UNLIKELY(trx->fake_changes));
+}
+
+/********************************************************************//**
+In XtraDB it is impossible for a transaction to own a search latch outside of
+InnoDB code, so there is nothing to release on demand. We keep this function to
+simplify maintenance.
+@return 0 */
+static
+int
+innobase_release_temporary_latches(
+/*===============================*/
+ handlerton* hton __attribute__((unused)), /*!< in: handlerton */
+ THD* thd __attribute__((unused))) /*!< in: MySQL thread */
+{
+#ifdef UNIV_DEBUG
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (!innodb_inited || thd == NULL) {
+
+ return(0);
+ }
+
+ trx_t* trx = thd_to_trx(thd);
+
+ if (trx != NULL) {
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!btr_search_own_any());
+#endif
+ trx_search_latch_release_if_reserved(trx);
+ }
+#endif
+
+ return(0);
+}
+
+/********************************************************************//**
+Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
+time calls srv_active_wake_master_thread. This function should be used
+when a single database operation may introduce a small need for
+server utility activity, like checkpointing. */
+static inline
+void
+innobase_active_small(void)
+/*=======================*/
+{
+ innobase_active_counter++;
+
+ if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
+ srv_active_wake_master_thread();
+ }
+}
+
+/********************************************************************//**
+Converts an InnoDB error code to a MySQL error code and also tells MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
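+For example, convert_error_code_to_mysql(DB_DEADLOCK, 0, thd) marks the
+transaction for rollback through thd_mark_transaction_to_rollback() and
+returns HA_ERR_LOCK_DEADLOCK to the caller.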
+@return MySQL error code */ +static +int +convert_error_code_to_mysql( +/*========================*/ + dberr_t error, /*!< in: InnoDB error code */ + ulint flags, /*!< in: InnoDB table flags, or 0 */ + THD* thd) /*!< in: user thread handle or NULL */ +{ + switch (error) { + case DB_SUCCESS: + return(0); + + case DB_INTERRUPTED: + thd_set_kill_status(thd ? thd : thd_get_current_thd()); + return(-1); + + case DB_FOREIGN_EXCEED_MAX_CASCADE: + ut_ad(thd); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_ROW_IS_REFERENCED, + "InnoDB: Cannot delete/update " + "rows with cascading foreign key " + "constraints that exceed max " + "depth of %d. Please " + "drop extra constraints and try " + "again", DICT_FK_MAX_RECURSIVE_LOAD); + + /* fall through */ + + case DB_ERROR: + default: + return(-1); /* unspecified error */ + + case DB_DUPLICATE_KEY: + /* Be cautious with returning this error, since + mysql could re-enter the storage layer to get + duplicated key info, the operation requires a + valid table handle and/or transaction information, + which might not always be available in the error + handling stage. */ + return(HA_ERR_FOUND_DUPP_KEY); + + case DB_READ_ONLY: + if(srv_force_recovery) { + return(HA_ERR_INNODB_FORCED_RECOVERY); + } + return(HA_ERR_TABLE_READONLY); + + case DB_FOREIGN_DUPLICATE_KEY: + return(HA_ERR_FOREIGN_DUPLICATE_KEY); + + case DB_MISSING_HISTORY: + return(HA_ERR_TABLE_DEF_CHANGED); + + case DB_RECORD_NOT_FOUND: + return(HA_ERR_NO_ACTIVE_RECORD); + + case DB_DEADLOCK: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_DEADLOCK); + + case DB_LOCK_WAIT_TIMEOUT: + /* Starting from 5.0.13, we let MySQL just roll back the + latest SQL statement in a lock wait timeout. Previously, we + rolled back the whole transaction. */ + + if (thd) { + thd_mark_transaction_to_rollback( + thd, (bool) row_rollback_on_timeout); + } + + return(HA_ERR_LOCK_WAIT_TIMEOUT); + + case DB_NO_REFERENCED_ROW: + return(HA_ERR_NO_REFERENCED_ROW); + + case DB_ROW_IS_REFERENCED: + return(HA_ERR_ROW_IS_REFERENCED); + + case DB_CANNOT_ADD_CONSTRAINT: + case DB_CHILD_NO_INDEX: + case DB_PARENT_NO_INDEX: + return(HA_ERR_CANNOT_ADD_FOREIGN); + + case DB_CANNOT_DROP_CONSTRAINT: + + return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit + misleading, a new MySQL error + code should be introduced */ + + case DB_CORRUPTION: + return(HA_ERR_CRASHED); + + case DB_OUT_OF_FILE_SPACE: + return(HA_ERR_RECORD_FILE_FULL); + + case DB_TEMP_FILE_WRITE_FAILURE: + return(HA_ERR_TEMP_FILE_WRITE_FAILURE); + + case DB_TABLE_IN_FK_CHECK: + return(HA_ERR_TABLE_IN_FK_CHECK); + + case DB_TABLE_IS_BEING_USED: + return(HA_ERR_WRONG_COMMAND); + + case DB_TABLESPACE_DELETED: + case DB_TABLE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_TABLESPACE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_TOO_BIG_RECORD: { + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format() */ + bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A); + my_printf_error(ER_TOO_BIG_ROWSIZE, + "Row size too large (> %lu). Changing some columns " + "to TEXT or BLOB %smay help. In current row " + "format, BLOB prefix of %d bytes is stored inline.", + MYF(0), + page_get_free_space_of_empty(flags & + DICT_TF_COMPACT) / 2, + prefix ? 
"or using ROW_FORMAT=DYNAMIC " + "or ROW_FORMAT=COMPRESSED ": "", + prefix ? DICT_MAX_FIXED_COL_LEN : 0); + return(HA_ERR_TO_BIG_ROW); + } + + case DB_TOO_BIG_INDEX_COL: + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); + return(HA_ERR_INDEX_COL_TOO_LONG); + + case DB_NO_SAVEPOINT: + return(HA_ERR_NO_SAVEPOINT); + + case DB_LOCK_TABLE_FULL: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_TABLE_FULL); + + case DB_FTS_INVALID_DOCID: + return(HA_FTS_INVALID_DOCID); + case DB_FTS_EXCEED_RESULT_CACHE_LIMIT: + return(HA_ERR_FTS_EXCEED_RESULT_CACHE_LIMIT); + case DB_TOO_MANY_CONCURRENT_TRXS: + return(HA_ERR_TOO_MANY_CONCURRENT_TRXS); + case DB_UNSUPPORTED: + return(HA_ERR_UNSUPPORTED); + case DB_INDEX_CORRUPT: + return(HA_ERR_INDEX_CORRUPT); + case DB_UNDO_RECORD_TOO_BIG: + return(HA_ERR_UNDO_REC_TOO_BIG); + case DB_OUT_OF_MEMORY: + return(HA_ERR_OUT_OF_MEM); + case DB_TABLESPACE_EXISTS: + return(HA_ERR_TABLESPACE_EXISTS); + case DB_IDENTIFIER_TOO_LONG: + return(HA_ERR_INTERNAL_ERROR); + case DB_FTS_TOO_MANY_WORDS_IN_PHRASE: + return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE); + } +} + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +UNIV_INTERN +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: MySQL THD object */ + uint max_query_len) /*!< in: max query length to print, or 0 to + use the default max length */ +{ + char buffer[1024]; + + fputs(thd_security_context(thd, buffer, sizeof buffer, + max_query_len), f); + putc('\n', f); +} + +/******************************************************************//** +Get the error message format string. +@return the format string or 0 if not found. */ +UNIV_INTERN +const char* +innobase_get_err_msg( +/*=================*/ + int error_code) /*!< in: MySQL error code */ +{ + return(my_get_err_msg(error_code)); +} + +/******************************************************************//** +Get the variable length bounds of the given character set. */ +UNIV_INTERN +void +innobase_get_cset_width( +/*====================*/ + ulint cset, /*!< in: MySQL charset-collation code */ + ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */ + ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */ +{ + CHARSET_INFO* cs; + ut_ad(cset <= MAX_CHAR_COLL_NUM); + ut_ad(mbminlen); + ut_ad(mbmaxlen); + + cs = all_charsets[cset]; + if (cs) { + *mbminlen = cs->mbminlen; + *mbmaxlen = cs->mbmaxlen; + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + THD* thd = current_thd; + + if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) { + + /* Fix bug#46256: allow tables to be dropped if the + collation is not found, but issue a warning. */ + if ((log_warnings) + && (cset != 0)){ + + sql_print_warning( + "Unknown collation #%lu.", cset); + } + } else { + + ut_a(cset == 0); + } + + *mbminlen = *mbmaxlen = 0; + } +} + +/******************************************************************//** +Converts an identifier to a table name. 
+ */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors);
+}
+
+/**********************************************************************
+Check if the length of the identifier exceeds the maximum allowed.
+@return true when length of identifier is too long. */
+UNIV_INTERN
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+ const char* id) /* in: FK identifier to check excluding the
+ database portion. */
+{
+ int well_formed_error = 0;
+ CHARSET_INFO *cs = system_charset_info;
+ DBUG_ENTER("innobase_check_identifier_length");
+
+ size_t len = cs->cset->well_formed_len(
+ cs, id, id + strlen(id),
+ NAME_CHAR_LEN, &well_formed_error);
+
+ if (well_formed_error || len == NAME_CHAR_LEN) {
+ my_error(ER_TOO_LONG_IDENT, MYF(0), id);
+ DBUG_RETURN(true);
+ }
+ DBUG_RETURN(false);
+}
+
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, system_charset_info, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b) /*!< in: second string to compare */
+{
+ if (!a) {
+ if (!b) {
+ return(0);
+ } else {
+ return(-1);
+ }
+ } else if (!b) {
+ return(1);
+ }
+
+ return(my_strcasecmp(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+ const char* a, /*!< in: string to compare */
+ const char* b) /*!< in: wildcard string to compare */
+{
+ return(wild_case_compare(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Strip dir name from a full path name and return only the file name.
+@return file name or "null" if no file name */
+UNIV_INTERN
+const char*
+innobase_basename(
+/*==============*/
+ const char* path_name) /*!< in: full path name */
+{
+ const char* name = base_name(path_name);
+
+ return((name) ? name : "null");
+}
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+ char* a) /*!< in/out: string to put in lower case */
+{
+ my_casedn_str(system_charset_info, a);
+}
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */ +UNIV_INTERN +struct charset_info_st* +innobase_get_charset( +/*=================*/ + THD* mysql_thd) /*!< in: MySQL thread handle */ +{ + return(thd_charset(mysql_thd)); +} + +/**********************************************************************//** +Determines the current SQL statement. +@return SQL statement string */ +UNIV_INTERN +const char* +innobase_get_stmt( +/*==============*/ + THD* thd, /*!< in: MySQL thread handle */ + size_t* length) /*!< out: length of the SQL statement */ +{ + LEX_STRING* stmt; + + stmt = thd_query_string(thd); + *length = stmt->length; + return(stmt->str); +} + +/**********************************************************************//** +Get the current setting of the table_def_size global parameter. We do +a dirty read because for one there is no synchronization object and +secondly there is little harm in doing so even if we get a torn read. +@return value of table_def_size */ +UNIV_INTERN +ulint +innobase_get_table_cache_size(void) +/*===============================*/ +{ + return(table_def_size); +} + +/**********************************************************************//** +Get the current setting of the lower_case_table_names global parameter from +mysqld.cc. We do a dirty read because for one there is no synchronization +object and secondly there is little harm in doing so even if we get a torn +read. +@return value of lower_case_table_names */ +UNIV_INTERN +ulint +innobase_get_lower_case_table_names(void) +/*=====================================*/ +{ + return(lower_case_table_names); +} + +/*********************************************************************//** +Creates a temporary file. +@return temporary file descriptor, or < 0 on error */ +UNIV_INTERN +int +innobase_mysql_tmpfile(void) +/*========================*/ +{ + int fd2 = -1; + File fd; + + DBUG_EXECUTE_IF( + "innobase_tmpfile_creation_failure", + return(-1); + ); + + fd = mysql_tmpfile("ib"); + + if (fd >= 0) { + /* Copy the file descriptor, so that the additional resources + allocated by create_temp_file() can be freed by invoking + my_close(). + + Because the file descriptor returned by this function + will be passed to fdopen(), it will be closed by invoking + fclose(), which in turn will invoke close() instead of + my_close(). */ + +#ifdef _WIN32 + /* Note that on Windows, the integer returned by mysql_tmpfile + has no relation to C runtime file descriptor. Here, we need + to call my_get_osfhandle to get the HANDLE and then convert it + to C runtime filedescriptor. */ + { + HANDLE hFile = my_get_osfhandle(fd); + HANDLE hDup; + BOOL bOK = DuplicateHandle( + GetCurrentProcess(), + hFile, GetCurrentProcess(), + &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); + if (bOK) { + fd2 = _open_osfhandle((intptr_t) hDup, 0); + } else { + my_osmaperr(GetLastError()); + fd2 = -1; + } + } +#else + fd2 = dup(fd); +#endif + if (fd2 < 0) { + char errbuf[MYSYS_STRERROR_SIZE]; + DBUG_PRINT("error",("Got error %d on dup",fd2)); + my_errno=errno; + my_error(EE_OUT_OF_FILERESOURCES, + MYF(ME_BELL+ME_WAITTANG), + "ib*", my_errno, + my_strerror(errbuf, sizeof(errbuf), my_errno)); + } + my_close(fd, MYF(MY_WME)); + } + return(fd2); +} + +/*********************************************************************//** +Wrapper around MySQL's copy_and_convert function. 
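+A minimal usage sketch (illustrative only), converting a latin1 string
+into the system character set:
+
+ char buf[64];
+ uint errors = 0;
+ ulint len = innobase_convert_string(
+ buf, sizeof(buf), system_charset_info,
+ "abc", 3, &my_charset_latin1, &errors);
+
+Here len is 3, buf holds "abc" re-encoded in UTF-8, and errors counts
+any characters that could not be converted.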
+@return number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert
+ from */
+ uint* errors) /*!< out: number of errors encountered
+ during the conversion */
+{
+ return(copy_and_convert(
+ (char*) to, (uint32) to_length, to_cs,
+ (const char*) from, (uint32) from_length, from_cs,
+ errors));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ /* XXX we use a hard limit instead of allocating
+ buf_size bytes from the heap */
+ CHARSET_INFO* data_cs;
+ char buf_tmp[8192];
+ ulint buf_tmp_used;
+ uint num_errors;
+
+ data_cs = all_charsets[charset_coll];
+
+ buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
+ system_charset_info,
+ data, data_len, data_cs,
+ &num_errors);
+
+ return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
+}
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
+@return the next value */
+UNIV_INTERN
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+{
+ ulonglong next_value;
+ ulonglong block = need * step;
+
+ /* Should never be 0. */
+ ut_a(need > 0);
+ ut_a(block > 0);
+ ut_a(max_value > 0);
+
+ /* According to MySQL documentation, if the offset is greater than
+ the step then the offset is ignored. */
+ if (offset > block) {
+ offset = 0;
+ }
+
+ /* Check for overflow.
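+ (For instance, with a TINYINT UNSIGNED column max_value is 255;
+ once current has reached 255 this function simply returns
+ max_value, telling the caller that the range is exhausted.)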
+ Current can be > max_value if the value is
+ in reality a negative value. The Visual Studio compiler converts
+ large double values automatically into the maximum value of the
+ unsigned long long datatype. */
+
+ if (block >= max_value
+ || offset > max_value
+ || current >= max_value
+ || max_value - offset <= offset) {
+
+ next_value = max_value;
+ } else {
+ ut_a(max_value > current);
+
+ ulonglong free = max_value - current;
+
+ if (free < offset || free - offset <= block) {
+ next_value = max_value;
+ } else {
+ next_value = 0;
+ }
+ }
+
+ if (next_value == 0) {
+ ulonglong next;
+
+ if (current > offset) {
+ next = (current - offset) / step;
+ } else {
+ next = (offset - current) / step;
+ }
+
+ ut_a(max_value > next);
+ next_value = next * step;
+ /* Check for multiplication overflow. */
+ ut_a(next_value >= next);
+ ut_a(max_value > next_value);
+
+ /* Check for overflow */
+ if (max_value - next_value >= block) {
+
+ next_value += block;
+
+ if (max_value - next_value >= offset) {
+ next_value += offset;
+ } else {
+ next_value = max_value;
+ }
+ } else {
+ next_value = max_value;
+ }
+ }
+
+ ut_a(next_value != 0);
+ ut_a(next_value <= max_value);
+
+ return(next_value);
+}
+
+/*********************************************************************//**
+Initializes some fields in an InnoDB transaction object. */
+static
+void
+innobase_trx_init(
+/*==============*/
+ THD* thd, /*!< in: user thread handle */
+ trx_t* trx) /*!< in/out: InnoDB transaction handle */
+{
+ DBUG_ENTER("innobase_trx_init");
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+ DBUG_ASSERT(thd == trx->mysql_thd);
+
+ trx->check_foreigns = !thd_test_options(
+ thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+
+ trx->check_unique_secondary = !thd_test_options(
+ thd, OPTION_RELAXED_UNIQUE_CHECKS);
+
+ trx->fake_changes = THDVAR(thd, fake_changes);
+
+#ifdef EXTENDED_SLOWLOG
+ if (thd_log_slow_verbosity(thd) & (1ULL << SLOG_V_INNODB)) {
+ trx->take_stats = TRUE;
+ } else {
+ trx->take_stats = FALSE;
+ }
+#else
+ trx->take_stats = FALSE;
+#endif
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object for DML.
+@return InnoDB transaction handle */
+UNIV_INTERN
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+ THD* thd) /*!< in: user thread handle */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_trx_allocate");
+ DBUG_ASSERT(thd != NULL);
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+
+ trx = trx_allocate_for_mysql();
+
+ trx->mysql_thd = thd;
+
+ innobase_trx_init(thd, trx);
+
+ DBUG_RETURN(trx);
+}
+
+/*********************************************************************//**
+Gets the InnoDB transaction handle for a MySQL handler object, creates
+an InnoDB transaction struct if the corresponding MySQL thread struct still
+lacks one.
+@return InnoDB transaction handle */
+static inline
+trx_t*
+check_trx_exists(
+/*=============*/
+ THD* thd) /*!< in: user thread handle */
+{
+ trx_t*& trx = thd_to_trx(thd);
+
+ ut_ad(EQ_CURRENT_THD(thd));
+
+ if (trx == NULL) {
+ trx = innobase_trx_allocate(thd);
+ } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
+ mem_analyze_corruption(trx);
+ ut_error;
+ }
+
+ innobase_trx_init(thd, trx);
+
+ return(trx);
+}
+
+/*************************************************************************
+Gets current trx.
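+Returns NULL if the calling thread has no current THD, e.g. when
+called from a background thread.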
+ */
+trx_t*
+innobase_get_trx()
+{
+ THD *thd=current_thd;
+ if (likely(thd != 0)) {
+ trx_t*& trx = thd_to_trx(thd);
+ return(trx);
+ } else {
+ return(NULL);
+ }
+}
+
+ibool
+innobase_get_slow_log()
+{
+#ifdef EXTENDED_SLOWLOG
+ return((ibool) thd_opt_slow_log());
+#else
+ return(FALSE);
+#endif
+}
+
+/*********************************************************************//**
+Note that a transaction has been registered with MySQL.
+@return true if transaction is registered with MySQL 2PC coordinator */
+static inline
+bool
+trx_is_registered_for_2pc(
+/*=========================*/
+ const trx_t* trx) /* in: transaction */
+{
+ return(trx->is_registered == 1);
+}
+
+/*********************************************************************//**
+Note that a transaction has been registered with MySQL 2PC coordinator. */
+static inline
+void
+trx_register_for_2pc(
+/*==================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx->is_registered = 1;
+ ut_ad(trx->owns_prepare_mutex == 0);
+}
+
+/*********************************************************************//**
+Note that a transaction has been deregistered. */
+static inline
+void
+trx_deregister_from_2pc(
+/*====================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx->is_registered = 0;
+ trx->owns_prepare_mutex = 0;
+}
+
+/*********************************************************************//**
+Check if transaction is started.
+@return true if transaction is in state started */
+static
+bool
+trx_is_started(
+/*===========*/
+ trx_t* trx) /* in: transaction */
+{
+ return(trx->state != TRX_STATE_NOT_STARTED);
+}
+
+/****************************************************************//**
+Update log_checksum_algorithm_ptr with a pointer to the function corresponding
+to a given checksum algorithm. */
+static
+void
+innodb_log_checksum_func_update(
+/*============================*/
+ ulint algorithm) /*!< in: algorithm */
+{
+ switch (algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ log_checksum_algorithm_ptr=log_block_calc_checksum_innodb;
+ break;
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ log_checksum_algorithm_ptr=log_block_calc_checksum_crc32;
+ break;
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ log_checksum_algorithm_ptr=log_block_calc_checksum_none;
+ break;
+ default:
+ ut_a(0);
+ }
+}
+
+/****************************************************************//**
+On update hook for the innodb_log_checksum_algorithm variable. */
+static
+void
+innodb_log_checksum_algorithm_update(
+/*=================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ srv_checksum_algorithm_t algorithm;
+
+ algorithm = (srv_checksum_algorithm_t)
+ (*static_cast<const ulong*>(save));
+
+ /* Make sure we are the only log user */
+ mutex_enter(&log_sys->mutex);
+
+ innodb_log_checksum_func_update(algorithm);
+
+ srv_log_checksum_algorithm = algorithm;
+
+ mutex_exit(&log_sys->mutex);
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table
+object, but are frequently used inside InnoDB so we keep their copies in
+the InnoDB table object.
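+For example, a table created with STATS_PERSISTENT=1 and
+STATS_AUTO_RECALC=0 carries those options in its HA_CREATE_INFO, and
+the function below transfers them to the dict_table_t so that InnoDB
+code can test them without going back to the .frm data.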
*/ +UNIV_INTERN +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const HA_CREATE_INFO* create_info) /*!< in: create info */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table)) { + /* Temp tables do not use persistent stats. */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = create_info->table_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = create_info->table_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = create_info->stats_sample_pages; +} + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +UNIV_INTERN +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const TABLE_SHARE* table_share) /*!< in: table share */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table)) { + /* Temp tables do not use persistent stats */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = table_share->db_create_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = table_share->db_create_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = table_share->stats_sample_pages; +} + +/*********************************************************************//** +Construct ha_innobase handler. */ +UNIV_INTERN +ha_innobase::ha_innobase( +/*=====================*/ + handlerton* hton, + TABLE_SHARE* table_arg) + :handler(hton, table_arg), + int_table_flags(HA_REC_NOT_IN_SEQ | + HA_NULL_IN_KEY | + HA_CAN_INDEX_BLOBS | + HA_CAN_SQL_HANDLER | + HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | + HA_PRIMARY_KEY_IN_READ_INDEX | + HA_BINLOG_ROW_CAPABLE | + HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ | + HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT | + HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), + start_of_scan(0), + num_write_row(0) +{} + +/*********************************************************************//** +Destruct ha_innobase handler. */ +UNIV_INTERN +ha_innobase::~ha_innobase() +/*======================*/ +{ +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. */ +UNIV_INTERN inline +void +ha_innobase::update_thd( +/*====================*/ + THD* thd) /*!< in: thd to use the handle */ +{ + trx_t* trx; + + DBUG_ENTER("ha_innobase::update_thd"); + DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p", + user_thd, thd)); + + /* The table should have been opened in ha_innobase::open(). 
*/ + DBUG_ASSERT(prebuilt->table->n_ref_count > 0); + + trx = check_trx_exists(thd); + + if (prebuilt->trx != trx) { + + row_update_prebuilt_trx(prebuilt, trx); + } + + user_thd = thd; + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. */ +UNIV_INTERN +void +ha_innobase::update_thd() +/*=====================*/ +{ + THD* thd = ha_thd(); + + ut_ad(EQ_CURRENT_THD(thd)); + update_thd(thd); +} + +/*********************************************************************//** +Registers an InnoDB transaction with the MySQL 2PC coordinator, so that +the MySQL XA code knows to call the InnoDB prepare and commit, or rollback +for the transaction. This MUST be called for every transaction for which +the user may call commit or rollback. Calling this several times to register +the same transaction is allowed, too. This function also registers the +current SQL statement. */ +static inline +void +innobase_register_trx( +/*==================*/ + handlerton* hton, /* in: Innobase handlerton */ + THD* thd, /* in: MySQL thd (connection) object */ + trx_t* trx) /* in: transaction to register */ +{ + trans_register_ha(thd, FALSE, hton); + + if (!trx_is_registered_for_2pc(trx) + && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + trans_register_ha(thd, TRUE, hton); + } + + trx_register_for_2pc(trx); +} + +/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB + ------------------------------------------------------------ + +1) The use of the query cache for TBL is disabled when there is an +uncommitted change to TBL. + +2) When a change to TBL commits, InnoDB stores the current value of +its global trx id counter, let us denote it by INV_TRX_ID, to the table object +in the InnoDB data dictionary, and does only allow such transactions whose +id <= INV_TRX_ID to use the query cache. + +3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit +modification because an ON DELETE CASCADE, we invalidate the MySQL query cache +of TBL immediately. + +How this is implemented inside InnoDB: + +1) Since every modification always sets an IX type table lock on the InnoDB +table, it is easy to check if there can be uncommitted modifications for a +table: just check if there are locks in the lock list of the table. + +2) When a transaction inside InnoDB commits, it reads the global trx id +counter and stores the value INV_TRX_ID to the tables on which it had a lock. + +3) If there is an implicit table change from ON DELETE CASCADE or SET NULL, +InnoDB calls an invalidate method for the MySQL query cache for that table. + +How this is implemented inside sql_cache.cc: + +1) The query cache for an InnoDB table TBL is invalidated immediately at an +INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay +invalidation to the transaction commit. + +2) To store or retrieve a value from the query cache of an InnoDB table TBL, +any query must first ask InnoDB's permission. We must pass the thd as a +parameter because InnoDB will look at the trx id, if any, associated with +that thd. Also the full_name which is used as key to search for the table +object. The full_name is a string containing the normalized path to the +table in the canonical format. + +3) Use of the query cache for InnoDB tables is now allowed also when +AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. 
+Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/******************************************************************//**
+The MySQL query cache uses this to check from InnoDB if the query cache at
+the moment is allowed to operate on an InnoDB table. The SQL query must
+be a non-locking SELECT.
+
+The query cache is allowed to operate on a certain query only if this
+function returns TRUE for all tables in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of a SELECT processing. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
+Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
+the InnoDB trx_sys->mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE
+does not mean we should invalidate the query cache: invalidation is
+called explicitly */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+ THD* thd, /*!< in: thd of the user who is trying to
+ store a result to the query cache or
+ retrieve it */
+ char* full_name, /*!< in: normalized path to the table */
+ uint full_name_len, /*!< in: length of the normalized path
+ to the table */
+ ulonglong *unused) /*!< unused for this engine */
+{
+ ibool is_autocommit;
+ trx_t* trx;
+ char norm_name[1000];
+
+ ut_a(full_name_len < 999);
+
+ trx = check_trx_exists(thd);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+ plain SELECT if AUTOCOMMIT is not on. */
+
+ return((my_bool)FALSE);
+ }
+
+ if (UNIV_UNLIKELY(trx->has_search_latch)) {
+ sql_print_error("The calling thread is holding the adaptive "
+ "search latch though calling "
+ "innobase_query_caching_of_table_permitted.");
+ trx_print(stderr, trx, 1024);
+ }
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ is_autocommit = TRUE;
+ } else {
+ is_autocommit = FALSE;
+ }
+
+ if (is_autocommit && trx->n_mysql_tables_in_use == 0) {
+ /* We are going to retrieve the query result from the query
+ cache. This cannot be a store operation to the query cache
+ because then MySQL would have locks on tables already.
+
+ TODO: if the user has used LOCK TABLES to lock the table,
+ then we open a transaction in the call of row_.. below.
+ That trx can stay open until UNLOCK TABLES. The same problem
+ exists even if we do not use the query cache. MySQL should be
+ modified so that it ALWAYS calls some cleanup function when
+ the processing of a query ends!
+
+ We can imagine we instantaneously serialize this consistent
+ read trx to the current trx id counter. If trx2 would have
+ changed the tables of a query result stored in the cache, and
+ trx2 would have already committed, making the result obsolete,
+ then trx2 would have already invalidated the cache. Thus we
+ can trust the result in the cache is ok for this query.
*/ + + return((my_bool)TRUE); + } + + /* Normalize the table name to InnoDB format */ + normalize_table_name(norm_name, full_name); + + innobase_register_trx(innodb_hton_ptr, thd, trx); + + if (row_search_check_if_query_cache_permitted(trx, norm_name)) { + + /* printf("Query cache for %s permitted\n", norm_name); */ + + return((my_bool)TRUE); + } + + /* printf("Query cache for %s NOT permitted\n", norm_name); */ + + return((my_bool)FALSE); +} + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +UNIV_INTERN +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name, /*!< in: concatenation of + database name, null char NUL, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + ulint full_name_len) /*!< in: full name length where + also the null chars count */ +{ + /* Note that the sync0sync.h rank of the query cache mutex is just + above the InnoDB trx_sys_t->lock. The caller of this function must + not have latches of a lower rank. */ + +#ifdef HAVE_QUERY_CACHE + char qcache_key_name[2 * (NAME_LEN + 1)]; + size_t tabname_len; + size_t dbname_len; + + /* Construct the key("db-name\0table$name\0") for the query cache using + the path name("db@002dname\0table@0024name\0") of the table in its + canonical form. */ + dbname_len = filename_to_tablename(full_name, qcache_key_name, + sizeof(qcache_key_name)); + tabname_len = filename_to_tablename(full_name + strlen(full_name) + 1, + qcache_key_name + dbname_len + 1, + sizeof(qcache_key_name) + - dbname_len - 1); + + /* Argument TRUE below means we are using transactions */ + mysql_query_cache_invalidate4(trx->mysql_thd, + qcache_key_name, + (dbname_len + tabname_len + 2), + TRUE); +#endif +} + +/*****************************************************************//** +Convert an SQL identifier to the MySQL system_charset_info (UTF-8) +and quote it if needed. +@return pointer to the end of buf */ +static +char* +innobase_convert_identifier( +/*========================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool file_id)/*!< in: TRUE=id is a table or database name; + FALSE=id is an UTF-8 string */ +{ + const char* s = id; + int q; + + if (file_id) { + + char nz[MAX_TABLE_NAME_LEN + 1]; + char nz2[MAX_TABLE_NAME_LEN + 1]; + + /* Decode the table name. The MySQL function expects + a NUL-terminated string. The input and output strings + buffers must not be shared. */ + ut_a(idlen <= MAX_TABLE_NAME_LEN); + memcpy(nz, id, idlen); + nz[idlen] = 0; + + s = nz2; + idlen = explain_filename(thd, nz, nz2, sizeof nz2, + EXPLAIN_PARTITIONS_AS_COMMENT); + goto no_quote; + } + + /* See if the identifier needs to be quoted. */ + if (UNIV_UNLIKELY(!thd)) { + q = '"'; + } else { + q = get_quote_char_for_identifier(thd, s, (int) idlen); + } + + if (q == EOF) { +no_quote: + if (UNIV_UNLIKELY(idlen > buflen)) { + idlen = buflen; + } + memcpy(buf, s, idlen); + return(buf + idlen); + } + + /* Quote the identifier. 
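+ (For example, with q == '`' the identifier ab`c is written out
+ as `ab``c`: every embedded quote character is doubled by the
+ loop below.)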
*/ + if (buflen < 2) { + return(buf); + } + + *buf++ = q; + buflen--; + + for (; idlen; idlen--) { + int c = *s++; + if (UNIV_UNLIKELY(c == q)) { + if (UNIV_UNLIKELY(buflen < 3)) { + break; + } + + *buf++ = c; + *buf++ = c; + buflen -= 2; + } else { + if (UNIV_UNLIKELY(buflen < 2)) { + break; + } + + *buf++ = c; + buflen--; + } + } + + *buf++ = q; + return(buf); +} + +/*****************************************************************//** +Convert a table or index name to the MySQL system_charset_info (UTF-8) +and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool table_id)/*!< in: TRUE=id is a table or database name; + FALSE=id is an index name */ +{ + char* s = buf; + const char* bufend = buf + buflen; + + if (table_id) { + const char* slash = (const char*) memchr(id, '/', idlen); + if (!slash) { + + goto no_db_name; + } + + /* Print the database name and table name separately. */ + s = innobase_convert_identifier(s, bufend - s, id, slash - id, + thd, TRUE); + if (UNIV_LIKELY(s < bufend)) { + *s++ = '.'; + s = innobase_convert_identifier(s, bufend - s, + slash + 1, idlen + - (slash - id) - 1, + thd, TRUE); + } + } else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) { + /* Temporary index name (smart ALTER TABLE) */ + const char temp_index_suffix[]= "--temporary--"; + + s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1, + thd, FALSE); + if (s - buf + (sizeof temp_index_suffix - 1) < buflen) { + memcpy(s, temp_index_suffix, + sizeof temp_index_suffix - 1); + s += sizeof temp_index_suffix - 1; + } + } else { +no_db_name: + s = innobase_convert_identifier(buf, buflen, id, idlen, + thd, table_id); + } + + return(s); +} + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table or +index name to the MySQL system_charset_info (UTF-8) and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name, /*!< in: index or table name to format */ + ibool is_index_name) /*!< in: index name */ +{ + const char* bufend; + + bufend = innobase_convert_name(buf, buflen, name, strlen(name), + NULL, !is_index_name); + + ut_ad((ulint) (bufend - buf) < buflen); + + buf[bufend - buf] = '\0'; +} + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. +@return TRUE if interrupted */ +UNIV_INTERN +ibool +trx_is_interrupted( +/*===============*/ + const trx_t* trx) /*!< in: transaction */ +{ + return(trx && trx->mysql_thd && thd_killed(trx->mysql_thd)); +} + +/**********************************************************************//** +Determines if the currently running transaction is in strict mode. 
+@return TRUE if strict */ +UNIV_INTERN +ibool +trx_is_strict( +/*==========*/ + trx_t* trx) /*!< in: transaction */ +{ + return(trx && trx->mysql_thd && THDVAR(trx->mysql_thd, strict_mode)); +} + +/**********************************************************************//** +Determines if the current MySQL thread is running in strict mode. +If thd==NULL, THDVAR returns the global value of innodb-strict-mode. +@return TRUE if strict */ +UNIV_INLINE +ibool +thd_is_strict( +/*==========*/ + THD* thd) /*!< in: MySQL thread descriptor */ +{ + return(THDVAR(thd, strict_mode)); +} + +/**************************************************************//** +Resets some fields of a prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. */ +inline +void +ha_innobase::reset_template(void) +/*=============================*/ +{ + ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_ad(prebuilt->magic_n2 == prebuilt->magic_n); + + prebuilt->keep_other_fields_on_keyread = 0; + prebuilt->read_just_key = 0; + prebuilt->in_fts_query = 0; + /* Reset index condition pushdown state. */ + if (prebuilt->idx_cond) { + prebuilt->idx_cond = NULL; + prebuilt->idx_cond_n_cols = 0; + /* Invalidate prebuilt->mysql_template + in ha_innobase::write_row(). */ + prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE; + } +} + +/*****************************************************************//** +Call this when you have opened a new table handle in HANDLER, before you +call index_read_idx() etc. Actually, we can let the cursor stay open even +over a transaction commit! Then you should call this before every operation, +fetch next etc. This function inits the necessary things even after a +transaction commit. */ +UNIV_INTERN +void +ha_innobase::init_table_handle_for_HANDLER(void) +/*============================================*/ +{ + /* If current thd does not yet have a trx struct, create one. + If the current handle does not yet have a prebuilt struct, create + one. Update the trx pointers in the prebuilt struct. Normally + this operation is done in external_lock. */ + + update_thd(ha_thd()); + + /* Initialize the prebuilt struct much like it would be inited in + external_lock */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + innobase_srv_conc_force_exit_innodb(prebuilt->trx); + + /* If the transaction is not started yet, start it */ + + trx_start_if_not_started_xa(prebuilt->trx); + + /* Assign a read view if the transaction does not have it yet */ + + trx_assign_read_view(prebuilt->trx); + + innobase_register_trx(ht, user_thd, prebuilt->trx); + + /* We did the necessary inits in this function, no need to repeat them + in row_search_for_mysql */ + + prebuilt->sql_stat_start = FALSE; + + /* We let HANDLER always to do the reads as consistent reads, even + if the trx isolation level would have been specified as SERIALIZABLE */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + + /* Always fetch all columns in the index record */ + + prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS; + + /* We want always to fetch all columns in the whole row? Or do + we???? */ + + prebuilt->used_in_HANDLER = TRUE; + reset_template(); +} + +/*********************************************************************//** +Opens an InnoDB database. 
+@return 0 on success, error code on failure */ +static +int +innobase_init( +/*==========*/ + void *p) /*!< in: InnoDB handlerton */ +{ + static char current_dir[3]; /*!< Set if using current lib */ + int err; + bool ret; + char *default_path; + uint format_id; + ulong num_pll_degree; + + DBUG_ENTER("innobase_init"); + handlerton *innobase_hton= (handlerton*) p; + innodb_hton_ptr = innobase_hton; + + innobase_hton->state = SHOW_OPTION_YES; + innobase_hton->db_type= DB_TYPE_INNODB; + innobase_hton->savepoint_offset = sizeof(trx_named_savept_t); + innobase_hton->close_connection = innobase_close_connection; + innobase_hton->savepoint_set = innobase_savepoint; + innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint; + innobase_hton->savepoint_rollback_can_release_mdl = + innobase_rollback_to_savepoint_can_release_mdl; + innobase_hton->savepoint_release = innobase_release_savepoint; + innobase_hton->commit = innobase_commit; + innobase_hton->rollback = innobase_rollback; + innobase_hton->prepare = innobase_xa_prepare; + innobase_hton->recover = innobase_xa_recover; + innobase_hton->commit_by_xid = innobase_commit_by_xid; + innobase_hton->rollback_by_xid = innobase_rollback_by_xid; + innobase_hton->create_cursor_read_view = innobase_create_cursor_view; + innobase_hton->set_cursor_read_view = innobase_set_cursor_view; + innobase_hton->close_cursor_read_view = innobase_close_cursor_view; + innobase_hton->create = innobase_create_handler; + innobase_hton->drop_database = innobase_drop_database; + innobase_hton->panic = innobase_end; + + innobase_hton->start_consistent_snapshot = + innobase_start_trx_and_assign_read_view; + innobase_hton->clone_consistent_snapshot = + innobase_start_trx_and_clone_read_view; + + innobase_hton->flush_logs = innobase_flush_logs; + innobase_hton->show_status = innobase_show_status; + innobase_hton->flags = HTON_SUPPORTS_EXTENDED_KEYS | + HTON_SUPPORTS_ONLINE_BACKUPS | HTON_SUPPORTS_FOREIGN_KEYS; + + innobase_hton->release_temporary_latches = + innobase_release_temporary_latches; + innobase_hton->purge_archive_logs = innobase_purge_archive_logs; + + innobase_hton->data = &innodb_api_cb; + innobase_hton->flush_changed_page_bitmaps + = innobase_flush_changed_page_bitmaps; + innobase_hton->purge_changed_page_bitmaps + = innobase_purge_changed_page_bitmaps; + innobase_hton->is_fake_change = innobase_is_fake_change; + + innobase_hton->kill_connection = innobase_kill_connection; + + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); + +#ifndef DBUG_OFF + static const char test_filename[] = "-@"; + char test_tablename[sizeof test_filename + + sizeof(srv_mysql50_table_name_prefix) - 1]; + if ((sizeof(test_tablename)) - 1 + != filename_to_tablename(test_filename, + test_tablename, + sizeof(test_tablename), true) + || strncmp(test_tablename, + srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1) + || strcmp(test_tablename + + sizeof(srv_mysql50_table_name_prefix) - 1, + test_filename)) { + + sql_print_error("tablename encoding has been changed"); + + goto error; + } +#endif /* DBUG_OFF */ + + srv_log_block_size = 0; + if (innobase_log_block_size != (1 << 9)) { /*!=512*/ + uint n_shift; + + fprintf(stderr, + "InnoDB: Warning: innodb_log_block_size has been " + "changed from default value 512. 
(###EXPERIMENTAL### " + "operation)\n"); + for (n_shift = 9; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX; + n_shift++) { + if (innobase_log_block_size == ((ulong)1 << n_shift)) { + srv_log_block_size = (1 << n_shift); + fprintf(stderr, + "InnoDB: The log block size is set to " + ULINTPF ".\n",srv_log_block_size); + break; + } + } + } else { + srv_log_block_size = 512; + } + ut_ad (srv_log_block_size >= OS_MIN_LOG_BLOCK_SIZE); + + if (!srv_log_block_size) { + fprintf(stderr, + "InnoDB: Error: %lu is not a valid value for " + "innodb_log_block_size.\n" + "InnoDB: Error: A valid value for " + "innodb_log_block_size is\n" + "InnoDB: Error: a power of 2 from 512 to 16384.\n", + innobase_log_block_size); + goto error; + } + + /* Check that values don't overflow on 32-bit systems. */ + if (sizeof(ulint) == 4) { + if (innobase_buffer_pool_size > UINT_MAX32) { + sql_print_error( + "innobase_buffer_pool_size can't be over 4GB" + " on 32-bit systems"); + + goto error; + } + } + + os_innodb_umask = (ulint) my_umask; + + /* First calculate the default path for innodb_data_home_dir etc., + in case the user has not given any value. + + Note that when using the embedded server, the datadirectory is not + necessarily the current directory of this program. */ + + if (mysqld_embedded) { + default_path = mysql_real_data_home; + fil_path_to_mysql_datadir = mysql_real_data_home; + } else { + /* It's better to use current lib, to keep paths short */ + current_dir[0] = FN_CURLIB; + current_dir[1] = FN_LIBCHAR; + current_dir[2] = 0; + default_path = current_dir; + } + + ut_a(default_path); + + /* Set InnoDB initialization parameters according to the values + read from MySQL .cnf file */ + + /*--------------- Data files -------------------------*/ + + /* The default dir for data files is the datadir of MySQL */ + + srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir : + default_path); + + /* Set default InnoDB data file size to 12 MB and let it be + auto-extending. Thus users can use InnoDB in >= 4.0 without having + to specify any startup options. 
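+ For illustration, innodb_data_file_path=ibdata1:12M:autoextend
+ creates a single auto-extending 12 MB system tablespace file,
+ while a fixed two-file layout can be requested with e.g.
+ ibdata1:2000M;ibdata2:2000M.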
+ */
+
+ if (!innobase_data_file_path) {
+ innobase_data_file_path = (char*) "ibdata1:12M:autoextend";
+ }
+
+ /* Since InnoDB edits the argument in the next call, we make another
+ copy of it: */
+
+ internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
+ MYF(MY_FAE));
+
+ ret = (bool) srv_parse_data_file_paths_and_sizes(
+ internal_innobase_data_file_path);
+ if (ret == FALSE) {
+ sql_print_error(
+ "InnoDB: syntax error in innodb_data_file_path"
+ " or size specified is less than 1 megabyte");
+mem_free_and_error:
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path);
+ goto error;
+ }
+
+ /* -------------- All log files ---------------------------*/
+
+ /* The default dir for log files is the datadir of MySQL */
+
+ if (!srv_log_group_home_dir) {
+ srv_log_group_home_dir = default_path;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (!innobase_log_arch_dir) {
+ innobase_log_arch_dir = srv_log_group_home_dir;
+ }
+ srv_arch_dir = innobase_log_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ srv_normalize_path_for_win(srv_log_group_home_dir);
+
+ if (strchr(srv_log_group_home_dir, ';')) {
+ sql_print_error("syntax error in innodb_log_group_home_dir");
+ goto mem_free_and_error;
+ }
+
+ if (innobase_mirrored_log_groups == 1) {
+ sql_print_warning(
+ "innodb_mirrored_log_groups is an unimplemented "
+ "feature and the variable will be completely "
+ "removed in a future version.");
+ }
+
+ if (innobase_mirrored_log_groups > 1) {
+ sql_print_error(
+ "innodb_mirrored_log_groups is an unimplemented feature and "
+ "the variable will be completely removed in a future version. "
+ "Using values other than 1 is not supported.");
+ goto mem_free_and_error;
+ }
+
+ if (innobase_mirrored_log_groups == 0) {
+ /* To throw a deprecation warning message when the option is
+ passed, the default was changed to '0' (as a workaround). Since
+ the only value accepted for this option is '1', reset it to 1 */
+ innobase_mirrored_log_groups = 1;
+ }
+
+ /* Validate the file format by animal name */
+ if (innobase_file_format_name != NULL) {
+
+ format_id = innobase_file_format_name_lookup(
+ innobase_file_format_name);
+
+ if (format_id > UNIV_FORMAT_MAX) {
+
+ sql_print_error("InnoDB: wrong innodb_file_format.");
+
+ goto mem_free_and_error;
+ }
+ } else {
+ /* Set it to the default file format id. Though this
+ should never happen. */
+ format_id = 0;
+ }
+
+ srv_file_format = format_id;
+
+ /* Given the type of innobase_file_format_name we have little
+ choice but to cast away the constness from the returned name.
+ innobase_file_format_name is used in the MySQL set variable
+ interface and so can't be const. */
+
+ innobase_file_format_name =
+ (char*) trx_sys_file_format_id_to_name(format_id);
+
+ /* Check innobase_file_format_check variable */
+ if (!innobase_file_format_check) {
+
+ /* Set the value to disable checking. */
+ srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
+
+ } else {
+
+ /* Set the value to the lowest supported format. */
+ srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
+ }
+
+ /* Did the user specify a format name that we support?
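+ (The accepted names are the file format "animal" names, e.g.
+ Antelope for the original REDUNDANT/COMPACT row formats and
+ Barracuda for DYNAMIC/COMPRESSED, or their numeric equivalents.)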
+ As a side effect it will update the variable + srv_max_file_format_at_startup */ + if (innobase_file_format_validate_and_set( + innobase_file_format_max) < 0) { + + sql_print_error("InnoDB: invalid " + "innodb_file_format_max value: " + "should be any value up to %s or its " + "equivalent numeric id", + trx_sys_file_format_id_to_name( + UNIV_FORMAT_MAX)); + + goto mem_free_and_error; + } + + if (innobase_change_buffering) { + ulint use; + + for (use = 0; + use < UT_ARR_SIZE(innobase_change_buffering_values); + use++) { + if (!innobase_strcasecmp( + innobase_change_buffering, + innobase_change_buffering_values[use])) { + ibuf_use = (ibuf_use_t) use; + goto innobase_change_buffering_inited_ok; + } + } + + sql_print_error("InnoDB: invalid value " + "innodb_change_buffering=%s", + innobase_change_buffering); + goto mem_free_and_error; + } + +innobase_change_buffering_inited_ok: + ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values)); + innobase_change_buffering = (char*) + innobase_change_buffering_values[ibuf_use]; + + /* Check that interdependent parameters have sane values. */ + if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) { + sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct.\n" + "InnoDB: Setting" + " innodb_max_dirty_pages_pct_lwm to %lu\n", + srv_max_buf_pool_modified_pct); + + srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; + } + + if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) { + + if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) { + /* Avoid overflow. */ + srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT; + } else { + /* The user has not set the value. We should + set it based on innodb_io_capacity. */ + srv_max_io_capacity = static_cast<ulong>( + ut_max(2 * srv_io_capacity, 2000)); + } + + } else if (srv_max_io_capacity < srv_io_capacity) { + sql_print_warning("InnoDB: innodb_io_capacity" + " cannot be set higher than" + " innodb_io_capacity_max.\n" + "InnoDB: Setting" + " innodb_io_capacity to %lu\n", + srv_max_io_capacity); + + srv_io_capacity = srv_max_io_capacity; + } + + if (!is_filename_allowed(srv_buf_dump_filename, + strlen(srv_buf_dump_filename), FALSE)) { + sql_print_error("InnoDB: innodb_buffer_pool_filename" + " cannot have colon (:) in the file name."); + goto mem_free_and_error; + } + + /* --------------------------------------------------*/ + + srv_file_flush_method_str = innobase_file_flush_method; + + srv_log_file_size = (ib_uint64_t) innobase_log_file_size; + +#ifdef UNIV_LOG_ARCHIVE + srv_log_archive_on = (ulint) innobase_log_archive; +#endif /* UNIV_LOG_ARCHIVE */ + + /* Check that the value of system variable innodb_page_size was + set correctly. Its value was put into srv_page_size. 
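/* Editor's note: a standalone sketch of the innodb_io_capacity_max
   defaulting rule above: when the user left the variable at its "not
   set" sentinel, it becomes max(2 * innodb_io_capacity, 2000), with an
   overflow guard.  The constant here is an illustrative stand-in for
   SRV_MAX_IO_CAPACITY_LIMIT. */
#include <algorithm>
#include <cstdio>

static const unsigned long MAX_IO_CAPACITY_LIMIT = ~0UL;  /* illustrative */

static unsigned long default_max_io_capacity(unsigned long io_capacity)
{
    if (io_capacity >= MAX_IO_CAPACITY_LIMIT / 2) {
        return MAX_IO_CAPACITY_LIMIT;          /* avoid 2*x overflowing */
    }
    return std::max(2 * io_capacity, 2000UL);  /* never below 2000 */
}

int main()
{
    printf("%lu\n", default_max_io_capacity(200));   /* prints 2000 */
    printf("%lu\n", default_max_io_capacity(5000));  /* prints 10000 */
    return 0;
}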
If valid, + return the associated srv_page_size_shift.*/ + srv_page_size_shift = innodb_page_size_validate(srv_page_size); + if (!srv_page_size_shift) { + sql_print_error("InnoDB: Invalid page size=%lu.\n", + srv_page_size); + goto mem_free_and_error; + } + if (UNIV_PAGE_SIZE_DEF != srv_page_size) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: innodb-page-size has been changed" + " from the default value %d to %lu.\n", + UNIV_PAGE_SIZE_DEF, srv_page_size); + } + + srv_log_buffer_size = (ulint) innobase_log_buffer_size; + + if (innobase_buffer_pool_instances == 0) { + innobase_buffer_pool_instances = 8; + +#if defined(__WIN__) && !defined(_WIN64) + if (innobase_buffer_pool_size > 1331 * 1024 * 1024) { + innobase_buffer_pool_instances + = ut_min(MAX_BUFFER_POOLS, + (long) (innobase_buffer_pool_size + / (128 * 1024 * 1024))); + } +#endif /* defined(__WIN__) && !defined(_WIN64) */ + } + srv_buf_pool_size = (ulint) innobase_buffer_pool_size; + srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances; + + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; + + if (innobase_additional_mem_pool_size + != 8*1024*1024L /* the default */ ) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Using " + "innodb_additional_mem_pool_size is DEPRECATED. " + "This option may be removed in future releases, " + "together with the option innodb_use_sys_malloc " + "and with the InnoDB's internal memory " + "allocator.\n"); + } + + if (!srv_use_sys_malloc ) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Setting " + "innodb_use_sys_malloc to FALSE is DEPRECATED. " + "This option may be removed in future releases, " + "together with the InnoDB's internal memory " + "allocator.\n"); + } + + srv_n_file_io_threads = (ulint) innobase_file_io_threads; + srv_n_read_io_threads = (ulint) innobase_read_io_threads; + srv_n_write_io_threads = (ulint) innobase_write_io_threads; + + srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; + + if (!innobase_use_checksums) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Setting " + "innodb_checksums to OFF is DEPRECATED. " + "This option may be removed in future releases. " + "You should set innodb_checksum_algorithm=NONE " + "instead.\n"); + srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE; + } + + innodb_log_checksum_func_update(srv_log_checksum_algorithm); + +#ifdef HAVE_LARGE_PAGES + if ((os_use_large_pages = (ibool) my_use_large_pages)) { + os_large_page_size = (ulint) opt_large_page_size; + } +#endif + + row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout; + + srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog; + if (innobase_locks_unsafe_for_binlog) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Using " + "innodb_locks_unsafe_for_binlog is DEPRECATED. " + "This option may be removed in future releases. " + "Please use READ COMMITTED transaction isolation " + "level instead, see " REFMAN "set-transaction.html.\n"); + } + + if (innobase_open_files < 10) { + innobase_open_files = 300; + if (srv_file_per_table && table_cache_size > 300) { + innobase_open_files = table_cache_size; + } + } + srv_max_n_open_files = (ulint) innobase_open_files; + srv_innodb_status = (ibool) innobase_create_status_file; + + srv_print_verbose_log = mysqld_embedded ? 
0 : 1; + + /* Round up fts_sort_pll_degree to nearest power of 2 number */ + for (num_pll_degree = 1; + num_pll_degree < fts_sort_pll_degree; + num_pll_degree <<= 1) { + + /* No op */ + } + + fts_sort_pll_degree = num_pll_degree; + + /* Store the default charset-collation number of this MySQL + installation */ + + data_mysql_default_charset_coll = (ulint) default_charset_info->number; + + ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL == + my_charset_latin1.number); + ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number); + + /* Store the latin1_swedish_ci character ordering table to InnoDB. For + non-latin1_swedish_ci charsets we use the MySQL comparison functions, + and consequently we do not need to know the ordering internally in + InnoDB. */ + + ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci")); + srv_latin1_ordering = my_charset_latin1.sort_order; + + innobase_commit_concurrency_init_default(); + +#ifndef EXTENDED_FOR_KILLIDLE + srv_kill_idle_transaction = 0; +#endif + + srv_use_atomic_writes = (ibool) innobase_use_atomic_writes; + if (innobase_use_atomic_writes) { + ib_logf(IB_LOG_LEVEL_INFO, "using atomic writes."); + + /* Force doublewrite buffer off, atomic writes replace it. */ + if (srv_use_doublewrite_buf) { + ib_logf(IB_LOG_LEVEL_INFO, "switching off doublewrite " + "buffer because of atomic writes."); + innobase_use_doublewrite = FALSE; + srv_use_doublewrite_buf = FALSE; + } + + /* Force O_DIRECT on Unixes (on Windows writes are always + unbuffered)*/ +#ifndef _WIN32 + if (!innobase_file_flush_method || + !strstr(innobase_file_flush_method, "O_DIRECT")) { + innobase_file_flush_method = + srv_file_flush_method_str = (char*)"O_DIRECT"; + ib_logf(IB_LOG_LEVEL_INFO, + "using O_DIRECT due to atomic writes."); + } +#endif +#ifdef HAVE_POSIX_FALLOCATE + /* Due to a bug in directFS, using atomic writes needs + posix_fallocate() to extend the file, because pwrite() past the + end of the file won't work */ + srv_use_posix_fallocate = TRUE; +#endif + } + +#ifdef HAVE_PSI_INTERFACE + /* Register keys with MySQL performance schema */ + int count; + + count = array_elements(all_pthread_mutexes); + mysql_mutex_register("innodb", all_pthread_mutexes, count); + +# ifdef UNIV_PFS_MUTEX + count = array_elements(all_innodb_mutexes); + mysql_mutex_register("innodb", all_innodb_mutexes, count); +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK + count = array_elements(all_innodb_rwlocks); + mysql_rwlock_register("innodb", all_innodb_rwlocks, count); +# endif /* UNIV_PFS_RWLOCK */ + +# ifdef UNIV_PFS_THREAD + count = array_elements(all_innodb_threads); + mysql_thread_register("innodb", all_innodb_threads, count); +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_IO + count = array_elements(all_innodb_files); + mysql_file_register("innodb", all_innodb_files, count); +# endif /* UNIV_PFS_IO */ + + count = array_elements(all_innodb_conds); + mysql_cond_register("innodb", all_innodb_conds, count); +#endif /* HAVE_PSI_INTERFACE */ + + /* Since in this module we directly access the fields of a trx + struct, and due to different headers and flags it might happen that + ib_mutex_t has a different size in this module and in InnoDB + modules, we check at run time that the size is the same in + these compilation modules. 
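/* Editor's note: the fts_sort_pll_degree rounding earlier in this
   function uses the classic doubling loop; a standalone sketch of the
   same computation (illustrative name): */
#include <cstdio>

static unsigned long round_up_to_power_of_two(unsigned long n)
{
    unsigned long p = 1;
    while (p < n) {
        p <<= 1;            /* 1, 2, 4, 8, ... */
    }
    return p;               /* smallest power of two >= n, for n >= 1 */
}

int main()
{
    printf("%lu %lu %lu\n",
           round_up_to_power_of_two(1),    /* 1 */
           round_up_to_power_of_two(3),    /* 4 */
           round_up_to_power_of_two(8));   /* 8 */
    return 0;
}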
*/ + + err = innobase_start_or_create_for_mysql(); + + if (err != DB_SUCCESS) { + goto mem_free_and_error; + } + + /* Adjust the innodb_undo_logs config object */ + innobase_undo_logs_init_default_max(); + + innobase_old_blocks_pct = static_cast<uint>( + buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE)); + + ibuf_max_size_update(innobase_change_buffer_max_size); + + innobase_open_tables = hash_create(200); + mysql_mutex_init(innobase_share_mutex_key, + &innobase_share_mutex, + MY_MUTEX_INIT_FAST); + mysql_mutex_init(commit_cond_mutex_key, + &commit_cond_m, MY_MUTEX_INIT_FAST); + mysql_cond_init(commit_cond_key, &commit_cond, NULL); + innodb_inited= 1; +#ifdef MYSQL_DYNAMIC_PLUGIN + if (innobase_hton != p) { + innobase_hton = reinterpret_cast<handlerton*>(p); + *innobase_hton = *innodb_hton_ptr; + } +#endif /* MYSQL_DYNAMIC_PLUGIN */ + + /* Get the current high water mark format. */ + innobase_file_format_max = (char*) trx_sys_file_format_max_get(); + + /* Currently, monitor counter information are not persistent. */ + memset(monitor_set_tbl, 0, sizeof monitor_set_tbl); + + memset(innodb_counter_value, 0, sizeof innodb_counter_value); + + /* Do this as late as possible so server is fully starts up, + since we might get some initial stats if user choose to turn + on some counters from start up */ + if (innobase_enable_monitor_counter) { + innodb_enable_monitor_at_startup( + innobase_enable_monitor_counter); + } + + /* Turn on monitor counters that are default on */ + srv_mon_default_on(); + + DBUG_RETURN(FALSE); +error: + DBUG_RETURN(TRUE); +} + +/*******************************************************************//** +Closes an InnoDB database. +@return TRUE if error */ +static +int +innobase_end( +/*=========*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + ha_panic_function type __attribute__((unused))) + /*!< in: ha_panic() parameter */ +{ + int err= 0; + + DBUG_ENTER("innobase_end"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (innodb_inited) { + + srv_fast_shutdown = (ulint) innobase_fast_shutdown; + + innodb_inited = 0; + hash_table_free(innobase_open_tables); + innobase_open_tables = NULL; + if (innobase_shutdown_for_mysql() != DB_SUCCESS) { + err = 1; + } + srv_free_paths_and_sizes(); + my_free(internal_innobase_data_file_path); + mysql_mutex_destroy(&innobase_share_mutex); + mysql_mutex_destroy(&commit_cond_m); + mysql_cond_destroy(&commit_cond); + } + + DBUG_RETURN(err); +} + +/****************************************************************//** +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes +the logs, and the name of this function should be innobase_checkpoint. +@return TRUE if error */ +static +bool +innobase_flush_logs( +/*================*/ + handlerton* hton) /*!< in/out: InnoDB handlerton */ +{ + bool result = 0; + + DBUG_ENTER("innobase_flush_logs"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (!srv_read_only_mode) { + log_buffer_flush_to_disk(); + } + + DBUG_RETURN(result); +} + +/************************************************************//** +Synchronously read and parse the redo log up to the last +checkpoint to write the changed page bitmap. +@return 0 to indicate success. Current implementation cannot fail. 
*/ +static +my_bool +innobase_flush_changed_page_bitmaps() +/*=================================*/ +{ + if (srv_track_changed_pages) { + os_event_reset(srv_checkpoint_completed_event); + log_online_follow_redo_log(); + } + return FALSE; +} + +/************************************************************//** +Delete all the bitmap files for data less than the specified LSN. +If called with lsn == IB_ULONGLONG_MAX (i.e. set by RESET request), +restart the bitmap file sequence, otherwise continue it. +@return 0 to indicate success, 1 for failure. */ +static +my_bool +innobase_purge_changed_page_bitmaps( +/*================================*/ + ulonglong lsn) /*!< in: LSN to purge files up to */ +{ + return (my_bool)log_online_purge_changed_page_bitmaps(lsn); +} + +/*****************************************************************//** +Check whether this is a fake change transaction. +@return TRUE if a fake change transaction */ +static +my_bool +innobase_is_fake_change( +/*====================*/ + handlerton *hton __attribute__((unused)), + /*!< in: InnoDB handlerton */ + THD* thd) /*!< in: MySQL thread handle of the user for + whom the transaction is being committed */ +{ + trx_t* trx = check_trx_exists(thd); + return UNIV_UNLIKELY(trx->fake_changes); +} + +/*****************************************************************//** +Commits a transaction in an InnoDB database. */ +static +void +innobase_commit_low( +/*================*/ + trx_t* trx) /*!< in: transaction handle */ +{ + if (trx_is_started(trx)) { + + trx_commit_for_mysql(trx); + } +} + +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. +@return 0 */ +static +int +innobase_start_trx_and_assign_read_view( +/*====================================*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd) /*!< in: MySQL thread handle of the user for + whom the transaction should be committed */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_start_trx_and_assign_read_view"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* Create a new trx struct for thd, if it does not yet have one */ + + trx = check_trx_exists(thd); + + /* This is just to play safe: release a possible FIFO ticket and + search latch. Since we can potentially reserve the trx_sys->mutex, + we have to release the search system latch first to obey the latching + order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* If the transaction is not started yet, start it */ + + trx_start_if_not_started_xa(trx); + + /* Assign a read view if the transaction does not have it yet. + Do this only if transaction is using REPEATABLE READ isolation + level. 
*/ + trx->isolation_level = innobase_map_isolation_level( + thd_get_trx_isolation(thd)); + + if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) { + trx_assign_read_view(trx); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: WITH CONSISTENT SNAPSHOT " + "was ignored because this phrase " + "can only be used with " + "REPEATABLE READ isolation level."); + } + + /* Set the MySQL flag to mark that there is an active transaction */ + + innobase_register_trx(hton, current_thd, trx); + + DBUG_RETURN(0); +} + + +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +clones snapshot for a consistent read from another session, if it has one. +@return 0 */ +static +int +innobase_start_trx_and_clone_read_view( +/*====================================*/ + handlerton* hton, /* in: Innodb handlerton */ + THD* thd, /* in: MySQL thread handle of the + user for whom the transaction should + be committed */ + THD* from_thd) /* in: MySQL thread handle of the + user session from which the consistent + read should be cloned */ +{ + trx_t* trx; + trx_t* from_trx; + + DBUG_ENTER("innobase_start_trx_and_clone_read_view"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* Get transaction handle from the donor session */ + + from_trx = thd_to_trx(from_thd); + + if (!from_trx) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: WITH CONSISTENT SNAPSHOT " + "FROM SESSION was ignored because the " + "specified session does not have an open " + "transaction inside InnoDB."); + + DBUG_RETURN(0); + } + + /* Create a new trx struct for thd, if it does not yet have one */ + + trx = check_trx_exists(thd); + + /* This is just to play safe: release a possible FIFO ticket and + search latch. Since we can potentially reserve the trx_sys->mutex, + we have to release the search system latch first to obey the latching + order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* If the transaction is not started yet, start it */ + + trx_start_if_not_started_xa(trx); + + /* Clone the read view from the donor transaction. Do this only if + transaction is using REPEATABLE READ isolation level. */ + trx->isolation_level = innobase_map_isolation_level( + thd_get_trx_isolation(thd)); + + if (trx->isolation_level != TRX_ISO_REPEATABLE_READ) { + + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: WITH CONSISTENT SNAPSHOT " + "was ignored because this phrase " + "can only be used with " + "REPEATABLE READ isolation level."); + } else { + + lock_mutex_enter(); + mutex_enter(&trx_sys->mutex); + trx_mutex_enter(from_trx); + + if (!trx_clone_read_view(trx, from_trx)) { + + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: WITH CONSISTENT SNAPSHOT " + "FROM SESSION was ignored because " + "the target transaction has not been " + "assigned a read view."); + } + + trx_mutex_exit(from_trx); + mutex_exit(&trx_sys->mutex); + lock_mutex_exit(); + } + + /* Set the MySQL flag to mark that there is an active transaction */ + + innobase_register_trx(hton, current_thd, trx); + + DBUG_RETURN(0); +} + +/*****************************************************************//** +Commits a transaction in an InnoDB database or marks an SQL statement +ended. 
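/* Editor's note: the clone-read-view path above takes the lock_sys,
   trx_sys and donor-trx mutexes in one fixed global order and releases
   them in reverse.  A generic sketch of that latching discipline using
   plain std::mutex (not InnoDB's mutex types; names are my own): */
#include <cstdio>
#include <mutex>

std::mutex lock_sys_m;   /* level 1: always taken first  */
std::mutex trx_sys_m;    /* level 2: always taken second */
std::mutex trx_m;        /* level 3: always taken last   */

static void clone_under_latches()
{
    std::lock_guard<std::mutex> l1(lock_sys_m);
    std::lock_guard<std::mutex> l2(trx_sys_m);
    std::lock_guard<std::mutex> l3(trx_m);

    /* ... copy the read view here.  Every thread that needs any two of
       these mutexes acquires them in this same order, so no cycle (and
       hence no deadlock) can form. */
    printf("cloned under all three latches\n");
}   /* guards release in reverse order: trx, trx_sys, lock_sys */

int main()
{
    clone_under_latches();
    return 0;
}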
+@return 0 */ +static +int +innobase_commit( +/*============*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + bool commit_trx) /*!< in: true - commit transaction + false - the current SQL statement + ended */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_commit"); + DBUG_ASSERT(hton == innodb_hton_ptr); + DBUG_PRINT("trans", ("ending transaction")); + + trx = check_trx_exists(thd); + + /* Since we will reserve the trx_sys->mutex, we have to release + the search system latch first to obey the latching order. */ + + /* No-op in XtraDB */ + trx_search_latch_release_if_reserved(trx); + + if (UNIV_UNLIKELY(trx->fake_changes && + (commit_trx || + (!thd_test_options(thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))))) { + + /* rollback implicitly */ + innobase_rollback(hton, thd, commit_trx); + /* because debug assertion code complains, if something left */ + thd->get_stmt_da()->reset_diagnostics_area(); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + /* Transaction is deregistered only in a commit or a rollback. If + it is deregistered we know there cannot be resources to be freed + and we could return immediately. For the time being, we play safe + and do the cleanup though there should be nothing to clean up. */ + + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { + + sql_print_error("Transaction not registered for MySQL 2PC, " + "but transaction is active"); + } + + if (commit_trx + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + DBUG_EXECUTE_IF("crash_innodb_before_commit", + DBUG_SUICIDE();); + + /* We were instructed to commit the whole transaction, or + this is an SQL statement end and autocommit is on */ + + /* We need current binlog position for mysqlbackup to work. */ +retry: + if (innobase_commit_concurrency > 0) { + mysql_mutex_lock(&commit_cond_m); + commit_threads++; + + if (commit_threads > innobase_commit_concurrency) { + commit_threads--; + mysql_cond_wait(&commit_cond, + &commit_cond_m); + mysql_mutex_unlock(&commit_cond_m); + goto retry; + } + else { + mysql_mutex_unlock(&commit_cond_m); + } + } + + /* The following call read the binary log position of + the transaction being committed. + + Binary logging of other engines is not relevant to + InnoDB as all InnoDB requires is that committing + InnoDB transactions appear in the same order in the + MySQL binary log as they appear in InnoDB logs, which + is guaranteed by the server. + + If the binary log is not enabled, or the transaction + is not written to the binary log, the file name will + be a NULL pointer. */ + unsigned long long pos; + thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos); + trx->mysql_log_offset= static_cast<ib_int64_t>(pos); + /* Don't do write + flush right now. For group commit + to work we want to do the flush later. */ + trx->flush_log_later = TRUE; + innobase_commit_low(trx); + trx->flush_log_later = FALSE; + + if (innobase_commit_concurrency > 0) { + mysql_mutex_lock(&commit_cond_m); + commit_threads--; + mysql_cond_signal(&commit_cond); + mysql_mutex_unlock(&commit_cond_m); + } + + trx_deregister_from_2pc(trx); + + /* Now do a write + flush of logs. 
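/* Editor's note: the innobase_commit_concurrency gate above is a
   counting throttle built from a mutex, a counter and a condition
   variable (the goto-retry loop).  A compact standalone sketch of the
   same pattern with a wait-predicate instead of the retry label: */
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

static std::mutex              gate_m;
static std::condition_variable gate_cv;
static int                     committers = 0;
static const int               max_committers = 2;  /* illustrative limit */

static void commit_one(int id)
{
    {   /* wait until a slot is free, like the retry loop above */
        std::unique_lock<std::mutex> lk(gate_m);
        gate_cv.wait(lk, [] { return committers < max_committers; });
        committers++;
    }

    printf("trx %d committing\n", id);   /* the commit itself */

    {   /* leave the gate and wake one waiter */
        std::lock_guard<std::mutex> lk(gate_m);
        committers--;
    }
    gate_cv.notify_one();
}

int main()
{
    std::vector<std::thread> trxs;
    for (int i = 0; i < 8; i++) {
        trxs.emplace_back(commit_one, i);
    }
    for (auto& t : trxs) {
        t.join();
    }
    return 0;
}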
*/ + trx_commit_complete_for_mysql(trx); + } else { + /* We just mark the SQL statement ended and do not do a + transaction commit */ + + /* If we had reserved the auto-inc lock for some + table in this SQL statement we release it now */ + + lock_unlock_table_autoinc(trx); + + /* Store the current undo_no of the transaction so that we + know where to roll back if we have to roll back the next + SQL statement */ + + trx_mark_sql_stat_end(trx); + } + + trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */ + + /* This is a statement level variable. */ + trx->fts_next_doc_id = 0; + + innobase_srv_conc_force_exit_innodb(trx); + + DBUG_RETURN(0); +} + +/*****************************************************************//** +Rolls back a transaction or the latest SQL statement. +@return 0 or error number */ +static +int +innobase_rollback( +/*==============*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back */ + bool rollback_trx) /*!< in: TRUE - rollback entire + transaction FALSE - rollback the current + statement only */ +{ + dberr_t error; + trx_t* trx; + + DBUG_ENTER("innobase_rollback"); + DBUG_ASSERT(hton == innodb_hton_ptr); + DBUG_PRINT("trans", ("aborting transaction")); + + trx = check_trx_exists(thd); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */ + + /* If we had reserved the auto-inc lock for some table (if + we come here to roll back the latest SQL statement) we + release it now before a possibly lengthy rollback */ + + lock_unlock_table_autoinc(trx); + + /* This is a statement level variable. */ + trx->fts_next_doc_id = 0; + + if (rollback_trx + || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + error = trx_rollback_for_mysql(trx); + trx_deregister_from_2pc(trx); + } else { + error = trx_rollback_last_sql_stat_for_mysql(trx); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/*****************************************************************//** +Rolls back a transaction +@return 0 or error number */ +static +int +innobase_rollback_trx( +/*==================*/ + trx_t* trx) /*!< in: transaction */ +{ + dberr_t error = DB_SUCCESS; + + DBUG_ENTER("innobase_rollback_trx"); + DBUG_PRINT("trans", ("aborting transaction")); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* If we had reserved the auto-inc lock for some table (if + we come here to roll back the latest SQL statement) we + release it now before a possibly lengthy rollback */ + + lock_unlock_table_autoinc(trx); + + if (!trx->read_only) { + error = trx_rollback_for_mysql(trx); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/*****************************************************************//** +Rolls back a transaction to a savepoint. 
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback_to_savepoint( +/*===========================*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back to savepoint */ + void* savepoint) /*!< in: savepoint data */ +{ + ib_int64_t mysql_binlog_cache_pos; + dberr_t error; + trx_t* trx; + char name[64]; + + DBUG_ENTER("innobase_rollback_to_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* TODO: use provided savepoint data area to store savepoint data */ + + longlong2str((ulint) savepoint, name, 36); + + error = trx_rollback_to_savepoint_for_mysql( + trx, name, &mysql_binlog_cache_pos); + + if (error == DB_SUCCESS && trx->fts_trx != NULL) { + fts_savepoint_rollback(trx, name); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/*****************************************************************//** +Check whether innodb state allows to safely release MDL locks after +rollback to savepoint. +When binlog is on, MDL locks acquired after savepoint unit are not +released if there are any locks held in InnoDB. +@return true if it is safe, false if its not safe. */ +static +bool +innobase_rollback_to_savepoint_can_release_mdl( +/*===========================================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd) /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back to savepoint */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + ut_ad(trx); + + /* If transaction has not acquired any locks then it is safe + to release MDL after rollback to savepoint */ + if (!(UT_LIST_GET_LEN(trx->lock.trx_locks))) { + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/*****************************************************************//** +Release transaction savepoint name. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_release_savepoint( +/*=======================*/ + handlerton* hton, /*!< in: handlerton for Innodb */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction's + savepoint should be released */ + void* savepoint) /*!< in: savepoint data */ +{ + dberr_t error; + trx_t* trx; + char name[64]; + + DBUG_ENTER("innobase_release_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + /* TODO: use provided savepoint data area to store savepoint data */ + + longlong2str((ulint) savepoint, name, 36); + + error = trx_release_savepoint_for_mysql(trx, name); + + if (error == DB_SUCCESS && trx->fts_trx != NULL) { + fts_savepoint_release(trx, name); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/*****************************************************************//** +Sets a transaction savepoint. 
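/* Editor's note: the savepoint functions above and below name each
   savepoint by encoding its pointer with longlong2str(..., 36).  A
   standalone sketch of base-36 encoding (digits 0-9 then a-z); the
   function is my own, not the MySQL helper: */
#include <cstdio>

static char* to_base36(unsigned long long v, char* buf, int buf_len)
{
    static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
    char* p = buf + buf_len;

    *--p = '\0';
    do {
        *--p = digits[v % 36];   /* least significant digit first */
        v /= 36;
    } while (v != 0 && p > buf);

    return p;                    /* the name starts here, not at buf */
}

int main()
{
    char buf[64];
    /* e.g. encode a pointer-sized value into a short identifier */
    printf("%s\n", to_base36(123456789ULL, buf, sizeof(buf)));
    return 0;
}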
+@return always 0, that is, always succeeds */ +static +int +innobase_savepoint( +/*===============*/ + handlerton* hton, /*!< in: handle to the Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread */ + void* savepoint) /*!< in: savepoint data */ +{ + dberr_t error; + trx_t* trx; + + DBUG_ENTER("innobase_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* In the autocommit mode there is no sense to set a savepoint + (unless we are in sub-statement), so SQL layer ensures that + this method is never called in such situation. */ + + trx = check_trx_exists(thd); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* Cannot happen outside of transaction */ + DBUG_ASSERT(trx_is_registered_for_2pc(trx)); + + /* TODO: use provided savepoint data area to store savepoint data */ + char name[64]; + longlong2str((ulint) savepoint,name,36); + + error = trx_savepoint_for_mysql(trx, name, (ib_int64_t)0); + + if (error == DB_SUCCESS && trx->fts_trx != NULL) { + fts_savepoint_take(trx, trx->fts_trx, name); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +static +int +innobase_close_connection( +/*======================*/ + handlerton* hton, /*!< in: innobase handlerton */ + THD* thd) /*!< in: handle to the MySQL thread of the user + whose resources should be free'd */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_close_connection"); + DBUG_ASSERT(hton == innodb_hton_ptr); + trx = thd_to_trx(thd); + + ut_a(trx); + + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { + + sql_print_error("Transaction not registered for MySQL 2PC, " + "but transaction is active"); + } + + if (trx_is_started(trx) && log_warnings) { + + sql_print_warning( + "MySQL is closing a connection that has an active " + "InnoDB transaction. " TRX_ID_FMT " row modifications " + "will roll back.", + trx->undo_no); + } + + innobase_rollback_trx(trx); + + trx_free_for_mysql(trx); + + DBUG_RETURN(0); +} + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +UNIV_INTERN +int +innobase_close_thd( +/*===============*/ + THD* thd) /*!< in: handle to the MySQL thread of the user + whose resources should be free'd */ +{ + trx_t* trx = thd_to_trx(thd); + + if (!trx) { + return(0); + } + + return(innobase_close_connection(innodb_hton_ptr, thd)); +} + +/*************************************************************************//** +** InnoDB database tables +*****************************************************************************/ + +/****************************************************************//** +Get the record format from the data dictionary. 
+@return one of ROW_TYPE_REDUNDANT, ROW_TYPE_COMPACT, +ROW_TYPE_COMPRESSED, ROW_TYPE_DYNAMIC */ +UNIV_INTERN +enum row_type +ha_innobase::get_row_type() const +/*=============================*/ +{ + if (prebuilt && prebuilt->table) { + const ulint flags = prebuilt->table->flags; + + switch (dict_tf_get_rec_format(flags)) { + case REC_FORMAT_REDUNDANT: + return(ROW_TYPE_REDUNDANT); + case REC_FORMAT_COMPACT: + return(ROW_TYPE_COMPACT); + case REC_FORMAT_COMPRESSED: + return(ROW_TYPE_COMPRESSED); + case REC_FORMAT_DYNAMIC: + return(ROW_TYPE_DYNAMIC); + } + } + ut_ad(0); + return(ROW_TYPE_NOT_USED); +} + +/*****************************************************************//** +Cancel any pending lock request associated with the current THD. */ +static +void +innobase_kill_connection( +/*======================*/ + handlerton* hton, /*!< in: innobase handlerton */ + THD* thd) /*!< in: handle to the MySQL thread being killed */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_kill_connection"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + lock_mutex_enter(); + + trx = thd_to_trx(thd); + + if (trx) + { + trx_mutex_enter(trx); + + /* Cancel a pending lock request. */ + if (trx->lock.wait_lock) + lock_cancel_waiting_and_release(trx->lock.wait_lock); + + trx_mutex_exit(trx); + } + + lock_mutex_exit(); + + DBUG_VOID_RETURN; +} + + + +/****************************************************************//** +Get the table flags to use for the statement. +@return table flags */ +UNIV_INTERN +handler::Table_flags +ha_innobase::table_flags() const +/*============================*/ +{ + /* Need to use tx_isolation here since table flags is (also) + called before prebuilt is inited. */ + ulong const tx_isolation = thd_tx_isolation(ha_thd()); + + if (tx_isolation <= ISO_READ_COMMITTED) { + return(int_table_flags); + } + + return(int_table_flags | HA_BINLOG_STMT_CAPABLE); +} + +/****************************************************************//** +Gives the file extension of an InnoDB single-table tablespace. */ +static const char* ha_innobase_exts[] = { + ".ibd", + ".isl", + NullS +}; + +/****************************************************************//** +Returns the table type (storage engine name). +@return table type */ +UNIV_INTERN +const char* +ha_innobase::table_type() const +/*===========================*/ +{ + return(innobase_hton_name); +} + +/****************************************************************//** +Returns the index type. +@return index type */ +UNIV_INTERN +const char* +ha_innobase::index_type( +/*====================*/ + uint keynr) /*!< : index number */ +{ + dict_index_t* index = innobase_get_index(keynr); + + if (index && index->type & DICT_FTS) { + return("FULLTEXT"); + } else { + return("BTREE"); + } +} + +/****************************************************************//** +Returns the table file name extension. +@return file extension string */ +UNIV_INTERN +const char** +ha_innobase::bas_ext() const +/*========================*/ +{ + return(ha_innobase_exts); +} + +/****************************************************************//** +Returns the operations supported for indexes. +@return flags of supported operations */ +UNIV_INTERN +ulong +ha_innobase::index_flags( +/*=====================*/ + uint key, + uint, + bool) const +{ + return((table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT) + ? 
0 + : (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER + | HA_READ_RANGE | HA_KEYREAD_ONLY + | HA_DO_INDEX_COND_PUSHDOWN)); +} + +/****************************************************************//** +Returns the maximum number of keys. +@return MAX_KEY */ +UNIV_INTERN +uint +ha_innobase::max_supported_keys() const +/*===================================*/ +{ + return(MAX_KEY); +} + +/****************************************************************//** +Returns the maximum key length. +@return maximum supported key length, in bytes */ +UNIV_INTERN +uint +ha_innobase::max_supported_key_length() const +/*=========================================*/ +{ + /* An InnoDB page must store >= 2 keys; a secondary key record + must also contain the primary key value. Therefore, if both + the primary key and the secondary key are at this maximum length, + it must be less than 1/4th of the free space on a page including + record overhead. + + MySQL imposes its own limit to this number; MAX_KEY_LENGTH = 3072. + + For page sizes = 16k, InnoDB historically reported 3500 bytes here, + But the MySQL limit of 3072 was always used through the handler + interface. */ + + switch (UNIV_PAGE_SIZE) { + case 4096: + return(768); + case 8192: + return(1536); + default: + return(3500); + } +} + +/****************************************************************//** +Returns the key map of keys that are usable for scanning. +@return key_map_full */ +UNIV_INTERN +const key_map* +ha_innobase::keys_to_use_for_scanning() +/*===================================*/ +{ + return(&key_map_full); +} + +/****************************************************************//** +Determines if table caching is supported. +@return HA_CACHE_TBL_ASKTRANSACT */ +UNIV_INTERN +uint8 +ha_innobase::table_cache_type() +/*===========================*/ +{ + return(HA_CACHE_TBL_ASKTRANSACT); +} + +/****************************************************************//** +Determines if the primary key is clustered index. +@return true */ +UNIV_INTERN +bool +ha_innobase::primary_key_is_clustered() +/*===================================*/ +{ + return(true); +} + +/*****************************************************************//** +Normalizes a table name string. A normalized name consists of the +database name catenated to '/' and table name. Example: test/mytable. +On Windows normalization puts both the database name and the +table name always to lower case if "set_lower_case" is set to TRUE. 
*/ +static +void +normalize_table_name_low( +/*=====================*/ + char* norm_name, /*!< out: normalized name as a + null-terminated string */ + const char* name, /*!< in: table name string */ + ibool set_lower_case) /*!< in: TRUE if we want to set name + to lower case */ +{ + char* name_ptr; + ulint name_len; + char* db_ptr; + ulint db_len; + char* ptr; + ulint norm_len; + + /* Scan name from the end */ + + ptr = strend(name) - 1; + + /* seek to the last path separator */ + while (ptr >= name && *ptr != '\\' && *ptr != '/') { + ptr--; + } + + name_ptr = ptr + 1; + name_len = strlen(name_ptr); + + /* skip any number of path separators */ + while (ptr >= name && (*ptr == '\\' || *ptr == '/')) { + ptr--; + } + + DBUG_ASSERT(ptr >= name); + + /* seek to the last but one path separator or one char before + the beginning of name */ + db_len = 0; + while (ptr >= name && *ptr != '\\' && *ptr != '/') { + ptr--; + db_len++; + } + + db_ptr = ptr + 1; + + norm_len = db_len + name_len + sizeof "/"; + ut_a(norm_len < FN_REFLEN - 1); + + memcpy(norm_name, db_ptr, db_len); + + norm_name[db_len] = '/'; + + /* Copy the name and null-byte. */ + memcpy(norm_name + db_len + 1, name_ptr, name_len + 1); + + if (set_lower_case) { + innobase_casedn_str(norm_name); + } +} + +#if !defined(DBUG_OFF) +/********************************************************************* +Test normalize_table_name_low(). */ +static +void +test_normalize_table_name_low() +/*===========================*/ +{ + char norm_name[FN_REFLEN]; + const char* test_data[][2] = { + /* input, expected result */ + {"./mysqltest/t1", "mysqltest/t1"}, + {"./test/#sql-842b_2", "test/#sql-842b_2"}, + {"./test/#sql-85a3_10", "test/#sql-85a3_10"}, + {"./test/#sql2-842b-2", "test/#sql2-842b-2"}, + {"./test/bug29807", "test/bug29807"}, + {"./test/foo", "test/foo"}, + {"./test/innodb_bug52663", "test/innodb_bug52663"}, + {"./test/t", "test/t"}, + {"./test/t1", "test/t1"}, + {"./test/t10", "test/t10"}, + {"/a/b/db/table", "db/table"}, + {"/a/b/db///////table", "db/table"}, + {"/a/b////db///////table", "db/table"}, + {"/var/tmp/mysqld.1/#sql842b_2_10", "mysqld.1/#sql842b_2_10"}, + {"db/table", "db/table"}, + {"ddd/t", "ddd/t"}, + {"d/ttt", "d/ttt"}, + {"d/t", "d/t"}, + {".\\mysqltest\\t1", "mysqltest/t1"}, + {".\\test\\#sql-842b_2", "test/#sql-842b_2"}, + {".\\test\\#sql-85a3_10", "test/#sql-85a3_10"}, + {".\\test\\#sql2-842b-2", "test/#sql2-842b-2"}, + {".\\test\\bug29807", "test/bug29807"}, + {".\\test\\foo", "test/foo"}, + {".\\test\\innodb_bug52663", "test/innodb_bug52663"}, + {".\\test\\t", "test/t"}, + {".\\test\\t1", "test/t1"}, + {".\\test\\t10", "test/t10"}, + {"C:\\a\\b\\db\\table", "db/table"}, + {"C:\\a\\b\\db\\\\\\\\\\\\\\table", "db/table"}, + {"C:\\a\\b\\\\\\\\db\\\\\\\\\\\\\\table", "db/table"}, + {"C:\\var\\tmp\\mysqld.1\\#sql842b_2_10", "mysqld.1/#sql842b_2_10"}, + {"db\\table", "db/table"}, + {"ddd\\t", "ddd/t"}, + {"d\\ttt", "d/ttt"}, + {"d\\t", "d/t"}, + }; + + for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) { + printf("test_normalize_table_name_low(): " + "testing \"%s\", expected \"%s\"... ", + test_data[i][0], test_data[i][1]); + + normalize_table_name_low(norm_name, test_data[i][0], FALSE); + + if (strcmp(norm_name, test_data[i][1]) == 0) { + printf("ok\n"); + } else { + printf("got \"%s\"\n", norm_name); + ut_error; + } + } +} + +/********************************************************************* +Test ut_format_name(). 
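/* Editor's note: a compact re-statement (not the XtraDB function) of
   what normalize_table_name_low() above computes: keep only the last
   two path components joined by '/', collapsing repeated separators,
   e.g. "./test/t1" -> "test/t1".  Case folding is omitted here. */
#include <cstdio>
#include <string>

static std::string normalize_table_name(std::string path)
{
    for (char& c : path) {
        if (c == '\\') c = '/';          /* accept Windows separators */
    }

    std::string::size_type end = path.find_last_not_of('/');
    std::string::size_type sep = path.find_last_of('/', end);
    std::string table = path.substr(sep + 1, end - sep);

    std::string::size_type db_end = path.find_last_not_of('/', sep);
    std::string::size_type db_sep = path.find_last_of('/', db_end);
    std::string db = path.substr(db_sep + 1, db_end - db_sep);

    return db + "/" + table;
}

int main()
{
    /* same expectations as two rows of the test table above */
    printf("%s\n", normalize_table_name("./test/t1").c_str());        /* test/t1 */
    printf("%s\n", normalize_table_name("/a/b/db///////table").c_str()); /* db/table */
    return 0;
}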
*/ +static +void +test_ut_format_name() +/*=================*/ +{ + char buf[NAME_LEN * 3]; + + struct { + const char* name; + ibool is_table; + ulint buf_size; + const char* expected; + } test_data[] = { + {"test/t1", TRUE, sizeof(buf), "\"test\".\"t1\""}, + {"test/t1", TRUE, 12, "\"test\".\"t1\""}, + {"test/t1", TRUE, 11, "\"test\".\"t1"}, + {"test/t1", TRUE, 10, "\"test\".\"t"}, + {"test/t1", TRUE, 9, "\"test\".\""}, + {"test/t1", TRUE, 8, "\"test\"."}, + {"test/t1", TRUE, 7, "\"test\""}, + {"test/t1", TRUE, 6, "\"test"}, + {"test/t1", TRUE, 5, "\"tes"}, + {"test/t1", TRUE, 4, "\"te"}, + {"test/t1", TRUE, 3, "\"t"}, + {"test/t1", TRUE, 2, "\""}, + {"test/t1", TRUE, 1, ""}, + {"test/t1", TRUE, 0, "BUF_NOT_CHANGED"}, + {"table", TRUE, sizeof(buf), "\"table\""}, + {"ta'le", TRUE, sizeof(buf), "\"ta'le\""}, + {"ta\"le", TRUE, sizeof(buf), "\"ta\"\"le\""}, + {"ta`le", TRUE, sizeof(buf), "\"ta`le\""}, + {"index", FALSE, sizeof(buf), "\"index\""}, + {"ind/ex", FALSE, sizeof(buf), "\"ind/ex\""}, + }; + + for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) { + + memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1); + + char* ret; + + ret = ut_format_name(test_data[i].name, + test_data[i].is_table, + buf, + test_data[i].buf_size); + + ut_a(ret == buf); + + if (strcmp(buf, test_data[i].expected) == 0) { + fprintf(stderr, + "ut_format_name(%s, %s, buf, %lu), " + "expected %s, OK\n", + test_data[i].name, + test_data[i].is_table ? "TRUE" : "FALSE", + test_data[i].buf_size, + test_data[i].expected); + } else { + fprintf(stderr, + "ut_format_name(%s, %s, buf, %lu), " + "expected %s, ERROR: got %s\n", + test_data[i].name, + test_data[i].is_table ? "TRUE" : "FALSE", + test_data[i].buf_size, + test_data[i].expected, + buf); + ut_error; + } + } +} +#endif /* !DBUG_OFF */ + +/********************************************************************//** +Get the upper limit of the MySQL integral and floating-point type. +@return maximum allowed value for the field */ +UNIV_INTERN +ulonglong +innobase_get_int_col_max_value( +/*===========================*/ + const Field* field) /*!< in: MySQL field */ +{ + ulonglong max_value = 0; + + switch (field->key_type()) { + /* TINY */ + case HA_KEYTYPE_BINARY: + max_value = 0xFFULL; + break; + case HA_KEYTYPE_INT8: + max_value = 0x7FULL; + break; + /* SHORT */ + case HA_KEYTYPE_USHORT_INT: + max_value = 0xFFFFULL; + break; + case HA_KEYTYPE_SHORT_INT: + max_value = 0x7FFFULL; + break; + /* MEDIUM */ + case HA_KEYTYPE_UINT24: + max_value = 0xFFFFFFULL; + break; + case HA_KEYTYPE_INT24: + max_value = 0x7FFFFFULL; + break; + /* LONG */ + case HA_KEYTYPE_ULONG_INT: + max_value = 0xFFFFFFFFULL; + break; + case HA_KEYTYPE_LONG_INT: + max_value = 0x7FFFFFFFULL; + break; + /* BIG */ + case HA_KEYTYPE_ULONGLONG: + max_value = 0xFFFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_LONGLONG: + max_value = 0x7FFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_FLOAT: + /* We use the maximum as per IEEE754-2008 standard, 2^24 */ + max_value = 0x1000000ULL; + break; + case HA_KEYTYPE_DOUBLE: + /* We use the maximum as per IEEE754-2008 standard, 2^53 */ + max_value = 0x20000000000000ULL; + break; + default: + ut_error; + } + + return(max_value); +} + +/*******************************************************************//** +This function checks whether the index column information +is consistent between KEY info from mysql and that from innodb index. +@return TRUE if all column types match. 
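/* Editor's note: the 2^24 and 2^53 caps above for FLOAT and DOUBLE key
   columns are the points where IEEE-754 single and double precision
   stop representing every integer exactly, so AUTO_INCREMENT cannot
   safely go past them.  A small demonstration: */
#include <cstdio>

int main()
{
    float  f = 16777216.0f;            /* 2^24 */
    double d = 9007199254740992.0;     /* 2^53 */

    /* Adding 1 is lost: the next representable value is 2 away. */
    printf("%d\n", (int) (f + 1.0f == f));   /* prints 1 */
    printf("%d\n", (int) (d + 1.0  == d));   /* prints 1 */
    return 0;
}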
*/ +static +ibool +innobase_match_index_columns( +/*=========================*/ + const KEY* key_info, /*!< in: Index info + from mysql */ + const dict_index_t* index_info) /*!< in: Index info + from Innodb */ +{ + const KEY_PART_INFO* key_part; + const KEY_PART_INFO* key_end; + const dict_field_t* innodb_idx_fld; + const dict_field_t* innodb_idx_fld_end; + + DBUG_ENTER("innobase_match_index_columns"); + + /* Check whether user defined index column count matches */ + if (key_info->user_defined_key_parts != + index_info->n_user_defined_cols) { + DBUG_RETURN(FALSE); + } + + key_part = key_info->key_part; + key_end = key_part + key_info->user_defined_key_parts; + innodb_idx_fld = index_info->fields; + innodb_idx_fld_end = index_info->fields + index_info->n_fields; + + /* Check each index column's datatype. We do not check + column name because there exists case that index + column name got modified in mysql but such change does not + propagate to InnoDB. + One hidden assumption here is that the index column sequences + are matched up between those in mysql and Innodb. */ + for (; key_part != key_end; ++key_part) { + ulint col_type; + ibool is_unsigned; + ulint mtype = innodb_idx_fld->col->mtype; + + /* Need to translate to InnoDB column type before + comparison. */ + col_type = get_innobase_type_from_mysql_type(&is_unsigned, + key_part->field); + + /* Ignore Innodb specific system columns. */ + while (mtype == DATA_SYS) { + innodb_idx_fld++; + + if (innodb_idx_fld >= innodb_idx_fld_end) { + DBUG_RETURN(FALSE); + } + } + + if (col_type != mtype) { + /* Column Type mismatches */ + DBUG_RETURN(FALSE); + } + + innodb_idx_fld++; + } + + DBUG_RETURN(TRUE); +} + +/*******************************************************************//** +This function builds a translation table in INNOBASE_SHARE +structure for fast index location with mysql array number from its +table->key_info structure. This also provides the necessary translation +between the key order in mysql key_info and Innodb ib_table->indexes if +they are not fully matched with each other. +Note we do not have any mutex protecting the translation table +building based on the assumption that there is no concurrent +index creation/drop and DMLs that requires index lookup. All table +handle will be closed before the index creation/drop. +@return TRUE if index translation table built successfully */ +static +ibool +innobase_build_index_translation( +/*=============================*/ + const TABLE* table, /*!< in: table in MySQL data + dictionary */ + dict_table_t* ib_table,/*!< in: table in Innodb data + dictionary */ + INNOBASE_SHARE* share) /*!< in/out: share structure + where index translation table + will be constructed in. 
*/ +{ + ulint mysql_num_index; + ulint ib_num_index; + dict_index_t** index_mapping; + ibool ret = TRUE; + + DBUG_ENTER("innobase_build_index_translation"); + + mutex_enter(&dict_sys->mutex); + + mysql_num_index = table->s->keys; + ib_num_index = UT_LIST_GET_LEN(ib_table->indexes); + + index_mapping = share->idx_trans_tbl.index_mapping; + + /* If there exists an inconsistency between the MySQL and InnoDB + dictionary (metadata) information, the number of indexes defined + in MySQL could exceed that in InnoDB; do not build the index + translation table in such a case */ + if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) { + ret = FALSE; + goto func_exit; + } + + /* If the index entry count is non-zero, nothing has + changed since the last update; directly return TRUE */ + if (share->idx_trans_tbl.index_count) { + /* Index entry count should still match mysql_num_index */ + ut_a(share->idx_trans_tbl.index_count == mysql_num_index); + goto func_exit; + } + + /* The number of indexes increased; rebuild the mapping table */ + if (mysql_num_index > share->idx_trans_tbl.array_size) { + index_mapping = (dict_index_t**) my_realloc(index_mapping, + mysql_num_index * + sizeof(*index_mapping), + MYF(MY_ALLOW_ZERO_PTR)); + + if (!index_mapping) { + /* Report an error if index_mapping continues to be + NULL and mysql_num_index is a non-zero value */ + sql_print_error("InnoDB: failed to allocate memory for " + "index translation table. Number of " + "indexes:%lu, array size:%lu", + mysql_num_index, + share->idx_trans_tbl.array_size); + ret = FALSE; + goto func_exit; + } + + share->idx_trans_tbl.array_size = mysql_num_index; + } + + /* For each index in the mysql key_info array, fetch its + corresponding InnoDB index pointer into index_mapping + array. */ + for (ulint count = 0; count < mysql_num_index; count++) { + + /* Fetch index pointers into index_mapping according to mysql + index sequence */ + index_mapping[count] = dict_table_get_index_on_name( + ib_table, table->key_info[count].name); + + if (!index_mapping[count]) { + sql_print_error("Cannot find index %s in InnoDB " + "index dictionary.", + table->key_info[count].name); + ret = FALSE; + goto func_exit; + } + + /* Double check that the fetched index has the same + column info as that in mysql key_info. */ + if (!innobase_match_index_columns(&table->key_info[count], + index_mapping[count])) { + sql_print_error("Found index %s whose column info " + "does not match that of MySQL.", + table->key_info[count].name); + ret = FALSE; + goto func_exit; + } + } + + /* Successfully built the translation table */ + share->idx_trans_tbl.index_count = mysql_num_index; + +func_exit: + if (!ret) { + /* Building the translation table failed. */ + my_free(index_mapping); + + share->idx_trans_tbl.array_size = 0; + share->idx_trans_tbl.index_count = 0; + index_mapping = NULL; + } + + share->idx_trans_tbl.index_mapping = index_mapping; + + mutex_exit(&dict_sys->mutex); + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +This function uses the index translation table to quickly locate the +requested index structure. +Note we do not have mutex protection for the index translation table +access; it is based on the assumption that there is no concurrent +translation table rebuild (after create/drop index) or DMLs that +require index lookup. +@return dict_index_t structure for the requested index; NULL if it +fails to locate the index structure. 
*/ +static +dict_index_t* +innobase_index_lookup( +/*==================*/ + INNOBASE_SHARE* share, /*!< in: share structure for index + translation table. */ + uint keynr) /*!< in: index number for the requested + index */ +{ + if (!share->idx_trans_tbl.index_mapping + || keynr >= share->idx_trans_tbl.index_count) { + return(NULL); + } + + return(share->idx_trans_tbl.index_mapping[keynr]); +} + +/************************************************************************ +Set the autoinc column max value. This should only be called once from +ha_innobase::open(). Therefore there's no need for a covering lock. */ +UNIV_INTERN +void +ha_innobase::innobase_initialize_autoinc() +/*======================================*/ +{ + ulonglong auto_inc; + const Field* field = table->found_next_number_field; + + if (field != NULL) { + auto_inc = innobase_get_int_col_max_value(field); + } else { + /* We have no idea what's been passed in to us as the + autoinc column. We set it to 0, effectively disabling + updates to the table. */ + auto_inc = 0; + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Unable to determine the AUTOINC " + "column name\n"); + } + + if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + /* If the recovery level is set so high that writes + are disabled we force the AUTOINC counter to 0, + effectively disabling writes to the table. + Secondly, we avoid reading the table in case the read + results in failure due to a corrupted table/index. + + We will not return an error to the client, so that the + tables can be dumped with minimal hassle. If an error + were returned in this case, the first attempt to read + the table would fail and subsequent SELECTs would succeed. */ + auto_inc = 0; + } else if (field == NULL) { + /* This is a far more serious error, best to avoid + opening the table and return failure. */ + my_error(ER_AUTOINC_READ_FAILED, MYF(0)); + } else { + dict_index_t* index; + const char* col_name; + ib_uint64_t read_auto_inc; + ulint err; + + update_thd(ha_thd()); + + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + + col_name = field->field_name; + index = innobase_get_index(table->s->next_number_index); + + /* Execute SELECT MAX(col_name) FROM TABLE; */ + err = row_search_max_autoinc(index, col_name, &read_auto_inc); + + switch (err) { + case DB_SUCCESS: { + ulonglong col_max_value; + + col_max_value = innobase_get_int_col_max_value(field); + + /* At this stage we do not know the increment + nor the offset, so use a default increment of 1. */ + + auto_inc = innobase_next_autoinc( + read_auto_inc, 1, 1, 0, col_max_value); + + break; + } + case DB_RECORD_NOT_FOUND: + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: MySQL and InnoDB data " + "dictionaries are out of sync.\n" + "InnoDB: Unable to find the AUTOINC column " + "%s in the InnoDB table %s.\n" + "InnoDB: We set the next AUTOINC column " + "value to 0,\n" + "InnoDB: in effect disabling the AUTOINC " + "next value generation.\n" + "InnoDB: You can either set the next " + "AUTOINC value explicitly using ALTER TABLE\n" + "InnoDB: or fix the data dictionary by " + "recreating the table.\n", + col_name, index->table->name); + + /* This will disable the AUTOINC generation. */ + auto_inc = 0; + + /* We want the open to succeed, so that the user can + take corrective action, i.e. reads should succeed but + updates should fail. */ + err = DB_SUCCESS; + break; + default: + /* row_search_max_autoinc() should only return + one of DB_SUCCESS or DB_RECORD_NOT_FOUND. 
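/* Editor's note: a simplified model, not the real innobase_next_autoinc
   signature, of the computation above: advance the stored maximum by
   one step while saturating at the column's maximum instead of
   wrapping.  All names are illustrative. */
#include <cstdio>

static unsigned long long next_autoinc(unsigned long long current,
                                       unsigned long long step,
                                       unsigned long long col_max)
{
    if (current >= col_max - step + 1) {
        return col_max;              /* saturate instead of wrapping */
    }
    return current + step;
}

int main()
{
    /* e.g. a TINYINT UNSIGNED column, whose maximum is 255 */
    printf("%llu\n", next_autoinc(41, 1, 255));    /* prints 42 */
    printf("%llu\n", next_autoinc(255, 1, 255));   /* prints 255: saturated */
    return 0;
}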
*/ + ut_error; + } + } + + dict_table_autoinc_initialize(prebuilt->table, auto_inc); +} + +/*****************************************************************//** +Creates and opens a handle to a table which already exists in an InnoDB +database. +@return 1 if error, 0 if success */ +UNIV_INTERN +int +ha_innobase::open( +/*==============*/ + const char* name, /*!< in: table name */ + int mode, /*!< in: not used */ + uint test_if_locked) /*!< in: not used */ +{ + dict_table_t* ib_table; + char norm_name[FN_REFLEN]; + THD* thd; + char* is_part = NULL; + ibool par_case_name_set = FALSE; + char par_case_name[FN_REFLEN]; + dict_err_ignore_t ignore_err = DICT_ERR_IGNORE_NONE; + + DBUG_ENTER("ha_innobase::open"); + + UT_NOT_USED(mode); + UT_NOT_USED(test_if_locked); + + thd = ha_thd(); + + /* No-op in XtraDB */ + innobase_release_temporary_latches(ht, thd); + + normalize_table_name(norm_name, name); + + user_thd = NULL; + + if (!(share=get_share(name))) { + + DBUG_RETURN(1); + } + + if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt && + srv_pass_corrupt_table <= 1)) { + free_share(share); + + DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); + } + + /* Will be allocated if it is needed in ::update_row() */ + upd_buf = NULL; + upd_buf_size = 0; + + /* We look for pattern #P# to see if the table is partitioned + MySQL table. */ +#ifdef __WIN__ + is_part = strstr(norm_name, "#p#"); +#else + is_part = strstr(norm_name, "#P#"); +#endif /* __WIN__ */ + + /* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table + can be opened even if some FK indexes are missing. If not, the table + can't be opened in the same situation */ + if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) { + ignore_err = DICT_ERR_IGNORE_FK_NOKEY; + } + + /* Get pointer to a table object in InnoDB dictionary cache */ + ib_table = dict_table_open_on_name(norm_name, FALSE, TRUE, ignore_err); + + if (ib_table + && ((!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID) + && table->s->fields != dict_table_get_n_user_cols(ib_table)) + || (DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID) + && (table->s->fields + != dict_table_get_n_user_cols(ib_table) - 1)))) { + ib_logf(IB_LOG_LEVEL_WARN, + "table %s contains %lu user defined columns " + "in InnoDB, but %lu columns in MySQL. Please " + "check INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and " + REFMAN "innodb-troubleshooting.html " + "for how to resolve it", + norm_name, (ulong) dict_table_get_n_user_cols(ib_table), + (ulong) table->s->fields); + + /* Mark this table as corrupted, so the drop table + or force recovery can still use it, but not others. */ + ib_table->corrupted = true; + dict_table_close(ib_table, FALSE, FALSE); + ib_table = NULL; + is_part = NULL; + } + + if (UNIV_UNLIKELY(ib_table && ib_table->is_corrupt && + srv_pass_corrupt_table <= 1)) { + free_share(share); + my_free(upd_buf); + upd_buf = NULL; + upd_buf_size = 0; + + DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); + } + + share->ib_table = ib_table; + + if (NULL == ib_table) { + if (is_part) { + /* MySQL partition engine hard codes the file name + separator as "#P#". The text case is fixed even if + lower_case_table_names is set to 1 or 2. This is true + for sub-partition names as well. InnoDB always + normalises file names to lower case on Windows, this + can potentially cause problems when copying/moving + tables between platforms. + + 1) If boot against an installation from Windows + platform, then its partition table name could + be in lower case in system tables. 
So we will + need to check lower case name when load table. + + 2) If we boot an installation from other case + sensitive platform in Windows, we might need to + check the existence of table name without lower + case in the system table. */ + if (innobase_get_lower_case_table_names() == 1) { + + if (!par_case_name_set) { +#ifndef __WIN__ + /* Check for the table using lower + case name, including the partition + separator "P" */ + strcpy(par_case_name, norm_name); + innobase_casedn_str(par_case_name); +#else + /* On Windows platfrom, check + whether there exists table name in + system table whose name is + not being normalized to lower case */ + normalize_table_name_low( + par_case_name, name, FALSE); +#endif + par_case_name_set = TRUE; + } + + ib_table = dict_table_open_on_name( + par_case_name, FALSE, TRUE, + ignore_err); + } + + if (ib_table) { +#ifndef __WIN__ + sql_print_warning("Partition table %s opened " + "after converting to lower " + "case. The table may have " + "been moved from a case " + "in-sensitive file system. " + "Please recreate table in " + "the current file system\n", + norm_name); +#else + sql_print_warning("Partition table %s opened " + "after skipping the step to " + "lower case the table name. " + "The table may have been " + "moved from a case sensitive " + "file system. Please " + "recreate table in the " + "current file system\n", + norm_name); +#endif + goto table_opened; + } + } + + if (is_part) { + sql_print_error("Failed to open table %s.\n", + norm_name); + } + + ib_logf(IB_LOG_LEVEL_WARN, + "Cannot open table %s from the internal data " + "dictionary of InnoDB though the .frm file " + "for the table exists. See " + REFMAN "innodb-troubleshooting.html for how " + "you can resolve the problem.", norm_name); + + free_share(share); + my_errno = ENOENT; + + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + +table_opened: + + innobase_copy_frm_flags_from_table_share(ib_table, table->s); + + dict_stats_init(ib_table); + + MONITOR_INC(MONITOR_TABLE_OPEN); + + bool no_tablespace; + + if (dict_table_is_discarded(ib_table)) { + + ib_senderrf(thd, + IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + /* Allow an open because a proper DISCARD should have set + all the flags and index root page numbers to FIL_NULL that + should prevent any DML from running but it should allow DDL + operations. */ + + no_tablespace = false; + + } else if (ib_table->ibd_file_missing) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, + ER_TABLESPACE_MISSING, norm_name); + + /* This means we have no idea what happened to the tablespace + file, best to play it safe. */ + + no_tablespace = true; + } else { + no_tablespace = false; + } + + if (!thd_tablespace_op(thd) && no_tablespace) { + free_share(share); + my_errno = ENOENT; + + dict_table_close(ib_table, FALSE, FALSE); + + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + prebuilt = row_create_prebuilt(ib_table, table->s->reclength); + + prebuilt->default_rec = table->s->default_values; + ut_ad(prebuilt->default_rec); + + /* Looks like MySQL-3.23 sometimes has primary key number != 0 */ + primary_key = table->s->primary_key; + key_used_on_scan = primary_key; + + if (!innobase_build_index_translation(table, ib_table, share)) { + sql_print_error("Build InnoDB index translation table for" + " Table %s failed", name); + } + + /* Allocate a buffer for a 'row reference'. A row reference is + a string of bytes of length ref_length which uniquely specifies + a row in our table. 
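+	For a table with a user-defined PRIMARY KEY the reference is the
+	primary key value; when InnoDB generated the clustered index
+	internally it is the 6-byte row id (DATA_ROW_ID_LEN). Both cases
+	are handled in the branches below.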
Note that MySQL may also compare two row + references for equality by doing a simple memcmp on the strings + of length ref_length! */ + + if (!row_table_got_default_clust_index(ib_table)) { + + prebuilt->clust_index_was_generated = FALSE; + + if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) { + sql_print_error("Table %s has a primary key in " + "InnoDB data dictionary, but not " + "in MySQL!", name); + + /* This mismatch could cause further problems + if not attended, bring this to the user's attention + by printing a warning in addition to log a message + in the errorlog */ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s has a " + "primary key in InnoDB data " + "dictionary, but not in " + "MySQL!", name); + + /* If primary_key >= MAX_KEY, its (primary_key) + value could be out of bound if continue to index + into key_info[] array. Find InnoDB primary index, + and assign its key_length to ref_length. + In addition, since MySQL indexes are sorted starting + with primary index, unique index etc., initialize + ref_length to the first index key length in + case we fail to find InnoDB cluster index. + + Please note, this will not resolve the primary + index mismatch problem, other side effects are + possible if users continue to use the table. + However, we allow this table to be opened so + that user can adopt necessary measures for the + mismatch while still being accessible to the table + date. */ + if (!table->key_info) { + ut_ad(!table->s->keys); + ref_length = 0; + } else { + ref_length = table->key_info[0].key_length; + } + + /* Find corresponding cluster index + key length in MySQL's key_info[] array */ + for (uint i = 0; i < table->s->keys; i++) { + dict_index_t* index; + index = innobase_get_index(i); + if (dict_index_is_clust(index)) { + ref_length = + table->key_info[i].key_length; + } + } + } else { + /* MySQL allocates the buffer for ref. + key_info->key_length includes space for all key + columns + one byte for each column that may be + NULL. ref_length must be as exact as possible to + save space, because all row reference buffers are + allocated based on ref_length. */ + + ref_length = table->key_info[primary_key].key_length; + } + } else { + if (primary_key != MAX_KEY) { + sql_print_error( + "Table %s has no primary key in InnoDB data " + "dictionary, but has one in MySQL! If you " + "created the table with a MySQL version < " + "3.23.54 and did not define a primary key, " + "but defined a unique key with all non-NULL " + "columns, then MySQL internally treats that " + "key as the primary key. You can fix this " + "error by dump + DROP + CREATE + reimport " + "of the table.", name); + + /* This mismatch could cause further problems + if not attended, bring this to the user attention + by printing a warning in addition to log a message + in the errorlog */ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s has no " + "primary key in InnoDB data " + "dictionary, but has one in " + "MySQL!", name); + } + + prebuilt->clust_index_was_generated = TRUE; + + ref_length = DATA_ROW_ID_LEN; + + /* If we automatically created the clustered index, then + MySQL does not know about it, and MySQL must NOT be aware + of the index used on scan, to make it avoid checking if we + update the column of the index. That is why we assert below + that key_used_on_scan is the undefined value MAX_KEY. + The column is the row id in the automatical generation case, + and it will never be updated anyway. 
*/ + + if (key_used_on_scan != MAX_KEY) { + sql_print_warning( + "Table %s key_used_on_scan is %lu even " + "though there is no primary key inside " + "InnoDB.", name, (ulong) key_used_on_scan); + } + } + + /* Index block size in InnoDB: used by MySQL in query optimization */ + stats.block_size = UNIV_PAGE_SIZE; + + /* Init table lock structure */ + thr_lock_data_init(&share->lock,&lock,(void*) 0); + + if (prebuilt->table) { + /* We update the highest file format in the system table + space, if this table has higher file format setting. */ + + trx_sys_file_format_max_upgrade( + (const char**) &innobase_file_format_max, + dict_table_get_format(prebuilt->table)); + } + + /* Only if the table has an AUTOINC column. */ + if (prebuilt->table != NULL + && !prebuilt->table->ibd_file_missing + && table->found_next_number_field != NULL) { + dict_table_autoinc_lock(prebuilt->table); + + /* Since a table can already be "open" in InnoDB's internal + data dictionary, we only init the autoinc counter once, the + first time the table is loaded. We can safely reuse the + autoinc value from a previous MySQL open. */ + if (dict_table_autoinc_read(prebuilt->table) == 0) { + + innobase_initialize_autoinc(); + } + + dict_table_autoinc_unlock(prebuilt->table); + } + + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + + DBUG_RETURN(0); +} + +UNIV_INTERN +handler* +ha_innobase::clone( +/*===============*/ + const char* name, /*!< in: table name */ + MEM_ROOT* mem_root) /*!< in: memory context */ +{ + ha_innobase* new_handler; + + DBUG_ENTER("ha_innobase::clone"); + + new_handler = static_cast<ha_innobase*>(handler::clone(name, + mem_root)); + if (new_handler) { + DBUG_ASSERT(new_handler->prebuilt != NULL); + + new_handler->prebuilt->select_lock_type + = prebuilt->select_lock_type; + } + + DBUG_RETURN(new_handler); +} + +UNIV_INTERN +uint +ha_innobase::max_supported_key_part_length() const +/*==============================================*/ +{ + /* A table format specific index column length check will be performed + at ha_innobase::add_index() and row_create_index_for_mysql() */ + return(innobase_large_prefix + ? REC_VERSION_56_MAX_INDEX_COL_LEN + : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1); +} + +/******************************************************************//** +Closes a handle to an InnoDB table. +@return 0 */ +UNIV_INTERN +int +ha_innobase::close() +/*================*/ +{ + THD* thd; + + DBUG_ENTER("ha_innobase::close"); + + thd = ha_thd(); + + /* No-op in XtraDB */ + innobase_release_temporary_latches(ht, thd); + + row_prebuilt_free(prebuilt, FALSE); + + if (upd_buf != NULL) { + ut_ad(upd_buf_size != 0); + my_free(upd_buf); + upd_buf = NULL; + upd_buf_size = 0; + } + + free_share(share); + + MONITOR_INC(MONITOR_TABLE_CLOSE); + + /* Tell InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + DBUG_RETURN(0); +} + +/* The following accessor functions should really be inside MySQL code! */ + +/**************************************************************//** +Gets field offset for a field in a table. +@return offset */ +static inline +uint +get_field_offset( +/*=============*/ + const TABLE* table, /*!< in: MySQL table object */ + const Field* field) /*!< in: MySQL field object */ +{ + return((uint) (field->ptr - table->record[0])); +} + +/*************************************************************//** +InnoDB uses this function to compare two data fields for which the data type +is such that we must use MySQL code to compare them. 
NOTE that the prototype +of this function is in rem0cmp.cc in InnoDB source code! If you change this +function, remember to update the prototype there! +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +UNIV_INTERN +int +innobase_mysql_cmp( +/*===============*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number, /*!< in: number of the charset */ + const unsigned char* a, /*!< in: data field */ + unsigned int a_length, /*!< in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /*!< in: data field */ + unsigned int b_length) /*!< in: data field length, + not UNIV_SQL_NULL */ +{ + CHARSET_INFO* charset; + enum_field_types mysql_tp; + int ret; + + DBUG_ASSERT(a_length != UNIV_SQL_NULL); + DBUG_ASSERT(b_length != UNIV_SQL_NULL); + + mysql_tp = (enum_field_types) mysql_type; + + switch (mysql_tp) { + + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + /* Use the charset number to pick the right charset struct for + the comparison. Since the MySQL function get_charset may be + slow before Bar removes the mutex operation there, we first + look at 2 common charsets directly. */ + + if (charset_number == default_charset_info->number) { + charset = default_charset_info; + } else if (charset_number == my_charset_latin1.number) { + charset = &my_charset_latin1; + } else { + charset = get_charset(charset_number, MYF(MY_WME)); + + if (charset == NULL) { + sql_print_error("InnoDB needs charset %lu for doing " + "a comparison, but MySQL cannot " + "find that charset.", + (ulong) charset_number); + ut_a(0); + } + } + + /* Starting from 4.1.3, we use strnncollsp() in comparisons of + non-latin1_swedish_ci strings. NOTE that the collation order + changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users + having indexes on such data need to rebuild their tables! */ + + ret = charset->coll->strnncollsp( + charset, a, a_length, b, b_length, 0); + + if (ret < 0) { + return(-1); + } else if (ret > 0) { + return(1); + } else { + return(0); + } + default: + ut_error; + } + + return(0); +} + + +/*************************************************************//** +Get the next token from the given string and store it in *token. */ +UNIV_INTERN +CHARSET_INFO* +innobase_get_fts_charset( +/*=====================*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number) /*!< in: number of the charset */ +{ + enum_field_types mysql_tp; + CHARSET_INFO* charset; + + mysql_tp = (enum_field_types) mysql_type; + + switch (mysql_tp) { + + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + /* Use the charset number to pick the right charset struct for + the comparison. Since the MySQL function get_charset may be + slow before Bar removes the mutex operation there, we first + look at 2 common charsets directly. 
*/ + + if (charset_number == default_charset_info->number) { + charset = default_charset_info; + } else if (charset_number == my_charset_latin1.number) { + charset = &my_charset_latin1; + } else { + charset = get_charset(charset_number, MYF(MY_WME)); + + if (charset == NULL) { + sql_print_error("InnoDB needs charset %lu for doing " + "a comparison, but MySQL cannot " + "find that charset.", + (ulong) charset_number); + ut_a(0); + } + } + break; + default: + ut_error; + } + + return(charset); +} + +/*************************************************************//** +InnoDB uses this function to compare two data fields for which the data type +is such that we must use MySQL code to compare them. NOTE that the prototype +of this function is in rem0cmp.c in InnoDB source code! If you change this +function, remember to update the prototype there! +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +UNIV_INTERN +int +innobase_mysql_cmp_prefix( +/*======================*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number, /*!< in: number of the charset */ + const unsigned char* a, /*!< in: data field */ + unsigned int a_length, /*!< in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /*!< in: data field */ + unsigned int b_length) /*!< in: data field length, + not UNIV_SQL_NULL */ +{ + CHARSET_INFO* charset; + int result; + + charset = innobase_get_fts_charset(mysql_type, charset_number); + + result = ha_compare_text(charset, (uchar*) a, a_length, + (uchar*) b, b_length, 1, 0); + + return(result); +} +/******************************************************************//** +compare two character string according to their charset. */ +UNIV_INTERN +int +innobase_fts_text_cmp( +/*==================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + const CHARSET_INFO* charset = (const CHARSET_INFO*) cs; + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + + return(ha_compare_text( + charset, s1->f_str, static_cast<uint>(s1->f_len), + s2->f_str, static_cast<uint>(s2->f_len), 0, 0)); +} +/******************************************************************//** +compare two character string case insensitively according to their charset. */ +UNIV_INTERN +int +innobase_fts_text_case_cmp( +/*=======================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + const CHARSET_INFO* charset = (const CHARSET_INFO*) cs; + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + ulint newlen; + + my_casedn_str(charset, (char*) s2->f_str); + + newlen = strlen((const char*) s2->f_str); + + return(ha_compare_text( + charset, s1->f_str, static_cast<uint>(s1->f_len), + s2->f_str, static_cast<uint>(newlen), 0, 0)); +} +/******************************************************************//** +Get the first character's code position for FTS index partition. 
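+The position is derived from the first two weight bytes produced by
+my_strnxfrm(), so for a case-insensitive collation strings such as
+'Apple' and 'apple' map to the same partition.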
*/ +UNIV_INTERN +ulint +innobase_strnxfrm( +/*==============*/ + const CHARSET_INFO* + cs, /*!< in: Character set */ + const uchar* str, /*!< in: string */ + const ulint len) /*!< in: string length */ +{ + uchar mystr[2]; + ulint value; + + if (!str || len == 0) { + return(0); + } + + my_strnxfrm(cs, (uchar*) mystr, 2, str, len); + + value = mach_read_from_2(mystr); + + if (value > 255) { + value = value / 256; + } + + return(value); +} + +/******************************************************************//** +compare two character string according to their charset. */ +UNIV_INTERN +int +innobase_fts_text_cmp_prefix( +/*=========================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: prefix key */ + const void* p2) /*!< in: value to compare */ +{ + const CHARSET_INFO* charset = (const CHARSET_INFO*) cs; + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + int result; + + result = ha_compare_text( + charset, s2->f_str, static_cast<uint>(s2->f_len), + s1->f_str, static_cast<uint>(s1->f_len), 1, 0); + + /* We switched s1, s2 position in ha_compare_text. So we need + to negate the result */ + return(-result); +} + +/******************************************************************//** +Makes all characters in a string lower case. */ +UNIV_INTERN +size_t +innobase_fts_casedn_str( +/*====================*/ + CHARSET_INFO* cs, /*!< in: Character set */ + char* src, /*!< in: string to put in lower case */ + size_t src_len,/*!< in: input string length */ + char* dst, /*!< in: buffer for result string */ + size_t dst_len)/*!< in: buffer size */ +{ + if (cs->casedn_multiply == 1) { + memcpy(dst, src, src_len); + dst[src_len] = 0; + my_casedn_str(cs, dst); + + return(strlen(dst)); + } else { + return(cs->cset->casedn(cs, src, src_len, dst, dst_len)); + } +} + +#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_') + +#define misc_word_char(X) 0 + +/*************************************************************//** +Get the next token from the given string and store it in *token. +It is mostly copied from MyISAM's doc parsing function ft_simple_get_word() +@return length of string processed */ +UNIV_INTERN +ulint +innobase_mysql_fts_get_token( +/*=========================*/ + CHARSET_INFO* cs, /*!< in: Character set */ + const byte* start, /*!< in: start of text */ + const byte* end, /*!< in: one character past end of + text */ + fts_string_t* token, /*!< out: token's text */ + ulint* offset) /*!< out: offset to token, + measured as characters from + 'start' */ +{ + int mbl; + const uchar* doc = start; + + ut_a(cs); + + token->f_n_char = token->f_len = 0; + token->f_str = NULL; + + for (;;) { + + if (doc >= end) { + return(doc - start); + } + + int ctype; + + mbl = cs->cset->ctype( + cs, &ctype, doc, (const uchar*) end); + + if (true_word_char(ctype, *doc)) { + break; + } + + doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); + } + + ulint mwc = 0; + ulint length = 0; + + token->f_str = const_cast<byte*>(doc); + + while (doc < end) { + + int ctype; + + mbl = cs->cset->ctype( + cs, &ctype, (uchar*) doc, (uchar*) end); + if (true_word_char(ctype, *doc)) { + mwc = 0; + } else if (!misc_word_char(*doc) || mwc) { + break; + } else { + ++mwc; + } + + ++length; + + doc += mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1); + } + + token->f_len = (uint) (doc - token->f_str) - mwc; + token->f_n_char = length; + + return(doc - start); +} + +/**************************************************************//** +Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@return DATA_BINARY, DATA_VARCHAR, ... */ +UNIV_INTERN +ulint +get_innobase_type_from_mysql_type( +/*==============================*/ + ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an + 'unsigned type'; + at least ENUM and SET, + and unsigned integer + types are 'unsigned types' */ + const void* f) /*!< in: MySQL Field */ +{ + const class Field* field = reinterpret_cast<const class Field*>(f); + + /* The following asserts try to check that the MySQL type code fits in + 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to + the type */ + + DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256); + + if (field->flags & UNSIGNED_FLAG) { + + *unsigned_flag = DATA_UNSIGNED; + } else { + *unsigned_flag = 0; + } + + if (field->real_type() == MYSQL_TYPE_ENUM + || field->real_type() == MYSQL_TYPE_SET) { + + /* MySQL has field->type() a string type for these, but the + data is actually internally stored as an unsigned integer + code! */ + + *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned + flag set to zero, even though + internally this is an unsigned + integer type */ + return(DATA_INT); + } + + switch (field->type()) { + /* NOTE that we only allow string types in DATA_MYSQL and + DATA_VARMYSQL */ + case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */ + case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */ + if (field->binary()) { + return(DATA_BINARY); + } else if (strcmp(field->charset()->name, + "latin1_swedish_ci") == 0) { + return(DATA_VARCHAR); + } else { + return(DATA_VARMYSQL); + } + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: if (field->binary()) { + + return(DATA_FIXBINARY); + } else if (strcmp(field->charset()->name, + "latin1_swedish_ci") == 0) { + return(DATA_CHAR); + } else { + return(DATA_MYSQL); + } + case MYSQL_TYPE_NEWDECIMAL: + return(DATA_FIXBINARY); + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_LONGLONG: + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDATE: + return(DATA_INT); + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_TIMESTAMP: + switch (field->real_type()) { + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_TIMESTAMP: + return(DATA_INT); + default: /* Fall through */ + DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256); + case MYSQL_TYPE_TIME2: + case MYSQL_TYPE_DATETIME2: + case MYSQL_TYPE_TIMESTAMP2: + return(DATA_FIXBINARY); + } + case MYSQL_TYPE_FLOAT: + return(DATA_FLOAT); + case MYSQL_TYPE_DOUBLE: + return(DATA_DOUBLE); + case MYSQL_TYPE_DECIMAL: + return(DATA_DECIMAL); + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + return(DATA_BLOB); + case MYSQL_TYPE_NULL: + /* MySQL currently accepts "NULL" datatype, but will + reject such datatype in the next release. 
We will cope + with it and not trigger assertion failure in 5.1 */ + break; + default: + ut_error; + } + + return(0); +} + +/*******************************************************************//** +Writes an unsigned integer value < 64k to 2 bytes, in the little-endian +storage format. */ +static inline +void +innobase_write_to_2_little_endian( +/*==============================*/ + byte* buf, /*!< in: where to store */ + ulint val) /*!< in: value to write, must be < 64k */ +{ + ut_a(val < 256 * 256); + + buf[0] = (byte)(val & 0xFF); + buf[1] = (byte)(val / 256); +} + +/*******************************************************************//** +Reads an unsigned integer value < 64k from 2 bytes, in the little-endian +storage format. +@return value */ +static inline +uint +innobase_read_from_2_little_endian( +/*===============================*/ + const uchar* buf) /*!< in: from where to read */ +{ + return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])))); +} + +/*******************************************************************//** +Stores a key value for a row to a buffer. +@return key value length as stored in buff */ +UNIV_INTERN +uint +ha_innobase::store_key_val_for_row( +/*===============================*/ + uint keynr, /*!< in: key number */ + char* buff, /*!< in/out: buffer for the key value (in MySQL + format) */ + uint buff_len,/*!< in: buffer length */ + const uchar* record)/*!< in: row in MySQL format */ +{ + KEY* key_info = table->key_info + keynr; + KEY_PART_INFO* key_part = key_info->key_part; + KEY_PART_INFO* end = + key_part + key_info->user_defined_key_parts; + char* buff_start = buff; + enum_field_types mysql_type; + Field* field; + ibool is_null; + + DBUG_ENTER("store_key_val_for_row"); + + /* The format for storing a key field in MySQL is the following: + + 1. If the column can be NULL, then in the first byte we put 1 if the + field value is NULL, 0 otherwise. + + 2. If the column is of a BLOB type (it must be a column prefix field + in this case), then we put the length of the data in the field to the + next 2 bytes, in the little-endian format. If the field is SQL NULL, + then these 2 bytes are set to 0. Note that the length of data in the + field is <= column prefix length. + + 3. In a column prefix field, prefix_len next bytes are reserved for + data. In a normal field the max field length next bytes are reserved + for data. For a VARCHAR(n) the max field length is n. If the stored + value is the SQL NULL then these data bytes are set to 0. + + 4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that + in the MySQL row format, the length is stored in 1 or 2 bytes, + depending on the maximum allowed length. But in the MySQL key value + format, the length always takes 2 bytes. + + We have to zero-fill the buffer so that MySQL is able to use a + simple memcmp to compare two key values to determine if they are + equal. MySQL does this to compare contents of two 'ref' values. 
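+
+	As an illustration (not part of the original comment): for a
+	nullable VARCHAR(8) column in a single-byte charset holding 'abc',
+	the stored key value is laid out as
+
+		byte 0       0x00           NULL flag (0 = not NULL)
+		bytes 1-2    0x03 0x00      length 3, little-endian
+		bytes 3-5    'a' 'b' 'c'    the data
+		bytes 6-10   0x00 ...       zero fill up to the max length
+
+	i.e. 1 + 2 + 8 bytes are used regardless of the actual value.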
*/ + + memset(buff, 0, buff_len); + + for (; key_part != end; key_part++) { + is_null = FALSE; + + if (key_part->null_bit) { + if (record[key_part->null_offset] + & key_part->null_bit) { + *buff = 1; + is_null = TRUE; + } else { + *buff = 0; + } + buff++; + } + + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_VARCHAR) { + /* >= 5.0.3 true VARCHAR */ + ulint lenlen; + ulint len; + const byte* data; + ulint key_len; + ulint true_len; + const CHARSET_INFO* cs; + int error=0; + + key_len = key_part->length; + + if (is_null) { + buff += key_len + 2; + + continue; + } + cs = field->charset(); + + lenlen = (ulint) + (((Field_varstring*) field)->length_bytes); + + data = row_mysql_read_true_varchar(&len, + (byte*) (record + + (ulint) get_field_offset(table, field)), + lenlen); + + true_len = len; + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char*) data, + (const char*) data + len, + (uint) (key_len / cs->mbmaxlen), + &error); + } + + /* In a column prefix index, we may need to truncate + the stored value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + /* The length in a key value is always stored in 2 + bytes */ + + row_mysql_store_true_var_len((byte*) buff, true_len, 2); + buff += 2; + + memcpy(buff, data, true_len); + + /* Note that we always reserve the maximum possible + length of the true VARCHAR in the key value, though + only len first bytes after the 2 length bytes contain + actual data. The rest of the space was reset to zero + in the memset() call above. */ + + buff += key_len; + + } else if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB + /* MYSQL_TYPE_GEOMETRY data is treated + as BLOB data in innodb. */ + || mysql_type == MYSQL_TYPE_GEOMETRY) { + + const CHARSET_INFO* cs; + ulint key_len; + ulint true_len; + int error=0; + ulint blob_len; + const byte* blob_data; + + ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + + key_len = key_part->length; + + if (is_null) { + buff += key_len + 2; + + continue; + } + + cs = field->charset(); + + blob_data = row_mysql_read_blob_ref(&blob_len, + (byte*) (record + + (ulint) get_field_offset(table, field)), + (ulint) field->pack_length()); + + true_len = blob_len; + + ut_a(get_field_offset(table, field) + == key_part->offset); + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (blob_len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char*) blob_data, + (const char*) blob_data + + blob_len, + (uint) (key_len / cs->mbmaxlen), + &error); + } + + /* All indexes on BLOB and TEXT are column prefix + indexes, and we may need to truncate the data to be + stored in the key value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + /* MySQL reserves 2 bytes for the length and the + storage of the number is little-endian */ + + innobase_write_to_2_little_endian( + (byte*) buff, true_len); + buff += 2; + + memcpy(buff, blob_data, true_len); + + /* Note that we always reserve the maximum possible + length of the BLOB prefix in the key value. */ + + buff += key_len; + } else { + /* Here we handle all other data types except the + true VARCHAR, BLOB and TEXT. Note that the column + value we store may be also in a column prefix + index. 
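+			In the prefix case true_len is computed below from
+			the prefix length in characters, and any unused
+			tail is padded with the charset space character
+			(0x20).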
*/ + + const CHARSET_INFO* cs = NULL; + ulint true_len; + ulint key_len; + const uchar* src_start; + int error=0; + enum_field_types real_type; + + key_len = key_part->length; + + if (is_null) { + buff += key_len; + + continue; + } + + src_start = record + key_part->offset; + real_type = field->real_type(); + true_len = key_len; + + /* Character set for the field is defined only + to fields whose type is string and real field + type is not enum or set. For these fields check + if character set is multi byte. */ + + if (real_type != MYSQL_TYPE_ENUM + && real_type != MYSQL_TYPE_SET + && ( mysql_type == MYSQL_TYPE_VAR_STRING + || mysql_type == MYSQL_TYPE_STRING)) { + + cs = field->charset(); + + /* For multi byte character sets we need to + calculate the true length of the key */ + + if (key_len > 0 && cs->mbmaxlen > 1) { + + true_len = (ulint) + cs->cset->well_formed_len(cs, + (const char*) src_start, + (const char*) src_start + + key_len, + (uint) (key_len + / cs->mbmaxlen), + &error); + } + } + + memcpy(buff, src_start, true_len); + buff += true_len; + + /* Pad the unused space with spaces. */ + + if (true_len < key_len) { + ulint pad_len = key_len - true_len; + ut_a(cs != NULL); + ut_a(!(pad_len % cs->mbminlen)); + + cs->cset->fill(cs, buff, pad_len, + 0x20 /* space */); + buff += pad_len; + } + } + } + + ut_a(buff <= buff_start + buff_len); + + DBUG_RETURN((uint)(buff - buff_start)); +} + +/**************************************************************//** +Determines if a field is needed in a prebuilt struct 'template'. +@return field to use, or NULL if the field is not needed */ +static +const Field* +build_template_needs_field( +/*=======================*/ + ibool index_contains, /*!< in: + dict_index_contains_col_or_prefix( + index, i) */ + ibool read_just_key, /*!< in: TRUE when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + ibool fetch_all_in_key, + /*!< in: true=fetch all fields in + the index */ + ibool fetch_primary_key_cols, + /*!< in: true=fetch the + primary key columns */ + dict_index_t* index, /*!< in: InnoDB index to use */ + const TABLE* table, /*!< in: MySQL table object */ + ulint i) /*!< in: field index in InnoDB table */ +{ + const Field* field = table->field[i]; + + ut_ad(index_contains == dict_index_contains_col_or_prefix(index, i)); + + if (!index_contains) { + if (read_just_key) { + /* If this is a 'key read', we do not need + columns that are not in the key */ + + return(NULL); + } + } else if (fetch_all_in_key) { + /* This field is needed in the query */ + + return(field); + } + + if (bitmap_is_set(table->read_set, static_cast<uint>(i)) + || bitmap_is_set(table->write_set, static_cast<uint>(i))) { + /* This field is needed in the query */ + + return(field); + } + + if (fetch_primary_key_cols + && dict_table_col_in_clustered_key(index->table, i)) { + /* This field is needed in the query */ + + return(field); + } + + /* This field is not needed in the query, skip it */ + + return(NULL); +} + +/**************************************************************//** +Determines if a field is needed in a prebuilt struct 'template'. 
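+The field is needed whenever the index being searched, prebuilt->index,
+contains the column or a prefix of it, no matter which index the template
+is currently being built for.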
+@return whether the field is needed for index condition pushdown */ +inline +bool +build_template_needs_field_in_icp( +/*==============================*/ + const dict_index_t* index, /*!< in: InnoDB index */ + const row_prebuilt_t* prebuilt,/*!< in: row fetch template */ + bool contains,/*!< in: whether the index contains + column i */ + ulint i) /*!< in: column number */ +{ + ut_ad(contains == dict_index_contains_col_or_prefix(index, i)); + + return(index == prebuilt->index + ? contains + : dict_index_contains_col_or_prefix(prebuilt->index, i)); +} + +/**************************************************************//** +Adds a field to a prebuilt struct 'template'. +@return the field template */ +static +mysql_row_templ_t* +build_template_field( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in/out: template */ + dict_index_t* clust_index, /*!< in: InnoDB clustered index */ + dict_index_t* index, /*!< in: InnoDB index to use */ + TABLE* table, /*!< in: MySQL table object */ + const Field* field, /*!< in: field in MySQL table */ + ulint i) /*!< in: field index in InnoDB table */ +{ + mysql_row_templ_t* templ; + const dict_col_t* col; + + ut_ad(field == table->field[i]); + ut_ad(clust_index->table == index->table); + + col = dict_table_get_nth_col(index->table, i); + + templ = prebuilt->mysql_template + prebuilt->n_template++; + UNIV_MEM_INVALID(templ, sizeof *templ); + templ->col_no = i; + templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index); + ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED); + + if (dict_index_is_clust(index)) { + templ->rec_field_no = templ->clust_rec_field_no; + } else { + templ->rec_field_no = dict_index_get_nth_col_pos(index, i); + } + + if (field->real_maybe_null()) { + templ->mysql_null_byte_offset = + field->null_offset(); + + templ->mysql_null_bit_mask = (ulint) field->null_bit; + } else { + templ->mysql_null_bit_mask = 0; + } + + templ->mysql_col_offset = (ulint) get_field_offset(table, field); + + templ->mysql_col_len = (ulint) field->pack_length(); + templ->type = col->mtype; + templ->mysql_type = (ulint) field->type(); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + templ->mysql_length_bytes = (ulint) + (((Field_varstring*) field)->length_bytes); + } + + templ->charset = dtype_get_charset_coll(col->prtype); + templ->mbminlen = dict_col_get_mbminlen(col); + templ->mbmaxlen = dict_col_get_mbmaxlen(col); + templ->is_unsigned = col->prtype & DATA_UNSIGNED; + + if (!dict_index_is_clust(index) + && templ->rec_field_no == ULINT_UNDEFINED) { + prebuilt->need_to_access_clustered = TRUE; + } + + if (prebuilt->mysql_prefix_len < templ->mysql_col_offset + + templ->mysql_col_len) { + prebuilt->mysql_prefix_len = templ->mysql_col_offset + + templ->mysql_col_len; + } + + if (templ->type == DATA_BLOB) { + prebuilt->templ_contains_blob = TRUE; + } + + return(templ); +} + +/**************************************************************//** +Builds a 'template' to the prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. 
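+Depending on the statement, the template covers the whole clustered index
+record (for example under exclusive row locks), only the columns of the
+index used for the scan, or just the columns in the MySQL read and write
+sets plus any primary key columns that must also be fetched.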
*/ +UNIV_INTERN +void +ha_innobase::build_template( +/*========================*/ + bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW, + false=ROW_MYSQL_REC_FIELDS */ +{ + dict_index_t* index; + dict_index_t* clust_index; + ulint n_fields; + ibool fetch_all_in_key = FALSE; + ibool fetch_primary_key_cols = FALSE; + ulint i; + + if (prebuilt->select_lock_type == LOCK_X) { + /* We always retrieve the whole clustered index record if we + use exclusive row level locks, for example, if the read is + done in an UPDATE statement. */ + + whole_row = true; + } else if (!whole_row) { + if (prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_ALL_COLS) { + + /* We know we must at least fetch all columns in the + key, or all columns in the table */ + + if (prebuilt->read_just_key) { + /* MySQL has instructed us that it is enough + to fetch the columns in the key; looks like + MySQL can set this flag also when there is + only a prefix of the column in the key: in + that case we retrieve the whole column from + the clustered index */ + + fetch_all_in_key = TRUE; + } else { + whole_row = true; + } + } else if (prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_PRIMARY_KEY) { + /* We must at least fetch all primary key cols. Note + that if the clustered index was internally generated + by InnoDB on the row id (no primary key was + defined), then row_search_for_mysql() will always + retrieve the row id to a special buffer in the + prebuilt struct. */ + + fetch_primary_key_cols = TRUE; + } + } + + clust_index = dict_table_get_first_index(prebuilt->table); + + index = whole_row ? clust_index : prebuilt->index; + + prebuilt->need_to_access_clustered = (index == clust_index); + + /* Either prebuilt->index should be a secondary index, or it + should be the clustered index. */ + ut_ad(dict_index_is_clust(index) == (index == clust_index)); + + /* Below we check column by column if we need to access + the clustered index. */ + + n_fields = (ulint) table->s->fields; /* number of columns */ + + if (!prebuilt->mysql_template) { + prebuilt->mysql_template = (mysql_row_templ_t*) + mem_alloc(n_fields * sizeof(mysql_row_templ_t)); + } + + prebuilt->template_type = whole_row + ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS; + prebuilt->null_bitmap_len = table->s->null_bytes; + + /* Prepare to build prebuilt->mysql_template[]. */ + prebuilt->templ_contains_blob = FALSE; + prebuilt->mysql_prefix_len = 0; + prebuilt->n_template = 0; + prebuilt->idx_cond_n_cols = 0; + + /* Note that in InnoDB, i is the column number in the table. + MySQL calls columns 'fields'. */ + + if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) { + /* Push down an index condition or an end_range check. */ + for (i = 0; i < n_fields; i++) { + const ibool index_contains + = dict_index_contains_col_or_prefix(index, i); + + /* Test if an end_range or an index condition + refers to the field. Note that "index" and + "index_contains" may refer to the clustered index. + Index condition pushdown is relative to prebuilt->index + (the index that is being looked up first). */ + + /* When join_read_always_key() invokes this + code via handler::ha_index_init() and + ha_innobase::index_init(), end_range is not + yet initialized. Because of that, we must + always check for index_contains, instead of + the subset + field->part_of_key.is_set(active_index) + which would be acceptable if end_range==NULL. 
*/ + if (build_template_needs_field_in_icp( + index, prebuilt, index_contains, i)) { + /* Needed in ICP */ + const Field* field; + mysql_row_templ_t* templ; + + if (whole_row) { + field = table->field[i]; + } else { + field = build_template_needs_field( + index_contains, + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i); + if (!field) { + continue; + } + } + + templ = build_template_field( + prebuilt, clust_index, index, + table, field, i); + prebuilt->idx_cond_n_cols++; + ut_ad(prebuilt->idx_cond_n_cols + == prebuilt->n_template); + + if (index == prebuilt->index) { + templ->icp_rec_field_no + = templ->rec_field_no; + } else { + templ->icp_rec_field_no + = dict_index_get_nth_col_pos( + prebuilt->index, i); + } + + if (dict_index_is_clust(prebuilt->index)) { + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + /* If the primary key includes + a column prefix, use it in + index condition pushdown, + because the condition is + evaluated before fetching any + off-page (externally stored) + columns. */ + if (templ->icp_rec_field_no + < prebuilt->index->n_uniq) { + /* This is a key column; + all set. */ + continue; + } + } else if (templ->icp_rec_field_no + != ULINT_UNDEFINED) { + continue; + } + + /* This is a column prefix index. + The column prefix can be used in + an end_range comparison. */ + + templ->icp_rec_field_no + = dict_index_get_nth_col_or_prefix_pos( + prebuilt->index, i, TRUE); + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + + /* Index condition pushdown can be used on + all columns of a secondary index, and on + the PRIMARY KEY columns. On the clustered + index, it must never be used on other than + PRIMARY KEY columns, because those columns + may be stored off-page, and we will not + fetch externally stored columns before + checking the index condition. */ + /* TODO: test the above with an assertion + like this. Note that index conditions are + currently pushed down as part of the + "optimizer phase" while end_range is done + as part of the execution phase. Therefore, + we were unable to use an accurate condition + for end_range in the "if" condition above, + and the following assertion would fail. + ut_ad(!dict_index_is_clust(prebuilt->index) + || templ->rec_field_no + < prebuilt->index->n_uniq); + */ + } + } + + ut_ad(prebuilt->idx_cond_n_cols > 0); + ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template); + + /* Include the fields that are not needed in index condition + pushdown. 
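+		They are appended after the first idx_cond_n_cols entries,
+		so the ICP columns always form a prefix of mysql_template[].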
*/ + for (i = 0; i < n_fields; i++) { + const ibool index_contains + = dict_index_contains_col_or_prefix(index, i); + + if (!build_template_needs_field_in_icp( + index, prebuilt, index_contains, i)) { + /* Not needed in ICP */ + const Field* field; + + if (whole_row) { + field = table->field[i]; + } else { + field = build_template_needs_field( + index_contains, + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i); + if (!field) { + continue; + } + } + + build_template_field(prebuilt, + clust_index, index, + table, field, i); + } + } + + prebuilt->idx_cond = this; + } else { + /* No index condition pushdown */ + prebuilt->idx_cond = NULL; + + for (i = 0; i < n_fields; i++) { + const Field* field; + + if (whole_row) { + field = table->field[i]; + } else { + field = build_template_needs_field( + dict_index_contains_col_or_prefix( + index, i), + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i); + if (!field) { + continue; + } + } + + build_template_field(prebuilt, clust_index, index, + table, field, i); + } + } + + if (index != clust_index && prebuilt->need_to_access_clustered) { + /* Change rec_field_no's to correspond to the clustered index + record */ + for (i = 0; i < prebuilt->n_template; i++) { + + mysql_row_templ_t* templ + = &prebuilt->mysql_template[i]; + + templ->rec_field_no = templ->clust_rec_field_no; + } + } +} + +/********************************************************************//** +This special handling is really to overcome the limitations of MySQL's +binlogging. We need to eliminate the non-determinism that will arise in +INSERT ... SELECT type of statements, since MySQL binlog only stores the +min value of the autoinc interval. Once that is fixed we can get rid of +the special lock handling. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_lock_autoinc(void) +/*====================================*/ +{ + dberr_t error = DB_SUCCESS; + + ut_ad(!srv_read_only_mode); + + switch (innobase_autoinc_lock_mode) { + case AUTOINC_NO_LOCKING: + /* Acquire only the AUTOINC mutex. */ + dict_table_autoinc_lock(prebuilt->table); + break; + + case AUTOINC_NEW_STYLE_LOCKING: + /* For simple (single/multi) row INSERTs, we fallback to the + old style only if another transaction has already acquired + the AUTOINC lock on behalf of a LOAD FILE or INSERT ... SELECT + etc. type of statement. */ + if (thd_sql_command(user_thd) == SQLCOM_INSERT + || thd_sql_command(user_thd) == SQLCOM_REPLACE) { + dict_table_t* ib_table = prebuilt->table; + + /* Acquire the AUTOINC mutex. */ + dict_table_autoinc_lock(ib_table); + + /* We need to check that another transaction isn't + already holding the AUTOINC lock on the table. */ + if (ib_table->n_waiting_or_granted_auto_inc_locks) { + /* Release the mutex to avoid deadlocks. */ + dict_table_autoinc_unlock(ib_table); + } else { + break; + } + } + /* Fall through to old style locking. */ + + case AUTOINC_OLD_STYLE_LOCKING: + error = row_lock_table_autoinc_for_mysql(prebuilt); + + if (error == DB_SUCCESS) { + + /* Acquire the AUTOINC mutex. */ + dict_table_autoinc_lock(prebuilt->table); + } + break; + + default: + ut_error; + } + + return(error); +} + +/********************************************************************//** +Reset the autoinc value in the table. 
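+Unlike innobase_set_max_autoinc(), the value is stored unconditionally,
+not only when it exceeds the current counter.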
+@return DB_SUCCESS if all went well else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_reset_autoinc( +/*================================*/ + ulonglong autoinc) /*!< in: value to store */ +{ + dberr_t error; + + error = innobase_lock_autoinc(); + + if (error == DB_SUCCESS) { + + dict_table_autoinc_initialize(prebuilt->table, autoinc); + + dict_table_autoinc_unlock(prebuilt->table); + } + + return(error); +} + +/********************************************************************//** +Store the autoinc value in the table. The autoinc value is only set if +it's greater than the existing autoinc value in the table. +@return DB_SUCCESS if all went well else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_set_max_autoinc( +/*==================================*/ + ulonglong auto_inc) /*!< in: value to store */ +{ + dberr_t error; + + error = innobase_lock_autoinc(); + + if (error == DB_SUCCESS) { + + dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc); + + dict_table_autoinc_unlock(prebuilt->table); + } + + return(error); +} + +/********************************************************************//** +Stores a row in an InnoDB database, to the table specified in this +handle. +@return error code */ +UNIV_INTERN +int +ha_innobase::write_row( +/*===================*/ + uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; + int error_result= 0; + ibool auto_inc_used= FALSE; + ulint sql_command; + trx_t* trx = thd_to_trx(user_thd); + + DBUG_ENTER("ha_innobase::write_row"); + + if (srv_read_only_mode) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (prebuilt->trx != trx) { + sql_print_error("The transaction object for the table handle " + "is at %p, but for the current thread it is at " + "%p", + (const void*) prebuilt->trx, (const void*) trx); + + fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr); + ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200); + fputs("\n" + "InnoDB: Dump of 200 bytes around ha_data: ", + stderr); + ut_print_buf(stderr, ((const byte*) trx) - 100, 200); + putc('\n', stderr); + ut_error; + } else if (!trx_is_started(trx)) { + ++trx->will_lock; + } + + ha_statistic_increment(&SSV::ha_write_count); + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + sql_command = thd_sql_command(user_thd); + + if ((sql_command == SQLCOM_ALTER_TABLE + || sql_command == SQLCOM_OPTIMIZE + || sql_command == SQLCOM_CREATE_INDEX + || sql_command == SQLCOM_DROP_INDEX) + && num_write_row >= 10000) { + /* ALTER TABLE is COMMITted at every 10000 copied rows. + The IX table lock for the original table has to be re-issued. + As this method will be called on a temporary table where the + contents of the original table is being copied to, it is + a bit tricky to determine the source table. The cursor + position in the source table need not be adjusted after the + intermediate COMMIT, since writes by other transactions are + being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */ + + dict_table_t* src_table; + enum lock_mode mode; + + num_write_row = 0; + + /* Commit the transaction. This will release the table + locks, so they have to be acquired again. */ + + /* Altering an InnoDB table */ + /* Get the source table. 
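+		lock_get_src_table() deduces it from the table locks this
+		transaction already holds, and also reports the lock mode
+		that must be re-acquired after the intermediate COMMIT.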
*/ + src_table = lock_get_src_table( + prebuilt->trx, prebuilt->table, &mode); + if (!src_table) { +no_commit: + /* Unknown situation: do not commit */ + /* + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ALTER TABLE is holding lock" + " on %lu tables!\n", + prebuilt->trx->mysql_n_tables_locked); + */ + ; + } else if (src_table == prebuilt->table) { + /* Source table is not in InnoDB format: + no need to re-acquire locks on it. */ + + /* Altering to InnoDB format */ + innobase_commit(ht, user_thd, 1); + /* Note that this transaction is still active. */ + trx_register_for_2pc(prebuilt->trx); + /* We will need an IX lock on the destination table. */ + prebuilt->sql_stat_start = TRUE; + } else { + /* Ensure that there are no other table locks than + LOCK_IX and LOCK_AUTO_INC on the destination table. */ + + if (!lock_is_table_exclusive(prebuilt->table, + prebuilt->trx)) { + goto no_commit; + } + + /* Commit the transaction. This will release the table + locks, so they have to be acquired again. */ + innobase_commit(ht, user_thd, 1); + /* Note that this transaction is still active. */ + trx_register_for_2pc(prebuilt->trx); + /* Re-acquire the table lock on the source table. */ + row_lock_table_for_mysql(prebuilt, src_table, mode); + /* We will need an IX lock on the destination table. */ + prebuilt->sql_stat_start = TRUE; + } + } + + num_write_row++; + + /* This is the case where the table has an auto-increment column */ + if (table->next_number_field && record == table->record[0]) { + + /* Reset the error code before calling + innobase_get_auto_increment(). */ + prebuilt->autoinc_error = DB_SUCCESS; + + if ((error_result = update_auto_increment())) { + /* We don't want to mask autoinc overflow errors. */ + + /* Handle the case where the AUTOINC sub-system + failed during initialization. */ + if (prebuilt->autoinc_error == DB_UNSUPPORTED) { + error_result = ER_AUTOINC_READ_FAILED; + /* Set the error message to report too. */ + my_error(ER_AUTOINC_READ_FAILED, MYF(0)); + goto func_exit; + } else if (prebuilt->autoinc_error != DB_SUCCESS) { + error = prebuilt->autoinc_error; + goto report_error; + } + + /* MySQL errors are passed straight back. */ + goto func_exit; + } + + auto_inc_used = TRUE; + } + + if (prebuilt->mysql_template == NULL + || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) { + + /* Build the template used in converting quickly between + the two database formats */ + + build_template(true); + } + + innobase_srv_conc_enter_innodb(prebuilt->trx); + + error = row_insert_for_mysql((byte*) record, prebuilt); + DEBUG_SYNC(user_thd, "ib_after_row_insert"); + +#ifdef EXTENDED_FOR_USERSTAT + if (UNIV_LIKELY(error == DB_SUCCESS && !trx->fake_changes)) { + rows_changed++; + } +#endif + + /* Handle duplicate key errors */ + if (auto_inc_used) { + ulonglong auto_inc; + ulonglong col_max_value; + + /* Note the number of rows processed for this statement, used + by get_auto_increment() to determine the number of AUTO-INC + values to reserve. This is only useful for a mult-value INSERT + and is a statement level counter.*/ + if (trx->n_autoinc_rows > 0) { + --trx->n_autoinc_rows; + } + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. 
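+		Values above this limit are never written back; see the
+		auto_inc <= col_max_value check below.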
*/ + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); + + /* Get the value that MySQL attempted to store in the table.*/ + auto_inc = table->next_number_field->val_int(); + + switch (error) { + case DB_DUPLICATE_KEY: + + /* A REPLACE command and LOAD DATA INFILE REPLACE + handle a duplicate key error themselves, but we + must update the autoinc counter if we are performing + those statements. */ + + switch (sql_command) { + case SQLCOM_LOAD: + if (trx->duplicates) { + + goto set_max_autoinc; + } + break; + + case SQLCOM_REPLACE: + case SQLCOM_INSERT_SELECT: + case SQLCOM_REPLACE_SELECT: + goto set_max_autoinc; + + default: + break; + } + + break; + + case DB_SUCCESS: + /* If the actual value inserted is greater than + the upper limit of the interval, then we try and + update the table upper limit. Note: last_value + will be 0 if get_auto_increment() was not called.*/ + + if (auto_inc >= prebuilt->autoinc_last_value) { +set_max_autoinc: + /* This should filter out the negative + values set explicitly by the user. */ + if (auto_inc <= col_max_value) { + ut_a(prebuilt->autoinc_increment > 0); + + ulonglong offset; + ulonglong increment; + dberr_t err; + + offset = prebuilt->autoinc_offset; + increment = prebuilt->autoinc_increment; + + auto_inc = innobase_next_autoinc( + auto_inc, + 1, increment, offset, + col_max_value); + + err = innobase_set_max_autoinc( + auto_inc); + + if (err != DB_SUCCESS) { + error = err; + } + } + } + break; + default: + break; + } + } + + innobase_srv_conc_exit_innodb(prebuilt->trx); + +report_error: + if (error == DB_TABLESPACE_DELETED) { + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + } + + error_result = convert_error_code_to_mysql(error, + prebuilt->table->flags, + user_thd); + + if (error_result == HA_FTS_INVALID_DOCID) { + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } + +func_exit: + innobase_active_small(); + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + DBUG_RETURN(error_result); +} + +/**********************************************************************//** +Checks which fields have changed in a row and stores information +of them to an update vector. 
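+Only columns whose old and new images differ, in length or by memcmp(),
+are added to the vector, which row_update_for_mysql() later applies.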
+@return DB_SUCCESS or error code */ +static +dberr_t +calc_row_difference( +/*================*/ + upd_t* uvect, /*!< in/out: update vector */ + uchar* old_row, /*!< in: old row in MySQL format */ + uchar* new_row, /*!< in: new row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + uchar* upd_buff, /*!< in: buffer to use */ + ulint buff_len, /*!< in: buffer length */ + row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */ + THD* thd) /*!< in: user thread */ +{ + uchar* original_upd_buff = upd_buff; + Field* field; + enum_field_types field_mysql_type; + uint n_fields; + ulint o_len; + ulint n_len; + ulint col_pack_len; + const byte* new_mysql_row_col; + const byte* o_ptr; + const byte* n_ptr; + byte* buf; + upd_field_t* ufield; + ulint col_type; + ulint n_changed = 0; + dfield_t dfield; + dict_index_t* clust_index; + uint i; + ibool changes_fts_column = FALSE; + ibool changes_fts_doc_col = FALSE; + trx_t* trx = thd_to_trx(thd); + doc_id_t doc_id = FTS_NULL_DOC_ID; + + ut_ad(!srv_read_only_mode); + + n_fields = table->s->fields; + clust_index = dict_table_get_first_index(prebuilt->table); + + /* We use upd_buff to convert changed fields */ + buf = (byte*) upd_buff; + + for (i = 0; i < n_fields; i++) { + field = table->field[i]; + + o_ptr = (const byte*) old_row + get_field_offset(table, field); + n_ptr = (const byte*) new_row + get_field_offset(table, field); + + /* Use new_mysql_row_col and col_pack_len save the values */ + + new_mysql_row_col = n_ptr; + col_pack_len = field->pack_length(); + + o_len = col_pack_len; + n_len = col_pack_len; + + /* We use o_ptr and n_ptr to dig up the actual data for + comparison. */ + + field_mysql_type = field->type(); + + col_type = prebuilt->table->cols[i].mtype; + + switch (col_type) { + + case DATA_BLOB: + o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len); + n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + o_ptr = row_mysql_read_true_varchar( + &o_len, o_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + + n_ptr = row_mysql_read_true_varchar( + &n_len, n_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + } + + break; + default: + ; + } + + if (field_mysql_type == MYSQL_TYPE_LONGLONG + && prebuilt->table->fts + && innobase_strcasecmp( + field->field_name, FTS_DOC_ID_COL_NAME) == 0) { + doc_id = (doc_id_t) mach_read_from_n_little_endian( + n_ptr, 8); + if (doc_id == 0) { + return(DB_FTS_INVALID_DOCID); + } + } + + + if (field->real_maybe_null()) { + if (field->is_null_in_record(old_row)) { + o_len = UNIV_SQL_NULL; + } + + if (field->is_null_in_record(new_row)) { + n_len = UNIV_SQL_NULL; + } + } + + if (o_len != n_len || (o_len != UNIV_SQL_NULL && + 0 != memcmp(o_ptr, n_ptr, o_len))) { + /* The field has changed */ + + ufield = uvect->fields + n_changed; + UNIV_MEM_INVALID(ufield, sizeof *ufield); + + /* Let us use a dummy dfield to make the conversion + from the MySQL column format to the InnoDB format */ + + if (n_len != UNIV_SQL_NULL) { + dict_col_copy_type(prebuilt->table->cols + i, + dfield_get_type(&dfield)); + + buf = row_mysql_store_col_in_innobase_format( + &dfield, + (byte*) buf, + TRUE, + new_mysql_row_col, + col_pack_len, + dict_table_is_comp(prebuilt->table)); + dfield_copy(&ufield->new_val, &dfield); + } else { + 
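+				/* The new value is SQL NULL; no conversion
+				to the InnoDB format is needed, just mark
+				the dfield as NULL. */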
dfield_set_null(&ufield->new_val); + } + + ufield->exp = NULL; + ufield->orig_len = 0; + ufield->field_no = dict_col_get_clust_pos( + &prebuilt->table->cols[i], clust_index); + n_changed++; + + /* If an FTS indexed column was changed by this + UPDATE then we need to inform the FTS sub-system. + + NOTE: Currently we re-index all FTS indexed columns + even if only a subset of the FTS indexed columns + have been updated. That is the reason we are + checking only once here. Later we will need to + note which columns have been updated and do + selective processing. */ + if (prebuilt->table->fts != NULL) { + ulint offset; + dict_table_t* innodb_table; + + innodb_table = prebuilt->table; + + if (!changes_fts_column) { + offset = row_upd_changes_fts_column( + innodb_table, ufield); + + if (offset != ULINT_UNDEFINED) { + changes_fts_column = TRUE; + } + } + + if (!changes_fts_doc_col) { + changes_fts_doc_col = + row_upd_changes_doc_id( + innodb_table, ufield); + } + } + } + } + + /* If the update changes a column with an FTS index on it, we + then add an update column node with a new document id to the + other changes. We piggy back our changes on the normal UPDATE + to reduce processing and IO overhead. */ + if (!prebuilt->table->fts) { + trx->fts_next_doc_id = 0; + } else if (changes_fts_column || changes_fts_doc_col) { + dict_table_t* innodb_table = prebuilt->table; + + ufield = uvect->fields + n_changed; + + if (!DICT_TF2_FLAG_IS_SET( + innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) { + + /* If Doc ID is managed by user, and if any + FTS indexed column has been updated, its corresponding + Doc ID must also be updated. Otherwise, return + error */ + if (changes_fts_column && !changes_fts_doc_col) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: A new Doc ID" + " must be supplied while updating" + " FTS indexed columns.\n"); + return(DB_FTS_INVALID_DOCID); + } + + /* Doc ID must monotonically increase */ + ut_ad(innodb_table->fts->cache); + if (doc_id < prebuilt->table->fts->cache->next_doc_id) { + fprintf(stderr, + "InnoDB: FTS Doc ID must be larger than" + " " IB_ID_FMT " for table", + innodb_table->fts->cache->next_doc_id + - 1); + ut_print_name(stderr, trx, + TRUE, innodb_table->name); + putc('\n', stderr); + + return(DB_FTS_INVALID_DOCID); + } else if ((doc_id + - prebuilt->table->fts->cache->next_doc_id) + >= FTS_DOC_ID_MAX_STEP) { + fprintf(stderr, + "InnoDB: Doc ID " UINT64PF " is too" + " big. Its difference with largest" + " Doc ID used " UINT64PF " cannot" + " exceed or equal to %d\n", + doc_id, + prebuilt->table->fts->cache->next_doc_id - 1, + FTS_DOC_ID_MAX_STEP); + } + + + trx->fts_next_doc_id = doc_id; + } else { + /* If the Doc ID is a hidden column, it can't be + changed by user */ + ut_ad(!changes_fts_doc_col); + + /* Doc ID column is hidden, a new Doc ID will be + generated by following fts_update_doc_id() call */ + trx->fts_next_doc_id = 0; + } + + fts_update_doc_id( + innodb_table, ufield, &trx->fts_next_doc_id); + + ++n_changed; + } else { + /* We have a Doc ID column, but none of FTS indexed + columns are touched, nor the Doc ID column, so set + fts_next_doc_id to UINT64_UNDEFINED, which means do not + update the Doc ID column */ + trx->fts_next_doc_id = UINT64_UNDEFINED; + } + + uvect->n_fields = n_changed; + uvect->info_bits = 0; + + ut_a(buf <= (byte*) original_upd_buff + buff_len); + + return(DB_SUCCESS); +} + +/**********************************************************************//** +Updates a row given as a parameter to a new value. 
Note that we are given +whole rows, not just the fields which are updated: this incurs some +overhead for CPU when we check which fields are actually updated. +TODO: currently InnoDB does not prevent the 'Halloween problem': +in a searched update a single row can get updated several times +if its index columns are updated! +@return error number or 0 */ +UNIV_INTERN +int +ha_innobase::update_row( +/*====================*/ + const uchar* old_row, /*!< in: old row in MySQL format */ + uchar* new_row) /*!< in: new row in MySQL format */ +{ + upd_t* uvect; + dberr_t error; + trx_t* trx = thd_to_trx(user_thd); + + DBUG_ENTER("ha_innobase::update_row"); + + ut_a(prebuilt->trx == trx); + + if (srv_read_only_mode) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + ++trx->will_lock; + } + + if (upd_buf == NULL) { + ut_ad(upd_buf_size == 0); + + /* Create a buffer for packing the fields of a record. Why + table->reclength did not work here? Obviously, because char + fields when packed actually became 1 byte longer, when we also + stored the string length as the first byte. */ + + upd_buf_size = table->s->reclength + table->s->max_key_length + + MAX_REF_PARTS * 3; + upd_buf = (uchar*) my_malloc(upd_buf_size, MYF(MY_WME)); + if (upd_buf == NULL) { + upd_buf_size = 0; + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + } + + ha_statistic_increment(&SSV::ha_update_count); + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + if (prebuilt->upd_node) { + uvect = prebuilt->upd_node->update; + } else { + uvect = row_get_prebuilt_update_vector(prebuilt); + } + + /* Build an update vector from the modified fields in the rows + (uses upd_buf of the handle) */ + + error = calc_row_difference(uvect, (uchar*) old_row, new_row, table, + upd_buf, upd_buf_size, prebuilt, user_thd); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* This is not a delete */ + prebuilt->upd_node->is_delete = FALSE; + + ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + + innobase_srv_conc_enter_innodb(trx); + + error = row_update_for_mysql((byte*) old_row, prebuilt); + + /* We need to do some special AUTOINC handling for the following case: + + INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ... + + We need to use the AUTOINC counter that was actually used by + MySQL in the UPDATE statement, which can be different from the + value used in the INSERT statement.*/ + + if (error == DB_SUCCESS + && table->next_number_field + && new_row == table->record[0] + && thd_sql_command(user_thd) == SQLCOM_INSERT + && trx->duplicates) { + + ulonglong auto_inc; + ulonglong col_max_value; + + auto_inc = table->next_number_field->val_int(); + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. 
*/ + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); + + if (auto_inc <= col_max_value && auto_inc != 0) { + + ulonglong offset; + ulonglong increment; + + offset = prebuilt->autoinc_offset; + increment = prebuilt->autoinc_increment; + + auto_inc = innobase_next_autoinc( + auto_inc, 1, increment, offset, col_max_value); + + error = innobase_set_max_autoinc(auto_inc); + } + } + +#ifdef EXTENDED_FOR_USERSTAT + if (UNIV_LIKELY(error == DB_SUCCESS && !trx->fake_changes)) { + rows_changed++; + } +#endif + + innobase_srv_conc_exit_innodb(trx); + +func_exit: + int err = convert_error_code_to_mysql(error, + prebuilt->table->flags, user_thd); + + /* If success and no columns were updated. */ + if (err == 0 && uvect->n_fields == 0) { + + /* This is the same as success, but instructs + MySQL that the row is not really updated and it + should not increase the count of updated rows. + This is fix for http://bugs.mysql.com/29157 */ + err = HA_ERR_RECORD_IS_THE_SAME; + } else if (err == HA_FTS_INVALID_DOCID) { + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } + + /* Tell InnoDB server that there might be work for + utility threads: */ + + innobase_active_small(); + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + DBUG_RETURN(err); +} + +/**********************************************************************//** +Deletes a row given as the parameter. +@return error number or 0 */ +UNIV_INTERN +int +ha_innobase::delete_row( +/*====================*/ + const uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; + trx_t* trx = thd_to_trx(user_thd); + + DBUG_ENTER("ha_innobase::delete_row"); + + ut_a(prebuilt->trx == trx); + + if (srv_read_only_mode) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + ++trx->will_lock; + } + + ha_statistic_increment(&SSV::ha_delete_count); + + if (UNIV_UNLIKELY(share && share->ib_table + && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + if (!prebuilt->upd_node) { + row_get_prebuilt_update_vector(prebuilt); + } + + /* This is a delete */ + + prebuilt->upd_node->is_delete = TRUE; + + innobase_srv_conc_enter_innodb(trx); + + error = row_update_for_mysql((byte*) record, prebuilt); + +#ifdef EXTENDED_FOR_USERSTAT + if (UNIV_LIKELY(error == DB_SUCCESS && !trx->fake_changes)) { + rows_changed++; + } +#endif + + innobase_srv_conc_exit_innodb(trx); + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + innobase_active_small(); + + if (UNIV_UNLIKELY(share && share->ib_table + && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + DBUG_RETURN(convert_error_code_to_mysql( + error, prebuilt->table->flags, user_thd)); +} + +/**********************************************************************//** +Removes a new lock set on a row, if it was not read optimistically. This can +be called after a row has been read in the processing of an UPDATE or a DELETE +query, if the option innodb_locks_unsafe_for_binlog is set. */ +UNIV_INTERN +void +ha_innobase::unlock_row(void) +/*=========================*/ +{ + DBUG_ENTER("ha_innobase::unlock_row"); + + /* Consistent read does not take any locks, thus there is + nothing to unlock. */ + + if (prebuilt->select_lock_type == LOCK_NONE) { + DBUG_VOID_RETURN; + } + + /* Ideally, this assert must be in the beginning of the function. 
+ But there are some calls to this function from the SQL layer when the + transaction is in state TRX_STATE_NOT_STARTED. The check on + prebuilt->select_lock_type above gets around this issue. */ + ut_ad(trx_state_eq(prebuilt->trx, TRX_STATE_ACTIVE)); + + switch (prebuilt->row_read_type) { + case ROW_READ_WITH_LOCKS: + if (!srv_locks_unsafe_for_binlog + && prebuilt->trx->isolation_level + > TRX_ISO_READ_COMMITTED) { + break; + } + /* fall through */ + case ROW_READ_TRY_SEMI_CONSISTENT: + row_unlock_for_mysql(prebuilt, FALSE); + break; + case ROW_READ_DID_SEMI_CONSISTENT: + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + break; + } + + DBUG_VOID_RETURN; +} + +/* See handler.h and row0mysql.h for docs on this function. */ +UNIV_INTERN +bool +ha_innobase::was_semi_consistent_read(void) +/*=======================================*/ +{ + return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT); +} + +/* See handler.h and row0mysql.h for docs on this function. */ +UNIV_INTERN +void +ha_innobase::try_semi_consistent_read(bool yes) +/*===========================================*/ +{ + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + /* Row read type is set to semi consistent read if this was + requested by the MySQL and either innodb_locks_unsafe_for_binlog + option is used or this session is using READ COMMITTED isolation + level. */ + + if (yes + && (srv_locks_unsafe_for_binlog + || prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_WITH_LOCKS; + } +} + +/******************************************************************//** +Initializes a handle to use an index. +@return 0 or error number */ +UNIV_INTERN +int +ha_innobase::index_init( +/*====================*/ + uint keynr, /*!< in: key (index) number */ + bool sorted) /*!< in: 1 if result MUST be sorted according to index */ +{ + DBUG_ENTER("index_init"); + + DBUG_RETURN(change_active_index(keynr)); +} + +/******************************************************************//** +Currently does nothing. +@return 0 */ +UNIV_INTERN +int +ha_innobase::index_end(void) +/*========================*/ +{ + int error = 0; + DBUG_ENTER("index_end"); + active_index = MAX_KEY; + in_range_check_pushed_down = FALSE; + ds_mrr.dsmrr_close(); + DBUG_RETURN(error); +} + +/*********************************************************************//** +Converts a search mode flag understood by MySQL to a flag understood +by InnoDB. */ +static inline +ulint +convert_search_mode_to_innobase( +/*============================*/ + enum ha_rkey_function find_flag) +{ + switch (find_flag) { + case HA_READ_KEY_EXACT: + /* this does not require the index to be UNIQUE */ + return(PAGE_CUR_GE); + case HA_READ_KEY_OR_NEXT: + return(PAGE_CUR_GE); + case HA_READ_KEY_OR_PREV: + return(PAGE_CUR_LE); + case HA_READ_AFTER_KEY: + return(PAGE_CUR_G); + case HA_READ_BEFORE_KEY: + return(PAGE_CUR_L); + case HA_READ_PREFIX: + return(PAGE_CUR_GE); + case HA_READ_PREFIX_LAST: + return(PAGE_CUR_LE); + case HA_READ_PREFIX_LAST_OR_PREV: + return(PAGE_CUR_LE); + /* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always + pass a complete-field prefix of a key value as the search + tuple. I.e., it is not allowed that the last field would + just contain n first bytes of the full field value. + MySQL uses a 'padding' trick to convert LIKE 'abc%' + type queries so that it can use as a search tuple + a complete-field-prefix of a key value. 
Thus, the InnoDB + search mode PAGE_CUR_LE_OR_EXTENDS is never used. + TODO: when/if MySQL starts to use also partial-field + prefixes, we have to deal with stripping of spaces + and comparison of non-latin1 char type fields in + innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to + work correctly. */ + case HA_READ_MBR_CONTAIN: + case HA_READ_MBR_INTERSECT: + case HA_READ_MBR_WITHIN: + case HA_READ_MBR_DISJOINT: + case HA_READ_MBR_EQUAL: + return(PAGE_CUR_UNSUPP); + /* do not use "default:" in order to produce a gcc warning: + enumeration value '...' not handled in switch + (if -Wswitch or -Wall is used) */ + } + + my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality"); + + return(PAGE_CUR_UNSUPP); +} + +/* + BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED + --------------------------------------------------- +The following does not cover all the details, but explains how we determine +the start of a new SQL statement, and what is associated with it. + +For each table in the database the MySQL interpreter may have several +table handle instances in use, also in a single SQL query. For each table +handle instance there is an InnoDB 'prebuilt' struct which contains most +of the InnoDB data associated with this table handle instance. + + A) if the user has not explicitly set any MySQL table level locks: + + 1) MySQL calls ::external_lock to set an 'intention' table level lock on +the table of the handle instance. There we set +prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set +true if we are taking this table handle instance to use in a new SQL +statement issued by the user. We also increment trx->n_mysql_tables_in_use. + + 2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search +instructions to prebuilt->template of the table handle instance in +::index_read. The template is used to save CPU time in large joins. + + 3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we +allocate a new consistent read view for the trx if it does not yet have one, +or in the case of a locking read, set an InnoDB 'intention' table level +lock on the table. + + 4) We do the SELECT. MySQL may repeatedly call ::index_read for the +same table handle instance, if it is a join. + + 5) When the SELECT ends, MySQL removes its intention table level locks +in ::external_lock. When trx->n_mysql_tables_in_use drops to zero, + (a) we execute a COMMIT there if the autocommit is on, + (b) we also release possible 'SQL statement level resources' InnoDB may +have for this SQL statement. The MySQL interpreter does NOT execute +autocommit for pure read transactions, though it should. That is why the +table handler in that case has to execute the COMMIT in ::external_lock. + + B) If the user has explicitly set MySQL table level locks, then MySQL +does NOT call ::external_lock at the start of the statement. To determine +when we are at the start of a new SQL statement we at the start of +::index_read also compare the query id to the latest query id where the +table handle instance was used. If it has changed, we know we are at the +start of a new SQL statement. Since the query id can theoretically +overwrap, we use this test only as a secondary way of determining the +start of a new SQL statement. */ + + +/**********************************************************************//** +Positions an index cursor to the index specified in the handle. Fetches the +row if any. 
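+For example, a point query like "SELECT * FROM t WHERE c1 = 5" that
+uses an index on c1 typically arrives here with find_flag ==
+HA_READ_KEY_EXACT and key_ptr pointing to the value 5 in MySQL key
+format.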
+@return 0, HA_ERR_KEY_NOT_FOUND, or error number */ +UNIV_INTERN +int +ha_innobase::index_read( +/*====================*/ + uchar* buf, /*!< in/out: buffer for the returned + row */ + const uchar* key_ptr, /*!< in: key value; if this is NULL + we position the cursor at the + start or end of index; this can + also contain an InnoDB row id, in + which case key_len is the InnoDB + row id length; the key value can + also be a prefix of a full key value, + and the last column can be a prefix + of a full column */ + uint key_len,/*!< in: key value length */ + enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */ +{ + ulint mode; + dict_index_t* index; + ulint match_mode = 0; + int error; + dberr_t ret; + + DBUG_ENTER("index_read"); + DEBUG_SYNC_C("ha_innobase_index_read_begin"); + + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT); + + ha_statistic_increment(&SSV::ha_read_key_count); + + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + index = prebuilt->index; + + if (UNIV_UNLIKELY(index == NULL) || dict_index_is_corrupted(index)) { + prebuilt->index_usable = FALSE; + DBUG_RETURN(HA_ERR_CRASHED); + } + if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + DBUG_RETURN(dict_index_is_corrupted(index) + ? HA_ERR_INDEX_CORRUPT + : HA_ERR_TABLE_DEF_CHANGED); + } + + if (index->type & DICT_FTS) { + DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + } + + /* Note that if the index for which the search template is built is not + necessarily prebuilt->index, but can also be the clustered index */ + + if (prebuilt->sql_stat_start) { + build_template(false); + } + + if (key_ptr) { + /* Convert the search key value to InnoDB format into + prebuilt->search_tuple */ + + row_sel_convert_mysql_key_to_innobase( + prebuilt->search_tuple, + prebuilt->srch_key_val1, + prebuilt->srch_key_val_len, + index, + (byte*) key_ptr, + (ulint) key_len, + prebuilt->trx); + DBUG_ASSERT(prebuilt->search_tuple->n_fields > 0); + } else { + /* We position the cursor to the last or the first entry + in the index */ + + dtuple_set_n_fields(prebuilt->search_tuple, 0); + } + + mode = convert_search_mode_to_innobase(find_flag); + + match_mode = 0; + + if (find_flag == HA_READ_KEY_EXACT) { + + match_mode = ROW_SEL_EXACT; + + } else if (find_flag == HA_READ_PREFIX + || find_flag == HA_READ_PREFIX_LAST) { + + match_mode = ROW_SEL_EXACT_PREFIX; + } + + last_match_mode = (uint) match_mode; + + if (mode != PAGE_CUR_UNSUPP) { + + innobase_srv_conc_enter_innodb(prebuilt->trx); + + ret = row_search_for_mysql((byte*) buf, mode, prebuilt, + match_mode, 0); + + innobase_srv_conc_exit_innodb(prebuilt->trx); + } else { + + ret = DB_UNSUPPORTED; + } + + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1); +#ifdef EXTENDED_FOR_USERSTAT + rows_read++; + if (active_index < MAX_KEY) + index_rows_read[active_index]++; +#endif + break; + case DB_RECORD_NOT_FOUND: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + case DB_END_OF_INDEX: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + 
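+ /* The tablespace was discarded, e.g. by ALTER TABLE
+ ... DISCARD TABLESPACE; report the table as missing
+ to the SQL layer. */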
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING, MYF(0),
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ ret, prebuilt->table->flags, user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/*******************************************************************//**
+The following function works like index_read, but it finds the last
+row with the current key value or prefix.
+@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */
+UNIV_INTERN
+int
+ha_innobase::index_read_last(
+/*=========================*/
+ uchar* buf, /*!< out: fetched row */
+ const uchar* key_ptr,/*!< in: key value, or a prefix of a full
+ key value */
+ uint key_len)/*!< in: length of the key val or prefix
+ in bytes */
+{
+ return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
+}
+
+/********************************************************************//**
+Get the index for a handle. Does not change active index.
+@return NULL or index instance. */
+UNIV_INTERN
+dict_index_t*
+ha_innobase::innobase_get_index(
+/*============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always
+ clustered index, even if it was internally
+ generated by InnoDB */
+{
+ KEY* key = 0;
+ dict_index_t* index = 0;
+
+ DBUG_ENTER("innobase_get_index");
+
+ if (keynr != MAX_KEY && table->s->keys > 0) {
+ key = table->key_info + keynr;
+
+ index = innobase_index_lookup(share, keynr);
+
+ if (index) {
+ ut_a(ut_strcmp(index->name, key->name) == 0);
+ } else {
+ /* Can't find index with keynr in the translation
+ table. Only print message if the index translation
+ table exists */
+ if (share->idx_trans_tbl.index_mapping) {
+ sql_print_warning("InnoDB could not find "
+ "index %s key no %u for "
+ "table %s through its "
+ "index translation table",
+ key ? key->name : "NULL",
+ keynr,
+ prebuilt->table->name);
+ }
+
+ index = dict_table_get_index_on_name(prebuilt->table,
+ key->name);
+ }
+ } else {
+ index = dict_table_get_first_index(prebuilt->table);
+ }
+
+ if (!index) {
+ sql_print_error(
+ "InnoDB could not find key n:o %u with name %s "
+ "from dict cache for table %s",
+ keynr, key ? key->name : "NULL",
+ prebuilt->table->name);
+ }
+
+ DBUG_RETURN(index);
+}
+
+/********************************************************************//**
+Changes the active index of a handle.
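+Called from index_init() and from rnd_init(), which passes MAX_KEY
+when the clustered index was generated internally from the row id.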
+@return 0 or error code */ +UNIV_INTERN +int +ha_innobase::change_active_index( +/*=============================*/ + uint keynr) /*!< in: use this index; MAX_KEY means always clustered + index, even if it was internally generated by + InnoDB */ +{ + DBUG_ENTER("change_active_index"); + + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + ut_ad(user_thd == ha_thd()); + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + + active_index = keynr; + + prebuilt->index = innobase_get_index(keynr); + + if (UNIV_UNLIKELY(!prebuilt->index)) { + sql_print_warning("InnoDB: change_active_index(%u) failed", + keynr); + prebuilt->index_usable = FALSE; + DBUG_RETURN(1); + } + + prebuilt->index_usable = row_merge_is_index_usable(prebuilt->trx, + prebuilt->index); + + if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + if (dict_index_is_corrupted(prebuilt->index)) { + char index_name[MAX_FULL_NAME_LEN + 1]; + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof index_name, + prebuilt->index->name, TRUE); + + innobase_format_name( + table_name, sizeof table_name, + prebuilt->index->table->name, FALSE); + + push_warning_printf( + user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_INDEX_CORRUPT, + "InnoDB: Index %s for table %s is" + " marked as corrupted", + index_name, table_name); + DBUG_RETURN(HA_ERR_INDEX_CORRUPT); + } else { + push_warning_printf( + user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_TABLE_DEF_CHANGED, + "InnoDB: insufficient history for index %u", + keynr); + } + + /* The caller seems to ignore this. Thus, we must check + this again in row_search_for_mysql(). */ + DBUG_RETURN(HA_ERR_TABLE_DEF_CHANGED); + } + + ut_a(prebuilt->search_tuple != 0); + + dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields); + + dict_index_copy_types(prebuilt->search_tuple, prebuilt->index, + prebuilt->index->n_fields); + + /* MySQL changes the active index for a handle also during some + queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX() + and then calculates the sum. Previously we played safe and used + the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary + copying. Starting from MySQL-4.1 we use a more efficient flag here. */ + + build_template(false); + + DBUG_RETURN(0); +} + +/**********************************************************************//** +Positions an index cursor to the index specified in keynr. Fetches the +row if any. +??? This is only used to read whole keys ??? +@return error number or 0 */ +UNIV_INTERN +int +ha_innobase::index_read_idx( +/*========================*/ + uchar* buf, /*!< in/out: buffer for the returned + row */ + uint keynr, /*!< in: use this index */ + const uchar* key, /*!< in: key value; if this is NULL + we position the cursor at the + start or end of index */ + uint key_len, /*!< in: key value length */ + enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */ +{ + if (change_active_index(keynr)) { + + return(1); + } + + return(index_read(buf, key, key_len, find_flag)); +} + +/***********************************************************************//** +Reads the next or previous row from a cursor, which must have previously been +positioned using index_read. 
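+For example, a range scan "SELECT * FROM t WHERE c1 > 5" calls
+index_read() once and then index_next() repeatedly; the latter lands
+here with direction == ROW_SEL_NEXT and match_mode == 0.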
+@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::general_fetch( +/*=======================*/ + uchar* buf, /*!< in/out: buffer for next row in MySQL + format */ + uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */ + uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or + ROW_SEL_EXACT_PREFIX */ +{ + dberr_t ret; + int error; + + DBUG_ENTER("general_fetch"); + + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + + innobase_srv_conc_enter_innodb(prebuilt->trx); + + ret = row_search_for_mysql( + (byte*) buf, 0, prebuilt, match_mode, direction); + + innobase_srv_conc_exit_innodb(prebuilt->trx); + + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1); +#ifdef EXTENDED_FOR_USERSTAT + rows_read++; + if (active_index < MAX_KEY) + index_rows_read[active_index]++; +#endif + break; + case DB_RECORD_NOT_FOUND: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + default: + error = convert_error_code_to_mysql( + ret, prebuilt->table->flags, user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + DBUG_RETURN(error); +} + +/***********************************************************************//** +Reads the next row from a cursor, which must have previously been +positioned using index_read. +@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::index_next( +/*====================*/ + uchar* buf) /*!< in/out: buffer for next row in MySQL + format */ +{ + ha_statistic_increment(&SSV::ha_read_next_count); + + return(general_fetch(buf, ROW_SEL_NEXT, 0)); +} + +/*******************************************************************//** +Reads the next row matching to the key value given as the parameter. +@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::index_next_same( +/*=========================*/ + uchar* buf, /*!< in/out: buffer for the row */ + const uchar* key, /*!< in: key value */ + uint keylen) /*!< in: key value length */ +{ + ha_statistic_increment(&SSV::ha_read_next_count); + + return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode)); +} + +/***********************************************************************//** +Reads the previous row from a cursor, which must have previously been +positioned using index_read. 
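+Used, for example, when the optimizer resolves "ORDER BY c1 DESC"
+with a backwards scan of an index on c1.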
+@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::index_prev( +/*====================*/ + uchar* buf) /*!< in/out: buffer for previous row in MySQL format */ +{ + ha_statistic_increment(&SSV::ha_read_prev_count); + + return(general_fetch(buf, ROW_SEL_PREV, 0)); +} + +/********************************************************************//** +Positions a cursor on the first record in an index and reads the +corresponding row to buf. +@return 0, HA_ERR_END_OF_FILE, or error code */ +UNIV_INTERN +int +ha_innobase::index_first( +/*=====================*/ + uchar* buf) /*!< in/out: buffer for the row */ +{ + int error; + + DBUG_ENTER("index_first"); + ha_statistic_increment(&SSV::ha_read_first_count); + + error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/********************************************************************//** +Positions a cursor on the last record in an index and reads the +corresponding row to buf. +@return 0, HA_ERR_END_OF_FILE, or error code */ +UNIV_INTERN +int +ha_innobase::index_last( +/*====================*/ + uchar* buf) /*!< in/out: buffer for the row */ +{ + int error; + + DBUG_ENTER("index_last"); + ha_statistic_increment(&SSV::ha_read_last_count); + + error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/****************************************************************//** +Initialize a table scan. +@return 0 or error number */ +UNIV_INTERN +int +ha_innobase::rnd_init( +/*==================*/ + bool scan) /*!< in: TRUE if table/index scan FALSE otherwise */ +{ + int err; + + /* Store the active index value so that we can restore the original + value after a scan */ + + if (prebuilt->clust_index_was_generated) { + err = change_active_index(MAX_KEY); + } else { + err = change_active_index(primary_key); + } + + /* Don't use semi-consistent read in random row reads (by position). + This means we must disable semi_consistent_read if scan is false */ + + if (!scan) { + try_semi_consistent_read(0); + } + + start_of_scan = 1; + + return(err); +} + +/*****************************************************************//** +Ends a table scan. +@return 0 or error number */ +UNIV_INTERN +int +ha_innobase::rnd_end(void) +/*======================*/ +{ + return(index_end()); +} + +/*****************************************************************//** +Reads the next row in a table scan (also used to read the FIRST row +in a table scan). +@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::rnd_next( +/*==================*/ + uchar* buf) /*!< in/out: returns the row in this buffer, + in MySQL format */ +{ + int error; + + DBUG_ENTER("rnd_next"); + ha_statistic_increment(&SSV::ha_read_rnd_next_count); + + if (start_of_scan) { + error = index_first(buf); + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + start_of_scan = 0; + } else { + error = general_fetch(buf, ROW_SEL_NEXT, 0); + } + + DBUG_RETURN(error); +} + +/**********************************************************************//** +Fetches a row from the table based on a row reference. 
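+The reference is one earlier saved by position(): the primary key
+value, or the internally generated row id if the table has no primary
+key.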
+@return 0, HA_ERR_KEY_NOT_FOUND, or error code */ +UNIV_INTERN +int +ha_innobase::rnd_pos( +/*=================*/ + uchar* buf, /*!< in/out: buffer for the row */ + uchar* pos) /*!< in: primary key value of the row in the + MySQL format, or the row id if the clustered + index was internally generated by InnoDB; the + length of data in pos has to be ref_length */ +{ + int error; + DBUG_ENTER("rnd_pos"); + DBUG_DUMP("key", pos, ref_length); + + ha_statistic_increment(&SSV::ha_read_rnd_count); + + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + /* Note that we assume the length of the row reference is fixed + for the table, and it is == ref_length */ + + error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT); + + if (error) { + DBUG_PRINT("error", ("Got error: %d", error)); + } + + DBUG_RETURN(error); +} + +/**********************************************************************//** +Initialize FT index scan +@return 0 or error number */ +UNIV_INTERN +int +ha_innobase::ft_init() +/*==================*/ +{ + DBUG_ENTER("ft_init"); + + trx_t* trx = check_trx_exists(ha_thd()); + + /* FTS queries are not treated as autocommit non-locking selects. + This is because the FTS implementation can acquire locks behind + the scenes. This has not been verified but it is safer to treat + them as regular read only transactions for now. */ + + if (!trx_is_started(trx)) { + ++trx->will_lock; + } + + DBUG_RETURN(rnd_init(false)); +} + +/**********************************************************************//** +Initialize FT index scan +@return FT_INFO structure if successful or NULL */ +UNIV_INTERN +FT_INFO* +ha_innobase::ft_init_ext( +/*=====================*/ + uint flags, /* in: */ + uint keynr, /* in: */ + String* key) /* in: */ +{ + trx_t* trx; + dict_table_t* ft_table; + dberr_t error; + byte* query = (byte*) key->ptr(); + ulint query_len = key->length(); + const CHARSET_INFO* char_set = key->charset(); + NEW_FT_INFO* fts_hdl = NULL; + dict_index_t* index; + fts_result_t* result; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + + if (fts_enable_diag_print) { + fprintf(stderr, "keynr=%u, '%.*s'\n", + keynr, (int) key->length(), (byte*) key->ptr()); + + if (flags & FT_BOOL) { + fprintf(stderr, "BOOL search\n"); + } else { + fprintf(stderr, "NL search\n"); + } + } + + /* FIXME: utf32 and utf16 are not compatible with some + string function used. So to convert them to uft8 before + proceed. */ + if (strcmp(char_set->csname, "utf32") == 0 + || strcmp(char_set->csname, "utf16") == 0) { + buf_tmp_used = innobase_convert_string( + buf_tmp, sizeof(buf_tmp) - 1, + &my_charset_utf8_general_ci, + query, query_len, (CHARSET_INFO*) char_set, + &num_errors); + + query = (byte*) buf_tmp; + query_len = buf_tmp_used; + query[query_len] = 0; + } + + trx = prebuilt->trx; + + /* FTS queries are not treated as autocommit non-locking selects. + This is because the FTS implementation can acquire locks behind + the scenes. This has not been verified but it is safer to treat + them as regular read only transactions for now. 
*/ + + if (!trx_is_started(trx)) { + ++trx->will_lock; + } + + ft_table = prebuilt->table; + + /* Table does not have an FTS index */ + if (!ft_table->fts || ib_vector_is_empty(ft_table->fts->indexes)) { + my_error(ER_TABLE_HAS_NO_FT, MYF(0)); + return(NULL); + } + + /* If tablespace is discarded, we should return here */ + if (dict_table_is_discarded(ft_table)) { + my_error(ER_NO_SUCH_TABLE, MYF(0), table->s->db.str, + table->s->table_name.str); + return(NULL); + } + + if (keynr == NO_SUCH_KEY) { + /* FIXME: Investigate the NO_SUCH_KEY usage */ + index = (dict_index_t*) ib_vector_getp(ft_table->fts->indexes, 0); + } else { + index = innobase_get_index(keynr); + } + + if (!index || index->type != DICT_FTS) { + my_error(ER_TABLE_HAS_NO_FT, MYF(0)); + return(NULL); + } + + if (!(ft_table->fts->fts_status & ADDED_TABLE_SYNCED)) { + fts_init_index(ft_table, FALSE); + + ft_table->fts->fts_status |= ADDED_TABLE_SYNCED; + } + + error = fts_query(trx, index, flags, query, query_len, &result); + + if (error != DB_SUCCESS) { + my_error(convert_error_code_to_mysql(error, 0, NULL), + MYF(0)); + return(NULL); + } + + /* Allocate FTS handler, and instantiate it before return */ + fts_hdl = static_cast<NEW_FT_INFO*>(my_malloc(sizeof(NEW_FT_INFO), + MYF(0))); + + fts_hdl->please = const_cast<_ft_vft*>(&ft_vft_result); + fts_hdl->could_you = const_cast<_ft_vft_ext*>(&ft_vft_ext_result); + fts_hdl->ft_prebuilt = prebuilt; + fts_hdl->ft_result = result; + + /* FIXME: Re-evluate the condition when Bug 14469540 + is resolved */ + prebuilt->in_fts_query = true; + + return((FT_INFO*) fts_hdl); +} + +/*****************************************************************//** +Set up search tuple for a query through FTS_DOC_ID_INDEX on +supplied Doc ID. This is used by MySQL to retrieve the documents +once the search result (Doc IDs) is available */ +static +void +innobase_fts_create_doc_id_key( +/*===========================*/ + dtuple_t* tuple, /* in/out: prebuilt->search_tuple */ + const dict_index_t* + index, /* in: index (FTS_DOC_ID_INDEX) */ + doc_id_t* doc_id) /* in/out: doc id to search, value + could be changed to storage format + used for search. 
*/ +{ + doc_id_t temp_doc_id; + dfield_t* dfield = dtuple_get_nth_field(tuple, 0); + + ut_a(dict_index_get_n_unique(index) == 1); + + dtuple_set_n_fields(tuple, index->n_fields); + dict_index_copy_types(tuple, index, index->n_fields); + +#ifdef UNIV_DEBUG + /* The unique Doc ID field should be an eight-bytes integer */ + dict_field_t* field = dict_index_get_nth_field(index, 0); + ut_a(field->col->mtype == DATA_INT); + ut_ad(sizeof(*doc_id) == field->fixed_len); + ut_ad(innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME) == 0); +#endif /* UNIV_DEBUG */ + + /* Convert to storage byte order */ + mach_write_to_8(reinterpret_cast<byte*>(&temp_doc_id), *doc_id); + *doc_id = temp_doc_id; + dfield_set_data(dfield, doc_id, sizeof(*doc_id)); + + dtuple_set_n_fields_cmp(tuple, 1); + + for (ulint i = 1; i < index->n_fields; i++) { + dfield = dtuple_get_nth_field(tuple, i); + dfield_set_null(dfield); + } +} + +/**********************************************************************//** +Fetch next result from the FT result set +@return error code */ +UNIV_INTERN +int +ha_innobase::ft_read( +/*=================*/ + uchar* buf) /*!< in/out: buf contain result row */ +{ + fts_result_t* result; + int error; + row_prebuilt_t* ft_prebuilt; + + ft_prebuilt = ((NEW_FT_INFO*) ft_handler)->ft_prebuilt; + + ut_a(ft_prebuilt == prebuilt); + + result = ((NEW_FT_INFO*) ft_handler)->ft_result; + + if (result->current == NULL) { + /* This is the case where the FTS query did not + contain and matching documents. */ + if (result->rankings_by_id != NULL) { + /* Now that we have the complete result, we + need to sort the document ids on their rank + calculation. */ + + fts_query_sort_result_on_rank(result); + + result->current = const_cast<ib_rbt_node_t*>( + rbt_first(result->rankings_by_rank)); + } else { + ut_a(result->current == NULL); + } + } else { + result->current = const_cast<ib_rbt_node_t*>( + rbt_next(result->rankings_by_rank, result->current)); + } + +next_record: + + if (result->current != NULL) { + dict_index_t* index; + dtuple_t* tuple = prebuilt->search_tuple; + doc_id_t search_doc_id; + + /* If we only need information from result we can return + without fetching the table row */ + if (ft_prebuilt->read_just_key) { + table->status= 0; + return(0); + } + + index = dict_table_get_index_on_name( + prebuilt->table, FTS_DOC_ID_INDEX_NAME); + + /* Must find the index */ + ut_a(index); + + /* Switch to the FTS doc id index */ + prebuilt->index = index; + + fts_ranking_t* ranking = rbt_value( + fts_ranking_t, result->current); + + search_doc_id = ranking->doc_id; + + /* We pass a pointer of search_doc_id because it will be + converted to storage byte order used in the search + tuple. */ + innobase_fts_create_doc_id_key(tuple, index, &search_doc_id); + + innobase_srv_conc_enter_innodb(prebuilt->trx); + + dberr_t ret = row_search_for_mysql( + (byte*) buf, PAGE_CUR_GE, prebuilt, ROW_SEL_EXACT, 0); + + innobase_srv_conc_exit_innodb(prebuilt->trx); + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + case DB_RECORD_NOT_FOUND: + result->current = const_cast<ib_rbt_node_t*>( + rbt_next(result->rankings_by_rank, + result->current)); + + if (!result->current) { + /* exhaust the result set, should return + HA_ERR_END_OF_FILE just like + ha_innobase::general_fetch() and/or + ha_innobase::index_first() etc. 
*/ + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + } else { + goto next_record; + } + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + default: + error = convert_error_code_to_mysql( + ret, 0, user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + return(error); + } + + return(HA_ERR_END_OF_FILE); +} + +/************************************************************************* +*/ + +void +ha_innobase::ft_end() +{ + fprintf(stderr, "ft_end()\n"); + + rnd_end(); +} + +/*********************************************************************//** +Stores a reference to the current row to 'ref' field of the handle. Note +that in the case where we have generated the clustered index for the +table, the function parameter is illogical: we MUST ASSUME that 'record' +is the current 'position' of the handle, because if row ref is actually +the row id internally generated in InnoDB, then 'record' does not contain +it. We just guess that the row id must be for the record where the handle +was positioned the last time. */ +UNIV_INTERN +void +ha_innobase::position( +/*==================*/ + const uchar* record) /*!< in: row in MySQL format */ +{ + uint len; + + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + if (prebuilt->clust_index_was_generated) { + /* No primary key was defined for the table and we + generated the clustered index from row id: the + row reference will be the row id, not any key value + that MySQL knows of */ + + len = DATA_ROW_ID_LEN; + + memcpy(ref, prebuilt->row_id, len); + } else { + len = store_key_val_for_row(primary_key, (char*) ref, + ref_length, record); + } + + /* We assume that the 'ref' value len is always fixed for the same + table. 
*/
+
+ if (len != ref_length) {
+ sql_print_error("Stored ref len is %lu, but table ref len is "
+ "%lu", (ulong) len, (ulong) ref_length);
+ }
+}
+
+/*****************************************************************//**
+Check whether there exists a column named "FTS_DOC_ID", which is
+reserved for the InnoDB FTS Doc ID
+@return true if there exists an "FTS_DOC_ID" column */
+static
+bool
+create_table_check_doc_id_col(
+/*==========================*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ const TABLE* form, /*!< in: information on table
+ columns and indexes */
+ ulint* doc_id_col) /*!< out: Doc ID column number if
+ there exists an FTS_DOC_ID column,
+ ULINT_UNDEFINED if the column is of the
+ wrong type/name/size */
+{
+ for (ulint i = 0; i < form->s->fields; i++) {
+ const Field* field;
+ ulint col_type;
+ ulint col_len;
+ ulint unsigned_type;
+
+ field = form->field[i];
+
+ col_type = get_innobase_type_from_mysql_type(&unsigned_type,
+ field);
+
+ col_len = field->pack_length();
+
+ if (innobase_strcasecmp(field->field_name,
+ FTS_DOC_ID_COL_NAME) == 0) {
+
+ /* Note the name is case sensitive due to
+ our internal query parser */
+ if (col_type == DATA_INT
+ && !field->real_maybe_null()
+ && col_len == sizeof(doc_id_t)
+ && (strcmp(field->field_name,
+ FTS_DOC_ID_COL_NAME) == 0)) {
+ *doc_id_col = i;
+ } else {
+ push_warning_printf(
+ trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: FTS_DOC_ID column must be "
+ "of BIGINT NOT NULL type, and named "
+ "in all capitalized characters");
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name);
+ *doc_id_col = ULINT_UNDEFINED;
+ }
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/*****************************************************************//**
+Creates a table definition in an InnoDB database. */
+static __attribute__((nonnull, warn_unused_result))
+int
+create_table_def(
+/*=============*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ const TABLE* form, /*!< in: information on table
+ columns and indexes */
+ const char* table_name, /*!< in: table name */
+ const char* temp_path, /*!< in: if this is a table explicitly
+ created by the user with the
+ TEMPORARY keyword, then this
+ parameter is the dir path where the
+ table should be placed if we create
+ an .ibd file for it (no .ibd extension
+ in the path, though). Otherwise this
+ is a zero-length string */
+ const char* remote_path, /*!< in: remote path or a zero-length string */
+ ulint flags, /*!< in: table flags */
+ ulint flags2) /*!< in: table flags2 */
+{
+ THD* thd = trx->mysql_thd;
+ dict_table_t* table;
+ ulint n_cols;
+ dberr_t err;
+ ulint col_type;
+ ulint col_len;
+ ulint nulls_allowed;
+ ulint unsigned_type;
+ ulint binary_type;
+ ulint long_true_varchar;
+ ulint charset_no;
+ ulint i;
+ ulint doc_id_col = 0;
+ ibool has_doc_id_col = FALSE;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("create_table_def");
+ DBUG_PRINT("enter", ("table_name: %s", table_name));
+
+ DBUG_ASSERT(thd != NULL);
+
+ /* MySQL does the name length check.
But we do additional check + on the name length here */ + if (strlen(table_name) > MAX_FULL_NAME_LEN) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_TABLE_NAME, + "InnoDB: Table Name or Database Name is too long"); + + DBUG_RETURN(ER_TABLE_NAME); + } + + n_cols = form->s->fields; + + /* Check whether there already exists a FTS_DOC_ID column */ + if (create_table_check_doc_id_col(trx, form, &doc_id_col)){ + + /* Raise error if the Doc ID column is of wrong type or name */ + if (doc_id_col == ULINT_UNDEFINED) { + trx_commit_for_mysql(trx); + + err = DB_ERROR; + goto error_ret; + } else { + has_doc_id_col = TRUE; + } + } + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + + if (flags2 & DICT_TF2_FTS) { + /* Adjust for the FTS hidden field */ + if (!has_doc_id_col) { + table = dict_mem_table_create(table_name, 0, n_cols + 1, + flags, flags2, false); + + /* Set the hidden doc_id column. */ + table->fts->doc_col = n_cols; + } else { + table = dict_mem_table_create(table_name, 0, n_cols, + flags, flags2, false); + table->fts->doc_col = doc_id_col; + } + } else { + table = dict_mem_table_create(table_name, 0, n_cols, + flags, flags2, false); + } + + if (flags2 & DICT_TF2_TEMPORARY) { + ut_a(strlen(temp_path)); + table->dir_path_of_temp_table = + mem_heap_strdup(table->heap, temp_path); + } + + if (DICT_TF_HAS_DATA_DIR(flags)) { + ut_a(strlen(remote_path)); + table->data_dir_path = mem_heap_strdup(table->heap, remote_path); + } else { + table->data_dir_path = NULL; + } + heap = mem_heap_create(1000); + + for (i = 0; i < n_cols; i++) { + Field* field = form->field[i]; + + col_type = get_innobase_type_from_mysql_type(&unsigned_type, + field); + + if (!col_type) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_CREATE_TABLE, + "Error creating table '%s' with " + "column '%s'. Please check its " + "column type and try to re-create " + "the table with an appropriate " + "column type.", + table->name, field->field_name); + goto err_col; + } + + nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL; + binary_type = field->binary() ? DATA_BINARY_TYPE : 0; + + charset_no = 0; + + if (dtype_is_string_type(col_type)) { + + charset_no = (ulint) field->charset()->number; + + if (UNIV_UNLIKELY(charset_no > MAX_CHAR_COLL_NUM)) { + /* in data0type.h we assume that the + number fits in one byte in prtype */ + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_CREATE_TABLE, + "In InnoDB, charset-collation codes" + " must be below 256." + " Unsupported code %lu.", + (ulong) charset_no); + mem_heap_free(heap); + DBUG_RETURN(ER_CANT_CREATE_TABLE); + } + } + + /* we assume in dtype_form_prtype() that this fits in + two bytes */ + ut_a(static_cast<uint>(field->type()) <= MAX_CHAR_COLL_NUM); + col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes length field + for a true VARCHAR. Let us subtract that, so that the InnoDB + column length in the InnoDB data dictionary is the real + maximum byte length of the actual data. */ + + long_true_varchar = 0; + + if (field->type() == MYSQL_TYPE_VARCHAR) { + col_len -= ((Field_varstring*) field)->length_bytes; + + if (((Field_varstring*) field)->length_bytes == 2) { + long_true_varchar = DATA_LONG_TRUE_VARCHAR; + } + } + + /* First check whether the column to be added has a + system reserved name. 
*/ + if (dict_col_name_is_reserved(field->field_name)){ + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name); +err_col: + dict_mem_table_free(table); + mem_heap_free(heap); + trx_commit_for_mysql(trx); + + err = DB_ERROR; + goto error_ret; + } + + dict_mem_table_add_col(table, heap, + field->field_name, + col_type, + dtype_form_prtype( + (ulint) field->type() + | nulls_allowed | unsigned_type + | binary_type | long_true_varchar, + charset_no), + col_len); + } + + /* Add the FTS doc_id hidden column. */ + if (flags2 & DICT_TF2_FTS && !has_doc_id_col) { + fts_add_doc_id_column(table, heap); + } + + err = row_create_table_for_mysql(table, trx, false); + + mem_heap_free(heap); + + DBUG_EXECUTE_IF("ib_create_err_tablespace_exist", + err = DB_TABLESPACE_EXISTS;); + + if (err == DB_DUPLICATE_KEY || err == DB_TABLESPACE_EXISTS) { + char display_name[FN_REFLEN]; + char* buf_end = innobase_convert_identifier( + display_name, sizeof(display_name) - 1, + table_name, strlen(table_name), + thd, TRUE); + + *buf_end = '\0'; + + my_error(err == DB_DUPLICATE_KEY + ? ER_TABLE_EXISTS_ERROR + : ER_TABLESPACE_EXISTS, MYF(0), display_name); + } + + if (err == DB_SUCCESS && (flags2 & DICT_TF2_FTS)) { + fts_optimize_add_table(table); + } + +error_ret: + DBUG_RETURN(convert_error_code_to_mysql(err, flags, thd)); +} + +/*****************************************************************//** +Creates an index in an InnoDB database. */ +static +int +create_index( +/*=========*/ + trx_t* trx, /*!< in: InnoDB transaction handle */ + const TABLE* form, /*!< in: information on table + columns and indexes */ + ulint flags, /*!< in: InnoDB table flags */ + const char* table_name, /*!< in: table name */ + uint key_num) /*!< in: index number */ +{ + dict_index_t* index; + int error; + const KEY* key; + ulint ind_type; + ulint* field_lengths; + + DBUG_ENTER("create_index"); + + key = form->key_info + key_num; + + /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */ + ut_a(innobase_strcasecmp(key->name, innobase_index_reserve_name) != 0); + + if (key->flags & HA_FULLTEXT) { + index = dict_mem_index_create(table_name, key->name, 0, + DICT_FTS, + key->user_defined_key_parts); + + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + KEY_PART_INFO* key_part = key->key_part + i; + dict_mem_index_add_field( + index, key_part->field->field_name, 0); + } + + DBUG_RETURN(convert_error_code_to_mysql( + row_create_index_for_mysql( + index, trx, NULL), + flags, NULL)); + + } + + ind_type = 0; + + if (key_num == form->s->primary_key) { + ind_type |= DICT_CLUSTERED; + } + + if (key->flags & HA_NOSAME) { + ind_type |= DICT_UNIQUE; + } + + field_lengths = (ulint*) my_malloc( + key->user_defined_key_parts * sizeof * + field_lengths, MYF(MY_FAE)); + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + + index = dict_mem_index_create(table_name, key->name, 0, + ind_type, key->user_defined_key_parts); + + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + KEY_PART_INFO* key_part = key->key_part + i; + ulint prefix_len; + ulint col_type; + ulint is_unsigned; + + + /* (The flag HA_PART_KEY_SEG denotes in MySQL a + column prefix field in an index: we only store a + specified number of first bytes of the column to + the index field.) The flag does not seem to be + properly set by MySQL. Let us fall back on testing + the length of the key part versus the column. 
*/ + + Field* field = NULL; + + for (ulint j = 0; j < form->s->fields; j++) { + + field = form->field[j]; + + if (0 == innobase_strcasecmp( + field->field_name, + key_part->field->field_name)) { + /* Found the corresponding column */ + + goto found; + } + } + + ut_error; +found: + col_type = get_innobase_type_from_mysql_type( + &is_unsigned, key_part->field); + + if (DATA_BLOB == col_type + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*) field)->length_bytes)) { + + switch (col_type) { + default: + prefix_len = key_part->length; + break; + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + sql_print_error( + "MySQL is trying to create a column " + "prefix index field, on an " + "inappropriate data type. Table " + "name %s, column name %s.", + table_name, + key_part->field->field_name); + + prefix_len = 0; + } + } else { + prefix_len = 0; + } + + field_lengths[i] = key_part->length; + + dict_mem_index_add_field( + index, key_part->field->field_name, prefix_len); + } + + ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS)); + + /* Even though we've defined max_supported_key_part_length, we + still do our own checking using field_lengths to be absolutely + sure we don't create too long indexes. */ + + error = convert_error_code_to_mysql( + row_create_index_for_mysql(index, trx, field_lengths), + flags, NULL); + + my_free(field_lengths); + + DBUG_RETURN(error); +} + +/*****************************************************************//** +Creates an index to an InnoDB table when the user has defined no +primary index. */ +static +int +create_clustered_index_when_no_primary( +/*===================================*/ + trx_t* trx, /*!< in: InnoDB transaction handle */ + ulint flags, /*!< in: InnoDB table flags */ + const char* table_name) /*!< in: table name */ +{ + dict_index_t* index; + dberr_t error; + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + index = dict_mem_index_create(table_name, + innobase_index_reserve_name, + 0, DICT_CLUSTERED, 0); + + error = row_create_index_for_mysql(index, trx, NULL); + + return(convert_error_code_to_mysql(error, flags, NULL)); +} + +/*****************************************************************//** +Return a display name for the row format +@return row format name */ +UNIV_INTERN +const char* +get_row_format_name( +/*================*/ + enum row_type row_format) /*!< in: Row Format */ +{ + switch (row_format) { + case ROW_TYPE_COMPACT: + return("COMPACT"); + case ROW_TYPE_COMPRESSED: + return("COMPRESSED"); + case ROW_TYPE_DYNAMIC: + return("DYNAMIC"); + case ROW_TYPE_REDUNDANT: + return("REDUNDANT"); + case ROW_TYPE_DEFAULT: + return("DEFAULT"); + case ROW_TYPE_FIXED: + return("FIXED"); + case ROW_TYPE_PAGE: + case ROW_TYPE_NOT_USED: + default: + break; + } + return("NOT USED"); +} + +/** If file-per-table is missing, issue warning and set ret false */ +#define CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace)\ + if (!use_tablespace) { \ + push_warning_printf( \ + thd, Sql_condition::WARN_LEVEL_WARN, \ + ER_ILLEGAL_HA_CREATE_OPTION, \ + "InnoDB: ROW_FORMAT=%s requires" \ + " innodb_file_per_table.", \ + get_row_format_name(row_format)); \ + ret = "ROW_FORMAT"; \ + } + +/** If file-format is Antelope, issue warning and set ret false */ +#define CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE \ + if (srv_file_format < 
UNIV_FORMAT_B) { \ + push_warning_printf( \ + thd, Sql_condition::WARN_LEVEL_WARN, \ + ER_ILLEGAL_HA_CREATE_OPTION, \ + "InnoDB: ROW_FORMAT=%s requires" \ + " innodb_file_format > Antelope.", \ + get_row_format_name(row_format)); \ + ret = "ROW_FORMAT"; \ + } + + +/*****************************************************************//** +Validates the create options. We may build on this function +in future. For now, it checks two specifiers: +KEY_BLOCK_SIZE and ROW_FORMAT +If innodb_strict_mode is not set then this function is a no-op +@return NULL if valid, string if not. */ +UNIV_INTERN +const char* +create_options_are_invalid( +/*=======================*/ + THD* thd, /*!< in: connection thread. */ + TABLE* form, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: create info. */ + bool use_tablespace) /*!< in: srv_file_per_table */ +{ + ibool kbs_specified = FALSE; + const char* ret = NULL; + enum row_type row_format = form->s->row_type; + + ut_ad(thd != NULL); + + /* If innodb_strict_mode is not set don't do any validation. */ + if (!(THDVAR(thd, strict_mode))) { + return(NULL); + } + + ut_ad(form != NULL); + ut_ad(create_info != NULL); + + /* First check if a non-zero KEY_BLOCK_SIZE was specified. */ + if (create_info->key_block_size) { + kbs_specified = TRUE; + switch (create_info->key_block_size) { + ulint kbs_max; + case 1: + case 2: + case 4: + case 8: + case 16: + /* Valid KEY_BLOCK_SIZE, check its dependencies. */ + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_per_table."); + ret = "KEY_BLOCK_SIZE"; + } + if (srv_file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_format > Antelope."); + ret = "KEY_BLOCK_SIZE"; + } + + /* The maximum KEY_BLOCK_SIZE (KBS) is 16. But if + UNIV_PAGE_SIZE is smaller than 16k, the maximum + KBS is also smaller. */ + kbs_max = ut_min( + 1 << (UNIV_PAGE_SSIZE_MAX - 1), + 1 << (PAGE_ZIP_SSIZE_MAX - 1)); + if (create_info->key_block_size > kbs_max) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE=%ld" + " cannot be larger than %ld.", + create_info->key_block_size, + kbs_max); + ret = "KEY_BLOCK_SIZE"; + } + break; + default: + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: invalid KEY_BLOCK_SIZE = %lu." + " Valid values are [1, 2, 4, 8, 16]", + create_info->key_block_size); + ret = "KEY_BLOCK_SIZE"; + break; + } + } + + /* Check for a valid Innodb ROW_FORMAT specifier and + other incompatibilities. 
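+	For example, under innodb_strict_mode=ON a (hypothetical) statement
+	such as
+
+	    CREATE TABLE t1 (a INT) ENGINE=InnoDB
+	        ROW_FORMAT=COMPACT KEY_BLOCK_SIZE=8;
+
+	makes this function return "KEY_BLOCK_SIZE", and
+	ha_innobase::create() then fails with HA_WRONG_CREATE_OPTION.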
*/ + switch (row_format) { + case ROW_TYPE_COMPRESSED: + CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace); + CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE; + break; + case ROW_TYPE_DYNAMIC: + CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace); + CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE; + /* fall through since dynamic also shuns KBS */ + case ROW_TYPE_COMPACT: + case ROW_TYPE_REDUNDANT: + if (kbs_specified) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: cannot specify ROW_FORMAT = %s" + " with KEY_BLOCK_SIZE.", + get_row_format_name(row_format)); + ret = "KEY_BLOCK_SIZE"; + } + break; + case ROW_TYPE_DEFAULT: + break; + case ROW_TYPE_FIXED: + case ROW_TYPE_PAGE: + case ROW_TYPE_NOT_USED: + default: + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, \ + "InnoDB: invalid ROW_FORMAT specifier."); + ret = "ROW_TYPE"; + break; + } + + /* Use DATA DIRECTORY only with file-per-table. */ + if (create_info->data_file_name && !use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY requires" + " innodb_file_per_table."); + ret = "DATA DIRECTORY"; + } + + /* Do not use DATA DIRECTORY with TEMPORARY TABLE. */ + if (create_info->data_file_name + && create_info->options & HA_LEX_CREATE_TMP_TABLE) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY cannot be used" + " for TEMPORARY tables."); + ret = "DATA DIRECTORY"; + } + + /* Do not allow INDEX_DIRECTORY */ + if (create_info->index_file_name) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: INDEX DIRECTORY is not supported"); + ret = "INDEX DIRECTORY"; + } + + return(ret); +} + +/*****************************************************************//** +Update create_info. Used in SHOW CREATE TABLE et al. */ +UNIV_INTERN +void +ha_innobase::update_create_info( +/*============================*/ + HA_CREATE_INFO* create_info) /*!< in/out: create info */ +{ + if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) { + ha_innobase::info(HA_STATUS_AUTO); + create_info->auto_increment_value = stats.auto_increment_value; + } + + /* Update the DATA DIRECTORY name from SYS_DATAFILES. */ + dict_get_and_save_data_dir_path(prebuilt->table, false); + + if (prebuilt->table->data_dir_path) { + create_info->data_file_name = prebuilt->table->data_dir_path; + } +} + +/*****************************************************************//** +Initialize the table FTS stopword list +@return TRUE if success */ +UNIV_INTERN +ibool +innobase_fts_load_stopword( +/*=======================*/ + dict_table_t* table, /*!< in: Table has the FTS */ + trx_t* trx, /*!< in: transaction */ + THD* thd) /*!< in: current thread */ +{ + return(fts_load_stopword(table, trx, + innobase_server_stopword_table, + THDVAR(thd, ft_user_stopword_table), + THDVAR(thd, ft_enable_stopword), FALSE)); +} + +/*****************************************************************//** +Parses the table name into normal name and either temp path or remote path +if needed. 
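+For example (hypothetical inputs): a plain "test/t1" is only normalized
+into norm_name; for CREATE TEMPORARY TABLE the full path passed in by the
+server is copied into temp_path; and DATA DIRECTORY='/mnt/fast' on a
+non-temporary table (with innodb_file_per_table enabled) is copied into
+remote_path.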
+@return 0 if successful, otherwise an error number */
+UNIV_INTERN
+int
+ha_innobase::parse_table_name(
+/*==========================*/
+	const char*	name,		/*!< in/out: table name provided */
+	HA_CREATE_INFO*	create_info,	/*!< in: more information of the
+					created table, contains also the
+					create statement string */
+	ulint		flags,		/*!< in: flags */
+	ulint		flags2,		/*!< in: flags2 */
+	char*		norm_name,	/*!< out: normalized table name */
+	char*		temp_path,	/*!< out: absolute path of table */
+	char*		remote_path)	/*!< out: remote path of table */
+{
+	THD*		thd = ha_thd();
+	bool		use_tablespace = flags2 & DICT_TF2_USE_TABLESPACE;
+	DBUG_ENTER("ha_innobase::parse_table_name");
+
+#ifdef __WIN__
+	/* Names passed in from the server are in two formats:
+	1. <database_name>/<table_name>: for normal table creation
+	2. full path: for temp table creation, or DATA DIRECTORY.
+
+	When srv_file_per_table is on and mysqld_embedded is off,
+	check for the full path pattern, i.e.
+	X:\dir\...,		X is a drive letter, or
+	\\dir1\dir2\...,	UNC path
+	and return an error if the name is in full path format but we
+	are not creating a temporary table. Currently InnoDB does not
+	support symbolic links on Windows. */
+
+	if (use_tablespace
+	    && !mysqld_embedded
+	    && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) {
+
+		if ((name[1] == ':')
+		    || (name[0] == '\\' && name[1] == '\\')) {
+			sql_print_error("Cannot create table %s\n", name);
+			DBUG_RETURN(HA_ERR_GENERIC);
+		}
+	}
+#endif
+
+	normalize_table_name(norm_name, name);
+	temp_path[0] = '\0';
+	remote_path[0] = '\0';
+
+	/* A full path is used for TEMPORARY TABLE and DATA DIRECTORY.
+	In the case of:
+	CREATE TEMPORARY TABLE ... DATA DIRECTORY={path} ... ;
+	we ignore the DATA DIRECTORY. */
+	if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+		strncpy(temp_path, name, FN_REFLEN - 1);
+	}
+
+	if (create_info->data_file_name) {
+		bool	ignore = false;
+
+		/* Use DATA DIRECTORY only with file-per-table. */
+		if (!use_tablespace) {
+			push_warning(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
+				"InnoDB: DATA DIRECTORY requires"
+				" innodb_file_per_table.");
+			ignore = true;
+		}
+
+		/* Do not use DATA DIRECTORY with TEMPORARY TABLE. */
+		if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+			push_warning(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
+				"InnoDB: DATA DIRECTORY cannot be"
+				" used for TEMPORARY tables.");
+			ignore = true;
+		}
+
+		if (ignore) {
+			push_warning_printf(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				WARN_OPTION_IGNORED,
+				ER_DEFAULT(WARN_OPTION_IGNORED),
+				"DATA DIRECTORY");
+		} else {
+			strncpy(remote_path, create_info->data_file_name,
+				FN_REFLEN - 1);
+		}
+	}
+
+	if (create_info->index_file_name) {
+		push_warning_printf(
+			thd, Sql_condition::WARN_LEVEL_WARN,
+			WARN_OPTION_IGNORED,
+			ER_DEFAULT(WARN_OPTION_IGNORED),
+			"INDEX DIRECTORY");
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Determines InnoDB table flags. 
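+For example (hypothetical): with innodb_file_per_table=ON and
+innodb_file_format=Barracuda, KEY_BLOCK_SIZE=4 maps to zip_ssize=3
+(512 << 3 = 4096 bytes), and a ROW_FORMAT left at DEFAULT is switched
+to COMPRESSED.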
+@retval true if successful, false if error */ +UNIV_INTERN +bool +innobase_table_flags( +/*=================*/ + const TABLE* form, /*!< in: table */ + const HA_CREATE_INFO* create_info, /*!< in: information + on table columns and indexes */ + THD* thd, /*!< in: connection */ + bool use_tablespace, /*!< in: whether to create + outside system tablespace */ + ulint* flags, /*!< out: DICT_TF flags */ + ulint* flags2) /*!< out: DICT_TF2 flags */ +{ + DBUG_ENTER("innobase_table_flags"); + + const char* fts_doc_id_index_bad = NULL; + bool zip_allowed = true; + ulint zip_ssize = 0; + enum row_type row_format; + rec_format_t innodb_row_format = REC_FORMAT_COMPACT; + bool use_data_dir; + + /* Cache the value of innodb_file_format, in case it is + modified by another thread while the table is being created. */ + const ulint file_format_allowed = srv_file_format; + + *flags = 0; + *flags2 = 0; + + /* Check if there are any FTS indexes defined on this table. */ + for (uint i = 0; i < form->s->keys; i++) { + const KEY* key = &form->key_info[i]; + + if (key->flags & HA_FULLTEXT) { + *flags2 |= DICT_TF2_FTS; + + /* We don't support FTS indexes in temporary + tables. */ + if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { + + my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0)); + DBUG_RETURN(false); + } + + if (key->flags & HA_USES_PARSER) { + my_error(ER_INNODB_NO_FT_USES_PARSER, MYF(0)); + DBUG_RETURN(false); + } + + if (fts_doc_id_index_bad) { + goto index_bad; + } + } + + if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + /* Do a pre-check on FTS DOC ID index */ + if (!(key->flags & HA_NOSAME) + || strcmp(key->name, FTS_DOC_ID_INDEX_NAME) + || strcmp(key->key_part[0].field->field_name, + FTS_DOC_ID_COL_NAME)) { + fts_doc_id_index_bad = key->name; + } + + if (fts_doc_id_index_bad && (*flags2 & DICT_TF2_FTS)) { +index_bad: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + fts_doc_id_index_bad); + DBUG_RETURN(false); + } + } + + if (create_info->key_block_size) { + /* The requested compressed page size (key_block_size) + is given in kilobytes. If it is a valid number, store + that value as the number of log2 shifts from 512 in + zip_ssize. Zero means it is not compressed. */ + ulint zssize; /* Zip Shift Size */ + ulint kbsize; /* Key Block Size */ + for (zssize = kbsize = 1; + zssize <= ut_min(UNIV_PAGE_SSIZE_MAX, + PAGE_ZIP_SSIZE_MAX); + zssize++, kbsize <<= 1) { + if (kbsize == create_info->key_block_size) { + zip_ssize = zssize; + break; + } + } + + /* Make sure compressed row format is allowed. 
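+		For example, if innodb_file_per_table=OFF or the file
+		format is still Antelope, zip_allowed becomes FALSE below
+		and the requested KEY_BLOCK_SIZE is ignored with a
+		warning, so the table is created uncompressed.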
*/ + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_per_table."); + zip_allowed = FALSE; + } + + if (file_format_allowed < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_format > Antelope."); + zip_allowed = FALSE; + } + + if (!zip_allowed + || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, + PAGE_ZIP_SSIZE_MAX)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ignoring KEY_BLOCK_SIZE=%lu.", + create_info->key_block_size); + } + } + + row_format = form->s->row_type; + + if (zip_ssize && zip_allowed) { + /* if ROW_FORMAT is set to default, + automatically change it to COMPRESSED.*/ + if (row_format == ROW_TYPE_DEFAULT) { + row_format = ROW_TYPE_COMPRESSED; + } else if (row_format != ROW_TYPE_COMPRESSED) { + /* ROW_FORMAT other than COMPRESSED + ignores KEY_BLOCK_SIZE. It does not + make sense to reject conflicting + KEY_BLOCK_SIZE and ROW_FORMAT, because + such combinations can be obtained + with ALTER TABLE anyway. */ + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ignoring KEY_BLOCK_SIZE=%lu" + " unless ROW_FORMAT=COMPRESSED.", + create_info->key_block_size); + zip_allowed = FALSE; + } + } else { + /* zip_ssize == 0 means no KEY_BLOCK_SIZE.*/ + if (row_format == ROW_TYPE_COMPRESSED && zip_allowed) { + /* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE + implies half the maximum KEY_BLOCK_SIZE(*1k) or + UNIV_PAGE_SIZE, whichever is less. */ + zip_ssize = ut_min(UNIV_PAGE_SSIZE_MAX, + PAGE_ZIP_SSIZE_MAX) - 1; + } + } + + /* Validate the row format. Correct it if necessary */ + switch (row_format) { + case ROW_TYPE_REDUNDANT: + innodb_row_format = REC_FORMAT_REDUNDANT; + break; + + case ROW_TYPE_COMPRESSED: + case ROW_TYPE_DYNAMIC: + if (!use_tablespace) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s requires" + " innodb_file_per_table.", + get_row_format_name(row_format)); + } else if (file_format_allowed == UNIV_FORMAT_A) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s requires" + " innodb_file_format > Antelope.", + get_row_format_name(row_format)); + } else { + innodb_row_format = (row_format == ROW_TYPE_DYNAMIC + ? REC_FORMAT_DYNAMIC + : REC_FORMAT_COMPRESSED); + break; + } + zip_allowed = FALSE; + /* fall through to set row_format = COMPACT */ + case ROW_TYPE_NOT_USED: + case ROW_TYPE_FIXED: + case ROW_TYPE_PAGE: + default: + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: assuming ROW_FORMAT=COMPACT."); + case ROW_TYPE_DEFAULT: + /* If we fell through, set row format to Compact. 
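+		A (hypothetical) CREATE TABLE ... ROW_FORMAT=FIXED, for
+		example, ends up here: the warning above is pushed and the
+		table is created with ROW_FORMAT=COMPACT instead.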
*/ + row_format = ROW_TYPE_COMPACT; + case ROW_TYPE_COMPACT: + break; + } + + /* Set the table flags */ + if (!zip_allowed) { + zip_ssize = 0; + } + + use_data_dir = use_tablespace + && ((create_info->data_file_name != NULL) + && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); + + dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + + if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { + *flags2 |= DICT_TF2_TEMPORARY; + } + + if (use_tablespace) { + *flags2 |= DICT_TF2_USE_TABLESPACE; + } + + /* Set the flags2 when create table or alter tables */ + *flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + *flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;); + + DBUG_RETURN(true); +} + +/*****************************************************************//** +Creates a new table to an InnoDB database. +@return error number */ +UNIV_INTERN +int +ha_innobase::create( +/*================*/ + const char* name, /*!< in: table name */ + TABLE* form, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info) /*!< in: more information of the + created table, contains also the + create statement string */ +{ + int error; + trx_t* parent_trx; + trx_t* trx; + int primary_key_no; + uint i; + char norm_name[FN_REFLEN]; /* {database}/{tablename} */ + char temp_path[FN_REFLEN]; /* absolute path of temp frm */ + char remote_path[FN_REFLEN]; /* absolute path of table */ + THD* thd = ha_thd(); + ib_int64_t auto_inc_value; + + /* Cache the global variable "srv_file_per_table" to a local + variable before using it. Note that "srv_file_per_table" + is not under dict_sys mutex protection, and could be changed + while creating the table. So we read the current value here + and make all further decisions based on this. */ + bool use_tablespace = srv_file_per_table; + + /* Zip Shift Size - log2 - 9 of compressed page size, + zero for uncompressed */ + ulint flags; + ulint flags2; + dict_table_t* innobase_table = NULL; + + const char* stmt; + size_t stmt_len; + + DBUG_ENTER("ha_innobase::create"); + + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(create_info != NULL); + + if (form->s->fields > REC_MAX_N_USER_FIELDS) { + DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS); + } else if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_INNODB_READ_ONLY); + } + + /* Create the table definition in InnoDB */ + + /* Validate create options if innodb_strict_mode is set. */ + if (create_options_are_invalid( + thd, form, create_info, use_tablespace)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + if (!innobase_table_flags(form, create_info, + thd, use_tablespace, + &flags, &flags2)) { + DBUG_RETURN(-1); + } + + error = parse_table_name(name, create_info, flags, flags2, + norm_name, temp_path, remote_path); + if (error) { + DBUG_RETURN(error); + } + + /* Look for a primary key */ + primary_key_no = (form->s->primary_key != MAX_KEY ? + (int) form->s->primary_key : + -1); + + /* Our function innobase_get_mysql_key_number_for_index assumes + the primary key is always number 0, if it exists */ + ut_a(primary_key_no == -1 || primary_key_no == 0); + + /* Check for name conflicts (with reserved name) for + any user indices to be created. */ + if (innobase_index_name_is_reserved(thd, form->key_info, + form->s->keys)) { + DBUG_RETURN(-1); + } + + if (row_is_magic_monitor_table(norm_name)) { + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_COMMAND, + "Using the table name %s to enable " + "diagnostic output is deprecated " + "and may be removed in future releases. 
" + "Use INFORMATION_SCHEMA or " + "PERFORMANCE_SCHEMA tables or " + "SET GLOBAL innodb_status_output=ON.", + dict_remove_db_name(norm_name)); + + /* Limit innodb monitor access to users with PROCESS privilege. + See http://bugs.mysql.com/32710 why we chose PROCESS. */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(HA_ERR_GENERIC); + } + } + + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + parent_trx = check_trx_exists(thd); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(parent_trx); + + trx = innobase_trx_allocate(thd); + + if (UNIV_UNLIKELY(trx->fake_changes)) { + innobase_commit_low(trx); + trx_free_for_mysql(trx); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + /* Latch the InnoDB data dictionary exclusively so that no deadlocks + or lock waits can happen in it during a table create operation. + Drop table etc. do this latching in row0mysql.cc. */ + + row_mysql_lock_data_dictionary(trx); + + error = create_table_def(trx, form, norm_name, temp_path, + remote_path, flags, flags2); + if (error) { + goto cleanup; + } + + /* Create the keys */ + + if (form->s->keys == 0 || primary_key_no == -1) { + /* Create an index which is used as the clustered index; + order the rows by their row id which is internally generated + by InnoDB */ + + error = create_clustered_index_when_no_primary( + trx, flags, norm_name); + if (error) { + goto cleanup; + } + } + + if (primary_key_no != -1) { + /* In InnoDB the clustered index must always be created + first */ + if ((error = create_index(trx, form, flags, norm_name, + (uint) primary_key_no))) { + goto cleanup; + } + } + + /* Create the ancillary tables that are common to all FTS indexes on + this table. */ + if (flags2 & DICT_TF2_FTS) { + enum fts_doc_id_index_enum ret; + + innobase_table = dict_table_open_on_name( + norm_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); + + ut_a(innobase_table); + + /* Check whether there already exists FTS_DOC_ID_INDEX */ + ret = innobase_fts_check_doc_id_index_in_def( + form->s->keys, form->key_info); + + switch (ret) { + case FTS_INCORRECT_DOC_ID_INDEX: + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_NAME_FOR_INDEX, + " InnoDB: Index name %s is reserved" + " for the unique index on" + " FTS_DOC_ID column for FTS" + " Document ID indexing" + " on table %s. Please check" + " the index definition to" + " make sure it is of correct" + " type\n", + FTS_DOC_ID_INDEX_NAME, + innobase_table->name); + + if (innobase_table->fts) { + fts_free(innobase_table); + } + + dict_table_close(innobase_table, TRUE, FALSE); + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + FTS_DOC_ID_INDEX_NAME); + error = -1; + goto cleanup; + case FTS_EXIST_DOC_ID_INDEX: + case FTS_NOT_EXIST_DOC_ID_INDEX: + break; + } + + dberr_t err = fts_create_common_tables( + trx, innobase_table, norm_name, + (ret == FTS_EXIST_DOC_ID_INDEX)); + + error = convert_error_code_to_mysql(err, 0, NULL); + + dict_table_close(innobase_table, TRUE, FALSE); + + if (error) { + goto cleanup; + } + } + + for (i = 0; i < form->s->keys; i++) { + + if (i != static_cast<uint>(primary_key_no)) { + + if ((error = create_index(trx, form, flags, + norm_name, i))) { + goto cleanup; + } + } + } + + /* Cache all the FTS indexes on this table in the FTS specific + structure. They are used for FTS indexed column update handling. 
*/ + if (flags2 & DICT_TF2_FTS) { + fts_t* fts = innobase_table->fts; + + ut_a(fts != NULL); + + dict_table_get_all_fts_indexes(innobase_table, fts->indexes); + } + + stmt = innobase_get_stmt(thd, &stmt_len); + + if (stmt) { + dberr_t err = row_table_add_foreign_constraints( + trx, stmt, stmt_len, norm_name, + create_info->options & HA_LEX_CREATE_TMP_TABLE); + + switch (err) { + + case DB_PARENT_NO_INDEX: + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. There is no index in the referenced" + " table where the referenced columns appear" + " as the first columns.\n", norm_name); + break; + + case DB_CHILD_NO_INDEX: + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. There is no index in the referencing" + " table where referencing columns appear" + " as the first columns.\n", norm_name); + break; + default: + break; + } + + error = convert_error_code_to_mysql(err, flags, NULL); + + if (error) { + goto cleanup; + } + } + + innobase_commit_low(trx); + + row_mysql_unlock_data_dictionary(trx); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + innobase_table = dict_table_open_on_name( + norm_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); + + DBUG_ASSERT(innobase_table != 0); + + innobase_copy_frm_flags_from_create_info(innobase_table, create_info); + + dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE); + + if (innobase_table) { + /* We update the highest file format in the system table + space, if this table has higher file format setting. */ + + trx_sys_file_format_max_upgrade( + (const char**) &innobase_file_format_max, + dict_table_get_format(innobase_table)); + } + + /* Load server stopword into FTS cache */ + if (flags2 & DICT_TF2_FTS) { + if (!innobase_fts_load_stopword(innobase_table, NULL, thd)) { + dict_table_close(innobase_table, FALSE, FALSE); + srv_active_wake_master_thread(); + trx_free_for_mysql(trx); + DBUG_RETURN(-1); + } + } + + /* Note: We can't call update_thd() as prebuilt will not be + setup at this stage and so we use thd. */ + + /* We need to copy the AUTOINC value from the old table if + this is an ALTER|OPTIMIZE TABLE or CREATE INDEX because CREATE INDEX + does a table copy too. If query was one of : + + CREATE TABLE ...AUTO_INCREMENT = x; or + ALTER TABLE...AUTO_INCREMENT = x; or + OPTIMIZE TABLE t; or + CREATE INDEX x on t(...); + + Find out a table definition from the dictionary and get + the current value of the auto increment field. Set a new + value to the auto increment field if the value is greater + than the maximum value in the column. 
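+	For example (hypothetical): ALTER TABLE t1 AUTO_INCREMENT=100
+	arrives here with create_info->auto_increment_value == 100, which
+	is stored into the table object by dict_table_autoinc_initialize()
+	below.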
*/ + + if (((create_info->used_fields & HA_CREATE_USED_AUTO) + || thd_sql_command(thd) == SQLCOM_ALTER_TABLE + || thd_sql_command(thd) == SQLCOM_OPTIMIZE + || thd_sql_command(thd) == SQLCOM_CREATE_INDEX) + && create_info->auto_increment_value > 0) { + + auto_inc_value = create_info->auto_increment_value; + + dict_table_autoinc_lock(innobase_table); + dict_table_autoinc_initialize(innobase_table, auto_inc_value); + dict_table_autoinc_unlock(innobase_table); + } + + dict_table_close(innobase_table, FALSE, FALSE); + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + trx_free_for_mysql(trx); + + DBUG_RETURN(0); + +cleanup: + trx_rollback_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + DBUG_RETURN(error); +} + +/*****************************************************************//** +Discards or imports an InnoDB tablespace. +@return 0 == success, -1 == error */ +UNIV_INTERN +int +ha_innobase::discard_or_import_tablespace( +/*======================================*/ + my_bool discard) /*!< in: TRUE if discard, else import */ +{ + dberr_t err; + dict_table_t* dict_table; + + DBUG_ENTER("ha_innobase::discard_or_import_tablespace"); + + ut_a(prebuilt->trx); + ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + dict_table = prebuilt->table; + + if (dict_table->space == TRX_SYS_SPACE) { + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_IN_SYSTEM_TABLESPACE, + table->s->table_name.str); + + DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); + } + + trx_start_if_not_started(prebuilt->trx); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads. */ + trx_search_latch_release_if_reserved(prebuilt->trx); + + /* Obtain an exclusive lock on the table. */ + err = row_mysql_lock_table( + prebuilt->trx, dict_table, LOCK_X, + discard ? "setting table lock for DISCARD TABLESPACE" + : "setting table lock for IMPORT TABLESPACE"); + + if (err != DB_SUCCESS) { + /* unable to lock the table: do nothing */ + } else if (discard) { + + /* Discarding an already discarded tablespace should be an + idempotent operation. Also, if the .ibd file is missing the + user may want to set the DISCARD flag in order to IMPORT + a new tablespace. */ + + if (dict_table->ibd_file_missing) { + ib_senderrf( + prebuilt->trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING, + table->s->table_name.str); + } + + err = row_discard_tablespace_for_mysql( + dict_table->name, prebuilt->trx); + + } else if (!dict_table->ibd_file_missing) { + /* Commit the transaction in order to + release the table lock. */ + trx_commit_for_mysql(prebuilt->trx); + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_EXISTS, table->s->table_name.str); + + DBUG_RETURN(HA_ERR_TABLE_EXIST); + } else { + err = row_import_for_mysql(dict_table, prebuilt); + + if (err == DB_SUCCESS) { + + if (table->found_next_number_field) { + dict_table_autoinc_lock(dict_table); + innobase_initialize_autoinc(); + dict_table_autoinc_unlock(dict_table); + } + + info(HA_STATUS_TIME + | HA_STATUS_CONST + | HA_STATUS_VARIABLE + | HA_STATUS_AUTO); + } + } + + /* Commit the transaction in order to release the table lock. 
*/ + trx_commit_for_mysql(prebuilt->trx); + + DBUG_RETURN(convert_error_code_to_mysql(err, dict_table->flags, NULL)); +} + +/*****************************************************************//** +Deletes all rows of an InnoDB table. +@return error number */ +UNIV_INTERN +int +ha_innobase::truncate() +/*===================*/ +{ + dberr_t err; + int error; + + DBUG_ENTER("ha_innobase::truncate"); + + if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + /* Get the transaction associated with the current thd, or create one + if not yet created, and update prebuilt->trx */ + + update_thd(ha_thd()); + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + if (UNIV_UNLIKELY(prebuilt->trx->fake_changes)) { + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + if (!trx_is_started(prebuilt->trx)) { + ++prebuilt->trx->will_lock; + } + /* Truncate the table in InnoDB */ + + err = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx); + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + switch (err) { + + case DB_TABLESPACE_DELETED: + case DB_TABLESPACE_NOT_FOUND: + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + (err == DB_TABLESPACE_DELETED ? + ER_TABLESPACE_DISCARDED : ER_TABLESPACE_MISSING), + table->s->table_name.str); + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + + default: + error = convert_error_code_to_mysql( + err, prebuilt->table->flags, + prebuilt->trx->mysql_thd); + table->status = STATUS_NOT_FOUND; + break; + } + DBUG_RETURN(error); +} + +/*****************************************************************//** +Drops a table from an InnoDB database. Before calling this function, +MySQL calls innobase_commit to commit the transaction of the current user. +Then the current user cannot have locks set on the table. Drop table +operation inside InnoDB will remove all locks any user has on the table +inside InnoDB. +@return error number */ +UNIV_INTERN +int +ha_innobase::delete_table( +/*======================*/ + const char* name) /*!< in: table name */ +{ + ulint name_len; + dberr_t err; + trx_t* parent_trx; + trx_t* trx; + THD* thd = ha_thd(); + char norm_name[FN_REFLEN]; + + DBUG_ENTER("ha_innobase::delete_table"); + + DBUG_EXECUTE_IF( + "test_normalize_table_name_low", + test_normalize_table_name_low(); + ); + DBUG_EXECUTE_IF( + "test_ut_format_name", + test_ut_format_name(); + ); + + /* Strangely, MySQL passes the table name without the '.frm' + extension, in contrast to ::create */ + normalize_table_name(norm_name, name); + + if (srv_read_only_mode) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (row_is_magic_monitor_table(norm_name) + && check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(HA_ERR_GENERIC); + } + + parent_trx = check_trx_exists(thd); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(parent_trx); + + trx = innobase_trx_allocate(thd); + + if (UNIV_UNLIKELY(trx->fake_changes)) { + innobase_commit_low(trx); + trx_free_for_mysql(trx); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + name_len = strlen(name); + + ut_a(name_len < 1000); + + /* Either the transaction is already flagged as a locking transaction + or it hasn't been started yet. */ + + ut_a(!trx_is_started(trx) || trx->will_lock > 0); + + /* We are doing a DDL operation. 
 */
+	++trx->will_lock;
+	trx->ddl = true;
+
+	/* Drop the table in InnoDB */
+	err = row_drop_table_for_mysql(
+		norm_name, trx, thd_sql_command(thd) == SQLCOM_DROP_DB);
+
+
+	if (err == DB_TABLE_NOT_FOUND
+	    && innobase_get_lower_case_table_names() == 1) {
+		char*	is_part = NULL;
+#ifdef __WIN__
+		is_part = strstr(norm_name, "#p#");
+#else
+		is_part = strstr(norm_name, "#P#");
+#endif /* __WIN__ */
+
+		if (is_part) {
+			char	par_case_name[FN_REFLEN];
+
+#ifndef __WIN__
+			/* Check for the table using the lower
+			case name, including the partition
+			separator "P" */
+			strcpy(par_case_name, norm_name);
+			innobase_casedn_str(par_case_name);
+#else
+			/* On the Windows platform, check
+			whether there exists a table name in
+			the system tables whose name was
+			not normalized to lower case */
+			normalize_table_name_low(
+				par_case_name, name, FALSE);
+#endif
+			err = row_drop_table_for_mysql(
+				par_case_name, trx,
+				thd_sql_command(thd) == SQLCOM_DROP_DB);
+		}
+	}
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	innobase_commit_low(trx);
+
+	trx_free_for_mysql(trx);
+
+	DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
+}
+
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+	handlerton*	hton,	/*!< in: handlerton of InnoDB */
+	char*	path)	/*!< in: database path; inside InnoDB the name
+			of the last directory in the path is used as
+			the database name: for example, in
+			'mysql/data/test' the database name is 'test' */
+{
+	ulint	len		= 0;
+	trx_t*	trx;
+	char*	ptr;
+	char*	namebuf;
+	THD*	thd		= current_thd;
+
+	/* Get the transaction associated with the current thd, or create one
+	if not yet created */
+
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	/* In the Windows plugin, thd = current_thd is always NULL */
+	if (thd) {
+		trx_t*	parent_trx = check_trx_exists(thd);
+
+		/* In case MySQL calls this in the middle of a SELECT
+		query, release possible adaptive hash latch to avoid
+		deadlocks of threads */
+
+		trx_search_latch_release_if_reserved(parent_trx);
+	}
+
+	ptr = strend(path) - 2;
+
+	while (ptr >= path && *ptr != '\\' && *ptr != '/') {
+		ptr--;
+		len++;
+	}
+
+	ptr++;
+	namebuf = (char*) my_malloc((uint) len + 2, MYF(0));
+
+	memcpy(namebuf, ptr, len);
+	namebuf[len] = '/';
+	namebuf[len + 1] = '\0';
+#ifdef __WIN__
+	innobase_casedn_str(namebuf);
+#endif
+	trx = innobase_trx_allocate(thd);
+
+	if (UNIV_UNLIKELY(trx->fake_changes)) {
+		my_free(namebuf);
+		innobase_commit_low(trx);
+		trx_free_for_mysql(trx);
+		return; /* ignore */
+	}
+
+	/* Either the transaction is already flagged as a locking transaction
+	or it hasn't been started yet. */
+
+	ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+	/* We are doing a DDL operation. */
+	++trx->will_lock;
+
+	row_drop_database_for_mysql(namebuf, trx);
+
+	my_free(namebuf);
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	innobase_commit_low(trx);
+	trx_free_for_mysql(trx);
+}
+
+/*********************************************************************//**
+Renames an InnoDB table. 
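+For example (hypothetical): RENAME TABLE test.t1 TO test.t2 arrives with
+from = "./test/t1" and to = "./test/t2"; both are normalized to the
+"test/t1" / "test/t2" form before row_rename_table_for_mysql() is called.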
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+innobase_rename_table(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	from,	/*!< in: old name of the table */
+	const char*	to)	/*!< in: new name of the table */
+{
+	dberr_t	error;
+	char	norm_to[FN_REFLEN];
+	char	norm_from[FN_REFLEN];
+
+	DBUG_ENTER("innobase_rename_table");
+	DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+	ut_ad(!srv_read_only_mode);
+
+	normalize_table_name(norm_to, to);
+	normalize_table_name(norm_from, from);
+
+	DEBUG_SYNC_C("innodb_rename_table_ready");
+
+	trx_start_if_not_started(trx);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations. */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Transaction must be flagged as a locking transaction or it hasn't
+	been started yet. */
+
+	ut_a(trx->will_lock > 0);
+
+	error = row_rename_table_for_mysql(
+		norm_from, norm_to, trx, TRUE);
+
+	if (error != DB_SUCCESS) {
+		if (error == DB_TABLE_NOT_FOUND
+		    && innobase_get_lower_case_table_names() == 1) {
+			char*	is_part = NULL;
+#ifdef __WIN__
+			is_part = strstr(norm_from, "#p#");
+#else
+			is_part = strstr(norm_from, "#P#");
+#endif /* __WIN__ */
+
+			if (is_part) {
+				char	par_case_name[FN_REFLEN];
+#ifndef __WIN__
+				/* Check for the table using the lower
+				case name, including the partition
+				separator "P" */
+				strcpy(par_case_name, norm_from);
+				innobase_casedn_str(par_case_name);
+#else
+				/* On the Windows platform, check
+				whether there exists a table name in
+				the system tables whose name was
+				not normalized to lower case */
+				normalize_table_name_low(
+					par_case_name, from, FALSE);
+#endif
+				trx_start_if_not_started(trx);
+				error = row_rename_table_for_mysql(
+					par_case_name, norm_to, trx, TRUE);
+			}
+		}
+
+		if (error == DB_SUCCESS) {
+#ifndef __WIN__
+			sql_print_warning("Rename of partition table %s "
+					  "succeeded after converting the "
+					  "name to lower case. The table may "
+					  "have been moved from a "
+					  "case-insensitive file system.\n",
+					  norm_from);
+#else
+			sql_print_warning("Rename of partition table %s "
+					  "succeeded after skipping the step "
+					  "of lower-casing the table name. "
+					  "The table may have been "
+					  "moved from a case-sensitive "
+					  "file system.\n",
+					  norm_from);
+#endif /* __WIN__ */
+		}
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	/* Flush the log to reduce probability that the .frm
+	files and the InnoDB data dictionary get out-of-sync
+	if the user runs with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Renames an InnoDB table. 
+@return 0 or error code */ +UNIV_INTERN +int +ha_innobase::rename_table( +/*======================*/ + const char* from, /*!< in: old name of the table */ + const char* to) /*!< in: new name of the table */ +{ + trx_t* trx; + dberr_t error; + trx_t* parent_trx; + THD* thd = ha_thd(); + + DBUG_ENTER("ha_innobase::rename_table"); + + if (srv_read_only_mode) { + ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + parent_trx = check_trx_exists(thd); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(parent_trx); + + trx = innobase_trx_allocate(thd); + if (UNIV_UNLIKELY(trx->fake_changes)) { + innobase_commit_low(trx); + trx_free_for_mysql(trx); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + /* We are doing a DDL operation. */ + ++trx->will_lock; + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + error = innobase_rename_table(trx, from, to); + + DEBUG_SYNC(thd, "after_innobase_rename_table"); + + innobase_commit_low(trx); + trx_free_for_mysql(trx); + + if (error == DB_SUCCESS) { + char norm_from[MAX_FULL_NAME_LEN]; + char norm_to[MAX_FULL_NAME_LEN]; + char errstr[512]; + dberr_t ret; + + normalize_table_name(norm_from, from); + normalize_table_name(norm_to, to); + + ret = dict_stats_rename_table(norm_from, norm_to, + errstr, sizeof(errstr)); + + if (ret != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: %s\n", errstr); + + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, + ER_LOCK_WAIT_TIMEOUT, errstr); + } + } + + /* Add a special case to handle the Duplicated Key error + and return DB_ERROR instead. + This is to avoid a possible SIGSEGV error from mysql error + handling code. Currently, mysql handles the Duplicated Key + error by re-entering the storage layer and getting dup key + info by calling get_dup_key(). This operation requires a valid + table handle ('row_prebuilt_t' structure) which could no + longer be available in the error handling stage. The suggested + solution is to report a 'table exists' error message (since + the dup key error here is due to an existing table whose name + is the one we are trying to rename to) and return the generic + error code. */ + if (error == DB_DUPLICATE_KEY) { + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to); + + error = DB_ERROR; + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/*********************************************************************//** +Estimates the number of index records in a range. 
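+For example (hypothetical): for SELECT * FROM t1 WHERE a BETWEEN 10 AND 20
+the optimizer calls this with min_key and max_key describing the two
+bounds; both are converted into InnoDB search tuples and handed to
+btr_estimate_n_rows_in_range().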
+@return estimated number of rows */
+UNIV_INTERN
+ha_rows
+ha_innobase::records_in_range(
+/*==========================*/
+	uint			keynr,		/*!< in: index number */
+	key_range		*min_key,	/*!< in: start key value of the
+						range, may also be 0 */
+	key_range		*max_key)	/*!< in: range end key val, may
+						also be 0 */
+{
+	KEY*		key;
+	dict_index_t*	index;
+	dtuple_t*	range_start;
+	dtuple_t*	range_end;
+	ib_int64_t	n_rows;
+	ulint		mode1;
+	ulint		mode2;
+	mem_heap_t*	heap;
+
+	DBUG_ENTER("records_in_range");
+
+	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	prebuilt->trx->op_info = (char*)"estimating records in index range";
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	active_index = keynr;
+
+	key = table->key_info + active_index;
+
+	index = innobase_get_index(keynr);
+
+	/* It is possible that the requested index cannot be found, due to
+	an inconsistency between the MySQL and InnoDB dictionary info.
+	The necessary message should have been printed in
+	innobase_get_index() */
+	if (dict_table_is_discarded(prebuilt->table)) {
+		n_rows = HA_POS_ERROR;
+		goto func_exit;
+	}
+	if (UNIV_UNLIKELY(!index)) {
+		n_rows = HA_POS_ERROR;
+		goto func_exit;
+	}
+	if (dict_index_is_corrupted(index)) {
+		n_rows = HA_ERR_INDEX_CORRUPT;
+		goto func_exit;
+	}
+	if (UNIV_UNLIKELY(!row_merge_is_index_usable(prebuilt->trx, index))) {
+		n_rows = HA_ERR_TABLE_DEF_CHANGED;
+		goto func_exit;
+	}
+
+	heap = mem_heap_create(2 * (key->actual_key_parts * sizeof(dfield_t)
+				    + sizeof(dtuple_t)));
+
+	range_start = dtuple_create(heap, key->actual_key_parts);
+	dict_index_copy_types(range_start, index, key->actual_key_parts);
+
+	range_end = dtuple_create(heap, key->actual_key_parts);
+	dict_index_copy_types(range_end, index, key->actual_key_parts);
+
+	row_sel_convert_mysql_key_to_innobase(
+		range_start,
+		prebuilt->srch_key_val1,
+		prebuilt->srch_key_val_len,
+		index,
+		(byte*) (min_key ? min_key->key :
+			 (const uchar*) 0),
+		(ulint) (min_key ? min_key->length : 0),
+		prebuilt->trx);
+	DBUG_ASSERT(min_key
+		    ? range_start->n_fields > 0
+		    : range_start->n_fields == 0);
+
+	row_sel_convert_mysql_key_to_innobase(
+		range_end,
+		prebuilt->srch_key_val2,
+		prebuilt->srch_key_val_len,
+		index,
+		(byte*) (max_key ? max_key->key :
+			 (const uchar*) 0),
+		(ulint) (max_key ? max_key->length : 0),
+		prebuilt->trx);
+	DBUG_ASSERT(max_key
+		    ? range_end->n_fields > 0
+		    : range_end->n_fields == 0);
+
+	mode1 = convert_search_mode_to_innobase(min_key ? min_key->flag :
+						HA_READ_KEY_EXACT);
+	mode2 = convert_search_mode_to_innobase(max_key ? max_key->flag :
+						HA_READ_KEY_EXACT);
+
+	if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) {
+
+		n_rows = btr_estimate_n_rows_in_range(index, range_start,
+						      mode1, range_end,
+						      mode2);
+	} else {
+
+		n_rows = HA_POS_ERROR;
+	}
+
+	mem_heap_free(heap);
+
+func_exit:
+
+	prebuilt->trx->op_info = (char*)"";
+
+	/* The MySQL optimizer seems to believe an estimate of 0 rows is
+	always accurate and may return the result 'Empty set' based on that.
+	The accuracy is not guaranteed, and even if it were, for a locking
+	read we should anyway perform the search to set the next-key lock.
+	Add 1 to the value to make sure MySQL does not make the assumption! 
*/ + + if (n_rows == 0) { + n_rows = 1; + } + + DBUG_RETURN((ha_rows) n_rows); +} + +/*********************************************************************//** +Gives an UPPER BOUND to the number of rows in a table. This is used in +filesort.cc. +@return upper bound of rows */ +UNIV_INTERN +ha_rows +ha_innobase::estimate_rows_upper_bound() +/*====================================*/ +{ + const dict_index_t* index; + ulonglong estimate; + ulonglong local_data_file_length; + ulint stat_n_leaf_pages; + + DBUG_ENTER("estimate_rows_upper_bound"); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. */ + + update_thd(ha_thd()); + + prebuilt->trx->op_info = "calculating upper bound for table rows"; + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + index = dict_table_get_first_index(prebuilt->table); + + stat_n_leaf_pages = index->stat_n_leaf_pages; + + ut_a(stat_n_leaf_pages > 0); + + local_data_file_length = + ((ulonglong) stat_n_leaf_pages) * UNIV_PAGE_SIZE; + + /* Calculate a minimum length for a clustered index record and from + that an upper bound for the number of rows. Since we only calculate + new statistics in row0mysql.cc when a table has grown by a threshold + factor, we must add a safety factor 2 in front of the formula below. */ + + estimate = 2 * local_data_file_length + / dict_index_calc_min_rec_len(index); + + prebuilt->trx->op_info = ""; + + DBUG_RETURN((ha_rows) estimate); +} + +/*********************************************************************//** +How many seeks it will take to read through the table. This is to be +comparable to the number returned by records_in_range so that we can +decide if we should scan the table or use keys. +@return estimated time measured in disk seeks */ +UNIV_INTERN +double +ha_innobase::scan_time() +/*====================*/ +{ + /* Since MySQL seems to favor table scans too much over index + searches, we pretend that a sequential read takes the same time + as a random disk read, that is, we do not divide the following + by 10, which would be physically realistic. */ + + /* The locking below is disabled for performance reasons. Without + it we could end up returning uninitialized value to the caller, + which in the worst case could make some query plan go bogus or + issue a Valgrind warning. */ +#if 0 + /* avoid potential lock order violation with dict_table_stats_lock() + below */ + update_thd(ha_thd()); + trx_search_latch_release_if_reserved(prebuilt->trx); +#endif + + ulint stat_clustered_index_size; + +#if 0 + dict_table_stats_lock(prebuilt->table, RW_S_LATCH); +#endif + + ut_a(prebuilt->table->stat_initialized); + + stat_clustered_index_size = prebuilt->table->stat_clustered_index_size; + +#if 0 + dict_table_stats_unlock(prebuilt->table, RW_S_LATCH); +#endif + + return((double) stat_clustered_index_size); +} + +/******************************************************************//** +Calculate the time it takes to read a set of ranges through an index +This enables us to optimise reads for clustered indexes. 
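+For example (hypothetical numbers): reading 100 rows of a 10000-row table
+over 2 ranges via the clustered index, with scan_time() returning 50, is
+estimated below as 2 + (100 / 10000) * 50 = 2.5 seeks.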
+@return estimated time measured in disk seeks */ +UNIV_INTERN +double +ha_innobase::read_time( +/*===================*/ + uint index, /*!< in: key number */ + uint ranges, /*!< in: how many ranges */ + ha_rows rows) /*!< in: estimated number of rows in the ranges */ +{ + ha_rows total_rows; + double time_for_scan; + + if (index != table->s->primary_key) { + /* Not clustered */ + return(handler::read_time(index, ranges, rows)); + } + + if (rows <= 2) { + + return((double) rows); + } + + /* Assume that the read time is proportional to the scan time for all + rows + at most one seek per range. */ + + time_for_scan = scan_time(); + + if ((total_rows = estimate_rows_upper_bound()) < rows) { + + return(time_for_scan); + } + + return(ranges + (double) rows / (double) total_rows * time_for_scan); +} + +/******************************************************************//** +Return the size of the InnoDB memory buffer. */ +UNIV_INTERN +longlong +ha_innobase::get_memory_buffer_size() const +/*=======================================*/ +{ + return(innobase_buffer_pool_size); +} + +/*********************************************************************//** +Calculates the key number used inside MySQL for an Innobase index. We will +first check the "index translation table" for a match of the index to get +the index number. If there does not exist an "index translation table", +or not able to find the index in the translation table, then we will fall back +to the traditional way of looping through dict_index_t list to find a +match. In this case, we have to take into account if we generated a +default clustered index for the table +@return the key number used inside MySQL */ +static +int +innobase_get_mysql_key_number_for_index( +/*====================================*/ + INNOBASE_SHARE* share, /*!< in: share structure for index + translation table. */ + const TABLE* table, /*!< in: table in MySQL data + dictionary */ + dict_table_t* ib_table,/*!< in: table in Innodb data + dictionary */ + const dict_index_t* index) /*!< in: index */ +{ + const dict_index_t* ind; + unsigned int i; + + ut_a(index); + + /* If index does not belong to the table object of share structure + (ib_table comes from the share structure) search the index->table + object instead */ + if (index->table != ib_table) { + i = 0; + ind = dict_table_get_first_index(index->table); + + while (index != ind) { + ind = dict_table_get_next_index(ind); + i++; + } + + if (row_table_got_default_clust_index(index->table)) { + ut_a(i > 0); + i--; + } + + return(i); + } + + /* If index translation table exists, we will first check + the index through index translation table for a match. */ + if (share->idx_trans_tbl.index_mapping) { + for (i = 0; i < share->idx_trans_tbl.index_count; i++) { + if (share->idx_trans_tbl.index_mapping[i] == index) { + return(i); + } + } + + /* Print an error message if we cannot find the index + in the "index translation table". 
 */
+		if (*index->name != TEMP_INDEX_PREFIX) {
+			sql_print_error("Cannot find index %s in InnoDB "
+					"index translation table.",
+					index->name);
+		}
+	}
+
+	/* If we do not have an "index translation table", or we cannot
+	find the index in it, look for a matching index directly, using
+	the MySQL TABLE structure and the InnoDB dict_index_t list */
+	for (i = 0; i < table->s->keys; i++) {
+		ind = dict_table_get_index_on_name(
+			ib_table, table->key_info[i].name);
+
+		if (index == ind) {
+			return(i);
+		}
+	}
+
+	/* As a last resort, loop through the InnoDB indexes of the table;
+	a match here means the index is internal to InnoDB and has no
+	MySQL index number */
+	for (ind = dict_table_get_first_index(ib_table);
+	     ind != NULL;
+	     ind = dict_table_get_next_index(ind)) {
+		if (index == ind) {
+			/* A temp index is internal to InnoDB and is
+			not present in the MySQL index list, so there is
+			no need to print a mismatch warning for it. */
+			if (*(index->name) != TEMP_INDEX_PREFIX) {
+				sql_print_warning(
+					"Found index %s in the InnoDB index "
+					"list, but not its MySQL index "
+					"number. It could be an InnoDB "
+					"internal index.",
+					index->name);
+			}
+			return(-1);
+		}
+	}
+
+	ut_error;
+
+	return(-1);
+}
+
+/*********************************************************************//**
+Calculates the record-per-key value, excluding NULL values if
+innodb_stats_method is set to "nulls_ignored"
+@return estimated record per key value */
+static
+ha_rows
+innodb_rec_per_key(
+/*===============*/
+	dict_index_t*	index,		/*!< in: dict_index_t structure */
+	ulint		i,		/*!< in: the column we are
+					calculating rec per key */
+	ha_rows		records)	/*!< in: estimated total records */
+{
+	ha_rows		rec_per_key;
+	ib_uint64_t	n_diff;
+
+	ut_a(index->table->stat_initialized);
+
+	ut_ad(i < dict_index_get_n_unique(index));
+
+	n_diff = index->stat_n_diff_key_vals[i];
+
+	if (n_diff == 0) {
+
+		rec_per_key = records;
+	} else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) {
+		ib_uint64_t	n_null;
+		ib_uint64_t	n_non_null;
+
+		n_non_null = index->stat_n_non_null_key_vals[i];
+
+		/* In theory, index->stat_n_non_null_key_vals[i]
+		should always be less than the number of records.
+		Since this is a statistics value, it can show a
+		slight discrepancy. Make sure the number of null
+		values is not a negative number. */
+		if (records < n_non_null) {
+			n_null = 0;
+		} else {
+			n_null = records - n_non_null;
+		}
+
+		/* If the number of NULL values is the same as or
+		larger than the number of distinct values, we can
+		consider that the table consists mostly of NULL
+		values. Set rec_per_key to 1. */
+		if (n_diff <= n_null) {
+			rec_per_key = 1;
+		} else {
+			/* Need to exclude rows with NULL values from
+			the rec_per_key calculation */
+			rec_per_key = (ha_rows)
+				((records - n_null) / (n_diff - n_null));
+		}
+	} else {
+		DEBUG_SYNC_C("after_checking_for_0");
+		rec_per_key = (ha_rows) (records / n_diff);
+	}
+
+	return(rec_per_key);
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object. 
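+As a worked illustration (hypothetical numbers): for an index column with
+stats.records == 1000 and n_diff == 250, innodb_rec_per_key() returns 4,
+which the HA_STATUS_CONST code below halves to 2 before storing it in
+key_info[].rec_per_key.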
+@return HA_ERR_* error code or 0 */ +UNIV_INTERN +int +ha_innobase::info_low( +/*==================*/ + uint flag, /*!< in: what information is requested */ + bool is_analyze) +{ + dict_table_t* ib_table; + ha_rows rec_per_key; + ib_uint64_t n_rows; + char path[FN_REFLEN]; + os_file_stat_t stat_info; + + DBUG_ENTER("info"); + + /* If we are forcing recovery at a high level, we will suppress + statistics calculation on tables, because that may crash the + server if an index is badly corrupted. */ + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. */ + + update_thd(ha_thd()); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + prebuilt->trx->op_info = (char*)"returning various info to MySQL"; + + trx_search_latch_release_if_reserved(prebuilt->trx); + + ib_table = prebuilt->table; + DBUG_ASSERT(ib_table->n_ref_count > 0); + + if (flag & HA_STATUS_TIME) { + if (is_analyze || innobase_stats_on_metadata) { + + dict_stats_upd_option_t opt; + dberr_t ret; + + prebuilt->trx->op_info = "updating table statistics"; + + if (dict_stats_is_persistent_enabled(ib_table)) { + + if (is_analyze) { + opt = DICT_STATS_RECALC_PERSISTENT; + } else { + /* This is e.g. 'SHOW INDEXES', fetch + the persistent stats from disk. */ + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + ut_ad(!mutex_own(&dict_sys->mutex)); + ret = dict_stats_update(ib_table, opt); + + if (ret != DB_SUCCESS) { + prebuilt->trx->op_info = ""; + DBUG_RETURN(HA_ERR_GENERIC); + } + + prebuilt->trx->op_info = + "returning various info to MySQL"; + } + + my_snprintf(path, sizeof(path), "%s/%s%s", + mysql_data_home, ib_table->name, reg_ext); + + unpack_filename(path,path); + + /* Note that we do not know the access time of the table, + nor the CHECK TABLE time, nor the UPDATE or INSERT time. */ + + if (os_file_get_status(path, &stat_info, false) == DB_SUCCESS) { + stats.create_time = (ulong) stat_info.ctime; + } + } + + if (flag & HA_STATUS_VARIABLE) { + + ulint page_size; + ulint stat_clustered_index_size; + ulint stat_sum_of_other_index_sizes; + + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_lock(ib_table, RW_S_LATCH); + } + + ut_a(ib_table->stat_initialized); + + n_rows = ib_table->stat_n_rows; + + stat_clustered_index_size + = ib_table->stat_clustered_index_size; + + stat_sum_of_other_index_sizes + = ib_table->stat_sum_of_other_index_sizes; + + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_unlock(ib_table, RW_S_LATCH); + } + + /* + The MySQL optimizer seems to assume in a left join that n_rows + is an accurate estimate if it is zero. Of course, it is not, + since we do not have any locks on the rows yet at this phase. + Since SHOW TABLE STATUS seems to call this function with the + HA_STATUS_TIME flag set, while the left join optimizer does not + set that flag, we add one to a zero value if the flag is not + set. That way SHOW TABLE STATUS will show the best estimate, + while the optimizer never sees the table empty. */ + + if (n_rows == 0 && !(flag & HA_STATUS_TIME)) { + n_rows++; + } + + /* Fix bug#40386: Not flushing query cache after truncate. + n_rows can not be 0 unless the table is empty, set to 1 + instead. The original problem of bug#29507 is actually + fixed in the server code. 
*/ + if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) { + + n_rows = 1; + + /* We need to reset the prebuilt value too, otherwise + checks for values greater than the last value written + to the table will fail and the autoinc counter will + not be updated. This will force write_row() into + attempting an update of the table's AUTOINC counter. */ + + prebuilt->autoinc_last_value = 0; + } + + page_size = dict_table_zip_size(ib_table); + if (page_size == 0) { + page_size = UNIV_PAGE_SIZE; + } + + stats.records = (ha_rows) n_rows; + stats.deleted = 0; + stats.data_file_length + = ((ulonglong) stat_clustered_index_size) + * page_size; + stats.index_file_length + = ((ulonglong) stat_sum_of_other_index_sizes) + * page_size; + + /* Since fsp_get_available_space_in_free_extents() is + acquiring latches inside InnoDB, we do not call it if we + are asked by MySQL to avoid locking. Another reason to + avoid the call is that it uses quite a lot of CPU. + See Bug#38185. */ + if (flag & HA_STATUS_NO_LOCK + || !(flag & HA_STATUS_VARIABLE_EXTRA)) { + /* We do not update delete_length if no + locking is requested so the "old" value can + remain. delete_length is initialized to 0 in + the ha_statistics' constructor. Also we only + need delete_length to be set when + HA_STATUS_VARIABLE_EXTRA is set */ + } else if (UNIV_UNLIKELY + (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) { + /* Avoid accessing the tablespace if + innodb_crash_recovery is set to a high value. */ + stats.delete_length = 0; + } else { + ullint avail_space; + + avail_space = fsp_get_available_space_in_free_extents( + ib_table->space); + + if (avail_space == ULLINT_UNDEFINED) { + THD* thd; + char errbuf[MYSYS_STRERROR_SIZE]; + + thd = ha_thd(); + + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_CANT_GET_STAT, + "InnoDB: Trying to get the free " + "space for table %s but its " + "tablespace has been discarded or " + "the .ibd file is missing. Setting " + "the free space to zero. " + "(errno: %d - %s)", + ib_table->name, errno, + my_strerror(errbuf, sizeof(errbuf), + errno)); + + stats.delete_length = 0; + } else { + stats.delete_length = avail_space * 1024; + } + } + + stats.check_time = 0; + stats.mrr_length_per_rec = ref_length + sizeof(void*); + + if (stats.records == 0) { + stats.mean_rec_length = 0; + } else { + stats.mean_rec_length = (ulong) + (stats.data_file_length / stats.records); + } + } + + if (flag & HA_STATUS_CONST) { + ulong i; + /* Verify the number of index in InnoDB and MySQL + matches up. If prebuilt->clust_index_was_generated + holds, InnoDB defines GEN_CLUST_INDEX internally */ + ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes) + - prebuilt->clust_index_was_generated; + if (table->s->keys < num_innodb_index) { + /* If there are too many indexes defined + inside InnoDB, ignore those that are being + created, because MySQL will only consider + the fully built indexes here. */ + + for (const dict_index_t* index + = UT_LIST_GET_FIRST(ib_table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + /* First, online index creation is + completed inside InnoDB, and then + MySQL attempts to upgrade the + meta-data lock so that it can rebuild + the .frm file. If we get here in that + time frame, dict_index_is_online_ddl() + would not hold and the index would + still not be included in TABLE_SHARE. 
*/ + if (*index->name == TEMP_INDEX_PREFIX) { + num_innodb_index--; + } + } + + if (table->s->keys < num_innodb_index + && innobase_fts_check_doc_id_index( + ib_table, NULL, NULL) + == FTS_EXIST_DOC_ID_INDEX) { + num_innodb_index--; + } + } + + if (table->s->keys != num_innodb_index) { + sql_print_error("InnoDB: Table %s contains %lu " + "indexes inside InnoDB, which " + "is different from the number of " + "indexes %u defined in the MySQL ", + ib_table->name, num_innodb_index, + table->s->keys); + } + + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_lock(ib_table, RW_S_LATCH); + } + + ut_a(ib_table->stat_initialized); + + for (i = 0; i < table->s->keys; i++) { + ulong j; + /* We could get index quickly through internal + index mapping with the index translation table. + The identity of index (match up index name with + that of table->key_info[i]) is already verified in + innobase_get_index(). */ + dict_index_t* index = innobase_get_index(i); + + if (index == NULL) { + sql_print_error("Table %s contains fewer " + "indexes inside InnoDB than " + "are defined in the MySQL " + ".frm file. Have you mixed up " + ".frm files from different " + "installations? See " + REFMAN + "innodb-troubleshooting.html\n", + ib_table->name); + break; + } + + for (j = 0; j < table->key_info[i].actual_key_parts; j++) { + + if (table->key_info[i].flags & HA_FULLTEXT) { + /* The whole concept has no validity + for FTS indexes. */ + table->key_info[i].rec_per_key[j] = 1; + continue; + } + + if (j + 1 > index->n_uniq) { + sql_print_error( + "Index %s of %s has %lu columns" + " unique inside InnoDB, but " + "MySQL is asking statistics for" + " %lu columns. Have you mixed " + "up .frm files from different " + "installations? " + "See " REFMAN + "innodb-troubleshooting.html\n", + index->name, + ib_table->name, + (unsigned long) + index->n_uniq, j + 1); + break; + } + + rec_per_key = innodb_rec_per_key( + index, j, stats.records); + + /* Since MySQL seems to favor table scans + too much over index searches, we pretend + index selectivity is 2 times better than + our estimate: */ + + rec_per_key = rec_per_key / 2; + + if (rec_per_key == 0) { + rec_per_key = 1; + } + + table->key_info[i].rec_per_key[j] = + rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 : + (ulong) rec_per_key; + } + } + + if (!(flag & HA_STATUS_NO_LOCK)) { + dict_table_stats_unlock(ib_table, RW_S_LATCH); + } + } + + if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + + goto func_exit; + } + + if (flag & HA_STATUS_ERRKEY) { + const dict_index_t* err_index; + + ut_a(prebuilt->trx); + ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); + + err_index = trx_get_error_info(prebuilt->trx); + + if (err_index) { + errkey = innobase_get_mysql_key_number_for_index( + share, table, ib_table, err_index); + } else { + errkey = (unsigned int) ( + (prebuilt->trx->error_key_num + == ULINT_UNDEFINED) + ? ~0 + : prebuilt->trx->error_key_num); + } + } + + if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) { + stats.auto_increment_value = innobase_peek_autoinc(); + } + +func_exit: + prebuilt->trx->op_info = (char*)""; + + DBUG_RETURN(0); +} + +/*********************************************************************//** +Returns statistics information of the table to the MySQL interpreter, +in various fields of the handle object. 
+@return HA_ERR_* error code or 0 */
+UNIV_INTERN
+int
+ha_innobase::info(
+/*==============*/
+	uint	flag)	/*!< in: what information is requested */
+{
+	return(this->info_low(flag, false /* not ANALYZE */));
+}
+
+/**********************************************************************//**
+Updates index cardinalities of the table, based on random dives into
+each index tree. This does NOT calculate exact statistics on the table.
+@return HA_ADMIN_* error code or HA_ADMIN_OK */
+UNIV_INTERN
+int
+ha_innobase::analyze(
+/*=================*/
+	THD*		thd,		/*!< in: connection thread handle */
+	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
+{
+	int	ret;
+
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
+		return(HA_ADMIN_CORRUPT);
+	}
+
+	/* Simply call this->info_low() with all the flags
+	and request recalculation of the statistics */
+	ret = this->info_low(
+		HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+		true /* this is ANALYZE */);
+
+	if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) {
+		return(HA_ADMIN_CORRUPT);
+	}
+
+	if (ret != 0) {
+		return(HA_ADMIN_FAILED);
+	}
+
+	return(HA_ADMIN_OK);
+}
+
+/**********************************************************************//**
+This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
+the table in MySQL. */
+UNIV_INTERN
+int
+ha_innobase::optimize(
+/*==================*/
+	THD*		thd,		/*!< in: connection thread handle */
+	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
+{
+	/* FTS-FIXME: Since MySQL doesn't support engine-specific commands,
+	we have to hijack some existing command in order to be able to test
+	the new admin commands added in InnoDB's FTS support. For now, we
+	use MySQL's OPTIMIZE command, normally mapped to ALTER TABLE in
+	InnoDB (so it recreates the table anew), and map it to an FTS
+	sync/optimize pass when innodb_optimize_fulltext_only is set.
+
+	This works OK otherwise, but MySQL locks the entire table during
+	calls to OPTIMIZE, which is undesirable. */
+
+	if (innodb_optimize_fulltext_only) {
+		if (prebuilt->table->fts && prebuilt->table->fts->cache
+		    && !dict_table_is_discarded(prebuilt->table)) {
+			fts_sync_table(prebuilt->table);
+			fts_optimize_table(prebuilt->table);
+		}
+		return(HA_ADMIN_OK);
+	} else {
+
+		return(HA_ADMIN_TRY_ALTER);
+	}
+}
+
+/*******************************************************************//**
+Tries to check that an InnoDB table is not corrupted. If corruption is
+noticed, prints information about it to stderr. In case of corruption
+it may also assert a failure and crash the server.
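+Unless the QUICK option is given, each index tree is first validated with
+btr_validate_index(); every usable index is then scanned with
+row_check_index_for_mysql() and the row counts of non-FTS indexes are
+compared against that of the clustered index.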
+@return HA_ADMIN_CORRUPT or HA_ADMIN_OK */ +UNIV_INTERN +int +ha_innobase::check( +/*===============*/ + THD* thd, /*!< in: user thread handle */ + HA_CHECK_OPT* check_opt) /*!< in: check options */ +{ + dict_index_t* index; + ulint n_rows; + ulint n_rows_in_table = ULINT_UNDEFINED; + bool is_ok = true; + ulint old_isolation_level; + ibool table_corrupted; + + DBUG_ENTER("ha_innobase::check"); + DBUG_ASSERT(thd == ha_thd()); + ut_a(prebuilt->trx); + ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(prebuilt->trx == thd_to_trx(thd)); + + if (prebuilt->mysql_template == NULL) { + /* Build the template; we will use a dummy template + in index scans done in checking */ + + build_template(true); + } + + if (dict_table_is_discarded(prebuilt->table)) { + + ib_senderrf( + thd, + IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + DBUG_RETURN(HA_ADMIN_CORRUPT); + + } else if (prebuilt->table->ibd_file_missing) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + DBUG_RETURN(HA_ADMIN_CORRUPT); + } + + prebuilt->trx->op_info = "checking table"; + + old_isolation_level = prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we use always + REPEATABLE READ here */ + + prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + /* Check whether the table is already marked as corrupted + before running the check table */ + table_corrupted = prebuilt->table->corrupted; + + /* Reset table->corrupted bit so that check table can proceed to + do additional check */ + prebuilt->table->corrupted = FALSE; + + for (index = dict_table_get_first_index(prebuilt->table); + index != NULL; + index = dict_table_get_next_index(index)) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + /* If this is an index being created or dropped, skip */ + if (*index->name == TEMP_INDEX_PREFIX) { + continue; + } + + if (!(check_opt->flags & T_QUICK)) { + /* Enlarge the fatal lock wait timeout during + CHECK TABLE. */ + os_increment_counter_by_amount( + server_mutex, + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); + bool valid = btr_validate_index(index, prebuilt->trx); + + /* Restore the fatal lock wait timeout after + CHECK TABLE. */ + os_decrement_counter_by_amount( + server_mutex, + srv_fatal_semaphore_wait_threshold, + SRV_SEMAPHORE_WAIT_EXTENSION); + + if (!valid) { + is_ok = false; + + innobase_format_name( + index_name, sizeof index_name, + index->name, TRUE); + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index_name); + continue; + } + } + + /* Instead of invoking change_active_index(), set up + a dummy template for non-locking reads, disabling + access to the clustered index. 
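+	The assignments below mirror what change_active_index() would set up:
+	a ROW_MYSQL_DUMMY_TEMPLATE, an empty search tuple and
+	select_lock_type = LOCK_NONE.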
*/ + prebuilt->index = index; + + prebuilt->index_usable = row_merge_is_index_usable( + prebuilt->trx, prebuilt->index); + + if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + innobase_format_name( + index_name, sizeof index_name, + prebuilt->index->name, TRUE); + + if (dict_index_is_corrupted(prebuilt->index)) { + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_INDEX_CORRUPT, + "InnoDB: Index %s is marked as" + " corrupted", + index_name); + is_ok = false; + } else { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_TABLE_DEF_CHANGED, + "InnoDB: Insufficient history for" + " index %s", + index_name); + } + continue; + } + + prebuilt->sql_stat_start = TRUE; + prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE; + prebuilt->n_template = 0; + prebuilt->need_to_access_clustered = FALSE; + + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + prebuilt->select_lock_type = LOCK_NONE; + + if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) { + innobase_format_name( + index_name, sizeof index_name, + index->name, TRUE); + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index_name); + is_ok = false; + dict_set_corrupted( + index, prebuilt->trx, "CHECK TABLE-check index"); + } + + if (thd_killed(user_thd)) { + break; + } + +#if 0 + fprintf(stderr, "%lu entries in index %s\n", n_rows, + index->name); +#endif + + if (index == dict_table_get_first_index(prebuilt->table)) { + n_rows_in_table = n_rows; + } else if (!(index->type & DICT_FTS) + && (n_rows != n_rows_in_table)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: Index '%-.200s' contains %lu" + " entries, should be %lu.", + index->name, + (ulong) n_rows, + (ulong) n_rows_in_table); + is_ok = false; + dict_set_corrupted( + index, prebuilt->trx, + "CHECK TABLE; Wrong count"); + } + } + + if (table_corrupted) { + /* If some previous operation has marked the table as + corrupted in memory, and has not propagated such to + clustered index, we will do so here */ + index = dict_table_get_first_index(prebuilt->table); + + if (!dict_index_is_corrupted(index)) { + dict_set_corrupted( + index, prebuilt->trx, "CHECK TABLE"); + } + prebuilt->table->corrupted = TRUE; + } + + /* Restore the original isolation level */ + prebuilt->trx->isolation_level = old_isolation_level; + + /* We validate the whole adaptive hash index for all tables + at every CHECK TABLE only when QUICK flag is not present. */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (!(check_opt->flags & T_QUICK) && !btr_search_validate()) { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The adaptive hash index is corrupted."); + is_ok = false; + } +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ + + prebuilt->trx->op_info = ""; + if (thd_killed(user_thd)) { + thd_set_kill_status(user_thd); + } + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + return(HA_ADMIN_CORRUPT); + } + + DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT); +} + +/*************************************************************//** +Adds information about free space in the InnoDB tablespace to a table comment +which is printed out when a user calls SHOW TABLE STATUS. Adds also info on +foreign keys. 
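+The output is assembled in srv_dict_tmpfile under srv_dict_tmpfile_mutex, and
+the combined comment is capped below 64000 bytes.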
+@return table comment + InnoDB free space + info on foreign keys */ +UNIV_INTERN +char* +ha_innobase::update_table_comment( +/*==============================*/ + const char* comment)/*!< in: table comment defined by user */ +{ + uint length = (uint) strlen(comment); + char* str; + long flen; + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. */ + + if (length > 64000 - 3) { + return((char*) comment); /* string too long */ + } + + update_thd(ha_thd()); + + prebuilt->trx->op_info = (char*)"returning table comment"; + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + str = NULL; + + /* output the data to a temporary file */ + + if (!srv_read_only_mode) { + + mutex_enter(&srv_dict_tmpfile_mutex); + + rewind(srv_dict_tmpfile); + + fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB", + fsp_get_available_space_in_free_extents( + prebuilt->table->space)); + + dict_print_info_on_foreign_keys( + FALSE, srv_dict_tmpfile, prebuilt->trx, + prebuilt->table); + + flen = ftell(srv_dict_tmpfile); + + if (flen < 0) { + flen = 0; + } else if (length + flen + 3 > 64000) { + flen = 64000 - 3 - length; + } + + /* allocate buffer for the full string, and + read the contents of the temporary file */ + + str = (char*) my_malloc(length + flen + 3, MYF(0)); + + if (str) { + char* pos = str + length; + if (length) { + memcpy(str, comment, length); + *pos++ = ';'; + *pos++ = ' '; + } + rewind(srv_dict_tmpfile); + flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile); + pos[flen] = 0; + } + + mutex_exit(&srv_dict_tmpfile_mutex); + } + + prebuilt->trx->op_info = (char*)""; + + return(str ? str : (char*) comment); +} + +/*******************************************************************//** +Gets the foreign key create info for a table stored in InnoDB. +@return own: character string in the form which can be inserted to the +CREATE TABLE statement, MUST be freed with +ha_innobase::free_foreign_key_create_info */ +UNIV_INTERN +char* +ha_innobase::get_foreign_key_create_info(void) +/*==========================================*/ +{ + long flen; + char* str = 0; + + ut_a(prebuilt != NULL); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. 
*/ + + update_thd(ha_thd()); + + prebuilt->trx->op_info = (char*)"getting info on foreign keys"; + + /* In case MySQL calls this in the middle of a SELECT query, + release possible adaptive hash latch to avoid + deadlocks of threads */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + if (!srv_read_only_mode) { + mutex_enter(&srv_dict_tmpfile_mutex); + rewind(srv_dict_tmpfile); + + /* Output the data to a temporary file */ + dict_print_info_on_foreign_keys( + TRUE, srv_dict_tmpfile, prebuilt->trx, + prebuilt->table); + + prebuilt->trx->op_info = (char*)""; + + flen = ftell(srv_dict_tmpfile); + + if (flen < 0) { + flen = 0; + } + + /* Allocate buffer for the string, and + read the contents of the temporary file */ + + str = (char*) my_malloc(flen + 1, MYF(0)); + + if (str) { + rewind(srv_dict_tmpfile); + flen = (uint) fread(str, 1, flen, srv_dict_tmpfile); + str[flen] = 0; + } + + mutex_exit(&srv_dict_tmpfile_mutex); + } + + return(str); +} + + +/***********************************************************************//** +Maps a InnoDB foreign key constraint to a equivalent MySQL foreign key info. +@return pointer to foreign key info */ +static +FOREIGN_KEY_INFO* +get_foreign_key_info( +/*=================*/ + THD* thd, /*!< in: user thread handle */ + dict_foreign_t* foreign) /*!< in: foreign key constraint */ +{ + FOREIGN_KEY_INFO f_key_info; + FOREIGN_KEY_INFO* pf_key_info; + uint i = 0; + ulint len; + char tmp_buff[NAME_LEN+1]; + char name_buff[NAME_LEN+1]; + const char* ptr; + LEX_STRING* referenced_key_name; + LEX_STRING* name = NULL; + + ptr = dict_remove_db_name(foreign->id); + f_key_info.foreign_id = thd_make_lex_string(thd, 0, ptr, + (uint) strlen(ptr), 1); + + /* Name format: database name, '/', table name, '\0' */ + + /* Referenced (parent) database name */ + len = dict_get_db_name_len(foreign->referenced_table_name); + ut_a(len < sizeof(tmp_buff)); + ut_memcpy(tmp_buff, foreign->referenced_table_name, len); + tmp_buff[len] = 0; + + len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff)); + f_key_info.referenced_db = thd_make_lex_string( + thd, 0, name_buff, static_cast<unsigned int>(len), 1); + + /* Referenced (parent) table name */ + ptr = dict_remove_db_name(foreign->referenced_table_name); + len = filename_to_tablename(ptr, name_buff, sizeof(name_buff)); + f_key_info.referenced_table = thd_make_lex_string( + thd, 0, name_buff, static_cast<unsigned int>(len), 1); + + /* Dependent (child) database name */ + len = dict_get_db_name_len(foreign->foreign_table_name); + ut_a(len < sizeof(tmp_buff)); + ut_memcpy(tmp_buff, foreign->foreign_table_name, len); + tmp_buff[len] = 0; + + len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff)); + f_key_info.foreign_db = thd_make_lex_string( + thd, 0, name_buff, static_cast<unsigned int>(len), 1); + + /* Dependent (child) table name */ + ptr = dict_remove_db_name(foreign->foreign_table_name); + len = filename_to_tablename(ptr, name_buff, sizeof(name_buff)); + f_key_info.foreign_table = thd_make_lex_string( + thd, 0, name_buff, static_cast<unsigned int>(len), 1); + + do { + ptr = foreign->foreign_col_names[i]; + name = thd_make_lex_string(thd, name, ptr, + (uint) strlen(ptr), 1); + f_key_info.foreign_fields.push_back(name); + ptr = foreign->referenced_col_names[i]; + name = thd_make_lex_string(thd, name, ptr, + (uint) strlen(ptr), 1); + f_key_info.referenced_fields.push_back(name); + } while (++i < foreign->n_fields); + + if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) { + len = 7; + ptr = "CASCADE"; + } 
else if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + len = 8; + ptr = "SET NULL"; + } else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + len = 9; + ptr = "NO ACTION"; + } else { + len = 8; + ptr = "RESTRICT"; + } + + f_key_info.delete_method = thd_make_lex_string( + thd, f_key_info.delete_method, ptr, + static_cast<unsigned int>(len), 1); + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + len = 7; + ptr = "CASCADE"; + } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + len = 8; + ptr = "SET NULL"; + } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + len = 9; + ptr = "NO ACTION"; + } else { + len = 8; + ptr = "RESTRICT"; + } + + f_key_info.update_method = thd_make_lex_string( + thd, f_key_info.update_method, ptr, + static_cast<unsigned int>(len), 1); + + if (foreign->referenced_index && foreign->referenced_index->name) { + referenced_key_name = thd_make_lex_string(thd, + f_key_info.referenced_key_name, + foreign->referenced_index->name, + (uint) strlen(foreign->referenced_index->name), + 1); + } else { + referenced_key_name = NULL; + } + + f_key_info.referenced_key_name = referenced_key_name; + + pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info, + sizeof(FOREIGN_KEY_INFO)); + + return(pf_key_info); +} + +/*******************************************************************//** +Gets the list of foreign keys in this table. +@return always 0, that is, always succeeds */ +UNIV_INTERN +int +ha_innobase::get_foreign_key_list( +/*==============================*/ + THD* thd, /*!< in: user thread handle */ + List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */ +{ + FOREIGN_KEY_INFO* pf_key_info; + dict_foreign_t* foreign; + + ut_a(prebuilt != NULL); + update_thd(ha_thd()); + + prebuilt->trx->op_info = "getting list of foreign keys"; + + trx_search_latch_release_if_reserved(prebuilt->trx); + + mutex_enter(&(dict_sys->mutex)); + + for (dict_foreign_set::iterator it + = prebuilt->table->foreign_set.begin(); + it != prebuilt->table->foreign_set.end(); + ++it) { + + foreign = *it; + + pf_key_info = get_foreign_key_info(thd, foreign); + if (pf_key_info) { + f_key_list->push_back(pf_key_info); + } + } + + mutex_exit(&(dict_sys->mutex)); + + prebuilt->trx->op_info = ""; + + return(0); +} + +/*******************************************************************//** +Gets the set of foreign keys where this table is the referenced table. +@return always 0, that is, always succeeds */ +UNIV_INTERN +int +ha_innobase::get_parent_foreign_key_list( +/*=====================================*/ + THD* thd, /*!< in: user thread handle */ + List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */ +{ + FOREIGN_KEY_INFO* pf_key_info; + dict_foreign_t* foreign; + + ut_a(prebuilt != NULL); + update_thd(ha_thd()); + + prebuilt->trx->op_info = "getting list of referencing foreign keys"; + + trx_search_latch_release_if_reserved(prebuilt->trx); + + mutex_enter(&(dict_sys->mutex)); + + for (dict_foreign_set::iterator it + = prebuilt->table->referenced_set.begin(); + it != prebuilt->table->referenced_set.end(); + ++it) { + + foreign = *it; + + pf_key_info = get_foreign_key_info(thd, foreign); + if (pf_key_info) { + f_key_list->push_back(pf_key_info); + } + } + + mutex_exit(&(dict_sys->mutex)); + + prebuilt->trx->op_info = ""; + + return(0); +} + +/*****************************************************************//** +Checks if ALTER TABLE may change the storage engine of the table. 
+Changing storage engines is not allowed for tables for which there +are foreign key constraints (parent or child tables). +@return TRUE if can switch engines */ +UNIV_INTERN +bool +ha_innobase::can_switch_engines(void) +/*=================================*/ +{ + bool can_switch; + + DBUG_ENTER("ha_innobase::can_switch_engines"); + update_thd(); + + prebuilt->trx->op_info = + "determining if there are foreign key constraints"; + row_mysql_freeze_data_dictionary(prebuilt->trx); + + can_switch = prebuilt->table->referenced_set.empty() + && prebuilt->table->foreign_set.empty(); + + row_mysql_unfreeze_data_dictionary(prebuilt->trx); + prebuilt->trx->op_info = ""; + + DBUG_RETURN(can_switch); +} + +/*******************************************************************//** +Checks if a table is referenced by a foreign key. The MySQL manual states that +a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a +delete is then allowed internally to resolve a duplicate key conflict in +REPLACE, not an update. +@return > 0 if referenced by a FOREIGN KEY */ +UNIV_INTERN +uint +ha_innobase::referenced_by_foreign_key(void) +/*========================================*/ +{ + if (dict_table_is_referenced_by_foreign_key(prebuilt->table)) { + + return(1); + } + + return(0); +} + +/*******************************************************************//** +Frees the foreign key create info for a table stored in InnoDB, if it is +non-NULL. */ +UNIV_INTERN +void +ha_innobase::free_foreign_key_create_info( +/*======================================*/ + char* str) /*!< in, own: create info string to free */ +{ + if (str) { + my_free(str); + } +} + +/*******************************************************************//** +Tells something additional to the handler about how to do things. +@return 0 or error number */ +UNIV_INTERN +int +ha_innobase::extra( +/*===============*/ + enum ha_extra_function operation) + /*!< in: HA_EXTRA_FLUSH or some other flag */ +{ + check_trx_exists(ha_thd()); + + /* Warning: since it is not sure that MySQL calls external_lock + before calling this function, the trx field in prebuilt can be + obsolete! */ + + switch (operation) { + case HA_EXTRA_FLUSH: + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + break; + case HA_EXTRA_RESET_STATE: + reset_template(); + thd_to_trx(ha_thd())->duplicates = 0; + break; + case HA_EXTRA_NO_KEYREAD: + prebuilt->read_just_key = 0; + break; + case HA_EXTRA_KEYREAD: + prebuilt->read_just_key = 1; + break; + case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + prebuilt->keep_other_fields_on_keyread = 1; + break; + + /* IMPORTANT: prebuilt->trx can be obsolete in + this method, because it is not sure that MySQL + calls external_lock before this method with the + parameters below. We must not invoke update_thd() + either, because the calling threads may change. + CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! 
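+	For that reason the cases below operate on thd_to_trx(ha_thd())
+	directly instead of on prebuilt->trx.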
*/
+	case HA_EXTRA_INSERT_WITH_UPDATE:
+		thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
+		break;
+	case HA_EXTRA_NO_IGNORE_DUP_KEY:
+		thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
+		break;
+	case HA_EXTRA_WRITE_CAN_REPLACE:
+		thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
+		break;
+	case HA_EXTRA_WRITE_CANNOT_REPLACE:
+		thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
+		break;
+	default: /* Do nothing */
+		;
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+Resets the handler state for a new statement: frees a possible BLOB
+heap, resets the prebuilt template and clears the statement-level
+AUTOINC counter.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::reset()
+/*================*/
+{
+	if (prebuilt->blob_heap) {
+		row_mysql_prebuilt_free_blob_heap(prebuilt);
+	}
+
+	reset_template();
+	ds_mrr.reset();
+
+	/* TODO: This should really be reset in reset_template() but for now
+	it's safer to do it explicitly here. */
+
+	/* This is a statement level counter. */
+	prebuilt->autoinc_last_value = 0;
+
+	return(0);
+}
+
+/******************************************************************//**
+MySQL calls this function at the start of each SQL statement inside LOCK
+TABLES. Inside LOCK TABLES the ::external_lock method does not work to
+mark SQL statement borders. Note also a special case: if a temporary table
+is created inside LOCK TABLES, MySQL has not called external_lock() at all
+on that table.
+MySQL-5.0 also calls this before each statement in an execution of a stored
+procedure. To make the execution more deterministic for binlogging, MySQL-5.0
+locks all tables involved in a stored procedure with full explicit table
+locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
+procedure.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::start_stmt(
+/*====================*/
+	THD*		thd,	/*!< in: handle to the user thread */
+	thr_lock_type	lock_type)
+{
+	trx_t*		trx;
+	DBUG_ENTER("ha_innobase::start_stmt");
+
+	update_thd(thd);
+
+	trx = prebuilt->trx;
+
+	/* Here we release the search latch and the InnoDB thread FIFO ticket
+	if they were reserved. They should have been released already at the
+	end of the previous statement, but because inside LOCK TABLES the
+	lock count method does not work to mark the end of a SELECT statement,
+	that may not be the case. We MUST release the search latch before an
+	INSERT, for example. */
+
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
+
+	/* Reset the AUTOINC statement level counter for multi-row INSERTs.
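+	get_auto_increment() re-establishes trx->n_autoinc_rows from
+	nb_desired_values on the next multi-row INSERT; see below.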
*/ + trx->n_autoinc_rows = 0; + + prebuilt->sql_stat_start = TRUE; + prebuilt->hint_need_to_fetch_extra_cols = 0; + reset_template(); + + if (dict_table_is_temporary(prebuilt->table) + && prebuilt->mysql_has_locked + && prebuilt->select_lock_type == LOCK_NONE) { + dberr_t error; + + switch (thd_sql_command(thd)) { + case SQLCOM_INSERT: + case SQLCOM_UPDATE: + case SQLCOM_DELETE: + init_table_handle_for_HANDLER(); + prebuilt->select_lock_type = LOCK_X; + prebuilt->stored_select_lock_type = LOCK_X; + error = row_lock_table_for_mysql(prebuilt, NULL, 1); + + if (error != DB_SUCCESS) { + int st = convert_error_code_to_mysql( + error, 0, thd); + DBUG_RETURN(st); + } + break; + } + } + + if (!prebuilt->mysql_has_locked) { + /* This handle is for a temporary table created inside + this same LOCK TABLES; since MySQL does NOT call external_lock + in this case, we must use x-row locks inside InnoDB to be + prepared for an update of a row */ + + prebuilt->select_lock_type = LOCK_X; + + } else if (trx->isolation_level != TRX_ISO_SERIALIZABLE + && thd_sql_command(thd) == SQLCOM_SELECT + && lock_type == TL_READ) { + + /* For other than temporary tables, we obtain + no lock for consistent read (plain SELECT). */ + + prebuilt->select_lock_type = LOCK_NONE; + } else { + /* Not a consistent read: restore the + select_lock_type value. The value of + stored_select_lock_type was decided in: + 1) ::store_lock(), + 2) ::external_lock(), + 3) ::init_table_handle_for_HANDLER(), and + 4) ::transactional_table_lock(). */ + + ut_a(prebuilt->stored_select_lock_type != LOCK_NONE_UNSET); + prebuilt->select_lock_type = prebuilt->stored_select_lock_type; + } + + *trx->detailed_error = 0; + + innobase_register_trx(ht, thd, trx); + + if (!trx_is_started(trx)) { + ++trx->will_lock; + } + + DBUG_RETURN(0); +} + +/******************************************************************//** +Maps a MySQL trx isolation level code to the InnoDB isolation level code +@return InnoDB isolation level */ +static inline +ulint +innobase_map_isolation_level( +/*=========================*/ + enum_tx_isolation iso) /*!< in: MySQL isolation level code */ +{ + switch (iso) { + case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ); + case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED); + case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE); + case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED); + } + + ut_error; + + return(0); +} + +/******************************************************************//** +As MySQL will execute an external lock for every new table it uses when it +starts to process an SQL statement (an exception is when MySQL calls +start_stmt for the handle) we can use this function to store the pointer to +the THD in the handle. We will also use this function to communicate +to InnoDB that a new SQL statement has started and that we must store a +savepoint to our transaction handle, so that we are able to roll back +the SQL statement in case of an error. +@return 0 */ +UNIV_INTERN +int +ha_innobase::external_lock( +/*=======================*/ + THD* thd, /*!< in: handle to the user thread */ + int lock_type) /*!< in: lock type */ +{ + trx_t* trx; + + DBUG_ENTER("ha_innobase::external_lock"); + DBUG_PRINT("enter",("lock_type: %d", lock_type)); + + update_thd(thd); + + /* Statement based binlogging does not work in isolation level + READ UNCOMMITTED and READ COMMITTED since the necessary + locks cannot be taken. In this case, we print an + informative error message and return with an error. 
+ Note: decide_logging_format would give the same error message, + except it cannot give the extra details. */ + + if (lock_type == F_WRLCK + && !(table_flags() & HA_BINLOG_STMT_CAPABLE) + && thd_binlog_format(thd) == BINLOG_FORMAT_STMT + && thd_binlog_filter_ok(thd) + && thd_sqlcom_can_generate_row_events(thd)) { + bool skip = 0; + /* used by test case */ + DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;); + if (!skip) { + my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0), + " InnoDB is limited to row-logging when " + "transaction isolation level is " + "READ COMMITTED or READ UNCOMMITTED."); + DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE); + } + } + + /* Check for UPDATEs in read-only mode. */ + if (srv_read_only_mode + && (thd_sql_command(thd) == SQLCOM_UPDATE + || thd_sql_command(thd) == SQLCOM_INSERT + || thd_sql_command(thd) == SQLCOM_REPLACE + || thd_sql_command(thd) == SQLCOM_DROP_TABLE + || thd_sql_command(thd) == SQLCOM_ALTER_TABLE + || thd_sql_command(thd) == SQLCOM_OPTIMIZE + || (thd_sql_command(thd) == SQLCOM_CREATE_TABLE + && lock_type == F_WRLCK) + || thd_sql_command(thd) == SQLCOM_CREATE_INDEX + || thd_sql_command(thd) == SQLCOM_DROP_INDEX + || thd_sql_command(thd) == SQLCOM_DELETE)) { + + if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE) + { + ib_senderrf(thd, IB_LOG_LEVEL_WARN, + ER_INNODB_READ_ONLY); + DBUG_RETURN(HA_ERR_INNODB_READ_ONLY); + } else { + ib_senderrf(thd, IB_LOG_LEVEL_WARN, + ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + } + + trx = prebuilt->trx; + + prebuilt->sql_stat_start = TRUE; + prebuilt->hint_need_to_fetch_extra_cols = 0; + + reset_template(); + + switch (prebuilt->table->quiesce) { + case QUIESCE_START: + /* Check for FLUSH TABLE t WITH READ LOCK; */ + if (!srv_read_only_mode + && thd_sql_command(thd) == SQLCOM_FLUSH + && lock_type == F_RDLCK) { + + row_quiesce_table_start(prebuilt->table, trx); + + /* Use the transaction instance to track UNLOCK + TABLES. It can be done via START TRANSACTION; too + implicitly. */ + + ++trx->flush_tables; + } + break; + + case QUIESCE_COMPLETE: + /* Check for UNLOCK TABLES; implicit or explicit + or trx interruption. */ + if (trx->flush_tables > 0 + && (lock_type == F_UNLCK || trx_is_interrupted(trx))) { + + row_quiesce_table_complete(prebuilt->table, trx); + + ut_a(trx->flush_tables > 0); + --trx->flush_tables; + } + + break; + + case QUIESCE_NONE: + break; + } + + if (lock_type == F_WRLCK) { + + /* If this is a SELECT, then it is in UPDATE TABLE ... + or SELECT ... FOR UPDATE */ + prebuilt->select_lock_type = LOCK_X; + prebuilt->stored_select_lock_type = LOCK_X; + } + + if (lock_type != F_UNLCK) { + /* MySQL is setting a new table lock */ + + *trx->detailed_error = 0; + + innobase_register_trx(ht, thd, trx); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE + && prebuilt->select_lock_type == LOCK_NONE + && thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + /* To get serializable execution, we let InnoDB + conceptually add 'LOCK IN SHARE MODE' to all SELECTs + which otherwise would have been consistent reads. An + exception is consistent reads in the AUTOCOMMIT=1 mode: + we know that they are read-only transactions, and they + can be serialized also if performed as consistent + reads. */ + + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } + + /* Starting from 4.1.9, no InnoDB table lock is taken in LOCK + TABLES if AUTOCOMMIT=1. 
It does not make much sense to acquire + an InnoDB table lock if it is released immediately at the end + of LOCK TABLES, and InnoDB's table locks in that case cause + VERY easily deadlocks. + + We do not set InnoDB table locks if user has not explicitly + requested a table lock. Note that thd_in_lock_tables(thd) + can hold in some cases, e.g., at the start of a stored + procedure call (SQLCOM_CALL). */ + + if (prebuilt->select_lock_type != LOCK_NONE) { + + if (thd_sql_command(thd) == SQLCOM_LOCK_TABLES + && THDVAR(thd, table_locks) + && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT) + && thd_in_lock_tables(thd)) { + + dberr_t error = row_lock_table_for_mysql( + prebuilt, NULL, 0); + + if (error != DB_SUCCESS) { + DBUG_RETURN( + convert_error_code_to_mysql( + error, 0, thd)); + } + } + + trx->mysql_n_tables_locked++; + } + + trx->n_mysql_tables_in_use++; + prebuilt->mysql_has_locked = TRUE; + + if (!trx_is_started(trx) + && (prebuilt->select_lock_type != LOCK_NONE + || prebuilt->stored_select_lock_type != LOCK_NONE)) { + + ++trx->will_lock; + } + + DBUG_RETURN(0); + } + + /* MySQL is releasing a table lock */ + + trx->n_mysql_tables_in_use--; + prebuilt->mysql_has_locked = FALSE; + + /* Release a possible FIFO ticket and search latch. Since we + may reserve the trx_sys->mutex, we have to release the search + system latch first to obey the latching order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* If the MySQL lock count drops to zero we know that the current SQL + statement has ended */ + + if (trx->n_mysql_tables_in_use == 0) { +#ifdef EXTENDED_SLOWLOG + if (UNIV_UNLIKELY(trx->take_stats)) { + increment_thd_innodb_stats(thd, + (unsigned long long) trx->id, + trx->io_reads, + trx->io_read, + trx->io_reads_wait_timer, + trx->lock_que_wait_timer, + trx->innodb_que_wait_timer, + trx->distinct_page_access); + + trx->io_reads = 0; + trx->io_read = 0; + trx->io_reads_wait_timer = 0; + trx->lock_que_wait_timer = 0; + trx->innodb_que_wait_timer = 0; + trx->distinct_page_access = 0; + if (trx->distinct_page_access_hash) + memset(trx->distinct_page_access_hash, 0, + DPAH_SIZE); + } +#endif + + trx->mysql_n_tables_locked = 0; + prebuilt->used_in_HANDLER = FALSE; + + if (!thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + if (trx_is_started(trx)) { + innobase_commit(ht, thd, TRUE); + } + + } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && trx->global_read_view) { + + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + + read_view_close_for_mysql(trx); + } + } + + if (!trx_is_started(trx) + && (prebuilt->select_lock_type != LOCK_NONE + || prebuilt->stored_select_lock_type != LOCK_NONE)) { + + ++trx->will_lock; + } + + DBUG_RETURN(0); +} + +/******************************************************************//** +With this function MySQL request a transactional lock to a table when +user issued query LOCK TABLES..WHERE ENGINE = InnoDB. +@return error code */ +UNIV_INTERN +int +ha_innobase::transactional_table_lock( +/*==================================*/ + THD* thd, /*!< in: handle to the user thread */ + int lock_type) /*!< in: lock type */ +{ + trx_t* trx; + + DBUG_ENTER("ha_innobase::transactional_table_lock"); + DBUG_PRINT("enter",("lock_type: %d", lock_type)); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. 
*/ + + update_thd(thd); + + if (UNIV_UNLIKELY(share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + if (!thd_tablespace_op(thd)) { + + if (dict_table_is_discarded(prebuilt->table)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + } else if (prebuilt->table->ibd_file_missing) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + } + + DBUG_RETURN(HA_ERR_CRASHED); + } + + trx = prebuilt->trx; + + prebuilt->sql_stat_start = TRUE; + prebuilt->hint_need_to_fetch_extra_cols = 0; + + reset_template(); + + if (lock_type == F_WRLCK) { + prebuilt->select_lock_type = LOCK_X; + prebuilt->stored_select_lock_type = LOCK_X; + } else if (lock_type == F_RDLCK) { + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "MySQL is trying to set transactional table lock " + "with corrupted lock type to table %s, lock type " + "%d does not exist.", + table->s->table_name.str, lock_type); + + DBUG_RETURN(HA_ERR_CRASHED); + } + + /* MySQL is setting a new transactional table lock */ + + innobase_register_trx(ht, thd, trx); + + if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) { + dberr_t error; + + error = row_lock_table_for_mysql(prebuilt, NULL, 0); + + if (error != DB_SUCCESS) { + DBUG_RETURN( + convert_error_code_to_mysql( + error, prebuilt->table->flags, thd)); + } + + if (thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + /* Store the current undo_no of the transaction + so that we know where to roll back if we have + to roll back the next SQL statement */ + + trx_mark_sql_stat_end(trx); + } + } + + DBUG_RETURN(0); +} + +/************************************************************************//** +Here we export InnoDB status variables to MySQL. */ +static +void +innodb_export_status() +/*==================*/ +{ + if (innodb_inited) { + srv_export_innodb_status(); + } +} + +/************************************************************************//** +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the +InnoDB Monitor to the client. +@return 0 on success */ +static +int +innodb_show_status( +/*===============*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of the caller */ + stat_print_fn* stat_print) +{ + trx_t* trx; + static const char truncated_msg[] = "... truncated...\n"; + const long MAX_STATUS_SIZE = 1048576; + ulint trx_list_start = ULINT_UNDEFINED; + ulint trx_list_end = ULINT_UNDEFINED; + bool ret_val; + + DBUG_ENTER("innodb_show_status"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* We don't create the temp files or associated + mutexes in read-only-mode */ + + if (srv_read_only_mode) { + DBUG_RETURN(0); + } + + trx = check_trx_exists(thd); + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* We let the InnoDB Monitor to output at most MAX_STATUS_SIZE + bytes of text. 
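+	If the output exceeds that, the beginning of the list of active
+	transactions is dropped and a "... truncated..." marker spliced in;
+	see the truncated_msg handling below.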
*/
+
+	char*	str;
+	ssize_t	flen, usable_len;
+
+	mutex_enter(&srv_monitor_file_mutex);
+	rewind(srv_monitor_file);
+
+	srv_printf_innodb_monitor(srv_monitor_file, FALSE,
+				&trx_list_start, &trx_list_end);
+
+	os_file_set_eof(srv_monitor_file);
+
+	if ((flen = ftell(srv_monitor_file)) < 0) {
+		flen = 0;
+	}
+
+	if (flen > MAX_STATUS_SIZE) {
+		usable_len = MAX_STATUS_SIZE;
+		srv_truncated_status_writes++;
+	} else {
+		usable_len = flen;
+	}
+
+	/* allocate buffer for the string, and
+	read the contents of the temporary file */
+
+	if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) {
+		mutex_exit(&srv_monitor_file_mutex);
+		DBUG_RETURN(1);
+	}
+
+	rewind(srv_monitor_file);
+
+	if (flen < MAX_STATUS_SIZE) {
+		/* Display the entire output. */
+		flen = fread(str, 1, flen, srv_monitor_file);
+	} else if (trx_list_end < (ulint) flen
+		   && trx_list_start < trx_list_end
+		   && trx_list_start + (flen - trx_list_end)
+		   < MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+
+		/* Omit the beginning of the list of active transactions. */
+		ssize_t	len = fread(str, 1, trx_list_start, srv_monitor_file);
+
+		memcpy(str + len, truncated_msg, sizeof truncated_msg - 1);
+		len += sizeof truncated_msg - 1;
+		usable_len = (MAX_STATUS_SIZE - 1) - len;
+		fseek(srv_monitor_file,
+		      static_cast<long>(flen - usable_len), SEEK_SET);
+		len += fread(str + len, 1, usable_len, srv_monitor_file);
+		flen = len;
+	} else {
+		/* Omit the end of the output. */
+		flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
+	}
+
+	mutex_exit(&srv_monitor_file_mutex);
+
+	ret_val = stat_print(
+		thd, innobase_hton_name,
+		static_cast<uint>(strlen(innobase_hton_name)),
+		STRING_WITH_LEN(""), str, static_cast<uint>(flen));
+
+	my_free(str);
+
+	DBUG_RETURN(ret_val);
+}
+
+/************************************************************************//**
+Implements the SHOW MUTEX STATUS command.
+@return 0 on success.
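+Buffer pool block mutexes and block rw-locks are aggregated into single
+"combined" rows to keep the output short.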
*/ +static +int +innodb_mutex_show_status( +/*=====================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of the + caller */ + stat_print_fn* stat_print) /*!< in: function for printing + statistics */ +{ + char buf1[IO_SIZE]; + char buf2[IO_SIZE]; + ib_mutex_t* mutex; + rw_lock_t* lock; + ulint block_mutex_oswait_count = 0; + ulint block_lock_oswait_count = 0; + ib_mutex_t* block_mutex = NULL; + rw_lock_t* block_lock = NULL; +#ifdef UNIV_DEBUG + ulint rw_lock_count= 0; + ulint rw_lock_count_spin_loop= 0; + ulint rw_lock_count_spin_rounds= 0; + ulint rw_lock_count_os_wait= 0; + ulint rw_lock_count_os_yield= 0; + ulonglong rw_lock_wait_time= 0; +#endif /* UNIV_DEBUG */ + uint buf1len; + uint buf2len; + uint hton_name_len; + + hton_name_len = (uint) strlen(innobase_hton_name); + + DBUG_ENTER("innodb_mutex_show_status"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + mutex_enter(&mutex_list_mutex); + + for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL; + mutex = UT_LIST_GET_NEXT(list, mutex)) { + if (mutex->count_os_wait == 0) { + continue; + } + + if (buf_pool_is_block_mutex(mutex)) { + block_mutex = mutex; + block_mutex_oswait_count += mutex->count_os_wait; + continue; + } + + buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s", + mutex->cmutex_name); + buf2len= (uint) my_snprintf(buf2, sizeof(buf2), "os_waits=%lu", + (ulong) mutex->count_os_wait); + + if (stat_print(thd, innobase_hton_name, + hton_name_len, buf1, buf1len, + buf2, buf2len)) { + mutex_exit(&mutex_list_mutex); + DBUG_RETURN(1); + } + } + + if (block_mutex) { + buf1len = (uint) my_snprintf(buf1, sizeof buf1, + "combined %s", + block_mutex->cmutex_name); + buf2len = (uint) my_snprintf(buf2, sizeof buf2, + "os_waits=%lu", + (ulong) block_mutex_oswait_count); + + if (stat_print(thd, innobase_hton_name, + hton_name_len, buf1, buf1len, + buf2, buf2len)) { + mutex_exit(&mutex_list_mutex); + DBUG_RETURN(1); + } + } + + mutex_exit(&mutex_list_mutex); + + mutex_enter(&rw_lock_list_mutex); + + for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL; + lock = UT_LIST_GET_NEXT(list, lock)) { + if (lock->count_os_wait == 0) { + continue; + } + + if (buf_pool_is_block_lock(lock)) { + block_lock = lock; + block_lock_oswait_count += lock->count_os_wait; + continue; + } + + buf1len = (uint) my_snprintf( + buf1, sizeof buf1, "%s", + lock->lock_name); + buf2len = (uint) my_snprintf( + buf2, sizeof buf2, "os_waits=%lu", + static_cast<ulong>(lock->count_os_wait)); + + if (stat_print(thd, innobase_hton_name, + hton_name_len, buf1, buf1len, + buf2, buf2len)) { + mutex_exit(&rw_lock_list_mutex); + DBUG_RETURN(1); + } + } + + if (block_lock) { + buf1len = (uint) my_snprintf(buf1, sizeof buf1, + "combined %s", + block_lock->lock_name); + buf2len = (uint) my_snprintf(buf2, sizeof buf2, + "os_waits=%lu", + (ulong) block_lock_oswait_count); + + if (stat_print(thd, innobase_hton_name, + hton_name_len, buf1, buf1len, + buf2, buf2len)) { + mutex_exit(&rw_lock_list_mutex); + DBUG_RETURN(1); + } + } + + mutex_exit(&rw_lock_list_mutex); + +#ifdef UNIV_DEBUG + buf2len = static_cast<uint>(my_snprintf(buf2, sizeof buf2, + "count=%lu, spin_waits=%lu, spin_rounds=%lu, " + "os_waits=%lu, os_yields=%lu, os_wait_times=%lu", + (ulong) rw_lock_count, + (ulong) rw_lock_count_spin_loop, + (ulong) rw_lock_count_spin_rounds, + (ulong) rw_lock_count_os_wait, + (ulong) rw_lock_count_os_yield, + (ulong) (rw_lock_wait_time / 1000))); + + if (stat_print(thd, innobase_hton_name, hton_name_len, + 
STRING_WITH_LEN("rw_lock_mutexes"), buf2, buf2len)) { + DBUG_RETURN(1); + } +#endif /* UNIV_DEBUG */ + + /* Success */ + DBUG_RETURN(0); +} + +/************************************************************************//** +Return 0 on success and non-zero on failure. Note: the bool return type +seems to be abused here, should be an int. */ +static +bool +innobase_show_status( +/*=================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread + of the caller */ + stat_print_fn* stat_print, + enum ha_stat_type stat_type) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + switch (stat_type) { + case HA_ENGINE_STATUS: + /* Non-zero return value means there was an error. */ + return(innodb_show_status(hton, thd, stat_print) != 0); + + case HA_ENGINE_MUTEX: + /* Non-zero return value means there was an error. */ + return(innodb_mutex_show_status(hton, thd, stat_print) != 0); + + case HA_ENGINE_LOGS: + /* Not handled */ + break; + } + + /* Success */ + return(false); +} + +/************************************************************************//** +Handling the shared INNOBASE_SHARE structure that is needed to provide table +locking. Register the table name if it doesn't exist in the hash table. */ +static +INNOBASE_SHARE* +get_share( +/*======*/ + const char* table_name) +{ + INNOBASE_SHARE* share; + + mysql_mutex_lock(&innobase_share_mutex); + + ulint fold = ut_fold_string(table_name); + + HASH_SEARCH(table_name_hash, innobase_open_tables, fold, + INNOBASE_SHARE*, share, + ut_ad(share->use_count > 0), + !strcmp(share->table_name, table_name)); + + if (!share) { + + uint length = (uint) strlen(table_name); + + /* TODO: invoke HASH_MIGRATE if innobase_open_tables + grows too big */ + + share = (INNOBASE_SHARE*) my_malloc(sizeof(*share)+length+1, + MYF(MY_FAE | MY_ZEROFILL)); + + share->table_name = (char*) memcpy(share + 1, + table_name, length + 1); + + HASH_INSERT(INNOBASE_SHARE, table_name_hash, + innobase_open_tables, fold, share); + + thr_lock_init(&share->lock); + + /* Index translation table initialization */ + share->idx_trans_tbl.index_mapping = NULL; + share->idx_trans_tbl.index_count = 0; + share->idx_trans_tbl.array_size = 0; + } + + share->use_count++; + mysql_mutex_unlock(&innobase_share_mutex); + + return(share); +} + +/************************************************************************//** +Free the shared object that was registered with get_share(). 
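+The share is unlinked from innobase_open_tables and its index translation
+table freed once use_count drops to zero.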
*/ +static +void +free_share( +/*=======*/ + INNOBASE_SHARE* share) /*!< in/own: table share to free */ +{ + mysql_mutex_lock(&innobase_share_mutex); + +#ifdef UNIV_DEBUG + INNOBASE_SHARE* share2; + ulint fold = ut_fold_string(share->table_name); + + HASH_SEARCH(table_name_hash, innobase_open_tables, fold, + INNOBASE_SHARE*, share2, + ut_ad(share->use_count > 0), + !strcmp(share->table_name, share2->table_name)); + + ut_a(share2 == share); +#endif /* UNIV_DEBUG */ + + if (!--share->use_count) { + ulint fold = ut_fold_string(share->table_name); + + HASH_DELETE(INNOBASE_SHARE, table_name_hash, + innobase_open_tables, fold, share); + thr_lock_delete(&share->lock); + + /* Free any memory from index translation table */ + my_free(share->idx_trans_tbl.index_mapping); + + my_free(share); + + /* TODO: invoke HASH_MIGRATE if innobase_open_tables + shrinks too much */ + } + + mysql_mutex_unlock(&innobase_share_mutex); +} + +/*****************************************************************//** +Converts a MySQL table lock stored in the 'lock' field of the handle to +a proper type before storing pointer to the lock into an array of pointers. +MySQL also calls this if it wants to reset some table locks to a not-locked +state during the processing of an SQL query. An example is that during a +SELECT the read lock is released early on the 'const' tables where we only +fetch one row. MySQL does not call this when it releases all locks at the +end of an SQL statement. +@return pointer to the next element in the 'to' array */ +UNIV_INTERN +THR_LOCK_DATA** +ha_innobase::store_lock( +/*====================*/ + THD* thd, /*!< in: user thread handle */ + THR_LOCK_DATA** to, /*!< in: pointer to an array + of pointers to lock structs; + pointer to the 'lock' field + of current handle is stored + next to this array */ + enum thr_lock_type lock_type) /*!< in: lock type to store in + 'lock'; this may also be + TL_IGNORE */ +{ + trx_t* trx; + + /* Note that trx in this function is NOT necessarily prebuilt->trx + because we call update_thd() later, in ::external_lock()! Failure to + understand this caused a serious memory corruption bug in 5.1.11. */ + + trx = check_trx_exists(thd); + + /* NOTE: MySQL can call this function with lock 'type' TL_IGNORE! + Be careful to ignore TL_IGNORE if we are going to do something with + only 'real' locks! */ + + /* If no MySQL table is in use, we need to set the isolation level + of the transaction. 
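+	Re-reading thd_tx_isolation() here means that a change of
+	tx_isolation made while no tables were in use takes effect before
+	the next statement acquires any locks.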
*/ + + if (lock_type != TL_IGNORE + && trx->n_mysql_tables_in_use == 0) { + trx->isolation_level = innobase_map_isolation_level( + (enum_tx_isolation) thd_tx_isolation(thd)); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && trx->global_read_view) { + + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + + read_view_close_for_mysql(trx); + } + } + + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + const bool in_lock_tables = thd_in_lock_tables(thd); + const uint sql_command = thd_sql_command(thd); + + if (srv_read_only_mode + && (sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_INSERT + || sql_command == SQLCOM_REPLACE + || sql_command == SQLCOM_DROP_TABLE + || sql_command == SQLCOM_ALTER_TABLE + || sql_command == SQLCOM_OPTIMIZE + || (sql_command == SQLCOM_CREATE_TABLE + && (lock_type >= TL_WRITE_CONCURRENT_INSERT + && lock_type <= TL_WRITE)) + || sql_command == SQLCOM_CREATE_INDEX + || sql_command == SQLCOM_DROP_INDEX + || sql_command == SQLCOM_DELETE)) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + } else if (sql_command == SQLCOM_FLUSH + && lock_type == TL_READ_NO_INSERT) { + + /* Check for FLUSH TABLES ... WITH READ LOCK */ + + /* Note: This call can fail, but there is no way to return + the error to the caller. We simply ignore it for now here + and push the error code to the caller where the error is + detected in the function. */ + + dberr_t err = row_quiesce_set_state( + prebuilt->table, QUIESCE_START, trx); + + ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } else { + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + } + + /* Check for DROP TABLE */ + } else if (sql_command == SQLCOM_DROP_TABLE) { + + /* MySQL calls this function in DROP TABLE though this table + handle may belong to another thd that is running a query. Let + us in that case skip any changes to the prebuilt struct. */ + + /* Check for LOCK TABLE t1,...,tn WITH SHARED LOCKS */ + } else if ((lock_type == TL_READ && in_lock_tables) + || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) + || lock_type == TL_READ_WITH_SHARED_LOCKS + || lock_type == TL_READ_NO_INSERT + || (lock_type != TL_IGNORE + && sql_command != SQLCOM_SELECT)) { + + /* The OR cases above are in this order: + 1) MySQL is doing LOCK TABLES ... READ LOCAL, or we + are processing a stored procedure or function, or + 2) (we do not know when TL_READ_HIGH_PRIORITY is used), or + 3) this is a SELECT ... IN SHARE MODE, or + 4) we are doing a complex SQL statement like + INSERT INTO ... SELECT ... and the logical logging (MySQL + binlog) requires the use of a locking read, or + MySQL is doing LOCK TABLES ... READ. + 5) we let InnoDB do locking reads for all SQL statements that + are not simple SELECTs; note that select_lock_type in this + case may get strengthened in ::external_lock() to LOCK_X. + Note that we MUST use a locking read in all data modifying + SQL statements, because otherwise the execution would not be + serializable, and also the results from the update could be + unexpected if an obsolete consistent read view would be + used. 
*/ + + /* Use consistent read for checksum table */ + + if (sql_command == SQLCOM_CHECKSUM + || ((srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) + && trx->isolation_level != TRX_ISO_SERIALIZABLE + && (lock_type == TL_READ + || lock_type == TL_READ_NO_INSERT) + && (sql_command == SQLCOM_INSERT_SELECT + || sql_command == SQLCOM_REPLACE_SELECT + || sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_CREATE_TABLE))) { + + /* If we either have innobase_locks_unsafe_for_binlog + option set or this session is using READ COMMITTED + isolation level and isolation level of the transaction + is not set to serializable and MySQL is doing + INSERT INTO...SELECT or REPLACE INTO...SELECT + or UPDATE ... = (SELECT ...) or CREATE ... + SELECT... without FOR UPDATE or IN SHARE + MODE in select, then we use consistent read + for select. */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + } else { + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } + + } else if (lock_type != TL_IGNORE) { + + /* We set possible LOCK_X value in external_lock, not yet + here even if this would be SELECT ... FOR UPDATE */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + } + + if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) { + + /* Starting from 5.0.7, we weaken also the table locks + set at the start of a MySQL stored procedure call, just like + we weaken the locks set at the start of an SQL statement. + MySQL does set in_lock_tables TRUE there, but in reality + we do not need table locks to make the execution of a + single transaction stored procedure call deterministic + (if it does not use a consistent read). */ + + if (lock_type == TL_READ + && sql_command == SQLCOM_LOCK_TABLES) { + /* We come here if MySQL is processing LOCK TABLES + ... READ LOCAL. MyISAM under that table lock type + reads the table as it was at the time the lock was + granted (new inserts are allowed, but not seen by the + reader). To get a similar effect on an InnoDB table, + we must use LOCK TABLES ... READ. We convert the lock + type here, so that for InnoDB, READ LOCAL is + equivalent to READ. This will change the InnoDB + behavior in mysqldump, so that dumps of InnoDB tables + are consistent with dumps of MyISAM tables. */ + + lock_type = TL_READ_NO_INSERT; + } + + /* If we are not doing a LOCK TABLE, DISCARD/IMPORT + TABLESPACE or TRUNCATE TABLE then allow multiple + writers. Note that ALTER TABLE uses a TL_WRITE_ALLOW_READ + < TL_WRITE_CONCURRENT_INSERT. + + We especially allow multiple writers if MySQL is at the + start of a stored procedure call (SQLCOM_CALL) or a + stored function call (MySQL does have in_lock_tables + TRUE there). */ + + if ((lock_type >= TL_WRITE_CONCURRENT_INSERT + && lock_type <= TL_WRITE) + && !(in_lock_tables + && sql_command == SQLCOM_LOCK_TABLES) + && !thd_tablespace_op(thd) + && sql_command != SQLCOM_TRUNCATE + && sql_command != SQLCOM_OPTIMIZE + && sql_command != SQLCOM_CREATE_TABLE) { + + lock_type = TL_WRITE_ALLOW_WRITE; + } + + /* In queries of type INSERT INTO t1 SELECT ... FROM t2 ... + MySQL would use the lock TL_READ_NO_INSERT on t2, and that + would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts + to t2. Convert the lock to a normal read lock to allow + concurrent inserts to t2. 
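+	Example: INSERT INTO t1 SELECT ... FROM t2 ... issued outside LOCK
+	TABLES arrives here with TL_READ_NO_INSERT on t2 and leaves with
+	TL_READ.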
+ + We especially allow concurrent inserts if MySQL is at the + start of a stored procedure call (SQLCOM_CALL) + (MySQL does have thd_in_lock_tables() TRUE there). */ + + if (lock_type == TL_READ_NO_INSERT + && sql_command != SQLCOM_LOCK_TABLES) { + + lock_type = TL_READ; + } + + lock.type = lock_type; + } + + *to++= &lock; + + if (!trx_is_started(trx) + && (prebuilt->select_lock_type != LOCK_NONE + || prebuilt->stored_select_lock_type != LOCK_NONE)) { + + ++trx->will_lock; + } + + return(to); +} + +/*********************************************************************//** +Read the next autoinc value. Acquire the relevant locks before reading +the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked +on return and all relevant locks acquired. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_get_autoinc( +/*==============================*/ + ulonglong* value) /*!< out: autoinc value */ +{ + *value = 0; + + prebuilt->autoinc_error = innobase_lock_autoinc(); + + if (prebuilt->autoinc_error == DB_SUCCESS) { + + /* Determine the first value of the interval */ + *value = dict_table_autoinc_read(prebuilt->table); + + /* It should have been initialized during open. */ + if (*value == 0) { + prebuilt->autoinc_error = DB_UNSUPPORTED; + dict_table_autoinc_unlock(prebuilt->table); + } + } + + return(prebuilt->autoinc_error); +} + +/*******************************************************************//** +This function reads the global auto-inc counter. It doesn't use the +AUTOINC lock even if the lock mode is set to TRADITIONAL. +@return the autoinc value */ +UNIV_INTERN +ulonglong +ha_innobase::innobase_peek_autoinc(void) +/*====================================*/ +{ + ulonglong auto_inc; + dict_table_t* innodb_table; + + ut_a(prebuilt != NULL); + ut_a(prebuilt->table != NULL); + + innodb_table = prebuilt->table; + + dict_table_autoinc_lock(innodb_table); + + auto_inc = dict_table_autoinc_read(innodb_table); + + if (auto_inc == 0) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: AUTOINC next value generation " + "is disabled for '%s'\n", innodb_table->name); + } + + dict_table_autoinc_unlock(innodb_table); + + return(auto_inc); +} + +/*********************************************************************//** +Returns the value of the auto-inc counter in *first_value and ~0 on failure. */ +UNIV_INTERN +void +ha_innobase::get_auto_increment( +/*============================*/ + ulonglong offset, /*!< in: table autoinc offset */ + ulonglong increment, /*!< in: table autoinc + increment */ + ulonglong nb_desired_values, /*!< in: number of values + reqd */ + ulonglong* first_value, /*!< out: the autoinc value */ + ulonglong* nb_reserved_values) /*!< out: count of reserved + values */ +{ + trx_t* trx; + dberr_t error; + ulonglong autoinc = 0; + + /* Prepare prebuilt->trx in the table handle */ + update_thd(ha_thd()); + + error = innobase_get_autoinc(&autoinc); + + if (error != DB_SUCCESS) { + *first_value = (~(ulonglong) 0); + return; + } + + /* This is a hack, since nb_desired_values seems to be accurate only + for the first call to get_auto_increment() for multi-row INSERT and + meaningless for other statements e.g, LOAD etc. Subsequent calls to + this method for the same statement results in different values which + don't make sense. Therefore we store the value the first time we are + called and count down from that as rows are written (see write_row()). 
+ */ + + trx = prebuilt->trx; + + /* Note: We can't rely on *first_value since some MySQL engines, + in particular the partition engine, don't initialize it to 0 when + invoking this method. So we are not sure if it's guaranteed to + be 0 or not. */ + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = innobase_get_int_col_max_value( + table->next_number_field); + + /* Called for the first time? */ + if (trx->n_autoinc_rows == 0) { + + trx->n_autoinc_rows = (ulint) nb_desired_values; + + /* It's possible for nb_desired_values to be 0: + e.g., INSERT INTO T1(C) SELECT C FROM T2; */ + if (nb_desired_values == 0) { + + trx->n_autoinc_rows = 1; + } + + set_if_bigger(*first_value, autoinc); + /* Not in the middle of a multi-row INSERT. */ + } else if (prebuilt->autoinc_last_value == 0) { + set_if_bigger(*first_value, autoinc); + /* Check for negative values. */ + } else if (*first_value > col_max_value && trx->n_autoinc_rows > 0) { + /* Set to next logical value. */ + ut_a(autoinc > trx->n_autoinc_rows); + *first_value = (autoinc - trx->n_autoinc_rows) - 1; + } + + *nb_reserved_values = trx->n_autoinc_rows; + + /* With old style AUTOINC locking we only update the table's + AUTOINC counter after attempting to insert the row. */ + if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) { + ulonglong current; + ulonglong next_value; + + current = *first_value > col_max_value ? autoinc : *first_value; + + /* If the increment step of the auto increment column + decreases then it is not affecting the immediate + next value in the series. */ + if (prebuilt->autoinc_increment > increment) { + + current = autoinc - prebuilt->autoinc_increment; + + current = innobase_next_autoinc( + current, 1, increment, 1, col_max_value); + + dict_table_autoinc_initialize(prebuilt->table, current); + + *first_value = current; + } + + /* Compute the last value in the interval */ + next_value = innobase_next_autoinc( + current, *nb_reserved_values, increment, offset, + col_max_value); + + prebuilt->autoinc_last_value = next_value; + + if (prebuilt->autoinc_last_value < *first_value) { + *first_value = (~(ulonglong) 0); + } else { + /* Update the table autoinc variable */ + dict_table_autoinc_update_if_greater( + prebuilt->table, prebuilt->autoinc_last_value); + } + } else { + /* This will force write_row() into attempting an update + of the table's AUTOINC counter. */ + prebuilt->autoinc_last_value = 0; + } + + /* The increment to be used to increase the AUTOINC value; we use + this in write_row() and update_row() to increase the autoinc counter + for columns that are filled by the user. We need the offset and + the increment. */ + prebuilt->autoinc_offset = offset; + prebuilt->autoinc_increment = increment; + + dict_table_autoinc_unlock(prebuilt->table); +} + +/*******************************************************************//** +Reset the auto-increment counter to the given value, i.e. the next row +inserted will get the given value. This is called e.g. after TRUNCATE +is emulated by doing a 'DELETE FROM t'. HA_ERR_WRONG_COMMAND is +returned by storage engines that don't support this operation.
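The interval arithmetic in get_auto_increment() above is easier to follow with a stand-alone model. The sketch below is a hypothetical simplification, not the real innobase_next_autoinc(): it reserves "need" values starting at "current" with a positive "increment", pinning the result to the column maximum on overflow, which mirrors how *first_value and prebuilt->autoinc_last_value bound the reserved interval.

#include <stdint.h>

/* Hypothetical sketch (assumes increment > 0): return the next free
value after reserving "need" values from "current", clamped so that
the computation never wraps past the column maximum. */
static uint64_t
next_autoinc_sketch(
	uint64_t	current,
	uint64_t	need,
	uint64_t	increment,
	uint64_t	col_max)
{
	if (current >= col_max
	    || need > (col_max - current) / increment) {
		/* The reservation would overflow the column type. */
		return(col_max);
	}

	return(current + need * increment);
}

With current = 100, need = 3 and increment = 5, the sketch hands out 100, 105 and 110 and returns 115 as the next free value.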
+@return 0 or error code */ +UNIV_INTERN +int +ha_innobase::reset_auto_increment( +/*==============================*/ + ulonglong value) /*!< in: new value for table autoinc */ +{ + DBUG_ENTER("ha_innobase::reset_auto_increment"); + + dberr_t error; + + update_thd(ha_thd()); + + error = row_lock_table_autoinc_for_mysql(prebuilt); + + if (error != DB_SUCCESS) { + DBUG_RETURN(convert_error_code_to_mysql( + error, prebuilt->table->flags, user_thd)); + } + + /* The next value can never be 0. */ + if (value == 0) { + value = 1; + } + + innobase_reset_autoinc(value); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +See comment in handler.cc */ +UNIV_INTERN +bool +ha_innobase::get_error_message( +/*===========================*/ + int error, + String* buf) +{ + trx_t* trx = check_trx_exists(ha_thd()); + + buf->copy(trx->detailed_error, (uint) strlen(trx->detailed_error), + system_charset_info); + + return(FALSE); +} + +/*******************************************************************//** + Retrieves the names of the table and the key for which there was a + duplicate entry in the case of HA_ERR_FOREIGN_DUPLICATE_KEY. + + If any of the names is not available, then this method will return + false and will not change any of child_table_name or child_key_name. + + @param child_table_name[out] Table name + @param child_table_name_len[in] Table name buffer size + @param child_key_name[out] Key name + @param child_key_name_len[in] Key name buffer size + + @retval true table and key names were available + and were written into the corresponding + out parameters. + @retval false table and key names were not available, + the out parameters were not touched. +*/ +bool +ha_innobase::get_foreign_dup_key( +/*=============================*/ + char* child_table_name, + uint child_table_name_len, + char* child_key_name, + uint child_key_name_len) +{ + const dict_index_t* err_index; + + ut_a(prebuilt->trx != NULL); + ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); + + err_index = trx_get_error_info(prebuilt->trx); + + if (err_index == NULL) { + return(false); + } + /* else */ + + /* copy table name (and convert from filename-safe encoding to + system_charset_info) */ + char* p; + p = strchr(err_index->table->name, '/'); + /* strip ".../" prefix if any */ + if (p != NULL) { + p++; + } else { + p = err_index->table->name; + } + uint len; + len = filename_to_tablename(p, child_table_name, child_table_name_len); + child_table_name[len] = '\0'; + + /* copy index name */ + ut_snprintf(child_key_name, child_key_name_len, "%s", err_index->name); + + return(true); +} + +/*******************************************************************//** +Compares two 'refs'. A 'ref' is the (internal) primary key value of the row. +If there is no explicitly declared non-null unique key or a primary key, then +InnoDB internally uses the row id as the primary key. 
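The name handling in get_foreign_dup_key() above relies on InnoDB storing table names in "database/table" form. Below is a minimal sketch of that split; the helper name is hypothetical, and the real code additionally converts the filename-safe encoding with filename_to_tablename().

#include <string.h>

/* Hypothetical sketch: return the table-name part of an InnoDB
"database/table" name, or the whole string if no '/' is present. */
static const char*
table_name_sketch(const char* innodb_name)
{
	const char*	p = strchr(innodb_name, '/');

	return(p != NULL ? p + 1 : innodb_name);
}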
+@return < 0 if ref1 < ref2, 0 if equal, else > 0 */ +UNIV_INTERN +int +ha_innobase::cmp_ref( +/*=================*/ + const uchar* ref1, /*!< in: an (internal) primary key value in the + MySQL key value format */ + const uchar* ref2) /*!< in: an (internal) primary key value in the + MySQL key value format */ +{ + enum_field_types mysql_type; + Field* field; + KEY_PART_INFO* key_part; + KEY_PART_INFO* key_part_end; + uint len1; + uint len2; + int result; + + if (prebuilt->clust_index_was_generated) { + /* The 'ref' is an InnoDB row id */ + + return(memcmp(ref1, ref2, DATA_ROW_ID_LEN)); + } + + /* Do a type-aware comparison of primary key fields. PK fields + are always NOT NULL, so no checks for NULL are performed. */ + + key_part = table->key_info[table->s->primary_key].key_part; + + key_part_end = key_part + + table->key_info[table->s->primary_key].user_defined_key_parts; + + for (; key_part != key_part_end; ++key_part) { + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB) { + + /* In the MySQL key value format, a column prefix of + a BLOB is preceded by a 2-byte length field */ + + len1 = innobase_read_from_2_little_endian(ref1); + len2 = innobase_read_from_2_little_endian(ref2); + + ref1 += 2; + ref2 += 2; + result = ((Field_blob*) field)->cmp( + ref1, len1, ref2, len2); + } else { + result = field->key_cmp(ref1, ref2); + } + + if (result) { + + return(result); + } + + ref1 += key_part->store_length; + ref2 += key_part->store_length; + } + + return(0); +} + +/*******************************************************************//** +Ask InnoDB if a query to a table can be cached. +@return TRUE if query caching of the table is permitted */ +UNIV_INTERN +my_bool +ha_innobase::register_query_cache_table( +/*====================================*/ + THD* thd, /*!< in: user thread handle */ + char* table_key, /*!< in: normalized path to the + table */ + uint key_length, /*!< in: length of the normalized + path to the table */ + qc_engine_callback* + call_back, /*!< out: pointer to function for + checking if query caching + is permitted */ + ulonglong *engine_data) /*!< in/out: data to call_back */ +{ + *call_back = innobase_query_caching_of_table_permitted; + *engine_data = 0; + return(innobase_query_caching_of_table_permitted(thd, table_key, + key_length, + engine_data)); +} + +/*******************************************************************//** +Get the bin log name. */ +UNIV_INTERN +const char* +ha_innobase::get_mysql_bin_log_name() +/*=================================*/ +{ + return(trx_sys_mysql_bin_log_name); +} + +/*******************************************************************//** +Get the bin log offset (or file position). */ +UNIV_INTERN +ulonglong +ha_innobase::get_mysql_bin_log_pos() +/*================================*/ +{ + /* trx... is ib_int64_t, which is a typedef for a 64-bit integer + (__int64 or longlong) so it's ok to cast it to ulonglong. */ + + return(trx_sys_mysql_bin_log_pos); +} + +/******************************************************************//** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. 
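The multibyte prefix computation described for innobase_get_at_most_n_mbchars() below can be modelled without the MySQL charset machinery. This stand-alone sketch counts the bytes spanned by the first n UTF-8 characters by skipping continuation bytes; it is a simplification for illustration only, since production code must go through my_charpos() with the column's CHARSET_INFO.

#include <stddef.h>

/* Sketch: byte length of the first n_chars UTF-8 characters of str,
capped at data_len. Continuation bytes match the pattern 10xxxxxx. */
static size_t
utf8_prefix_bytes(const char* str, size_t data_len, size_t n_chars)
{
	size_t	i = 0;

	while (i < data_len && n_chars > 0) {
		i++;	/* consume the lead byte */

		while (i < data_len
		       && ((unsigned char) str[i] & 0xC0) == 0x80) {
			i++;	/* consume continuation bytes */
		}

		n_chars--;
	}

	return(i);
}

For the string "héllo" (where 'é' occupies two bytes) and n_chars = 2, the sketch returns 3.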
+@return number of bytes occupied by the first n characters */ +UNIV_INTERN +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + ulint charset_id, /*!< in: character set id */ + ulint prefix_len, /*!< in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /*!< in: length of the string in bytes */ + const char* str) /*!< in: character string */ +{ + ulint char_length; /*!< character length in bytes */ + ulint n_chars; /*!< number of characters in prefix */ + CHARSET_INFO* charset; /*!< charset used in the field */ + + charset = get_charset((uint) charset_id, MYF(MY_WME)); + + ut_ad(charset); + ut_ad(charset->mbmaxlen); + + /* Calculate how many characters at most the prefix index contains */ + + n_chars = prefix_len / charset->mbmaxlen; + + /* If the charset is multi-byte, then we must find the length of the + first at most n chars in the string. If the string contains less + characters than n, then we return the length to the end of the last + character. */ + + if (charset->mbmaxlen > 1) { + /* my_charpos() returns the byte length of the first n_chars + characters, or a value bigger than the length of str, if + there were not enough full characters in str. + + Why does the code below work: + Suppose that we are looking for n UTF-8 characters. + + 1) If the string is long enough, then the prefix contains at + least n complete UTF-8 characters + maybe some extra + characters + an incomplete UTF-8 character. No problem in + this case. The function returns the pointer to the + end of the nth character. + + 2) If the string is not long enough, then the string contains + the complete value of a column, that is, only complete UTF-8 + characters, and we can store in the column prefix index the + whole string. */ + + char_length = my_charpos(charset, str, + str + data_len, (int) n_chars); + if (char_length > data_len) { + char_length = data_len; + } + } else { + if (data_len < prefix_len) { + char_length = data_len; + } else { + char_length = prefix_len; + } + } + + return(char_length); +} + +/*******************************************************************//** +This function is used to prepare an X/Open XA distributed transaction. +@return 0 or error number */ +static +int +innobase_xa_prepare( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be prepared */ + bool prepare_trx) /*!< in: true - prepare transaction + false - the current SQL statement + ended */ +{ + int error = 0; + trx_t* trx = check_trx_exists(thd); + + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* we use support_xa value as it was seen at transaction start + time, not the current session variable value. Any possible changes + to the session variable take effect only in the next transaction */ + if (!trx->support_xa) { + + return(0); + } + + if (UNIV_UNLIKELY(trx->fake_changes)) { + + if (prepare_trx + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT + | OPTION_BEGIN))) { + + thd->get_stmt_da()->reset_diagnostics_area(); + return(HA_ERR_WRONG_COMMAND); + } + return(0); + } + + thd_get_xid(thd, (MYSQL_XID*) &trx->xid); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. 
*/ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { + + sql_print_error("Transaction not registered for MySQL 2PC, " + "but transaction is active"); + } + + if (prepare_trx + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* We were instructed to prepare the whole transaction, or + this is an SQL statement end and autocommit is on */ + + ut_ad(trx_is_registered_for_2pc(trx)); + + trx_prepare_for_mysql(trx); + + DBUG_EXECUTE_IF("crash_innodb_after_prepare", + DBUG_SUICIDE();); + + error = 0; + } else { + /* We just mark the SQL statement ended and do not do a + transaction prepare */ + + /* If we had reserved the auto-inc lock for some + table in this SQL statement we release it now */ + + lock_unlock_table_autoinc(trx); + + /* Store the current undo_no of the transaction so that we + know where to roll back if we have to roll back the next + SQL statement */ + + trx_mark_sql_stat_end(trx); + } + + if (thd_sql_command(thd) != SQLCOM_XA_PREPARE + && (prepare_trx + || !thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* For mysqlbackup to work the order of transactions in binlog + and InnoDB must be the same. Consider the situation + + thread1> prepare; write to binlog; ... + <context switch> + thread2> prepare; write to binlog; commit + thread1> ... commit + + The server guarantees that writes to the binary log + and commits are in the same order, so we do not have + to handle this case. */ + } + + return(error); +} + +/*******************************************************************//** +This function is used to recover X/Open XA distributed transactions. +@return number of prepared transactions stored in xid_list */ +static +int +innobase_xa_recover( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid_list,/*!< in/out: prepared transactions */ + uint len) /*!< in: number of slots in xid_list */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (len == 0 || xid_list == NULL) { + + return(0); + } + + return(trx_recover_for_mysql(xid_list, len)); +} + +/*******************************************************************//** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_commit_by_xid( +/*===================*/ + handlerton* hton, + XID* xid) /*!< in: X/Open XA transaction identification */ +{ + trx_t* trx; + + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = trx_get_trx_by_xid(xid); + + if (trx) { + innobase_commit_low(trx); + trx_free_for_background(trx); + return(XA_OK); + } else { + return(XAER_NOTA); + } +} + +/*******************************************************************//** +This function is used to rollback one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_rollback_by_xid( +/*=====================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid) /*!< in: X/Open XA transaction + identification */ +{ + trx_t* trx; + + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = trx_get_trx_by_xid(xid); + + if (trx) { + int ret = innobase_rollback_trx(trx); + trx_free_for_background(trx); + return(ret); + } else { + return(XAER_NOTA); + } +} + +/*******************************************************************//** +Create a consistent view for a cursor based on current transaction +which is created if the 
corresponding MySQL thread still lacks one. +This consistent view is then used inside of MySQL when accessing records +using a cursor. +@return pointer to cursor view or NULL */ +static +void* +innobase_create_cursor_view( +/*========================*/ + handlerton* hton, /*!< in: innobase hton */ + THD* thd) /*!< in: user thread handle */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + return(read_cursor_view_create_for_mysql(check_trx_exists(thd))); +} + +/*******************************************************************//** +Close the given consistent cursor view of a transaction and restore +global read view to a transaction read view. Transaction is created if the +corresponding MySQL thread still lacks one. */ +static +void +innobase_close_cursor_view( +/*=======================*/ + handlerton* hton, /*!< in: innobase hton */ + THD* thd, /*!< in: user thread handle */ + void* curview)/*!< in: Consistent read view to be closed */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + read_cursor_view_close_for_mysql(check_trx_exists(thd), + (cursor_view_t*) curview); +} + +/*******************************************************************//** +Set the given consistent cursor view to a transaction which is created +if the corresponding MySQL thread still lacks one. If the given +consistent cursor view is NULL global read view of a transaction is +restored to a transaction read view. */ +static +void +innobase_set_cursor_view( +/*=====================*/ + handlerton* hton, /*!< in: innobase hton */ + THD* thd, /*!< in: user thread handle */ + void* curview)/*!< in: Consistent cursor view to be set */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + read_cursor_set_for_mysql(check_trx_exists(thd), + (cursor_view_t*) curview); +} + +/*******************************************************************//** +*/ +UNIV_INTERN +bool +ha_innobase::check_if_incompatible_data( +/*====================================*/ + HA_CREATE_INFO* info, + uint table_changes) +{ + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); + + if (table_changes != IS_EQUAL_YES) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that auto_increment value was not changed */ + if ((info->used_fields & HA_CREATE_USED_AUTO) && + info->auto_increment_value != 0) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that row format didn't change */ + if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) + && info->row_type != get_row_type()) { + + return(COMPATIBLE_DATA_NO); + } + + /* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */ + if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) { + return(COMPATIBLE_DATA_NO); + } + + return(COMPATIBLE_DATA_YES); +} + +/****************************************************************//** +Update the system variable innodb_io_capacity_max using the "saved" +value. This function is registered as a callback with MySQL. 
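The two callbacks that follow enforce the invariant srv_io_capacity <= srv_max_io_capacity from both directions, clamping whichever side the user moves. Distilled to its core (the struct and function names here are hypothetical), the clamp is:

/* Hypothetical distillation: the pair must keep capacity <=
max_capacity; an out-of-range request is clamped, and the real
callbacks additionally push a warning to the client. */
struct io_caps {
	unsigned long	capacity;
	unsigned long	max_capacity;
};

static void
set_io_capacity_sketch(io_caps* caps, unsigned long requested)
{
	if (requested > caps->max_capacity) {
		requested = caps->max_capacity;	/* clamp, then warn */
	}

	caps->capacity = requested;
}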
*/ +static +void +innodb_io_capacity_max_update( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val < srv_io_capacity) { + in_val = srv_io_capacity; + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_io_capacity_max cannot be" + " set lower than innodb_io_capacity."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity_max to %lu", + srv_io_capacity); + } + + srv_max_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_io_capacity using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_io_capacity_update( +/*======================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val > srv_max_io_capacity) { + in_val = srv_max_io_capacity; + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_io_capacity cannot be set" + " higher than innodb_io_capacity_max."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity to %lu", + srv_max_io_capacity); + } + + srv_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_log_arch_expire_sec using +the "saved" value. This function is registered as a callback with MySQL. */ +static +void +innodb_log_archive_expire_update( +/*==============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr, /*!< out: unused */ + const void* save) /*!< in: immediate result + from check function */ +{ + srv_log_arch_expire_sec = *(ulint*) save; +} + +static +void +innodb_log_archive_update( +/*======================*/ + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + my_bool in_val = *static_cast<const my_bool*>(save); + + if (in_val) { + /* turn archiving on */ + srv_log_archive_on = innobase_log_archive = 1; + log_archive_archivelog(); + } else { + /* turn archiving off */ + srv_log_archive_on = innobase_log_archive = 0; + log_archive_noarchivelog(); + } +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct using the "saved" +value. This function is registered as a callback with MySQL.
*/ +static +void +innodb_max_dirty_pages_pct_update( +/*==============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val < srv_max_dirty_pages_pct_lwm) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_max_dirty_pages_pct cannot be" + " set lower than" + " innodb_max_dirty_pages_pct_lwm."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Lowering" + " innodb_max_dirty_page_pct_lwm to %lu", + in_val); + + srv_max_dirty_pages_pct_lwm = in_val; + } + + srv_max_buf_pool_modified_pct = in_val; +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct_lwm using the +"saved" value. This function is registered as a callback with MySQL. */ +static +void +innodb_max_dirty_pages_pct_lwm_update( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast<const ulong*>(save); + if (in_val > srv_max_buf_pool_modified_pct) { + in_val = srv_max_buf_pool_modified_pct; + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_max_dirty_page_pct_lwm" + " to %lu", + in_val); + } + + srv_max_dirty_pages_pct_lwm = in_val; +} + +/************************************************************//** +Validate the file format name and return its corresponding id. +@return valid file format id */ +static +uint +innobase_file_format_name_lookup( +/*=============================*/ + const char* format_name) /*!< in: pointer to file format name */ +{ + char* endp; + uint format_id; + + ut_a(format_name != NULL); + + /* The format name can contain the format id itself instead of + the name and we check for that. */ + format_id = (uint) strtoul(format_name, &endp, 10); + + /* Check for valid parse. */ + if (*endp == '\0' && *format_name != '\0') { + + if (format_id <= UNIV_FORMAT_MAX) { + + return(format_id); + } + } else { + + for (format_id = 0; format_id <= UNIV_FORMAT_MAX; + format_id++) { + const char* name; + + name = trx_sys_file_format_id_to_name(format_id); + + if (!innobase_strcasecmp(format_name, name)) { + + return(format_id); + } + } + } + + return(UNIV_FORMAT_MAX + 1); +} + +/************************************************************//** +Validate the file format check config parameters, as a side effect it +sets the srv_max_file_format_at_startup variable. 
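innobase_file_format_name_lookup() above accepts either a numeric id or a format name. A self-contained sketch of the same two-step strategy follows; the hard-coded name table is an assumption standing in for trx_sys_file_format_id_to_name(), which owns the real id-to-name map.

#include <stdlib.h>
#include <strings.h>

/* Sketch: parse the string as a numeric id first; otherwise scan an
(assumed) name table case-insensitively. The table size doubles as
the "invalid" return value, like UNIV_FORMAT_MAX + 1 above. */
static unsigned
format_lookup_sketch(const char* s)
{
	static const char*	names[] = { "Antelope", "Barracuda" };
	const unsigned		n = sizeof(names) / sizeof(names[0]);
	char*			endp;
	unsigned		id = (unsigned) strtoul(s, &endp, 10);

	if (*endp == '\0' && *s != '\0') {
		return(id < n ? id : n);
	}

	for (id = 0; id < n; id++) {
		if (!strcasecmp(s, names[id])) {
			return(id);
		}
	}

	return(n);
}

Under these assumptions "1", "Barracuda" and "barracuda" all map to id 1, while an unknown string maps to the out-of-range value that callers treat as invalid.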
+@return the format_id if valid config value, otherwise, return -1 */ +static +int +innobase_file_format_validate_and_set( +/*==================================*/ + const char* format_max) /*!< in: parameter value */ +{ + uint format_id; + + format_id = innobase_file_format_name_lookup(format_max); + + if (format_id < UNIV_FORMAT_MAX + 1) { + srv_max_file_format_at_startup = format_id; + + return((int) format_id); + } else { + return(-1); + } +} + +/*************************************************************//** +Check if it is a valid file format. This function is registered as +a callback with MySQL. +@return 0 for valid file format */ +static +int +innodb_file_format_name_validate( +/*=============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* file_format_input; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + ut_a(save != NULL); + ut_a(value != NULL); + + file_format_input = value->val_str(value, buff, &len); + + if (file_format_input != NULL) { + uint format_id; + + format_id = innobase_file_format_name_lookup( + file_format_input); + + if (format_id <= UNIV_FORMAT_MAX) { + + /* Save a pointer to the name in the + 'file_format_name_map' constant array. */ + *static_cast<const char**>(save) = + trx_sys_file_format_id_to_name(format_id); + + return(0); + } + } + + *static_cast<const char**>(save) = NULL; + return(1); +} + +/****************************************************************//** +Update the system variable innodb_file_format using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_file_format_name_update( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr, /*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + const char* format_name; + + ut_a(var_ptr != NULL); + ut_a(save != NULL); + + format_name = *static_cast<const char*const*>(save); + + if (format_name) { + uint format_id; + + format_id = innobase_file_format_name_lookup(format_name); + + if (format_id <= UNIV_FORMAT_MAX) { + srv_file_format = format_id; + } + } + + *static_cast<const char**>(var_ptr) + = trx_sys_file_format_id_to_name(srv_file_format); +} + +/*************************************************************//** +Check if valid argument to innodb_file_format_max. This function +is registered as a callback with MySQL. +@return 0 for valid file format */ +static +int +innodb_file_format_max_validate( +/*============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* file_format_input; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + int format_id; + + ut_a(save != NULL); + ut_a(value != NULL); + + file_format_input = value->val_str(value, buff, &len); + + if (file_format_input != NULL) { + + format_id = innobase_file_format_validate_and_set( + file_format_input); + + if (format_id >= 0) { + /* Save a pointer to the name in the + 'file_format_name_map' constant array. 
*/ + *static_cast<const char**>(save) = + trx_sys_file_format_id_to_name( + (uint) format_id); + + return(0); + + } else { + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: invalid innodb_file_format_max " + "value; can be any format up to %s " + "or equivalent id of %d", + trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX), + UNIV_FORMAT_MAX); + } + } + + *static_cast<const char**>(save) = NULL; + return(1); +} + +/****************************************************************//** +Update the system variable innodb_file_format_max using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_file_format_max_update( +/*==========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + const char* format_name_in; + const char** format_name_out; + uint format_id; + + ut_a(save != NULL); + ut_a(var_ptr != NULL); + + format_name_in = *static_cast<const char*const*>(save); + + if (!format_name_in) { + + return; + } + + format_id = innobase_file_format_name_lookup(format_name_in); + + if (format_id > UNIV_FORMAT_MAX) { + /* DEFAULT is "on", which is invalid at runtime. */ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Ignoring SET innodb_file_format=%s", + format_name_in); + return; + } + + format_name_out = static_cast<const char**>(var_ptr); + + /* Update the max format id in the system tablespace. */ + if (trx_sys_file_format_max_set(format_id, format_name_out)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " [Info] InnoDB: the file format in the system " + "tablespace is now set to %s.\n", *format_name_out); + } +} + +/*************************************************************//** +Check whether valid argument given to innobase_*_stopword_table. +This function is registered as a callback with MySQL. +@return 0 for valid stopword table */ +static +int +innodb_stopword_table_validate( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* stopword_table_name; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + trx_t* trx; + int ret = 1; + + ut_a(save != NULL); + ut_a(value != NULL); + + stopword_table_name = value->val_str(value, buff, &len); + + trx = check_trx_exists(thd); + + row_mysql_lock_data_dictionary(trx); + + /* Validate the stopword table's (if supplied) existence and + of the right format */ + if (!stopword_table_name + || fts_valid_stopword_table(stopword_table_name)) { + *static_cast<const char**>(save) = stopword_table_name; + ret = 0; + } + + row_mysql_unlock_data_dictionary(trx); + + return(ret); +} + +/*************************************************************//** +Check whether valid argument given to "innodb_fts_internal_tbl_name" +This function is registered as a callback with MySQL. 
+@return 0 for a valid table name */ +static +int +innodb_internal_table_validate( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* table_name; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + int ret = 1; + dict_table_t* user_table; + + ut_a(save != NULL); + ut_a(value != NULL); + + table_name = value->val_str(value, buff, &len); + + if (!table_name) { + *static_cast<const char**>(save) = NULL; + return(0); + } + + user_table = dict_table_open_on_name( + table_name, FALSE, TRUE, DICT_ERR_IGNORE_NONE); + + if (user_table) { + if (dict_table_has_fts_index(user_table)) { + *static_cast<const char**>(save) = table_name; + ret = 0; + } + + dict_table_close(user_table, FALSE, TRUE); + } + + return(ret); +} + +/****************************************************************//** +Update global variable "fts_internal_tbl_name" with the "saved" +table name value. This function is registered as a callback +with MySQL. */ +static +void +innodb_internal_table_update( +/*=========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + const char* table_name; + char* old; + + ut_a(save != NULL); + ut_a(var_ptr != NULL); + + table_name = *static_cast<const char*const*>(save); + old = *(char**) var_ptr; + + if (table_name) { + *(char**) var_ptr = my_strdup(table_name, MYF(0)); + } else { + *(char**) var_ptr = NULL; + } + + if (old) { + my_free(old); + } + + fts_internal_tbl_name = *(char**) var_ptr; +} + +/****************************************************************//** +Update the system variable innodb_adaptive_hash_index using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_adaptive_hash_index_update( +/*==============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + if (*(my_bool*) save) { + btr_search_enable(); + } else { + btr_search_disable(); + } +} + +/****************************************************************//** +Update the system variable innodb_cmp_per_index using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_cmp_per_index_update( +/*========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + /* Reset the stats whenever we enable the table + INFORMATION_SCHEMA.innodb_cmp_per_index. */ + if (!srv_cmp_per_index_enabled && *(my_bool*) save) { + page_zip_reset_stat_per_index(); + } + + srv_cmp_per_index_enabled = !!(*(my_bool*) save); +} + +/****************************************************************//** +Update the system variable innodb_old_blocks_pct using the "saved" +value. This function is registered as a callback with MySQL.
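One detail worth noting in innodb_internal_table_update() above is the ordering: the new string is installed in the variable before the old allocation is released. A minimal sketch of that install-then-free swap (it assumes, as the real callback does, that the server serialises sysvar update callbacks):

#include <stdlib.h>
#include <string.h>

/* Sketch: take a private copy of the incoming value, publish it,
then release the previous allocation. Concurrent updates are assumed
to be excluded by the caller. */
static void
set_string_var_sketch(char** var, const char* new_val)
{
	char*	old = *var;

	*var = new_val ? strdup(new_val) : NULL;

	free(old);	/* free(NULL) is a harmless no-op */
}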
*/ +static +void +innodb_old_blocks_pct_update( +/*=========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innobase_old_blocks_pct = static_cast<uint>( + buf_LRU_old_ratio_update( + *static_cast<const uint*>(save), TRUE)); +} + +/****************************************************************//** +Update the system variable innodb_change_buffer_max_size using the +"saved" value. This function is registered as a callback with MySQL. */ +static +void +innodb_change_buffer_max_size_update( +/*=================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innobase_change_buffer_max_size = + (*static_cast<const uint*>(save)); + ibuf_max_size_update(innobase_change_buffer_max_size); +} + +#ifdef UNIV_DEBUG +ulong srv_fil_make_page_dirty_debug = 0; +ulong srv_saved_page_number_debug = 0; + +/****************************************************************//** +Save an InnoDB page number. */ +static +void +innodb_save_page_no( +/*================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + srv_saved_page_number_debug = *static_cast<const ulong*>(save); + + ib_logf(IB_LOG_LEVEL_INFO, + "Saving InnoDB page number: %lu", + srv_saved_page_number_debug); +} + +/****************************************************************//** +Make the first page of the given user tablespace dirty. */ +static +void +innodb_make_page_dirty( +/*===================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + mtr_t mtr; + ulong space_id = *static_cast<const ulong*>(save); + + mtr_start(&mtr); + + buf_block_t* block = buf_page_get( + space_id, 0, srv_saved_page_number_debug, RW_X_LATCH, &mtr); + + if (block) { + byte* page = block->frame; + ib_logf(IB_LOG_LEVEL_INFO, + "Dirtying page:%lu of space:%lu", + page_get_page_no(page), + page_get_space_id(page)); + mlog_write_ulint(page + FIL_PAGE_TYPE, + fil_page_get_type(page), + MLOG_2BYTES, &mtr); + } + mtr_commit(&mtr); +} +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Find the corresponding ibuf_use_t value that indexes into +innobase_change_buffering_values[] array for the input +change buffering option name.
+@return corresponding IBUF_USE_* value for the input variable +name, or IBUF_USE_COUNT if not able to find a match */ +static +ibuf_use_t +innodb_find_change_buffering_value( +/*===============================*/ + const char* input_name) /*!< in: input change buffering + option name */ +{ + ulint use; + + for (use = 0; use < UT_ARR_SIZE(innobase_change_buffering_values); + use++) { + /* found a match */ + if (!innobase_strcasecmp( + input_name, innobase_change_buffering_values[use])) { + return((ibuf_use_t) use); + } + } + + /* Did not find any match */ + return(IBUF_USE_COUNT); +} + +/*************************************************************//** +Check if it is a valid value of innodb_change_buffering. This function is +registered as a callback with MySQL. +@return 0 for valid innodb_change_buffering */ +static +int +innodb_change_buffering_validate( +/*=============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* change_buffering_input; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + ut_a(save != NULL); + ut_a(value != NULL); + + change_buffering_input = value->val_str(value, buff, &len); + + if (change_buffering_input != NULL) { + ibuf_use_t use; + + use = innodb_find_change_buffering_value( + change_buffering_input); + + if (use != IBUF_USE_COUNT) { + /* Find a matching change_buffering option value. */ + *static_cast<const char**>(save) = + innobase_change_buffering_values[use]; + + return(0); + } + } + + /* No corresponding change buffering option for user supplied + "change_buffering_input" */ + return(1); +} + +/****************************************************************//** +Update the system variable innodb_change_buffering using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_change_buffering_update( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + ibuf_use_t use; + + ut_a(var_ptr != NULL); + ut_a(save != NULL); + + use = innodb_find_change_buffering_value( + *static_cast<const char*const*>(save)); + + ut_a(use < IBUF_USE_COUNT); + + ibuf_use = use; + *static_cast<const char**>(var_ptr) = + *static_cast<const char*const*>(save); +} + +/*************************************************************//** +Just emit a warning that the usage of the variable is deprecated. +@return 0 */ +static +void +innodb_stats_sample_pages_update( +/*=============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ +#define STATS_SAMPLE_PAGES_DEPRECATED_MSG \ + "Using innodb_stats_sample_pages is deprecated and " \ + "the variable may be removed in future releases. " \ + "Please use innodb_stats_transient_sample_pages " \ + "instead." 
+ + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_COMMAND, STATS_SAMPLE_PAGES_DEPRECATED_MSG); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: %s\n", + STATS_SAMPLE_PAGES_DEPRECATED_MSG); + + srv_stats_transient_sample_pages = + *static_cast<const unsigned long long*>(save); +} + +/****************************************************************//** +Update the monitor counter according to the "set_option": turn +on/off or reset the specified monitor counter. */ +static +void +innodb_monitor_set_option( +/*======================*/ + const monitor_info_t* monitor_info,/*!< in: monitor info for the monitor + to set */ + mon_option_t set_option) /*!< in: turn on/off or reset the + counter */ +{ + monitor_id_t monitor_id = monitor_info->monitor_id; + + /* If the module type is MONITOR_GROUP_MODULE, it cannot be + turned on/off individually. This function should never be + used to set options for it */ + ut_a(!(monitor_info->monitor_type & MONITOR_GROUP_MODULE)); + + switch (set_option) { + case MONITOR_TURN_ON: + MONITOR_ON(monitor_id); + MONITOR_INIT(monitor_id); + MONITOR_SET_START(monitor_id); + + /* If the monitor to be turned on uses an + existing monitor counter (status variable), + do special processing to remember the existing + counter value. */ + if (monitor_info->monitor_type + & MONITOR_EXISTING) { + srv_mon_process_existing_counter( + monitor_id, MONITOR_TURN_ON); + } + break; + + case MONITOR_TURN_OFF: + if (monitor_info->monitor_type & MONITOR_EXISTING) { + srv_mon_process_existing_counter( + monitor_id, MONITOR_TURN_OFF); + } + + MONITOR_OFF(monitor_id); + MONITOR_SET_OFF(monitor_id); + break; + + case MONITOR_RESET_VALUE: + srv_mon_reset(monitor_id); + break; + + case MONITOR_RESET_ALL_VALUE: + srv_mon_reset_all(monitor_id); + break; + + default: + ut_error; + } +} + +/****************************************************************//** +Find matching InnoDB monitor counters and update their status +according to the "set_option": turn on/off or reset the specified +monitor counter. */ +static +void +innodb_monitor_update_wildcard( +/*===========================*/ + const char* name, /*!< in: monitor name to match */ + mon_option_t set_option) /*!< in: the set option, whether + to turn on/off or reset the counter */ +{ + ut_a(name); + + for (ulint use = 0; use < NUM_MONITOR; use++) { + ulint type; + monitor_id_t monitor_id = static_cast<monitor_id_t>(use); + monitor_info_t* monitor_info; + + if (!innobase_wildcasecmp( + srv_mon_get_name(monitor_id), name)) { + monitor_info = srv_mon_get_info(monitor_id); + + type = monitor_info->monitor_type; + + /* If the monitor counter is of MONITOR_MODULE + type, skip it. Except for those also marked with + the MONITOR_GROUP_MODULE flag, which can be turned + on only as a module. */ + if (!(type & MONITOR_MODULE) + && !(type & MONITOR_GROUP_MODULE)) { + innodb_monitor_set_option(monitor_info, + set_option); + } + + /* Counters marked with MONITOR_GROUP_MODULE need + special handling: turn on the whole module if + any one of them comes here. Currently, only + "module_buf_page" is marked with MONITOR_GROUP_MODULE */ + if (type & MONITOR_GROUP_MODULE) { + if ((monitor_id >= MONITOR_MODULE_BUF_PAGE) + && (monitor_id < MONITOR_MODULE_OS)) { + if (set_option == MONITOR_TURN_ON + && MONITOR_IS_ON( + MONITOR_MODULE_BUF_PAGE)) { + continue; + } + + srv_mon_set_module_control( + MONITOR_MODULE_BUF_PAGE, + set_option); + } else { + /* If a new monitor is added with + MONITOR_GROUP_MODULE, it needs + to be added here.
*/ + ut_ad(0); + } + } + } + } +} + +/*************************************************************//** +Given a configuration variable name, find the corresponding monitor counter +and return its monitor ID if found. +@return monitor ID if found, MONITOR_NO_MATCH if there is no match */ +static +ulint +innodb_monitor_id_by_name_get( +/*==========================*/ + const char* name) /*!< in: monitor counter name */ +{ + ut_a(name); + + /* Search for the wildcard character '%' in the name; if + found, we treat it as a wildcard match. We do not search for + the single-character wildcard '_' since our monitor names already + contain such characters. To avoid confusion, we require the user + to include at least one '%' character to activate the wildcard + search. */ + if (strchr(name, '%')) { + return(MONITOR_WILDCARD_MATCH); + } + + /* Not a wildcard match; check for an exact match */ + for (ulint i = 0; i < NUM_MONITOR; i++) { + if (!innobase_strcasecmp( + name, srv_mon_get_name(static_cast<monitor_id_t>(i)))) { + return(i); + } + } + + return(MONITOR_NO_MATCH); +} +/*************************************************************//** +Validate that the passed-in monitor name matches at least one +monitor counter name with wildcard compare. +@return TRUE if at least one monitor name matches */ +static +ibool +innodb_monitor_validate_wildcard_name( +/*==================================*/ + const char* name) /*!< in: monitor counter name */ +{ + for (ulint i = 0; i < NUM_MONITOR; i++) { + if (!innobase_wildcasecmp( + srv_mon_get_name(static_cast<monitor_id_t>(i)), name)) { + return(TRUE); + } + } + + return(FALSE); +} +/*************************************************************//** +Validate the passed-in monitor name, and find and save the +corresponding monitor name in the function parameter "save". +@return 0 if the monitor name is valid */ +static +int +innodb_monitor_valid_byname( +/*========================*/ + void* save, /*!< out: immediate result + for update function */ + const char* name) /*!< in: incoming monitor name */ +{ + ulint use; + monitor_info_t* monitor_info; + + if (!name) { + return(1); + } + + use = innodb_monitor_id_by_name_get(name); + + /* No monitor name matches, nor is it a wildcard match */ + if (use == MONITOR_NO_MATCH) { + return(1); + } + + if (use < NUM_MONITOR) { + monitor_info = srv_mon_get_info((monitor_id_t) use); + + /* If the monitor counter is marked with + the MONITOR_GROUP_MODULE flag, then this counter + cannot be turned on/off individually; instead + it shall be turned on/off as a group using + its module name */ + if ((monitor_info->monitor_type & MONITOR_GROUP_MODULE) + && (!(monitor_info->monitor_type & MONITOR_MODULE))) { + sql_print_warning( + "Monitor counter '%s' cannot" + " be turned on/off individually." + " Please use its module name" + " to turn on/off the counters" + " in the module as a group.\n", + name); + + return(1); + } + + } else { + ut_a(use == MONITOR_WILDCARD_MATCH); + + /* For a wildcard match, if there is not a single monitor + counter name that matches, treat it as an invalid + value for the system configuration variables */ + if (!innodb_monitor_validate_wildcard_name(name)) { + return(1); + } + } + + /* Save the configured name for innodb_monitor_update() */ + *static_cast<const char**>(save) = name; + + return(0); +} +/*************************************************************//** +Validate that the passed-in "value" is a valid monitor counter name. +This function is registered as a callback with MySQL.
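The dispatch implemented by innodb_monitor_id_by_name_get() above reduces to one decision: a '%' anywhere in the input selects wildcard matching, otherwise an exact case-insensitive scan is made. Below is a stand-alone sketch, with the counter table and the sentinel values as stand-ins for srv_mon_get_name() and the MONITOR_* constants:

#include <string.h>
#include <strings.h>

#define SKETCH_NO_MATCH	(~0U)
#define SKETCH_WILDCARD	(~0U - 1)

/* Sketch: '%' switches to wildcard mode, otherwise look for exactly
one case-insensitive hit among the known counter names. */
static unsigned
monitor_lookup_sketch(
	const char*		name,
	const char* const*	counters,
	unsigned		n)
{
	if (strchr(name, '%')) {
		return(SKETCH_WILDCARD);
	}

	for (unsigned i = 0; i < n; i++) {
		if (!strcasecmp(name, counters[i])) {
			return(i);
		}
	}

	return(SKETCH_NO_MATCH);
}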
+@return 0 for valid name */ +static +int +innodb_monitor_validate( +/*====================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* name; + char* monitor_name; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + int ret; + + ut_a(save != NULL); + ut_a(value != NULL); + + name = value->val_str(value, buff, &len); + + /* monitor_name could point to memory from MySQL + or buff[]. Always dup the name to memory allocated + by InnoDB, so we can access it in another callback + function innodb_monitor_update() and free it appropriately */ + if (name) { + monitor_name = my_strdup(name, MYF(0)); + } else { + return(1); + } + + ret = innodb_monitor_valid_byname(save, monitor_name); + + if (ret) { + /* Validation failed */ + my_free(monitor_name); + } else { + /* monitor_name will be freed in the separate callback + function innodb_monitor_update(). Assert that "save" + points to the "monitor_name" variable */ + ut_ad(*static_cast<char**>(save) == monitor_name); + } + + return(ret); +} + +/****************************************************************//** +Update the system variable innodb_enable(disable/reset/reset_all)_monitor +according to the "set_option" and turn on/off or reset the specified +monitor counter. */ +static +void +innodb_monitor_update( +/*==================*/ + THD* thd, /*!< in: thread handle */ + void* var_ptr, /*!< out: where the + formal string goes */ + const void* save, /*!< in: immediate result + from check function */ + mon_option_t set_option, /*!< in: the set option, + whether to turn on/off or + reset the counter */ + ibool free_mem) /*!< in: whether we will + need to free the memory */ +{ + monitor_info_t* monitor_info; + ulint monitor_id; + ulint err_monitor = 0; + const char* name; + + ut_a(save != NULL); + + name = *static_cast<const char*const*>(save); + + if (!name) { + monitor_id = MONITOR_DEFAULT_START; + } else { + monitor_id = innodb_monitor_id_by_name_get(name); + + /* Double check we have a valid monitor ID */ + if (monitor_id == MONITOR_NO_MATCH) { + return; + } + } + + if (monitor_id == MONITOR_DEFAULT_START) { + /* If the user sets the variable to "default", we + print a message and make this set operation a "noop". + The check is made here because "SET ... = DEFAULT" + does not go through the validation function */ + if (thd) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_DEFAULT, + "Default value is not defined for " + "this set option. Please specify " + "correct counter or module name."); + } else { + sql_print_error( + "Default value is not defined for " + "this set option. Please specify " + "correct counter or module name.\n"); + } + + if (var_ptr) { + *(const char**) var_ptr = NULL; + } + } else if (monitor_id == MONITOR_WILDCARD_MATCH) { + innodb_monitor_update_wildcard(name, set_option); + } else { + monitor_info = srv_mon_get_info( + static_cast<monitor_id_t>(monitor_id)); + + ut_a(monitor_info); + + /* If the monitor is already turned on, someone may + already be collecting monitor data; exit and ask the + user to turn off the monitor before turning it on again.
*/ + if (set_option == MONITOR_TURN_ON + && MONITOR_IS_ON(monitor_id)) { + err_monitor = monitor_id; + goto exit; + } + + if (var_ptr) { + *(const char**) var_ptr = monitor_info->monitor_name; + } + + /* Depending on the monitor name is for a module or + a counter, process counters in the whole module or + individual counter. */ + if (monitor_info->monitor_type & MONITOR_MODULE) { + srv_mon_set_module_control( + static_cast<monitor_id_t>(monitor_id), + set_option); + } else { + innodb_monitor_set_option(monitor_info, set_option); + } + } +exit: + /* Only if we are trying to turn on a monitor that already + been turned on, we will set err_monitor. Print related + information */ + if (err_monitor) { + sql_print_warning("Monitor %s is already enabled.", + srv_mon_get_name((monitor_id_t) err_monitor)); + } + + if (free_mem && name) { + my_free((void*) name); + } + + return; +} + +#ifdef __WIN__ +/*************************************************************//** +Validate if passed-in "value" is a valid value for +innodb_buffer_pool_filename. On Windows, file names with colon (:) +are not allowed. + +@return 0 for valid name */ +static +int +innodb_srv_buf_dump_filename_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + const char* buf_name; + char buff[OS_FILE_MAX_PATH]; + int len= sizeof(buff); + + ut_a(save != NULL); + ut_a(value != NULL); + + buf_name = value->val_str(value, buff, &len); + + if (buf_name) { + if (is_filename_allowed(buf_name, len, FALSE)){ + *static_cast<const char**>(save) = buf_name; + return(0); + } else { + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: innodb_buffer_pool_filename " + "cannot have colon (:) in the file name."); + + } + } + + return(1); +} +#else /* __WIN__ */ +# define innodb_srv_buf_dump_filename_validate NULL +#endif /* __WIN__ */ + +#ifdef UNIV_DEBUG +static char* srv_buffer_pool_evict; + +/****************************************************************//** +Evict all uncompressed pages of compressed tables from the buffer pool. +Keep the compressed pages in the buffer pool. +@return whether all uncompressed pages were evicted */ +static __attribute__((warn_unused_result)) +bool +innodb_buffer_pool_evict_uncompressed(void) +/*=======================================*/ +{ + bool all_evicted = true; + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool = &buf_pool_ptr[i]; + + mutex_enter(&buf_pool->LRU_list_mutex); + + for (buf_block_t* block = UT_LIST_GET_LAST( + buf_pool->unzip_LRU); + block != NULL; ) { + buf_block_t* prev_block = UT_LIST_GET_PREV( + unzip_LRU, block); + ut_ad(buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE); + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + + mutex_enter(&block->mutex); + if (!buf_LRU_free_page(&block->page, false)) { + mutex_exit(&block->mutex); + all_evicted = false; + } else { + mutex_exit(&block->mutex); + mutex_enter(&buf_pool->LRU_list_mutex); + } + + block = prev_block; + } + + mutex_exit(&buf_pool->LRU_list_mutex); + } + + return(all_evicted); +} + +/****************************************************************//** +Called on SET GLOBAL innodb_buffer_pool_evict=... +Handles some values specially, to evict pages from the buffer pool. 
+SET GLOBAL innodb_buffer_pool_evict='uncompressed'
+evicts all uncompressed page frames of compressed tablespaces. */
+static
+void
+innodb_buffer_pool_evict_update(
+/*============================*/
+	THD*			thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*var,	/*!< in: pointer to system variable */
+	void*			var_ptr,/*!< out: ignored */
+	const void*		save)	/*!< in: immediate result
+					from check function */
+{
+	if (const char* op = *static_cast<const char*const*>(save)) {
+		if (!strcmp(op, "uncompressed")) {
+			for (uint tries = 0; tries < 10000; tries++) {
+				if (innodb_buffer_pool_evict_uncompressed()) {
+					return;
+				}
+
+				os_thread_sleep(10000);
+			}
+
+			/* We failed to evict all uncompressed pages. */
+			ut_ad(0);
+		}
+	}
+}
+#endif /* UNIV_DEBUG */
+
+/****************************************************************//**
+Update the system variable innodb_monitor_enable and enable
+specified monitor counter.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_enable_monitor_update(
+/*=========================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_ON, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_disable and turn
+off specified monitor counter. */
+static
+void
+innodb_disable_monitor_update(
+/*==========================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_OFF, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset and reset
+specified monitor counter(s).
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_monitor_update(
+/*========================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_VALUE, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset_all and reset
+all value-related monitor counters.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_all_monitor_update(
+/*============================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE,
+			      TRUE);
+}
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+User can list the monitor counters/groups to be enabled by specifying
+"loose-innodb_monitor_enable=monitor_name1;monitor_name2..."
+in the server configuration file or at the command line.
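The four callbacks above differ only in the mon_option_t they forward to innodb_monitor_update(); the interesting distinction is MONITOR_RESET_VALUE versus MONITOR_RESET_ALL_VALUE. A rough model of a resettable counter, for illustration only (the real monitor_info_t tracks more fields than this, and the exact reset semantics are simplified here):

	/* RESET starts a new counting window; RESET ALL also clears
	the lifetime extremes accumulated since the server started. */
	struct Counter {
		long	value;		/* current window */
		long	min_value;	/* lifetime minimum */
		long	max_value;	/* lifetime maximum */

		void reset()     { value = 0; }
		void reset_all() { value = min_value = max_value = 0; }
	};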
+The string separator can be ";", "," or a space. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+	char*	str)	/*!< in/out: monitor counter enable list */
+{
+	static const char*	sep = " ;,";
+	char*			last;
+
+	ut_a(str);
+
+	/* Walk through the string, separate each monitor counter
+	and/or counter group name, and call innodb_monitor_update()
+	for each name that validates successfully. Please note that
+	"str" would be changed by strtok_r() as it walks through it. */
+	for (char* option = strtok_r(str, sep, &last);
+	     option;
+	     option = strtok_r(NULL, sep, &last)) {
+		ulint	ret;
+		char*	option_name;
+
+		ret = innodb_monitor_valid_byname(&option_name, option);
+
+		/* The name is validated if ret == 0 */
+		if (!ret) {
+			innodb_monitor_update(NULL, NULL, &option,
+					      MONITOR_TURN_ON, FALSE);
+		} else {
+			sql_print_warning("Invalid monitor counter"
+					  " name: '%s'", option);
+		}
+	}
+}
+
+#ifdef UNIV_LINUX
+
+/****************************************************************//**
+Update the innodb_sched_priority_cleaner variable and set the thread
+priorities accordingly. */
+static
+void
+innodb_sched_priority_cleaner_update(
+/*=================================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	ulint	priority = *static_cast<const ulint *>(save);
+	ulint	actual_priority;
+
+	/* Set the priority for the LRU manager thread */
+	ut_ad(buf_lru_manager_is_active);
+	actual_priority = os_thread_set_priority(srv_lru_manager_tid,
+						 priority);
+	if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Failed to set the LRU manager thread "
+				    "priority to %lu, "
+				    "the current priority is %lu", priority,
+				    actual_priority);
+	} else {
+
+		srv_sched_priority_cleaner = priority;
+	}
+
+	/* Set the priority for the page cleaner thread */
+	if (srv_read_only_mode) {
+
+		return;
+	}
+
+	ut_ad(buf_page_cleaner_is_active);
+	actual_priority = os_thread_set_priority(srv_cleaner_tid, priority);
+	if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Failed to set the page cleaner thread "
+				    "priority to %lu, "
+				    "the current priority is %lu", priority,
+				    actual_priority);
+	}
+}
+
+#if defined(UNIV_DEBUG) || (UNIV_PERF_DEBUG)
+
+/****************************************************************//**
+Update the innodb_sched_priority_purge variable and set the thread
+priorities accordingly. */
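os_thread_set_priority() itself is not part of this hunk; on Linux a per-thread nice value can be applied with setpriority(2) on the kernel thread id, roughly as sketched below. This is an illustration, not the server's implementation, and the 0..39 range of these variables presumably maps onto the usual -20..19 nice scale:

	#include <sys/resource.h>
	#include <cerrno>

	/* Apply a nice value to one thread and report what actually
	took effect, mirroring the "actual_priority" checks in the
	update callbacks. On Linux, PRIO_PROCESS with a kernel thread
	id affects only that thread. */
	static int set_thread_nice(pid_t tid, int nice_val)
	{
		if (setpriority(PRIO_PROCESS, tid, nice_val) != 0) {
			/* e.g. EACCES when lowering nice without
			CAP_SYS_NICE */
		}
		errno = 0;
		return getpriority(PRIO_PROCESS, tid);
	}

	/* The current thread's kernel tid can be obtained with
	(pid_t) syscall(SYS_gettid). */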
+static
+void
+innodb_sched_priority_purge_update(
+/*===============================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	ulint	priority = *static_cast<const ulint *>(save);
+
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	ut_ad(purge_sys->state == PURGE_STATE_RUN);
+	for (ulint i = 0; i < srv_n_purge_threads; i++) {
+
+		ulint actual_priority
+			= os_thread_set_priority(srv_purge_tids[i], priority);
+		if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_ARGUMENTS,
+					    "Failed to set the purge "
+					    "thread priority to %lu, the "
+					    "current priority is %lu, "
+					    "aborting priority update",
+					    priority, actual_priority);
+			return;
+		}
+	}
+
+	srv_sched_priority_purge = priority;
+}
+
+/****************************************************************//**
+Update the innodb_sched_priority_io variable and set the thread
+priorities accordingly. */
+static
+void
+innodb_sched_priority_io_update(
+/*============================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	ulint	priority = *static_cast<const ulint *>(save);
+
+	for (ulint i = 0; i < srv_n_file_io_threads; i++) {
+
+		ulint actual_priority = os_thread_set_priority(srv_io_tids[i],
+							       priority);
+
+		if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_ARGUMENTS,
+					    "Failed to set the I/O "
+					    "thread priority to %lu, the "
+					    "current priority is %lu, "
+					    "aborting priority update",
+					    priority, actual_priority);
+			return;
+		}
+	}
+
+	srv_sched_priority_io = priority;
+}
+
+/****************************************************************//**
+Update the innodb_sched_priority_master variable and set the thread
+priorities accordingly. */
+static
+void
+innodb_sched_priority_master_update(
+/*================================*/
+	THD*	thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*	var_ptr,/*!< out: where the
+			formal string goes */
+	const void*	save)	/*!< in: immediate result
+				from check function */
+{
+	ulint	priority = *static_cast<const ulint *>(save);
+	ulint	actual_priority;
+
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	actual_priority = os_thread_set_priority(srv_master_tid, priority);
+	if (UNIV_UNLIKELY(actual_priority != priority)) {
+
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Failed to set the master thread "
+				    "priority to %lu, "
+				    "the current priority is %lu", priority,
+				    actual_priority);
+	} else {
+
+		srv_sched_priority_master = priority;
+	}
+}
+
+#endif /* defined(UNIV_DEBUG) || (UNIV_PERF_DEBUG) */
+
+#endif /* UNIV_LINUX */
+
+/****************************************************************//**
+Callback function for accessing the InnoDB variables from MySQL:
+SHOW VARIABLES. */
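Note how the purge and I/O priority updaters above commit the new value to the srv_* global only after every thread has accepted it, aborting on the first failure. The generic shape of that all-or-nothing loop (standalone sketch; apply_one() is a hypothetical stand-in for os_thread_set_priority()):

	#include <cstddef>

	/* Try to apply a setting to all workers; report success only
	if every worker accepted it, so the caller can decide whether
	to commit the new value to its global. */
	static bool apply_to_all(int (*apply_one)(size_t idx, int value),
				 size_t n_workers, int value)
	{
		for (size_t i = 0; i < n_workers; i++) {
			if (apply_one(i, value) != value) {
				return false;	/* abort the update */
			}
		}
		return true;		/* caller commits the global */
	}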
+static
+int
+show_innodb_vars(
+/*=============*/
+	THD*		thd,
+	SHOW_VAR*	var,
+	char*		buff)
+{
+	innodb_export_status();
+	var->type = SHOW_ARRAY;
+	var->value = (char*) &innodb_status_variables;
+
+	return(0);
+}
+
+/****************************************************************//**
+This function checks each index name for a table against reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+UNIV_INTERN
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+	THD*		thd,		/*!< in/out: MySQL connection */
+	const KEY*	key_info,	/*!< in: Indexes to be created */
+	ulint		num_of_keys)	/*!< in: Number of indexes to
+					be created. */
+{
+	const KEY*	key;
+	uint		key_num;	/* index number */
+
+	for (key_num = 0; key_num < num_of_keys; key_num++) {
+		key = &key_info[key_num];
+
+		if (innobase_strcasecmp(key->name,
+					innobase_index_reserve_name) == 0) {
+			/* Push warning to mysql */
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_NAME_FOR_INDEX,
+					    "Cannot create index with name "
+					    "'%s'. The name is reserved "
+					    "for the system default primary "
+					    "index.",
+					    innobase_index_reserve_name);
+
+			my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+				 innobase_index_reserve_name);
+
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/***********************************************************************
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_retrieve_ranking(
+/*============================*/
+	FT_INFO *	fts_hdl)	/*!< in: FTS handler */
+{
+	row_prebuilt_t*	ft_prebuilt;
+	fts_result_t*	result;
+
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+
+	if (ft_prebuilt->read_just_key) {
+		fts_ranking_t*	ranking =
+			rbt_value(fts_ranking_t, result->current);
+		return(ranking->rank);
+	}
+
+	/* Retrieve the ranking value for doc_id with value of
+	prebuilt->fts_doc_id */
+	return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+/***********************************************************************
+Functions for killing sessions with idle transactions */
+ibool
+innobase_thd_is_idle(
+/*=================*/
+	const void*	thd)	/*!< in: thread handle (THD*) */
+{
+#ifdef EXTENDED_FOR_KILLIDLE
+	return(thd_command((const THD*) thd) == COM_SLEEP);
+#else
+	return(FALSE);
+#endif
+}
+
+ib_int64_t
+innobase_thd_get_start_time(
+/*========================*/
+	const void*	thd)	/*!< in: thread handle (THD*) */
+{
+#ifdef EXTENDED_FOR_KILLIDLE
+	return((ib_int64_t)thd_start_time((const THD*) thd));
+#else
+	return(0); /*dummy value*/
+#endif
+}
+
+/***********************************************************************
+Free the memory for the FTS handler */
+UNIV_INTERN
+void
+innobase_fts_close_ranking(
+/*=======================*/
+	FT_INFO *	fts_hdl)
+{
+	fts_result_t*	result;
+
+	((NEW_FT_INFO*) fts_hdl)->ft_prebuilt->in_fts_query = false;
+
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	fts_query_free_result(result);
+
+	my_free((uchar*) fts_hdl);
+
+	return;
+}
+
+UNIV_INTERN
+void
+innobase_thd_kill(
+/*==============*/
+	ulong	thd_id)
+{
+#ifdef EXTENDED_FOR_KILLIDLE
+	thd_kill(thd_id);
+#else
+	return;
+#endif
+}
+
+/***********************************************************************
+Find and Retrieve the FTS Relevance
Ranking result for doc with doc_id +of prebuilt->fts_doc_id +@return the relevance ranking value */ +UNIV_INTERN +float +innobase_fts_find_ranking( +/*======================*/ + FT_INFO* fts_hdl, /*!< in: FTS handler */ + uchar* record, /*!< in: Unused */ + uint len) /*!< in: Unused */ +{ + row_prebuilt_t* ft_prebuilt; + fts_result_t* result; + + ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt; + result = ((NEW_FT_INFO*) fts_hdl)->ft_result; + + /* Retrieve the ranking value for doc_id with value of + prebuilt->fts_doc_id */ + return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id)); +} + +#ifdef UNIV_DEBUG +static my_bool innodb_purge_run_now = TRUE; +static my_bool innodb_purge_stop_now = TRUE; +static my_bool innodb_log_checkpoint_now = TRUE; +static my_bool innodb_buf_flush_list_now = TRUE; +static my_bool innodb_track_redo_log_now = TRUE; + +/****************************************************************//** +Set the purge state to RUN. If purge is disabled then it +is a no-op. This function is registered as a callback with MySQL. */ +static +void +purge_run_now_set( +/*==============*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_run(); + } +} + +/****************************************************************//** +Set the purge state to STOP. If purge is disabled then it +is a no-op. This function is registered as a callback with MySQL. */ +static +void +purge_stop_now_set( +/*===============*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_stop(); + } +} + +/****************************************************************//** +Force innodb to checkpoint. */ +static +void +checkpoint_now_set( +/*===============*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save) { + while (log_sys->last_checkpoint_lsn < log_sys->lsn) { + log_make_checkpoint_at(LSN_MAX, TRUE); + fil_flush_file_spaces(FIL_LOG); + } + fil_write_flushed_lsn_to_data_files(log_sys->lsn, 0); + fil_flush_file_spaces(FIL_TABLESPACE); + } +} + +/****************************************************************//** +Force a dirty pages flush now. 
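The debug-only *_now variables handled above are edge triggers rather than state: assigning ON fires the action once in the update callback, and the stored boolean is never read back. (Note also how checkpoint_now_set() loops until the checkpoint LSN catches up with the current LSN before flushing the data files.) The trigger shape in a standalone sketch (hypothetical names, not the plugin API):

	/* A "fire now" update callback: the requested value matters
	only as a trigger; nothing persistent changes, so the next
	SET ... = ON fires the action again. */
	static void fire_now_update(bool requested, void (*action)())
	{
		if (requested) {
			action();	/* e.g. force a log checkpoint */
		}
	}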
*/ +static +void +buf_flush_list_now_set( +/*===================*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save) { + buf_flush_list(ULINT_MAX, LSN_MAX, NULL); + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + } +} + +/****************************************************************//** +Force log tracker to track the log synchronously. */ +static +void +track_redo_log_now_set( +/*===================*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save && srv_track_changed_pages) { + + log_online_follow_redo_log(); + } +} + +#endif /* UNIV_DEBUG */ + +/*********************************************************************** +@return version of the extended FTS API */ +uint +innobase_fts_get_version() +/*======================*/ +{ + /* Currently this doesn't make much sense as returning + HA_CAN_FULLTEXT_EXT automatically mean this version is supported. + This supposed to ease future extensions. */ + return(2); +} + +/*********************************************************************** +@return Which part of the extended FTS API is supported */ +ulonglong +innobase_fts_flags() +/*================*/ +{ + return(FTS_ORDERED_RESULT | FTS_DOCID_IN_RESULT); +} + + +/*********************************************************************** +Find and Retrieve the FTS doc_id for the current result row +@return the document ID */ +ulonglong +innobase_fts_retrieve_docid( +/*========================*/ + FT_INFO_EXT * fts_hdl) /*!< in: FTS handler */ +{ + row_prebuilt_t* ft_prebuilt; + fts_result_t* result; + + ft_prebuilt = ((NEW_FT_INFO *)fts_hdl)->ft_prebuilt; + result = ((NEW_FT_INFO *)fts_hdl)->ft_result; + + if (ft_prebuilt->read_just_key) { + fts_ranking_t* ranking = + rbt_value(fts_ranking_t, result->current); + return(ranking->doc_id); + } + + return(ft_prebuilt->fts_doc_id); +} + + +ulong +innobase_thd_get_thread_id( +/*=======================*/ + const void* thd) +{ + return(thd_get_thread_id((const THD*) thd)); +} + + + +/*********************************************************************** +Find and retrieve the size of the current result +@return number of matching rows */ +ulonglong +innobase_fts_count_matches( +/*=======================*/ + FT_INFO_EXT* fts_hdl) /*!< in: FTS handler */ +{ + NEW_FT_INFO* handle = (NEW_FT_INFO *) fts_hdl; + + if (handle->ft_result->rankings_by_id != 0) { + return rbt_size(handle->ft_result->rankings_by_id); + } else { + return(0); + } +} + +/* These variables are never read by InnoDB or changed. 
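The FTS accessors above read per-document rankings out of a red-black tree (the rbt_* calls) keyed by document ID. An equivalent lookup written against std::map, standing in for the rbt API (illustration only, not InnoDB code):

	#include <cstdint>
	#include <map>

	struct Ranking {
		uint64_t	doc_id;
		float		rank;
	};

	/* Stand-in for fts_retrieve_ranking(): 0.0 when the document
	is not part of the result set. */
	static float retrieve_ranking(
		const std::map<uint64_t, Ranking>&	by_id,
		uint64_t				doc_id)
	{
		auto it = by_id.find(doc_id);
		return it == by_id.end() ? 0.0f : it->second.rank;
	}

	/* Stand-in for innobase_fts_count_matches(): the tree size,
	as rbt_size() returns for rankings_by_id. */
	static uint64_t count_matches(
		const std::map<uint64_t, Ranking>&	by_id)
	{
		return by_id.size();
	}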
They are a kind of +dummies that are needed by the MySQL infrastructure to call +buffer_pool_dump_now(), buffer_pool_load_now() and buffer_pool_load_abort() +by the user by doing: + SET GLOBAL innodb_buffer_pool_dump_now=ON; + SET GLOBAL innodb_buffer_pool_load_now=ON; + SET GLOBAL innodb_buffer_pool_load_abort=ON; +Their values are read by MySQL and displayed to the user when the variables +are queried, e.g.: + SELECT @@innodb_buffer_pool_dump_now; + SELECT @@innodb_buffer_pool_load_now; + SELECT @@innodb_buffer_pool_load_abort; */ +static my_bool innodb_buffer_pool_dump_now = FALSE; +static my_bool innodb_buffer_pool_load_now = FALSE; +static my_bool innodb_buffer_pool_load_abort = FALSE; + +/****************************************************************//** +Trigger a dump of the buffer pool if innodb_buffer_pool_dump_now is set +to ON. This function is registered as a callback with MySQL. */ +static +void +buffer_pool_dump_now( +/*=================*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save && !srv_read_only_mode) { + buf_dump_start(); + } +} + +/****************************************************************//** +Trigger a load of the buffer pool if innodb_buffer_pool_load_now is set +to ON. This function is registered as a callback with MySQL. */ +static +void +buffer_pool_load_now( +/*=================*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save) { + buf_load_start(); + } +} + +/****************************************************************//** +Abort a load of the buffer pool if innodb_buffer_pool_load_abort +is set to ON. This function is registered as a callback with MySQL. */ +static +void +buffer_pool_load_abort( +/*===================*/ + THD* thd /*!< in: thread handle */ + __attribute__((unused)), + struct st_mysql_sys_var* var /*!< in: pointer to system + variable */ + __attribute__((unused)), + void* var_ptr /*!< out: where the formal + string goes */ + __attribute__((unused)), + const void* save) /*!< in: immediate result from + check function */ +{ + if (*(my_bool*) save) { + buf_load_abort(); + } +} + +/** Update innodb_status_output or innodb_status_output_locks, +which control InnoDB "status monitor" output to the error log. +@param[in] thd thread handle +@param[in] var system variable +@param[out] var_ptr current value +@param[in] save to-be-assigned value */ +static +void +innodb_status_output_update( + THD* thd __attribute__((unused)), + struct st_mysql_sys_var* var __attribute__((unused)), + void* var_ptr __attribute__((unused)), + const void* save __attribute__((unused))) +{ + *static_cast<my_bool*>(var_ptr) = *static_cast<const my_bool*>(save); + /* The lock timeout monitor thread also takes care of this + output. 
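innodb_status_output_update() here publishes the new flag and then, as the next statement shows, signals the lock-timeout thread's event with os_event_set() so the monitor notices the change without waiting for the next timeout tick. The same publish-then-wake shape with standard C++ primitives (a sketch; os_event_set() is InnoDB's own event API):

	#include <condition_variable>
	#include <mutex>

	static std::mutex		mon_mutex;
	static std::condition_variable	mon_event;
	static bool			status_output = false;

	/* Publish the flag under the lock, then wake the sleeper;
	notify_one() plays the role of os_event_set(). */
	static void status_output_update(bool value)
	{
		{
			std::lock_guard<std::mutex> lock(mon_mutex);
			status_output = value;
		}
		mon_event.notify_one();
	}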
 */
+	os_event_set(lock_sys->timeout_event);
+}
+
+static SHOW_VAR innodb_status_variables_export[]= {
+	{"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
+	{NullS, NullS, SHOW_LONG}
+};
+
+static struct st_mysql_storage_engine innobase_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/* plugin options */
+
+static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
+  PLUGIN_VAR_RQCMDARG,
+  "The algorithm InnoDB uses for page checksumming. Possible values are "
+  "CRC32 (hardware accelerated if the CPU supports it) "
+    "write crc32, allow any of the other checksums to match when reading; "
+  "STRICT_CRC32 "
+    "write crc32, do not allow other algorithms to match when reading; "
+  "INNODB "
+    "write a software calculated checksum, allow any other checksums "
+    "to match when reading; "
+  "STRICT_INNODB "
+    "write a software calculated checksum, do not allow other algorithms "
+    "to match when reading; "
+  "NONE "
+    "write a constant magic number, do not do any checksum verification "
+    "when reading (same as innodb_checksums=OFF); "
+  "STRICT_NONE "
+    "write a constant magic number, do not allow values other than that "
+    "magic number when reading; "
+  "Files updated when this option is set to crc32 or strict_crc32 will "
+  "not be readable by MySQL versions older than 5.6.3",
+  NULL, NULL, SRV_CHECKSUM_ALGORITHM_INNODB,
+  &innodb_checksum_algorithm_typelib);
+
+
+static MYSQL_SYSVAR_ENUM(log_checksum_algorithm, srv_log_checksum_algorithm,
+  PLUGIN_VAR_RQCMDARG,
+  "The algorithm InnoDB uses for log block checksums. Possible values are "
+  "CRC32 (hardware accelerated if the CPU supports it) "
+    "write crc32, allow any of the other checksums to match when reading; "
+  "STRICT_CRC32 "
+    "write crc32, do not allow other algorithms to match when reading; "
+  "INNODB "
+    "write a software calculated checksum, allow any other checksums "
+    "to match when reading; "
+  "STRICT_INNODB "
+    "write a software calculated checksum, do not allow other algorithms "
+    "to match when reading; "
+  "NONE "
+    "write a constant magic number, do not do any checksum verification "
+    "when reading (same as innodb_checksums=OFF); "
+  "STRICT_NONE "
+    "write a constant magic number, do not allow values other than that "
+    "magic number when reading; "
+  "Logs created when this option is set to crc32/strict_crc32/none/strict_none "
+  "will not be readable by any MySQL version or Percona Server versions that do "
+  "not support this feature",
+  NULL, innodb_log_checksum_algorithm_update, SRV_CHECKSUM_ALGORITHM_INNODB,
+  &innodb_checksum_algorithm_typelib);
+
+
+static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. Use innodb_checksum_algorithm=NONE instead of setting "
+  "this to OFF. "
+  "Enable InnoDB checksums validation (enabled by default). "
+  "Disable with --skip-innodb-checksums.",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(log_block_size, innobase_log_block_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "###EXPERIMENTAL###: The log block size of the transaction log file. Changing it for already created log files is not supported. Use at your own risk!",
+  NULL, NULL, (1 << 9)/*512*/, OS_MIN_LOG_BLOCK_SIZE,
+  (1 << UNIV_PAGE_SIZE_SHIFT_MAX), 0);
+
+static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
+  PLUGIN_VAR_READONLY,
+  "The common part for InnoDB table spaces.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable InnoDB doublewrite buffer (enabled by default). "
+  "Disable with --skip-innodb-doublewrite.",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(use_atomic_writes, innobase_use_atomic_writes,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Prevent partial page writes, via atomic writes (beta). "
+  "The option is used to prevent partial writes in case of a crash/poweroff, "
+  "as a faster alternative to the doublewrite buffer. "
+  "Currently this option works "
+  "only on Linux with a FusionIO device and the DirectFS filesystem.",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
+  PLUGIN_VAR_RQCMDARG,
+  "Number of IOPs the server can do. Tunes the background IO rate",
+  NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity,
+  PLUGIN_VAR_RQCMDARG,
+  "Limit to which innodb_io_capacity can be inflated.",
+  NULL, innodb_io_capacity_max_update,
+  SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100,
+  SRV_MAX_IO_CAPACITY_LIMIT, 0);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_BOOL(purge_run_now, innodb_purge_run_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Set purge state to RUN",
+  NULL, purge_run_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(purge_stop_now, innodb_purge_stop_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Set purge state to STOP",
+  NULL, purge_stop_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Force checkpoint now",
+  NULL, checkpoint_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buf_flush_list_now, innodb_buf_flush_list_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Force dirty page flush now",
+  NULL, buf_flush_list_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(track_redo_log_now,
+  innodb_track_redo_log_now,
+  PLUGIN_VAR_OPCMDARG,
+  "Force log tracker to catch up with checkpoint now",
+  NULL, track_redo_log_now_set, FALSE);
+
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
+  PLUGIN_VAR_OPCMDARG,
+  "Number of UNDO log pages to purge in one batch from the history list.",
+  NULL, NULL,
+  300,			/* Default setting */
+  1,			/* Minimum value */
+  5000, 0);		/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Purge threads can be from 1 to 32. Default is 1.",
+  NULL, NULL,
+  1,			/* Default setting */
+  1,			/* Minimum value */
+  SRV_MAX_N_PURGE_THREADS, 0);		/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Size of the mutex/lock wait array.",
+  NULL, NULL,
+  1,			/* Default setting */
+  1,			/* Minimum value */
+  1024, 0);		/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
+  PLUGIN_VAR_OPCMDARG,
+  "Speeds up the shutdown process of the InnoDB storage engine.
Possible " + "values are 0, 1 (faster) or 2 (fastest - crash-like).", + NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table, + PLUGIN_VAR_NOCMDARG, + "Stores each InnoDB table to an .ibd file in the database dir.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name, + PLUGIN_VAR_RQCMDARG, + "File format to use for new tables in .ibd files.", + innodb_file_format_name_validate, + innodb_file_format_name_update, "Antelope"); + +/* "innobase_file_format_check" decides whether we would continue +booting the server if the file format stamped on the system +table space exceeds the maximum file format supported +by the server. Can be set during server startup at command +line or configure file, and a read only variable after +server startup */ +static MYSQL_SYSVAR_BOOL(file_format_check, innobase_file_format_check, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Whether to perform system file format check.", + NULL, NULL, TRUE); + +/* If a new file format is introduced, the file format +name needs to be updated accordingly. Please refer to +file_format_name_map[] defined in trx0sys.cc for the next +file format name. */ +static MYSQL_SYSVAR_STR(file_format_max, innobase_file_format_max, + PLUGIN_VAR_OPCMDARG, + "The highest file format in the tablespace.", + innodb_file_format_max_validate, + innodb_file_format_max_update, "Antelope"); + +static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC, + "The user supplied stopword table name.", + innodb_stopword_table_validate, + NULL, + NULL); + +static MYSQL_SYSVAR_UINT(flush_log_at_timeout, srv_flush_log_at_timeout, + PLUGIN_VAR_OPCMDARG, + "Write and flush logs every (n) second.", + NULL, NULL, 1, 0, 2700, 0); + +/* Changed to the THDVAR */ +//static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, +// PLUGIN_VAR_OPCMDARG, +// "Set to 0 (write and flush once per second)," +// " 1 (write and flush at each commit)" +// " or 2 (write at commit, flush once per second).", +// NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(use_global_flush_log_at_trx_commit, srv_use_global_flush_log_at_trx_commit, + PLUGIN_VAR_NOCMDARG, + "Use global innodb_flush_log_at_trx_commit value. (default: ON).", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "With which method to flush data.", NULL, NULL, NULL); + +static MYSQL_SYSVAR_BOOL(large_prefix, innobase_large_prefix, + PLUGIN_VAR_NOCMDARG, + "Support large index prefix length of REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Force InnoDB to load metadata of corrupted table.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "DEPRECATED. This option may be removed in future releases. " + "Please use READ COMMITTED transaction isolation level instead. 
" + "Force InnoDB to not use next-key locking, to use only row-level locking.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(show_verbose_locks, srv_show_verbose_locks, + PLUGIN_VAR_OPCMDARG, + "Whether to show records locked in SHOW INNODB STATUS.", + NULL, NULL, 0, 0, 1, 0); + +static MYSQL_SYSVAR_ULONG(show_locks_held, srv_show_locks_held, + PLUGIN_VAR_RQCMDARG, + "Number of locks held to print for each InnoDB transaction in SHOW INNODB STATUS.", + NULL, NULL, 10, 0, 1000, 0); + +#ifdef UNIV_LOG_ARCHIVE +static MYSQL_SYSVAR_STR(log_arch_dir, innobase_log_arch_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Where full logs should be archived.", NULL, NULL, NULL); + +static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive, + PLUGIN_VAR_OPCMDARG, + "Set to 1 if you want to have logs archived.", + NULL, innodb_log_archive_update, FALSE); +#endif /* UNIV_LOG_ARCHIVE */ + +static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to InnoDB log files.", NULL, NULL, NULL); + +static MYSQL_SYSVAR_ULONG(log_arch_expire_sec, + srv_log_arch_expire_sec, PLUGIN_VAR_OPCMDARG, + "Expiration time for archived innodb transaction logs.", + NULL, innodb_log_archive_expire_update, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages allowed in bufferpool.", + NULL, innodb_max_dirty_pages_pct_update, 75, 0, 99, 0); + +static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct_lwm, + srv_max_dirty_pages_pct_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages at which flushing kicks in.", + NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99, 0); + +static MYSQL_SYSVAR_ULONG(adaptive_flushing_lwm, + srv_adaptive_flushing_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of log capacity below which no adaptive flushing happens.", + NULL, NULL, 10, 0, 70, 0); + +static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing, + PLUGIN_VAR_NOCMDARG, + "Attempt flushing dirty pages to avoid IO bursts at checkpoints.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONG(flushing_avg_loops, + srv_flushing_avg_loops, + PLUGIN_VAR_RQCMDARG, + "Number of iterations over which the background flushing is averaged.", + NULL, NULL, 30, 1, 1000, 0); + +static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag, + PLUGIN_VAR_RQCMDARG, + "Desired maximum length of the purge queue (0 = no limit)", + NULL, NULL, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay, + PLUGIN_VAR_RQCMDARG, + "Maximum delay of user threads in micro-seconds", + NULL, NULL, + 0L, /* Default seting */ + 0L, /* Minimum value */ + 10000000UL, 0); /* Maximum value */ + +static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR, + "Enable SHOW ENGINE INNODB STATUS output in the innodb_status.<pid> file", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata, + PLUGIN_VAR_OPCMDARG, + "Enable statistics gathering for metadata commands such as " + "SHOW TABLE STATUS for tables that use transient statistics (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, 
srv_stats_transient_sample_pages, + PLUGIN_VAR_RQCMDARG, + "Deprecated, use innodb_stats_transient_sample_pages instead", + NULL, innodb_stats_sample_pages_update, 8, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages, + srv_stats_transient_sample_pages, + PLUGIN_VAR_RQCMDARG, + "The number of leaf index pages to sample when calculating transient " + "statistics (if persistent statistics are not used, default 8)", + NULL, NULL, 8, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent, + PLUGIN_VAR_OPCMDARG, + "InnoDB persistent statistics enabled for all tables unless overridden " + "at table level", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(stats_auto_recalc, srv_stats_auto_recalc, + PLUGIN_VAR_OPCMDARG, + "InnoDB automatic recalculation of persistent statistics enabled for all " + "tables unless overridden at table level (automatic recalculation is only " + "done when InnoDB decides that the table has changed too much and needs a " + "new statistics)", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages, + srv_stats_persistent_sample_pages, + PLUGIN_VAR_RQCMDARG, + "The number of leaf index pages to sample when calculating persistent " + "statistics (by ANALYZE, default 20)", + NULL, NULL, 20, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled, + PLUGIN_VAR_OPCMDARG, + "Enable InnoDB adaptive hash index (enabled by default). " + "Disable with --skip-innodb-adaptive-hash-index.", + NULL, innodb_adaptive_hash_index_update, TRUE); + +/* btr_search_index_num is constrained to machine word size for historical +reasons. This limitation can be easily removed later. */ +static MYSQL_SYSVAR_ULONG(adaptive_hash_index_partitions, btr_search_index_num, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of InnoDB adaptive hash index partitions (default 1: disable " + "partitioning)", + NULL, NULL, 1, 1, sizeof(ulint) * 8, 0); + +static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, + PLUGIN_VAR_RQCMDARG, + "Replication thread delay (ms) on the slave server if " + "innodb_thread_concurrency is reached (0 by default)", + NULL, NULL, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, + PLUGIN_VAR_RQCMDARG, + "Compression level used for compressed row format. 0 is no compression" + ", 1 is fastest, 9 is best compression and default is 6.", + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); + +static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages, + PLUGIN_VAR_OPCMDARG, + "Enables/disables the logging of entire compressed page images." + " InnoDB logs the compressed pages to prevent corruption if" + " the zlib compression algorithm changes." + " When turned OFF, InnoDB will assume that the zlib" + " compression algorithm doesn't change.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "DEPRECATED. This option may be removed in future releases, " + "together with the option innodb_use_sys_malloc and with the InnoDB's " + "internal memory allocator. 
" + "Size of a memory pool InnoDB uses to store data dictionary information and other internal data structures.", + NULL, NULL, 8*1024*1024L, 512*1024L, LONG_MAX, 1024); + +static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment, + PLUGIN_VAR_RQCMDARG, + "Data file autoextend increment in megabytes", + NULL, NULL, 64L, 1L, 1000L, 0); + +static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", + NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L); + +static MYSQL_SYSVAR_BOOL(buffer_pool_populate, srv_buf_pool_populate, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Preallocate (pre-fault) the page frames required for the mapping " + "established by the buffer pool memory region. Disabled by default.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ENUM(foreground_preflush, srv_foreground_preflush, + PLUGIN_VAR_OPCMDARG, + "The algorithm InnoDB uses for the query threads at sync preflush. " + "Possible values are " + "SYNC_PREFLUSH: perform a sync preflush as Oracle MySQL; " + "EXPONENTIAL_BACKOFF: (default) wait for the page cleaner flush.", + NULL, NULL, SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF, + &innodb_foreground_preflush_typelib); + +#ifdef UNIV_LINUX + +static MYSQL_SYSVAR_ULONG(sched_priority_cleaner, srv_sched_priority_cleaner, + PLUGIN_VAR_RQCMDARG, + "Nice value for the cleaner and LRU manager thread scheduling", + NULL, innodb_sched_priority_cleaner_update, 19, 0, 39, 0); + +#endif /* UNIV_LINUX */ + +#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG +static MYSQL_SYSVAR_ULONG(page_hash_locks, srv_n_page_hash_locks, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Number of rw_locks protecting buffer pool page_hash. 
Rounded up to the next power of 2",
+  NULL, NULL, 16, 1, MAX_PAGE_HASH_LOCKS, 0);
+
+static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Number of pages reserved in doublewrite buffer for batch flushing",
+  NULL, NULL, 120, 1, 127, 0);
+
+#ifdef UNIV_LINUX
+
+static MYSQL_SYSVAR_ULONG(sched_priority_purge, srv_sched_priority_purge,
+  PLUGIN_VAR_RQCMDARG,
+  "Nice value for the purge thread scheduling",
+  NULL, innodb_sched_priority_purge_update, 19, 0, 39, 0);
+
+static MYSQL_SYSVAR_ULONG(sched_priority_io, srv_sched_priority_io,
+  PLUGIN_VAR_RQCMDARG,
+  "Nice value for the I/O handler thread scheduling",
+  NULL, innodb_sched_priority_io_update, 19, 0, 39, 0);
+
+static MYSQL_SYSVAR_ULONG(sched_priority_master, srv_sched_priority_master,
+  PLUGIN_VAR_RQCMDARG,
+  "Nice value for the master thread scheduling",
+  NULL, innodb_sched_priority_master_update, 19, 0, 39, 0);
+
+static MYSQL_SYSVAR_BOOL(priority_purge, srv_purge_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make purge coordinator and worker threads acquire shared resources with "
+  "priority", NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(priority_io, srv_io_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make I/O threads acquire shared resources with priority",
+   NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(priority_cleaner, srv_cleaner_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make buffer pool cleaner and LRU manager threads acquire shared resources "
+  "with priority",
+   NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(priority_master, srv_master_thread_priority,
+  PLUGIN_VAR_OPCMDARG,
+  "Make buffer pool cleaner thread acquire shared resources with priority",
+   NULL, NULL, FALSE);
+
+#endif /* UNIV_LINUX */
+
+static MYSQL_SYSVAR_ULONG(cleaner_max_lru_time, srv_cleaner_max_lru_time,
+  PLUGIN_VAR_RQCMDARG,
+  "The maximum time limit for a single LRU tail flush iteration by the page "
+  "cleaner thread in milliseconds",
+  NULL, NULL, 1000, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(cleaner_max_flush_time, srv_cleaner_max_flush_time,
+  PLUGIN_VAR_RQCMDARG,
+  "The maximum time limit for a single flush list flush iteration by the page "
+  "cleaner thread in milliseconds",
+  NULL, NULL, 1000, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(cleaner_flush_chunk_size,
+  srv_cleaner_flush_chunk_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Divide page cleaner flush list flush batches into chunks of this size",
+  NULL, NULL, 100, 1, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(cleaner_lru_chunk_size,
+  srv_cleaner_lru_chunk_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Divide page cleaner LRU list flush batches into chunks of this size",
+  NULL, NULL, 100, 1, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(cleaner_free_list_lwm, srv_cleaner_free_list_lwm,
+  PLUGIN_VAR_RQCMDARG,
+  "Page cleaner will keep on flushing the same buffer pool instance if its "
+  "free list length is below this percentage of innodb_lru_scan_depth",
+  NULL, NULL, 10, 0, 100, 0);
+
+static MYSQL_SYSVAR_BOOL(cleaner_eviction_factor, srv_cleaner_eviction_factor,
+  PLUGIN_VAR_OPCMDARG,
+  "Make page cleaner LRU flushes use evicted instead of flushed page counts "
+  "for its heuristics",
+  NULL, NULL, FALSE);
+
+#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+
+static MYSQL_SYSVAR_ENUM(cleaner_lsn_age_factor,
+  srv_cleaner_lsn_age_factor,
+  PLUGIN_VAR_OPCMDARG,
+  "The formula for LSN age factor for page cleaner adaptive flushing. "
+  "LEGACY: Original Oracle MySQL 5.6 formula. 
" + "HIGH_CHECKPOINT: (the default) Percona Server 5.6 formula.", + NULL, NULL, SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT, + &innodb_cleaner_lsn_age_factor_typelib); + +static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm, + srv_empty_free_list_algorithm, + PLUGIN_VAR_OPCMDARG, + "The algorithm to use for empty free list handling. Allowed values: " + "LEGACY: Original Oracle MySQL 5.6 handling with single page flushes; " + "BACKOFF: (default) Wait until cleaner produces a free page.", + NULL, NULL, SRV_EMPTY_FREE_LIST_BACKOFF, + &innodb_empty_free_list_algorithm_typelib); + +static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", + NULL, NULL, 0L, 0L, MAX_BUFFER_POOLS, 1L); + +static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "Filename to/from which to dump/load the InnoDB buffer pool", + innodb_srv_buf_dump_filename_validate, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT); + +static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now, + PLUGIN_VAR_RQCMDARG, + "Trigger an immediate dump of the buffer pool into a file named @@innodb_buffer_pool_filename", + NULL, buffer_pool_dump_now, FALSE); + +static MYSQL_SYSVAR_BOOL(buffer_pool_dump_at_shutdown, srv_buffer_pool_dump_at_shutdown, + PLUGIN_VAR_RQCMDARG, + "Dump the buffer pool into a file named @@innodb_buffer_pool_filename", + NULL, NULL, FALSE); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict, + PLUGIN_VAR_RQCMDARG, + "Evict pages from the buffer pool", + NULL, innodb_buffer_pool_evict_update, ""); +#endif /* UNIV_DEBUG */ + +static MYSQL_SYSVAR_BOOL(buffer_pool_load_now, innodb_buffer_pool_load_now, + PLUGIN_VAR_RQCMDARG, + "Trigger an immediate load of the buffer pool from a file named @@innodb_buffer_pool_filename", + NULL, buffer_pool_load_now, FALSE); + +static MYSQL_SYSVAR_BOOL(buffer_pool_load_abort, innodb_buffer_pool_load_abort, + PLUGIN_VAR_RQCMDARG, + "Abort a currently running load of the buffer pool", + NULL, buffer_pool_load_abort, FALSE); + +/* there is no point in changing this during runtime, thus readonly */ +static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_startup, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Load the buffer pool from a file named @@innodb_buffer_pool_filename", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, + PLUGIN_VAR_RQCMDARG, + "How deep to scan LRU to keep it clean", + NULL, NULL, 1024, 100, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors, + PLUGIN_VAR_OPCMDARG, + "Set to 0 (don't flush neighbors from buffer pool)," + " 1 (flush contiguous neighbors from buffer pool)" + " or 2 (flush neighbors from buffer pool)," + " when flushing a block", + NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency, + PLUGIN_VAR_RQCMDARG, + "Helps in performance tuning in heavily concurrent environments.", + innobase_commit_concurrency_validate, NULL, 0, 0, 1000, 0); + +static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter, + PLUGIN_VAR_RQCMDARG, + "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket", + NULL, NULL, 5000L, 1L, ~0UL, 0); + +static MYSQL_SYSVAR_LONG(kill_idle_transaction, srv_kill_idle_transaction, + 
PLUGIN_VAR_RQCMDARG, +#ifdef EXTENDED_FOR_KILLIDLE + "If non-zero value, the idle session with transaction which is idle over the value in seconds is killed by InnoDB.", +#else + "No effect for this build.", +#endif + NULL, NULL, 0, 0, LONG_MAX, 0); + +static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR, + "Number of file I/O threads in InnoDB.", + NULL, NULL, 4, 4, 64, 0); + +static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print, + PLUGIN_VAR_OPCMDARG, + "Whether to enable additional FTS diagnostic printout ", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache, + PLUGIN_VAR_OPCMDARG, + "Whether to disable OS system file cache for sort I/O", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name, + PLUGIN_VAR_NOCMDARG, + "FTS internal auxiliary table to be checked", + innodb_internal_table_validate, + innodb_internal_table_update, NULL); + +static MYSQL_SYSVAR_ULONG(ft_cache_size, fts_max_cache_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search cache size in bytes", + NULL, NULL, 8000000, 1600000, 80000000, 0); + +static MYSQL_SYSVAR_ULONG(ft_total_cache_size, fts_max_total_cache_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Total memory allocated for InnoDB Fulltext Search cache", + NULL, NULL, 640000000, 32000000, 1600000000, 0); + +static MYSQL_SYSVAR_ULONG(ft_result_cache_limit, fts_result_cache_limit, + PLUGIN_VAR_RQCMDARG, + "InnoDB Fulltext search query result cache limit in bytes", + NULL, NULL, 2000000000L, 1000000L, 4294967295UL, 0); + +static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search minimum token size in characters", + NULL, NULL, 3, 0, 16, 0); + +static MYSQL_SYSVAR_ULONG(ft_max_token_size, fts_max_token_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search maximum token size in characters", + NULL, NULL, FTS_MAX_WORD_LEN_IN_CHAR, 10, FTS_MAX_WORD_LEN_IN_CHAR, 0); + + +static MYSQL_SYSVAR_ULONG(ft_num_word_optimize, fts_num_word_optimize, + PLUGIN_VAR_OPCMDARG, + "InnoDB Fulltext search number of words to optimize for each optimize table call ", + NULL, NULL, 2000, 1000, 10000, 0); + +static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number", + NULL, NULL, 2, 1, 16, 0); + +static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Memory buffer size for index creation", + NULL, NULL, 1048576, 65536, 64<<20, 0); + +static MYSQL_SYSVAR_ULONGLONG(online_alter_log_max_size, srv_online_max_size, + PLUGIN_VAR_RQCMDARG, + "Maximum modification log file size for online index creation", + NULL, NULL, 128<<20, 65536, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only, + PLUGIN_VAR_NOCMDARG, + "Only optimize the Fulltext index of the table", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of background read I/O threads in InnoDB.", + NULL, NULL, 4, 1, 64, 0); + +static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of background write I/O threads in InnoDB.", + 
NULL, NULL, 4, 1, 64, 0); + +static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Helps to save your data in case the disk image of the database becomes corrupt.", + NULL, NULL, 0, 0, 6, 0); + +#ifndef DBUG_OFF +static MYSQL_SYSVAR_ULONG(force_recovery_crash, srv_force_recovery_crash, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Kills the server during crash recovery.", + NULL, NULL, 0, 0, 10, 0); +#endif /* !DBUG_OFF */ + +static MYSQL_SYSVAR_ULONG(page_size, srv_page_size, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Page size to use for all InnoDB tablespaces.", + NULL, NULL, UNIV_PAGE_SIZE_DEF, + UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0); + +static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The size of the buffer which InnoDB uses to write log to the log files on disk.", + NULL, NULL, 8*1024*1024L, 256*1024L, LONG_MAX, 1024); + +static MYSQL_SYSVAR_LONGLONG(log_file_size, innobase_log_file_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Size of each log file in a log group.", + NULL, NULL, 48*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L); + +static MYSQL_SYSVAR_ULONG(log_files_in_group, srv_n_log_files, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of log files in the log group. InnoDB writes to the files in a circular fashion.", + NULL, NULL, 2, 2, SRV_N_LOG_FILES_MAX, 0); + +/* Note that the default and minimum values are set to 0 to +detect if the option is passed and print deprecation message */ +static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of identical copies of log groups we keep for the database. Currently this should be set to 1.", + NULL, NULL, 0, 0, 10, 0); + +static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of the buffer pool to reserve for 'old' blocks.", + NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0); + +static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms, + PLUGIN_VAR_RQCMDARG, + "Move blocks to the 'new' end of the buffer pool if the first access" + " was at least this many milliseconds ago." + " The timeout is disabled if 0.", + NULL, NULL, 1000, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_LONG(open_files, innobase_open_files, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "How many files at the maximum InnoDB keeps open at the same time.", + NULL, NULL, 0L, 0L, LONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds, + PLUGIN_VAR_RQCMDARG, + "Count of spin-loop rounds in InnoDB mutexes (30 by default)", + NULL, NULL, 30L, 0L, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(spin_wait_delay, srv_spin_wait_delay, + PLUGIN_VAR_OPCMDARG, + "Maximum delay between polling for a spin lock (6 by default)", + NULL, NULL, 6L, 0L, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency, + PLUGIN_VAR_RQCMDARG, + "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.", + NULL, NULL, 0, 0, 1000, 0); + +#ifdef HAVE_ATOMIC_BUILTINS +static MYSQL_SYSVAR_ULONG( + adaptive_max_sleep_delay, srv_adaptive_max_sleep_delay, + PLUGIN_VAR_RQCMDARG, + "The upper limit of the sleep delay in usec. 
Value of 0 disables it.",
+  NULL, NULL,
+  150000,		/* Default setting */
+  0,			/* Minimum value */
+  1000000, 0);		/* Maximum value */
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
+  PLUGIN_VAR_RQCMDARG,
+  "Time of InnoDB thread sleeping before joining InnoDB queue (usec). "
+  "Value 0 disables the sleep",
+  NULL, NULL,
+  10000L,
+  0L,
+  1000000L, 0);
+
+static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Path to individual files and their sizes.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Directory where undo tablespace files live, this path can be absolute.",
+  NULL, NULL, ".");
+
+static MYSQL_SYSVAR_ULONG(undo_tablespaces, srv_undo_tablespaces,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of undo tablespaces to use.",
+  NULL, NULL,
+  0L,			/* Default setting */
+  0L,			/* Minimum value */
+  126L, 0);		/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(undo_logs, srv_undo_logs,
+  PLUGIN_VAR_OPCMDARG,
+  "Number of undo logs to use.",
+  NULL, NULL,
+  TRX_SYS_N_RSEGS,	/* Default setting */
+  1,			/* Minimum value */
+  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+
+/* Alias for innodb_undo_logs, this config variable is deprecated. */
+static MYSQL_SYSVAR_ULONG(rollback_segments, srv_undo_logs,
+  PLUGIN_VAR_OPCMDARG,
+  "Number of undo logs to use (deprecated).",
+  NULL, NULL,
+  TRX_SYS_N_RSEGS,	/* Default setting */
+  1,			/* Minimum value */
+  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+
+static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "The AUTOINC lock modes supported by InnoDB: "
+  "0 => Old style AUTOINC locking (for backward"
+  " compatibility) "
+  "1 => New style AUTOINC locking "
+  "2 => No AUTOINC locking (unsafe for SBR)",
+  NULL, NULL,
+  AUTOINC_NEW_STYLE_LOCKING,	/* Default setting */
+  AUTOINC_OLD_STYLE_LOCKING,	/* Minimum value */
+  AUTOINC_NO_LOCKING, 0);	/* Maximum value */
+
+static MYSQL_SYSVAR_STR(version, innodb_version_str,
+  PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
+  "Percona-InnoDB-plugin version", NULL, NULL, INNODB_VERSION_STR);
+
+static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. This option may be removed in future releases, "
+  "together with InnoDB's internal memory allocator. "
+  "Use OS memory allocator instead of InnoDB's internal memory allocator",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Use native AIO if supported on this platform.",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(api_enable_binlog, ib_binlog_enabled,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable binlog for applications that access InnoDB directly through the InnoDB APIs",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(api_enable_mdl, ib_mdl_enabled,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable MDL for applications that access InnoDB directly through the InnoDB APIs",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(api_disable_rowlock, ib_disable_row_lock,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Disable row locks when accessing InnoDB directly through the InnoDB APIs",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(api_trx_level, ib_trx_level_setting,
+  PLUGIN_VAR_OPCMDARG,
+  "InnoDB API transaction isolation level",
+  NULL, NULL,
+  0,			/* Default setting */
+  0,			/* Minimum value */
+  3, 0);		/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(api_bk_commit_interval, ib_bk_commit_interval,
+  PLUGIN_VAR_OPCMDARG,
+  "Background commit interval in seconds",
+  NULL, NULL,
+  5,			/* Default setting */
+  1,			/* Minimum value */
+  1024 * 1024 * 1024, 0);	/* Maximum value */
+
+static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
+  PLUGIN_VAR_RQCMDARG,
+  "Buffer changes to reduce random access: "
+  "OFF, ON, inserting, deleting, changing, or purging.",
+  innodb_change_buffering_validate,
+  innodb_change_buffering_update, "all");
+
+static MYSQL_SYSVAR_UINT(change_buffer_max_size,
+  innobase_change_buffer_max_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Maximum on-disk size of change buffer in terms of percentage"
+  " of the buffer pool.",
+  NULL, innodb_change_buffer_max_size_update,
+  CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0);
+
+static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
+   PLUGIN_VAR_RQCMDARG,
+  "Specifies how InnoDB index statistics collection code should "
+  "treat NULLs.
Possible values are NULLS_EQUAL (default), " + "NULLS_UNEQUAL and NULLS_IGNORED", + NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); + +static MYSQL_SYSVAR_BOOL(track_changed_pages, srv_track_changed_pages, + PLUGIN_VAR_NOCMDARG +#ifndef UNIV_DEBUG + /* Make this variable dynamic for debug builds to + provide a testcase sync facility */ + | PLUGIN_VAR_READONLY +#endif + , + "Track the redo log for changed pages and output a changed page bitmap", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONGLONG(max_bitmap_file_size, srv_max_bitmap_file_size, + PLUGIN_VAR_RQCMDARG, + "The maximum size of changed page bitmap files", + NULL, NULL, 100*1024*1024ULL, 4096ULL, ULONGLONG_MAX, 0); + +static MYSQL_SYSVAR_ULONGLONG(max_changed_pages, srv_max_changed_pages, + PLUGIN_VAR_RQCMDARG, + "The maximum number of rows for " + "INFORMATION_SCHEMA.INNODB_CHANGED_PAGES table, " + "0 - unlimited", + NULL, NULL, 1000000, 0, ~0ULL, 0); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, + PLUGIN_VAR_RQCMDARG, + "Debug flags for InnoDB change buffering (0=none, 2=crash at merge)", + NULL, NULL, 0, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(disable_background_merge, + srv_ibuf_disable_background_merge, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_RQCMDARG, + "Disable change buffering merges by the master thread", + NULL, NULL, FALSE); +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, + PLUGIN_VAR_NOCMDARG, + "Whether to use read ahead for random access within an extent.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold, + PLUGIN_VAR_RQCMDARG, + "Number of pages that must be accessed sequentially for InnoDB to " + "trigger a readahead.", + NULL, NULL, 56, 0, 64, 0); + +static MYSQL_SYSVAR_STR(monitor_enable, innobase_enable_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Turn on a monitor counter", + innodb_monitor_validate, + innodb_enable_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_disable, innobase_disable_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Turn off a monitor counter", + innodb_monitor_validate, + innodb_disable_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_reset, innobase_reset_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Reset a monitor counter", + innodb_monitor_validate, + innodb_reset_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_reset_all, innobase_reset_all_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Reset all values for a monitor counter", + innodb_monitor_validate, + innodb_reset_all_monitor_update, NULL); + +static MYSQL_SYSVAR_BOOL(status_output, srv_print_innodb_monitor, + PLUGIN_VAR_OPCMDARG, "Enable InnoDB monitor output to the error log.", + NULL, innodb_status_output_update, FALSE); + +static MYSQL_SYSVAR_BOOL(status_output_locks, srv_print_innodb_lock_monitor, + PLUGIN_VAR_OPCMDARG, "Enable InnoDB lock monitor output to the error log." + " Requires innodb_status_output=ON.", + NULL, innodb_status_output_update, FALSE); + +static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks, + PLUGIN_VAR_OPCMDARG, + "Print all deadlocks to MySQL error log (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(compression_failure_threshold_pct, + zip_failure_threshold_pct, PLUGIN_VAR_OPCMDARG, + "If the compression failure rate of a table is greater than this number" + " more padding is added to the pages to reduce the failures. 
A value of"
+ " zero implies no padding",
+ NULL, NULL, 5, 0, 100, 0);
+
+static MYSQL_SYSVAR_ULONG(compression_pad_pct_max,
+ zip_pad_max, PLUGIN_VAR_OPCMDARG,
+ "Percentage of empty space on a data page that can be reserved"
+ " to make the page compressible.",
+ NULL, NULL, 50, 0, 75, 0);
+
+static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Start InnoDB in read only mode (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable INFORMATION_SCHEMA.innodb_cmp_per_index; "
+ "may have a negative impact on performance (off by default)",
+ NULL, innodb_cmp_per_index_update, FALSE);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug,
+ PLUGIN_VAR_RQCMDARG,
+ "Debug flags for InnoDB to limit TRX_RSEG_N_SLOTS for trx_rsegf_undo_find_free()",
+ NULL, NULL, 0, 0, 1024, 0);
+
+static MYSQL_SYSVAR_UINT(limit_optimistic_insert_debug,
+ btr_cur_limit_optimistic_insert_debug, PLUGIN_VAR_RQCMDARG,
+ "Artificially limit the number of records per B-tree page (0=unlimited).",
+ NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug,
+ srv_purge_view_update_only_debug, PLUGIN_VAR_NOCMDARG,
+ "Pause the actual purging of delete-marked records and merely update the purge view. "
+ "This artificially creates the situation where the purge view has been updated "
+ "but the purges themselves have not yet been done.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(fil_make_page_dirty_debug,
+ srv_fil_make_page_dirty_debug, PLUGIN_VAR_OPCMDARG,
+ "Make the first page of the given tablespace dirty.",
+ NULL, innodb_make_page_dirty, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_ULONG(saved_page_number_debug,
+ srv_saved_page_number_debug, PLUGIN_VAR_OPCMDARG,
+ "An InnoDB page number.",
+ NULL, innodb_save_page_no, 0, 0, UINT_MAX32, 0);
+#endif /* UNIV_DEBUG */
+
+const char *corrupt_table_action_names[]=
+{
+ "assert", /* 0 */
+ "warn", /* 1 */
+ "salvage", /* 2 */
+ NullS
+};
+TYPELIB corrupt_table_action_typelib=
+{
+ array_elements(corrupt_table_action_names) - 1, "corrupt_table_action_typelib",
+ corrupt_table_action_names, NULL
+};
+static MYSQL_SYSVAR_ENUM(corrupt_table_action, srv_pass_corrupt_table,
+ PLUGIN_VAR_RQCMDARG,
+ "Report corruption of a user table as 'corrupt table' instead of crashing the server, "
+ "when used with file_per_table. "
+ "All file I/O for a datafile that has been detected as corrupt is disabled, "
+ "except for deletion.",
+ NULL, NULL, 0, &corrupt_table_action_typelib);
+
+static MYSQL_SYSVAR_BOOL(locking_fake_changes, srv_fake_changes_locks,
+ PLUGIN_VAR_NOCMDARG,
+ "###EXPERIMENTAL### if enabled, transactions will get S row locks instead "
+ "of X locks for fake changes. 
If disabled, fake change transactions will " + "not take any locks at all.", + NULL, NULL, TRUE); + +static struct st_mysql_sys_var* innobase_system_variables[]= { + MYSQL_SYSVAR(log_block_size), + MYSQL_SYSVAR(additional_mem_pool_size), + MYSQL_SYSVAR(api_trx_level), + MYSQL_SYSVAR(api_bk_commit_interval), + MYSQL_SYSVAR(autoextend_increment), + MYSQL_SYSVAR(buffer_pool_size), + MYSQL_SYSVAR(buffer_pool_populate), + MYSQL_SYSVAR(buffer_pool_instances), + MYSQL_SYSVAR(buffer_pool_filename), + MYSQL_SYSVAR(buffer_pool_dump_now), + MYSQL_SYSVAR(buffer_pool_dump_at_shutdown), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(buffer_pool_evict), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(buffer_pool_load_now), + MYSQL_SYSVAR(buffer_pool_load_abort), + MYSQL_SYSVAR(buffer_pool_load_at_startup), + MYSQL_SYSVAR(lru_scan_depth), + MYSQL_SYSVAR(flush_neighbors), + MYSQL_SYSVAR(checksum_algorithm), + MYSQL_SYSVAR(log_checksum_algorithm), + MYSQL_SYSVAR(checksums), + MYSQL_SYSVAR(commit_concurrency), + MYSQL_SYSVAR(concurrency_tickets), + MYSQL_SYSVAR(compression_level), + MYSQL_SYSVAR(kill_idle_transaction), + MYSQL_SYSVAR(data_file_path), + MYSQL_SYSVAR(data_home_dir), + MYSQL_SYSVAR(doublewrite), + MYSQL_SYSVAR(api_enable_binlog), + MYSQL_SYSVAR(api_enable_mdl), + MYSQL_SYSVAR(api_disable_rowlock), + MYSQL_SYSVAR(use_atomic_writes), + MYSQL_SYSVAR(fast_shutdown), + MYSQL_SYSVAR(file_io_threads), + MYSQL_SYSVAR(read_io_threads), + MYSQL_SYSVAR(write_io_threads), + MYSQL_SYSVAR(file_per_table), + MYSQL_SYSVAR(file_format), + MYSQL_SYSVAR(file_format_check), + MYSQL_SYSVAR(file_format_max), + MYSQL_SYSVAR(flush_log_at_timeout), + MYSQL_SYSVAR(flush_log_at_trx_commit), + MYSQL_SYSVAR(use_global_flush_log_at_trx_commit), + MYSQL_SYSVAR(flush_method), + MYSQL_SYSVAR(force_recovery), +#ifndef DBUG_OFF + MYSQL_SYSVAR(force_recovery_crash), +#endif /* !DBUG_OFF */ + MYSQL_SYSVAR(ft_cache_size), + MYSQL_SYSVAR(ft_total_cache_size), + MYSQL_SYSVAR(ft_result_cache_limit), + MYSQL_SYSVAR(ft_enable_stopword), + MYSQL_SYSVAR(ft_max_token_size), + MYSQL_SYSVAR(ft_min_token_size), + MYSQL_SYSVAR(ft_num_word_optimize), + MYSQL_SYSVAR(ft_sort_pll_degree), + MYSQL_SYSVAR(large_prefix), + MYSQL_SYSVAR(force_load_corrupted), + MYSQL_SYSVAR(locks_unsafe_for_binlog), + MYSQL_SYSVAR(lock_wait_timeout), +#ifdef UNIV_LOG_ARCHIVE + MYSQL_SYSVAR(log_arch_dir), + MYSQL_SYSVAR(log_archive), + MYSQL_SYSVAR(log_arch_expire_sec), +#endif /* UNIV_LOG_ARCHIVE */ + MYSQL_SYSVAR(page_size), + MYSQL_SYSVAR(log_buffer_size), + MYSQL_SYSVAR(log_file_size), + MYSQL_SYSVAR(log_files_in_group), + MYSQL_SYSVAR(log_group_home_dir), + MYSQL_SYSVAR(log_compressed_pages), + MYSQL_SYSVAR(max_dirty_pages_pct), + MYSQL_SYSVAR(max_dirty_pages_pct_lwm), + MYSQL_SYSVAR(adaptive_flushing_lwm), + MYSQL_SYSVAR(adaptive_flushing), + MYSQL_SYSVAR(flushing_avg_loops), + MYSQL_SYSVAR(max_purge_lag), + MYSQL_SYSVAR(max_purge_lag_delay), + MYSQL_SYSVAR(mirrored_log_groups), + MYSQL_SYSVAR(old_blocks_pct), + MYSQL_SYSVAR(old_blocks_time), + MYSQL_SYSVAR(open_files), + MYSQL_SYSVAR(optimize_fulltext_only), + MYSQL_SYSVAR(rollback_on_timeout), + MYSQL_SYSVAR(ft_aux_table), + MYSQL_SYSVAR(ft_enable_diag_print), + MYSQL_SYSVAR(ft_server_stopword_table), + MYSQL_SYSVAR(ft_user_stopword_table), + MYSQL_SYSVAR(disable_sort_file_cache), + MYSQL_SYSVAR(stats_on_metadata), + MYSQL_SYSVAR(stats_sample_pages), + MYSQL_SYSVAR(stats_transient_sample_pages), + MYSQL_SYSVAR(stats_persistent), + MYSQL_SYSVAR(stats_persistent_sample_pages), + MYSQL_SYSVAR(stats_auto_recalc), + 
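+/* Illustration: each MYSQL_SYSVAR(x) entry in this vector expands to a
+pointer to the matching mysql_sysvar_x descriptor defined above, and the
+server walks the NULL-terminated vector once when the plugin is installed.
+A minimal sketch of that walk (register_one_sysvar is a hypothetical name
+for the server-side registration step):
+
+	for (st_mysql_sys_var** v = innobase_system_variables; *v; v++) {
+		register_one_sysvar(*v);
+	}
+*/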
MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(adaptive_hash_index_partitions), + MYSQL_SYSVAR(stats_method), + MYSQL_SYSVAR(replication_delay), + MYSQL_SYSVAR(status_file), + MYSQL_SYSVAR(strict_mode), + MYSQL_SYSVAR(support_xa), + MYSQL_SYSVAR(sort_buffer_size), + MYSQL_SYSVAR(online_alter_log_max_size), + MYSQL_SYSVAR(sync_spin_loops), + MYSQL_SYSVAR(spin_wait_delay), + MYSQL_SYSVAR(table_locks), + MYSQL_SYSVAR(thread_concurrency), +#ifdef HAVE_ATOMIC_BUILTINS + MYSQL_SYSVAR(adaptive_max_sleep_delay), +#endif /* HAVE_ATOMIC_BUILTINS */ + MYSQL_SYSVAR(thread_sleep_delay), + MYSQL_SYSVAR(autoinc_lock_mode), + MYSQL_SYSVAR(show_verbose_locks), + MYSQL_SYSVAR(show_locks_held), + MYSQL_SYSVAR(version), + MYSQL_SYSVAR(use_sys_malloc), + MYSQL_SYSVAR(use_native_aio), + MYSQL_SYSVAR(change_buffering), + MYSQL_SYSVAR(change_buffer_max_size), + MYSQL_SYSVAR(track_changed_pages), + MYSQL_SYSVAR(max_bitmap_file_size), + MYSQL_SYSVAR(max_changed_pages), +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + MYSQL_SYSVAR(change_buffering_debug), + MYSQL_SYSVAR(disable_background_merge), +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + MYSQL_SYSVAR(random_read_ahead), + MYSQL_SYSVAR(read_ahead_threshold), + MYSQL_SYSVAR(read_only), + MYSQL_SYSVAR(io_capacity), + MYSQL_SYSVAR(io_capacity_max), + MYSQL_SYSVAR(monitor_enable), + MYSQL_SYSVAR(monitor_disable), + MYSQL_SYSVAR(monitor_reset), + MYSQL_SYSVAR(monitor_reset_all), + MYSQL_SYSVAR(purge_threads), + MYSQL_SYSVAR(purge_batch_size), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(purge_run_now), + MYSQL_SYSVAR(purge_stop_now), + MYSQL_SYSVAR(log_checkpoint_now), + MYSQL_SYSVAR(buf_flush_list_now), + MYSQL_SYSVAR(track_redo_log_now), +#endif /* UNIV_DEBUG */ +#ifdef UNIV_LINUX + MYSQL_SYSVAR(sched_priority_cleaner), +#endif +#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG + MYSQL_SYSVAR(page_hash_locks), + MYSQL_SYSVAR(doublewrite_batch_size), +#ifdef UNIV_LINUX + MYSQL_SYSVAR(sched_priority_purge), + MYSQL_SYSVAR(sched_priority_io), + MYSQL_SYSVAR(sched_priority_master), + MYSQL_SYSVAR(priority_purge), + MYSQL_SYSVAR(priority_io), + MYSQL_SYSVAR(priority_cleaner), + MYSQL_SYSVAR(priority_master), +#endif /* UNIV_LINUX */ + MYSQL_SYSVAR(cleaner_max_lru_time), + MYSQL_SYSVAR(cleaner_max_flush_time), + MYSQL_SYSVAR(cleaner_flush_chunk_size), + MYSQL_SYSVAR(cleaner_lru_chunk_size), + MYSQL_SYSVAR(cleaner_free_list_lwm), + MYSQL_SYSVAR(cleaner_eviction_factor), +#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ + MYSQL_SYSVAR(status_output), + MYSQL_SYSVAR(status_output_locks), + MYSQL_SYSVAR(cleaner_lsn_age_factor), + MYSQL_SYSVAR(foreground_preflush), + MYSQL_SYSVAR(empty_free_list_algorithm), + MYSQL_SYSVAR(print_all_deadlocks), + MYSQL_SYSVAR(cmp_per_index_enabled), + MYSQL_SYSVAR(undo_logs), + MYSQL_SYSVAR(rollback_segments), + MYSQL_SYSVAR(undo_directory), + MYSQL_SYSVAR(undo_tablespaces), + MYSQL_SYSVAR(sync_array_size), + MYSQL_SYSVAR(compression_failure_threshold_pct), + MYSQL_SYSVAR(compression_pad_pct_max), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(trx_rseg_n_slots_debug), + MYSQL_SYSVAR(limit_optimistic_insert_debug), + MYSQL_SYSVAR(trx_purge_view_update_only_debug), + MYSQL_SYSVAR(fil_make_page_dirty_debug), + MYSQL_SYSVAR(saved_page_number_debug), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(corrupt_table_action), + MYSQL_SYSVAR(fake_changes), + MYSQL_SYSVAR(locking_fake_changes), + NULL +}; + +mysql_declare_plugin(innobase) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &innobase_storage_engine, + innobase_hton_name, + plugin_author, + "Percona-XtraDB, Supports 
transactions, row-level locking, and foreign keys", + PLUGIN_LICENSE_GPL, + innobase_init, /* Plugin Init */ + NULL, /* Plugin Deinit */ + INNODB_VERSION_SHORT, + innodb_status_variables_export,/* status variables */ + innobase_system_variables, /* system variables */ + NULL, /* reserved */ + 0, /* flags */ +}, +i_s_xtradb_read_view, +i_s_xtradb_internal_hash_tables, +i_s_xtradb_rseg, +i_s_innodb_trx, +i_s_innodb_locks, +i_s_innodb_lock_waits, +i_s_innodb_cmp, +i_s_innodb_cmp_reset, +i_s_innodb_cmpmem, +i_s_innodb_cmpmem_reset, +i_s_innodb_cmp_per_index, +i_s_innodb_cmp_per_index_reset, +i_s_innodb_buffer_page, +i_s_innodb_buffer_page_lru, +i_s_innodb_buffer_stats, +i_s_innodb_metrics, +i_s_innodb_ft_default_stopword, +i_s_innodb_ft_deleted, +i_s_innodb_ft_being_deleted, +i_s_innodb_ft_config, +i_s_innodb_ft_index_cache, +i_s_innodb_ft_index_table, +i_s_innodb_sys_tables, +i_s_innodb_sys_tablestats, +i_s_innodb_sys_indexes, +i_s_innodb_sys_columns, +i_s_innodb_sys_fields, +i_s_innodb_sys_foreign, +i_s_innodb_sys_foreign_cols, +i_s_innodb_sys_tablespaces, +i_s_innodb_sys_datafiles, +i_s_innodb_changed_pages +mysql_declare_plugin_end; + +/** @brief Initialize the default value of innodb_commit_concurrency. + +Once InnoDB is running, the innodb_commit_concurrency must not change +from zero to nonzero. (Bug #42101) + +The initial default value is 0, and without this extra initialization, +SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter +to 0, even if it was initially set to nonzero at the command line +or configuration file. */ +static +void +innobase_commit_concurrency_init_default() +/*======================================*/ +{ + MYSQL_SYSVAR_NAME(commit_concurrency).def_val + = innobase_commit_concurrency; +} + +/** @brief Initialize the default and max value of innodb_undo_logs. + +Once InnoDB is running, the default value and the max value of +innodb_undo_logs must be equal to the available undo logs, +given by srv_available_undo_logs. 
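+
+For reference, MYSQL_SYSVAR_NAME(undo_logs) expands to the token
+mysql_sysvar_undo_logs (see MYSQL_SYSVAR_NAME in mysql/plugin.h), so the
+body below is plain member access on the static sysvar descriptor; a
+sketch of the expanded form:
+
+	mysql_sysvar_undo_logs.max_val
+		= mysql_sysvar_undo_logs.def_val
+		= static_cast<unsigned long>(srv_available_undo_logs);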
*/ +static +void +innobase_undo_logs_init_default_max() +/*=================================*/ +{ + MYSQL_SYSVAR_NAME(undo_logs).max_val + = MYSQL_SYSVAR_NAME(undo_logs).def_val + = static_cast<unsigned long>(srv_available_undo_logs); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +struct innobase_convert_name_test_t { + char* buf; + ulint buflen; + const char* id; + ulint idlen; + void* thd; + ibool file_id; + + const char* expected; +}; + +void +test_innobase_convert_name() +{ + char buf[1024]; + ulint i; + + innobase_convert_name_test_t test_input[] = { + {buf, sizeof(buf), "abcd", 4, NULL, TRUE, "\"abcd\""}, + {buf, 7, "abcd", 4, NULL, TRUE, "\"abcd\""}, + {buf, 6, "abcd", 4, NULL, TRUE, "\"abcd\""}, + {buf, 5, "abcd", 4, NULL, TRUE, "\"abc\""}, + {buf, 4, "abcd", 4, NULL, TRUE, "\"ab\""}, + + {buf, sizeof(buf), "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 9, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 8, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 7, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 6, "ab@0060cd", 9, NULL, TRUE, "\"ab`c\""}, + {buf, 5, "ab@0060cd", 9, NULL, TRUE, "\"ab`\""}, + {buf, 4, "ab@0060cd", 9, NULL, TRUE, "\"ab\""}, + + {buf, sizeof(buf), "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"cd\""}, + {buf, 17, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"cd\""}, + {buf, 16, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"c\""}, + {buf, 15, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"\""}, + {buf, 14, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\""}, + {buf, 13, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\""}, + {buf, 12, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#a\""}, + {buf, 11, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#\""}, + {buf, 10, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50\""}, + + {buf, sizeof(buf), "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""}, + {buf, 9, "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""}, + {buf, 8, "ab/cd", 5, NULL, TRUE, "\"ab\".\"c\""}, + {buf, 7, "ab/cd", 5, NULL, TRUE, "\"ab\".\"\""}, + {buf, 6, "ab/cd", 5, NULL, TRUE, "\"ab\"."}, + {buf, 5, "ab/cd", 5, NULL, TRUE, "\"ab\"."}, + {buf, 4, "ab/cd", 5, NULL, TRUE, "\"ab\""}, + {buf, 3, "ab/cd", 5, NULL, TRUE, "\"a\""}, + {buf, 2, "ab/cd", 5, NULL, TRUE, "\"\""}, + /* XXX probably "" is a better result in this case + {buf, 1, "ab/cd", 5, NULL, TRUE, "."}, + */ + {buf, 0, "ab/cd", 5, NULL, TRUE, ""}, + }; + + for (i = 0; i < sizeof(test_input) / sizeof(test_input[0]); i++) { + + char* end; + ibool ok = TRUE; + size_t res_len; + + fprintf(stderr, "TESTING %lu, %s, %lu, %s\n", + test_input[i].buflen, + test_input[i].id, + test_input[i].idlen, + test_input[i].expected); + + end = innobase_convert_name( + test_input[i].buf, + test_input[i].buflen, + test_input[i].id, + test_input[i].idlen, + test_input[i].thd, + test_input[i].file_id); + + res_len = (size_t) (end - test_input[i].buf); + + if (res_len != strlen(test_input[i].expected)) { + + fprintf(stderr, "unexpected len of the result: %u, " + "expected: %u\n", (unsigned) res_len, + (unsigned) strlen(test_input[i].expected)); + ok = FALSE; + } + + if (memcmp(test_input[i].buf, + test_input[i].expected, + strlen(test_input[i].expected)) != 0 + || !ok) { + + fprintf(stderr, "unexpected result: %.*s, " + "expected: %s\n", (int) res_len, + test_input[i].buf, + test_input[i].expected); + ok = FALSE; + } + + if (ok) { + fprintf(stderr, "OK: res: %.*s\n\n", (int) res_len, + buf); + } else { + fprintf(stderr, "FAILED\n\n"); + return; + } + } +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ + +/**************************************************************************** + * 
DS-MRR implementation + ***************************************************************************/ + +/** + * Multi Range Read interface, DS-MRR calls + */ + +int +ha_innobase::multi_range_read_init( + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint mode, + HANDLER_BUFFER* buf) +{ + return(ds_mrr.dsmrr_init(this, seq, seq_init_param, + n_ranges, mode, buf)); +} + +int +ha_innobase::multi_range_read_next( + char** range_info) +{ + return(ds_mrr.dsmrr_next(range_info)); +} + +ha_rows +ha_innobase::multi_range_read_info_const( + uint keyno, + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint* bufsz, + uint* flags, + Cost_estimate* cost) +{ + /* See comments in ha_myisam::multi_range_read_info_const */ + ds_mrr.init(this, table); + return(ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, + n_ranges, bufsz, flags, cost)); +} + +ha_rows +ha_innobase::multi_range_read_info( + uint keyno, + uint n_ranges, + uint keys, + uint* bufsz, + uint* flags, + Cost_estimate* cost) +{ + ds_mrr.init(this, table); + return(ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost)); +} + + +/** + * Index Condition Pushdown interface implementation + */ + +/*************************************************************//** +InnoDB index push-down condition check +@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */ +UNIV_INTERN +enum icp_result +innobase_index_cond( +/*================*/ + void* file) /*!< in/out: pointer to ha_innobase */ +{ + DBUG_ENTER("innobase_index_cond"); + + ha_innobase* h = reinterpret_cast<class ha_innobase*>(file); + + DBUG_ASSERT(h->pushed_idx_cond); + DBUG_ASSERT(h->pushed_idx_cond_keyno != MAX_KEY); + + if (h->end_range && h->compare_key_icp(h->end_range) > 0) { + + /* caller should return HA_ERR_END_OF_FILE already */ + DBUG_RETURN(ICP_OUT_OF_RANGE); + } + + DBUG_RETURN(h->pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH); +} + +/** Attempt to push down an index condition. +* @param[in] keyno MySQL key number +* @param[in] idx_cond Index condition to be checked +* @return Part of idx_cond which the handler will not evaluate +*/ +UNIV_INTERN +class Item* +ha_innobase::idx_cond_push( + uint keyno, + class Item* idx_cond) +{ + DBUG_ENTER("ha_innobase::idx_cond_push"); + DBUG_ASSERT(keyno != MAX_KEY); + DBUG_ASSERT(idx_cond != NULL); + + pushed_idx_cond = idx_cond; + pushed_idx_cond_keyno = keyno; + in_range_check_pushed_down = TRUE; + /* We will evaluate the condition entirely */ + DBUG_RETURN(NULL); +} + +/******************************************************************//** +Use this when the args are passed to the format string from +errmsg-utf8.txt directly as is. + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_senderrf( +/*========*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + ...) /*!< Args */ +{ + char* str; + va_list args; + const char* format = innobase_get_err_msg(code); + + /* If the caller wants to push a message to the client then + the caller must pass a valid session handle. */ + + ut_a(thd != 0); + + /* The error code must exist in the errmsg-utf8.txt file. 
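+
+A typical call site looks like this (illustrative only; the trailing
+arguments must match the printf-style format registered for the error
+code in errmsg-utf8.txt):
+
+	ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+		    ER_TABLESPACE_DISCARDED, table_name);
+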
*/ + ut_a(format != 0); + + va_start(args, code); + +#ifdef __WIN__ + int size = _vscprintf(format, args) + 1; + str = static_cast<char*>(malloc(size)); + str[size - 1] = 0x0; + vsnprintf(str, size, format, args); +#elif HAVE_VASPRINTF + int ret; + ret = vasprintf(&str, format, args); + ut_a(ret != -1); +#else + /* Use a fixed length string. */ + str = static_cast<char*>(malloc(BUFSIZ)); + my_vsnprintf(str, BUFSIZ, format, args); +#endif /* __WIN__ */ + + Sql_condition::enum_warning_level l; + + l = Sql_condition::WARN_LEVEL_NOTE; + + switch(level) { + case IB_LOG_LEVEL_INFO: + break; + case IB_LOG_LEVEL_WARN: + l = Sql_condition::WARN_LEVEL_WARN; + break; + case IB_LOG_LEVEL_ERROR: + /* We can't use push_warning_printf(), it is a hard error. */ + my_printf_error(code, "%s", MYF(0), str); + break; + case IB_LOG_LEVEL_FATAL: + l = Sql_condition::WARN_LEVEL_END; + break; + } + + if (level != IB_LOG_LEVEL_ERROR) { + push_warning_printf(thd, l, code, "InnoDB: %s", str); + } + + va_end(args); + free(str); + + if (level == IB_LOG_LEVEL_FATAL) { + ut_error; + } +} + +/******************************************************************//** +Use this when the args are first converted to a formatted string and then +passed to the format string from errmsg-utf8.txt. The error message format +must be: "Some string ... %s". + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_errf( +/*====*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + const char* format, /*!< printf format */ + ...) /*!< Args */ +{ + char* str; + va_list args; + + /* If the caller wants to push a message to the client then + the caller must pass a valid session handle. */ + + ut_a(thd != 0); + ut_a(format != 0); + + va_start(args, format); + +#ifdef __WIN__ + int size = _vscprintf(format, args) + 1; + str = static_cast<char*>(malloc(size)); + str[size - 1] = 0x0; + vsnprintf(str, size, format, args); +#elif HAVE_VASPRINTF + int ret; + ret = vasprintf(&str, format, args); + ut_a(ret != -1); +#else + /* Use a fixed length string. */ + str = static_cast<char*>(malloc(BUFSIZ)); + my_vsnprintf(str, BUFSIZ, format, args); +#endif /* __WIN__ */ + + ib_senderrf(thd, level, code, str); + + va_end(args); + free(str); +} + +/******************************************************************//** +Write a message to the MySQL log, prefixed with "InnoDB: " */ +UNIV_INTERN +void +ib_logf( +/*====*/ + ib_log_level_t level, /*!< in: warning level */ + const char* format, /*!< printf format */ + ...) /*!< Args */ +{ + char* str; + va_list args; + + va_start(args, format); + +#ifdef __WIN__ + int size = _vscprintf(format, args) + 1; + str = static_cast<char*>(malloc(size)); + str[size - 1] = 0x0; + vsnprintf(str, size, format, args); +#elif HAVE_VASPRINTF + int ret; + ret = vasprintf(&str, format, args); + ut_a(ret != -1); +#else + /* Use a fixed length string. 
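+Messages longer than BUFSIZ - 1 bytes are truncated by my_vsnprintf() on
+this fallback path, which is acceptable for log output. A typical caller
+passes a short printf-style message, e.g. (illustrative):
+
+	ib_logf(IB_LOG_LEVEL_INFO, "Waiting for purge to start");
+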
*/ + str = static_cast<char*>(malloc(BUFSIZ)); + my_vsnprintf(str, BUFSIZ, format, args); +#endif /* __WIN__ */ + + switch(level) { + case IB_LOG_LEVEL_INFO: + sql_print_information("InnoDB: %s", str); + break; + case IB_LOG_LEVEL_WARN: + sql_print_warning("InnoDB: %s", str); + break; + case IB_LOG_LEVEL_ERROR: + sql_print_error("InnoDB: %s", str); + break; + case IB_LOG_LEVEL_FATAL: + sql_print_error("InnoDB: %s", str); + break; + } + + va_end(args); + free(str); + + if (level == IB_LOG_LEVEL_FATAL) { + ut_error; + } +} + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. +@return result string length, as returned by strconvert() */ +uint +innobase_convert_to_filename_charset( +/*=================================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len) /* in: length of 'to', in bytes */ +{ + uint errors; + CHARSET_INFO* cs_to = &my_charset_filename; + CHARSET_INFO* cs_from = system_charset_info; + + return(strconvert( + cs_from, from, cs_to, to, static_cast<uint>(len), &errors)); +} + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. +@return result string length, as returned by strconvert() */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors) /* out: error return */ +{ + CHARSET_INFO* cs1 = &my_charset_filename; + CHARSET_INFO* cs2 = system_charset_info; + + return(strconvert( + cs1, from, cs2, to, static_cast<uint>(len), errors)); +} + +/********************************************************************** +Issue a warning that the row is too big. */ +void +ib_warn_row_too_big(const dict_table_t* table) +{ + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format() */ + const bool prefix = (dict_tf_get_format(table->flags) + == UNIV_FORMAT_A); + + const ulint free_space = page_get_free_space_of_empty( + table->flags & DICT_TF_COMPACT) / 2; + + THD* thd = current_thd; + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TO_BIG_ROW, + "Row size too large (> %lu). Changing some columns to TEXT" + " or BLOB %smay help. In current row format, BLOB prefix of" + " %d bytes is stored inline.", free_space + , prefix ? "or using ROW_FORMAT=DYNAMIC or" + " ROW_FORMAT=COMPRESSED ": "" + , prefix ? DICT_MAX_FIXED_COL_LEN : 0); +} diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h new file mode 100644 index 00000000000..d49a7c228f7 --- /dev/null +++ b/storage/xtradb/handler/ha_innodb.h @@ -0,0 +1,644 @@ +/***************************************************************************** + +Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/* + This file is based on ha_berkeley.h of MySQL distribution + + This file defines the Innodb handler: the interface between MySQL and + Innodb +*/ + +#include "dict0stats.h" + +/* Structure defines translation table between mysql index and innodb +index structures */ +struct innodb_idx_translate_t { + ulint index_count; /*!< number of valid index entries + in the index_mapping array */ + ulint array_size; /*!< array size of index_mapping */ + dict_index_t** index_mapping; /*!< index pointer array directly + maps to index in Innodb from MySQL + array index */ +}; + + +/** InnoDB table share */ +typedef struct st_innobase_share { + THR_LOCK lock; /*!< MySQL lock protecting + this structure */ + const char* table_name; /*!< InnoDB table name */ + uint use_count; /*!< reference count, + incremented in get_share() + and decremented in + free_share() */ + void* table_name_hash;/*!< hash table chain node */ + innodb_idx_translate_t idx_trans_tbl; /*!< index translation + table between MySQL and + Innodb */ + dict_table_t* ib_table; +} INNOBASE_SHARE; + + +/** Prebuilt structures in an InnoDB table handle used within MySQL */ +struct row_prebuilt_t; + +/** The class defining a handle to an Innodb table */ +class ha_innobase: public handler +{ + row_prebuilt_t* prebuilt; /*!< prebuilt struct in InnoDB, used + to save CPU time with prebuilt data + structures*/ + THD* user_thd; /*!< the thread handle of the user + currently using the handle; this is + set in external_lock function */ + THR_LOCK_DATA lock; + INNOBASE_SHARE* share; /*!< information for MySQL + table locking */ + + uchar* upd_buf; /*!< buffer used in updates */ + ulint upd_buf_size; /*!< the size of upd_buf in bytes */ + Table_flags int_table_flags; + uint primary_key; + ulong start_of_scan; /*!< this is set to 1 when we are + starting a table scan but have not + yet fetched any row, else 0 */ + uint last_match_mode;/* match mode of the latest search: + ROW_SEL_EXACT, ROW_SEL_EXACT_PREFIX, + or undefined */ + uint num_write_row; /*!< number of write_row() calls */ + + uint store_key_val_for_row(uint keynr, char* buff, uint buff_len, + const uchar* record); + inline void update_thd(THD* thd); + void update_thd(); + int change_active_index(uint keynr); + int general_fetch(uchar* buf, uint direction, uint match_mode); + dberr_t innobase_lock_autoinc(); + ulonglong innobase_peek_autoinc(); + dberr_t innobase_set_max_autoinc(ulonglong auto_inc); + dberr_t innobase_reset_autoinc(ulonglong auto_inc); + dberr_t innobase_get_autoinc(ulonglong* value); + void innobase_initialize_autoinc(); + dict_index_t* innobase_get_index(uint keynr); + + /* Init values for the class: */ + public: + ha_innobase(handlerton *hton, TABLE_SHARE *table_arg); + ~ha_innobase(); + /* + Get the row type from the storage engine. If this method returns + ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used. 
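+	A condensed sketch of what the implementation does: map the record
+	format of the open InnoDB table back to a MySQL row type, e.g.
+
+		switch (dict_tf_get_rec_format(prebuilt->table->flags)) {
+		case REC_FORMAT_REDUNDANT:	return(ROW_TYPE_REDUNDANT);
+		case REC_FORMAT_COMPACT:	return(ROW_TYPE_COMPACT);
+		case REC_FORMAT_COMPRESSED:	return(ROW_TYPE_COMPRESSED);
+		case REC_FORMAT_DYNAMIC:	return(ROW_TYPE_DYNAMIC);
+		}
+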
+ */ + enum row_type get_row_type() const; + + const char* table_type() const; + const char* index_type(uint key_number); + const char** bas_ext() const; + Table_flags table_flags() const; + ulong index_flags(uint idx, uint part, bool all_parts) const; + uint max_supported_keys() const; + uint max_supported_key_length() const; + uint max_supported_key_part_length() const; + const key_map* keys_to_use_for_scanning(); + + int open(const char *name, int mode, uint test_if_locked); + handler* clone(const char *name, MEM_ROOT *mem_root); + int close(void); + double scan_time(); + double read_time(uint index, uint ranges, ha_rows rows); + longlong get_memory_buffer_size() const; + my_bool is_fake_change_enabled(THD *thd); + + int write_row(uchar * buf); + int update_row(const uchar * old_data, uchar * new_data); + int delete_row(const uchar * buf); + bool was_semi_consistent_read(); + void try_semi_consistent_read(bool yes); + void unlock_row(); + + int index_init(uint index, bool sorted); + int index_end(); + int index_read(uchar * buf, const uchar * key, + uint key_len, enum ha_rkey_function find_flag); + int index_read_idx(uchar * buf, uint index, const uchar * key, + uint key_len, enum ha_rkey_function find_flag); + int index_read_last(uchar * buf, const uchar * key, uint key_len); + int index_next(uchar * buf); + int index_next_same(uchar * buf, const uchar *key, uint keylen); + int index_prev(uchar * buf); + int index_first(uchar * buf); + int index_last(uchar * buf); + + int rnd_init(bool scan); + int rnd_end(); + int rnd_next(uchar *buf); + int rnd_pos(uchar * buf, uchar *pos); + + int ft_init(); + void ft_end(); + FT_INFO *ft_init_ext(uint flags, uint inx, String* key); + int ft_read(uchar* buf); + + void position(const uchar *record); + int info(uint); + int analyze(THD* thd,HA_CHECK_OPT* check_opt); + int optimize(THD* thd,HA_CHECK_OPT* check_opt); + int discard_or_import_tablespace(my_bool discard); + int extra(enum ha_extra_function operation); + int reset(); + int external_lock(THD *thd, int lock_type); + int transactional_table_lock(THD *thd, int lock_type); + int start_stmt(THD *thd, thr_lock_type lock_type); + void position(uchar *record); + ha_rows records_in_range(uint inx, key_range *min_key, key_range + *max_key); + ha_rows estimate_rows_upper_bound(); + + void update_create_info(HA_CREATE_INFO* create_info); + int parse_table_name(const char*name, + HA_CREATE_INFO* create_info, + ulint flags, + ulint flags2, + char* norm_name, + char* temp_path, + char* remote_path); + int create(const char *name, register TABLE *form, + HA_CREATE_INFO *create_info); + int truncate(); + int delete_table(const char *name); + int rename_table(const char* from, const char* to); + int check(THD* thd, HA_CHECK_OPT* check_opt); + char* update_table_comment(const char* comment); + char* get_foreign_key_create_info(); + int get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list); + int get_parent_foreign_key_list(THD *thd, + List<FOREIGN_KEY_INFO> *f_key_list); + bool can_switch_engines(); + uint referenced_by_foreign_key(); + void free_foreign_key_create_info(char* str); + THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, + enum thr_lock_type lock_type); + void init_table_handle_for_HANDLER(); + virtual void get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values); + int reset_auto_increment(ulonglong value); + + virtual bool get_error_message(int error, String *buf); + virtual bool 
get_foreign_dup_key(char*, uint, char*, uint); + uint8 table_cache_type(); + /* + ask handler about permission to cache table during query registration + */ + my_bool register_query_cache_table(THD *thd, char *table_key, + uint key_length, + qc_engine_callback *call_back, + ulonglong *engine_data); + static const char *get_mysql_bin_log_name(); + static ulonglong get_mysql_bin_log_pos(); + bool primary_key_is_clustered(); + int cmp_ref(const uchar *ref1, const uchar *ref2); + /** On-line ALTER TABLE interface @see handler0alter.cc @{ */ + + /** Check if InnoDB supports a particular alter table in-place + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported + @retval HA_ALTER_INPLACE_NO_LOCK Supported + @retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE + Supported, but requires lock + during main phase and exclusive + lock during prepare phase. + @retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE + Supported, prepare phase + requires exclusive lock. + */ + enum_alter_inplace_result check_if_supported_inplace_alter( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info); + /** Allows InnoDB to update internal structures with concurrent + writes blocked (provided that check_if_supported_inplace_alter() + did not return HA_ALTER_INPLACE_NO_LOCK). + This will be invoked before inplace_alter_table(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval true Failure + @retval false Success + */ + bool prepare_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info); + + /** Alter the table structure in-place with operations + specified using HA_ALTER_FLAGS and Alter_inplace_information. + The level of concurrency allowed during this operation depends + on the return value from check_if_supported_inplace_alter(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval true Failure + @retval false Success + */ + bool inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info); + + /** Commit or rollback the changes made during + prepare_inplace_alter_table() and inplace_alter_table() inside + the storage engine. Note that the allowed level of concurrency + during this operation will be the same as for + inplace_alter_table() and thus might be higher than during + prepare_inplace_alter_table(). (E.g concurrent writes were + blocked during prepare, but might not be during commit). + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + @param commit true => Commit, false => Rollback. + @retval true Failure + @retval false Success + */ + bool commit_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit); + /** @} */ + bool check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes); +private: + /** Builds a 'template' to the prebuilt struct. + + The template is used in fast retrieval of just those column + values MySQL needs in its processing. 
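+
+	A condensed sketch of the per-column selection loop, following the
+	pattern in the .cc file (build_template_field() fills one entry of
+	prebuilt->mysql_template):
+
+		for (ulint i = 0; i < n_fields; i++) {
+			const Field* field = table->field[i];
+			if (whole_row
+			    || bitmap_is_set(table->read_set, i)
+			    || bitmap_is_set(table->write_set, i)) {
+				build_template_field(prebuilt, clust_index,
+						     index, table, field, i);
+			}
+		}
+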
+ @param whole_row true if access is needed to a whole row, + false if accessing individual fields is enough */ + void build_template(bool whole_row); + /** Resets a query execution 'template'. + @see build_template() */ + inline void reset_template(); + + int info_low(uint, bool); + +public: + /** @name Multi Range Read interface @{ */ + /** Initialize multi range read @see DsMrr_impl::dsmrr_init + * @param seq + * @param seq_init_param + * @param n_ranges + * @param mode + * @param buf + */ + int multi_range_read_init(RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, uint mode, + HANDLER_BUFFER* buf); + /** Process next multi range read @see DsMrr_impl::dsmrr_next + * @param range_info + */ + int multi_range_read_next(char** range_info); + /** Initialize multi range read and get information. + * @see ha_myisam::multi_range_read_info_const + * @see DsMrr_impl::dsmrr_info_const + * @param keyno + * @param seq + * @param seq_init_param + * @param n_ranges + * @param bufsz + * @param flags + * @param cost + */ + ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, uint* bufsz, + uint* flags, Cost_estimate* cost); + /** Initialize multi range read and get information. + * @see DsMrr_impl::dsmrr_info + * @param keyno + * @param seq + * @param seq_init_param + * @param n_ranges + * @param bufsz + * @param flags + * @param cost + */ + ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint* bufsz, uint* flags, + Cost_estimate* cost); + + /** Attempt to push down an index condition. + * @param[in] keyno MySQL key number + * @param[in] idx_cond Index condition to be checked + * @return idx_cond if pushed; NULL if not pushed + */ + class Item* idx_cond_push(uint keyno, class Item* idx_cond); + +private: + /** The multi range read session object */ + DsMrr_impl ds_mrr; + /* @} */ +}; + +/* Some accessor functions which the InnoDB plugin needs, but which +can not be added to mysql/plugin.h as part of the public interface; +the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */ + +#ifndef INNODB_COMPATIBILITY_HOOKS +#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS +#endif + +LEX_STRING* thd_query_string(MYSQL_THD thd); + +extern "C" { + +struct charset_info_st *thd_charset(MYSQL_THD thd); + +/** + Check if a user thread is a replication slave thread + @param thd user thread + @retval 0 the user thread is not a replication slave thread + @retval 1 the user thread is a replication slave thread +*/ +int thd_slave_thread(const MYSQL_THD thd); + +/** + Check if a user thread is running a non-transactional update + @param thd user thread + @retval 0 the user thread is not running a non-transactional update + @retval 1 the user thread is running a non-transactional update +*/ +int thd_non_transactional_update(const MYSQL_THD thd); + +/** + Get the user thread's binary logging format + @param thd user thread + @return Value to be used as index into the binlog_format_names array +*/ +int thd_binlog_format(const MYSQL_THD thd); + +/** + Mark transaction to rollback and mark error as fatal to a sub-statement. + @param thd Thread handle + @param all TRUE <=> rollback main transaction. +*/ +void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all); + +/** + Check if binary logging is filtered for thread's current db. + @param thd Thread handle + @retval 1 the query is not filtered, 0 otherwise. 
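+
+  A typical engine-side guard combines this with the row-event check
+  below (illustrative):
+
+	const bool may_binlog =
+		thd_sqlcom_can_generate_row_events(thd)
+		&& thd_binlog_filter_ok(thd);
+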
+*/
+bool thd_binlog_filter_ok(const MYSQL_THD thd);
+
+/**
+  Check if the query may generate row changes which
+  may end up in the binary log.
+  @param thd Thread handle
+  @return 1 the query may generate row changes, 0 otherwise.
+*/
+bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
+
+/**
+  Gets information on the durability property requested by
+  a thread.
+  @param thd Thread handle
+  @return a durability property.
+*/
+enum durability_properties thd_get_durability_property(const MYSQL_THD thd);
+
+/** Get the auto_increment_offset and auto_increment_increment.
+@param thd Thread object
+@param off auto_increment_offset
+@param inc auto_increment_increment */
+void thd_get_autoinc(const MYSQL_THD thd, ulong* off, ulong* inc)
+__attribute__((nonnull));
+
+/** Is strict sql_mode set.
+@param thd Thread object
+@return True if sql_mode has strict mode (all or trans), false otherwise.
+*/
+bool thd_is_strict_mode(const MYSQL_THD thd)
+__attribute__((nonnull));
+} /* extern "C" */
+
+struct trx_t;
+
+extern const struct _ft_vft ft_vft_result;
+
+/* Structure Returned by ha_innobase::ft_init_ext() */
+typedef struct new_ft_info
+{
+ struct _ft_vft *please;
+ struct _ft_vft_ext *could_you;
+ row_prebuilt_t* ft_prebuilt;
+ fts_result_t* ft_result;
+} NEW_FT_INFO;
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return InnoDB transaction handle */
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+ MYSQL_THD thd); /*!< in: user thread handle */
+
+/*********************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+UNIV_INTERN
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
+ be created. */
+ __attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Determines InnoDB table flags.
+@retval true if successful, false if error */
+UNIV_INTERN
+bool
+innobase_table_flags(
+/*=================*/
+ const TABLE* form, /*!< in: table */
+ const HA_CREATE_INFO* create_info, /*!< in: information
+ on table columns and indexes */
+ THD* thd, /*!< in: connection */
+ bool use_tablespace, /*!< in: whether to create
+ outside system tablespace */
+ ulint* flags, /*!< out: DICT_TF flags */
+ ulint* flags2) /*!< out: DICT_TF2 flags */
+ __attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Validates the create options. We may build on this function
+in the future. For now, it checks two specifiers:
+KEY_BLOCK_SIZE and ROW_FORMAT.
+If innodb_strict_mode is not set then this function is a no-op.
+@return NULL if valid, string if not. */
+UNIV_INTERN
+const char*
+create_options_are_invalid(
+/*=======================*/
+ THD* thd, /*!< in: connection thread. */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ HA_CREATE_INFO* create_info, /*!< in: create info. 
*/ + bool use_tablespace) /*!< in: srv_file_per_table */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +of prebuilt->fts_doc_id +@return the relevance ranking value */ +UNIV_INTERN +float +innobase_fts_retrieve_ranking( +/*==========================*/ + FT_INFO* fts_hdl); /*!< in: FTS handler */ + +/*********************************************************************//** +Find and Retrieve the FTS Relevance Ranking result for doc with doc_id +of prebuilt->fts_doc_id +@return the relevance ranking value */ +UNIV_INTERN +float +innobase_fts_find_ranking( +/*======================*/ + FT_INFO* fts_hdl, /*!< in: FTS handler */ + uchar* record, /*!< in: Unused */ + uint len); /*!< in: Unused */ +/*********************************************************************//** +Free the memory for the FTS handler */ +UNIV_INTERN +void +innobase_fts_close_ranking( +/*=======================*/ + FT_INFO* fts_hdl) /*!< in: FTS handler */ + __attribute__((nonnull)); +/*****************************************************************//** +Initialize the table FTS stopword list +@return TRUE if success */ +UNIV_INTERN +ibool +innobase_fts_load_stopword( +/*=======================*/ + dict_table_t* table, /*!< in: Table has the FTS */ + trx_t* trx, /*!< in: transaction */ + THD* thd) /*!< in: current thread */ + __attribute__((nonnull(1,3), warn_unused_result)); + +/** Some defines for innobase_fts_check_doc_id_index() return value */ +enum fts_doc_id_index_enum { + FTS_INCORRECT_DOC_ID_INDEX, + FTS_EXIST_DOC_ID_INDEX, + FTS_NOT_EXIST_DOC_ID_INDEX +}; + +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column. +@return the status of the FTS_DOC_ID index */ +UNIV_INTERN +enum fts_doc_id_index_enum +innobase_fts_check_doc_id_index( +/*============================*/ + const dict_table_t* table, /*!< in: table definition */ + const TABLE* altered_table, /*!< in: MySQL table + that is being altered */ + ulint* fts_doc_col_no) /*!< out: The column number for + Doc ID */ + __attribute__((warn_unused_result)); + +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column in MySQL create index definition. 
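+
+An illustrative caller pattern (ER_INNODB_FT_WRONG_DOCID_INDEX is the
+server error code for a malformed Doc ID index):
+
+	switch (innobase_fts_check_doc_id_index_in_def(n_keys, key_info)) {
+	case FTS_EXIST_DOC_ID_INDEX:
+	case FTS_NOT_EXIST_DOC_ID_INDEX:
+		break;
+	case FTS_INCORRECT_DOC_ID_INDEX:
+		my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0),
+			 FTS_DOC_ID_INDEX_NAME);
+	}
+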
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index, +FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */ +UNIV_INTERN +enum fts_doc_id_index_enum +innobase_fts_check_doc_id_index_in_def( +/*===================================*/ + ulint n_key, /*!< in: Number of keys */ + const KEY* key_info) /*!< in: Key definitions */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************** +@return version of the extended FTS API */ +uint +innobase_fts_get_version(); + +/*********************************************************************** +@return Which part of the extended FTS API is supported */ +ulonglong +innobase_fts_flags(); + +/*********************************************************************** +Find and Retrieve the FTS doc_id for the current result row +@return the document ID */ +ulonglong +innobase_fts_retrieve_docid( +/*============================*/ + FT_INFO_EXT* fts_hdl); /*!< in: FTS handler */ + +/*********************************************************************** +Find and retrieve the size of the current result +@return number of matching rows */ +ulonglong +innobase_fts_count_matches( +/*============================*/ + FT_INFO_EXT* fts_hdl); /*!< in: FTS handler */ + +/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default +system clustered index when there is no primary key. */ +extern const char innobase_index_reserve_name[]; + +/*********************************************************************//** +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +UNIV_INTERN +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const HA_CREATE_INFO* create_info); /*!< in: create info */ + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +UNIV_INTERN +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const TABLE_SHARE* table_share); /*!< in: table share */ diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc new file mode 100644 index 00000000000..d3308ebedc2 --- /dev/null +++ b/storage/xtradb/handler/handler0alter.cc @@ -0,0 +1,5959 @@ +/***************************************************************************** + +Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file handler/handler0alter.cc +Smart ALTER TABLE +*******************************************************/ + +#include <unireg.h> +#include <mysqld_error.h> +#include <log.h> +#include <debug_sync.h> +#include <mysql/innodb_priv.h> +#include <sql_alter.h> +#include <sql_class.h> + +#include "dict0crea.h" +#include "dict0dict.h" +#include "dict0priv.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "log0log.h" +#include "rem0types.h" +#include "row0log.h" +#include "row0merge.h" +#include "srv0srv.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "ha_prototypes.h" +#include "handler0alter.h" +#include "srv0mon.h" +#include "fts0priv.h" +#include "pars0pars.h" +#include "row0sel.h" +#include "ha_innodb.h" + +/** Operations for creating secondary indexes (no rebuild needed) */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ONLINE_CREATE + = Alter_inplace_info::ADD_INDEX + | Alter_inplace_info::ADD_UNIQUE_INDEX; + +/** Operations for rebuilding a table in place */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_REBUILD + = Alter_inplace_info::ADD_PK_INDEX + | Alter_inplace_info::DROP_PK_INDEX + | Alter_inplace_info::CHANGE_CREATE_OPTION + /* CHANGE_CREATE_OPTION needs to check innobase_need_rebuild() */ + | Alter_inplace_info::ALTER_COLUMN_NULLABLE + | Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE + | Alter_inplace_info::ALTER_COLUMN_ORDER + | Alter_inplace_info::DROP_COLUMN + | Alter_inplace_info::ADD_COLUMN + | Alter_inplace_info::RECREATE_TABLE + /* + | Alter_inplace_info::ALTER_COLUMN_TYPE + | Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH + */ + ; + +/** Operations that require changes to data */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_DATA + = INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD; + +/** Operations for altering a table that InnoDB does not care about */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_INPLACE_IGNORE + = Alter_inplace_info::ALTER_COLUMN_DEFAULT + | Alter_inplace_info::ALTER_COLUMN_COLUMN_FORMAT + | Alter_inplace_info::ALTER_COLUMN_STORAGE_TYPE + | Alter_inplace_info::ALTER_RENAME; + +/** Operations on foreign key definitions (changing the schema only) */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_FOREIGN_OPERATIONS + = Alter_inplace_info::DROP_FOREIGN_KEY + | Alter_inplace_info::ADD_FOREIGN_KEY; + +/** Operations that InnoDB cares about and can perform without rebuild */ +static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_NOREBUILD + = INNOBASE_ONLINE_CREATE + | INNOBASE_FOREIGN_OPERATIONS + | Alter_inplace_info::DROP_INDEX + | Alter_inplace_info::DROP_UNIQUE_INDEX + | Alter_inplace_info::ALTER_COLUMN_NAME; + +/* Report an InnoDB error to the client by invoking my_error(). 
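+Typical use after a failed DDL step (the pattern used throughout this
+file; the variable names here are illustrative):
+
+	if (error != DB_SUCCESS) {
+		my_error_innodb(error, table_name, user_table->flags);
+		DBUG_RETURN(true);
+	}
+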
*/ +static UNIV_COLD __attribute__((nonnull)) +void +my_error_innodb( +/*============*/ + dberr_t error, /*!< in: InnoDB error code */ + const char* table, /*!< in: table name */ + ulint flags) /*!< in: table flags */ +{ + switch (error) { + case DB_MISSING_HISTORY: + my_error(ER_TABLE_DEF_CHANGED, MYF(0)); + break; + case DB_RECORD_NOT_FOUND: + my_error(ER_KEY_NOT_FOUND, MYF(0), table); + break; + case DB_DEADLOCK: + my_error(ER_LOCK_DEADLOCK, MYF(0)); + break; + case DB_LOCK_WAIT_TIMEOUT: + my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0)); + break; + case DB_INTERRUPTED: + my_error(ER_QUERY_INTERRUPTED, MYF(0)); + break; + case DB_OUT_OF_MEMORY: + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + break; + case DB_OUT_OF_FILE_SPACE: + my_error(ER_RECORD_FILE_FULL, MYF(0), table); + break; + case DB_TEMP_FILE_WRITE_FAILURE: + my_error(ER_TEMP_FILE_WRITE_FAILURE, MYF(0)); + break; + case DB_TOO_BIG_INDEX_COL: + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); + break; + case DB_TOO_MANY_CONCURRENT_TRXS: + my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0)); + break; + case DB_LOCK_TABLE_FULL: + my_error(ER_LOCK_TABLE_FULL, MYF(0)); + break; + case DB_UNDO_RECORD_TOO_BIG: + my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0)); + break; + case DB_CORRUPTION: + my_error(ER_NOT_KEYFILE, MYF(0), table); + break; + case DB_TOO_BIG_RECORD: + my_error(ER_TOO_BIG_ROWSIZE, MYF(0), + page_get_free_space_of_empty( + flags & DICT_TF_COMPACT) / 2); + break; + case DB_INVALID_NULL: + /* TODO: report the row, as we do for DB_DUPLICATE_KEY */ + my_error(ER_INVALID_USE_OF_NULL, MYF(0)); + break; +#ifdef UNIV_DEBUG + case DB_SUCCESS: + case DB_DUPLICATE_KEY: + case DB_TABLESPACE_EXISTS: + case DB_ONLINE_LOG_TOO_BIG: + /* These codes should not be passed here. */ + ut_error; +#endif /* UNIV_DEBUG */ + default: + my_error(ER_GET_ERRNO, MYF(0), error); + break; + } +} + +/** Determine if fulltext indexes exist in a given table. +@param table MySQL table +@return whether fulltext indexes exist on the table */ +static +bool +innobase_fulltext_exist( +/*====================*/ + const TABLE* table) +{ + for (uint i = 0; i < table->s->keys; i++) { + if (table->key_info[i].flags & HA_FULLTEXT) { + return(true); + } + } + + return(false); +} + +/*******************************************************************//** +Determine if ALTER TABLE needs to rebuild the table. +@param ha_alter_info the DDL operation +@return whether it is necessary to rebuild the table */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_need_rebuild( +/*==================*/ + const Alter_inplace_info* ha_alter_info) +{ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION + && !(ha_alter_info->create_info->used_fields + & (HA_CREATE_USED_ROW_FORMAT + | HA_CREATE_USED_KEY_BLOCK_SIZE))) { + /* Any other CHANGE_CREATE_OPTION than changing + ROW_FORMAT or KEY_BLOCK_SIZE is ignored. */ + return(false); + } + + return(!!(ha_alter_info->handler_flags & INNOBASE_ALTER_REBUILD)); +} + +/** Check if InnoDB supports a particular alter table in-place +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. + +@retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported +@retval HA_ALTER_INPLACE_NO_LOCK Supported +@retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE Supported, but requires +lock during main phase and exclusive lock during prepare phase. 
+@retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE Supported, prepare phase +requires exclusive lock (any transactions that have accessed the table +must commit or roll back first, and no transactions can access the table +while prepare_inplace_alter_table() is executing) +*/ +UNIV_INTERN +enum_alter_inplace_result +ha_innobase::check_if_supported_inplace_alter( +/*==========================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + DBUG_ENTER("check_if_supported_inplace_alter"); + + if (srv_read_only_mode) { + ha_alter_info->unsupported_reason = + innobase_get_err_msg(ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } else if (srv_created_new_raw || srv_force_recovery) { + + ha_alter_info->unsupported_reason =(srv_force_recovery)? + innobase_get_err_msg(ER_INNODB_FORCED_RECOVERY): + innobase_get_err_msg(ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (altered_table->s->fields > REC_MAX_N_USER_FIELDS) { + /* Deny the inplace ALTER TABLE. MySQL will try to + re-create the table and ha_innobase::create() will + return an error too. This is how we effectively + deny adding too many columns to a table. */ + ha_alter_info->unsupported_reason = + innobase_get_err_msg(ER_TOO_MANY_FIELDS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + update_thd(); + trx_search_latch_release_if_reserved(prebuilt->trx); + + if (ha_alter_info->handler_flags + & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOREBUILD + | INNOBASE_ALTER_REBUILD)) { + + if (ha_alter_info->handler_flags + & (Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH + | Alter_inplace_info::ALTER_COLUMN_TYPE)) + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* Only support online add foreign key constraint when + check_foreigns is turned off */ + if ((ha_alter_info->handler_flags + & Alter_inplace_info::ADD_FOREIGN_KEY) + && prebuilt->trx->check_foreigns) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + DBUG_RETURN(HA_ALTER_INPLACE_NO_LOCK); + } + + /* Only support NULL -> NOT NULL change if strict table sql_mode + is set. Fall back to COPY for conversion if not strict tables. + In-Place will fail with an error when trying to convert + NULL to a NOT NULL value. */ + if ((ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE) + && !thd_is_strict_mode(user_thd)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* InnoDB cannot IGNORE when creating unique indexes. IGNORE + should silently delete some duplicate rows. Our inplace_alter + code will not delete anything from existing indexes. */ + if (ha_alter_info->ignore + && (ha_alter_info->handler_flags + & (Alter_inplace_info::ADD_PK_INDEX + | Alter_inplace_info::ADD_UNIQUE_INDEX))) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* DROP PRIMARY KEY is only allowed in combination with ADD + PRIMARY KEY. 
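+	Dropping the primary key without adding a new one is refused
+	in place; such a change has to go through the COPY algorithm.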
*/
+	if ((ha_alter_info->handler_flags
+	     & (Alter_inplace_info::ADD_PK_INDEX
+		| Alter_inplace_info::DROP_PK_INDEX))
+	    == Alter_inplace_info::DROP_PK_INDEX) {
+		ha_alter_info->unsupported_reason = innobase_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* If a column changes from NOT NULL to NULL and there is an
+	implicit primary key on this column, the table must be
+	rebuilt. The change can only go through the "Copy"
+	method. */
+	if ((ha_alter_info->handler_flags
+	     & Alter_inplace_info::ALTER_COLUMN_NULLABLE)) {
+		uint primary_key = altered_table->s->primary_key;
+
+		/* See if the MySQL table has no primary key but
+		InnoDB does. */
+		if (UNIV_UNLIKELY(primary_key >= MAX_KEY)
+		    && !row_table_got_default_clust_index(prebuilt->table)) {
+			ha_alter_info->unsupported_reason = innobase_get_err_msg(
+				ER_PRIMARY_CANT_HAVE_NULL);
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+	}
+
+	/* We should be able to do the operation in-place.
+	See if we can do it online (LOCK=NONE). */
+	bool	online = true;
+
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+
+	/* Fix the key parts. */
+	for (KEY* new_key = ha_alter_info->key_info_buffer;
+	     new_key < ha_alter_info->key_info_buffer
+		     + ha_alter_info->key_count;
+	     new_key++) {
+		for (KEY_PART_INFO* key_part = new_key->key_part;
+		     key_part < new_key->key_part + new_key->user_defined_key_parts;
+		     key_part++) {
+			const Create_field*	new_field;
+
+			DBUG_ASSERT(key_part->fieldnr
+				    < altered_table->s->fields);
+
+			cf_it.rewind();
+			for (uint fieldnr = 0; (new_field = cf_it++);
+			     fieldnr++) {
+				if (fieldnr == key_part->fieldnr) {
+					break;
+				}
+			}
+
+			DBUG_ASSERT(new_field);
+
+			key_part->field = altered_table->field[
+				key_part->fieldnr];
+			/* In some special cases InnoDB emits "false"
+			duplicate key errors with NULL key values. Let
+			us play safe and ensure that we can correctly
+			print key values even in such cases. */
+			key_part->null_offset = key_part->field->null_offset();
+			key_part->null_bit = key_part->field->null_bit;
+
+			if (new_field->field) {
+				/* This is an existing column. */
+				continue;
+			}
+
+			/* This is an added column. */
+			DBUG_ASSERT(ha_alter_info->handler_flags
+				    & Alter_inplace_info::ADD_COLUMN);
+
+			/* We cannot replace a hidden FTS_DOC_ID
+			with a user-visible FTS_DOC_ID. */
+			if (prebuilt->table->fts
+			    && innobase_fulltext_exist(altered_table)
+			    && !my_strcasecmp(
+				    system_charset_info,
+				    key_part->field->field_name,
+				    FTS_DOC_ID_COL_NAME)) {
+				ha_alter_info->unsupported_reason = innobase_get_err_msg(
+					ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS);
+				DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+			}
+
+			DBUG_ASSERT((MTYP_TYPENR(key_part->field->unireg_check)
+				     == Field::NEXT_NUMBER)
+				    == !!(key_part->field->flags
+					  & AUTO_INCREMENT_FLAG));
+
+			if (key_part->field->flags & AUTO_INCREMENT_FLAG) {
+				/* We cannot assign values to an
+				AUTO_INCREMENT column during online
+				ALTER. */
+				DBUG_ASSERT(key_part->field == altered_table
+					    ->found_next_number_field);
+				ha_alter_info->unsupported_reason = innobase_get_err_msg(
+					ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC);
+				online = false;
+			}
+		}
+	}
+
+	DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col
+		    <= table->s->fields);
+	DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col
+		    < dict_table_get_n_user_cols(prebuilt->table));
+
+	if (prebuilt->table->fts
+	    && innobase_fulltext_exist(altered_table)) {
+		/* FULLTEXT indexes are supposed to remain.
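+		Therefore the FTS_DOC_ID column and the FTS_DOC_ID_INDEX
+		that support them must not be dropped or renamed; this
+		is enforced below.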
*/ + /* Disallow DROP INDEX FTS_DOC_ID_INDEX */ + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + if (!my_strcasecmp( + system_charset_info, + ha_alter_info->index_drop_buffer[i]->name, + FTS_DOC_ID_INDEX_NAME)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + + /* InnoDB can have a hidden FTS_DOC_ID_INDEX on a + visible FTS_DOC_ID column as well. Prevent dropping or + renaming the FTS_DOC_ID. */ + + for (Field** fp = table->field; *fp; fp++) { + if (!((*fp)->flags + & (FIELD_IS_RENAMED | FIELD_IS_DROPPED))) { + continue; + } + + if (!my_strcasecmp( + system_charset_info, + (*fp)->field_name, + FTS_DOC_ID_COL_NAME)) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + } + + prebuilt->trx->will_lock++; + + if (!online) { + /* We already determined that only a non-locking + operation is possible. */ + } else if (((ha_alter_info->handler_flags + & Alter_inplace_info::ADD_PK_INDEX) + || innobase_need_rebuild(ha_alter_info)) + && (innobase_fulltext_exist(altered_table) + || (prebuilt->table->flags2 + & DICT_TF2_FTS_HAS_DOC_ID))) { + /* Refuse to rebuild the table online, if + fulltext indexes are to survive the rebuild, + or if the table contains a hidden FTS_DOC_ID column. */ + online = false; + /* If the table already contains fulltext indexes, + refuse to rebuild the table natively altogether. */ + if (prebuilt->table->fts) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } else if ((ha_alter_info->handler_flags + & Alter_inplace_info::ADD_INDEX)) { + /* Building a full-text index requires a lock. + We could do without a lock if the table already contains + an FTS_DOC_ID column, but in that case we would have + to apply the modification log to the full-text indexes. */ + + for (uint i = 0; i < ha_alter_info->index_add_count; i++) { + const KEY* key = + &ha_alter_info->key_info_buffer[ + ha_alter_info->index_add_buffer[i]]; + if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_GENERATED_KEY + | HA_BINARY_PACK_KEY))); + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + online = false; + break; + } + } + } + + DBUG_RETURN(online + ? 
HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE
+		    : HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
+}
+
+/*************************************************************//**
+Initialize the dict_foreign_t structure with supplied info
+@return true if added, false if duplicate foreign->id */
+static __attribute__((nonnull(1,3,5,7)))
+bool
+innobase_init_foreign(
+/*==================*/
+	dict_foreign_t*	foreign,		/*!< in/out: structure to
+						initialize */
+	char*		constraint_name,	/*!< in/out: constraint name if
+						exists */
+	dict_table_t*	table,			/*!< in: foreign table */
+	dict_index_t*	index,			/*!< in: foreign key index */
+	const char**	column_names,		/*!< in: foreign key column
+						names */
+	ulint		num_field,		/*!< in: number of columns */
+	const char*	referenced_table_name,	/*!< in: referenced table
+						name */
+	dict_table_t*	referenced_table,	/*!< in: referenced table */
+	dict_index_t*	referenced_index,	/*!< in: referenced index */
+	const char**	referenced_column_names,/*!< in: referenced column
+						names */
+	ulint		referenced_num_field)	/*!< in: number of referenced
+						columns */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	if (constraint_name) {
+		ulint	db_len;
+
+		/* Concatenate 'databasename/' to the constraint name
+		specified by the user: we conceive the constraint as
+		belonging to the same MySQL 'database' as the table
+		itself. We store the name in foreign->id. */
+
+		db_len = dict_get_db_name_len(table->name);
+
+		foreign->id = static_cast<char*>(mem_heap_alloc(
+			foreign->heap, db_len + strlen(constraint_name) + 2));
+
+		ut_memcpy(foreign->id, table->name, db_len);
+		foreign->id[db_len] = '/';
+		strcpy(foreign->id + db_len + 1, constraint_name);
+
+		/* Check if any existing foreign key has the same id;
+		this is needed only if the user supplies the
+		constraint name. */
+
+		if (table->foreign_set.find(foreign)
+		    != table->foreign_set.end()) {
+			return(false);
+		}
+	}
+
+	foreign->foreign_table = table;
+	foreign->foreign_table_name = mem_heap_strdup(
+		foreign->heap, table->name);
+	dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+	foreign->foreign_index = index;
+	foreign->n_fields = (unsigned int) num_field;
+
+	foreign->foreign_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap, num_field * sizeof(void*)));
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		foreign->foreign_col_names[i] = mem_heap_strdup(
+			foreign->heap, column_names[i]);
+	}
+
+	foreign->referenced_index = referenced_index;
+	foreign->referenced_table = referenced_table;
+
+	foreign->referenced_table_name = mem_heap_strdup(
+		foreign->heap, referenced_table_name);
+	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+	foreign->referenced_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap,
+			       referenced_num_field * sizeof(void*)));
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		foreign->referenced_col_names[i]
+			= mem_heap_strdup(foreign->heap,
+					  referenced_column_names[i]);
+	}
+
+	return(true);
+}
+
+/*************************************************************//**
+Check whether the foreign key options are legitimate
+@return true if they are */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_check_fk_option(
+/*=====================*/
+	const dict_foreign_t*	foreign)	/*!< in: foreign key */
+{
+	if (!foreign->foreign_index) {
+		return(true);
+	}
+
+	if (foreign->type & (DICT_FOREIGN_ON_UPDATE_SET_NULL
+			     | DICT_FOREIGN_ON_DELETE_SET_NULL)) {
+
+		for (ulint j = 0; j < foreign->n_fields; j++) {
+			if ((dict_index_get_nth_col(
+				     foreign->foreign_index,
j)->prtype) + & DATA_NOT_NULL) { + + /* It is not sensible to define + SET NULL if the column is not + allowed to be NULL! */ + return(false); + } + } + } + + return(true); +} + +/*************************************************************//** +Set foreign key options +@return true if successfully set */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_set_foreign_key_option( +/*============================*/ + dict_foreign_t* foreign, /*!< in:InnoDB Foreign key */ + Foreign_key* fk_key) /*!< in: Foreign key info from + MySQL */ +{ + ut_ad(!foreign->type); + + switch (fk_key->delete_opt) { + case Foreign_key::FK_OPTION_NO_ACTION: + case Foreign_key::FK_OPTION_RESTRICT: + case Foreign_key::FK_OPTION_DEFAULT: + foreign->type = DICT_FOREIGN_ON_DELETE_NO_ACTION; + break; + case Foreign_key::FK_OPTION_CASCADE: + foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE; + break; + case Foreign_key::FK_OPTION_SET_NULL: + foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL; + break; + } + + switch (fk_key->update_opt) { + case Foreign_key::FK_OPTION_NO_ACTION: + case Foreign_key::FK_OPTION_RESTRICT: + case Foreign_key::FK_OPTION_DEFAULT: + foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION; + break; + case Foreign_key::FK_OPTION_CASCADE: + foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE; + break; + case Foreign_key::FK_OPTION_SET_NULL: + foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL; + break; + } + + return(innobase_check_fk_option(foreign)); +} + +/*******************************************************************//** +Check if a foreign key constraint can make use of an index +that is being created. +@return useable index, or NULL if none found */ +static __attribute__((nonnull, warn_unused_result)) +const KEY* +innobase_find_equiv_index( +/*======================*/ + const char*const* col_names, + /*!< in: column names */ + uint n_cols, /*!< in: number of columns */ + const KEY* keys, /*!< in: index information */ + const uint* add, /*!< in: indexes being created */ + uint n_add) /*!< in: number of indexes to create */ +{ + for (uint i = 0; i < n_add; i++) { + const KEY* key = &keys[add[i]]; + + if (key->user_defined_key_parts < n_cols) { +no_match: + continue; + } + + for (uint j = 0; j < n_cols; j++) { + const KEY_PART_INFO& key_part = key->key_part[j]; + uint32 col_len + = key_part.field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes + length field for a true VARCHAR. */ + + if (key_part.field->type() == MYSQL_TYPE_VARCHAR) { + col_len -= static_cast<const Field_varstring*>( + key_part.field)->length_bytes; + } + + if (key_part.length < col_len) { + + /* Column prefix indexes cannot be + used for FOREIGN KEY constraints. 
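+				Referential checks compare complete
+				column values, so the index must cover
+				each column in full.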
*/ + goto no_match; + } + + if (innobase_strcasecmp(col_names[j], + key_part.field->field_name)) { + /* Name mismatch */ + goto no_match; + } + } + + return(key); + } + + return(NULL); +} + +/*************************************************************//** +Find an index whose first fields are the columns in the array +in the same order and is not marked for deletion +@return matching index, NULL if not found */ +static __attribute__((nonnull(1,2,6), warn_unused_result)) +dict_index_t* +innobase_find_fk_index( +/*===================*/ + Alter_inplace_info* ha_alter_info, + /*!< in: alter table info */ + dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + dict_index_t** drop_index, + /*!< in: indexes to be dropped */ + ulint n_drop_index, + /*!< in: size of drop_index[] */ + const char** columns,/*!< in: array of column names */ + ulint n_cols) /*!< in: number of columns */ +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (!(index->type & DICT_FTS) + && dict_foreign_qualify_index( + table, col_names, columns, n_cols, + index, NULL, true, 0)) { + for (ulint i = 0; i < n_drop_index; i++) { + if (index == drop_index[i]) { + /* Skip to-be-dropped indexes. */ + goto next_rec; + } + } + + return(index); + } + +next_rec: + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/*************************************************************//** +Create InnoDB foreign key structure from MySQL alter_info +@retval true if successful +@retval false on error (will call my_error()) */ +static __attribute__((nonnull(1,2,3,7,8), warn_unused_result)) +bool +innobase_get_foreign_key_info( +/*==========================*/ + Alter_inplace_info* + ha_alter_info, /*!< in: alter table info */ + const TABLE_SHARE* + table_share, /*!< in: the TABLE_SHARE */ + dict_table_t* table, /*!< in: table */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + dict_index_t** drop_index, /*!< in: indexes to be dropped */ + ulint n_drop_index, /*!< in: size of drop_index[] */ + dict_foreign_t**add_fk, /*!< out: foreign constraint added */ + ulint* n_add_fk, /*!< out: number of foreign + constraints added */ + const trx_t* trx) /*!< in: user transaction */ +{ + Key* key; + Foreign_key* fk_key; + dict_table_t* referenced_table = NULL; + char* referenced_table_name = NULL; + ulint num_fk = 0; + Alter_info* alter_info = ha_alter_info->alter_info; + + *n_add_fk = 0; + + List_iterator<Key> key_iterator(alter_info->key_list); + + while ((key=key_iterator++)) { + if (key->type != Key::FOREIGN_KEY) { + continue; + } + + const char* column_names[MAX_NUM_FK_COLUMNS]; + dict_index_t* index = NULL; + const char* referenced_column_names[MAX_NUM_FK_COLUMNS]; + dict_index_t* referenced_index = NULL; + ulint num_col = 0; + ulint referenced_num_col = 0; + bool correct_option; + char* db_namep = NULL; + char* tbl_namep = NULL; + ulint db_name_len = 0; + ulint tbl_name_len = 0; +#ifdef __WIN__ + char db_name[MAX_DATABASE_NAME_LEN]; + char tbl_name[MAX_TABLE_NAME_LEN]; +#endif + + fk_key = static_cast<Foreign_key*>(key); + + if (fk_key->columns.elements > 0) { + ulint i = 0; + Key_part_spec* column; + List_iterator<Key_part_spec> key_part_iterator( + fk_key->columns); + + /* Get all the foreign key column info for the + current table */ + while ((column = key_part_iterator++)) { + column_names[i] = column->field_name.str; + ut_ad(i < MAX_NUM_FK_COLUMNS); + i++; + } + + 
index = innobase_find_fk_index(
+				ha_alter_info,
+				table, col_names,
+				drop_index, n_drop_index,
+				column_names, i);
+
+			/* MySQL would add an index to the creation
+			list if there were no such index on the
+			foreign table, so we have to use
+			DBUG_EXECUTE_IF to simulate that scenario. */
+			DBUG_EXECUTE_IF("innodb_test_no_foreign_idx",
+					index = NULL;);
+
+			/* Check whether such an index exists in the
+			index create clause. */
+			if (!index && !innobase_find_equiv_index(
+				    column_names, static_cast<uint>(i),
+				    ha_alter_info->key_info_buffer,
+				    ha_alter_info->index_add_buffer,
+				    ha_alter_info->index_add_count)) {
+				my_error(
+					ER_FK_NO_INDEX_CHILD,
+					MYF(0),
+					fk_key->name.str
+					? fk_key->name.str : "",
+					table_share->table_name.str);
+				goto err_exit;
+			}
+
+			num_col = i;
+		}
+
+		add_fk[num_fk] = dict_mem_foreign_create();
+
+#ifndef __WIN__
+		tbl_namep = fk_key->ref_table.str;
+		tbl_name_len = fk_key->ref_table.length;
+		db_namep = fk_key->ref_db.str;
+		db_name_len = fk_key->ref_db.length;
+#else
+		ut_ad(fk_key->ref_table.str);
+
+		memcpy(tbl_name, fk_key->ref_table.str,
+		       fk_key->ref_table.length);
+		tbl_name[fk_key->ref_table.length] = 0;
+		innobase_casedn_str(tbl_name);
+		tbl_name_len = strlen(tbl_name);
+		tbl_namep = &tbl_name[0];
+
+		if (fk_key->ref_db.str != NULL) {
+			memcpy(db_name, fk_key->ref_db.str,
+			       fk_key->ref_db.length);
+			db_name[fk_key->ref_db.length] = 0;
+			innobase_casedn_str(db_name);
+			db_name_len = strlen(db_name);
+			db_namep = &db_name[0];
+		}
+#endif
+		mutex_enter(&dict_sys->mutex);
+
+		referenced_table_name = dict_get_referenced_table(
+			table->name,
+			db_namep,
+			db_name_len,
+			tbl_namep,
+			tbl_name_len,
+			&referenced_table,
+			add_fk[num_fk]->heap);
+
+		/* Test the case where the referenced table fails to
+		open; if trx->check_foreigns is not set, we should
+		still be able to add the foreign key. */
+		DBUG_EXECUTE_IF("innodb_test_open_ref_fail",
+				referenced_table = NULL;);
+
+		if (!referenced_table && trx->check_foreigns) {
+			mutex_exit(&dict_sys->mutex);
+			my_error(ER_FK_CANNOT_OPEN_PARENT,
+				 MYF(0), tbl_namep);
+
+			goto err_exit;
+		}
+
+		if (fk_key->ref_columns.elements > 0) {
+			ulint	i = 0;
+			Key_part_spec* column;
+			List_iterator<Key_part_spec> key_part_iterator(
+				fk_key->ref_columns);
+
+			while ((column = key_part_iterator++)) {
+				referenced_column_names[i] =
+					column->field_name.str;
+				ut_ad(i < MAX_NUM_FK_COLUMNS);
+				i++;
+			}
+
+			if (referenced_table) {
+				referenced_index =
+					dict_foreign_find_index(
+						referenced_table, 0,
+						referenced_column_names,
+						i, index,
+						TRUE, FALSE);
+
+				DBUG_EXECUTE_IF(
+					"innodb_test_no_reference_idx",
+					referenced_index = NULL;);
+
+				/* Check whether such an index exists
+				in the index create clause. */
+				if (!referenced_index) {
+					mutex_exit(&dict_sys->mutex);
+					my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
+						 fk_key->name.str
+						 ?
fk_key->name.str : "", + tbl_namep); + goto err_exit; + } + } else { + ut_a(!trx->check_foreigns); + } + + referenced_num_col = i; + } else { + /* Not possible to add a foreign key without a + referenced column */ + mutex_exit(&dict_sys->mutex); + my_error(ER_CANNOT_ADD_FOREIGN, MYF(0), tbl_namep); + goto err_exit; + } + + if (!innobase_init_foreign( + add_fk[num_fk], fk_key->name.str, + table, index, column_names, + num_col, referenced_table_name, + referenced_table, referenced_index, + referenced_column_names, referenced_num_col)) { + mutex_exit(&dict_sys->mutex); + my_error( + ER_FK_DUP_NAME, + MYF(0), + add_fk[num_fk]->id); + goto err_exit; + } + + mutex_exit(&dict_sys->mutex); + + correct_option = innobase_set_foreign_key_option( + add_fk[num_fk], fk_key); + + DBUG_EXECUTE_IF("innodb_test_wrong_fk_option", + correct_option = false;); + + if (!correct_option) { + my_error(ER_FK_INCORRECT_OPTION, + MYF(0), + table_share->table_name.str, + add_fk[num_fk]->id); + goto err_exit; + } + + num_fk++; + } + + *n_add_fk = num_fk; + + return(true); +err_exit: + for (ulint i = 0; i <= num_fk; i++) { + if (add_fk[i]) { + dict_foreign_free(add_fk[i]); + } + } + + return(false); +} + +/*************************************************************//** +Copies an InnoDB column to a MySQL field. This function is +adapted from row_sel_field_store_in_mysql_format(). */ +static +void +innobase_col_to_mysql( +/*==================*/ + const dict_col_t* col, /*!< in: InnoDB column */ + const uchar* data, /*!< in: InnoDB column data */ + ulint len, /*!< in: length of data, in bytes */ + Field* field) /*!< in/out: MySQL field */ +{ + uchar* ptr; + uchar* dest = field->ptr; + ulint flen = field->pack_length(); + + switch (col->mtype) { + case DATA_INT: + ut_ad(len == flen); + + /* Convert integer data from Innobase to little-endian + format, sign bit restored to normal */ + + for (ptr = dest + len; ptr != dest; ) { + *--ptr = *data++; + } + + if (!(field->flags & UNSIGNED_FLAG)) { + ((byte*) dest)[len - 1] ^= 0x80; + } + + break; + + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field->reset(); + + if (field->type() == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, flen - field->key_length()); + } + + /* Copy the actual data */ + memcpy(dest, data, len); + break; + + case DATA_BLOB: + /* Skip MySQL BLOBs when reporting an erroneous row + during index creation or table rebuild. */ + field->set_null(); + break; + +#ifdef UNIV_DEBUG + case DATA_MYSQL: + ut_ad(flen >= len); + ut_ad(DATA_MBMAXLEN(col->mbminmaxlen) + >= DATA_MBMINLEN(col->mbminmaxlen)); + memcpy(dest, data, len); + break; + + default: + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Above are the valid column types for MySQL data. */ + ut_ad(flen == len); + /* fall through */ + case DATA_FIXBINARY: + case DATA_CHAR: + /* We may have flen > len when there is a shorter + prefix on the CHAR and BINARY column. */ + ut_ad(flen >= len); +#else /* UNIV_DEBUG */ + default: +#endif /* UNIV_DEBUG */ + memcpy(dest, data, len); + } +} + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. 
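+This is used to report an erroneous row when creating an index or
+rebuilding a table, for example on a duplicate key error.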
*/ +UNIV_INTERN +void +innobase_rec_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ +{ + uint n_fields = table->s->fields; + + ut_ad(n_fields == dict_table_get_n_user_cols(index->table) + - !!(DICT_TF2_FLAG_IS_SET(index->table, + DICT_TF2_FTS_HAS_DOC_ID))); + + for (uint i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + ulint ipos; + ulint ilen; + const uchar* ifield; + + field->reset(); + + ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE); + + if (ipos == ULINT_UNDEFINED + || rec_offs_nth_extern(offsets, ipos)) { +null_field: + field->set_null(); + continue; + } + + ifield = rec_get_nth_field(rec, offsets, ipos, &ilen); + + /* Assign the NULL flag */ + if (ilen == UNIV_SQL_NULL) { + ut_ad(field->real_maybe_null()); + goto null_field; + } + + field->set_notnull(); + + innobase_col_to_mysql( + dict_field_get_col( + dict_index_get_nth_field(index, ipos)), + ifield, ilen, field); + } +} + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. */ +UNIV_INTERN +void +innobase_fields_to_mysql( +/*=====================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_index_t* index, /*!< in: InnoDB index */ + const dfield_t* fields) /*!< in: InnoDB index fields */ +{ + uint n_fields = table->s->fields; + + ut_ad(n_fields == dict_table_get_n_user_cols(index->table) + - !!(DICT_TF2_FLAG_IS_SET(index->table, + DICT_TF2_FTS_HAS_DOC_ID))); + + for (uint i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + ulint ipos; + + field->reset(); + + ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE); + + if (ipos == ULINT_UNDEFINED + || dfield_is_ext(&fields[ipos]) + || dfield_is_null(&fields[ipos])) { + + field->set_null(); + } else { + field->set_notnull(); + + const dfield_t* df = &fields[ipos]; + + innobase_col_to_mysql( + dict_field_get_col( + dict_index_get_nth_field(index, ipos)), + static_cast<const uchar*>(dfield_get_data(df)), + dfield_get_len(df), field); + } + } +} + +/*************************************************************//** +Copies an InnoDB row to table->record[0]. */ +UNIV_INTERN +void +innobase_row_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_table_t* itab, /*!< in: InnoDB table */ + const dtuple_t* row) /*!< in: InnoDB row */ +{ + uint n_fields = table->s->fields; + + /* The InnoDB row may contain an extra FTS_DOC_ID column at the end. */ + ut_ad(row->n_fields == dict_table_get_n_cols(itab)); + ut_ad(n_fields == row->n_fields - DATA_N_SYS_COLS + - !!(DICT_TF2_FLAG_IS_SET(itab, DICT_TF2_FTS_HAS_DOC_ID))); + + for (uint i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + const dfield_t* df = dtuple_get_nth_field(row, i); + + field->reset(); + + if (dfield_is_ext(df) || dfield_is_null(df)) { + field->set_null(); + } else { + field->set_notnull(); + + innobase_col_to_mysql( + dict_table_get_nth_col(itab, i), + static_cast<const uchar*>(dfield_get_data(df)), + dfield_get_len(df), field); + } + } +} + +/*************************************************************//** +Resets table->record[0]. 
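+Each field is restored to its default value.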
*/ +UNIV_INTERN +void +innobase_rec_reset( +/*===============*/ + TABLE* table) /*!< in/out: MySQL table */ +{ + uint n_fields = table->s->fields; + uint i; + + for (i = 0; i < n_fields; i++) { + table->field[i]->set_default(); + } +} + +/*******************************************************************//** +This function checks that index keys are sensible. +@return 0 or error number */ +static __attribute__((nonnull, warn_unused_result)) +int +innobase_check_index_keys( +/*======================*/ + const Alter_inplace_info* info, + /*!< in: indexes to be created or dropped */ + const dict_table_t* innodb_table) + /*!< in: Existing indexes */ +{ + for (uint key_num = 0; key_num < info->index_add_count; + key_num++) { + const KEY& key = info->key_info_buffer[ + info->index_add_buffer[key_num]]; + + /* Check that the same index name does not appear + twice in indexes to be created. */ + + for (ulint i = 0; i < key_num; i++) { + const KEY& key2 = info->key_info_buffer[ + info->index_add_buffer[i]]; + + if (0 == strcmp(key.name, key2.name)) { + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + key.name); + + return(ER_WRONG_NAME_FOR_INDEX); + } + } + + /* Check that the same index name does not already exist. */ + + const dict_index_t* index; + + for (index = dict_table_get_first_index(innodb_table); + index; index = dict_table_get_next_index(index)) { + + if (!strcmp(key.name, index->name)) { + break; + } + } + + if (index) { + /* If a key by the same name is being created and + dropped, the name clash is OK. */ + for (uint i = 0; i < info->index_drop_count; + i++) { + const KEY* drop_key + = info->index_drop_buffer[i]; + + if (0 == strcmp(key.name, drop_key->name)) { + goto name_ok; + } + } + + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key.name); + + return(ER_WRONG_NAME_FOR_INDEX); + } + +name_ok: + for (ulint i = 0; i < key.user_defined_key_parts; i++) { + const KEY_PART_INFO& key_part1 + = key.key_part[i]; + const Field* field + = key_part1.field; + ibool is_unsigned; + + switch (get_innobase_type_from_mysql_type( + &is_unsigned, field)) { + default: + break; + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Check that MySQL does not try to + create a column prefix index field on + an inappropriate data type. */ + + if (field->type() == MYSQL_TYPE_VARCHAR) { + if (key_part1.length + >= field->pack_length() + - ((Field_varstring*) field) + ->length_bytes) { + break; + } + } else { + if (key_part1.length + >= field->pack_length()) { + break; + } + } + + my_error(ER_WRONG_KEY_COLUMN, MYF(0), + field->field_name); + return(ER_WRONG_KEY_COLUMN); + } + + /* Check that the same column does not appear + twice in the index. 
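+			A duplicated column is rejected with
+			ER_WRONG_KEY_COLUMN.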
*/ + + for (ulint j = 0; j < i; j++) { + const KEY_PART_INFO& key_part2 + = key.key_part[j]; + + if (key_part1.fieldnr != key_part2.fieldnr) { + continue; + } + + my_error(ER_WRONG_KEY_COLUMN, MYF(0), + field->field_name); + return(ER_WRONG_KEY_COLUMN); + } + } + } + + return(0); +} + +/*******************************************************************//** +Create index field definition for key part */ +static __attribute__((nonnull(2,3))) +void +innobase_create_index_field_def( +/*============================*/ + const TABLE* altered_table, /*!< in: MySQL table that is + being altered, or NULL + if a new clustered index is + not being created */ + const KEY_PART_INFO* key_part, /*!< in: MySQL key definition */ + index_field_t* index_field) /*!< out: index field + definition for key_part */ +{ + const Field* field; + ibool is_unsigned; + ulint col_type; + + DBUG_ENTER("innobase_create_index_field_def"); + + ut_ad(key_part); + ut_ad(index_field); + + field = altered_table + ? altered_table->field[key_part->fieldnr] + : key_part->field; + ut_a(field); + + index_field->col_no = key_part->fieldnr; + + col_type = get_innobase_type_from_mysql_type(&is_unsigned, field); + + if (DATA_BLOB == col_type + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*) field)->length_bytes)) { + + index_field->prefix_len = key_part->length; + } else { + index_field->prefix_len = 0; + } + + DBUG_VOID_RETURN; +} + +/*******************************************************************//** +Create index definition for key */ +static __attribute__((nonnull)) +void +innobase_create_index_def( +/*======================*/ + const TABLE* altered_table, /*!< in: MySQL table that is + being altered */ + const KEY* keys, /*!< in: key definitions */ + ulint key_number, /*!< in: MySQL key number */ + bool new_clustered, /*!< in: true if generating + a new clustered index + on the table */ + bool key_clustered, /*!< in: true if this is + the new clustered index */ + index_def_t* index, /*!< out: index definition */ + mem_heap_t* heap) /*!< in: heap where memory + is allocated */ +{ + const KEY* key = &keys[key_number]; + ulint i; + ulint len; + ulint n_fields = key->user_defined_key_parts; + char* index_name; + + DBUG_ENTER("innobase_create_index_def"); + DBUG_ASSERT(!key_clustered || new_clustered); + + index->fields = static_cast<index_field_t*>( + mem_heap_alloc(heap, n_fields * sizeof *index->fields)); + + index->ind_type = 0; + index->key_number = key_number; + index->n_fields = n_fields; + len = strlen(key->name) + 1; + index->name = index_name = static_cast<char*>( + mem_heap_alloc(heap, len + !new_clustered)); + + if (!new_clustered) { + *index_name++ = TEMP_INDEX_PREFIX; + } + + memcpy(index_name, key->name, len); + + if (key->flags & HA_NOSAME) { + index->ind_type |= DICT_UNIQUE; + } + + if (key_clustered) { + DBUG_ASSERT(!(key->flags & HA_FULLTEXT)); + index->ind_type |= DICT_CLUSTERED; + } else if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_BINARY_PACK_KEY))); + DBUG_ASSERT(!(key->flags & HA_NOSAME)); + DBUG_ASSERT(!index->ind_type); + index->ind_type |= DICT_FTS; + } + + if (!new_clustered) { + altered_table = NULL; + } + + for (i = 0; i < n_fields; i++) { + innobase_create_index_field_def( + altered_table, &key->key_part[i], &index->fields[i]); + } + + DBUG_VOID_RETURN; +} + 
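+/* What follows is an illustration only, not part of this file's
+interface: a minimal, self-contained sketch of the prefix-length
+rule that innobase_create_index_field_def() applies above. The
+parameters are hypothetical stand-ins for the MySQL Field
+attributes of the same names (pack_length() and the length_bytes
+of a true VARCHAR). */
+static __attribute__((unused))
+ulint
+example_index_prefix_len(
+/*=====================*/
+	bool	is_blob,	/*!< in: column maps to DATA_BLOB */
+	bool	is_varchar,	/*!< in: column is a true VARCHAR */
+	ulint	pack_length,	/*!< in: bytes MySQL uses for the value */
+	ulint	length_bytes,	/*!< in: 1 or 2 VARCHAR length bytes */
+	ulint	key_part_len)	/*!< in: bytes covered by the key part */
+{
+	/* For a true VARCHAR, the usable data length excludes the
+	leading length bytes. */
+	ulint	full_len = is_varchar
+		? pack_length - length_bytes
+		: pack_length;
+
+	/* A BLOB key part is always a prefix; any other key part is
+	a prefix when it is shorter than the full column value. */
+	if (is_blob || key_part_len < full_len) {
+		return(key_part_len);
+	}
+
+	/* 0 means that the full column value is indexed. For
+	example, a 10-byte key part on a VARCHAR(20) with one length
+	byte (pack_length 21) yields 10, while 20 bytes yields 0. */
+	return(0);
+}
+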
+/*******************************************************************//** +Check whether the table has the FTS_DOC_ID column +@return whether there exists an FTS_DOC_ID column */ +static +bool +innobase_fts_check_doc_id_col( +/*==========================*/ + const dict_table_t* table, /*!< in: InnoDB table with + fulltext index */ + const TABLE* altered_table, + /*!< in: MySQL table with + fulltext index */ + ulint* fts_doc_col_no) + /*!< out: The column number for + Doc ID, or ULINT_UNDEFINED + if it is of wrong type */ +{ + *fts_doc_col_no = ULINT_UNDEFINED; + + const uint n_cols = altered_table->s->fields; + uint i; + + for (i = 0; i < n_cols; i++) { + const Field* field = altered_table->field[i]; + + if (my_strcasecmp(system_charset_info, + field->field_name, FTS_DOC_ID_COL_NAME)) { + continue; + } + + if (strcmp(field->field_name, FTS_DOC_ID_COL_NAME)) { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name); + } else if (field->type() != MYSQL_TYPE_LONGLONG + || field->pack_length() != 8 + || field->real_maybe_null() + || !(field->flags & UNSIGNED_FLAG)) { + my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, MYF(0), + field->field_name); + } else { + *fts_doc_col_no = i; + } + + return(true); + } + + if (!table) { + return(false); + } + + for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) { + const char* name = dict_table_get_col_name(table, i); + + if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) { +#ifdef UNIV_DEBUG + const dict_col_t* col; + + col = dict_table_get_nth_col(table, i); + + /* Because the FTS_DOC_ID does not exist in + the MySQL data dictionary, this must be the + internally created FTS_DOC_ID column. */ + ut_ad(col->mtype == DATA_INT); + ut_ad(col->len == 8); + ut_ad(col->prtype & DATA_NOT_NULL); + ut_ad(col->prtype & DATA_UNSIGNED); +#endif /* UNIV_DEBUG */ + *fts_doc_col_no = i; + return(true); + } + } + + return(false); +} + +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column. +@return the status of the FTS_DOC_ID index */ +UNIV_INTERN +enum fts_doc_id_index_enum +innobase_fts_check_doc_id_index( +/*============================*/ + const dict_table_t* table, /*!< in: table definition */ + const TABLE* altered_table, /*!< in: MySQL table + that is being altered */ + ulint* fts_doc_col_no) /*!< out: The column number for + Doc ID, or ULINT_UNDEFINED + if it is being created in + ha_alter_info */ +{ + const dict_index_t* index; + const dict_field_t* field; + + if (altered_table) { + /* Check if a unique index with the name of + FTS_DOC_ID_INDEX_NAME is being created. 
*/ + + for (uint i = 0; i < altered_table->s->keys; i++) { + const KEY& key = altered_table->key_info[i]; + + if (innobase_strcasecmp( + key.name, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + if ((key.flags & HA_NOSAME) + && key.user_defined_key_parts == 1 + && !strcmp(key.name, FTS_DOC_ID_INDEX_NAME) + && !strcmp(key.key_part[0].field->field_name, + FTS_DOC_ID_COL_NAME)) { + if (fts_doc_col_no) { + *fts_doc_col_no = ULINT_UNDEFINED; + } + return(FTS_EXIST_DOC_ID_INDEX); + } else { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + } + } + + if (!table) { + return(FTS_NOT_EXIST_DOC_ID_INDEX); + } + + for (index = dict_table_get_first_index(table); + index; index = dict_table_get_next_index(index)) { + + /* Check if there exists a unique index with the name of + FTS_DOC_ID_INDEX_NAME */ + if (innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + if (!dict_index_is_unique(index) + || dict_index_get_n_unique(index) > 1 + || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + + /* Check whether the index has FTS_DOC_ID as its + first column */ + field = dict_index_get_nth_field(index, 0); + + /* The column would be of a BIGINT data type */ + if (strcmp(field->name, FTS_DOC_ID_COL_NAME) == 0 + && field->col->mtype == DATA_INT + && field->col->len == 8 + && field->col->prtype & DATA_NOT_NULL) { + if (fts_doc_col_no) { + *fts_doc_col_no = dict_col_get_no(field->col); + } + return(FTS_EXIST_DOC_ID_INDEX); + } else { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + } + + + /* Not found */ + return(FTS_NOT_EXIST_DOC_ID_INDEX); +} +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column in MySQL create index definition. 
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+	ulint		n_key,		/*!< in: Number of keys */
+	const KEY*	key_info)	/*!< in: Key definitions */
+{
+	/* Check whether there is an FTS_DOC_ID_INDEX in the list of
+	indexes to be built. */
+	for (ulint j = 0; j < n_key; j++) {
+		const KEY*	key = &key_info[j];
+
+		if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) {
+			continue;
+		}
+
+		/* Check the FTS_DOC_ID_INDEX: it must be unique,
+		named "FTS_DOC_ID_INDEX" and defined on the column
+		"FTS_DOC_ID". */
+		if (!(key->flags & HA_NOSAME)
+		    || key->user_defined_key_parts != 1
+		    || strcmp(key->name, FTS_DOC_ID_INDEX_NAME)
+		    || strcmp(key->key_part[0].field->field_name,
+			      FTS_DOC_ID_COL_NAME)) {
+			return(FTS_INCORRECT_DOC_ID_INDEX);
+		}
+
+		return(FTS_EXIST_DOC_ID_INDEX);
+	}
+
+	return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
+/*******************************************************************//**
+Create index definitions, ordered as follows:
+
+IF a new primary key is defined for the table THEN
+
+	1) New primary key
+	2) The remaining keys in key_info
+
+ELSE
+
+	1) All new indexes in the order they arrive from MySQL
+
+ENDIF
+
+@return key definitions */
+static __attribute__((nonnull, warn_unused_result, malloc))
+index_def_t*
+innobase_create_key_defs(
+/*=====================*/
+	mem_heap_t*			heap,
+			/*!< in/out: memory heap where space for key
+			definitions is allocated */
+	const Alter_inplace_info*	ha_alter_info,
+			/*!< in: alter operation */
+	const TABLE*			altered_table,
+			/*!< in: MySQL table that is being altered */
+	ulint&				n_add,
+			/*!< in/out: number of indexes to be created */
+	ulint&				n_fts_add,
+			/*!< out: number of FTS indexes to be created */
+	bool				got_default_clust,
+			/*!< in: whether the table lacks a primary key */
+	ulint&				fts_doc_id_col,
+			/*!< in: The column number for Doc ID */
+	bool&				add_fts_doc_id,
+			/*!< in: whether we need to add new DOC ID
+			column for FTS index */
+	bool&				add_fts_doc_idx)
+			/*!< in: whether we need to add new DOC ID
+			index for FTS index */
+{
+	index_def_t*		indexdef;
+	index_def_t*		indexdefs;
+	bool			new_primary;
+	const uint*const	add
+		= ha_alter_info->index_add_buffer;
+	const KEY*const		key_info
+		= ha_alter_info->key_info_buffer;
+
+	DBUG_ENTER("innobase_create_key_defs");
+	DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx);
+	DBUG_ASSERT(ha_alter_info->index_add_count == n_add);
+
+	/* If there is a primary key, it is always the first index
+	defined for the innodb_table. */
+
+	new_primary = n_add > 0
+		&& !my_strcasecmp(system_charset_info,
+				  key_info[*add].name, "PRIMARY");
+	n_fts_add = 0;
+
+	/* If there is a UNIQUE INDEX consisting entirely of NOT NULL
+	columns and if the index does not contain column prefix(es)
+	(only prefix/part of the column is indexed), MySQL will treat the
+	index as a PRIMARY KEY unless the table already has one. */
+
+	if (n_add > 0 && !new_primary && got_default_clust
+	    && (key_info[*add].flags & HA_NOSAME)
+	    && !(key_info[*add].flags & HA_KEY_HAS_PART_KEY_SEG)) {
+		uint	key_part = key_info[*add].user_defined_key_parts;
+
+		new_primary = true;
+
+		while (key_part--) {
+			const uint	maybe_null
+				= key_info[*add].key_part[key_part].key_type
+				& FIELDFLAG_MAYBE_NULL;
+			DBUG_ASSERT(!maybe_null
+				    == !key_info[*add].key_part[key_part].
+ field->real_maybe_null()); + + if (maybe_null) { + new_primary = false; + break; + } + } + } + + const bool rebuild = new_primary || add_fts_doc_id + || innobase_need_rebuild(ha_alter_info); + /* Reserve one more space if new_primary is true, and we might + need to add the FTS_DOC_ID_INDEX */ + indexdef = indexdefs = static_cast<index_def_t*>( + mem_heap_alloc( + heap, sizeof *indexdef + * (ha_alter_info->key_count + + rebuild + + got_default_clust))); + + if (rebuild) { + ulint primary_key_number; + + if (new_primary) { + DBUG_ASSERT(n_add > 0); + primary_key_number = *add; + } else if (got_default_clust) { + /* Create the GEN_CLUST_INDEX */ + index_def_t* index = indexdef++; + + index->fields = NULL; + index->n_fields = 0; + index->ind_type = DICT_CLUSTERED; + index->name = mem_heap_strdup( + heap, innobase_index_reserve_name); + index->key_number = ~0; + primary_key_number = ULINT_UNDEFINED; + goto created_clustered; + } else { + primary_key_number = 0; + } + + /* Create the PRIMARY key index definition */ + innobase_create_index_def( + altered_table, key_info, primary_key_number, + TRUE, TRUE, indexdef++, heap); + +created_clustered: + n_add = 1; + + for (ulint i = 0; i < ha_alter_info->key_count; i++) { + if (i == primary_key_number) { + continue; + } + /* Copy the index definitions. */ + innobase_create_index_def( + altered_table, key_info, i, TRUE, FALSE, + indexdef, heap); + + if (indexdef->ind_type & DICT_FTS) { + n_fts_add++; + } + + indexdef++; + n_add++; + } + + if (n_fts_add > 0) { + if (!add_fts_doc_id + && !innobase_fts_check_doc_id_col( + NULL, altered_table, + &fts_doc_id_col)) { + fts_doc_id_col = altered_table->s->fields; + add_fts_doc_id = true; + } + + if (!add_fts_doc_idx) { + fts_doc_id_index_enum ret; + ulint doc_col_no; + + ret = innobase_fts_check_doc_id_index( + NULL, altered_table, &doc_col_no); + + /* This should have been checked before */ + ut_ad(ret != FTS_INCORRECT_DOC_ID_INDEX); + + if (ret == FTS_NOT_EXIST_DOC_ID_INDEX) { + add_fts_doc_idx = true; + } else { + ut_ad(ret == FTS_EXIST_DOC_ID_INDEX); + ut_ad(doc_col_no == ULINT_UNDEFINED + || doc_col_no == fts_doc_id_col); + } + } + } + } else { + /* Create definitions for added secondary indexes. 
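+		No table rebuild is needed, so the keys are copied in
+		the order in which MySQL requested them.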
*/
+
+		for (ulint i = 0; i < n_add; i++) {
+			innobase_create_index_def(
+				altered_table, key_info, add[i], FALSE, FALSE,
+				indexdef, heap);
+
+			if (indexdef->ind_type & DICT_FTS) {
+				n_fts_add++;
+			}
+
+			indexdef++;
+		}
+	}
+
+	DBUG_ASSERT(indexdefs + n_add == indexdef);
+
+	if (add_fts_doc_idx) {
+		index_def_t*	index = indexdef++;
+
+		index->fields = static_cast<index_field_t*>(
+			mem_heap_alloc(heap, sizeof *index->fields));
+		index->n_fields = 1;
+		index->fields->col_no = fts_doc_id_col;
+		index->fields->prefix_len = 0;
+		index->ind_type = DICT_UNIQUE;
+
+		if (rebuild) {
+			index->name = mem_heap_strdup(
+				heap, FTS_DOC_ID_INDEX_NAME);
+			ut_ad(!add_fts_doc_id
+			      || fts_doc_id_col == altered_table->s->fields);
+		} else {
+			char*	index_name;
+			index->name = index_name = static_cast<char*>(
+				mem_heap_alloc(
+					heap,
+					1 + sizeof FTS_DOC_ID_INDEX_NAME));
+			*index_name++ = TEMP_INDEX_PREFIX;
+			memcpy(index_name, FTS_DOC_ID_INDEX_NAME,
+			       sizeof FTS_DOC_ID_INDEX_NAME);
+		}
+
+		/* TODO: assign a real MySQL key number for this */
+		index->key_number = ULINT_UNDEFINED;
+		n_add++;
+	}
+
+	DBUG_ASSERT(indexdef > indexdefs);
+	DBUG_ASSERT((ulint) (indexdef - indexdefs)
+		    <= ha_alter_info->key_count
+		    + add_fts_doc_idx + got_default_clust);
+	DBUG_ASSERT(ha_alter_info->index_add_count <= n_add);
+	DBUG_RETURN(indexdefs);
+}
+
+/*******************************************************************//**
+Check the length of each index key part; make sure none exceeds the
+maximum limit.
+@return true if any index key part exceeds the limit */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_check_column_length(
+/*=========================*/
+	ulint		max_col_len,	/*!< in: maximum column length */
+	const KEY*	key_info)	/*!< in: Indexes to be created */
+{
+	for (ulint key_part = 0; key_part < key_info->user_defined_key_parts; key_part++) {
+		if (key_info->key_part[key_part].length > max_col_len) {
+			return(true);
+		}
+	}
+	return(false);
+}
+
+struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
+{
+	/** Dummy query graph */
+	que_thr_t*	thr;
+	/** reference to the prebuilt struct of the creating instance */
+	row_prebuilt_t*&prebuilt;
+	/** InnoDB indexes being created */
+	dict_index_t**	add_index;
+	/** MySQL key numbers for the InnoDB indexes that are being created */
+	const ulint*	add_key_numbers;
+	/** number of InnoDB indexes being created */
+	ulint		num_to_add_index;
+	/** InnoDB indexes being dropped */
+	dict_index_t**	drop_index;
+	/** number of InnoDB indexes being dropped */
+	const ulint	num_to_drop_index;
+	/** InnoDB foreign key constraints being dropped */
+	dict_foreign_t** drop_fk;
+	/** number of InnoDB foreign key constraints being dropped */
+	const ulint	num_to_drop_fk;
+	/** InnoDB foreign key constraints being added */
+	dict_foreign_t** add_fk;
+	/** number of InnoDB foreign key constraints being added */
+	const ulint	num_to_add_fk;
+	/** whether to create the indexes online */
+	bool		online;
+	/** memory heap */
+	mem_heap_t*	heap;
+	/** dictionary transaction */
+	trx_t*		trx;
+	/** original table (if rebuilt, differs from indexed_table) */
+	dict_table_t*	old_table;
+	/** table where the indexes are being created or dropped */
+	dict_table_t*	new_table;
+	/** mapping of old column numbers to new ones, or NULL */
+	const ulint*	col_map;
+	/** new column names, or NULL if nothing was renamed */
+	const char**	col_names;
+	/** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */
+	const ulint	add_autoinc;
+	/** default values of ADD COLUMN, or NULL */
+	
const dtuple_t* add_cols; + /** autoinc sequence to use */ + ib_sequence_t sequence; + /** maximum auto-increment value */ + ulonglong max_autoinc; + /** temporary table name to use for old table when renaming tables */ + const char* tmp_name; + + ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg, + dict_index_t** drop_arg, + ulint num_to_drop_arg, + dict_foreign_t** drop_fk_arg, + ulint num_to_drop_fk_arg, + dict_foreign_t** add_fk_arg, + ulint num_to_add_fk_arg, + bool online_arg, + mem_heap_t* heap_arg, + dict_table_t* new_table_arg, + const char** col_names_arg, + ulint add_autoinc_arg, + ulonglong autoinc_col_min_value_arg, + ulonglong autoinc_col_max_value_arg) : + inplace_alter_handler_ctx(), + prebuilt (prebuilt_arg), + add_index (0), add_key_numbers (0), num_to_add_index (0), + drop_index (drop_arg), num_to_drop_index (num_to_drop_arg), + drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg), + add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg), + online (online_arg), heap (heap_arg), trx (0), + old_table (prebuilt_arg->table), + new_table (new_table_arg), + col_map (0), col_names (col_names_arg), + add_autoinc (add_autoinc_arg), + add_cols (0), + sequence(prebuilt->trx->mysql_thd, + autoinc_col_min_value_arg, autoinc_col_max_value_arg), + max_autoinc (0), + tmp_name (0) + { +#ifdef UNIV_DEBUG + for (ulint i = 0; i < num_to_add_index; i++) { + ut_ad(!add_index[i]->to_be_dropped); + } + for (ulint i = 0; i < num_to_drop_index; i++) { + ut_ad(drop_index[i]->to_be_dropped); + } +#endif /* UNIV_DEBUG */ + + thr = pars_complete_graph_for_exec(NULL, prebuilt->trx, heap); + } + + ~ha_innobase_inplace_ctx() + { + mem_heap_free(heap); + } + + /** Determine if the table will be rebuilt. + @return whether the table will be rebuilt */ + bool need_rebuild () const { return(old_table != new_table); } + +private: + // Disable copying + ha_innobase_inplace_ctx(const ha_innobase_inplace_ctx&); + ha_innobase_inplace_ctx& operator=(const ha_innobase_inplace_ctx&); +}; + +/********************************************************************//** +Drop any indexes that we were not able to free previously due to +open table handles. */ +static +void +online_retry_drop_indexes_low( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + + /* We can have table->n_ref_count > 1, because other threads + may have prebuilt->table pointing to the table. However, these + other threads should be between statements, waiting for the + next statement to execute, or for a meta-data lock. */ + ut_ad(table->n_ref_count >= 1); + + if (table->drop_aborted) { + row_merge_drop_indexes(trx, table, TRUE); + } +} + +/********************************************************************//** +Drop any indexes that we were not able to free previously due to +open table handles. 
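+This variant allocates, commits and frees its own transaction.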
*/ +static __attribute__((nonnull)) +void +online_retry_drop_indexes( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + THD* user_thd) /*!< in/out: MySQL connection */ +{ + if (table->drop_aborted) { + trx_t* trx = innobase_trx_allocate(user_thd); + + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + row_mysql_lock_data_dictionary(trx); + online_retry_drop_indexes_low(table, trx); + trx_commit_for_mysql(trx); + row_mysql_unlock_data_dictionary(trx); + trx_free_for_mysql(trx); + } + +#ifdef UNIV_DEBUG + mutex_enter(&dict_sys->mutex); + dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE); + mutex_exit(&dict_sys->mutex); + ut_a(!table->drop_aborted); +#endif /* UNIV_DEBUG */ +} + +/********************************************************************//** +Commit a dictionary transaction and drop any indexes that we were not +able to free previously due to open table handles. */ +static __attribute__((nonnull)) +void +online_retry_drop_indexes_with_trx( +/*===============================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Now that the dictionary is being locked, check if we can + drop any incompletely created indexes that may have been left + behind in rollback_inplace_alter_table() earlier. */ + if (table->drop_aborted) { + + trx->table_id = 0; + + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + + online_retry_drop_indexes_low(table, trx); + trx_commit_for_mysql(trx); + } +} + +/** Determines if InnoDB is dropping a foreign key constraint. +@param foreign the constraint +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@return whether the constraint is being dropped */ +inline __attribute__((pure, nonnull, warn_unused_result)) +bool +innobase_dropping_foreign( +/*======================*/ + const dict_foreign_t* foreign, + dict_foreign_t** drop_fk, + ulint n_drop_fk) +{ + while (n_drop_fk--) { + if (*drop_fk++ == foreign) { + return(true); + } + } + + return(false); +} + +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. +@param user_table InnoDB table as it is before the ALTER operation +@param col_name Name of the column being altered +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@param drop true=drop column, false=set NOT NULL +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +static __attribute__((pure, nonnull, warn_unused_result)) +bool +innobase_check_foreigns_low( +/*========================*/ + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk, + const char* col_name, + bool drop) +{ + dict_foreign_t* foreign; + ut_ad(mutex_own(&dict_sys->mutex)); + + /* Check if any FOREIGN KEY constraints are defined on this + column. */ + + for (dict_foreign_set::iterator it = user_table->foreign_set.begin(); + it != user_table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (!drop && !(foreign->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + continue; + } + + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } + + for (unsigned f = 0; f < foreign->n_fields; f++) { + if (!strcmp(foreign->foreign_col_names[f], + col_name)) { + my_error(drop + ? 
ER_FK_COLUMN_CANNOT_DROP + : ER_FK_COLUMN_NOT_NULL, MYF(0), + col_name, foreign->id); + return(true); + } + } + } + + if (!drop) { + /* SET NULL clauses on foreign key constraints of + child tables affect the child tables, not the parent table. + The column can be NOT NULL in the parent table. */ + return(false); + } + + /* Check if any FOREIGN KEY constraints in other tables are + referring to the column that is being dropped. */ + for (dict_foreign_set::iterator it + = user_table->referenced_set.begin(); + it != user_table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } + + for (unsigned f = 0; f < foreign->n_fields; f++) { + char display_name[FN_REFLEN]; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + continue; + } + + char* buf_end = innobase_convert_name( + display_name, (sizeof display_name) - 1, + foreign->foreign_table_name, + strlen(foreign->foreign_table_name), + NULL, TRUE); + *buf_end = '\0'; + my_error(ER_FK_COLUMN_CANNOT_DROP_CHILD, + MYF(0), col_name, foreign->id, + display_name); + + return(true); + } + } + + return(false); +} + +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param user_table InnoDB table as it is before the ALTER operation +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +static __attribute__((pure, nonnull, warn_unused_result)) +bool +innobase_check_foreigns( +/*====================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* old_table, + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk) +{ + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + + for (Field** fp = old_table->field; *fp; fp++) { + cf_it.rewind(); + const Create_field* new_field; + + ut_ad(!(*fp)->real_maybe_null() + == !!((*fp)->flags & NOT_NULL_FLAG)); + + while ((new_field = cf_it++)) { + if (new_field->field == *fp) { + break; + } + } + + if (!new_field || (new_field->flags & NOT_NULL_FLAG)) { + if (innobase_check_foreigns_low( + user_table, drop_fk, n_drop_fk, + (*fp)->field_name, !new_field)) { + return(true); + } + } + } + + return(false); +} + +/** Convert a default value for ADD COLUMN. + +@param heap Memory heap where allocated +@param dfield InnoDB data field to copy to +@param field MySQL value for the column +@param comp nonzero if in compact format */ +static __attribute__((nonnull)) +void +innobase_build_col_map_add( +/*=======================*/ + mem_heap_t* heap, + dfield_t* dfield, + const Field* field, + ulint comp) +{ + if (field->is_real_null()) { + dfield_set_null(dfield); + return; + } + + ulint size = field->pack_length(); + + byte* buf = static_cast<byte*>(mem_heap_alloc(heap, size)); + + row_mysql_store_col_in_innobase_format( + dfield, buf, TRUE, field->ptr, size, comp); +} + +/** Construct the translation table for reordering, dropping or +adding columns. 
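+
+An illustrative example (editorial addition, not taken from the source):
+for a table with columns (a, b, c) altered by DROP COLUMN b, ADD COLUMN d,
+the map built below would be
+
+	col_map[0] = 0;               // a keeps position 0
+	col_map[1] = ULINT_UNDEFINED; // b is dropped
+	col_map[2] = 1;               // c moves up to position 1
+
+while the added column d gets no entry, since it has no source column.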
+
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param table MySQL table as it is before the ALTER operation
+@param new_table InnoDB table corresponding to MySQL altered_table
+@param old_table InnoDB table corresponding to MySQL table
+@param add_cols Default values for ADD COLUMN, or NULL if no ADD COLUMN
+@param heap Memory heap where allocated
+@return array of integers, mapping column numbers in the table
+to column numbers in altered_table */
+static __attribute__((nonnull(1,2,3,4,5,7), warn_unused_result))
+const ulint*
+innobase_build_col_map(
+/*===================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table,
+	const dict_table_t*	new_table,
+	const dict_table_t*	old_table,
+	dtuple_t*		add_cols,
+	mem_heap_t*		heap)
+{
+	DBUG_ENTER("innobase_build_col_map");
+	DBUG_ASSERT(altered_table != table);
+	DBUG_ASSERT(new_table != old_table);
+	DBUG_ASSERT(dict_table_get_n_cols(new_table)
+		    >= altered_table->s->fields + DATA_N_SYS_COLS);
+	DBUG_ASSERT(dict_table_get_n_cols(old_table)
+		    >= table->s->fields + DATA_N_SYS_COLS);
+	DBUG_ASSERT(!!add_cols == !!(ha_alter_info->handler_flags
+				     & Alter_inplace_info::ADD_COLUMN));
+	DBUG_ASSERT(!add_cols || dtuple_get_n_fields(add_cols)
+		    == dict_table_get_n_cols(new_table));
+
+	ulint*	col_map = static_cast<ulint*>(
+		mem_heap_alloc(heap, old_table->n_cols * sizeof *col_map));
+
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+	uint	i = 0;
+
+	/* Any dropped columns will map to ULINT_UNDEFINED. */
+	for (uint old_i = 0; old_i + DATA_N_SYS_COLS < old_table->n_cols;
+	     old_i++) {
+		col_map[old_i] = ULINT_UNDEFINED;
+	}
+
+	while (const Create_field* new_field = cf_it++) {
+		for (uint old_i = 0; table->field[old_i]; old_i++) {
+			const Field* field = table->field[old_i];
+			if (new_field->field == field) {
+				col_map[old_i] = i;
+				goto found_col;
+			}
+		}
+
+		innobase_build_col_map_add(
+			heap, dtuple_get_nth_field(add_cols, i),
+			altered_table->field[i],
+			dict_table_is_comp(new_table));
+found_col:
+		i++;
+	}
+
+	DBUG_ASSERT(i == altered_table->s->fields);
+
+	i = table->s->fields;
+
+	/* Add the InnoDB hidden FTS_DOC_ID column, if any. */
+	if (i + DATA_N_SYS_COLS < old_table->n_cols) {
+		/* There should be exactly one extra field,
+		the FTS_DOC_ID. */
+		DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(old_table,
+						 DICT_TF2_FTS_HAS_DOC_ID));
+		DBUG_ASSERT(i + DATA_N_SYS_COLS + 1 == old_table->n_cols);
+		DBUG_ASSERT(!strcmp(dict_table_get_col_name(
+					    old_table, table->s->fields),
+				    FTS_DOC_ID_COL_NAME));
+		if (altered_table->s->fields + DATA_N_SYS_COLS
+		    < new_table->n_cols) {
+			DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			DBUG_ASSERT(altered_table->s->fields
+				    + DATA_N_SYS_COLS + 1
+				    == new_table->n_cols);
+			col_map[i] = altered_table->s->fields;
+		} else {
+			DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			col_map[i] = ULINT_UNDEFINED;
+		}
+
+		i++;
+	} else {
+		DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+				    old_table,
+				    DICT_TF2_FTS_HAS_DOC_ID));
+	}
+
+	for (; i < old_table->n_cols; i++) {
+		col_map[i] = i + new_table->n_cols - old_table->n_cols;
+	}
+
+	DBUG_RETURN(col_map);
+}
+
+/** Drop newly created FTS-index-related auxiliary tables during the
+FIC (fast index creation) process, before fts_add_index is called
+@param table table that was being rebuilt online
+@param trx transaction
+@return DB_SUCCESS if successful, otherwise last error code
+*/
+static
+dberr_t
+innobase_drop_fts_index_table(
+/*==========================*/
+	dict_table_t*	table,
+	trx_t*		trx)
+{
+	dberr_t		ret_err = DB_SUCCESS;
+
+	for (dict_index_t* index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+		if (index->type & DICT_FTS) {
+			dberr_t	err;
+
+			err = fts_drop_index_tables(trx, index);
+
+			if (err != DB_SUCCESS) {
+				ret_err = err;
+			}
+		}
+	}
+
+	return(ret_err);
+}
+
+/** Get the new column names if any columns were renamed
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param table MySQL table as it is before the ALTER operation
+@param user_table InnoDB table as it is before the ALTER operation
+@param heap Memory heap for the allocation
+@return array of new column names in rebuilt_table, or NULL if not renamed */
+static __attribute__((nonnull, warn_unused_result))
+const char**
+innobase_get_col_names(
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table,
+	const dict_table_t*	user_table,
+	mem_heap_t*		heap)
+{
+	const char**	cols;
+	uint		i;
+
+	DBUG_ENTER("innobase_get_col_names");
+	DBUG_ASSERT(user_table->n_def > table->s->fields);
+	DBUG_ASSERT(ha_alter_info->handler_flags
+		    & Alter_inplace_info::ALTER_COLUMN_NAME);
+
+	cols = static_cast<const char**>(
+		mem_heap_zalloc(heap, user_table->n_def * sizeof *cols));
+
+	i = 0;
+	List_iterator_fast<Create_field> cf_it(
+		ha_alter_info->alter_info->create_list);
+	while (const Create_field* new_field = cf_it++) {
+		DBUG_ASSERT(i < altered_table->s->fields);
+
+		for (uint old_i = 0; table->field[old_i]; old_i++) {
+			if (new_field->field == table->field[old_i]) {
+				cols[old_i] = new_field->field_name;
+				break;
+			}
+		}
+
+		i++;
+	}
+
+	/* Copy the internal column names. */
+	i = table->s->fields;
+	cols[i] = dict_table_get_col_name(user_table, i);
+
+	while (++i < user_table->n_def) {
+		cols[i] = cols[i - 1] + strlen(cols[i - 1]) + 1;
+	}
+
+	DBUG_RETURN(cols);
+}
+
+/** Update internal structures with concurrent writes blocked,
+while preparing ALTER TABLE.
+ +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param table_name Table name in MySQL +@param flags Table and tablespace flags +@param flags2 Additional table flags +@param fts_doc_id_col The column number of FTS_DOC_ID +@param add_fts_doc_id Flag: add column FTS_DOC_ID? +@param add_fts_doc_id_idx Flag: add index FTS_DOC_ID_INDEX (FTS_DOC_ID)? + +@retval true Failure +@retval false Success +*/ +static __attribute__((warn_unused_result, nonnull(1,2,3,4))) +bool +prepare_inplace_alter_table_dict( +/*=============================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* old_table, + const char* table_name, + ulint flags, + ulint flags2, + ulint fts_doc_id_col, + bool add_fts_doc_id, + bool add_fts_doc_id_idx) +{ + bool dict_locked = false; + ulint* add_key_nums; /* MySQL key numbers */ + index_def_t* index_defs; /* index definitions */ + dict_table_t* user_table; + dict_index_t* fts_index = NULL; + ulint new_clustered = 0; + dberr_t error; + ulint num_fts_index; + ha_innobase_inplace_ctx*ctx; + + DBUG_ENTER("prepare_inplace_alter_table_dict"); + + ctx = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + + DBUG_ASSERT((ctx->add_autoinc != ULINT_UNDEFINED) + == (ctx->sequence.m_max_value > 0)); + DBUG_ASSERT(!ctx->num_to_drop_index == !ctx->drop_index); + DBUG_ASSERT(!ctx->num_to_drop_fk == !ctx->drop_fk); + DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx); + DBUG_ASSERT(!add_fts_doc_id_idx + || innobase_fulltext_exist(altered_table)); + DBUG_ASSERT(!ctx->add_cols); + DBUG_ASSERT(!ctx->add_index); + DBUG_ASSERT(!ctx->add_key_numbers); + DBUG_ASSERT(!ctx->num_to_add_index); + + user_table = ctx->new_table; + + trx_start_if_not_started_xa(ctx->prebuilt->trx); + + /* Create a background transaction for the operations on + the data dictionary tables. */ + ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd); + + if (UNIV_UNLIKELY(ctx->trx->fake_changes)) { + trx_rollback_to_savepoint(ctx->trx, NULL); + trx_free_for_mysql(ctx->trx); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX); + + /* Create table containing all indexes to be built in this + ALTER TABLE ADD INDEX so that they are in the correct order + in the table. */ + + ctx->num_to_add_index = ha_alter_info->index_add_count; + + index_defs = innobase_create_key_defs( + ctx->heap, ha_alter_info, altered_table, ctx->num_to_add_index, + num_fts_index, + row_table_got_default_clust_index(ctx->new_table), + fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx); + + new_clustered = DICT_CLUSTERED & index_defs[0].ind_type; + + if (num_fts_index > 1) { + my_error(ER_INNODB_FT_LIMIT, MYF(0)); + goto error_handled; + } + + if (!ctx->online) { + /* This is not an online operation (LOCK=NONE). */ + } else if (ctx->add_autoinc == ULINT_UNDEFINED + && num_fts_index == 0 + && (!innobase_need_rebuild(ha_alter_info) + || !innobase_fulltext_exist(altered_table))) { + /* InnoDB can perform an online operation (LOCK=NONE). */ + } else { + /* This should have been blocked in + check_if_supported_inplace_alter(). 
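+		An editorial illustration (hedged, not from the source):
+		a request such as
+
+			ALTER TABLE t ADD FULLTEXT INDEX ft_b (b),
+			ALGORITHM=INPLACE, LOCK=NONE;
+
+		would ask for an online operation with num_fts_index > 0,
+		which the conditions above cannot accept;
+		check_if_supported_inplace_alter() is expected to reject
+		such a request before this point.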
*/
+		ut_ad(0);
+		my_error(ER_NOT_SUPPORTED_YET, MYF(0),
+			 thd_query_string(ctx->prebuilt->trx->mysql_thd)->str);
+		goto error_handled;
+	}
+
+	/* The primary index would be rebuilt if an FTS Doc ID
+	column is to be added, and the primary index definition
+	is just copied from the old table and stored in index_defs[0] */
+	DBUG_ASSERT(!add_fts_doc_id || new_clustered);
+	DBUG_ASSERT(!!new_clustered ==
+		    (innobase_need_rebuild(ha_alter_info)
+		     || add_fts_doc_id));
+
+	/* Allocate memory for dictionary index definitions */
+
+	ctx->add_index = static_cast<dict_index_t**>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_add_index
+			       * sizeof *ctx->add_index));
+	ctx->add_key_numbers = add_key_nums = static_cast<ulint*>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_add_index
+			       * sizeof *ctx->add_key_numbers));
+
+	/* This transaction should be a dictionary operation, so that
+	the data dictionary will be locked during crash recovery. */
+
+	ut_ad(ctx->trx->dict_operation == TRX_DICT_OP_INDEX);
+
+	/* Acquire a lock on the table before creating any indexes. */
+
+	if (ctx->online) {
+		error = DB_SUCCESS;
+	} else {
+		error = row_merge_lock_table(
+			ctx->prebuilt->trx, ctx->new_table, LOCK_S);
+
+		if (error != DB_SUCCESS) {
+
+			goto error_handling;
+		}
+	}
+
+	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
+	or lock waits can happen in it during an index create operation. */
+
+	row_mysql_lock_data_dictionary(ctx->trx);
+	dict_locked = true;
+
+	/* Wait for background stats processing to stop using the table that
+	we are going to alter. We know bg stats will not start using it again
+	until we are holding the data dict locked and we are holding it here
+	at least until checking ut_ad(user_table->n_ref_count == 1) below.
+	XXX what may happen if bg stats opens the table after we
+	have unlocked data dictionary below? */
+	dict_stats_wait_bg_to_stop_using_table(user_table, ctx->trx);
+
+	online_retry_drop_indexes_low(ctx->new_table, ctx->trx);
+
+	ut_d(dict_table_check_for_dup_indexes(
+		     ctx->new_table, CHECK_ABORTED_OK));
+
+	/* If a new clustered index is defined for the table we need
+	to rebuild the table with a temporary name. */
+
+	if (new_clustered) {
+		const char*	new_table_name
+			= dict_mem_create_temporary_tablename(
+				ctx->heap,
+				ctx->new_table->name,
+				ctx->new_table->id);
+		ulint		n_cols;
+		dtuple_t*	add_cols;
+
+		if (innobase_check_foreigns(
+			    ha_alter_info, altered_table, old_table,
+			    user_table, ctx->drop_fk, ctx->num_to_drop_fk)) {
+			goto new_clustered_failed;
+		}
+
+		n_cols = altered_table->s->fields;
+
+		if (add_fts_doc_id) {
+			n_cols++;
+			DBUG_ASSERT(flags2 & DICT_TF2_FTS);
+			DBUG_ASSERT(add_fts_doc_id_idx);
+			flags2 |= DICT_TF2_FTS_ADD_DOC_ID
+				| DICT_TF2_FTS_HAS_DOC_ID
+				| DICT_TF2_FTS;
+		}
+
+		DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS));
+
+		/* Create the table. */
+		trx_set_dict_operation(ctx->trx, TRX_DICT_OP_TABLE);
+
+		if (dict_table_get_low(new_table_name)) {
+			my_error(ER_TABLE_EXISTS_ERROR, MYF(0),
+				 new_table_name);
+			goto new_clustered_failed;
+		}
+
+		/* The initial space id 0 may be overridden later. */
+		ctx->new_table = dict_mem_table_create(
+			new_table_name, 0, n_cols, flags, flags2, false);
+		/* The rebuilt indexed_table will use the renamed
+		column names.
*/ + ctx->col_names = NULL; + + if (DICT_TF_HAS_DATA_DIR(flags)) { + ctx->new_table->data_dir_path = + mem_heap_strdup(ctx->new_table->heap, + user_table->data_dir_path); + } + + for (uint i = 0; i < altered_table->s->fields; i++) { + const Field* field = altered_table->field[i]; + ulint is_unsigned; + ulint field_type + = (ulint) field->type(); + ulint col_type + = get_innobase_type_from_mysql_type( + &is_unsigned, field); + ulint charset_no; + ulint col_len; + + /* we assume in dtype_form_prtype() that this + fits in two bytes */ + ut_a(field_type <= MAX_CHAR_COLL_NUM); + + if (!field->real_maybe_null()) { + field_type |= DATA_NOT_NULL; + } + + if (field->binary()) { + field_type |= DATA_BINARY_TYPE; + } + + if (is_unsigned) { + field_type |= DATA_UNSIGNED; + } + + if (dtype_is_string_type(col_type)) { + charset_no = (ulint) field->charset()->number; + + if (charset_no > MAX_CHAR_COLL_NUM) { + dict_mem_table_free( + ctx->new_table); + my_error(ER_WRONG_KEY_COLUMN, MYF(0), + field->field_name); + goto new_clustered_failed; + } + } else { + charset_no = 0; + } + + col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes + length field for a true VARCHAR. Let us + subtract that, so that the InnoDB column + length in the InnoDB data dictionary is the + real maximum byte length of the actual data. */ + + if (field->type() == MYSQL_TYPE_VARCHAR) { + uint32 length_bytes + = static_cast<const Field_varstring*>( + field)->length_bytes; + + col_len -= length_bytes; + + if (length_bytes == 2) { + field_type |= DATA_LONG_TRUE_VARCHAR; + } + } + + if (dict_col_name_is_reserved(field->field_name)) { + dict_mem_table_free(ctx->new_table); + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name); + goto new_clustered_failed; + } + + dict_mem_table_add_col( + ctx->new_table, ctx->heap, + field->field_name, + col_type, + dtype_form_prtype(field_type, charset_no), + col_len); + } + + if (add_fts_doc_id) { + fts_add_doc_id_column(ctx->new_table, ctx->heap); + ctx->new_table->fts->doc_col = fts_doc_id_col; + ut_ad(fts_doc_id_col == altered_table->s->fields); + } else if (ctx->new_table->fts) { + ctx->new_table->fts->doc_col = fts_doc_id_col; + } + + error = row_create_table_for_mysql( + ctx->new_table, ctx->trx, false); + + switch (error) { + dict_table_t* temp_table; + case DB_SUCCESS: + /* We need to bump up the table ref count and + before we can use it we need to open the + table. The new_table must be in the data + dictionary cache, because we are still holding + the dict_sys->mutex. */ + ut_ad(mutex_own(&dict_sys->mutex)); + temp_table = dict_table_open_on_name( + ctx->new_table->name, TRUE, FALSE, + DICT_ERR_IGNORE_NONE); + ut_a(ctx->new_table == temp_table); + /* n_ref_count must be 1, because purge cannot + be executing on this very table as we are + holding dict_operation_lock X-latch. 
*/ + DBUG_ASSERT(ctx->new_table->n_ref_count == 1); + break; + case DB_TABLESPACE_EXISTS: + my_error(ER_TABLESPACE_EXISTS, MYF(0), + new_table_name); + goto new_clustered_failed; + case DB_DUPLICATE_KEY: + my_error(HA_ERR_TABLE_EXIST, MYF(0), + altered_table->s->table_name.str); + goto new_clustered_failed; + default: + my_error_innodb(error, table_name, flags); + new_clustered_failed: + DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); + trx_rollback_to_savepoint(ctx->trx, NULL); + + ut_ad(user_table->n_ref_count == 1); + + online_retry_drop_indexes_with_trx( + user_table, ctx->trx); + goto err_exit; + } + + if (ha_alter_info->handler_flags + & Alter_inplace_info::ADD_COLUMN) { + add_cols = dtuple_create( + ctx->heap, + dict_table_get_n_cols(ctx->new_table)); + + dict_table_copy_types(add_cols, ctx->new_table); + } else { + add_cols = NULL; + } + + ctx->col_map = innobase_build_col_map( + ha_alter_info, altered_table, old_table, + ctx->new_table, user_table, + add_cols, ctx->heap); + ctx->add_cols = add_cols; + } else { + DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info)); + + if (!ctx->new_table->fts + && innobase_fulltext_exist(altered_table)) { + ctx->new_table->fts = fts_create( + ctx->new_table); + ctx->new_table->fts->doc_col = fts_doc_id_col; + } + } + + /* Assign table_id, so that no table id of + fts_create_index_tables() will be written to the undo logs. */ + DBUG_ASSERT(ctx->new_table->id != 0); + ctx->trx->table_id = ctx->new_table->id; + + /* Create the indexes in SYS_INDEXES and load into dictionary. */ + + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + + ctx->add_index[a] = row_merge_create_index( + ctx->trx, ctx->new_table, + &index_defs[a]); + + add_key_nums[a] = index_defs[a].key_number; + + if (!ctx->add_index[a]) { + error = ctx->trx->error_state; + DBUG_ASSERT(error != DB_SUCCESS); + goto error_handling; + } + + if (ctx->add_index[a]->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(ctx->add_index[a]->type == DICT_FTS); + fts_index = ctx->add_index[a]; + } + + /* If only online ALTER TABLE operations have been + requested, allocate a modification log. If the table + will be locked anyway, the modification + log is unnecessary. When rebuilding the table + (new_clustered), we will allocate the log for the + clustered index of the old table, later. */ + if (new_clustered + || !ctx->online + || user_table->ibd_file_missing + || dict_table_is_discarded(user_table)) { + /* No need to allocate a modification log. */ + ut_ad(!ctx->add_index[a]->online_log); + } else if (ctx->add_index[a]->type & DICT_FTS) { + /* Fulltext indexes are not covered + by a modification log. */ + } else { + DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", + error = DB_OUT_OF_MEMORY; + goto error_handling;); + rw_lock_x_lock(&ctx->add_index[a]->lock); + bool ok = row_log_allocate(ctx->add_index[a], + NULL, true, NULL, NULL); + rw_lock_x_unlock(&ctx->add_index[a]->lock); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling; + } + } + } + + ut_ad(new_clustered == ctx->need_rebuild()); + + DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", + error = DB_OUT_OF_MEMORY; + goto error_handling;); + + if (new_clustered && ctx->online) { + /* Allocate a log for online table rebuild. 
*/ + dict_index_t* clust_index = dict_table_get_first_index( + user_table); + + rw_lock_x_lock(&clust_index->lock); + bool ok = row_log_allocate( + clust_index, ctx->new_table, + !(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_PK_INDEX), + ctx->add_cols, ctx->col_map); + rw_lock_x_unlock(&clust_index->lock); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling; + } + } + + if (ctx->online) { + /* Assign a consistent read view for + row_merge_read_clustered_index(). */ + trx_assign_read_view(ctx->prebuilt->trx); + } + + if (fts_index) { + /* Ensure that the dictionary operation mode will + not change while creating the auxiliary tables. */ + trx_dict_op_t op = trx_get_dict_operation(ctx->trx); + +#ifdef UNIV_DEBUG + switch (op) { + case TRX_DICT_OP_NONE: + break; + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + goto op_ok; + } + ut_error; +op_ok: +#endif /* UNIV_DEBUG */ + ut_ad(ctx->trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); + + /* This function will commit the transaction and reset + the trx_t::dict_operation flag on success. */ + + error = fts_create_index_tables(ctx->trx, fts_index); + + DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table", + error = DB_LOCK_WAIT_TIMEOUT; + goto error_handling;); + + if (error != DB_SUCCESS) { + goto error_handling; + } + + trx_start_for_ddl(ctx->trx, op); + + if (!ctx->new_table->fts + || ib_vector_size(ctx->new_table->fts->indexes) == 0) { + error = fts_create_common_tables( + ctx->trx, ctx->new_table, + user_table->name, TRUE); + + DBUG_EXECUTE_IF( + "innodb_test_fail_after_fts_common_table", + error = DB_LOCK_WAIT_TIMEOUT;); + + if (error != DB_SUCCESS) { + goto error_handling; + } + + ctx->new_table->fts->fts_status + |= TABLE_DICT_LOCKED; + + error = innobase_fts_load_stopword( + ctx->new_table, ctx->trx, + ctx->prebuilt->trx->mysql_thd) + ? DB_SUCCESS : DB_ERROR; + ctx->new_table->fts->fts_status + &= ~TABLE_DICT_LOCKED; + + if (error != DB_SUCCESS) { + goto error_handling; + } + } + + ut_ad(trx_get_dict_operation(ctx->trx) == op); + } + + DBUG_ASSERT(error == DB_SUCCESS); + + /* Commit the data dictionary transaction in order to release + the table locks on the system tables. This means that if + MySQL crashes while creating a new primary key inside + row_merge_build_indexes(), ctx->new_table will not be dropped + by trx_rollback_active(). It will have to be recovered or + dropped by the database administrator. */ + trx_commit_for_mysql(ctx->trx); + + row_mysql_unlock_data_dictionary(ctx->trx); + dict_locked = false; + + ut_a(ctx->trx->lock.n_active_thrs == 0); + + DBUG_EXECUTE_IF("crash_innodb_add_index_after", DBUG_SUICIDE();); + +error_handling: + /* After an error, remove all those index definitions from the + dictionary which were defined. 
*/
+
+	switch (error) {
+	case DB_SUCCESS:
+		ut_a(!dict_locked);
+
+		ut_d(mutex_enter(&dict_sys->mutex));
+		ut_d(dict_table_check_for_dup_indexes(
+			     user_table, CHECK_PARTIAL_OK));
+		ut_d(mutex_exit(&dict_sys->mutex));
+		DBUG_RETURN(false);
+	case DB_TABLESPACE_EXISTS:
+		my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)");
+		break;
+	case DB_DUPLICATE_KEY:
+		my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES");
+		break;
+	default:
+		my_error_innodb(error, table_name, user_table->flags);
+	}
+
+error_handled:
+
+	ctx->prebuilt->trx->error_info = NULL;
+	ctx->trx->error_state = DB_SUCCESS;
+
+	if (!dict_locked) {
+		row_mysql_lock_data_dictionary(ctx->trx);
+	}
+
+	if (new_clustered) {
+		if (ctx->need_rebuild()) {
+
+			if (DICT_TF2_FLAG_IS_SET(
+				    ctx->new_table, DICT_TF2_FTS)) {
+				innobase_drop_fts_index_table(
+					ctx->new_table, ctx->trx);
+			}
+
+			dict_table_close(ctx->new_table, TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+			/* Nobody should have initialized the stats of the
+			newly created table yet. When this is the case, we
+			know that it has not been added for background stats
+			gathering. */
+			ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+			row_merge_drop_table(ctx->trx, ctx->new_table);
+
+			/* Free the log for online table rebuild, if
+			one was allocated. */
+
+			dict_index_t* clust_index = dict_table_get_first_index(
+				user_table);
+
+			rw_lock_x_lock(&clust_index->lock);
+
+			if (clust_index->online_log) {
+				ut_ad(ctx->online);
+				row_log_abort_sec(clust_index);
+				clust_index->online_status
+					= ONLINE_INDEX_COMPLETE;
+			}
+
+			rw_lock_x_unlock(&clust_index->lock);
+		}
+
+		trx_commit_for_mysql(ctx->trx);
+		/* n_ref_count must be 1, because purge cannot
+		be executing on this very table as we are
+		holding dict_operation_lock X-latch. */
+		DBUG_ASSERT(user_table->n_ref_count == 1 || ctx->online);
+
+		online_retry_drop_indexes_with_trx(user_table, ctx->trx);
+	} else {
+		ut_ad(!ctx->need_rebuild());
+		row_merge_drop_indexes(ctx->trx, user_table, TRUE);
+		trx_commit_for_mysql(ctx->trx);
+	}
+
+	ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE));
+	ut_ad(!user_table->drop_aborted);
+
+err_exit:
+	/* Clear the to_be_dropped flag in the data dictionary cache. */
+	for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+		DBUG_ASSERT(*ctx->drop_index[i]->name != TEMP_INDEX_PREFIX);
+		DBUG_ASSERT(ctx->drop_index[i]->to_be_dropped);
+		ctx->drop_index[i]->to_be_dropped = 0;
+	}
+
+	row_mysql_unlock_data_dictionary(ctx->trx);
+
+	trx_free_for_mysql(ctx->trx);
+	trx_commit_for_mysql(ctx->prebuilt->trx);
+
+	delete ctx;
+	ha_alter_info->handler_ctx = NULL;
+
+	DBUG_RETURN(true);
+}
+
+/* Check whether an index is needed for a foreign key constraint.
+If the index is to be dropped, check whether an equivalent index can
+take its place.
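+(Editorial illustration: suppose child(a) REFERENCES parent(a) and the
+referenced side is served only by INDEX(a) on the parent. Dropping that
+index must be refused unless dict_foreign_find_index() finds another
+usable index, or innobase_find_equiv_index() finds an equivalent one
+among the indexes being added by the same ALTER TABLE.)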
+@return true if the index is needed and can't be dropped */ +static __attribute__((nonnull(1,2,3,5), warn_unused_result)) +bool +innobase_check_foreign_key_index( +/*=============================*/ + Alter_inplace_info* ha_alter_info, /*!< in: Structure describing + changes to be done by ALTER + TABLE */ + dict_index_t* index, /*!< in: index to check */ + dict_table_t* indexed_table, /*!< in: table that owns the + foreign keys */ + const char** col_names, /*!< in: column names, or NULL + for indexed_table->col_names */ + trx_t* trx, /*!< in/out: transaction */ + dict_foreign_t** drop_fk, /*!< in: Foreign key constraints + to drop */ + ulint n_drop_fk) /*!< in: Number of foreign keys + to drop */ +{ + dict_foreign_t* foreign; + + /* Check if the index is referenced. */ + foreign = dict_table_get_referenced_constraint(indexed_table, index); + + ut_ad(!foreign || indexed_table + == foreign->referenced_table); + + if (foreign + && !dict_foreign_find_index( + indexed_table, col_names, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE) + && !innobase_find_equiv_index( + foreign->referenced_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count) + ) { + trx->error_info = index; + return(true); + } + + /* Check if this index references some + other table */ + foreign = dict_table_get_foreign_constraint( + indexed_table, index); + + ut_ad(!foreign || indexed_table + == foreign->foreign_table); + + if (foreign + && !innobase_dropping_foreign( + foreign, drop_fk, n_drop_fk) + && !dict_foreign_find_index( + indexed_table, col_names, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE) + && !innobase_find_equiv_index( + foreign->foreign_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count) + ) { + trx->error_info = index; + return(true); + } + + return(false); +} + +/** Allows InnoDB to update internal structures with concurrent +writes blocked (provided that check_if_supported_inplace_alter() +did not return HA_ALTER_INPLACE_NO_LOCK). +This will be invoked before inplace_alter_table(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. 
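+
+(Editorial note: in this handler API this is the first of three calls;
+e.g. for
+
+	ALTER TABLE t ADD INDEX idx_b (b), ALGORITHM=INPLACE, LOCK=NONE;
+
+the server invokes prepare_inplace_alter_table(), then
+inplace_alter_table(), and finally commit_inplace_alter_table(), with
+rollback_inplace_alter_table() used on failure.)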
+
+@retval true Failure
+@retval false Success
+*/
+UNIV_INTERN
+bool
+ha_innobase::prepare_inplace_alter_table(
+/*=====================================*/
+	TABLE*			altered_table,
+	Alter_inplace_info*	ha_alter_info)
+{
+	dict_index_t**	drop_index;	/*!< Index to be dropped */
+	ulint		n_drop_index;	/*!< Number of indexes to drop */
+	dict_foreign_t**drop_fk;	/*!< Foreign key constraints to drop */
+	ulint		n_drop_fk;	/*!< Number of foreign keys to drop */
+	dict_foreign_t**add_fk = NULL;	/*!< Foreign key constraints to add */
+	ulint		n_add_fk;	/*!< Number of foreign keys to add */
+	dict_table_t*	indexed_table;	/*!< Table where indexes are created */
+	mem_heap_t*	heap;
+	const char**	col_names;
+	int		error;
+	ulint		flags;
+	ulint		flags2;
+	ulint		max_col_len;
+	ulint		add_autoinc_col_no	= ULINT_UNDEFINED;
+	ulonglong	autoinc_col_max_value	= 0;
+	ulint		fts_doc_col_no		= ULINT_UNDEFINED;
+	bool		add_fts_doc_id		= false;
+	bool		add_fts_doc_id_idx	= false;
+	bool		add_fts_idx		= false;
+
+	DBUG_ENTER("prepare_inplace_alter_table");
+	DBUG_ASSERT(!ha_alter_info->handler_ctx);
+	DBUG_ASSERT(ha_alter_info->create_info);
+	DBUG_ASSERT(!srv_read_only_mode);
+
+	if (UNIV_UNLIKELY(prebuilt->trx->fake_changes)) {
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
+	MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE);
+
+#ifdef UNIV_DEBUG
+	for (dict_index_t* index = dict_table_get_first_index(prebuilt->table);
+	     index;
+	     index = dict_table_get_next_index(index)) {
+		ut_ad(!index->to_be_dropped);
+	}
+#endif /* UNIV_DEBUG */
+
+	ut_d(mutex_enter(&dict_sys->mutex));
+	ut_d(dict_table_check_for_dup_indexes(
+		     prebuilt->table, CHECK_ABORTED_OK));
+	ut_d(mutex_exit(&dict_sys->mutex));
+
+	if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+		/* Nothing to do */
+		goto func_exit;
+	}
+
+	if (ha_alter_info->handler_flags
+	    & Alter_inplace_info::CHANGE_CREATE_OPTION) {
+		if (const char* invalid_opt = create_options_are_invalid(
+			    user_thd, altered_table,
+			    ha_alter_info->create_info,
+			    prebuilt->table->space != 0)) {
+			my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0),
+				 table_type(), invalid_opt);
+			goto err_exit_no_heap;
+		}
+	}
+
+	/* Check if any index name is reserved. */
+	if (innobase_index_name_is_reserved(
+		    user_thd,
+		    ha_alter_info->key_info_buffer,
+		    ha_alter_info->key_count)) {
+err_exit_no_heap:
+		DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0);
+		if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+			online_retry_drop_indexes(prebuilt->table, user_thd);
+		}
+		DBUG_RETURN(true);
+	}
+
+	indexed_table = prebuilt->table;
+
+	/* Check that index keys are sensible */
+	error = innobase_check_index_keys(ha_alter_info, indexed_table);
+
+	if (error) {
+		goto err_exit_no_heap;
+	}
+
+	/* Prohibit renaming a column to something that the table
+	already contains. */
+	if (ha_alter_info->handler_flags
+	    & Alter_inplace_info::ALTER_COLUMN_NAME) {
+		List_iterator_fast<Create_field> cf_it(
+			ha_alter_info->alter_info->create_list);
+
+		for (Field** fp = table->field; *fp; fp++) {
+			if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+				continue;
+			}
+
+			const char* name = 0;
+
+			cf_it.rewind();
+			while (Create_field* cf = cf_it++) {
+				if (cf->field == *fp) {
+					name = cf->field_name;
+					goto check_if_ok_to_rename;
+				}
+			}
+
+			ut_error;
+check_if_ok_to_rename:
+			/* Prohibit renaming a column from FTS_DOC_ID
+			if full-text indexes exist. */
+			if (!my_strcasecmp(system_charset_info,
+					   (*fp)->field_name,
+					   FTS_DOC_ID_COL_NAME)
+			    && innobase_fulltext_exist(altered_table)) {
+				my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN,
+					 MYF(0), name);
+				goto err_exit_no_heap;
+			}
+
+			/* Prohibit renaming a column to an internal column. */
+			const char*	s = prebuilt->table->col_names;
+			unsigned	j;
+			/* Skip user columns.
+			MySQL should have checked these already.
+			We want to allow renaming of c1 to c2, c2 to c1. */
+			for (j = 0; j < table->s->fields; j++) {
+				s += strlen(s) + 1;
+			}
+
+			for (; j < prebuilt->table->n_def; j++) {
+				if (!my_strcasecmp(
+					    system_charset_info, name, s)) {
+					my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+						 s);
+					goto err_exit_no_heap;
+				}
+
+				s += strlen(s) + 1;
+			}
+		}
+	}
+
+	if (!innobase_table_flags(altered_table,
+				  ha_alter_info->create_info,
+				  user_thd,
+				  srv_file_per_table
+				  || indexed_table->space != 0,
+				  &flags, &flags2)) {
+		goto err_exit_no_heap;
+	}
+
+	max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags);
+
+	/* Check each index's column length to make sure they do not
+	exceed limit */
+	for (ulint i = 0; i < ha_alter_info->index_add_count; i++) {
+		const KEY* key = &ha_alter_info->key_info_buffer[
+			ha_alter_info->index_add_buffer[i]];
+
+		if (key->flags & HA_FULLTEXT) {
+			/* The column length does not matter for
+			fulltext search indexes. But, UNIQUE
+			fulltext indexes are not supported. */
+			DBUG_ASSERT(!(key->flags & HA_NOSAME));
+			DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+				      & ~(HA_FULLTEXT
+					  | HA_PACK_KEY
+					  | HA_BINARY_PACK_KEY)));
+			add_fts_idx = true;
+			continue;
+		}
+
+		if (innobase_check_column_length(max_col_len, key)) {
+			my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+				 max_col_len);
+			goto err_exit_no_heap;
+		}
+	}
+
+	/* We are not allowed to add an FTS index to a table that
+	already has FTS indexes but does not have AUX_HEX_NAME set:
+	the table's existing aux tables could not be renamed to the
+	hex format, while newly created aux tables would use the hex
+	format, which would be contradictory. */
+	if (!DICT_TF2_FLAG_IS_SET(indexed_table, DICT_TF2_FTS_AUX_HEX_NAME)
+	    && indexed_table->fts != NULL && add_fts_idx) {
+		my_error(ER_INNODB_FT_AUX_NOT_HEX_ID, MYF(0));
+		goto err_exit_no_heap;
+	}
+
+	/* Check existing index definitions for too-long column
+	prefixes as well, in case max_col_len shrunk.
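+	(Editorial note: max_col_len comes from
+	DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) above; e.g. 767 bytes
+	for the REDUNDANT and COMPACT row formats versus 3072 bytes when
+	large index prefixes are enabled, so a prefix that was legal
+	under the old flags may exceed the new limit.)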
*/ + for (const dict_index_t* index + = dict_table_get_first_index(indexed_table); + index; + index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS + || (index->type & DICT_CORRUPT)); + continue; + } + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + if (field->prefix_len > max_col_len) { + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + max_col_len); + goto err_exit_no_heap; + } + } + } + + n_drop_index = 0; + n_drop_fk = 0; + + if (ha_alter_info->handler_flags + & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD)) { + heap = mem_heap_create(1024); + + if (ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME) { + col_names = innobase_get_col_names( + ha_alter_info, altered_table, table, + indexed_table, heap); + } else { + col_names = NULL; + } + } else { + heap = NULL; + col_names = NULL; + } + + if (ha_alter_info->handler_flags + & Alter_inplace_info::DROP_FOREIGN_KEY) { + DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0); + + drop_fk = static_cast<dict_foreign_t**>( + mem_heap_alloc( + heap, + ha_alter_info->alter_info->drop_list.elements + * sizeof(dict_foreign_t*))); + + List_iterator<Alter_drop> drop_it( + ha_alter_info->alter_info->drop_list); + + while (Alter_drop* drop = drop_it++) { + if (drop->type != Alter_drop::FOREIGN_KEY) { + continue; + } + + for (dict_foreign_set::iterator it + = prebuilt->table->foreign_set.begin(); + it != prebuilt->table->foreign_set.end(); + ++it) { + + dict_foreign_t* foreign = *it; + const char* fid = strchr(foreign->id, '/'); + + DBUG_ASSERT(fid); + /* If no database/ prefix was present in + the FOREIGN KEY constraint name, compare + to the full constraint name. */ + fid = fid ? fid + 1 : foreign->id; + + if (!my_strcasecmp(system_charset_info, + fid, drop->name)) { + drop_fk[n_drop_fk++] = foreign; + goto found_fk; + } + } + + my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0), + drop->name); + goto err_exit; +found_fk: + continue; + } + + DBUG_ASSERT(n_drop_fk > 0); + DBUG_ASSERT(n_drop_fk + == ha_alter_info->alter_info->drop_list.elements); + } else { + drop_fk = NULL; + } + + if (ha_alter_info->index_drop_count) { + dict_index_t* drop_primary = NULL; + + DBUG_ASSERT(ha_alter_info->handler_flags + & (Alter_inplace_info::DROP_INDEX + | Alter_inplace_info::DROP_UNIQUE_INDEX + | Alter_inplace_info::DROP_PK_INDEX)); + /* Check which indexes to drop. */ + drop_index = static_cast<dict_index_t**>( + mem_heap_alloc( + heap, (ha_alter_info->index_drop_count + 1) + * sizeof *drop_index)); + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + const KEY* key + = ha_alter_info->index_drop_buffer[i]; + dict_index_t* index + = dict_table_get_index_on_name_and_min_id( + indexed_table, key->name); + + if (!index) { + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_INDEX, + "InnoDB could not find key " + "with name %s", key->name); + } else { + ut_ad(!index->to_be_dropped); + if (!dict_index_is_clust(index)) { + drop_index[n_drop_index++] = index; + } else { + drop_primary = index; + } + } + } + + /* If all FULLTEXT indexes were removed, drop an + internal FTS_DOC_ID_INDEX as well, unless it exists in + the table. 
*/
+
+	if (innobase_fulltext_exist(table)
+	    && !innobase_fulltext_exist(altered_table)
+	    && !DICT_TF2_FLAG_IS_SET(
+			indexed_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+		dict_index_t*	fts_doc_index
+			= dict_table_get_index_on_name(
+				indexed_table, FTS_DOC_ID_INDEX_NAME);
+
+		// Add some fault tolerance for non-debug builds.
+		if (fts_doc_index == NULL) {
+			goto check_if_can_drop_indexes;
+		}
+
+		DBUG_ASSERT(!fts_doc_index->to_be_dropped);
+
+		for (uint i = 0; i < table->s->keys; i++) {
+			if (!my_strcasecmp(
+				    system_charset_info,
+				    FTS_DOC_ID_INDEX_NAME,
+				    table->key_info[i].name)) {
+				/* The index exists in the MySQL
+				data dictionary. Do not drop it,
+				even though it is no longer needed
+				by InnoDB fulltext search. */
+				goto check_if_can_drop_indexes;
+			}
+		}
+
+		drop_index[n_drop_index++] = fts_doc_index;
+	}
+
+check_if_can_drop_indexes:
+	/* Check if the indexes can be dropped. */
+
+	/* Prevent a race condition between DROP INDEX and
+	CREATE TABLE adding FOREIGN KEY constraints. */
+	row_mysql_lock_data_dictionary(prebuilt->trx);
+
+	if (!n_drop_index) {
+		drop_index = NULL;
+	} else {
+		/* Flag all indexes that are to be dropped. */
+		for (ulint i = 0; i < n_drop_index; i++) {
+			ut_ad(!drop_index[i]->to_be_dropped);
+			drop_index[i]->to_be_dropped = 1;
+		}
+	}
+
+	if (prebuilt->trx->check_foreigns) {
+		for (uint i = 0; i < n_drop_index; i++) {
+			dict_index_t*	index = drop_index[i];
+
+			if (innobase_check_foreign_key_index(
+				    ha_alter_info, index,
+				    indexed_table, col_names,
+				    prebuilt->trx, drop_fk, n_drop_fk)) {
+				row_mysql_unlock_data_dictionary(
+					prebuilt->trx);
+				prebuilt->trx->error_info = index;
+				print_error(HA_ERR_DROP_INDEX_FK,
+					    MYF(0));
+				goto err_exit;
+			}
+		}
+
+		/* If the primary index is being dropped, check
+		whether any foreign key constraints that depend
+		on it are affected. */
+		if (drop_primary
+		    && innobase_check_foreign_key_index(
+			    ha_alter_info, drop_primary,
+			    indexed_table, col_names,
+			    prebuilt->trx, drop_fk, n_drop_fk)) {
+			row_mysql_unlock_data_dictionary(prebuilt->trx);
+			print_error(HA_ERR_DROP_INDEX_FK, MYF(0));
+			goto err_exit;
+		}
+	}
+
+	row_mysql_unlock_data_dictionary(prebuilt->trx);
+	} else {
+		drop_index = NULL;
+	}
+
+	n_add_fk = 0;
+
+	if (ha_alter_info->handler_flags
+	    & Alter_inplace_info::ADD_FOREIGN_KEY) {
+		ut_ad(!prebuilt->trx->check_foreigns);
+
+		add_fk = static_cast<dict_foreign_t**>(
+			mem_heap_zalloc(
+				heap,
+				ha_alter_info->alter_info->key_list.elements
+				* sizeof(dict_foreign_t*)));
+
+		if (!innobase_get_foreign_key_info(
+			    ha_alter_info, table_share,
+			    prebuilt->table, col_names,
+			    drop_index, n_drop_index,
+			    add_fk, &n_add_fk, prebuilt->trx)) {
+err_exit:
+			if (n_drop_index) {
+				row_mysql_lock_data_dictionary(prebuilt->trx);
+
+				/* Clear the to_be_dropped flags, which might
+				have been set at this point.
*/ + for (ulint i = 0; i < n_drop_index; i++) { + DBUG_ASSERT(*drop_index[i]->name + != TEMP_INDEX_PREFIX); + drop_index[i]->to_be_dropped = 0; + } + + row_mysql_unlock_data_dictionary(prebuilt->trx); + } + + if (heap) { + mem_heap_free(heap); + } + + goto err_exit_no_heap; + } + } + + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) + || (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION + && !innobase_need_rebuild(ha_alter_info))) { + + if (heap) { + ha_alter_info->handler_ctx + = new ha_innobase_inplace_ctx( + prebuilt, + drop_index, n_drop_index, + drop_fk, n_drop_fk, + add_fk, n_add_fk, + ha_alter_info->online, + heap, indexed_table, + col_names, ULINT_UNDEFINED, 0, 0); + } + +func_exit: + DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0); + if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) { + online_retry_drop_indexes(prebuilt->table, user_thd); + } + DBUG_RETURN(false); + } + + /* If we are to build a full-text search index, check whether + the table already has a DOC ID column. If not, we will need to + add a Doc ID hidden column and rebuild the primary index */ + if (innobase_fulltext_exist(altered_table)) { + ulint doc_col_no; + + if (!innobase_fts_check_doc_id_col( + prebuilt->table, altered_table, &fts_doc_col_no)) { + fts_doc_col_no = altered_table->s->fields; + add_fts_doc_id = true; + add_fts_doc_id_idx = true; + + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_INDEX, + "InnoDB rebuilding table to add column " + FTS_DOC_ID_COL_NAME); + } else if (fts_doc_col_no == ULINT_UNDEFINED) { + goto err_exit; + } + + switch (innobase_fts_check_doc_id_index( + prebuilt->table, altered_table, &doc_col_no)) { + case FTS_NOT_EXIST_DOC_ID_INDEX: + add_fts_doc_id_idx = true; + break; + case FTS_INCORRECT_DOC_ID_INDEX: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + FTS_DOC_ID_INDEX_NAME); + goto err_exit; + case FTS_EXIST_DOC_ID_INDEX: + DBUG_ASSERT(doc_col_no == fts_doc_col_no + || doc_col_no == ULINT_UNDEFINED + || (ha_alter_info->handler_flags + & (Alter_inplace_info::ALTER_COLUMN_ORDER + | Alter_inplace_info::DROP_COLUMN + | Alter_inplace_info::ADD_COLUMN))); + } + } + + /* See if an AUTO_INCREMENT column was added. */ + uint i = 0; + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + while (const Create_field* new_field = cf_it++) { + const Field* field; + + DBUG_ASSERT(i < altered_table->s->fields); + + for (uint old_i = 0; table->field[old_i]; old_i++) { + if (new_field->field == table->field[old_i]) { + goto found_col; + } + } + + /* This is an added column. */ + DBUG_ASSERT(!new_field->field); + DBUG_ASSERT(ha_alter_info->handler_flags + & Alter_inplace_info::ADD_COLUMN); + + field = altered_table->field[i]; + + DBUG_ASSERT((MTYP_TYPENR(field->unireg_check) + == Field::NEXT_NUMBER) + == !!(field->flags & AUTO_INCREMENT_FLAG)); + + if (field->flags & AUTO_INCREMENT_FLAG) { + if (add_autoinc_col_no != ULINT_UNDEFINED) { + /* This should have been blocked earlier. 
*/ + ut_ad(0); + my_error(ER_WRONG_AUTO_KEY, MYF(0)); + goto err_exit; + } + add_autoinc_col_no = i; + + autoinc_col_max_value = innobase_get_int_col_max_value( + field); + } +found_col: + i++; + } + + DBUG_ASSERT(heap); + DBUG_ASSERT(user_thd == prebuilt->trx->mysql_thd); + DBUG_ASSERT(!ha_alter_info->handler_ctx); + + ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx( + prebuilt, + drop_index, n_drop_index, + drop_fk, n_drop_fk, add_fk, n_add_fk, + ha_alter_info->online, + heap, prebuilt->table, col_names, + add_autoinc_col_no, + ha_alter_info->create_info->auto_increment_value, + autoinc_col_max_value); + + DBUG_RETURN(prepare_inplace_alter_table_dict( + ha_alter_info, altered_table, table, + table_share->table_name.str, + flags, flags2, + fts_doc_col_no, add_fts_doc_id, + add_fts_doc_id_idx)); +} + +/** Alter the table structure in-place with operations +specified using Alter_inplace_info. +The level of concurrency allowed during this operation depends +on the return value from check_if_supported_inplace_alter(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. + +@retval true Failure +@retval false Success +*/ +UNIV_INTERN +bool +ha_innobase::inplace_alter_table( +/*=============================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + dberr_t error; + + DBUG_ENTER("inplace_alter_table"); + DBUG_ASSERT(!srv_read_only_mode); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + DEBUG_SYNC(user_thd, "innodb_inplace_alter_table_enter"); + + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)) { +ok_exit: + DEBUG_SYNC(user_thd, "innodb_after_inplace_alter_table"); + DBUG_RETURN(false); + } + + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION + && !innobase_need_rebuild(ha_alter_info)) { + goto ok_exit; + } + + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + + DBUG_ASSERT(ctx); + DBUG_ASSERT(ctx->trx); + DBUG_ASSERT(ctx->prebuilt == prebuilt); + + if (prebuilt->table->ibd_file_missing + || dict_table_is_discarded(prebuilt->table)) { + goto all_done; + } + + /* Read the clustered index of the table and build + indexes based on this information using temporary + files and merge sort. */ + DBUG_EXECUTE_IF("innodb_OOM_inplace_alter", + error = DB_OUT_OF_MEMORY; goto oom;); + error = row_merge_build_indexes( + prebuilt->trx, + prebuilt->table, ctx->new_table, + ctx->online, + ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index, + altered_table, ctx->add_cols, ctx->col_map, + ctx->add_autoinc, ctx->sequence); +#ifndef DBUG_OFF +oom: +#endif /* !DBUG_OFF */ + if (error == DB_SUCCESS && ctx->online && ctx->need_rebuild()) { + DEBUG_SYNC_C("row_log_table_apply1_before"); + error = row_log_table_apply( + ctx->thr, prebuilt->table, altered_table); + } + + DEBUG_SYNC_C("inplace_after_index_build"); + + DBUG_EXECUTE_IF("create_index_fail", + error = DB_DUPLICATE_KEY; + prebuilt->trx->error_key_num = ULINT_UNDEFINED;); + + /* After an error, remove all those index definitions + from the dictionary which were defined. 
*/
+
+	switch (error) {
+		KEY*	dup_key;
+	all_done:
+	case DB_SUCCESS:
+		ut_d(mutex_enter(&dict_sys->mutex));
+		ut_d(dict_table_check_for_dup_indexes(
+			     prebuilt->table, CHECK_PARTIAL_OK));
+		ut_d(mutex_exit(&dict_sys->mutex));
+		/* prebuilt->table->n_ref_count can be anything here,
+		given that we hold at most a shared lock on the table. */
+		goto ok_exit;
+	case DB_DUPLICATE_KEY:
+		if (prebuilt->trx->error_key_num == ULINT_UNDEFINED
+		    || ha_alter_info->key_count == 0) {
+			/* This should be the hidden index on
+			FTS_DOC_ID, or there is no PRIMARY KEY in the
+			table. Either way, we should be seeing and
+			reporting a bogus duplicate key error. */
+			dup_key = NULL;
+		} else {
+			DBUG_ASSERT(prebuilt->trx->error_key_num
+				    < ha_alter_info->key_count);
+			dup_key = &ha_alter_info->key_info_buffer[
+				prebuilt->trx->error_key_num];
+		}
+		print_keydup_error(altered_table, dup_key, MYF(0));
+		break;
+	case DB_ONLINE_LOG_TOO_BIG:
+		DBUG_ASSERT(ctx->online);
+		my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+			 (prebuilt->trx->error_key_num == ULINT_UNDEFINED)
+			 ? FTS_DOC_ID_INDEX_NAME
+			 : ha_alter_info->key_info_buffer[
+				 prebuilt->trx->error_key_num].name);
+		break;
+	case DB_INDEX_CORRUPT:
+		my_error(ER_INDEX_CORRUPT, MYF(0),
+			 (prebuilt->trx->error_key_num == ULINT_UNDEFINED)
+			 ? FTS_DOC_ID_INDEX_NAME
+			 : ha_alter_info->key_info_buffer[
+				 prebuilt->trx->error_key_num].name);
+		break;
+	default:
+		my_error_innodb(error,
+				table_share->table_name.str,
+				prebuilt->table->flags);
+	}
+
+	/* prebuilt->table->n_ref_count can be anything here, given
+	that we hold at most a shared lock on the table. */
+	prebuilt->trx->error_info = NULL;
+	ctx->trx->error_state = DB_SUCCESS;
+
+	DBUG_RETURN(true);
+}
+
+/** Free the modification log for online table rebuild.
+@param table table that was being rebuilt online */
+static
+void
+innobase_online_rebuild_log_free(
+/*=============================*/
+	dict_table_t*	table)
+{
+	dict_index_t*	clust_index = dict_table_get_first_index(table);
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_x_lock(&clust_index->lock);
+
+	if (clust_index->online_log) {
+		ut_ad(dict_index_get_online_status(clust_index)
+		      == ONLINE_INDEX_CREATION);
+		clust_index->online_status = ONLINE_INDEX_COMPLETE;
+		row_log_free(clust_index->online_log);
+		DEBUG_SYNC_C("innodb_online_rebuild_log_free_aborted");
+	}
+
+	DBUG_ASSERT(dict_index_get_online_status(clust_index)
+		    == ONLINE_INDEX_COMPLETE);
+	rw_lock_x_unlock(&clust_index->lock);
+}
+
+/** Roll back a secondary index creation; drop the indexes with
+the temporary index prefix
+@param user_table InnoDB table
+@param table the TABLE
+@param locked TRUE=table locked, FALSE=may need to do a lazy drop
+@param trx the transaction
+*/
+static __attribute__((nonnull))
+void
+innobase_rollback_sec_index(
+/*========================*/
+	dict_table_t*	user_table,
+	const TABLE*	table,
+	ibool		locked,
+	trx_t*		trx)
+{
+	row_merge_drop_indexes(trx, user_table, locked);
+
+	/* Free the table->fts only if there is no FTS_DOC_ID
+	in the table */
+	if (user_table->fts
+	    && !DICT_TF2_FLAG_IS_SET(user_table,
+				     DICT_TF2_FTS_HAS_DOC_ID)
+	    && !innobase_fulltext_exist(table)) {
+		fts_free(user_table);
+	}
+}
+
+/** Roll back the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the storage engine. Note that the
+allowed level of concurrency during this operation will be the same as
+for inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g. concurrent writes were blocked
+during prepare, but might not be during commit.)
+
+@param ha_alter_info Data used during in-place alter.
+@param table the TABLE
+@param prebuilt the prebuilt struct
+@retval true Failure
+@retval false Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+rollback_inplace_alter_table(
+/*=========================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		table,
+	row_prebuilt_t*		prebuilt)
+{
+	bool	fail = false;
+
+	ha_innobase_inplace_ctx*	ctx
+		= static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+
+	DBUG_ENTER("rollback_inplace_alter_table");
+
+	if (!ctx || !ctx->trx) {
+		/* If we have not started a transaction yet,
+		(almost) nothing has been or needs to be done. */
+		goto func_exit;
+	}
+
+	row_mysql_lock_data_dictionary(ctx->trx);
+
+	if (ctx->need_rebuild()) {
+		dberr_t	err;
+		ulint	flags = ctx->new_table->flags;
+
+		/* DML threads can access ctx->new_table via the
+		online rebuild log. Free it first. */
+		innobase_online_rebuild_log_free(prebuilt->table);
+
+		/* Since the FTS-index-specific auxiliary tables have
+		not yet been registered with table->fts by fts_add_index(),
+		we need to delete them explicitly here. */
+		if (DICT_TF2_FLAG_IS_SET(ctx->new_table, DICT_TF2_FTS)) {
+
+			err = innobase_drop_fts_index_table(
+				ctx->new_table, ctx->trx);
+
+			if (err != DB_SUCCESS) {
+				my_error_innodb(
+					err, table->s->table_name.str,
+					flags);
+				fail = true;
+			}
+		}
+
+		/* Drop the table. */
+		dict_table_close(ctx->new_table, TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+		/* Nobody should have initialized the stats of the
+		newly created table yet. When this is the case, we
+		know that it has not been added for background stats
+		gathering. */
+		ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+		err = row_merge_drop_table(ctx->trx, ctx->new_table);
+
+		switch (err) {
+		case DB_SUCCESS:
+			break;
+		default:
+			my_error_innodb(err, table->s->table_name.str,
+					flags);
+			fail = true;
+		}
+	} else {
+		DBUG_ASSERT(!(ha_alter_info->handler_flags
+			      & Alter_inplace_info::ADD_PK_INDEX));
+		DBUG_ASSERT(ctx->new_table == prebuilt->table);
+
+		trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX);
+
+		innobase_rollback_sec_index(
+			prebuilt->table, table, FALSE, ctx->trx);
+	}
+
+	trx_commit_for_mysql(ctx->trx);
+	row_mysql_unlock_data_dictionary(ctx->trx);
+	trx_free_for_mysql(ctx->trx);
+
+func_exit:
+#ifndef DBUG_OFF
+	dict_index_t* clust_index = dict_table_get_first_index(
+		prebuilt->table);
+	DBUG_ASSERT(!clust_index->online_log);
+	DBUG_ASSERT(dict_index_get_online_status(clust_index)
+		    == ONLINE_INDEX_COMPLETE);
+#endif /* !DBUG_OFF */
+
+	if (ctx) {
+		DBUG_ASSERT(ctx->prebuilt == prebuilt);
+
+		if (ctx->num_to_add_fk) {
+			for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+				dict_foreign_free(ctx->add_fk[i]);
+			}
+		}
+
+		if (ctx->num_to_drop_index) {
+			row_mysql_lock_data_dictionary(prebuilt->trx);
+
+			/* Clear the to_be_dropped flags
+			in the data dictionary cache.
+			The flags may already have been cleared,
+			in case an error was detected in
+			commit_inplace_alter_table().
*/ + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + + index->to_be_dropped = 0; + } + + row_mysql_unlock_data_dictionary(prebuilt->trx); + } + } + + trx_commit_for_mysql(prebuilt->trx); + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + DBUG_RETURN(fail); +} + +/** Drop a FOREIGN KEY constraint from the data dictionary tables. +@param trx data dictionary transaction +@param table_name Table name in MySQL +@param foreign_id Foreign key constraint identifier +@retval true Failure +@retval false Success */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_drop_foreign_try( +/*======================*/ + trx_t* trx, + const char* table_name, + const char* foreign_id) +{ + DBUG_ENTER("innobase_drop_foreign_try"); + + DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Drop the constraint from the data dictionary. */ + static const char sql[] = + "PROCEDURE DROP_FOREIGN_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN WHERE ID=:id;\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n" + "END;\n"; + + dberr_t error; + pars_info_t* info; + + info = pars_info_create(); + pars_info_add_str_literal(info, "id", foreign_id); + + trx->op_info = "dropping foreign key constraint from dictionary"; + error = que_eval_sql(info, sql, FALSE, trx); + trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_drop_foreign_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Rename a column in the data dictionary tables. 
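+
+(Editorial illustration: for a prefix index such as INDEX(c(10)),
+SYS_FIELDS.POS may store position * 65536 + prefix_len instead of the
+plain position, which is why RENAME_SYS_FIELDS_PROC below issues a second
+UPDATE with
+
+	POS >= 65536 * :nth AND POS < 65536 * (:nth + 1)
+
+to catch the encoded form.)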
+@param user_table InnoDB table that was being altered +@param trx data dictionary transaction +@param table_name Table name in MySQL +@param nth_col 0-based index of the column +@param from old column name +@param to new column name +@param new_clustered whether the table has been rebuilt +@retval true Failure +@retval false Success */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_rename_column_try( +/*=======================*/ + const dict_table_t* user_table, + trx_t* trx, + const char* table_name, + ulint nth_col, + const char* from, + const char* to, + bool new_clustered) +{ + pars_info_t* info; + dberr_t error; + + DBUG_ENTER("innobase_rename_column_try"); + + DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(mutex_own(&dict_sys->mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (new_clustered) { + goto rename_foreign; + } + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "tableid", user_table->id); + pars_info_add_int4_literal(info, "nth", nth_col); + pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); + + trx->op_info = "renaming column in SYS_COLUMNS"; + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_COLUMNS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET NAME=:new\n" + "WHERE TABLE_ID=:tableid AND NAME=:old\n" + "AND POS=:nth;\n" + "END;\n", + FALSE, trx); + + DBUG_EXECUTE_IF("ib_rename_column_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { +err_exit: + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + trx->op_info = "renaming column in SYS_FIELDS"; + + for (const dict_index_t* index = dict_table_get_first_index( + user_table); + index != NULL; + index = dict_table_get_next_index(index)) { + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + if (strcmp(dict_index_get_nth_field(index, i)->name, + from)) { + continue; + } + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "indexid", index->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n" + "BEGIN\n" + + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid AND COL_NAME=:old\n" + "AND POS=:nth;\n" + + /* Try again, in case there is a prefix_len + encoded in SYS_FIELDS.POS */ + + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid AND COL_NAME=:old\n" + "AND POS>=65536*:nth AND POS<65536*(:nth+1);\n" + + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + } + } + +rename_foreign: + trx->op_info = "renaming column in SYS_FOREIGN_COLS"; + + for (dict_foreign_set::iterator it = user_table->foreign_set.begin(); + it != user_table->foreign_set.end(); + ++it) { + + dict_foreign_t* foreign = *it; + + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (strcmp(foreign->foreign_col_names[i], from)) { + continue; + } + + info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_F_PROC () IS\n" + "BEGIN\n" + "UPDATE 
SYS_FOREIGN_COLS\n" + "SET FOR_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth\n" + "AND FOR_COL_NAME=:old;\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + } + } + + for (dict_foreign_set::iterator it + = user_table->referenced_set.begin(); + it != user_table->referenced_set.end(); + ++it) { + + dict_foreign_t* foreign = *it; + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (strcmp(foreign->referenced_col_names[i], from)) { + continue; + } + + info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "old", from); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_R_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FOREIGN_COLS\n" + "SET REF_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth\n" + "AND REF_COL_NAME=:old;\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + } + } + + trx->op_info = ""; + DBUG_RETURN(false); +} + +/** Rename columns in the data dictionary tables. +@param ha_alter_info Data used during in-place alter. +@param ctx In-place ALTER TABLE context +@param table the TABLE +@param trx data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success */ +static __attribute__((nonnull, warn_unused_result)) +bool +innobase_rename_columns_try( +/*========================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* table, + trx_t* trx, + const char* table_name) +{ + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + uint i = 0; + + DBUG_ASSERT(ctx); + DBUG_ASSERT(ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME); + + for (Field** fp = table->field; *fp; fp++, i++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; + } + + cf_it.rewind(); + while (Create_field* cf = cf_it++) { + if (cf->field == *fp) { + if (innobase_rename_column_try( + ctx->old_table, trx, table_name, i, + cf->field->field_name, + cf->field_name, + ctx->need_rebuild())) { + return(true); + } + goto processed_field; + } + } + + ut_error; +processed_field: + continue; + } + + return(false); +} + +/** Rename columns in the data dictionary cache +as part of commit_cache_norebuild(). +@param ha_alter_info Data used during in-place alter. +@param table the TABLE +@param user_table InnoDB table that was being altered */ +static __attribute__((nonnull)) +void +innobase_rename_columns_cache( +/*==========================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* table, + dict_table_t* user_table) +{ + if (!(ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME)) { + return; + } + + List_iterator_fast<Create_field> cf_it( + ha_alter_info->alter_info->create_list); + uint i = 0; + + for (Field** fp = table->field; *fp; fp++, i++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; + } + + cf_it.rewind(); + while (Create_field* cf = cf_it++) { + if (cf->field == *fp) { + dict_mem_table_col_rename(user_table, i, + cf->field->field_name, + cf->field_name); + goto processed_field; + } + } + + ut_error; +processed_field: + continue; + } +} + +/** Get the auto-increment value of the table on commit. 
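
A note on the pair of UPDATE statements against SYS_FIELDS in innobase_rename_column_try() above: when an index field has no column prefix, SYS_FIELDS.POS stores the bare position (matched by the first statement); when a prefix length is present, POS packs 65536 * position + prefix_len into one integer, which is why the second statement matches a whole 65536-wide range. A minimal standalone sketch of that packing, implied by the SQL (the helper names are illustrative, not InnoDB's):

#include <cassert>
#include <cstdint>

/* When a prefix length is present: POS = 65536 * position + prefix_len,
so "POS >= 65536*n AND POS < 65536*(n+1)" selects field n. */
static uint32_t pack_pos(uint32_t position, uint32_t prefix_len)
{
    assert(prefix_len < 65536);
    return position * 65536 + prefix_len;
}

static uint32_t unpack_position(uint32_t pos) { return pos / 65536; }
static uint32_t unpack_prefix(uint32_t pos)   { return pos % 65536; }

int main()
{
    uint32_t pos = pack_pos(2, 100);   /* 3rd field, 100-byte prefix */
    assert(unpack_position(pos) == 2);
    assert(unpack_prefix(pos) == 100);
    /* Matches the SQL range check for nth == 2. */
    assert(pos >= 65536u * 2 && pos < 65536u * 3);
    return 0;
}
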
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@return the next auto-increment value (0 if not present) */ +static __attribute__((nonnull, warn_unused_result)) +ulonglong +commit_get_autoinc( +/*===============*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* old_table) +{ + ulonglong max_autoinc; + + DBUG_ENTER("commit_get_autoinc"); + + if (!altered_table->found_next_number_field) { + /* There is no AUTO_INCREMENT column in the table + after the ALTER operation. */ + max_autoinc = 0; + } else if (ctx->add_autoinc != ULINT_UNDEFINED) { + /* An AUTO_INCREMENT column was added. Get the last + value from the sequence, which may be based on a + supplied AUTO_INCREMENT value. */ + max_autoinc = ctx->sequence.last(); + } else if ((ha_alter_info->handler_flags + & Alter_inplace_info::CHANGE_CREATE_OPTION) + && (ha_alter_info->create_info->used_fields + & HA_CREATE_USED_AUTO)) { + /* An AUTO_INCREMENT value was supplied, but the table was not + rebuilt. Get the user-supplied value or the last value from the + sequence. */ + ib_uint64_t max_value_table; + dberr_t err; + + Field* autoinc_field = + old_table->found_next_number_field; + + dict_index_t* index = dict_table_get_index_on_first_col( + ctx->old_table, autoinc_field->field_index); + + max_autoinc = ha_alter_info->create_info->auto_increment_value; + + dict_table_autoinc_lock(ctx->old_table); + + err = row_search_max_autoinc( + index, autoinc_field->field_name, &max_value_table); + + if (err != DB_SUCCESS) { + ut_ad(0); + max_autoinc = 0; + } else if (max_autoinc <= max_value_table) { + ulonglong col_max_value; + ulonglong offset; + + col_max_value = innobase_get_int_col_max_value( + old_table->found_next_number_field); + + offset = ctx->prebuilt->autoinc_offset; + max_autoinc = innobase_next_autoinc( + max_value_table, 1, 1, offset, + col_max_value); + } + dict_table_autoinc_unlock(ctx->old_table); + } else { + /* An AUTO_INCREMENT value was not specified. + Read the old counter value from the table. */ + ut_ad(old_table->found_next_number_field); + dict_table_autoinc_lock(ctx->old_table); + max_autoinc = ctx->old_table->autoinc; + dict_table_autoinc_unlock(ctx->old_table); + } + + DBUG_RETURN(max_autoinc); +} + +/** Add or drop foreign key constraints to the data dictionary tables, +but do not touch the data dictionary cache. 
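
commit_get_autoinc() above picks the counter from one of three sources: the sequence used while backfilling a newly added AUTO_INCREMENT column, a user-supplied AUTO_INCREMENT=N that must still not undercut the current table maximum, or the old in-memory counter. The undercut guard goes through innobase_next_autoinc(); in the call above both the increment and the number of values needed are 1, so it reduces to "one past the table maximum, clamped to the column maximum". A simplified model under that assumption (the real function also honors auto_increment_increment/offset):

#include <cassert>
#include <cstdint>

/* Simplified model of the call above with increment == need == 1; only the
clamping behavior is shown. */
static uint64_t next_autoinc_simple(uint64_t prev, uint64_t col_max)
{
    return prev >= col_max ? col_max : prev + 1;
}

int main()
{
    /* Rows in the table reach 41; the next value handed out is 42. */
    assert(next_autoinc_simple(41, UINT32_MAX) == 42);
    /* At the column maximum the counter saturates instead of wrapping. */
    assert(next_autoinc_simple(UINT32_MAX, UINT32_MAX) == UINT32_MAX);
    return 0;
}
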
+@param ctx In-place ALTER TABLE context
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_update_foreign_try(
+/*========================*/
+    ha_innobase_inplace_ctx*ctx,
+    trx_t*      trx,
+    const char* table_name)
+{
+    ulint   foreign_id;
+    ulint   i;
+
+    DBUG_ENTER("innobase_update_foreign_try");
+    DBUG_ASSERT(ctx);
+
+    foreign_id = dict_table_get_highest_foreign_id(ctx->new_table);
+
+    foreign_id++;
+
+    for (i = 0; i < ctx->num_to_add_fk; i++) {
+        dict_foreign_t* fk = ctx->add_fk[i];
+
+        ut_ad(fk->foreign_table == ctx->new_table
+              || fk->foreign_table == ctx->old_table);
+
+        dberr_t error = dict_create_add_foreign_id(
+            &foreign_id, ctx->old_table->name, fk);
+
+        if (error != DB_SUCCESS) {
+            my_error(ER_TOO_LONG_IDENT, MYF(0),
+                     fk->id);
+            DBUG_RETURN(true);
+        }
+
+        if (!fk->foreign_index) {
+            fk->foreign_index = dict_foreign_find_index(
+                ctx->new_table, ctx->col_names,
+                fk->foreign_col_names,
+                fk->n_fields, fk->referenced_index, TRUE,
+                fk->type
+                & (DICT_FOREIGN_ON_DELETE_SET_NULL
+                   | DICT_FOREIGN_ON_UPDATE_SET_NULL));
+            if (!fk->foreign_index) {
+                my_error(ER_FK_INCORRECT_OPTION,
+                         MYF(0), table_name, fk->id);
+                DBUG_RETURN(true);
+            }
+        }
+
+        /* The fk->foreign_col_names[] uses renamed column
+        names, while the columns in ctx->old_table have not
+        been renamed yet. */
+        error = dict_create_add_foreign_to_dictionary(
+            ctx->old_table->name, fk, trx);
+
+        DBUG_EXECUTE_IF(
+            "innodb_test_cannot_add_fk_system",
+            error = DB_ERROR;);
+
+        if (error != DB_SUCCESS) {
+            my_error(ER_FK_FAIL_ADD_SYSTEM, MYF(0),
+                     fk->id);
+            DBUG_RETURN(true);
+        }
+    }
+
+    for (i = 0; i < ctx->num_to_drop_fk; i++) {
+        dict_foreign_t* fk = ctx->drop_fk[i];
+
+        DBUG_ASSERT(fk->foreign_table == ctx->old_table);
+
+        if (innobase_drop_foreign_try(trx, table_name, fk->id)) {
+            DBUG_RETURN(true);
+        }
+    }
+
+    DBUG_RETURN(false);
+}
+
+/** Update the foreign key constraint definitions in the data dictionary cache
+after the changes to data dictionary tables were committed.
+@param ctx In-place ALTER TABLE context
+@param user_thd MySQL connection
+@return InnoDB error code (should always be DB_SUCCESS) */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+innobase_update_foreign_cache(
+/*==========================*/
+    ha_innobase_inplace_ctx* ctx,
+    THD* user_thd)
+{
+    dict_table_t* user_table;
+    dberr_t err = DB_SUCCESS;
+
+    DBUG_ENTER("innobase_update_foreign_cache");
+
+    user_table = ctx->old_table;
+
+    /* Discard the added foreign keys, because we will
+    load them from the data dictionary. */
+    for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+        dict_foreign_t* fk = ctx->add_fk[i];
+        dict_foreign_free(fk);
+    }
+
+    if (ctx->need_rebuild()) {
+        /* The rebuilt table is already using the renamed
+        column names. No need to pass col_names or to drop
+        constraints from the data dictionary cache. */
+        DBUG_ASSERT(!ctx->col_names);
+        DBUG_ASSERT(user_table->foreign_set.empty());
+        DBUG_ASSERT(user_table->referenced_set.empty());
+        user_table = ctx->new_table;
+    } else {
+        /* Drop the foreign key constraints if the
+        table was not rebuilt. If the table is rebuilt,
+        there would not be any foreign key constraints for
+        it yet in the data dictionary cache. */
+        for (ulint i = 0; i < ctx->num_to_drop_fk; i++) {
+            dict_foreign_t* fk = ctx->drop_fk[i];
+            dict_foreign_remove_from_cache(fk);
+        }
+    }
+
+    /* Load the old or added foreign keys from the data dictionary
+    and prevent the table from being evicted from the data
+    dictionary cache (work around the lack of WL#6049). */
+    err = dict_load_foreigns(user_table->name,
+                             ctx->col_names, false, true,
+                             DICT_ERR_IGNORE_NONE);
+
+    if (err == DB_CANNOT_ADD_CONSTRAINT) {
+        /* It is possible that some existing foreign keys were
+        loaded with "foreign_key_checks" off, so let's retry
+        the loading with charset_check off. */
+        err = dict_load_foreigns(user_table->name,
+                                 ctx->col_names, false, false,
+                                 DICT_ERR_IGNORE_NONE);
+
+        /* If the load with "charset_check" off succeeded, warn
+        the user that the foreign keys were loaded with a
+        mismatched charset. */
+        if (err == DB_SUCCESS) {
+            push_warning_printf(
+                user_thd,
+                Sql_condition::WARN_LEVEL_WARN,
+                ER_ALTER_INFO,
+                "Foreign key constraints for table '%s'"
+                " are loaded with charset check off",
+                user_table->name);
+        }
+    }
+
+    DBUG_RETURN(err);
+}
+
+/** Commit the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the data dictionary tables,
+when rebuilding the table.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+commit_try_rebuild(
+/*===============*/
+    Alter_inplace_info* ha_alter_info,
+    ha_innobase_inplace_ctx*ctx,
+    TABLE*      altered_table,
+    const TABLE*    old_table,
+    trx_t*      trx,
+    const char* table_name)
+{
+    dict_table_t*   rebuilt_table = ctx->new_table;
+    dict_table_t*   user_table = ctx->old_table;
+
+    DBUG_ENTER("commit_try_rebuild");
+    DBUG_ASSERT(ctx->need_rebuild());
+    DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+    DBUG_ASSERT(!(ha_alter_info->handler_flags
+                  & Alter_inplace_info::DROP_FOREIGN_KEY)
+                || ctx->num_to_drop_fk > 0);
+    DBUG_ASSERT(ctx->num_to_drop_fk
+                == ha_alter_info->alter_info->drop_list.elements);
+
+    for (dict_index_t* index = dict_table_get_first_index(rebuilt_table);
+         index;
+         index = dict_table_get_next_index(index)) {
+        DBUG_ASSERT(dict_index_get_online_status(index)
+                    == ONLINE_INDEX_COMPLETE);
+        DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+        if (dict_index_is_corrupted(index)) {
+            my_error(ER_INDEX_CORRUPT, MYF(0),
+                     index->name);
+            DBUG_RETURN(true);
+        }
+    }
+
+    if (innobase_update_foreign_try(ctx, trx, table_name)) {
+        DBUG_RETURN(true);
+    }
+
+    dberr_t error;
+
+    /* Clear the to_be_dropped flag in the data dictionary cache
+    of user_table. */
+    for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+        dict_index_t*   index = ctx->drop_index[i];
+        DBUG_ASSERT(index->table == user_table);
+        DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+        DBUG_ASSERT(index->to_be_dropped);
+        index->to_be_dropped = 0;
+    }
+
+    /* We copied the table. Any indexes that were requested to be
+    dropped were not created in the copy of the table. Apply any
+    last bit of the rebuild log and then rename the tables.
*/ + + if (ctx->online) { + DEBUG_SYNC_C("row_log_table_apply2_before"); + error = row_log_table_apply( + ctx->thr, user_table, altered_table); + ulint err_key = thr_get_trx(ctx->thr)->error_key_num; + + switch (error) { + KEY* dup_key; + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + if (err_key == ULINT_UNDEFINED) { + /* This should be the hidden index on + FTS_DOC_ID. */ + dup_key = NULL; + } else { + DBUG_ASSERT(err_key < + ha_alter_info->key_count); + dup_key = &ha_alter_info + ->key_info_buffer[err_key]; + } + print_keydup_error(altered_table, dup_key, MYF(0)); + DBUG_RETURN(true); + case DB_ONLINE_LOG_TOO_BIG: + my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0), + ha_alter_info->key_info_buffer[0].name); + DBUG_RETURN(true); + case DB_INDEX_CORRUPT: + my_error(ER_INDEX_CORRUPT, MYF(0), + (err_key == ULINT_UNDEFINED) + ? FTS_DOC_ID_INDEX_NAME + : ha_alter_info->key_info_buffer[err_key] + .name); + DBUG_RETURN(true); + default: + my_error_innodb(error, table_name, user_table->flags); + DBUG_RETURN(true); + } + } + + if ((ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME) + && innobase_rename_columns_try(ha_alter_info, ctx, old_table, + trx, table_name)) { + DBUG_RETURN(true); + } + + DBUG_EXECUTE_IF("ib_ddl_crash_before_rename", DBUG_SUICIDE();); + + /* The new table must inherit the flag from the + "parent" table. */ + if (dict_table_is_discarded(user_table)) { + rebuilt_table->ibd_file_missing = true; + rebuilt_table->flags2 |= DICT_TF2_DISCARDED; + } + + /* We can now rename the old table as a temporary table, + rename the new temporary table as the old table and drop the + old table. First, we only do this in the data dictionary + tables. The actual renaming will be performed in + commit_cache_rebuild(), once the data dictionary transaction + has been successfully committed. */ + + error = row_merge_rename_tables_dict( + user_table, rebuilt_table, ctx->tmp_name, trx); + + /* We must be still holding a table handle. */ + DBUG_ASSERT(user_table->n_ref_count >= 1); + + DBUG_EXECUTE_IF("ib_ddl_crash_after_rename", DBUG_SUICIDE();); + DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;); + + if (user_table->n_ref_count > 1) { + /* This should only occur when an innodb_memcached + connection with innodb_api_enable_mdl=off was started + before commit_inplace_alter_table() locked the data + dictionary. We must roll back the ALTER TABLE, because + we cannot drop a table while it is being used. */ + + /* Normally, n_ref_count must be 1, because purge + cannot be executing on this very table as we are + holding dict_operation_lock X-latch. */ + + error = DB_LOCK_WAIT_TIMEOUT; + } + + switch (error) { + case DB_SUCCESS: + DBUG_RETURN(false); + case DB_TABLESPACE_EXISTS: + ut_a(rebuilt_table->n_ref_count == 1); + my_error(ER_TABLESPACE_EXISTS, MYF(0), ctx->tmp_name); + DBUG_RETURN(true); + case DB_DUPLICATE_KEY: + ut_a(rebuilt_table->n_ref_count == 1); + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), ctx->tmp_name); + DBUG_RETURN(true); + default: + my_error_innodb(error, table_name, user_table->flags); + DBUG_RETURN(true); + } +} + +/** Apply the changes made during commit_try_rebuild(), +to the data dictionary cache and the file system. 
+@param ctx In-place ALTER TABLE context */ +inline __attribute__((nonnull)) +void +commit_cache_rebuild( +/*=================*/ + ha_innobase_inplace_ctx* ctx) +{ + dberr_t error; + + DBUG_ENTER("commit_cache_rebuild"); + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(dict_table_is_discarded(ctx->old_table) + == dict_table_is_discarded(ctx->new_table)); + + const char* old_name = mem_heap_strdup( + ctx->heap, ctx->old_table->name); + + /* We already committed and redo logged the renames, + so this must succeed. */ + error = dict_table_rename_in_cache( + ctx->old_table, ctx->tmp_name, FALSE); + ut_a(error == DB_SUCCESS); + + DEBUG_SYNC_C("commit_cache_rebuild_middle"); + + error = dict_table_rename_in_cache( + ctx->new_table, old_name, FALSE); + ut_a(error == DB_SUCCESS); + + DBUG_VOID_RETURN; +} + +/** Commit the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the data dictionary tables, +when not rebuilding the table. +@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param old_table MySQL table as it is before the ALTER operation +@param trx Data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success +*/ +inline __attribute__((nonnull, warn_unused_result)) +bool +commit_try_norebuild( +/*=================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* old_table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("commit_try_norebuild"); + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH); + DBUG_ASSERT(!(ha_alter_info->handler_flags + & Alter_inplace_info::DROP_FOREIGN_KEY) + || ctx->num_to_drop_fk > 0); + DBUG_ASSERT(ctx->num_to_drop_fk + == ha_alter_info->alter_info->drop_list.elements); + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(*index->name == TEMP_INDEX_PREFIX); + if (dict_index_is_corrupted(index)) { + /* Report a duplicate key + error for the index that was + flagged corrupted, most likely + because a duplicate value was + inserted (directly or by + rollback) after + ha_innobase::inplace_alter_table() + completed. + TODO: report this as a corruption + with a detailed reason once + WL#6379 has been implemented. */ + my_error(ER_DUP_UNKNOWN_IN_INDEX, + MYF(0), index->name + 1); + DBUG_RETURN(true); + } + } + + if (innobase_update_foreign_try(ctx, trx, table_name)) { + DBUG_RETURN(true); + } + + dberr_t error; + + /* We altered the table in place. */ + /* Lose the TEMP_INDEX_PREFIX. */ + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(*index->name + == TEMP_INDEX_PREFIX); + error = row_merge_rename_index_to_add( + trx, ctx->new_table->id, index->id); + if (error != DB_SUCCESS) { + sql_print_error( + "InnoDB: rename index to add: %lu\n", + (ulong) error); + DBUG_ASSERT(0); + my_error(ER_INTERNAL_ERROR, MYF(0), + "rename index to add"); + DBUG_RETURN(true); + } + } + + /* Drop any indexes that were requested to be dropped. + Rename them to TEMP_INDEX_PREFIX in the data + dictionary first. We do not bother to rename + index->name in the dictionary cache, because the index + is about to be freed after row_merge_drop_indexes_dict(). 
*/ + + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(index->table == ctx->new_table); + DBUG_ASSERT(index->to_be_dropped); + + error = row_merge_rename_index_to_drop( + trx, index->table->id, index->id); + if (error != DB_SUCCESS) { + sql_print_error( + "InnoDB: rename index to drop: %lu\n", + (ulong) error); + DBUG_ASSERT(0); + my_error(ER_INTERNAL_ERROR, MYF(0), + "rename index to drop"); + DBUG_RETURN(true); + } + } + + if (!(ha_alter_info->handler_flags + & Alter_inplace_info::ALTER_COLUMN_NAME)) { + DBUG_RETURN(false); + } + + DBUG_RETURN(innobase_rename_columns_try(ha_alter_info, ctx, + old_table, trx, table_name)); +} + +/** Commit the changes to the data dictionary cache +after a successful commit_try_norebuild() call. +@param ctx In-place ALTER TABLE context +@param table the TABLE before the ALTER +@param trx Data dictionary transaction object +(will be started and committed) +@return whether all replacements were found for dropped indexes */ +inline __attribute__((nonnull, warn_unused_result)) +bool +commit_cache_norebuild( +/*===================*/ + ha_innobase_inplace_ctx*ctx, + const TABLE* table, + trx_t* trx) +{ + DBUG_ENTER("commit_cache_norebuild"); + + bool found = true; + + DBUG_ASSERT(!ctx->need_rebuild()); + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(*index->name == TEMP_INDEX_PREFIX); + index->name++; + } + + if (ctx->num_to_drop_index) { + /* Really drop the indexes that were dropped. + The transaction had to be committed first + (after renaming the indexes), so that in the + event of a crash, crash recovery will drop the + indexes, because it drops all indexes whose + names start with TEMP_INDEX_PREFIX. Once we + have started dropping an index tree, there is + no way to roll it back. */ + + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(index->table == ctx->new_table); + DBUG_ASSERT(index->to_be_dropped); + + /* Replace the indexes in foreign key + constraints if needed. */ + + if (!dict_foreign_replace_index( + index->table, ctx->col_names, index)) { + found = false; + } + + /* Mark the index dropped + in the data dictionary cache. */ + rw_lock_x_lock(dict_index_get_lock(index)); + index->page = FIL_NULL; + rw_lock_x_unlock(dict_index_get_lock(index)); + } + + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + row_merge_drop_indexes_dict(trx, ctx->new_table->id); + + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX); + DBUG_ASSERT(index->table == ctx->new_table); + + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS + || (index->type + & DICT_CORRUPT)); + DBUG_ASSERT(index->table->fts); + fts_drop_index(index->table, index, trx); + } + + dict_index_remove_from_cache(index->table, index); + } + + trx_commit_for_mysql(trx); + } + + DBUG_RETURN(found); +} + +/** Adjust the persistent statistics after non-rebuilding ALTER TABLE. +Remove statistics for dropped indexes, add statistics for created indexes +and rename statistics for renamed indexes. 
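
In commit_cache_norebuild() above, publishing a freshly built index is just "index->name++": the name was stored with the one-byte TEMP_INDEX_PREFIX marker prepended, so the final name is the same allocation minus its first byte, and crash recovery drops any index whose name still starts with the marker. A standalone illustration of the pointer trick (the marker byte value here is a stand-in):

#include <cassert>
#include <cstring>

int main()
{
    /* Stand-in for TEMP_INDEX_PREFIX; any single marker byte works for
    the illustration. */
    const char TEMP_PREFIX = '\377';

    /* The index was created under "<marker>idx_name"; the temporary and
    the final name share one allocation. */
    char stored_name[] = "\377idx_name";
    const char* name = stored_name;

    assert(*name == TEMP_PREFIX);   /* still hidden / being built */
    name++;                         /* commit: publish the final name */
    assert(std::strcmp(name, "idx_name") == 0);
    return 0;
}
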
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param table_name Table name in MySQL
+@param thd MySQL connection
+*/
+static
+void
+alter_stats_norebuild(
+/*==================*/
+    Alter_inplace_info* ha_alter_info,
+    ha_innobase_inplace_ctx*    ctx,
+    TABLE*      altered_table,
+    const char* table_name,
+    THD*        thd)
+{
+    ulint   i;
+
+    DBUG_ENTER("alter_stats_norebuild");
+    DBUG_ASSERT(!ctx->need_rebuild());
+
+    if (!dict_stats_is_persistent_enabled(ctx->new_table)) {
+        DBUG_VOID_RETURN;
+    }
+
+    /* TODO: This will not drop the (unused) statistics for
+    FTS_DOC_ID_INDEX if it was a hidden index, dropped together
+    with the last remaining FULLTEXT index. */
+    for (i = 0; i < ha_alter_info->index_drop_count; i++) {
+        const KEY* key = ha_alter_info->index_drop_buffer[i];
+
+        if (key->flags & HA_FULLTEXT) {
+            /* There are no index cardinality
+            statistics for FULLTEXT indexes. */
+            continue;
+        }
+
+        char    errstr[1024];
+
+        if (dict_stats_drop_index(
+                ctx->new_table->name, key->name,
+                errstr, sizeof errstr) != DB_SUCCESS) {
+            push_warning(thd,
+                         Sql_condition::WARN_LEVEL_WARN,
+                         ER_LOCK_WAIT_TIMEOUT, errstr);
+        }
+    }
+
+    for (i = 0; i < ctx->num_to_add_index; i++) {
+        dict_index_t*   index = ctx->add_index[i];
+        DBUG_ASSERT(index->table == ctx->new_table);
+
+        if (!(index->type & DICT_FTS)) {
+            dict_stats_init(ctx->new_table);
+            dict_stats_update_for_index(index);
+        }
+    }
+
+    DBUG_VOID_RETURN;
+}
+
+/** Adjust the persistent statistics after rebuilding ALTER TABLE.
+Remove statistics for dropped indexes, add statistics for created indexes
+and rename statistics for renamed indexes.
+@param table InnoDB table that was rebuilt by ALTER TABLE
+@param table_name Table name in MySQL
+@param thd MySQL connection
+*/
+static
+void
+alter_stats_rebuild(
+/*================*/
+    dict_table_t*   table,
+    const char* table_name,
+    THD*        thd)
+{
+    DBUG_ENTER("alter_stats_rebuild");
+
+    if (dict_table_is_discarded(table)
+        || !dict_stats_is_persistent_enabled(table)) {
+        DBUG_VOID_RETURN;
+    }
+
+    dberr_t ret;
+
+    ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+
+    if (ret != DB_SUCCESS) {
+        push_warning_printf(
+            thd,
+            Sql_condition::WARN_LEVEL_WARN,
+            ER_ALTER_INFO,
+            "Error updating stats for table '%s' "
+            "after table rebuild: %s",
+            table_name, ut_strerr(ret));
+    }
+
+    DBUG_VOID_RETURN;
+}
+
+#ifndef DBUG_OFF
+# define DBUG_INJECT_CRASH(prefix, count) \
+do { \
+    char buf[32]; \
+    ut_snprintf(buf, sizeof buf, prefix "_%u", count); \
+    DBUG_EXECUTE_IF(buf, DBUG_SUICIDE();); \
+} while (0)
+#else
+# define DBUG_INJECT_CRASH(prefix, count)
+#endif
+
+/** Commit or rollback the changes made during
+prepare_inplace_alter_table() and inplace_alter_table() inside
+the storage engine. Note that the allowed level of concurrency
+during this operation will be the same as for
+inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g. concurrent writes were
+blocked during prepare, but might not be during commit.)
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+@param commit true => Commit, false => Rollback.
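
The DBUG_INJECT_CRASH() macro above generates a numbered DBUG keyword at each crash point it passes through, so a test can kill the server at exactly the N-th injection point of the commit path. This sketch reproduces the keyword generation; a debug-build test would then presumably activate one keyword, e.g. SET SESSION debug = '+d,ib_commit_inplace_crash_2';

#include <cstdio>

int main()
{
    /* Mirror of the macro's name generation: snprintf of prefix "_%u"
    with a running counter, prefix "ib_commit_inplace_crash" as used in
    commit_inplace_alter_table() below. */
    unsigned crash_inject_count = 1;
    for (int i = 0; i < 3; i++) {
        char buf[32];
        std::snprintf(buf, sizeof buf, "ib_commit_inplace_crash_%u",
                      crash_inject_count++);
        std::printf("%s\n", buf);  /* _1, _2, _3: one keyword per point */
    }
    return 0;
}
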
+@retval true Failure +@retval false Success +*/ +UNIV_INTERN +bool +ha_innobase::commit_inplace_alter_table( +/*====================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit) +{ + ha_innobase_inplace_ctx* ctx0 + = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); +#ifndef DBUG_OFF + uint crash_inject_count = 1; + uint crash_fail_inject_count = 1; + uint failure_inject_count = 1; +#endif + + DBUG_ENTER("commit_inplace_alter_table"); + DBUG_ASSERT(!srv_read_only_mode); + DBUG_ASSERT(!ctx0 || ctx0->prebuilt == prebuilt); + DBUG_ASSERT(!ctx0 || ctx0->old_table == prebuilt->table); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_enter"); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_wait"); + + if (!commit) { + /* A rollback is being requested. So far we may at + most have created some indexes. If any indexes were to + be dropped, they would actually be dropped in this + method if commit=true. */ + DBUG_RETURN(rollback_inplace_alter_table( + ha_alter_info, table, prebuilt)); + } + + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + DBUG_ASSERT(!ctx0); + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + ha_alter_info->group_commit_ctx = NULL; + DBUG_RETURN(false); + } + + DBUG_ASSERT(ctx0); + + inplace_alter_handler_ctx** ctx_array; + inplace_alter_handler_ctx* ctx_single[2]; + + if (ha_alter_info->group_commit_ctx) { + ctx_array = ha_alter_info->group_commit_ctx; + } else { + ctx_single[0] = ctx0; + ctx_single[1] = NULL; + ctx_array = ctx_single; + } + + DBUG_ASSERT(ctx0 == ctx_array[0]); + ut_ad(prebuilt->table == ctx0->old_table); + ha_alter_info->group_commit_ctx = NULL; + + /* Free the ctx->trx of other partitions, if any. We will only + use the ctx0->trx here. Others may have been allocated in + the prepare stage. */ + + for (inplace_alter_handler_ctx** pctx = &ctx_array[1]; *pctx; + pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + if (ctx->trx) { + trx_free_for_mysql(ctx->trx); + ctx->trx = NULL; + } + } + + trx_start_if_not_started_xa(prebuilt->trx); + + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + DBUG_ASSERT(ctx->prebuilt->trx == prebuilt->trx); + + /* Exclusively lock the table, to ensure that no other + transaction is holding locks on the table while we + change the table definition. The MySQL meta-data lock + should normally guarantee that no conflicting locks + exist. However, FOREIGN KEY constraints checks and any + transactions collected during crash recovery could be + holding InnoDB locks only, not MySQL locks. 
*/ + + dberr_t error = row_merge_lock_table( + prebuilt->trx, ctx->old_table, LOCK_X); + + if (error != DB_SUCCESS) { + my_error_innodb( + error, table_share->table_name.str, 0); + DBUG_RETURN(true); + } + } + + DEBUG_SYNC(user_thd, "innodb_alter_commit_after_lock_table"); + + const bool new_clustered = ctx0->need_rebuild(); + trx_t* trx = ctx0->trx; + bool fail = false; + + if (new_clustered) { + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + DBUG_ASSERT(ctx->need_rebuild()); + + if (ctx->old_table->fts) { + ut_ad(!ctx->old_table->fts->add_wq); + fts_optimize_remove_table( + ctx->old_table); + } + + if (ctx->new_table->fts) { + ut_ad(!ctx->new_table->fts->add_wq); + fts_optimize_remove_table( + ctx->new_table); + } + } + } + + if (!trx) { + DBUG_ASSERT(!new_clustered); + trx = innobase_trx_allocate(user_thd); + } + + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + /* Latch the InnoDB data dictionary exclusively so that no deadlocks + or lock waits can happen in it during the data dictionary operation. */ + row_mysql_lock_data_dictionary(trx); + + /* Prevent the background statistics collection from accessing + the tables. */ + for (;;) { + bool retry = false; + + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(new_clustered == ctx->need_rebuild()); + + if (new_clustered + && !dict_stats_stop_bg(ctx->old_table)) { + retry = true; + } + + if (!dict_stats_stop_bg(ctx->new_table)) { + retry = true; + } + } + + if (!retry) { + break; + } + + DICT_STATS_BG_YIELD(trx); + } + + /* Apply the changes to the data dictionary tables, for all + partitions. */ + + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx && !fail; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(new_clustered == ctx->need_rebuild()); + + ctx->max_autoinc = commit_get_autoinc( + ha_alter_info, ctx, altered_table, table); + + if (ctx->need_rebuild()) { + ctx->tmp_name = dict_mem_create_temporary_tablename( + ctx->heap, ctx->new_table->name, + ctx->new_table->id); + + fail = commit_try_rebuild( + ha_alter_info, ctx, altered_table, table, + trx, table_share->table_name.str); + } else { + fail = commit_try_norebuild( + ha_alter_info, ctx, table, trx, + table_share->table_name.str); + } + DBUG_INJECT_CRASH("ib_commit_inplace_crash", + crash_inject_count++); +#ifndef DBUG_OFF + { + /* Generate a dynamic dbug text. */ + char buf[32]; + ut_snprintf(buf, sizeof buf, "ib_commit_inplace_fail_%u", + failure_inject_count++); + DBUG_EXECUTE_IF(buf, + my_error(ER_INTERNAL_ERROR, MYF(0), + "Injected error!"); + fail = true; + ); + } +#endif + } + + /* Commit or roll back the changes to the data dictionary. */ + + if (fail) { + trx_rollback_for_mysql(trx); + } else if (!new_clustered) { + trx_commit_for_mysql(trx); + } else { + mtr_t mtr; + mtr_start(&mtr); + + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(ctx->need_rebuild()); + /* Generate the redo log for the file + operations that will be performed in + commit_cache_rebuild(). 
*/ + fil_mtr_rename_log(ctx->old_table->space, + ctx->old_table->name, + ctx->new_table->space, + ctx->new_table->name, + ctx->tmp_name, &mtr); + DBUG_INJECT_CRASH("ib_commit_inplace_crash", + crash_inject_count++); + } + + /* Test what happens on crash if the redo logs + are flushed to disk here. The log records + about the rename should not be committed, and + the data dictionary transaction should be + rolled back, restoring the old table. */ + DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit", + log_buffer_flush_to_disk(); + DBUG_SUICIDE();); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(!trx->fts_trx); + ut_ad(trx->insert_undo || trx->update_undo); + + /* The following call commits the + mini-transaction, making the data dictionary + transaction committed at mtr.end_lsn. The + transaction becomes 'durable' by the time when + log_buffer_flush_to_disk() returns. In the + logical sense the commit in the file-based + data structures happens here. */ + trx_commit_low(trx, &mtr); + + /* If server crashes here, the dictionary in + InnoDB and MySQL will differ. The .ibd files + and the .frm files must be swapped manually by + the administrator. No loss of data. */ + DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", + log_buffer_flush_to_disk(); + DBUG_SUICIDE();); + } + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + /* At this point, the changes to the persistent storage have + been committed or rolled back. What remains to be done is to + update the in-memory structures, close some handles, release + temporary files, and (unless we rolled back) update persistent + statistics. */ + dberr_t error = DB_SUCCESS; + + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(ctx->need_rebuild() == new_clustered); + + if (new_clustered) { + innobase_online_rebuild_log_free(ctx->old_table); + } + + if (fail) { + if (new_clustered) { + dict_table_close(ctx->new_table, + TRUE, FALSE); + +#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG + /* Nobody should have initialized the + stats of the newly created table + yet. When this is the case, we know + that it has not been added for + background stats gathering. */ + ut_a(!ctx->new_table->stat_initialized); +#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */ + + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + row_merge_drop_table(trx, ctx->new_table); + trx_commit_for_mysql(trx); + ctx->new_table = NULL; + } else { + /* We failed, but did not rebuild the table. + Roll back any ADD INDEX, or get rid of garbage + ADD INDEX that was left over from a previous + ALTER TABLE statement. */ + trx_start_for_ddl(trx, TRX_DICT_OP_INDEX); + innobase_rollback_sec_index( + ctx->new_table, table, TRUE, trx); + trx_commit_for_mysql(trx); + } + DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail", + crash_fail_inject_count++); + + continue; + } + + innobase_copy_frm_flags_from_table_share( + ctx->new_table, altered_table->s); + + if (new_clustered) { + /* We will reload and refresh the + in-memory foreign key constraint + metadata. This is a rename operation + in preparing for dropping the old + table. Set the table to_be_dropped bit + here, so to make sure DML foreign key + constraint check does not use the + stale dict_foreign_t. 
This is done
+            because WL#6049 (FK MDL) has not been
+            implemented yet. */
+            ctx->old_table->to_be_dropped = true;
+
+            /* Rename the tablespace files. */
+            commit_cache_rebuild(ctx);
+
+            error = innobase_update_foreign_cache(ctx, user_thd);
+            if (error != DB_SUCCESS) {
+                goto foreign_fail;
+            }
+        } else {
+            error = innobase_update_foreign_cache(ctx, user_thd);
+
+            if (error != DB_SUCCESS) {
+foreign_fail:
+                /* The data dictionary cache
+                should be corrupted now. The
+                best solution should be to
+                kill and restart the server,
+                but the *.frm file has not
+                been replaced yet. */
+                my_error(ER_CANNOT_ADD_FOREIGN,
+                         MYF(0));
+                sql_print_error(
+                    "InnoDB: dict_load_foreigns()"
+                    " returned %u for %s",
+                    (unsigned) error,
+                    thd_query_string(user_thd)
+                    ->str);
+                ut_ad(0);
+            } else {
+                if (!commit_cache_norebuild(
+                        ctx, table, trx)) {
+                    ut_a(!prebuilt->trx->check_foreigns);
+                }
+
+                innobase_rename_columns_cache(
+                    ha_alter_info, table,
+                    ctx->new_table);
+            }
+        }
+        DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+                          crash_inject_count++);
+    }
+
+    /* Invalidate the index translation table. In partitioned
+    tables, there is one TABLE_SHARE (and also only one TABLE)
+    covering all partitions. */
+    share->idx_trans_tbl.index_count = 0;
+
+    if (trx == ctx0->trx) {
+        ctx0->trx = NULL;
+    }
+
+    /* Tell the InnoDB server that there might be work for
+    utility threads: */
+
+    srv_active_wake_master_thread();
+
+    if (fail) {
+        for (inplace_alter_handler_ctx** pctx = ctx_array;
+             *pctx; pctx++) {
+            ha_innobase_inplace_ctx* ctx
+                = static_cast<ha_innobase_inplace_ctx*>
+                (*pctx);
+            DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+            ut_d(dict_table_check_for_dup_indexes(
+                     ctx->old_table,
+                     CHECK_ABORTED_OK));
+            ut_a(fts_check_cached_index(ctx->old_table));
+            DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
+                              crash_fail_inject_count++);
+        }
+
+        row_mysql_unlock_data_dictionary(trx);
+        trx_free_for_mysql(trx);
+        DBUG_RETURN(true);
+    }
+
+    /* Release the table locks. */
+    trx_commit_for_mysql(prebuilt->trx);
+
+    DBUG_EXECUTE_IF("ib_ddl_crash_after_user_trx_commit", DBUG_SUICIDE(););
+
+    for (inplace_alter_handler_ctx** pctx = ctx_array;
+         *pctx; pctx++) {
+        ha_innobase_inplace_ctx* ctx
+            = static_cast<ha_innobase_inplace_ctx*>
+            (*pctx);
+        DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+        if (altered_table->found_next_number_field) {
+            dict_table_t* t = ctx->new_table;
+
+            dict_table_autoinc_lock(t);
+            dict_table_autoinc_initialize(t, ctx->max_autoinc);
+            dict_table_autoinc_unlock(t);
+        }
+
+        bool    add_fts = false;
+
+        /* Publish the created fulltext index, if any.
+        Note that a fulltext index can be created without
+        creating the clustered index, if there already exists
+        a suitable FTS_DOC_ID column. If not, one will be
+        created, implying new_clustered */
+        for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+            dict_index_t* index = ctx->add_index[i];
+
+            if (index->type & DICT_FTS) {
+                DBUG_ASSERT(index->type == DICT_FTS);
+                /* We reset DICT_TF2_FTS here because the bit
+                is left unset when a drop precedes the add.
*/ + DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); + fts_add_index(index, ctx->new_table); + add_fts = true; + } + } + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ALL_COMPLETE)); + + if (add_fts) { + fts_optimize_add_table(ctx->new_table); + } + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ABORTED_OK)); + ut_a(fts_check_cached_index(ctx->new_table)); + + if (new_clustered) { + /* Since the table has been rebuilt, we remove + all persistent statistics corresponding to the + old copy of the table (which was renamed to + ctx->tmp_name). */ + + char errstr[1024]; + + DBUG_ASSERT(0 == strcmp(ctx->old_table->name, + ctx->tmp_name)); + + if (dict_stats_drop_table( + ctx->new_table->name, + errstr, sizeof(errstr)) + != DB_SUCCESS) { + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Deleting persistent statistics" + " for rebuilt table '%s' in" + " InnoDB failed: %s", + table->s->table_name.str, + errstr); + } + + DBUG_EXECUTE_IF("ib_ddl_crash_before_commit", + DBUG_SUICIDE();); + + trx_t* const user_trx = prebuilt->trx; + + row_prebuilt_free(ctx->prebuilt, TRUE); + + /* Drop the copy of the old table, which was + renamed to ctx->tmp_name at the atomic DDL + transaction commit. If the system crashes + before this is completed, some orphan tables + with ctx->tmp_name may be recovered. */ + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + row_merge_drop_table(trx, ctx->old_table); + trx_commit_for_mysql(trx); + + /* Rebuild the prebuilt object. */ + ctx->prebuilt = row_create_prebuilt( + ctx->new_table, altered_table->s->reclength); + trx_start_if_not_started(user_trx); + user_trx->will_lock++; + prebuilt->trx = user_trx; + } + DBUG_INJECT_CRASH("ib_commit_inplace_crash", + crash_inject_count++); + } + + row_mysql_unlock_data_dictionary(trx); + trx_free_for_mysql(trx); + + /* TODO: The following code could be executed + while allowing concurrent access to the table + (MDL downgrade). */ + + if (new_clustered) { + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(ctx->need_rebuild()); + + alter_stats_rebuild( + ctx->new_table, table->s->table_name.str, + user_thd); + DBUG_INJECT_CRASH("ib_commit_inplace_crash", + crash_inject_count++); + } + } else { + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(!ctx->need_rebuild()); + + alter_stats_norebuild( + ha_alter_info, ctx, altered_table, + table->s->table_name.str, user_thd); + DBUG_INJECT_CRASH("ib_commit_inplace_crash", + crash_inject_count++); + } + } + + /* TODO: Also perform DROP TABLE and DROP INDEX after + the MDL downgrade. 
*/ + +#ifndef DBUG_OFF + dict_index_t* clust_index = dict_table_get_first_index( + prebuilt->table); + DBUG_ASSERT(!clust_index->online_log); + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + + for (dict_index_t* index = dict_table_get_first_index( + prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + DBUG_ASSERT(!index->to_be_dropped); + } +#endif /* DBUG_OFF */ + + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + DBUG_RETURN(false); +} + +/** +@param thd - the session +@param start_value - the lower bound +@param max_value - the upper bound (inclusive) */ +UNIV_INTERN +ib_sequence_t::ib_sequence_t( + THD* thd, + ulonglong start_value, + ulonglong max_value) + : + m_max_value(max_value), + m_increment(0), + m_offset(0), + m_next_value(start_value), + m_eof(false) +{ + if (thd != 0 && m_max_value > 0) { + + thd_get_autoinc(thd, &m_offset, &m_increment); + + if (m_increment > 1 || m_offset > 1) { + + /* If there is an offset or increment specified + then we need to work out the exact next value. */ + + m_next_value = innobase_next_autoinc( + start_value, 1, + m_increment, m_offset, m_max_value); + + } else if (start_value == 0) { + /* The next value can never be 0. */ + m_next_value = 1; + } + } else { + m_eof = true; + } +} + +/** +Postfix increment +@return the next value to insert */ +UNIV_INTERN +ulonglong +ib_sequence_t::operator++(int) UNIV_NOTHROW +{ + ulonglong current = m_next_value; + + ut_ad(!m_eof); + ut_ad(m_max_value > 0); + + m_next_value = innobase_next_autoinc( + current, 1, m_increment, m_offset, m_max_value); + + if (m_next_value == m_max_value && current == m_next_value) { + m_eof = true; + } + + return(current); +} diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc new file mode 100644 index 00000000000..2f9d123f1d6 --- /dev/null +++ b/storage/xtradb/handler/i_s.cc @@ -0,0 +1,8557 @@ +/***************************************************************************** + +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file handler/i_s.cc +InnoDB INFORMATION SCHEMA tables interface to MySQL. 
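
Before moving on to i_s.cc: the ib_sequence_t class that closes handler0alter.cc above hands out AUTO_INCREMENT values through innobase_next_autoinc(), honoring the session's auto_increment_increment and auto_increment_offset. A toy model of the stepping rule as documented for those variables (this mimics, but is not, innobase_next_autoinc(); overflow handling and the offset-greater-than-increment corner case are omitted):

#include <cassert>
#include <cstdint>

/* Produce values of the form offset + k * increment that are strictly
greater than `current`, saturating at `max`. */
static uint64_t toy_next(uint64_t current, uint64_t increment,
                         uint64_t offset, uint64_t max)
{
    uint64_t k = current < offset ? 0 : (current - offset) / increment + 1;
    uint64_t next = offset + k * increment;
    return next > max ? max : next;
}

int main()
{
    /* increment = 10, offset = 3: the sequence is 3, 13, 23, ... */
    assert(toy_next(0, 10, 3, 1000) == 3);
    assert(toy_next(3, 10, 3, 1000) == 13);
    assert(toy_next(17, 10, 3, 1000) == 23);
    /* Saturation at the column maximum, as with m_eof above. */
    assert(toy_next(999, 10, 3, 1000) == 1000);
    return 0;
}
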
+ +Created July 18, 2007 Vasil Dimov +*******************************************************/ +#ifndef MYSQL_SERVER +#define MYSQL_SERVER /* For Item_* classes */ +#include <item.h> +/* Prevent influence of this definition to other headers */ +#undef MYSQL_SERVER +#else +#include <mysql_priv.h> +#endif //MYSQL_SERVER + +#include <mysqld_error.h> +#include <sql_acl.h> + +#include <m_ctype.h> +#include <hash.h> +#include <myisampack.h> +#include <mysys_err.h> +#include <my_sys.h> +#include "i_s.h" +#include <sql_plugin.h> +#include <mysql/innodb_priv.h> +#include <debug_sync.h> + +#include "btr0pcur.h" +#include "btr0types.h" +#include "dict0dict.h" +#include "dict0load.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "ibuf0ibuf.h" +#include "dict0mem.h" +#include "dict0types.h" +#include "ha_prototypes.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "trx0i_s.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "fut0fut.h" +#include "pars0pars.h" +#include "fts0types.h" +#include "fts0opt.h" +#include "fts0priv.h" +#include "log0online.h" +#include "btr0btr.h" +#include "page0zip.h" + +/** structure associates a name string with a file page type and/or buffer +page state. */ +struct buf_page_desc_t{ + const char* type_str; /*!< String explain the page + type/state */ + ulint type_value; /*!< Page type or page state */ +}; + +/** Change buffer B-tree page */ +#define I_S_PAGE_TYPE_IBUF (FIL_PAGE_TYPE_LAST + 1) + +/** Any states greater than I_S_PAGE_TYPE_IBUF would be treated as +unknown. */ +#define I_S_PAGE_TYPE_UNKNOWN (I_S_PAGE_TYPE_IBUF + 1) + +/** We also define I_S_PAGE_TYPE_INDEX as the Index Page's position +in i_s_page_type[] array */ +#define I_S_PAGE_TYPE_INDEX 1 + +/** Name string for File Page Types */ +static buf_page_desc_t i_s_page_type[] = { + {"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED}, + {"INDEX", FIL_PAGE_INDEX}, + {"UNDO_LOG", FIL_PAGE_UNDO_LOG}, + {"INODE", FIL_PAGE_INODE}, + {"IBUF_FREE_LIST", FIL_PAGE_IBUF_FREE_LIST}, + {"IBUF_BITMAP", FIL_PAGE_IBUF_BITMAP}, + {"SYSTEM", FIL_PAGE_TYPE_SYS}, + {"TRX_SYSTEM", FIL_PAGE_TYPE_TRX_SYS}, + {"FILE_SPACE_HEADER", FIL_PAGE_TYPE_FSP_HDR}, + {"EXTENT_DESCRIPTOR", FIL_PAGE_TYPE_XDES}, + {"BLOB", FIL_PAGE_TYPE_BLOB}, + {"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB}, + {"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2}, + {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF}, + {"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN} +}; + +/* Check if we can hold all page type in a 4 bit value */ +#if I_S_PAGE_TYPE_UNKNOWN > 1<<4 +# error "i_s_page_type[] is too large" +#endif + +/** This structure defines information we will fetch from pages +currently cached in the buffer pool. It will be used to populate +table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */ +struct buf_page_info_t{ + ulint block_id; /*!< Buffer Pool block ID */ + unsigned space_id:32; /*!< Tablespace ID */ + unsigned page_num:32; /*!< Page number/offset */ + unsigned access_time:32; /*!< Time of first access */ + unsigned pool_id:MAX_BUFFER_POOLS_BITS; + /*!< Buffer Pool ID. 
Must be less than + MAX_BUFFER_POOLS */ + unsigned flush_type:2; /*!< Flush type */ + unsigned io_fix:2; /*!< type of pending I/O operation */ + unsigned fix_count:19; /*!< Count of how manyfold this block + is bufferfixed */ + unsigned hashed:1; /*!< Whether hash index has been + built on this page */ + unsigned is_old:1; /*!< TRUE if the block is in the old + blocks in buf_pool->LRU_old */ + unsigned freed_page_clock:31; /*!< the value of + buf_pool->freed_page_clock */ + unsigned zip_ssize:PAGE_ZIP_SSIZE_BITS; + /*!< Compressed page size */ + unsigned page_state:BUF_PAGE_STATE_BITS; /*!< Page state */ + unsigned page_type:4; /*!< Page type */ + unsigned num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2; + /*!< Number of records on Page */ + unsigned data_size:UNIV_PAGE_SIZE_SHIFT_MAX; + /*!< Sum of the sizes of the records */ + lsn_t newest_mod; /*!< Log sequence number of + the youngest modification */ + lsn_t oldest_mod; /*!< Log sequence number of + the oldest modification */ + index_id_t index_id; /*!< Index ID if a index page */ +}; + +/** maximum number of buffer page info we would cache. */ +#define MAX_BUF_INFO_CACHED 10000 + +#define OK(expr) \ + if ((expr) != 0) { \ + DBUG_RETURN(1); \ + } + +#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \ +do { \ + if (!srv_was_started) { \ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \ + ER_CANT_FIND_SYSTEM_REC, \ + "InnoDB: SELECTing from " \ + "INFORMATION_SCHEMA.%s but " \ + "the InnoDB storage engine " \ + "is not installed", plugin_name); \ + DBUG_RETURN(0); \ + } \ +} while (0) + +#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && \ + !defined __INTEL_COMPILER && !defined __clang__ +#define STRUCT_FLD(name, value) name: value +#else +#define STRUCT_FLD(name, value) value +#endif + +/* Don't use a static const variable here, as some C++ compilers (notably +HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */ +#define END_OF_ST_FIELD_INFO \ + {STRUCT_FLD(field_name, NULL), \ + STRUCT_FLD(field_length, 0), \ + STRUCT_FLD(field_type, MYSQL_TYPE_NULL), \ + STRUCT_FLD(value, 0), \ + STRUCT_FLD(field_flags, 0), \ + STRUCT_FLD(old_name, ""), \ + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)} + +/* +Use the following types mapping: + +C type ST_FIELD_INFO::field_type +--------------------------------- +long MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS) + +long unsigned MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +char* MYSQL_TYPE_STRING +(field_length=n) + +float MYSQL_TYPE_FLOAT +(field_length=0 is ignored) + +void* MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +boolean (if else) MYSQL_TYPE_LONG +(field_length=1) + +time_t MYSQL_TYPE_DATETIME +(field_length=0 ignored) +--------------------------------- +*/ + +/*******************************************************************//** +Common function to fill any of the dynamic tables: +INFORMATION_SCHEMA.innodb_trx +INFORMATION_SCHEMA.innodb_locks +INFORMATION_SCHEMA.innodb_lock_waits +@return 0 on success */ +static +int +trx_i_s_common_fill_table( +/*======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ); /*!< in: condition (not used) */ + +/*******************************************************************//** +Unbind a dynamic INFORMATION_SCHEMA table. 
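
Two details worth noting in the declarations above: buf_page_info_t packs an entire page descriptor into bit-fields because up to MAX_BUF_INFO_CACHED (10000) descriptors are buffered per scan, and the 4-bit page_type field is what the "#if I_S_PAGE_TYPE_UNKNOWN > 1<<4" guard protects. A standalone sketch of the packing with a tighter compile-time check (enumerator values here are illustrative, not InnoDB's; note that the original ">" comparison would still accept the value 16, which no longer fits in four bits, so ">=" would be the safer form):

#include <cstdint>

/* Illustrative page-type enum; the values are stand-ins. */
enum PageType : std::uint8_t {
    PAGE_ALLOCATED = 0,
    PAGE_INDEX     = 1,
    PAGE_IBUF      = 13,
    PAGE_UNKNOWN   = 14
};

/* A 4-bit field holds 0..15, so require the largest value < 1 << 4. */
static_assert(PAGE_UNKNOWN < (1 << 4), "page type must fit in 4 bits");

struct PageInfo {
    unsigned page_type : 4;  /* mirrors buf_page_info_t::page_type */
    unsigned is_old    : 1;  /* mirrors buf_page_info_t::is_old */
};

int main()
{
    PageInfo p = {};
    p.page_type = PAGE_UNKNOWN;  /* 14 fits; 16 would silently truncate */
    return p.page_type == PAGE_UNKNOWN ? 0 : 1;
}
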
+@return 0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+    void*   p); /*!< in/out: table schema object */
+/*******************************************************************//**
+Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME
+field.
+@return 0 on success */
+static
+int
+field_store_time_t(
+/*===============*/
+    Field*  field,  /*!< in/out: target field for storage */
+    time_t  time)   /*!< in: value to store */
+{
+    MYSQL_TIME  my_time;
+    struct tm   tm_time;
+
+    if (time) {
+#if 0
+        /* use this if you are sure that `variables' and `time_zone'
+        are always initialized */
+        thd->variables.time_zone->gmt_sec_to_TIME(
+            &my_time, (my_time_t) time);
+#else
+        localtime_r(&time, &tm_time);
+        localtime_to_TIME(&my_time, &tm_time);
+        my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
+#endif
+    } else {
+        memset(&my_time, 0, sizeof(my_time));
+    }
+
+    return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME));
+}
+
+/*******************************************************************//**
+Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
+@return 0 on success */
+static
+int
+field_store_string(
+/*===============*/
+    Field*      field,  /*!< in/out: target field for storage */
+    const char* str)    /*!< in: NUL-terminated utf-8 string,
+                        or NULL */
+{
+    int ret;
+
+    if (str != NULL) {
+
+        ret = field->store(str, static_cast<uint>(strlen(str)),
+                           system_charset_info);
+        field->set_notnull();
+    } else {
+
+        ret = 0; /* success */
+        field->set_null();
+    }
+
+    return(ret);
+}
+
+/*******************************************************************//**
+Store the name of an index in a MYSQL_TYPE_VARCHAR field.
+Handles the names of incomplete secondary indexes.
+@return 0 on success */
+static
+int
+field_store_index_name(
+/*===================*/
+    Field*      field,      /*!< in/out: target field for
+                            storage */
+    const char* index_name) /*!< in: NUL-terminated utf-8
+                            index name, possibly starting with
+                            TEMP_INDEX_PREFIX */
+{
+    int ret;
+
+    ut_ad(index_name != NULL);
+    ut_ad(field->real_type() == MYSQL_TYPE_VARCHAR);
+
+    /* Since TEMP_INDEX_PREFIX is not a valid UTF8, we need to convert
+    it to something else. */
+    if (index_name[0] == TEMP_INDEX_PREFIX) {
+        char    buf[NAME_LEN + 1];
+        buf[0] = '?';
+        memcpy(buf + 1, index_name + 1, strlen(index_name));
+        ret = field->store(
+            buf, static_cast<uint>(strlen(buf)),
+            system_charset_info);
+    } else {
+        ret = field->store(
+            index_name, static_cast<uint>(strlen(index_name)),
+            system_charset_info);
+    }
+
+    field->set_notnull();
+
+    return(ret);
+}
+
+/*******************************************************************//**
+Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
+If the value is ULINT_UNDEFINED then the field is set to NULL.
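
field_store_index_name() above must sanitize in-progress index names because the TEMP_INDEX_PREFIX marker byte is not valid UTF-8; the first byte is rendered as '?' for INFORMATION_SCHEMA output. A self-contained sketch of that conversion (marker value and names are stand-ins):

#include <cstdio>
#include <string>

/* Sketch of the conversion: an in-progress index name starts with a
marker byte, which is replaced by '?' for display. */
static std::string displayable_index_name(const char* index_name,
                                          char temp_prefix)
{
    if (index_name[0] == temp_prefix) {
        return "?" + std::string(index_name + 1);
    }
    return index_name;
}

int main()
{
    const char TEMP_PREFIX = '\377';  /* stand-in for TEMP_INDEX_PREFIX */
    char raw[] = "\377new_idx";
    std::printf("%s\n", displayable_index_name(raw, TEMP_PREFIX).c_str());
    /* prints: ?new_idx */
    return 0;
}
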
+@return 0 on success */ +static +int +field_store_ulint( +/*==============*/ + Field* field, /*!< in/out: target field for storage */ + ulint n) /*!< in: value to store */ +{ + int ret; + + if (n != ULINT_UNDEFINED) { + + ret = field->store(static_cast<double>(n)); + field->set_notnull(); + } else { + + ret = 0; /* success */ + field->set_null(); + } + + return(ret); +} + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */ +static ST_FIELD_INFO innodb_trx_fields_info[] = +{ +#define IDX_TRX_ID 0 + {STRUCT_FLD(field_name, "trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_STATE 1 + {STRUCT_FLD(field_name, "trx_state"), + STRUCT_FLD(field_length, TRX_QUE_STATE_STR_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_STARTED 2 + {STRUCT_FLD(field_name, "trx_started"), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_REQUESTED_LOCK_ID 3 + {STRUCT_FLD(field_name, "trx_requested_lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_WAIT_STARTED 4 + {STRUCT_FLD(field_name, "trx_wait_started"), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_WEIGHT 5 + {STRUCT_FLD(field_name, "trx_weight"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_MYSQL_THREAD_ID 6 + {STRUCT_FLD(field_name, "trx_mysql_thread_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_QUERY 7 + {STRUCT_FLD(field_name, "trx_query"), + STRUCT_FLD(field_length, TRX_I_S_TRX_QUERY_MAX_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_OPERATION_STATE 8 + {STRUCT_FLD(field_name, "trx_operation_state"), + STRUCT_FLD(field_length, TRX_I_S_TRX_OP_STATE_MAX_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_TABLES_IN_USE 9 + {STRUCT_FLD(field_name, "trx_tables_in_use"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, 
""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_TABLES_LOCKED 10 + {STRUCT_FLD(field_name, "trx_tables_locked"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_LOCK_STRUCTS 11 + {STRUCT_FLD(field_name, "trx_lock_structs"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_LOCK_MEMORY_BYTES 12 + {STRUCT_FLD(field_name, "trx_lock_memory_bytes"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_ROWS_LOCKED 13 + {STRUCT_FLD(field_name, "trx_rows_locked"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_ROWS_MODIFIED 14 + {STRUCT_FLD(field_name, "trx_rows_modified"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_CONNCURRENCY_TICKETS 15 + {STRUCT_FLD(field_name, "trx_concurrency_tickets"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_ISOLATION_LEVEL 16 + {STRUCT_FLD(field_name, "trx_isolation_level"), + STRUCT_FLD(field_length, TRX_I_S_TRX_ISOLATION_LEVEL_MAX_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_UNIQUE_CHECKS 17 + {STRUCT_FLD(field_name, "trx_unique_checks"), + STRUCT_FLD(field_length, 1), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 1), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_FOREIGN_KEY_CHECKS 18 + {STRUCT_FLD(field_name, "trx_foreign_key_checks"), + STRUCT_FLD(field_length, 1), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 1), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_LAST_FOREIGN_KEY_ERROR 19 + {STRUCT_FLD(field_name, "trx_last_foreign_key_error"), + STRUCT_FLD(field_length, TRX_I_S_TRX_FK_ERROR_MAX_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_ADAPTIVE_HASH_LATCHED 20 + {STRUCT_FLD(field_name, "trx_adaptive_hash_latched"), + STRUCT_FLD(field_length, 1), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + 
STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_ADAPTIVE_HASH_TIMEOUT 21
+ {STRUCT_FLD(field_name, "trx_adaptive_hash_timeout"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_READ_ONLY 22
+ {STRUCT_FLD(field_name, "trx_is_read_only"),
+ STRUCT_FLD(field_length, 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_AUTOCOMMIT_NON_LOCKING 23
+ {STRUCT_FLD(field_name, "trx_autocommit_non_locking"),
+ STRUCT_FLD(field_length, 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx
+table with it.
+@return 0 on success */
+static
+int
+fill_innodb_trx_from_cache(
+/*=======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: used to call
+ schema_table_store_record() */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_trx_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_TRX);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_trx_row_t* row;
+ char trx_id[TRX_ID_MAX_LEN + 1];
+
+ row = (i_s_trx_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_TRX, i);
+
+ /* trx_id */
+ ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, row->trx_id);
+ OK(field_store_string(fields[IDX_TRX_ID], trx_id));
+
+ /* trx_state */
+ OK(field_store_string(fields[IDX_TRX_STATE],
+ row->trx_state));
+
+ /* trx_started */
+ OK(field_store_time_t(fields[IDX_TRX_STARTED],
+ (time_t) row->trx_started));
+
+ /* trx_requested_lock_id */
+ /* trx_wait_started */
+ if (row->trx_wait_started != 0) {
+
+ OK(field_store_string(
+ fields[IDX_TRX_REQUESTED_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->requested_lock_row,
+ lock_id, sizeof(lock_id))));
+ /* field_store_string() sets it to notnull */
+
+ OK(field_store_time_t(
+ fields[IDX_TRX_WAIT_STARTED],
+ (time_t) row->trx_wait_started));
+ fields[IDX_TRX_WAIT_STARTED]->set_notnull();
+ } else {
+
+ fields[IDX_TRX_REQUESTED_LOCK_ID]->set_null();
+ fields[IDX_TRX_WAIT_STARTED]->set_null();
+ }
+
+ /* trx_weight */
+ OK(fields[IDX_TRX_WEIGHT]->store((longlong) row->trx_weight,
+ true));
+
+ /* trx_mysql_thread_id */
+ OK(fields[IDX_TRX_MYSQL_THREAD_ID]->store(
+ static_cast<double>(row->trx_mysql_thread_id)));
+
+ /* trx_query */
+ if (row->trx_query) {
+ /* store will do appropriate character set
+ conversion check */
+ fields[IDX_TRX_QUERY]->store(
+ row->trx_query,
+ static_cast<uint>(strlen(row->trx_query)),
+ row->trx_query_cs);
+ fields[IDX_TRX_QUERY]->set_notnull();
+ } else {
+ fields[IDX_TRX_QUERY]->set_null();
+ }
+
+ /* trx_operation_state */
+ OK(field_store_string(fields[IDX_TRX_OPERATION_STATE],
+ row->trx_operation_state));
+
+ /* trx_tables_in_use */
+ OK(fields[IDX_TRX_TABLES_IN_USE]->store(
+ (longlong) row->trx_tables_in_use, true));
+
+ /* trx_tables_locked */
+ 
OK(fields[IDX_TRX_TABLES_LOCKED]->store( + (longlong) row->trx_tables_locked, true)); + + /* trx_lock_structs */ + OK(fields[IDX_TRX_LOCK_STRUCTS]->store( + (longlong) row->trx_lock_structs, true)); + + /* trx_lock_memory_bytes */ + OK(fields[IDX_TRX_LOCK_MEMORY_BYTES]->store( + (longlong) row->trx_lock_memory_bytes, true)); + + /* trx_rows_locked */ + OK(fields[IDX_TRX_ROWS_LOCKED]->store( + (longlong) row->trx_rows_locked, true)); + + /* trx_rows_modified */ + OK(fields[IDX_TRX_ROWS_MODIFIED]->store( + (longlong) row->trx_rows_modified, true)); + + /* trx_concurrency_tickets */ + OK(fields[IDX_TRX_CONNCURRENCY_TICKETS]->store( + (longlong) row->trx_concurrency_tickets, true)); + + /* trx_isolation_level */ + OK(field_store_string(fields[IDX_TRX_ISOLATION_LEVEL], + row->trx_isolation_level)); + + /* trx_unique_checks */ + OK(fields[IDX_TRX_UNIQUE_CHECKS]->store( + static_cast<double>(row->trx_unique_checks))); + + /* trx_foreign_key_checks */ + OK(fields[IDX_TRX_FOREIGN_KEY_CHECKS]->store( + static_cast<double>(row->trx_foreign_key_checks))); + + /* trx_last_foreign_key_error */ + OK(field_store_string(fields[IDX_TRX_LAST_FOREIGN_KEY_ERROR], + row->trx_foreign_key_error)); + + /* trx_adaptive_hash_latched */ + OK(fields[IDX_TRX_ADAPTIVE_HASH_LATCHED]->store( + static_cast<double>(row->trx_has_search_latch))); + + /* trx_adaptive_hash_timeout */ + OK(fields[IDX_TRX_ADAPTIVE_HASH_TIMEOUT]->store( + (longlong) row->trx_search_latch_timeout, true)); + + /* trx_is_read_only*/ + OK(fields[IDX_TRX_READ_ONLY]->store( + (longlong) row->trx_is_read_only, true)); + + /* trx_is_autocommit_non_locking */ + OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store( + (longlong) row->trx_is_autocommit_non_locking, + true)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_trx +@return 0 on success */ +static +int +innodb_trx_init( +/*============*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_trx_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_trx_fields_info; + schema->fill_table = trx_i_s_common_fill_table; + + DBUG_RETURN(0); +} + +static struct st_mysql_information_schema i_s_info = +{ + MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_trx = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_TRX"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB transactions"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_trx_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* 
reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */ +static ST_FIELD_INFO innodb_locks_fields_info[] = +{ +#define IDX_LOCK_ID 0 + {STRUCT_FLD(field_name, "lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_TRX_ID 1 + {STRUCT_FLD(field_name, "lock_trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_MODE 2 + {STRUCT_FLD(field_name, "lock_mode"), + /* S[,GAP] X[,GAP] IS[,GAP] IX[,GAP] AUTO_INC UNKNOWN */ + STRUCT_FLD(field_length, 32), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_TYPE 3 + {STRUCT_FLD(field_name, "lock_type"), + STRUCT_FLD(field_length, 32 /* RECORD|TABLE|UNKNOWN */), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_TABLE 4 + {STRUCT_FLD(field_name, "lock_table"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_INDEX 5 + {STRUCT_FLD(field_name, "lock_index"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_SPACE 6 + {STRUCT_FLD(field_name, "lock_space"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_PAGE 7 + {STRUCT_FLD(field_name, "lock_page"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_REC 8 + {STRUCT_FLD(field_name, "lock_rec"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_DATA 9 + {STRUCT_FLD(field_name, "lock_data"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_DATA_MAX_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks +table with 
it. +@return 0 on success */ +static +int +fill_innodb_locks_from_cache( +/*=========================*/ + trx_i_s_cache_t* cache, /*!< in: cache to read from */ + THD* thd, /*!< in: MySQL client connection */ + TABLE* table) /*!< in/out: fill this table */ +{ + Field** fields; + ulint rows_num; + char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_locks_from_cache"); + + fields = table->field; + + rows_num = trx_i_s_cache_get_rows_used(cache, + I_S_INNODB_LOCKS); + + for (i = 0; i < rows_num; i++) { + + i_s_locks_row_t* row; + char buf[MAX_FULL_NAME_LEN + 1]; + const char* bufend; + + char lock_trx_id[TRX_ID_MAX_LEN + 1]; + + row = (i_s_locks_row_t*) + trx_i_s_cache_get_nth_row( + cache, I_S_INNODB_LOCKS, i); + + /* lock_id */ + trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id)); + OK(field_store_string(fields[IDX_LOCK_ID], + lock_id)); + + /* lock_trx_id */ + ut_snprintf(lock_trx_id, sizeof(lock_trx_id), + TRX_ID_FMT, row->lock_trx_id); + OK(field_store_string(fields[IDX_LOCK_TRX_ID], lock_trx_id)); + + /* lock_mode */ + OK(field_store_string(fields[IDX_LOCK_MODE], + row->lock_mode)); + + /* lock_type */ + OK(field_store_string(fields[IDX_LOCK_TYPE], + row->lock_type)); + + /* lock_table */ + bufend = innobase_convert_name(buf, sizeof(buf), + row->lock_table, + strlen(row->lock_table), + thd, TRUE); + OK(fields[IDX_LOCK_TABLE]->store( + buf, static_cast<uint>(bufend - buf), + system_charset_info)); + + /* lock_index */ + if (row->lock_index != NULL) { + OK(field_store_index_name(fields[IDX_LOCK_INDEX], + row->lock_index)); + } else { + fields[IDX_LOCK_INDEX]->set_null(); + } + + /* lock_space */ + OK(field_store_ulint(fields[IDX_LOCK_SPACE], + row->lock_space)); + + /* lock_page */ + OK(field_store_ulint(fields[IDX_LOCK_PAGE], + row->lock_page)); + + /* lock_rec */ + OK(field_store_ulint(fields[IDX_LOCK_REC], + row->lock_rec)); + + /* lock_data */ + OK(field_store_string(fields[IDX_LOCK_DATA], + row->lock_data)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_locks +@return 0 on success */ +static +int +innodb_locks_init( +/*==============*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_locks_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_locks_fields_info; + schema->fill_table = trx_i_s_common_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_locks = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_LOCKS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB conflicting locks"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_locks_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, 
INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */ +static ST_FIELD_INFO innodb_lock_waits_fields_info[] = +{ +#define IDX_REQUESTING_TRX_ID 0 + {STRUCT_FLD(field_name, "requesting_trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_REQUESTED_LOCK_ID 1 + {STRUCT_FLD(field_name, "requested_lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BLOCKING_TRX_ID 2 + {STRUCT_FLD(field_name, "blocking_trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BLOCKING_LOCK_ID 3 + {STRUCT_FLD(field_name, "blocking_lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Read data from cache buffer and fill the +INFORMATION_SCHEMA.innodb_lock_waits table with it. 
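+Each cached row pairs a requested lock with the lock that blocks it,
+so four columns are emitted per row: the two transaction ids, printed
+with TRX_ID_FMT, and the two lock ids, built with
+trx_i_s_create_lock_id().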
+@return 0 on success */ +static +int +fill_innodb_lock_waits_from_cache( +/*==============================*/ + trx_i_s_cache_t* cache, /*!< in: cache to read from */ + THD* thd, /*!< in: used to call + schema_table_store_record() */ + TABLE* table) /*!< in/out: fill this table */ +{ + Field** fields; + ulint rows_num; + char requested_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + char blocking_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_lock_waits_from_cache"); + + fields = table->field; + + rows_num = trx_i_s_cache_get_rows_used(cache, + I_S_INNODB_LOCK_WAITS); + + for (i = 0; i < rows_num; i++) { + + i_s_lock_waits_row_t* row; + + char requesting_trx_id[TRX_ID_MAX_LEN + 1]; + char blocking_trx_id[TRX_ID_MAX_LEN + 1]; + + row = (i_s_lock_waits_row_t*) + trx_i_s_cache_get_nth_row( + cache, I_S_INNODB_LOCK_WAITS, i); + + /* requesting_trx_id */ + ut_snprintf(requesting_trx_id, sizeof(requesting_trx_id), + TRX_ID_FMT, row->requested_lock_row->lock_trx_id); + OK(field_store_string(fields[IDX_REQUESTING_TRX_ID], + requesting_trx_id)); + + /* requested_lock_id */ + OK(field_store_string( + fields[IDX_REQUESTED_LOCK_ID], + trx_i_s_create_lock_id( + row->requested_lock_row, + requested_lock_id, + sizeof(requested_lock_id)))); + + /* blocking_trx_id */ + ut_snprintf(blocking_trx_id, sizeof(blocking_trx_id), + TRX_ID_FMT, row->blocking_lock_row->lock_trx_id); + OK(field_store_string(fields[IDX_BLOCKING_TRX_ID], + blocking_trx_id)); + + /* blocking_lock_id */ + OK(field_store_string( + fields[IDX_BLOCKING_LOCK_ID], + trx_i_s_create_lock_id( + row->blocking_lock_row, + blocking_lock_id, + sizeof(blocking_lock_id)))); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_lock_waits +@return 0 on success */ +static +int +innodb_lock_waits_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_lock_waits_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_lock_waits_fields_info; + schema->fill_table = trx_i_s_common_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_lock_waits = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_LOCK_WAITS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB which lock is blocking which"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_lock_waits_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + 
/* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/*******************************************************************//** +Common function to fill any of the dynamic tables: +INFORMATION_SCHEMA.innodb_trx +INFORMATION_SCHEMA.innodb_locks +INFORMATION_SCHEMA.innodb_lock_waits +@return 0 on success */ +static +int +trx_i_s_common_fill_table( +/*======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + const char* table_name; + int ret; + trx_i_s_cache_t* cache; + + DBUG_ENTER("trx_i_s_common_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + /* minimize the number of places where global variables are + referenced */ + cache = trx_i_s_cache; + + /* which table we have to fill? */ + table_name = tables->schema_table_name; + /* or table_name = tables->schema_table->table_name; */ + + RETURN_IF_INNODB_NOT_STARTED(table_name); + + /* update the cache */ + trx_i_s_cache_start_write(cache); + trx_i_s_possibly_fetch_data_into_cache(cache); + trx_i_s_cache_end_write(cache); + + if (trx_i_s_cache_is_truncated(cache)) { + + /* XXX show warning to user if possible */ + fprintf(stderr, "Warning: data in %s truncated due to " + "memory limit of %d bytes\n", table_name, + TRX_I_S_MEM_LIMIT); + } + + ret = 0; + + trx_i_s_cache_start_read(cache); + + if (innobase_strcasecmp(table_name, "innodb_trx") == 0) { + + if (fill_innodb_trx_from_cache( + cache, thd, tables->table) != 0) { + + ret = 1; + } + + } else if (innobase_strcasecmp(table_name, "innodb_locks") == 0) { + + if (fill_innodb_locks_from_cache( + cache, thd, tables->table) != 0) { + + ret = 1; + } + + } else if (innobase_strcasecmp(table_name, "innodb_lock_waits") == 0) { + + if (fill_innodb_lock_waits_from_cache( + cache, thd, tables->table) != 0) { + + ret = 1; + } + + } else { + + /* huh! what happened!? */ + fprintf(stderr, + "InnoDB: trx_i_s_common_fill_table() was " + "called to fill unknown table: %s.\n" + "This function only knows how to fill " + "innodb_trx, innodb_locks and " + "innodb_lock_waits tables.\n", table_name); + + ret = 1; + } + + trx_i_s_cache_end_read(cache); + +#if 0 + DBUG_RETURN(ret); +#else + /* if this function returns something else than 0 then a + deadlock occurs between the mysqld server and mysql client, + see http://bugs.mysql.com/29900 ; when that bug is resolved + we can enable the DBUG_RETURN(ret) above */ + ret++; // silence a gcc46 warning + DBUG_RETURN(0); +#endif +} + +/* Fields of the dynamic table information_schema.innodb_cmp. 
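+Each row describes one compressed page size and carries the cumulated
+compression/decompression operation counts and their durations in
+seconds.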
*/
+static ST_FIELD_INFO i_s_cmp_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "page_size"),
+ STRUCT_FLD(field_length, 5),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Compressed Page Size"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Compressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_ops_ok"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of"
+ " Successful Compressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Compressions,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "uncompress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Decompressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "uncompress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Decompressions,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp or
+innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill_low(
+/*=============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* , /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = (TABLE*) tables->table;
+ int status = 0;
+
+ DBUG_ENTER("i_s_cmp_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) {
+ page_zip_stat_t* zip_stat = &page_zip_stat[i];
+
+ table->field[0]->store(UNIV_ZIP_SIZE_MIN << i);
+
+ /* The cumulated counts are not protected by any
+ mutex. Thus, some operation in page0zip.cc could
+ increment a counter between the time we read it and
+ clear it. We could introduce mutex protection, but it
+ could cause a measurable performance hit in
+ page0zip.cc.
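+In the worst case an increment that lands between the read below and
+the memset() done for the reset variant is lost from the statistics,
+which is an acceptable trade-off for monitoring data.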
*/ + table->field[1]->store( + static_cast<double>(zip_stat->compressed)); + table->field[2]->store( + static_cast<double>(zip_stat->compressed_ok)); + table->field[3]->store( + static_cast<double>(zip_stat->compressed_usec / 1000000)); + table->field[4]->store( + static_cast<double>(zip_stat->decompressed)); + table->field[5]->store( + static_cast<double>(zip_stat->decompressed_usec / 1000000)); + + if (reset) { + memset(zip_stat, 0, sizeof *zip_stat); + } + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_fill( +/*=========*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_reset_fill( +/*===============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp. +@return 0 on success */ +static +int +i_s_cmp_init( +/*=========*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_reset. 
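+This differs from i_s_cmp_init() only in the fill callback: selecting
+from innodb_cmp_reset reports the same counters but also clears the
+cumulated counts.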
+@return 0 on success */ +static +int +i_s_cmp_reset_init( +/*===============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_reset_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMP"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compression"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmp_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMP_RESET"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compression;" + " reset cumulated counts"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmp_reset_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/* Fields of the dynamic tables +information_schema.innodb_cmp_per_index and +information_schema.innodb_cmp_per_index_reset. 
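+In addition to the counters, these tables carry database_name,
+table_name and index_name columns, so compression activity can be
+attributed to individual indexes.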
*/ +static ST_FIELD_INFO i_s_cmp_per_index_fields_info[] = +{ +#define IDX_DATABASE_NAME 0 + {STRUCT_FLD(field_name, "database_name"), + STRUCT_FLD(field_length, 192), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TABLE_NAME 1 + {STRUCT_FLD(field_name, "table_name"), + STRUCT_FLD(field_length, 192), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_INDEX_NAME 2 + {STRUCT_FLD(field_name, "index_name"), + STRUCT_FLD(field_length, 192), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_COMPRESS_OPS 3 + {STRUCT_FLD(field_name, "compress_ops"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_COMPRESS_OPS_OK 4 + {STRUCT_FLD(field_name, "compress_ops_ok"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_COMPRESS_TIME 5 + {STRUCT_FLD(field_name, "compress_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_UNCOMPRESS_OPS 6 + {STRUCT_FLD(field_name, "uncompress_ops"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_UNCOMPRESS_TIME 7 + {STRUCT_FLD(field_name, "uncompress_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Fill the dynamic table +information_schema.innodb_cmp_per_index or +information_schema.innodb_cmp_per_index_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill_low( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* , /*!< in: condition (ignored) */ + ibool reset) /*!< in: TRUE=reset cumulated counts */ +{ + TABLE* table = tables->table; + Field** fields = table->field; + int status = 0; + + DBUG_ENTER("i_s_cmp_per_index_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* Create a snapshot of the stats so we do not bump into lock + order violations with dict_sys->mutex below. 
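+The snapshot is taken while holding only page_zip_stat_per_index_mutex;
+dict_sys->mutex is then acquired separately to resolve index ids to
+names, so the two mutexes are never held at the same time.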
*/ + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index_t snap (page_zip_stat_per_index); + mutex_exit(&page_zip_stat_per_index_mutex); + + mutex_enter(&dict_sys->mutex); + + page_zip_stat_per_index_t::iterator iter; + ulint i; + + for (iter = snap.begin(), i = 0; iter != snap.end(); iter++, i++) { + + char name[192]; + dict_index_t* index = dict_index_find_on_id_low(iter->first); + + if (index != NULL) { + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(index->table_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + field_store_string(fields[IDX_DATABASE_NAME], db_utf8); + field_store_string(fields[IDX_TABLE_NAME], table_utf8); + field_store_index_name(fields[IDX_INDEX_NAME], + index->name); + } else { + /* index not found */ + ut_snprintf(name, sizeof(name), + "index_id:" IB_ID_FMT, iter->first); + field_store_string(fields[IDX_DATABASE_NAME], + "unknown"); + field_store_string(fields[IDX_TABLE_NAME], + "unknown"); + field_store_string(fields[IDX_INDEX_NAME], + name); + } + + fields[IDX_COMPRESS_OPS]->store( + static_cast<double>(iter->second.compressed)); + + fields[IDX_COMPRESS_OPS_OK]->store( + static_cast<double>(iter->second.compressed_ok)); + + fields[IDX_COMPRESS_TIME]->store( + static_cast<double>(iter->second.compressed_usec / 1000000)); + + fields[IDX_UNCOMPRESS_OPS]->store( + static_cast<double>(iter->second.decompressed)); + + fields[IDX_UNCOMPRESS_TIME]->store( + static_cast<double>(iter->second.decompressed_usec / 1000000)); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + + /* Release and reacquire the dict mutex to allow other + threads to proceed. This could eventually result in the + contents of INFORMATION_SCHEMA.innodb_cmp_per_index being + inconsistent, but it is an acceptable compromise. */ + if (i % 1000 == 0) { + mutex_exit(&dict_sys->mutex); + mutex_enter(&dict_sys->mutex); + } + } + + mutex_exit(&dict_sys->mutex); + + if (reset) { + page_zip_reset_stat_per_index(); + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill( +/*===================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_reset_fill( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_per_index. 
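+As with the other init functions, this only wires up the field metadata
+and the fill callback; rows are produced on demand when the table is
+selected from.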
+@return 0 on success */
+static
+int
+i_s_cmp_per_index_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_per_index_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_per_index_fields_info;
+ schema->fill_table = i_s_cmp_per_index_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_per_index_reset.
+@return 0 on success */
+static
+int
+i_s_cmp_per_index_reset_init(
+/*=========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_per_index_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_per_index_fields_info;
+ schema->fill_table = i_s_cmp_per_index_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_per_index =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_PER_INDEX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index)"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_per_index_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_per_index_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_PER_INDEX_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index);"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_per_index_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* 
*/ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/* Fields of the dynamic table information_schema.innodb_cmpmem. */ +static ST_FIELD_INFO i_s_cmpmem_fields_info[] = +{ + {STRUCT_FLD(field_name, "page_size"), + STRUCT_FLD(field_length, 5), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Buddy Block Size"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "buffer_pool_instance"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Buffer Pool Id"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "pages_used"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Currently in Use"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "pages_free"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Currently Available"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "relocation_ops"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Number of Relocations"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "relocation_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Duration of Relocations," + " in Seconds"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem or +innodb_cmpmem_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_fill_low( +/*================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* , /*!< in: condition (ignored) */ + ibool reset) /*!< in: TRUE=reset cumulated counts */ +{ + int status = 0; + TABLE* table = (TABLE*) tables->table; + + DBUG_ENTER("i_s_cmpmem_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + status = 0; + + buf_pool = buf_pool_from_array(i); + + mutex_enter(&buf_pool->zip_free_mutex); + + for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { + buf_buddy_stat_t* buddy_stat; + + buddy_stat = &buf_pool->buddy_stat[x]; + + table->field[0]->store(BUF_BUDDY_LOW << x); + table->field[1]->store(static_cast<double>(i)); + table->field[2]->store(static_cast<double>( + buddy_stat->used)); + table->field[3]->store(static_cast<double>( + (x < BUF_BUDDY_SIZES) + ? UT_LIST_GET_LEN(buf_pool->zip_free[x]) + : 0)); + table->field[4]->store( + (longlong) buddy_stat->relocated, true); + table->field[5]->store( + static_cast<double>(buddy_stat->relocated_usec / 1000000)); + + if (reset) { + /* This is protected by + buf_pool->zip_free_mutex. 
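+The mutex is held around the whole inner loop, so the read-then-clear
+of the relocation counters cannot race with concurrent relocations.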
*/ + buddy_stat->relocated = 0; + buddy_stat->relocated_usec = 0; + } + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + } + + mutex_exit(&buf_pool->zip_free_mutex); + + if (status) { + break; + } + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem. +@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_fill( +/*============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_reset_fill( +/*==================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmpmem. +@return 0 on success */ +static +int +i_s_cmpmem_init( +/*============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmpmem_reset. +@return 0 on success */ +static +int +i_s_cmpmem_reset_init( +/*==================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_reset_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMPMEM"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmpmem_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, 
MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMPMEM_RESET"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;" + " reset cumulated counts"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmpmem_reset_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */ +static ST_FIELD_INFO innodb_metrics_fields_info[] = +{ +#define METRIC_NAME 0 + {STRUCT_FLD(field_name, "NAME"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_SUBSYS 1 + {STRUCT_FLD(field_name, "SUBSYSTEM"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_VALUE_START 2 + {STRUCT_FLD(field_name, "COUNT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_MAX_VALUE_START 3 + {STRUCT_FLD(field_name, "MAX_COUNT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_MIN_VALUE_START 4 + {STRUCT_FLD(field_name, "MIN_COUNT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_AVG_VALUE_START 5 + {STRUCT_FLD(field_name, "AVG_COUNT"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_VALUE_RESET 6 + {STRUCT_FLD(field_name, "COUNT_RESET"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_MAX_VALUE_RESET 7 + 
{STRUCT_FLD(field_name, "MAX_COUNT_RESET"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_MIN_VALUE_RESET 8 + {STRUCT_FLD(field_name, "MIN_COUNT_RESET"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_AVG_VALUE_RESET 9 + {STRUCT_FLD(field_name, "AVG_COUNT_RESET"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_START_TIME 10 + {STRUCT_FLD(field_name, "TIME_ENABLED"), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_STOP_TIME 11 + {STRUCT_FLD(field_name, "TIME_DISABLED"), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_TIME_ELAPSED 12 + {STRUCT_FLD(field_name, "TIME_ELAPSED"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_RESET_TIME 13 + {STRUCT_FLD(field_name, "TIME_RESET"), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_STATUS 14 + {STRUCT_FLD(field_name, "STATUS"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_TYPE 15 + {STRUCT_FLD(field_name, "TYPE"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define METRIC_DESC 16 + {STRUCT_FLD(field_name, "COMMENT"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Fill the information schema metrics table. 
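+One row is emitted per monitor counter; module and hidden entries are
+skipped, and the min/max/average columns are set to NULL whenever a
+counter has not been updated since it was enabled or reset.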
+@return 0 on success */
+static
+int
+i_s_metrics_fill(
+/*=============*/
+	THD*		thd,		/*!< in: thread */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	int		count;
+	Field**		fields;
+	double		time_diff = 0;
+	monitor_info_t*	monitor_info;
+	mon_type_t	min_val;
+	mon_type_t	max_val;
+
+	DBUG_ENTER("i_s_metrics_fill");
+	fields = table_to_fill->field;
+
+	for (count = 0; count < NUM_MONITOR; count++) {
+		monitor_info = srv_mon_get_info((monitor_id_t) count);
+
+		/* A good place to sanity check the Monitor ID */
+		ut_a(count == monitor_info->monitor_id);
+
+		/* If the item refers to a Module, there is nothing to fill;
+		continue. */
+		if ((monitor_info->monitor_type & MONITOR_MODULE)
+		    || (monitor_info->monitor_type & MONITOR_HIDDEN)) {
+			continue;
+		}
+
+		/* If this is an existing "status variable" and its
+		corresponding counter is still on, we need to calculate
+		the result from that counter. */
+		if (monitor_info->monitor_type & MONITOR_EXISTING
+		    && MONITOR_IS_ON(count)) {
+			srv_mon_process_existing_counter((monitor_id_t) count,
+							 MONITOR_GET_VALUE);
+		}
+
+		/* Fill in counter's basic information */
+		OK(field_store_string(fields[METRIC_NAME],
+				      monitor_info->monitor_name));
+
+		OK(field_store_string(fields[METRIC_SUBSYS],
+				      monitor_info->monitor_module));
+
+		OK(field_store_string(fields[METRIC_DESC],
+				      monitor_info->monitor_desc));
+
+		/* Fill in counter values */
+		OK(fields[METRIC_VALUE_RESET]->store(
+			MONITOR_VALUE(count), FALSE));
+
+		OK(fields[METRIC_VALUE_START]->store(
+			MONITOR_VALUE_SINCE_START(count), FALSE));
+
+		/* If the max value is MAX_RESERVED, the counter's max
+		value has not been updated. Set the column value
+		to NULL. */
+		if (MONITOR_MAX_VALUE(count) == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_RESET]->store(
+				MONITOR_MAX_VALUE(count), FALSE));
+			fields[METRIC_MAX_VALUE_RESET]->set_notnull();
+		}
+
+		/* If the min value is MIN_RESERVED, the counter's min
+		value has not been updated. Set the column value
+		to NULL. */
+		if (MONITOR_MIN_VALUE(count) == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_RESET]->store(
+				MONITOR_MIN_VALUE(count), FALSE));
+			fields[METRIC_MIN_VALUE_RESET]->set_notnull();
+		}
+
+		/* Calculate the max value since the counter started */
+		max_val = srv_mon_calc_max_since_start((monitor_id_t) count);
+
+		if (max_val == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_START]->store(
+				max_val, FALSE));
+			fields[METRIC_MAX_VALUE_START]->set_notnull();
+		}
+
+		/* Calculate the min value since the counter started */
+		min_val = srv_mon_calc_min_since_start((monitor_id_t) count);
+
+		if (min_val == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_START]->store(
+				min_val, FALSE));
+
+			fields[METRIC_MIN_VALUE_START]->set_notnull();
+		}
+
+		/* If the monitor has ever been enabled (whether or not it
+		is currently disabled), fill the METRIC_START_TIME and
+		METRIC_TIME_ELAPSED fields */
+		if (MONITOR_FIELD(count, mon_start_time)) {
+			OK(field_store_time_t(fields[METRIC_START_TIME],
+				(time_t)MONITOR_FIELD(count, mon_start_time)));
+			fields[METRIC_START_TIME]->set_notnull();
+
+			/* If the monitor is enabled, TIME_ELAPSED is the
+			difference between the current time and the time
+			the monitor was enabled. Otherwise, it is the
+			difference between the time the monitor was enabled
+			and the time it was disabled */
+			if (MONITOR_IS_ON(count)) {
+				time_diff = difftime(time(NULL),
+					MONITOR_FIELD(count, mon_start_time));
+			} else {
+				time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_start_time));
+			}
+
+			OK(fields[METRIC_TIME_ELAPSED]->store(
+				time_diff));
+			fields[METRIC_TIME_ELAPSED]->set_notnull();
+		} else {
+			fields[METRIC_START_TIME]->set_null();
+			fields[METRIC_TIME_ELAPSED]->set_null();
+			time_diff = 0;
+		}
+
+		/* Unless MONITOR_NO_AVERAGE is set, we need to calculate
+		the average value. If this is a monitor set owner marked
+		by MONITOR_SET_OWNER, divide the value by another counter
+		(the number of calls) designated by
+		monitor_info->monitor_related_id.
+		Otherwise average the counter value over the time between
+		the time the counter was enabled and the time it was
+		disabled or sampled. */
+		if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+		    && (monitor_info->monitor_type & MONITOR_SET_OWNER)
+		    && monitor_info->monitor_related_id) {
+			mon_type_t	value_start
+				= MONITOR_VALUE_SINCE_START(
+					monitor_info->monitor_related_id);
+
+			if (value_start) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					MONITOR_VALUE_SINCE_START(count)
+					/ value_start, FALSE));
+
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_VALUE(monitor_info->monitor_related_id)) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					MONITOR_VALUE(count)
+					/ MONITOR_VALUE(
+					monitor_info->monitor_related_id),
+					FALSE));
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+			   && !(monitor_info->monitor_type
+				& MONITOR_DISPLAY_CURRENT)) {
+			if (time_diff) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					(double) MONITOR_VALUE_SINCE_START(
+						count) / time_diff));
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				/* calculate the time difference since the
+				last reset */
+				if (MONITOR_IS_ON(count)) {
+					time_diff = difftime(
+						time(NULL), MONITOR_FIELD(
+							count, mon_reset_time));
+				} else {
+					time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_reset_time));
+				}
+			} else {
+				time_diff = 0;
+			}
+
+			if (time_diff) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					static_cast<double>(
+						MONITOR_VALUE(count) / time_diff)));
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else {
+			fields[METRIC_AVG_VALUE_START]->set_null();
+			fields[METRIC_AVG_VALUE_RESET]->set_null();
+		}
+
+		if (MONITOR_IS_ON(count)) {
+			/* If the monitor is on, the stop time is set
+			to NULL */
+			fields[METRIC_STOP_TIME]->set_null();
+
+			/* Display the latest monitor reset time only if the
+			monitor counter is on.
*/ + if (MONITOR_FIELD(count, mon_reset_time)) { + OK(field_store_time_t( + fields[METRIC_RESET_TIME], + (time_t)MONITOR_FIELD( + count, mon_reset_time))); + fields[METRIC_RESET_TIME]->set_notnull(); + } else { + fields[METRIC_RESET_TIME]->set_null(); + } + + /* Display the monitor status as "enabled" */ + OK(field_store_string(fields[METRIC_STATUS], + "enabled")); + } else { + if (MONITOR_FIELD(count, mon_stop_time)) { + OK(field_store_time_t(fields[METRIC_STOP_TIME], + (time_t)MONITOR_FIELD(count, mon_stop_time))); + fields[METRIC_STOP_TIME]->set_notnull(); + } else { + fields[METRIC_STOP_TIME]->set_null(); + } + + fields[METRIC_RESET_TIME]->set_null(); + + OK(field_store_string(fields[METRIC_STATUS], + "disabled")); + } + + if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) { + OK(field_store_string(fields[METRIC_TYPE], + "value")); + } else if (monitor_info->monitor_type & MONITOR_EXISTING) { + OK(field_store_string(fields[METRIC_TYPE], + "status_counter")); + } else if (monitor_info->monitor_type & MONITOR_SET_OWNER) { + OK(field_store_string(fields[METRIC_TYPE], + "set_owner")); + } else if ( monitor_info->monitor_type & MONITOR_SET_MEMBER) { + OK(field_store_string(fields[METRIC_TYPE], + "set_member")); + } else { + OK(field_store_string(fields[METRIC_TYPE], + "counter")); + } + + OK(schema_table_store_record(thd, table_to_fill)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Function to fill information schema metrics tables. +@return 0 on success */ +static +int +i_s_metrics_fill_table( +/*===================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + DBUG_ENTER("i_s_metrics_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + i_s_metrics_fill(thd, tables->table); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_metrics +@return 0 on success */ +static +int +innodb_metrics_init( +/*================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_metrics_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_metrics_fields_info; + schema->fill_table = i_s_metrics_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_metrics = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_METRICS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB Metrics Info"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_metrics_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* 
struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */
+static ST_FIELD_INFO	i_s_stopword_fields_info[] =
+{
+#define STOPWORD_VALUE	0
+	{STRUCT_FLD(field_name,		"value"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_ft_default_stopword.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_stopword_fill(
+/*==============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	Field**	fields;
+	ulint	i = 0;
+	TABLE*	table = (TABLE*) tables->table;
+
+	DBUG_ENTER("i_s_stopword_fill");
+
+	fields = table->field;
+
+	/* Fill with the server default stopword list in array
+	fts_default_stopword */
+	while (fts_default_stopword[i]) {
+		OK(field_store_string(fields[STOPWORD_VALUE],
+				      fts_default_stopword[i]));
+
+		OK(schema_table_store_record(thd, table));
+		i++;
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_ft_default_stopword.
+@return 0 on success */
+static
+int
+i_s_stopword_init(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_stopword_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_stopword_fields_info;
+	schema->fill_table = i_s_stopword_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_default_stopword =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_DEFAULT_STOPWORD"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "Default stopword list for InnoDB Full Text Search"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_stopword_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED and
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED */
+static ST_FIELD_INFO	i_s_fts_doc_fields_info[] =
+{
+#define	I_S_FTS_DOC_ID			0
+	{STRUCT_FLD(field_name,		"DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_generic_fill(
+/*=========================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	ibool		being_deleted)	/*!< in: BEING_DELETED table */
+{
+	Field**		fields;
+	TABLE*		table = (TABLE*) tables->table;
+	trx_t*		trx;
+	fts_table_t	fts_table;
+	fts_doc_ids_t*	deleted;
+	dict_table_t*	user_table;
+
+	DBUG_ENTER("i_s_fts_deleted_generic_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name(
+		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	/* Allocate the doc id vector only after the table open has
+	succeeded, so it is not leaked on the early return above. */
+	deleted = fts_doc_ids_create();
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS DELETE TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table,
+			   (being_deleted) ? "BEING_DELETED" : "DELETED",
+			   FTS_COMMON_TABLE, user_table);
+
+	fts_table_fetch_doc_ids(trx, &fts_table, deleted);
+
+	fields = table->field;
+
+	for (ulint j = 0; j < ib_vector_size(deleted->doc_ids); ++j) {
+		doc_id_t	doc_id;
+
+		doc_id = *(doc_id_t*) ib_vector_get_const(deleted->doc_ids, j);
+
+		OK(fields[I_S_FTS_DOC_ID]->store((longlong) doc_id, true));
+
+		OK(schema_table_store_record(thd, table));
+	}
+
+	trx_free_for_background(trx);
+
+	fts_doc_ids_free(deleted);
+
+	dict_table_close(user_table, FALSE, FALSE);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_fill(
+/*=================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	DBUG_ENTER("i_s_fts_deleted_fill");
+
+	DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, FALSE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return 0 on success */
+static
+int
+i_s_fts_deleted_init(
+/*=================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_deleted_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fill_table = i_s_fts_deleted_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_deleted =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_DELETED"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS DELETED TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_deleted_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_being_deleted_fill(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	DBUG_ENTER("i_s_fts_being_deleted_fill");
+
+	DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success */
+static
+int
+i_s_fts_being_deleted_init(
+/*=======================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_being_deleted_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fill_table = i_s_fts_being_deleted_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_being_deleted =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_BEING_DELETED"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS BEING DELETED TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_being_deleted_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE */
+static ST_FIELD_INFO	i_s_fts_index_fields_info[] =
+{
+#define	I_S_FTS_WORD			0
+	{STRUCT_FLD(field_name,		"WORD"),
+	 STRUCT_FLD(field_length,	FTS_MAX_WORD_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define
I_S_FTS_FIRST_DOC_ID 1 + {STRUCT_FLD(field_name, "FIRST_DOC_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define I_S_FTS_LAST_DOC_ID 2 + {STRUCT_FLD(field_name, "LAST_DOC_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define I_S_FTS_DOC_COUNT 3 + {STRUCT_FLD(field_name, "DOC_COUNT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define I_S_FTS_ILIST_DOC_ID 4 + {STRUCT_FLD(field_name, "DOC_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define I_S_FTS_ILIST_DOC_POS 5 + {STRUCT_FLD(field_name, "POSITION"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Go through the Doc Node and its ilist, fill the dynamic table +INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED for one FTS index on the table. 
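+As read by the decode loop below (a reading of the loop, not a format
+specification): each node's ilist is a sequence of variable-length-coded
+numbers, a doc id delta followed by that document's word positions,
+terminated by a 0 byte; see fts_decode_vlc().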
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill_one_index(
+/*===============================*/
+	fts_index_cache_t*	index_cache,	/*!< in: FTS index cache */
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables)		/*!< in/out: tables to fill */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	Field**			fields;
+	CHARSET_INFO*		index_charset;
+	const ib_rbt_node_t*	rbt_node;
+	fts_string_t		conv_str;
+	uint			dummy_errors;
+	char*			word_str;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+	fields = table->field;
+
+	index_charset = index_cache->charset;
+	conv_str.f_len = system_charset_info->mbmaxlen
+		* FTS_MAX_WORD_LEN_IN_CHAR;
+	conv_str.f_str = static_cast<byte*>(ut_malloc(conv_str.f_len));
+	conv_str.f_n_char = 0;
+
+	/* Go through each word in the index cache */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node;
+	     rbt_node = rbt_next(index_cache->words, rbt_node)) {
+		doc_id_t	doc_id = 0;
+
+		fts_tokenizer_word_t* word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		/* Convert word from index charset to system_charset_info */
+		if (index_charset->cset != system_charset_info->cset) {
+			conv_str.f_n_char = my_convert(
+				reinterpret_cast<char*>(conv_str.f_str),
+				static_cast<uint32>(conv_str.f_len),
+				system_charset_info,
+				reinterpret_cast<char*>(word->text.f_str),
+				static_cast<uint32>(word->text.f_len),
+				index_charset, &dummy_errors);
+			ut_ad(conv_str.f_n_char <= conv_str.f_len);
+			conv_str.f_str[conv_str.f_n_char] = 0;
+			word_str = reinterpret_cast<char*>(conv_str.f_str);
+		} else {
+			word_str = reinterpret_cast<char*>(word->text.f_str);
+		}
+
+		/* Decode the ilist, and display Doc ID and word position */
+		for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+			fts_node_t*	node;
+			byte*		ptr;
+			ulint		decoded = 0;
+
+			node = static_cast<fts_node_t*> (ib_vector_get(
+				word->nodes, i));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+				ulint	pos = fts_decode_vlc(&ptr);
+
+				doc_id += pos;
+
+				/* Get position info */
+				while (*ptr) {
+					pos = fts_decode_vlc(&ptr);
+
+					OK(field_store_string(
+						fields[I_S_FTS_WORD],
+						word_str));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						(longlong) node->first_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						(longlong) node->last_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						static_cast<double>(node->doc_count)));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						(longlong) doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						static_cast<double>(pos)));
+
+					OK(schema_table_store_record(
+						thd, table));
+				}
+
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	ut_free(conv_str.f_str);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill(
+/*=====================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	dict_table_t*	user_table;
+	fts_cache_t*	cache;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name(
+		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	cache = user_table->fts->cache;
+
+	ut_a(cache);
+
+	for
(ulint i = 0; i < ib_vector_size(cache->indexes); i++) { + fts_index_cache_t* index_cache; + + index_cache = static_cast<fts_index_cache_t*> ( + ib_vector_get(cache->indexes, i)); + + i_s_fts_index_cache_fill_one_index(index_cache, thd, tables); + } + + dict_table_close(user_table, FALSE, FALSE); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE +@return 0 on success */ +static +int +i_s_fts_index_cache_init( +/*=====================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_index_cache_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_fts_index_fields_info; + schema->fill_table = i_s_fts_index_cache_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_index_cache = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_FT_INDEX_CACHE"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX CACHED"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_fts_index_cache_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/*******************************************************************//** +Go through a FTS index auxiliary table, fetch its rows and fill +FTS word cache structure. 
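+The rows are fetched through the InnoDB internal SQL parser with a
+"word >= :word" predicate, so a fetch that stops early because the
+result cache limit was exceeded (DB_FTS_EXCEED_RESULT_CACHE_LIMIT)
+can later be resumed from the last word fetched.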
+@return DB_SUCCESS on success, otherwise error code */ +static +dberr_t +i_s_fts_index_table_fill_selected( +/*==============================*/ + dict_index_t* index, /*!< in: FTS index */ + ib_vector_t* words, /*!< in/out: vector to hold + fetched words */ + ulint selected, /*!< in: selected FTS index */ + fts_string_t* word) /*!< in: word to select */ +{ + pars_info_t* info; + fts_table_t fts_table; + trx_t* trx; + que_t* graph; + dberr_t error; + fts_fetch_t fetch; + + info = pars_info_create(); + + fetch.read_arg = words; + fetch.read_record = fts_optimize_index_fetch_node; + fetch.total_memory = 0; + + DBUG_EXECUTE_IF("fts_instrument_result_cache_limit", + fts_result_cache_limit = 8192; + ); + + trx = trx_allocate_for_background(); + + trx->op_info = "fetching FTS index nodes"; + + pars_info_bind_function(info, "my_func", fetch.read_record, &fetch); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + FTS_INIT_INDEX_TABLE(&fts_table, fts_get_suffix(selected), + FTS_INDEX_TABLE, index); + + graph = fts_parse_sql( + &fts_table, info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT word, doc_count, first_doc_id, last_doc_id, " + "ilist\n" + " FROM %s WHERE word >= :word;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for(;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + fts_sql_commit(trx); + + break; + } else { + fts_sql_rollback(trx); + + ut_print_timestamp(stderr); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + fprintf(stderr, " InnoDB: Warning: " + "lock wait timeout reading " + "FTS index. Retrying!\n"); + + trx->error_state = DB_SUCCESS; + } else { + fprintf(stderr, " InnoDB: Error: %d " + "while reading FTS index.\n", error); + break; + } + } + } + + mutex_enter(&dict_sys->mutex); + que_graph_free(graph); + mutex_exit(&dict_sys->mutex); + + trx_free_for_background(trx); + + if (fetch.total_memory >= fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + } + + return(error); +} + +/*******************************************************************//** +Free words. */ +static +void +i_s_fts_index_table_free_one_fetch( +/*===============================*/ + ib_vector_t* words) /*!< in: words fetched */ +{ + for (ulint i = 0; i < ib_vector_size(words); i++) { + fts_word_t* word; + + word = static_cast<fts_word_t*>(ib_vector_get(words, i)); + + for (ulint j = 0; j < ib_vector_size(word->nodes); j++) { + fts_node_t* node; + + node = static_cast<fts_node_t*> (ib_vector_get( + word->nodes, j)); + ut_free(node->ilist); + } + + fts_word_free(word); + } + + ib_vector_reset(words); +} + +/*******************************************************************//** +Go through words, fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE. 
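+If has_more is set, the last word in the vector was only partially
+fetched, so it is skipped here; the caller re-fetches it (and whatever
+follows it) in the next round.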
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill_one_fetch(
+/*===============================*/
+	CHARSET_INFO*	index_charset,	/*!< in: FTS index charset */
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	ib_vector_t*	words,		/*!< in: words fetched */
+	fts_string_t*	conv_str,	/*!< in: string for conversion*/
+	bool		has_more)	/*!< in: has more to fetch */
+{
+	TABLE*		table = (TABLE*) tables->table;
+	Field**		fields;
+	uint		dummy_errors;
+	char*		word_str;
+	ulint		words_size;
+	int		ret = 0;
+
+	DBUG_ENTER("i_s_fts_index_table_fill_one_fetch");
+
+	fields = table->field;
+
+	words_size = ib_vector_size(words);
+	if (has_more) {
+		/* the last word is not fetched completely. */
+		ut_ad(words_size > 1);
+		words_size -= 1;
+	}
+
+	/* Go through each word in the index cache */
+	for (ulint i = 0; i < words_size; i++) {
+		fts_word_t*	word;
+
+		word = static_cast<fts_word_t*>(ib_vector_get(words, i));
+
+		word->text.f_str[word->text.f_len] = 0;
+
+		/* Convert word from index charset to system_charset_info */
+		if (index_charset->cset != system_charset_info->cset) {
+			conv_str->f_n_char = my_convert(
+				reinterpret_cast<char*>(conv_str->f_str),
+				static_cast<uint32>(conv_str->f_len),
+				system_charset_info,
+				reinterpret_cast<char*>(word->text.f_str),
+				static_cast<uint32>(word->text.f_len),
+				index_charset, &dummy_errors);
+			ut_ad(conv_str->f_n_char <= conv_str->f_len);
+			conv_str->f_str[conv_str->f_n_char] = 0;
+			word_str = reinterpret_cast<char*>(conv_str->f_str);
+		} else {
+			word_str = reinterpret_cast<char*>(word->text.f_str);
+		}
+
+		/* Decode the ilist, and display Doc ID and word position */
+		for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+			fts_node_t*	node;
+			byte*		ptr;
+			ulint		decoded = 0;
+			doc_id_t	doc_id = 0;
+
+			node = static_cast<fts_node_t*> (ib_vector_get(
+				word->nodes, i));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+				ulint	pos = fts_decode_vlc(&ptr);
+
+				doc_id += pos;
+
+				/* Get position info */
+				while (*ptr) {
+					pos = fts_decode_vlc(&ptr);
+
+					OK(field_store_string(
+						fields[I_S_FTS_WORD],
+						word_str));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						(longlong) node->first_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						(longlong) node->last_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						static_cast<double>(node->doc_count)));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						(longlong) doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						static_cast<double>(pos)));
+
+					OK(schema_table_store_record(
+						thd, table));
+				}
+
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	i_s_fts_index_table_free_one_fetch(words);
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Go through a FTS index and its auxiliary tables, fetch rows in each table
+and fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE.
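+An FTS index is spread over the auxiliary tables selected by
+fts_index_selector; each one is drained in fetch-sized chunks,
+restarting from the last fetched word whenever the result cache
+limit is hit.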
+@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill_one_index( +/*===============================*/ + dict_index_t* index, /*!< in: FTS index */ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables) /*!< in/out: tables to fill */ +{ + ib_vector_t* words; + mem_heap_t* heap; + fts_string_t word; + CHARSET_INFO* index_charset; + fts_string_t conv_str; + dberr_t error; + int ret = 0; + + DBUG_ENTER("i_s_fts_index_table_fill_one_index"); + DBUG_ASSERT(!dict_index_is_online_ddl(index)); + + heap = mem_heap_create(1024); + + words = ib_vector_create(ib_heap_allocator_create(heap), + sizeof(fts_word_t), 256); + + word.f_str = NULL; + word.f_len = 0; + word.f_n_char = 0; + + index_charset = fts_index_get_charset(index); + conv_str.f_len = system_charset_info->mbmaxlen + * FTS_MAX_WORD_LEN_IN_CHAR; + conv_str.f_str = static_cast<byte*>(ut_malloc(conv_str.f_len)); + conv_str.f_n_char = 0; + + /* Iterate through each auxiliary table as described in + fts_index_selector */ + for (ulint selected = 0; fts_index_selector[selected].value; + selected++) { + bool has_more = false; + + do { + /* Fetch from index */ + error = i_s_fts_index_table_fill_selected( + index, words, selected, &word); + + if (error == DB_SUCCESS) { + has_more = false; + } else if (error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT) { + has_more = true; + } else { + i_s_fts_index_table_free_one_fetch(words); + ret = 1; + goto func_exit; + } + + if (has_more) { + fts_word_t* last_word; + + /* Prepare start point for next fetch */ + last_word = static_cast<fts_word_t*>(ib_vector_last(words)); + ut_ad(last_word != NULL); + fts_utf8_string_dup(&word, &last_word->text, heap); + } + + /* Fill into tables */ + ret = i_s_fts_index_table_fill_one_fetch( + index_charset, thd, tables, words, &conv_str, has_more); + + if (ret != 0) { + i_s_fts_index_table_free_one_fetch(words); + goto func_exit; + } + } while (has_more); + } + +func_exit: + ut_free(conv_str.f_str); + mem_heap_free(heap); + + DBUG_RETURN(ret); +} +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE +@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill( +/*=====================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + dict_table_t* user_table; + dict_index_t* index; + + DBUG_ENTER("i_s_fts_index_table_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + if (!fts_internal_tbl_name) { + DBUG_RETURN(0); + } + + user_table = dict_table_open_on_name( + fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); + + if (!user_table) { + DBUG_RETURN(0); + } + + for (index = dict_table_get_first_index(user_table); + index; index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + i_s_fts_index_table_fill_one_index(index, thd, tables); + } + } + + dict_table_close(user_table, FALSE, FALSE); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE +@return 0 on success */ +static +int +i_s_fts_index_table_init( +/*=====================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_index_table_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_fts_index_fields_info; + schema->fill_table = i_s_fts_index_table_fill; + + 
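+	/* Note that i_s_fts_index_fields_info is shared with the
+	INNODB_FT_INDEX_CACHE table above; only the fill function
+	differs between the two plugins. */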
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_ft_index_table =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_INDEX_TABLE"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_index_table_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */
+static ST_FIELD_INFO	i_s_fts_config_fields_info[] =
+{
+#define	FTS_CONFIG_KEY			0
+	{STRUCT_FLD(field_name,		"KEY"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	FTS_CONFIG_VALUE		1
+	{STRUCT_FLD(field_name,		"VALUE"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+static const char* fts_config_key[] = {
+	FTS_OPTIMIZE_LIMIT_IN_SECS,
+	FTS_SYNCED_DOC_ID,
+	FTS_STOPWORD_TABLE_NAME,
+	FTS_USE_STOPWORD,
+        NULL
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_config_fill(
+/*================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	Field**		fields;
+	TABLE*		table = (TABLE*) tables->table;
+	trx_t*		trx;
+	fts_table_t	fts_table;
+	dict_table_t*	user_table;
+	ulint		i = 0;
+	dict_index_t*	index = NULL;
+	unsigned char	str[FTS_MAX_CONFIG_VALUE_LEN + 1];
+
+	DBUG_ENTER("i_s_fts_config_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	fields = table->field;
+
+	user_table = dict_table_open_on_name(
+		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS CONFIG TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table);
+
+	if (!ib_vector_is_empty(user_table->fts->indexes)) {
+		index = (dict_index_t*) ib_vector_getp_const(
+				user_table->fts->indexes, 0);
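+		/* Keep the first full-text index at hand: it is only
+		needed below for config keys that are stored per index,
+		such as FTS_TOTAL_WORD_COUNT; the keys listed in
+		fts_config_key[] above are all per-table ones. */
+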
DBUG_ASSERT(!dict_index_is_online_ddl(index)); + } + + while (fts_config_key[i]) { + fts_string_t value; + char* key_name; + ulint allocated = FALSE; + + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + + value.f_str = str; + + if (index + && strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0) { + key_name = fts_config_create_index_param_name( + fts_config_key[i], index); + allocated = TRUE; + } else { + key_name = (char*) fts_config_key[i]; + } + + fts_config_get_value(trx, &fts_table, key_name, &value); + + if (allocated) { + ut_free(key_name); + } + + OK(field_store_string( + fields[FTS_CONFIG_KEY], fts_config_key[i])); + + OK(field_store_string( + fields[FTS_CONFIG_VALUE], (const char*) value.f_str)); + + OK(schema_table_store_record(thd, table)); + + i++; + } + + fts_sql_commit(trx); + + trx_free_for_background(trx); + + dict_table_close(user_table, FALSE, FALSE); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG +@return 0 on success */ +static +int +i_s_fts_config_init( +/*=================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_config_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_fts_config_fields_info; + schema->fill_table = i_s_fts_config_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_config = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_FT_CONFIG"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "INNODB AUXILIARY FTS CONFIG TABLE"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_fts_config_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. 
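+One row is produced per buffer pool instance; the counters are broadly
+the ones shown in the BUFFER POOL AND MEMORY section of SHOW ENGINE
+INNODB STATUS.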
*/ +static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] = +{ +#define IDX_BUF_STATS_POOL_ID 0 + {STRUCT_FLD(field_name, "POOL_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_POOL_SIZE 1 + {STRUCT_FLD(field_name, "POOL_SIZE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_FREE_BUFFERS 2 + {STRUCT_FLD(field_name, "FREE_BUFFERS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_LRU_LEN 3 + {STRUCT_FLD(field_name, "DATABASE_PAGES"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_OLD_LRU_LEN 4 + {STRUCT_FLD(field_name, "OLD_DATABASE_PAGES"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_FLUSH_LIST_LEN 5 + {STRUCT_FLD(field_name, "MODIFIED_DATABASE_PAGES"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PENDING_ZIP 6 + {STRUCT_FLD(field_name, "PENDING_DECOMPRESS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PENDING_READ 7 + {STRUCT_FLD(field_name, "PENDING_READS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_FLUSH_LRU 8 + {STRUCT_FLD(field_name, "PENDING_FLUSH_LRU"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_FLUSH_LIST 9 + {STRUCT_FLD(field_name, "PENDING_FLUSH_LIST"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_YOUNG 10 + {STRUCT_FLD(field_name, "PAGES_MADE_YOUNG"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + 
STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_NOT_YOUNG 11 + {STRUCT_FLD(field_name, "PAGES_NOT_MADE_YOUNG"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_YOUNG_RATE 12 + {STRUCT_FLD(field_name, "PAGES_MADE_YOUNG_RATE"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13 + {STRUCT_FLD(field_name, "PAGES_MADE_NOT_YOUNG_RATE"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_READ 14 + {STRUCT_FLD(field_name, "NUMBER_PAGES_READ"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_CREATED 15 + {STRUCT_FLD(field_name, "NUMBER_PAGES_CREATED"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_WRITTEN 16 + {STRUCT_FLD(field_name, "NUMBER_PAGES_WRITTEN"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_READ_RATE 17 + {STRUCT_FLD(field_name, "PAGES_READ_RATE"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_CREATE_RATE 18 + {STRUCT_FLD(field_name, "PAGES_CREATE_RATE"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_PAGE_WRITTEN_RATE 19 + {STRUCT_FLD(field_name, "PAGES_WRITTEN_RATE"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_GET 20 + {STRUCT_FLD(field_name, "NUMBER_PAGES_GET"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_HIT_RATE 21 + {STRUCT_FLD(field_name, "HIT_RATE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, 
MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_MADE_YOUNG_PCT 22 + {STRUCT_FLD(field_name, "YOUNG_MAKE_PER_THOUSAND_GETS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_NOT_MADE_YOUNG_PCT 23 + {STRUCT_FLD(field_name, "NOT_YOUNG_MAKE_PER_THOUSAND_GETS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_READ_AHREAD 24 + {STRUCT_FLD(field_name, "NUMBER_PAGES_READ_AHEAD"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_READ_AHEAD_EVICTED 25 + {STRUCT_FLD(field_name, "NUMBER_READ_AHEAD_EVICTED"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_READ_AHEAD_RATE 26 + {STRUCT_FLD(field_name, "READ_AHEAD_RATE"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27 + {STRUCT_FLD(field_name, "READ_AHEAD_EVICTED_RATE"), + STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH), + STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_LRU_IO_SUM 28 + {STRUCT_FLD(field_name, "LRU_IO_TOTAL"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_LRU_IO_CUR 29 + {STRUCT_FLD(field_name, "LRU_IO_CURRENT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_UNZIP_SUM 30 + {STRUCT_FLD(field_name, "UNCOMPRESS_TOTAL"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_STATS_UNZIP_CUR 31 + {STRUCT_FLD(field_name, "UNCOMPRESS_CURRENT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + 
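+/* A usage sketch (column names as defined above):
+
+	SELECT POOL_ID, POOL_SIZE, FREE_BUFFERS, DATABASE_PAGES, HIT_RATE
+	FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS;
+
+HIT_RATE is reported per thousand page gets and is computed from the
+delta since the previous sampling; see the n_page_get_delta branch in
+the fill function below. */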
+/*******************************************************************//** +Fill Information Schema table INNODB_BUFFER_POOL_STATS for a particular +buffer pool +@return 0 on success, 1 on failure */ +static +int +i_s_innodb_stats_fill( +/*==================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + const buf_pool_info_t* info) /*!< in: buffer pool + information */ +{ + TABLE* table; + Field** fields; + + DBUG_ENTER("i_s_innodb_stats_fill"); + + table = tables->table; + + fields = table->field; + + OK(fields[IDX_BUF_STATS_POOL_ID]->store( + static_cast<double>(info->pool_unique_id))); + + OK(fields[IDX_BUF_STATS_POOL_SIZE]->store( + static_cast<double>(info->pool_size))); + + OK(fields[IDX_BUF_STATS_LRU_LEN]->store( + static_cast<double>(info->lru_len))); + + OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store( + static_cast<double>(info->old_lru_len))); + + OK(fields[IDX_BUF_STATS_FREE_BUFFERS]->store( + static_cast<double>(info->free_list_len))); + + OK(fields[IDX_BUF_STATS_FLUSH_LIST_LEN]->store( + static_cast<double>(info->flush_list_len))); + + OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store( + static_cast<double>(info->n_pend_unzip))); + + OK(fields[IDX_BUF_STATS_PENDING_READ]->store( + static_cast<double>(info->n_pend_reads))); + + OK(fields[IDX_BUF_STATS_FLUSH_LRU]->store( + static_cast<double>(info->n_pending_flush_lru))); + + OK(fields[IDX_BUF_STATS_FLUSH_LIST]->store( + static_cast<double>(info->n_pending_flush_list))); + + OK(fields[IDX_BUF_STATS_PAGE_YOUNG]->store( + static_cast<double>(info->n_pages_made_young))); + + OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG]->store( + static_cast<double>(info->n_pages_not_made_young))); + + OK(fields[IDX_BUF_STATS_PAGE_YOUNG_RATE]->store( + info->page_made_young_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE]->store( + info->page_not_made_young_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_READ]->store( + static_cast<double>(info->n_pages_read))); + + OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store( + static_cast<double>(info->n_pages_created))); + + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store( + static_cast<double>(info->n_pages_written))); + + OK(fields[IDX_BUF_STATS_GET]->store( + static_cast<double>(info->n_page_gets))); + + OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store( + info->pages_read_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_CREATE_RATE]->store( + info->pages_created_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_RATE]->store( + info->pages_written_rate)); + + if (info->n_page_get_delta) { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store( + static_cast<double>( + 1000 - (1000 * info->page_read_delta + / info->n_page_get_delta)))); + + OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store( + static_cast<double>( + 1000 * info->young_making_delta + / info->n_page_get_delta))); + + OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store( + static_cast<double>( + 1000 * info->not_young_making_delta + / info->n_page_get_delta))); + } else { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0)); + OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(0)); + OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(0)); + } + + OK(fields[IDX_BUF_STATS_READ_AHREAD]->store( + static_cast<double>(info->n_ra_pages_read))); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICTED]->store( + static_cast<double>(info->n_ra_pages_evicted))); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_RATE]->store( + info->pages_readahead_rate)); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICT_RATE]->store( + info->pages_evicted_rate)); + + OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store( 
+ static_cast<double>(info->io_sum)));
+
+ OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(
+ static_cast<double>(info->io_cur)));
+
+ OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(
+ static_cast<double>(info->unzip_sum)));
+
+ OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store(
+ static_cast<double>(info->unzip_cur)));
+
+ DBUG_RETURN(schema_table_store_record(thd, table));
+}
+
+/*******************************************************************//**
+This is the function that loops through each buffer pool and fetches the
+buffer pool stats into the information schema table
+INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_stats_fill_table(
+/*===============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ int status = 0;
+ buf_pool_info_t* pool_info;
+
+ DBUG_ENTER("i_s_innodb_buffer_stats_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* Only allow the PROCESS privilege holder to access the stats */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ pool_info = (buf_pool_info_t*) mem_zalloc(
+ srv_buf_pool_instances * sizeof *pool_info);
+
+ /* Walk through each buffer pool */
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch individual buffer pool info */
+ buf_stats_get_pool_info(buf_pool, i, pool_info);
+
+ status = i_s_innodb_stats_fill(thd, tables, &pool_info[i]);
+
+ /* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+ }
+
+ mem_free(pool_info);
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS.
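+Once bound, the server materializes this table afresh on every query by
+calling i_s_innodb_buffer_stats_fill_table(), yielding one row per
+buffer pool instance. Note that HIT_RATE and the *_PCT columns filled
+above are scaled per thousand, not per cent.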
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_pool_stats_init(
+/*==============================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("i_s_innodb_buffer_pool_stats_init");
+
+ schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+ schema->fields_info = i_s_innodb_buffer_stats_fields_info;
+ schema->fill_table = i_s_innodb_buffer_stats_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_stats =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_STATS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB Buffer Pool Statistics Information"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_stats_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INNODB_BUFFER_PAGE.
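+Each row describes a single block of a single buffer pool instance, so a
+full scan returns one row per page frame in the server.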
*/ +static ST_FIELD_INFO i_s_innodb_buffer_page_fields_info[] = +{ +#define IDX_BUFFER_POOL_ID 0 + {STRUCT_FLD(field_name, "POOL_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_BLOCK_ID 1 + {STRUCT_FLD(field_name, "BLOCK_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_SPACE 2 + {STRUCT_FLD(field_name, "SPACE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_NUM 3 + {STRUCT_FLD(field_name, "PAGE_NUMBER"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_TYPE 4 + {STRUCT_FLD(field_name, "PAGE_TYPE"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_FLUSH_TYPE 5 + {STRUCT_FLD(field_name, "FLUSH_TYPE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_FIX_COUNT 6 + {STRUCT_FLD(field_name, "FIX_COUNT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_HASHED 7 + {STRUCT_FLD(field_name, "IS_HASHED"), + STRUCT_FLD(field_length, 3), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_NEWEST_MOD 8 + {STRUCT_FLD(field_name, "NEWEST_MODIFICATION"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_OLDEST_MOD 9 + {STRUCT_FLD(field_name, "OLDEST_MODIFICATION"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_ACCESS_TIME 10 + {STRUCT_FLD(field_name, "ACCESS_TIME"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, 
SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_TABLE_NAME 11 + {STRUCT_FLD(field_name, "TABLE_NAME"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_INDEX_NAME 12 + {STRUCT_FLD(field_name, "INDEX_NAME"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_NUM_RECS 13 + {STRUCT_FLD(field_name, "NUMBER_RECORDS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_DATA_SIZE 14 + {STRUCT_FLD(field_name, "DATA_SIZE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_ZIP_SIZE 15 + {STRUCT_FLD(field_name, "COMPRESSED_SIZE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_STATE 16 + {STRUCT_FLD(field_name, "PAGE_STATE"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_IO_FIX 17 + {STRUCT_FLD(field_name, "IO_FIX"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_IS_OLD 18 + {STRUCT_FLD(field_name, "IS_OLD"), + STRUCT_FLD(field_length, 3), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUFFER_PAGE_FREE_CLOCK 19 + {STRUCT_FLD(field_name, "FREE_PAGE_CLOCK"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Fill Information Schema table INNODB_BUFFER_PAGE with information +cached in the buf_page_info_t array +@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_fill( +/*========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + const buf_page_info_t* info_array, /*!< in: array cached page + info */ + ulint num_page) /*!< in: number of page info + cached */ +{ + TABLE* table; + Field** fields; + + DBUG_ENTER("i_s_innodb_buffer_page_fill"); + + table = tables->table; + + fields = table->field; + + /* Iterate through the cached array and fill 
the I_S table rows */ + for (ulint i = 0; i < num_page; i++) { + const buf_page_info_t* page_info; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; + const char* state_str; + enum buf_page_state state; + + page_info = info_array + i; + + state_str = NULL; + + OK(fields[IDX_BUFFER_POOL_ID]->store( + static_cast<double>(page_info->pool_id))); + + OK(fields[IDX_BUFFER_BLOCK_ID]->store( + static_cast<double>(page_info->block_id))); + + OK(fields[IDX_BUFFER_PAGE_SPACE]->store( + static_cast<double>(page_info->space_id))); + + OK(fields[IDX_BUFFER_PAGE_NUM]->store( + static_cast<double>(page_info->page_num))); + + OK(field_store_string( + fields[IDX_BUFFER_PAGE_TYPE], + i_s_page_type[page_info->page_type].type_str)); + + OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store( + page_info->flush_type)); + + OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store( + page_info->fix_count)); + + if (page_info->hashed) { + OK(field_store_string( + fields[IDX_BUFFER_PAGE_HASHED], "YES")); + } else { + OK(field_store_string( + fields[IDX_BUFFER_PAGE_HASHED], "NO")); + } + + OK(fields[IDX_BUFFER_PAGE_NEWEST_MOD]->store( + (longlong) page_info->newest_mod, true)); + + OK(fields[IDX_BUFFER_PAGE_OLDEST_MOD]->store( + (longlong) page_info->oldest_mod, true)); + + OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store( + page_info->access_time)); + + fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUFFER_PAGE_INDEX_NAME]->set_null(); + + /* If this is an index page, fetch the index name + and table name */ + if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { + const dict_index_t* index; + + mutex_enter(&dict_sys->mutex); + index = dict_index_get_if_in_cache_low( + page_info->index_id); + + if (index) { + + table_name_end = innobase_convert_name( + table_name, sizeof(table_name), + index->table_name, + strlen(index->table_name), + thd, TRUE); + + OK(fields[IDX_BUFFER_PAGE_TABLE_NAME]->store( + table_name, + static_cast<uint>(table_name_end - table_name), + system_charset_info)); + fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_notnull(); + + OK(field_store_index_name( + fields[IDX_BUFFER_PAGE_INDEX_NAME], + index->name)); + } + + mutex_exit(&dict_sys->mutex); + } + + OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store( + page_info->num_recs)); + + OK(fields[IDX_BUFFER_PAGE_DATA_SIZE]->store( + page_info->data_size)); + + OK(fields[IDX_BUFFER_PAGE_ZIP_SIZE]->store( + page_info->zip_ssize + ? 
(UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize + : 0)); + +#if BUF_PAGE_STATE_BITS > 3 +# error "BUF_PAGE_STATE_BITS > 3, please ensure that all 1<<BUF_PAGE_STATE_BITS values are checked for" +#endif + state = static_cast<enum buf_page_state>(page_info->page_state); + + switch (state) { + /* First three states are for compression pages and + are not states we would get as we scan pages through + buffer blocks */ + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + state_str = NULL; + break; + case BUF_BLOCK_NOT_USED: + state_str = "NOT_USED"; + break; + case BUF_BLOCK_READY_FOR_USE: + state_str = "READY_FOR_USE"; + break; + case BUF_BLOCK_FILE_PAGE: + state_str = "FILE_PAGE"; + break; + case BUF_BLOCK_MEMORY: + state_str = "MEMORY"; + break; + case BUF_BLOCK_REMOVE_HASH: + state_str = "REMOVE_HASH"; + break; + }; + + OK(field_store_string(fields[IDX_BUFFER_PAGE_STATE], + state_str)); + + switch (page_info->io_fix) { + case BUF_IO_NONE: + OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX], + "IO_NONE")); + break; + case BUF_IO_READ: + OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX], + "IO_READ")); + break; + case BUF_IO_WRITE: + OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX], + "IO_WRITE")); + break; + case BUF_IO_PIN: + OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX], + "IO_PIN")); + break; + } + + OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD], + (page_info->is_old) ? "YES" : "NO")); + + OK(fields[IDX_BUFFER_PAGE_FREE_CLOCK]->store( + page_info->freed_page_clock)); + + if (schema_table_store_record(thd, table)) { + DBUG_RETURN(1); + } + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Set appropriate page type to a buf_page_info_t structure */ +static +void +i_s_innodb_set_page_type( +/*=====================*/ + buf_page_info_t*page_info, /*!< in/out: structure to fill with + scanned info */ + ulint page_type, /*!< in: page type */ + const byte* frame) /*!< in: buffer frame */ +{ + if (page_type == FIL_PAGE_INDEX) { + const page_t* page = (const page_t*) frame; + + page_info->index_id = btr_page_get_index_id(page); + + /* FIL_PAGE_INDEX is a bit special, its value + is defined as 17855, so we cannot use FIL_PAGE_INDEX + to index into i_s_page_type[] array, its array index + in the i_s_page_type[] array is I_S_PAGE_TYPE_INDEX + (1) for index pages or I_S_PAGE_TYPE_IBUF for + change buffer index pages */ + if (page_info->index_id + == static_cast<index_id_t>(DICT_IBUF_ID_MIN + + IBUF_SPACE_ID)) { + page_info->page_type = I_S_PAGE_TYPE_IBUF; + } else { + page_info->page_type = I_S_PAGE_TYPE_INDEX; + } + + page_info->data_size = (ulint)(page_header_get_field( + page, PAGE_HEAP_TOP) - (page_is_comp(page) + ? 
PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE));
+
+ page_info->num_recs = page_get_n_recs(page);
+ } else if (page_type > FIL_PAGE_TYPE_LAST) {
+ /* Encountered an unknown page type */
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ } else {
+ /* Make sure we get the right index into the
+ i_s_page_type[] array */
+ ut_a(page_type == i_s_page_type[page_type].type_value);
+
+ page_info->page_type = page_type;
+ }
+
+ if (page_info->page_type == FIL_PAGE_TYPE_ZBLOB
+ || page_info->page_type == FIL_PAGE_TYPE_ZBLOB2) {
+ page_info->page_num = mach_read_from_4(
+ frame + FIL_PAGE_OFFSET);
+ page_info->space_id = mach_read_from_4(
+ frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+}
+/*******************************************************************//**
+Scans pages in the buffer cache and collects their general information
+into the buf_page_info_t array, which is zero-filled beforehand, so any
+field that is not initialized in this function defaults to 0 */
+static
+void
+i_s_innodb_buffer_page_get_info(
+/*============================*/
+ const buf_page_t*bpage, /*!< in: buffer pool page to scan */
+ ulint pool_id, /*!< in: buffer pool id */
+ ulint pos, /*!< in: buffer block position in
+ buffer pool or in the LRU list */
+ buf_page_info_t*page_info) /*!< in: zero filled info structure;
+ out: structure filled with scanned
+ info */
+{
+ ib_mutex_t* mutex = buf_page_get_mutex(bpage);
+
+ ut_ad(pool_id < MAX_BUFFER_POOLS);
+
+ page_info->pool_id = pool_id;
+
+ page_info->block_id = pos;
+
+ mutex_enter(mutex);
+
+ page_info->page_state = buf_page_get_state(bpage);
+
+ /* Only fetch information for buffers that map to a tablespace,
+ that is, buffer pages with state BUF_BLOCK_ZIP_PAGE,
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_FILE_PAGE */
+ if (buf_page_in_file(bpage)) {
+ const byte* frame;
+ ulint page_type;
+
+ page_info->space_id = buf_page_get_space(bpage);
+
+ page_info->page_num = buf_page_get_page_no(bpage);
+
+ page_info->flush_type = bpage->flush_type;
+
+ page_info->fix_count = bpage->buf_fix_count;
+
+ page_info->newest_mod = bpage->newest_modification;
+
+ page_info->oldest_mod = bpage->oldest_modification;
+
+ page_info->access_time = bpage->access_time;
+
+ page_info->zip_ssize = bpage->zip.ssize;
+
+ page_info->io_fix = bpage->io_fix;
+
+ page_info->is_old = bpage->old;
+
+ page_info->freed_page_clock = bpage->freed_page_clock;
+
+ /* A block that is still being read in is not yet fully
+ initialized; report its type as unknown and do not
+ inspect the frame */
+ switch (buf_page_get_io_fix(bpage)) {
+ case BUF_IO_NONE:
+ case BUF_IO_WRITE:
+ case BUF_IO_PIN:
+ break;
+ case BUF_IO_READ:
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ mutex_exit(mutex);
+ return;
+ }
+
+ if (page_info->page_state == BUF_BLOCK_FILE_PAGE) {
+ const buf_block_t*block;
+
+ block = reinterpret_cast<const buf_block_t*>(bpage);
+ frame = block->frame;
+ page_info->hashed = (block->index != NULL);
+ } else {
+ ut_ad(page_info->zip_ssize);
+ frame = bpage->zip.data;
+ }
+
+ page_type = fil_page_get_type(frame);
+
+ i_s_innodb_set_page_type(page_info, page_type, frame);
+ } else {
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ }
+
+ mutex_exit(mutex);
+}
+
+/*******************************************************************//**
+This is the function that goes through each block of the buffer pool
+and fetches information for the information schema table INNODB_BUFFER_PAGE.
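+Page metadata is first copied into a local info array of at most
+MAX_BUF_INFO_CACHED entries while only per-block mutexes are held; the
+rows are then stored into the I_S table with no buffer pool mutex held.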
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_fill_buffer_pool(
+/*========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ buf_pool_t* buf_pool, /*!< in: buffer pool to scan */
+ const ulint pool_id) /*!< in: buffer pool id */
+{
+ int status = 0;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("i_s_innodb_fill_buffer_pool");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ heap = mem_heap_create(10000);
+
+ /* Go through each chunk of the buffer pool. Currently, we only
+ have a single chunk per buffer pool */
+ for (ulint n = 0; n < buf_pool->n_chunks; n++) {
+ const buf_block_t* block;
+ ulint n_blocks;
+ buf_page_info_t* info_buffer;
+ ulint num_page;
+ ulint mem_size;
+ ulint chunk_size;
+ ulint num_to_process = 0;
+ ulint block_id = 0;
+
+ /* Get buffer block of the nth chunk */
+ block = buf_get_nth_chunk_block(buf_pool, n, &chunk_size);
+ num_page = 0;
+
+ while (chunk_size > 0) {
+ /* We cache at most MAX_BUF_INFO_CACHED buffer
+ page info structures at a time */
+ num_to_process = ut_min(chunk_size,
+ MAX_BUF_INFO_CACHED);
+
+ mem_size = num_to_process * sizeof(buf_page_info_t);
+
+ /* For each chunk, we'll pre-allocate information
+ structures to cache the page information read from
+ the buffer pool. Doing so before obtaining any mutex */
+ info_buffer = (buf_page_info_t*) mem_heap_zalloc(
+ heap, mem_size);
+
+ /* Go through each block in the chunk */
+ for (n_blocks = num_to_process; n_blocks--; block++) {
+ i_s_innodb_buffer_page_get_info(
+ &block->page, pool_id, block_id,
+ info_buffer + num_page);
+ block_id++;
+ num_page++;
+ }
+
+ /* Fill the information schema table with the
+ information just collected from the buffer chunk scan */
+ status = i_s_innodb_buffer_page_fill(
+ thd, tables, info_buffer,
+ num_page);
+
+ /* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ chunk_size -= num_to_process;
+ num_page = 0;
+ }
+ }
+
+ mem_heap_free(heap);
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill page information for pages in the InnoDB buffer pool into the
+dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_fill_table(
+/*==============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ int status = 0;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_fill_table");
+
+ /* deny access to user without PROCESS privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ /* Walk through each buffer pool */
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch information from pages in this buffer pool,
+ and fill the corresponding I_S table */
+ status = i_s_innodb_fill_buffer_pool(thd, tables, buf_pool, i);
+
+ /* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE.
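+Every query of this table rescans every block of every buffer pool
+instance, so it can be expensive on a server with a large buffer pool.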
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_page_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = i_s_innodb_buffer_page_fields_info; + schema->fill_table = i_s_innodb_buffer_page_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_page = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_BUFFER_PAGE"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB Buffer Page Information"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_innodb_buffer_page_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] = +{ +#define IDX_BUF_LRU_POOL_ID 0 + {STRUCT_FLD(field_name, "POOL_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_POS 1 + {STRUCT_FLD(field_name, "LRU_POSITION"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_SPACE 2 + {STRUCT_FLD(field_name, "SPACE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_NUM 3 + {STRUCT_FLD(field_name, "PAGE_NUMBER"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_TYPE 4 + {STRUCT_FLD(field_name, "PAGE_TYPE"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_FLUSH_TYPE 5 + {STRUCT_FLD(field_name, "FLUSH_TYPE"), + STRUCT_FLD(field_length, 
MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_FIX_COUNT 6 + {STRUCT_FLD(field_name, "FIX_COUNT"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_HASHED 7 + {STRUCT_FLD(field_name, "IS_HASHED"), + STRUCT_FLD(field_length, 3), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_NEWEST_MOD 8 + {STRUCT_FLD(field_name, "NEWEST_MODIFICATION"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_OLDEST_MOD 9 + {STRUCT_FLD(field_name, "OLDEST_MODIFICATION"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_ACCESS_TIME 10 + {STRUCT_FLD(field_name, "ACCESS_TIME"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_TABLE_NAME 11 + {STRUCT_FLD(field_name, "TABLE_NAME"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_INDEX_NAME 12 + {STRUCT_FLD(field_name, "INDEX_NAME"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_NUM_RECS 13 + {STRUCT_FLD(field_name, "NUMBER_RECORDS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_DATA_SIZE 14 + {STRUCT_FLD(field_name, "DATA_SIZE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_ZIP_SIZE 15 + {STRUCT_FLD(field_name, "COMPRESSED_SIZE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_STATE 16 + {STRUCT_FLD(field_name, "COMPRESSED"), + STRUCT_FLD(field_length, 3), + 
STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_IO_FIX 17 + {STRUCT_FLD(field_name, "IO_FIX"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_IS_OLD 18 + {STRUCT_FLD(field_name, "IS_OLD"), + STRUCT_FLD(field_length, 3), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BUF_LRU_PAGE_FREE_CLOCK 19 + {STRUCT_FLD(field_name, "FREE_PAGE_CLOCK"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*******************************************************************//** +Fill Information Schema table INNODB_BUFFER_PAGE_LRU with information +cached in the buf_page_info_t array +@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buf_page_lru_fill( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + const buf_page_info_t* info_array, /*!< in: array cached page + info */ + ulint num_page) /*!< in: number of page info + cached */ +{ + TABLE* table; + Field** fields; + mem_heap_t* heap; + + DBUG_ENTER("i_s_innodb_buf_page_lru_fill"); + + table = tables->table; + + fields = table->field; + + heap = mem_heap_create(1000); + + /* Iterate through the cached array and fill the I_S table rows */ + for (ulint i = 0; i < num_page; i++) { + const buf_page_info_t* page_info; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; + const char* state_str; + enum buf_page_state state; + + state_str = NULL; + + page_info = info_array + i; + + OK(fields[IDX_BUF_LRU_POOL_ID]->store( + static_cast<double>(page_info->pool_id))); + + OK(fields[IDX_BUF_LRU_POS]->store( + static_cast<double>(page_info->block_id))); + + OK(fields[IDX_BUF_LRU_PAGE_SPACE]->store( + static_cast<double>(page_info->space_id))); + + OK(fields[IDX_BUF_LRU_PAGE_NUM]->store( + static_cast<double>(page_info->page_num))); + + OK(field_store_string( + fields[IDX_BUF_LRU_PAGE_TYPE], + i_s_page_type[page_info->page_type].type_str)); + + OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store( + static_cast<double>(page_info->flush_type))); + + OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store( + static_cast<double>(page_info->fix_count))); + + if (page_info->hashed) { + OK(field_store_string( + fields[IDX_BUF_LRU_PAGE_HASHED], "YES")); + } else { + OK(field_store_string( + fields[IDX_BUF_LRU_PAGE_HASHED], "NO")); + } + + OK(fields[IDX_BUF_LRU_PAGE_NEWEST_MOD]->store( + page_info->newest_mod, true)); + + OK(fields[IDX_BUF_LRU_PAGE_OLDEST_MOD]->store( + page_info->oldest_mod, true)); + + OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store( + page_info->access_time)); + + fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUF_LRU_PAGE_INDEX_NAME]->set_null(); + + /* If this is an index page, fetch the index name + and table name */ + if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { + const dict_index_t* index; + + 
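/* Probe the dictionary cache under dict_sys->mutex, so the
+ dict_index_t object cannot be evicted or freed while its
+ table and index names are being copied out. */
+ 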
mutex_enter(&dict_sys->mutex);
+ index = dict_index_get_if_in_cache_low(
+ page_info->index_id);
+
+ if (index) {
+
+ table_name_end = innobase_convert_name(
+ table_name, sizeof(table_name),
+ index->table_name,
+ strlen(index->table_name),
+ thd, TRUE);
+
+ OK(fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->store(
+ table_name,
+ static_cast<uint>(table_name_end - table_name),
+ system_charset_info));
+ fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_notnull();
+
+ OK(field_store_index_name(
+ fields[IDX_BUF_LRU_PAGE_INDEX_NAME],
+ index->name));
+ }
+
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store(
+ page_info->num_recs));
+
+ OK(fields[IDX_BUF_LRU_PAGE_DATA_SIZE]->store(
+ page_info->data_size));
+
+ OK(fields[IDX_BUF_LRU_PAGE_ZIP_SIZE]->store(
+ page_info->zip_ssize ?
+ 512 << page_info->zip_ssize : 0));
+
+ state = static_cast<enum buf_page_state>(page_info->page_state);
+
+ switch (state) {
+ /* Compressed page */
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ state_str = "YES";
+ break;
+ /* Uncompressed page */
+ case BUF_BLOCK_FILE_PAGE:
+ state_str = "NO";
+ break;
+ /* We should not see the following states */
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ state_str = NULL;
+ break;
+ };
+
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_STATE],
+ state_str));
+
+ switch (page_info->io_fix) {
+ case BUF_IO_NONE:
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+ "IO_NONE"));
+ break;
+ case BUF_IO_READ:
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+ "IO_READ"));
+ break;
+ case BUF_IO_WRITE:
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+ "IO_WRITE"));
+ break;
+ /* Handle pinned pages as well, as the sibling
+ INNODB_BUFFER_PAGE fill does, so the field is never
+ left over from the previous row */
+ case BUF_IO_PIN:
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+ "IO_PIN"));
+ break;
+ }
+
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IS_OLD],
+ (page_info->is_old) ? "YES" : "NO"));
+
+ OK(fields[IDX_BUF_LRU_PAGE_FREE_CLOCK]->store(
+ page_info->freed_page_clock));
+
+ if (schema_table_store_record(thd, table)) {
+ mem_heap_free(heap);
+ DBUG_RETURN(1);
+ }
+
+ mem_heap_empty(heap);
+ }
+
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+This is the function that goes through the buffer pool's LRU list
+and fetches information for INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
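+Unlike the full INNODB_BUFFER_PAGE scan, the information is collected in
+a single pass while holding buf_pool->LRU_list_mutex, so each result set
+is a consistent snapshot of one pool's LRU list, taken at the cost of
+blocking evictions for the duration of the walk.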
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_fill_buffer_lru(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ buf_pool_t* buf_pool, /*!< in: buffer pool to scan */
+ const ulint pool_id) /*!< in: buffer pool id */
+{
+ int status = 0;
+ buf_page_info_t* info_buffer;
+ ulint lru_pos = 0;
+ const buf_page_t* bpage;
+ ulint lru_len;
+
+ DBUG_ENTER("i_s_innodb_fill_buffer_lru");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* Obtain buf_pool->LRU_list_mutex before allocating info_buffer,
+ since UT_LIST_GET_LEN(buf_pool->LRU) could change */
+ mutex_enter(&buf_pool->LRU_list_mutex);
+
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ /* MY_WME makes my_malloc() print an error message if the
+ allocation fails */
+ info_buffer = (buf_page_info_t*) my_malloc(
+ lru_len * sizeof *info_buffer, MYF(MY_WME));
+
+ if (!info_buffer) {
+ status = 1;
+ goto exit;
+ }
+
+ memset(info_buffer, 0, lru_len * sizeof *info_buffer);
+
+ /* Walk through the pool's LRU list and collect the buffer page
+ information */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+ /* Use the same function that collects buffer info for
+ INNODB_BUFFER_PAGE to get buffer page info */
+ i_s_innodb_buffer_page_get_info(bpage, pool_id, lru_pos,
+ (info_buffer + lru_pos));
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ lru_pos++;
+ }
+
+ ut_ad(lru_pos == lru_len);
+ ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool->LRU));
+
+exit:
+ mutex_exit(&buf_pool->LRU_list_mutex);
+
+ if (info_buffer) {
+ status = i_s_innodb_buf_page_lru_fill(
+ thd, tables, info_buffer, lru_len);
+
+ my_free(info_buffer);
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill page information for pages in the InnoDB buffer pool's LRU list
+into the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buf_page_lru_fill_table(
+/*===============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ int status = 0;
+
+ DBUG_ENTER("i_s_innodb_buf_page_lru_fill_table");
+
+ /* deny access to any users that do not hold PROCESS_ACL */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ /* Walk through each buffer pool */
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch information from pages in this buffer pool's LRU list,
+ and fill the corresponding I_S table */
+ status = i_s_innodb_fill_buffer_lru(thd, tables, buf_pool, i);
+
+ /* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
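+The COMPRESSED column of this table is derived from the page state:
+BUF_BLOCK_ZIP_PAGE and BUF_BLOCK_ZIP_DIRTY report YES,
+BUF_BLOCK_FILE_PAGE reports NO.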
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_lru_init( +/*============================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_page_lru_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = i_s_innodb_buf_page_lru_fields_info; + schema->fill_table = i_s_innodb_buf_page_lru_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_page_lru = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_BUFFER_PAGE_LRU"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB Buffer Page in LRU"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_innodb_buffer_page_lru_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/*******************************************************************//** +Unbind a dynamic INFORMATION_SCHEMA table. 
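+This deinit hook is shared by all InnoDB I_S plugins in this file; it
+intentionally does nothing, as the init functions allocate no per-table
+state that would need to be released here.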
+@return 0 on success */ +static +int +i_s_common_deinit( +/*==============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_common_deinit"); + + /* Do nothing */ + + DBUG_RETURN(0); +} + +/** SYS_TABLES ***************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */ +static ST_FIELD_INFO innodb_sys_tables_fields_info[] = +{ +#define SYS_TABLES_ID 0 + {STRUCT_FLD(field_name, "TABLE_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_NAME 1 + {STRUCT_FLD(field_name, "NAME"), + STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_FLAG 2 + {STRUCT_FLD(field_name, "FLAG"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_NUM_COLUMN 3 + {STRUCT_FLD(field_name, "N_COLS"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_SPACE 4 + {STRUCT_FLD(field_name, "SPACE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_FILE_FORMAT 5 + {STRUCT_FLD(field_name, "FILE_FORMAT"), + STRUCT_FLD(field_length, 10), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_ROW_FORMAT 6 + {STRUCT_FLD(field_name, "ROW_FORMAT"), + STRUCT_FLD(field_length, 12), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLES_ZIP_PAGE_SIZE 7 + {STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Populate information_schema.innodb_sys_tables table with information +from SYS_TABLES. 
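+The ROW_FORMAT column is decoded from the table flags below: without the
+COMPACT bit the format is Redundant; COMPACT without atomic BLOBs is
+Compact; atomic BLOBs with a non-zero ZIP_SSIZE is Compressed; otherwise
+it is Dynamic.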
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tables(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ dict_table_t* table, /*!< in: table */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint compact = DICT_TF_GET_COMPACT(table->flags);
+ ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table->flags);
+ ulint zip_size = dict_tf_get_zip_size(table->flags);
+ const char* file_format;
+ const char* row_format;
+
+ file_format = trx_sys_file_format_id_to_name(atomic_blobs);
+ if (!compact) {
+ row_format = "Redundant";
+ } else if (!atomic_blobs) {
+ row_format = "Compact";
+ } else if (DICT_TF_GET_ZIP_SSIZE(table->flags)) {
+ row_format = "Compressed";
+ } else {
+ row_format = "Dynamic";
+ }
+
+ DBUG_ENTER("i_s_dict_fill_sys_tables");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_TABLES_ID]->store(longlong(table->id), TRUE));
+
+ OK(field_store_string(fields[SYS_TABLES_NAME], table->name));
+
+ OK(fields[SYS_TABLES_FLAG]->store(table->flags));
+
+ OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols));
+
+ OK(fields[SYS_TABLES_SPACE]->store(table->space));
+
+ OK(field_store_string(fields[SYS_TABLES_FILE_FORMAT], file_format));
+
+ OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format));
+
+ OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(
+ static_cast<double>(zip_size)));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in the SYS_TABLES table, and fill the
+information_schema.innodb_sys_tables table with related table information
+@return 0 on success */
+static
+int
+i_s_sys_tables_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_tables_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&(dict_sys->mutex));
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+
+ while (rec) {
+ const char* err_msg;
+ dict_table_t* table_rec;
+
+ /* Create and populate a dict_table_t structure with
+ information from a SYS_TABLES row */
+ err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+ heap, rec, &table_rec,
+ DICT_TABLE_LOAD_FROM_RECORD, &mtr);
+
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_tables(thd, table_rec, tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ /* Since dict_process_sys_tables_rec_and_mtr_commit()
+ was called with DICT_TABLE_LOAD_FROM_RECORD, table_rec
+ was created in dict_process_sys_tables_rec(), so we
+ need to free it here */
+ if (table_rec) {
+ dict_mem_table_free(table_rec);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tables
+@return 0 on success */
+static
+int
+innodb_sys_tables_init(
+/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tables_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_tables_fields_info; + schema->fill_table = i_s_sys_tables_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tables = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_TABLES"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_TABLES"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_tables_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_TABLESTATS ***********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */ +static ST_FIELD_INFO innodb_sys_tablestats_fields_info[] = +{ +#define SYS_TABLESTATS_ID 0 + {STRUCT_FLD(field_name, "TABLE_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_NAME 1 + {STRUCT_FLD(field_name, "NAME"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_INIT 2 + {STRUCT_FLD(field_name, "STATS_INITIALIZED"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_NROW 3 + {STRUCT_FLD(field_name, "NUM_ROWS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_CLUST_SIZE 4 + {STRUCT_FLD(field_name, "CLUST_INDEX_SIZE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_INDEX_SIZE 5 + {STRUCT_FLD(field_name, "OTHER_INDEX_SIZE"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + 
STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_MODIFIED 6 + {STRUCT_FLD(field_name, "MODIFIED_COUNTER"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_AUTONINC 7 + {STRUCT_FLD(field_name, "AUTOINC"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_TABLESTATS_TABLE_REF_COUNT 8 + {STRUCT_FLD(field_name, "REF_COUNT"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Populate information_schema.innodb_sys_tablestats table with information +from SYS_TABLES. +@return 0 on success */ +static +int +i_s_dict_fill_sys_tablestats( +/*=========================*/ + THD* thd, /*!< in: thread */ + dict_table_t* table, /*!< in: table */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_tablestats"); + + fields = table_to_fill->field; + + OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE)); + + OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name)); + + dict_table_stats_lock(table, RW_S_LATCH); + + if (table->stat_initialized) { + OK(field_store_string(fields[SYS_TABLESTATS_INIT], + "Initialized")); + + OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, + TRUE)); + + OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store( + static_cast<double>(table->stat_clustered_index_size))); + + OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store( + static_cast<double>(table->stat_sum_of_other_index_sizes))); + + OK(fields[SYS_TABLESTATS_MODIFIED]->store( + static_cast<double>(table->stat_modified_counter))); + } else { + OK(field_store_string(fields[SYS_TABLESTATS_INIT], + "Uninitialized")); + + OK(fields[SYS_TABLESTATS_NROW]->store(0, TRUE)); + + OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0)); + + OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0)); + + OK(fields[SYS_TABLESTATS_MODIFIED]->store(0)); + } + + dict_table_stats_unlock(table, RW_S_LATCH); + + OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, TRUE)); + + OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]->store( + static_cast<double>(table->n_ref_count))); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Function to go through each record in SYS_TABLES table, and fill the +information_schema.innodb_sys_tablestats table with table statistics +related information +@return 0 on success */ +static +int +i_s_sys_tables_fill_table_stats( +/*============================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_tables_fill_table_stats"); 
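+
+ /* Unlike INNODB_SYS_TABLES, the statistics reported here live in
+ the cached dict_table_t objects, so the scan below loads tables
+ with DICT_TABLE_LOAD_FROM_CACHE instead of rebuilding them from
+ the SYS_TABLES record. */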
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES); + + while (rec) { + const char* err_msg; + dict_table_t* table_rec; + + /* Fetch the dict_table_t structure corresponding to + this SYS_TABLES record */ + err_msg = dict_process_sys_tables_rec_and_mtr_commit( + heap, rec, &table_rec, + DICT_TABLE_LOAD_FROM_CACHE, &mtr); + + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_tablestats(thd, table_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tablestats +@return 0 on success */ +static +int +innodb_sys_tablestats_init( +/*=======================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tablestats_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_tablestats_fields_info; + schema->fill_table = i_s_sys_tables_fill_table_stats; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tablestats = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_TABLESTATS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_TABLESTATS"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_tablestats_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_INDEXES **************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */ +static ST_FIELD_INFO innodb_sysindex_fields_info[] = +{ +#define SYS_INDEX_ID 0 + {STRUCT_FLD(field_name, "INDEX_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_INDEX_NAME 1 + {STRUCT_FLD(field_name, "NAME"), + 
STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_INDEX_TABLE_ID 2 + {STRUCT_FLD(field_name, "TABLE_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_INDEX_TYPE 3 + {STRUCT_FLD(field_name, "TYPE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_INDEX_NUM_FIELDS 4 + {STRUCT_FLD(field_name, "N_FIELDS"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_INDEX_PAGE_NO 5 + {STRUCT_FLD(field_name, "PAGE_NO"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_INDEX_SPACE 6 + {STRUCT_FLD(field_name, "SPACE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Function to populate the information_schema.innodb_sys_indexes table with +collected index information +@return 0 on success */ +static +int +i_s_dict_fill_sys_indexes( +/*======================*/ + THD* thd, /*!< in: thread */ + table_id_t table_id, /*!< in: table id */ + dict_index_t* index, /*!< in: populated dict_index_t + struct with index info */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_indexes"); + + fields = table_to_fill->field; + + OK(field_store_index_name(fields[SYS_INDEX_NAME], index->name)); + + OK(fields[SYS_INDEX_ID]->store(longlong(index->id), TRUE)); + + OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), TRUE)); + + OK(fields[SYS_INDEX_TYPE]->store(index->type)); + + OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields)); + + /* FIL_NULL is ULINT32_UNDEFINED */ + if (index->page == FIL_NULL) { + OK(fields[SYS_INDEX_PAGE_NO]->store(-1)); + } else { + OK(fields[SYS_INDEX_PAGE_NO]->store(index->page)); + } + + OK(fields[SYS_INDEX_SPACE]->store(index->space)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to go through each record in SYS_INDEXES table, and fill the +information_schema.innodb_sys_indexes table with related index information +@return 0 on success */ +static +int +i_s_sys_indexes_fill_table( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_indexes_fill_table"); + 
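+	/* Note: as in all the SYS_* scans in this file, dict_sys->mutex and
+	the mini-transaction are held only while positioned on a record;
+	both are released before each row is copied into the I_S table, and
+	the persistent cursor is restored when dict_getnext_system() fetches
+	the next record. */
+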
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + /* Start scan the SYS_INDEXES table */ + rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES); + + /* Process each record in the table */ + while (rec) { + const char* err_msg; + table_id_t table_id; + dict_index_t index_rec; + + /* Populate a dict_index_t structure with information from + a SYS_INDEXES row */ + err_msg = dict_process_sys_indexes_rec(heap, rec, &index_rec, + &table_id); + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_indexes(thd, table_id, &index_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes +@return 0 on success */ +static +int +innodb_sys_indexes_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_indexes_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sysindex_fields_info; + schema->fill_table = i_s_sys_indexes_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_indexes = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_INDEXES"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_INDEXES"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_indexes_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_COLUMNS **************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */ +static ST_FIELD_INFO innodb_sys_columns_fields_info[] = +{ +#define SYS_COLUMN_TABLE_ID 0 + {STRUCT_FLD(field_name, "TABLE_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, 
SKIP_OPEN_TABLE)}, + +#define SYS_COLUMN_NAME 1 + {STRUCT_FLD(field_name, "NAME"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_COLUMN_POSITION 2 + {STRUCT_FLD(field_name, "POS"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_COLUMN_MTYPE 3 + {STRUCT_FLD(field_name, "MTYPE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_COLUMN__PRTYPE 4 + {STRUCT_FLD(field_name, "PRTYPE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_COLUMN_COLUMN_LEN 5 + {STRUCT_FLD(field_name, "LEN"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Function to populate the information_schema.innodb_sys_columns with +related column information +@return 0 on success */ +static +int +i_s_dict_fill_sys_columns( +/*======================*/ + THD* thd, /*!< in: thread */ + table_id_t table_id, /*!< in: table ID */ + const char* col_name, /*!< in: column name */ + dict_col_t* column, /*!< in: dict_col_t struct holding + more column information */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_columns"); + + fields = table_to_fill->field; + + OK(fields[SYS_COLUMN_TABLE_ID]->store(longlong(table_id), TRUE)); + + OK(field_store_string(fields[SYS_COLUMN_NAME], col_name)); + + OK(fields[SYS_COLUMN_POSITION]->store(column->ind)); + + OK(fields[SYS_COLUMN_MTYPE]->store(column->mtype)); + + OK(fields[SYS_COLUMN__PRTYPE]->store(column->prtype)); + + OK(fields[SYS_COLUMN_COLUMN_LEN]->store(column->len)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to fill information_schema.innodb_sys_columns with information +collected by scanning SYS_COLUMNS table. 
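+
+For example, the populated table can be inspected with a query such as
+(column names as declared in innodb_sys_columns_fields_info above):
+
+	SELECT TABLE_ID, NAME, POS, MTYPE, PRTYPE, LEN
+	FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS;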
+@return 0 on success */ +static +int +i_s_sys_columns_fill_table( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + const char* col_name; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_columns_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_COLUMNS); + + while (rec) { + const char* err_msg; + dict_col_t column_rec; + table_id_t table_id; + + /* populate a dict_col_t structure with information from + a SYS_COLUMNS row */ + err_msg = dict_process_sys_columns_rec(heap, rec, &column_rec, + &table_id, &col_name); + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_columns(thd, table_id, col_name, + &column_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_columns +@return 0 on success */ +static +int +innodb_sys_columns_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_columns_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_columns_fields_info; + schema->fill_table = i_s_sys_columns_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_columns = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_COLUMNS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_COLUMNS"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_columns_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_FIELDS ***************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */ +static ST_FIELD_INFO innodb_sys_fields_fields_info[] = 
+{ +#define SYS_FIELD_INDEX_ID 0 + {STRUCT_FLD(field_name, "INDEX_ID"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_FIELD_NAME 1 + {STRUCT_FLD(field_name, "NAME"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_FIELD_POS 2 + {STRUCT_FLD(field_name, "POS"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_fields with information +collected by scanning SYS_FIELDS table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_fields( +/*=====================*/ + THD* thd, /*!< in: thread */ + index_id_t index_id, /*!< in: index id for the field */ + dict_field_t* field, /*!< in: table */ + ulint pos, /*!< in: Field position */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_fields"); + + fields = table_to_fill->field; + + OK(fields[SYS_FIELD_INDEX_ID]->store(longlong(index_id), TRUE)); + + OK(field_store_string(fields[SYS_FIELD_NAME], field->name)); + + OK(fields[SYS_FIELD_POS]->store(static_cast<double>(pos))); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to go through each record in SYS_FIELDS table, and fill the +information_schema.innodb_sys_fields table with related index field +information +@return 0 on success */ +static +int +i_s_sys_fields_fill_table( +/*======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + index_id_t last_id; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_fields_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + /* will save last index id so that we know whether we move to + the next index. 
This is used to calculate the column prefix length. */
+	last_id = 0;
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_FIELDS);
+
+	while (rec) {
+		ulint		pos;
+		const char*	err_msg;
+		index_id_t	index_id;
+		dict_field_t	field_rec;
+
+		/* Populate a dict_field_t structure with information from
+		a SYS_FIELDS row */
+		err_msg = dict_process_sys_fields_rec(heap, rec, &field_rec,
+						      &pos, &index_id, last_id);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_fields(thd, index_id, &field_rec,
+						 pos, tables->table);
+			last_id = index_id;
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields
+@return 0 on success */
+static
+int
+innodb_sys_fields_init(
+/*===================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_fields_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sys_fields_fields_info;
+	schema->fill_table = i_s_sys_fields_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_sys_fields =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_FIELDS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_FIELDS"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_fields_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_FOREIGN ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */
+static ST_FIELD_INFO	innodb_sys_foreign_fields_info[] =
+{
+#define SYS_FOREIGN_ID		0
+	{STRUCT_FLD(field_name, "ID"),
+	 STRUCT_FLD(field_length, NAME_LEN + 1),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, 0),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_FOR_NAME	1
+	{STRUCT_FLD(field_name, "FOR_NAME"),
+	 STRUCT_FLD(field_length, NAME_LEN + 1),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, 0),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_REF_NAME 2 + {STRUCT_FLD(field_name, "REF_NAME"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_FOREIGN_NUM_COL 3 + {STRUCT_FLD(field_name, "N_COLS"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_FOREIGN_TYPE 4 + {STRUCT_FLD(field_name, "TYPE"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_foreign with information +collected by scanning SYS_FOREIGN table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_foreign( +/*======================*/ + THD* thd, /*!< in: thread */ + dict_foreign_t* foreign, /*!< in: table */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_foreign"); + + fields = table_to_fill->field; + + OK(field_store_string(fields[SYS_FOREIGN_ID], foreign->id)); + + OK(field_store_string(fields[SYS_FOREIGN_FOR_NAME], + foreign->foreign_table_name)); + + OK(field_store_string(fields[SYS_FOREIGN_REF_NAME], + foreign->referenced_table_name)); + + OK(fields[SYS_FOREIGN_NUM_COL]->store(foreign->n_fields)); + + OK(fields[SYS_FOREIGN_TYPE]->store(foreign->type)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.innodb_sys_foreign table. Loop +through each record in SYS_FOREIGN, and extract the foreign key +information. 
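+
+Note that the dict_foreign_t used here is a stack variable that
+dict_process_sys_foreign_rec() fills straight from the record; no object
+is added to the dictionary cache for it.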
+@return 0 on success */ +static +int +i_s_sys_foreign_fill_table( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_foreign_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN); + + while (rec) { + const char* err_msg; + dict_foreign_t foreign_rec; + + /* Populate a dict_foreign_t structure with information from + a SYS_FOREIGN row */ + err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec); + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_foreign(thd, &foreign_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr_start(&mtr); + mutex_enter(&dict_sys->mutex); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign +@return 0 on success */ +static +int +innodb_sys_foreign_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_foreign_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_foreign_fields_info; + schema->fill_table = i_s_sys_foreign_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_foreign = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_FOREIGN"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_FOREIGN"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_foreign_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_FOREIGN_COLS ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */ +static ST_FIELD_INFO innodb_sys_foreign_cols_fields_info[] = +{ +#define SYS_FOREIGN_COL_ID 0 + {STRUCT_FLD(field_name, "ID"), + 
STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_FOREIGN_COL_FOR_NAME 1 + {STRUCT_FLD(field_name, "FOR_COL_NAME"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_FOREIGN_COL_REF_NAME 2 + {STRUCT_FLD(field_name, "REF_COL_NAME"), + STRUCT_FLD(field_length, NAME_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define SYS_FOREIGN_COL_POS 3 + {STRUCT_FLD(field_name, "POS"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_foreign_cols with information +collected by scanning SYS_FOREIGN_COLS table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_foreign_cols( +/*==========================*/ + THD* thd, /*!< in: thread */ + const char* name, /*!< in: foreign key constraint name */ + const char* for_col_name, /*!< in: referencing column name*/ + const char* ref_col_name, /*!< in: referenced column + name */ + ulint pos, /*!< in: column position */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_foreign_cols"); + + fields = table_to_fill->field; + + OK(field_store_string(fields[SYS_FOREIGN_COL_ID], name)); + + OK(field_store_string(fields[SYS_FOREIGN_COL_FOR_NAME], for_col_name)); + + OK(field_store_string(fields[SYS_FOREIGN_COL_REF_NAME], ref_col_name)); + + OK(fields[SYS_FOREIGN_COL_POS]->store(static_cast<double>(pos))); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.innodb_sys_foreign_cols table. Loop +through each record in SYS_FOREIGN_COLS, and extract the foreign key column +information and fill the INFORMATION_SCHEMA.innodb_sys_foreign_cols table. 
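+
+Each SYS_FOREIGN_COLS record describes one column of one constraint, so a
+multi-column foreign key produces several rows with the same ID, one per
+column position POS. The rows can be matched back to INNODB_SYS_FOREIGN,
+e.g.:
+
+	SELECT f.ID, f.N_COLS, c.FOR_COL_NAME, c.REF_COL_NAME, c.POS
+	FROM INFORMATION_SCHEMA.INNODB_SYS_FOREIGN f
+	JOIN INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS c ON f.ID = c.ID
+	ORDER BY f.ID, c.POS;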
+@return 0 on success */ +static +int +i_s_sys_foreign_cols_fill_table( +/*============================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_foreign_cols_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN_COLS); + + while (rec) { + const char* err_msg; + const char* name; + const char* for_col_name; + const char* ref_col_name; + ulint pos; + + /* Extract necessary information from a SYS_FOREIGN_COLS row */ + err_msg = dict_process_sys_foreign_col_rec( + heap, rec, &name, &for_col_name, &ref_col_name, &pos); + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_foreign_cols( + thd, name, for_col_name, ref_col_name, pos, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols +@return 0 on success */ +static +int +innodb_sys_foreign_cols_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_foreign_cols_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_foreign_cols_fields_info; + schema->fill_table = i_s_sys_foreign_cols_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_foreign_cols = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_FOREIGN_COLS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_FOREIGN_COLS"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_foreign_cols_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_TABLESPACES ********************************************/ +/* Fields of the dynamic table 
INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */
+static ST_FIELD_INFO	innodb_sys_tablespaces_fields_info[] =
+{
+#define SYS_TABLESPACES_SPACE		0
+	{STRUCT_FLD(field_name, "SPACE"),
+	 STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_NAME		1
+	{STRUCT_FLD(field_name, "NAME"),
+	 STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, 0),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_FLAGS		2
+	{STRUCT_FLD(field_name, "FLAG"),
+	 STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_FILE_FORMAT	3
+	{STRUCT_FLD(field_name, "FILE_FORMAT"),
+	 STRUCT_FLD(field_length, 10),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_ROW_FORMAT	4
+	{STRUCT_FLD(field_name, "ROW_FORMAT"),
+	 STRUCT_FLD(field_length, 22),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_PAGE_SIZE	5
+	{STRUCT_FLD(field_name, "PAGE_SIZE"),
+	 STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_ZIP_PAGE_SIZE	6
+	{STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"),
+	 STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+
+};
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES with information
+collected by scanning the SYS_TABLESPACES table.
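+
+The FILE_FORMAT and ROW_FORMAT columns are derived from the stored FLAG
+value: the file format name is obtained with trx_sys_file_format_id_to_name()
+from the atomic-blobs flag, and ROW_FORMAT is reported as "Compact or
+Redundant" when atomic blobs are not used, "Compressed" when a compressed
+page size is set, and "Dynamic" otherwise.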
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tablespaces(
+/*==========================*/
+	THD*		thd,		/*!< in: thread */
+	ulint		space,		/*!< in: space ID */
+	const char*	name,		/*!< in: tablespace name */
+	ulint		flags,		/*!< in: tablespace flags */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+	ulint		atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
+	ulint		page_size = fsp_flags_get_page_size(flags);
+	ulint		zip_size = fsp_flags_get_zip_size(flags);
+	const char*	file_format;
+	const char*	row_format;
+
+	DBUG_ENTER("i_s_dict_fill_sys_tablespaces");
+
+	file_format = trx_sys_file_format_id_to_name(atomic_blobs);
+	if (!atomic_blobs) {
+		row_format = "Compact or Redundant";
+	} else if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+		row_format = "Compressed";
+	} else {
+		row_format = "Dynamic";
+	}
+
+	fields = table_to_fill->field;
+
+	OK(fields[SYS_TABLESPACES_SPACE]->store(
+		static_cast<double>(space)));
+
+	OK(field_store_string(fields[SYS_TABLESPACES_NAME], name));
+
+	OK(fields[SYS_TABLESPACES_FLAGS]->store(
+		static_cast<double>(flags)));
+
+	OK(field_store_string(fields[SYS_TABLESPACES_FILE_FORMAT],
+			      file_format));
+
+	OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT],
+			      row_format));
+
+	OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(
+		static_cast<double>(page_size)));
+
+	OK(fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->store(
+		static_cast<double>(zip_size)));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
+Loop through each record in SYS_TABLESPACES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
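+
+For example, compressed tablespaces can then be listed with a query such
+as (column names as declared in innodb_sys_tablespaces_fields_info above):
+
+	SELECT SPACE, NAME, PAGE_SIZE, ZIP_PAGE_SIZE
+	FROM INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES
+	WHERE ROW_FORMAT = 'Compressed';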
+@return 0 on success */ +static +int +i_s_sys_tablespaces_fill_table( +/*===========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_tablespaces_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + + rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES); + + while (rec) { + const char* err_msg; + ulint space; + const char* name; + ulint flags; + + /* Extract necessary information from a SYS_TABLESPACES row */ + err_msg = dict_process_sys_tablespaces( + heap, rec, &space, &name, &flags); + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + + if (!err_msg) { + i_s_dict_fill_sys_tablespaces( + thd, space, name, flags, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mutex_enter(&dict_sys->mutex); + mtr_start(&mtr); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr_commit(&mtr); + mutex_exit(&dict_sys->mutex); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES +@return 0 on success */ +static +int +innodb_sys_tablespaces_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tablespaces_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_sys_tablespaces_fields_info; + schema->fill_table = i_s_sys_tablespaces_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tablespaces = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_TABLESPACES"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_TABLESPACES"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_tablespaces_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +/** SYS_DATAFILES ************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES */ +static ST_FIELD_INFO 
innodb_sys_datafiles_fields_info[] =
+{
+#define SYS_DATAFILES_SPACE		0
+	{STRUCT_FLD(field_name, "SPACE"),
+	 STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_DATAFILES_PATH		1
+	{STRUCT_FLD(field_name, "PATH"),
+	 STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+	 STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value, 0),
+	 STRUCT_FLD(field_flags, 0),
+	 STRUCT_FLD(old_name, ""),
+	 STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_DATAFILES with information
+collected by scanning the SYS_DATAFILES table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_datafiles(
+/*========================*/
+	THD*		thd,		/*!< in: thread */
+	ulint		space,		/*!< in: space ID */
+	const char*	path,		/*!< in: absolute path */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_datafiles");
+
+	fields = table_to_fill->field;
+
+	OK(field_store_ulint(fields[SYS_DATAFILES_SPACE], space));
+
+	OK(field_store_string(fields[SYS_DATAFILES_PATH], path));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+Loop through each record in SYS_DATAFILES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+@return 0 on success */
+static
+int
+i_s_sys_datafiles_fill_table(
+/*=========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_datafiles_fill_table");
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_DATAFILES);
+
+	while (rec) {
+		const char*	err_msg;
+		ulint		space;
+		const char*	path;
+
+		/* Extract necessary information from a SYS_DATAFILES row */
+		err_msg = dict_process_sys_datafiles(
+			heap, rec, &space, &path);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_datafiles(
+				thd, space, path, tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES
+@return 0 on success */
+static
+int
+innodb_sys_datafiles_init(
+/*======================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_datafiles_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info =
innodb_sys_datafiles_fields_info; + schema->fill_table = i_s_sys_datafiles_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_datafiles = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_SYS_DATAFILES"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB SYS_DATAFILES"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_sys_datafiles_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL), + + /* Plugin flags */ + /* unsigned long */ + STRUCT_FLD(flags, 0UL), +}; + +static ST_FIELD_INFO i_s_innodb_changed_pages_info[] = +{ + {STRUCT_FLD(field_name, "space_id"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "page_id"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "start_lsn"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "end_lsn"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*********************************************************************** + This function implements ICP for I_S.INNODB_CHANGED_PAGES by parsing a + condition and getting lower and upper bounds for start and end LSNs if the + condition corresponds to a certain pattern. 
+
+ In the most general form, we understand queries like
+
+ SELECT * FROM INNODB_CHANGED_PAGES
+     WHERE START_LSN > num1 AND START_LSN < num2
+         AND END_LSN > num3 AND END_LSN < num4;
+
+ That's why the pattern syntax is:
+
+ pattern:  comp | and_comp;
+ comp:     lsn < int_num | lsn <= int_num | int_num > lsn | int_num >= lsn;
+ lsn:      start_lsn | end_lsn;
+ and_comp: expression AND expression | expression AND and_comp;
+ expression: comp | any_other_expression;
+
+ The two bounds are handled differently: the lower bound is used to find the
+ correct starting _file_, the upper bound the last _block_ that needs reading.
+
+ Lower bound conditions are handled in the following way: start_lsn >= X
+ specifies that the reading must start from the file that has the highest
+ starting LSN less than or equal to X. start_lsn > X is equivalent to
+ start_lsn >= X + 1. For end_lsn, end_lsn >= X is treated as
+ start_lsn >= X - 1 and end_lsn > X as start_lsn >= X.
+
+ For the upper bound, suppose the condition is start_lsn < 100; this means we
+ have to read all blocks with start_lsn < 100, which is equivalent to reading
+ all the blocks with end_lsn <= 99, or just end_lsn < 100. That's why it is
+ enough to find the maximum LSN bound, no matter whether it comes from a
+ start or an end LSN, and compare it with the "start_lsn" field. LSN <= 100
+ is treated as LSN < 101.
+
+ Example:
+
+ SELECT * FROM INNODB_CHANGED_PAGES
+   WHERE
+   start_lsn > 10 AND
+   end_lsn <= 1111 AND
+   555 > end_lsn AND
+   page_id = 100;
+
+ end_lsn will be set to 555, start_lsn will be set to 11.
+
+ Support for other functions (equal, NULL-safe equal, BETWEEN, IN, etc.) will
+ be added on demand.
+
+*/
+static
+void
+limit_lsn_range_from_condition(
+/*===========================*/
+	TABLE*		table,		/*!<in: table */
+	Item*		cond,		/*!<in: condition */
+	ib_uint64_t*	start_lsn,	/*!<in/out: minimum LSN */
+	ib_uint64_t*	end_lsn)	/*!<in/out: maximum LSN */
+{
+	enum Item_func::Functype	func_type;
+
+	if (cond->type() != Item::COND_ITEM &&
+	    cond->type() != Item::FUNC_ITEM)
+		return;
+
+	func_type = ((Item_func*) cond)->functype();
+
+	switch (func_type)
+	{
+	case Item_func::COND_AND_FUNC:
+	{
+		List_iterator<Item>	li(*((Item_cond*) cond)
+					   ->argument_list());
+		Item			*item;
+
+		while ((item= li++)) {
+			limit_lsn_range_from_condition(table, item, start_lsn,
+						       end_lsn);
+		}
+		break;
+	}
+	case Item_func::LT_FUNC:
+	case Item_func::LE_FUNC:
+	case Item_func::GT_FUNC:
+	case Item_func::GE_FUNC:
+	{
+		Item		*left;
+		Item		*right;
+		Item_field	*item_field;
+		ib_uint64_t	tmp_result;
+		ibool		is_end_lsn;
+
+		/* a <= b is equivalent to b >= a, that's why we just exchange
+		"left" and "right" in the case of the ">" or ">=" function.
+		We don't touch the operation itself.
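+		For example, the condition "555 > end_lsn" is processed
+		exactly as "end_lsn < 555" would be.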
*/ + if (((Item_func*) cond)->functype() == Item_func::LT_FUNC + || ((Item_func*) cond)->functype() == Item_func::LE_FUNC) { + left = ((Item_func*) cond)->arguments()[0]; + right = ((Item_func*) cond)->arguments()[1]; + } else { + left = ((Item_func*) cond)->arguments()[1]; + right = ((Item_func*) cond)->arguments()[0]; + } + + if (left->type() == Item::FIELD_ITEM) { + item_field = (Item_field *)left; + } else if (right->type() == Item::FIELD_ITEM) { + item_field = (Item_field *)right; + } else { + return; + } + + /* Check if the current field belongs to our table */ + if (table != item_field->field->table) { + return; + } + + /* Check if the field is START_LSN or END_LSN */ + /* END_LSN */ + is_end_lsn = table->field[3]->eq(item_field->field); + + if (/* START_LSN */ !table->field[2]->eq(item_field->field) + && !is_end_lsn) { + return; + } + + if (left->type() == Item::FIELD_ITEM + && right->type() == Item::INT_ITEM) { + + /* The case of start_lsn|end_lsn <|<= const, i.e. the + upper bound. */ + + tmp_result = right->val_int(); + if (((func_type == Item_func::LE_FUNC) + || (func_type == Item_func::GE_FUNC)) + && (tmp_result != IB_UINT64_MAX)) { + + tmp_result++; + } + if (tmp_result < *end_lsn) { + *end_lsn = tmp_result; + } + + } else if (left->type() == Item::INT_ITEM + && right->type() == Item::FIELD_ITEM) { + + /* The case of const <|<= start_lsn|end_lsn, i.e. the + lower bound */ + + tmp_result = left->val_int(); + if (is_end_lsn && tmp_result != 0) { + tmp_result--; + } + if (((func_type == Item_func::LT_FUNC) + || (func_type == Item_func::GT_FUNC)) + && (tmp_result != IB_UINT64_MAX)) { + + tmp_result++; + } + if (tmp_result > *start_lsn) { + *start_lsn = tmp_result; + } + } + + break; + } + default:; + } +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_changed_pages. +@return 0 on success, 1 on failure */ +static +int +i_s_innodb_changed_pages_fill( +/*==========================*/ + THD* thd, /*!<in: thread */ + TABLE_LIST* tables, /*!<in/out: tables to fill */ + Item* cond) /*!<in: condition */ +{ + TABLE* table = (TABLE *) tables->table; + log_bitmap_iterator_t i; + ib_uint64_t output_rows_num = 0UL; + lsn_t max_lsn = LSN_MAX; + lsn_t min_lsn = 0ULL; + int ret = 0; + + DBUG_ENTER("i_s_innodb_changed_pages_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + if (cond) { + limit_lsn_range_from_condition(table, cond, &min_lsn, + &max_lsn); + } + + /* If the log tracker is running and our max_lsn > current tracked LSN, + cap the max lsn so that we don't try to read any partial runs as the + tracked LSN advances. */ + if (srv_track_changed_pages) { + ib_uint64_t tracked_lsn = log_get_tracked_lsn(); + if (max_lsn > tracked_lsn) + max_lsn = tracked_lsn; + } + + if (!log_online_bitmap_iterator_init(&i, min_lsn, max_lsn)) { + my_error(ER_CANT_FIND_SYSTEM_REC, MYF(0)); + DBUG_RETURN(1); + } + + DEBUG_SYNC(thd, "i_s_innodb_changed_pages_range_ready"); + + while(log_online_bitmap_iterator_next(&i) && + (!srv_max_changed_pages || + output_rows_num < srv_max_changed_pages) && + /* + There is no need to compare both start LSN and end LSN fields + with maximum value. It's enough to compare only start LSN. 
+ Example: + + max_lsn = 100 + \\\\\\\\\\\\\\\\\\\\\\\\\|\\\\\\\\ - Query 1 + I------I I-------I I-------------I I----I + ////////////////// | - Query 2 + 1 2 3 4 + + Query 1: + SELECT * FROM INNODB_CHANGED_PAGES WHERE start_lsn < 100 + will select 1,2,3 bitmaps + Query 2: + SELECT * FROM INNODB_CHANGED_PAGES WHERE end_lsn < 100 + will select 1,2 bitmaps + + The condition start_lsn <= 100 will be false after reading + 1,2,3 bitmaps which suits for both cases. + */ + LOG_BITMAP_ITERATOR_START_LSN(i) <= max_lsn) + { + if (!LOG_BITMAP_ITERATOR_PAGE_CHANGED(i)) + continue; + + /* SPACE_ID */ + table->field[0]->store( + LOG_BITMAP_ITERATOR_SPACE_ID(i)); + /* PAGE_ID */ + table->field[1]->store( + LOG_BITMAP_ITERATOR_PAGE_NUM(i)); + /* START_LSN */ + table->field[2]->store( + LOG_BITMAP_ITERATOR_START_LSN(i), true); + /* END_LSN */ + table->field[3]->store( + LOG_BITMAP_ITERATOR_END_LSN(i), true); + + /* + I_S tables are in-memory tables. If bitmap file is big enough + a lot of memory can be used to store the table. But the size + of used memory can be diminished if we store only data which + corresponds to some conditions (in WHERE sql clause). Here + conditions are checked for the field values stored above. + + Conditions are checked twice. The first is here (during table + generation) and the second during query execution. Maybe it + makes sense to use some flag in THD object to avoid double + checking. + */ + if (cond && !cond->val_int()) + continue; + + if (schema_table_store_record(thd, table)) + { + log_online_bitmap_iterator_release(&i); + my_error(ER_CANT_FIND_SYSTEM_REC, MYF(0)); + DBUG_RETURN(1); + } + + ++output_rows_num; + } + + if (i.failed) { + my_error(ER_CANT_FIND_SYSTEM_REC, MYF(0)); + ret = 1; + } + + log_online_bitmap_iterator_release(&i); + DBUG_RETURN(ret); +} + +static +int +i_s_innodb_changed_pages_init( +/*==========================*/ + void* p) +{ + DBUG_ENTER("i_s_innodb_changed_pages_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_changed_pages_info; + schema->fill_table = i_s_innodb_changed_pages_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_changed_pages = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "INNODB_CHANGED_PAGES"), + STRUCT_FLD(author, "Percona"), + STRUCT_FLD(descr, "InnoDB CHANGED_PAGES table"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, i_s_innodb_changed_pages_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, 0x0100 /* 1.0 */), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + STRUCT_FLD(__reserved1, NULL), + STRUCT_FLD(flags, 0UL), +}; diff --git a/storage/xtradb/handler/i_s.h b/storage/xtradb/handler/i_s.h new file mode 100644 index 00000000000..fa500d57490 --- /dev/null +++ b/storage/xtradb/handler/i_s.h @@ -0,0 +1,61 @@ +/***************************************************************************** + +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file handler/i_s.h +InnoDB INFORMATION SCHEMA tables interface to MySQL. + +Created July 18, 2007 Vasil Dimov +*******************************************************/ + +#ifndef i_s_h +#define i_s_h + +const char plugin_author[] = "Oracle Corporation"; + +extern struct st_mysql_plugin i_s_innodb_trx; +extern struct st_mysql_plugin i_s_innodb_locks; +extern struct st_mysql_plugin i_s_innodb_lock_waits; +extern struct st_mysql_plugin i_s_innodb_cmp; +extern struct st_mysql_plugin i_s_innodb_cmp_reset; +extern struct st_mysql_plugin i_s_innodb_cmp_per_index; +extern struct st_mysql_plugin i_s_innodb_cmp_per_index_reset; +extern struct st_mysql_plugin i_s_innodb_cmpmem; +extern struct st_mysql_plugin i_s_innodb_cmpmem_reset; +extern struct st_mysql_plugin i_s_innodb_metrics; +extern struct st_mysql_plugin i_s_innodb_ft_default_stopword; +extern struct st_mysql_plugin i_s_innodb_ft_deleted; +extern struct st_mysql_plugin i_s_innodb_ft_being_deleted; +extern struct st_mysql_plugin i_s_innodb_ft_index_cache; +extern struct st_mysql_plugin i_s_innodb_ft_index_table; +extern struct st_mysql_plugin i_s_innodb_ft_config; +extern struct st_mysql_plugin i_s_innodb_buffer_page; +extern struct st_mysql_plugin i_s_innodb_buffer_page_lru; +extern struct st_mysql_plugin i_s_innodb_buffer_stats; +extern struct st_mysql_plugin i_s_innodb_sys_tables; +extern struct st_mysql_plugin i_s_innodb_sys_tablestats; +extern struct st_mysql_plugin i_s_innodb_sys_indexes; +extern struct st_mysql_plugin i_s_innodb_sys_columns; +extern struct st_mysql_plugin i_s_innodb_sys_fields; +extern struct st_mysql_plugin i_s_innodb_sys_foreign; +extern struct st_mysql_plugin i_s_innodb_sys_foreign_cols; +extern struct st_mysql_plugin i_s_innodb_sys_tablespaces; +extern struct st_mysql_plugin i_s_innodb_sys_datafiles; +extern struct st_mysql_plugin i_s_innodb_changed_pages; + +#endif /* i_s_h */ diff --git a/storage/xtradb/handler/xtradb_i_s.cc b/storage/xtradb/handler/xtradb_i_s.cc new file mode 100644 index 00000000000..213e3c1aa53 --- /dev/null +++ b/storage/xtradb/handler/xtradb_i_s.cc @@ -0,0 +1,605 @@ +/***************************************************************************** + +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2010-2012, Percona Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#include <mysqld_error.h> +#include <sql_acl.h> // PROCESS_ACL + +#include <m_ctype.h> +#include <hash.h> +#include <myisampack.h> +#include <mysys_err.h> +#include <my_sys.h> +#include "i_s.h" +#include <sql_plugin.h> +#include <mysql/innodb_priv.h> + +#include <read0i_s.h> +#include <trx0i_s.h> +#include "srv0start.h" /* for srv_was_started */ +#include <btr0sea.h> /* btr_search_sys */ +#include <log0recv.h> /* recv_sys */ +#include <fil0fil.h> + +/* for XTRADB_RSEG table */ +#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */ +#include "trx0rseg.h" /* for trx_rseg_struct */ +#include "trx0sys.h" /* for trx_sys */ + +#define PLUGIN_AUTHOR "Percona Inc." + +#define OK(expr) \ + if ((expr) != 0) { \ + DBUG_RETURN(1); \ + } + +#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \ +do { \ + if (!srv_was_started) { \ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \ + ER_CANT_FIND_SYSTEM_REC, \ + "InnoDB: SELECTing from " \ + "INFORMATION_SCHEMA.%s but " \ + "the InnoDB storage engine " \ + "is not installed", plugin_name); \ + DBUG_RETURN(0); \ + } \ +} while (0) + +#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && \ + !defined __INTEL_COMPILER && !defined __clang__ +#define STRUCT_FLD(name, value) name: value +#else +#define STRUCT_FLD(name, value) value +#endif + +#define END_OF_ST_FIELD_INFO \ + {STRUCT_FLD(field_name, NULL), \ + STRUCT_FLD(field_length, 0), \ + STRUCT_FLD(field_type, MYSQL_TYPE_NULL), \ + STRUCT_FLD(value, 0), \ + STRUCT_FLD(field_flags, 0), \ + STRUCT_FLD(old_name, ""), \ + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)} + + +/*******************************************************************//** +Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field. +If the value is ULINT_UNDEFINED then the field it set to NULL. +@return 0 on success */ +static +int +field_store_ulint( +/*==============*/ + Field* field, /*!< in/out: target field for storage */ + ulint n) /*!< in: value to store */ +{ + int ret; + + if (n != ULINT_UNDEFINED) { + + ret = field->store(n); + field->set_notnull(); + } else { + + ret = 0; /* success */ + field->set_null(); + } + + return(ret); +} + +/*******************************************************************//** +Auxiliary function to store char* value in MYSQL_TYPE_STRING field. 
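+A NULL str is stored as SQL NULL, mirroring field_store_ulint() above.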
+@return 0 on success */ +static +int +field_store_string( +/*===============*/ + Field* field, /*!< in/out: target field for storage */ + const char* str) /*!< in: NUL-terminated utf-8 string, + or NULL */ +{ + int ret; + + if (str != NULL) { + + ret = field->store(str, strlen(str), + system_charset_info); + field->set_notnull(); + } else { + + ret = 0; /* success */ + field->set_null(); + } + + return(ret); +} + +static +int +i_s_common_deinit( +/*==============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_common_deinit"); + + /* Do nothing */ + + DBUG_RETURN(0); +} + +static ST_FIELD_INFO xtradb_read_view_fields_info[] = +{ +#define READ_VIEW_UNDO_NUMBER 0 + {STRUCT_FLD(field_name, "READ_VIEW_UNDO_NUMBER"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define READ_VIEW_LOW_LIMIT_NUMBER 1 + {STRUCT_FLD(field_name, "READ_VIEW_LOW_LIMIT_TRX_NUMBER"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define READ_VIEW_UPPER_LIMIT_ID 2 + {STRUCT_FLD(field_name, "READ_VIEW_UPPER_LIMIT_TRX_ID"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define READ_VIEW_LOW_LIMIT_ID 3 + {STRUCT_FLD(field_name, "READ_VIEW_LOW_LIMIT_TRX_ID"), + + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static int xtradb_read_view_fill_table(THD* thd, TABLE_LIST* tables, Item*) +{ + const char* table_name; + Field** fields; + TABLE* table; + char trx_id[TRX_ID_MAX_LEN + 1]; + + + DBUG_ENTER("xtradb_read_view_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + table_name = tables->schema_table_name; + table = tables->table; + fields = table->field; + + RETURN_IF_INNODB_NOT_STARTED(table_name); + + i_s_xtradb_read_view_t read_view; + + if (read_fill_i_s_xtradb_read_view(&read_view) == NULL) + DBUG_RETURN(0); + + OK(field_store_ulint(fields[READ_VIEW_UNDO_NUMBER], read_view.undo_no)); + + ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, read_view.low_limit_no); + OK(field_store_string(fields[READ_VIEW_LOW_LIMIT_NUMBER], trx_id)); + + ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, read_view.up_limit_id); + OK(field_store_string(fields[READ_VIEW_UPPER_LIMIT_ID], trx_id)); + + ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, read_view.low_limit_id); + OK(field_store_string(fields[READ_VIEW_LOW_LIMIT_ID], trx_id)); + + OK(schema_table_store_record(thd, table)); + + DBUG_RETURN(0); +} + + +static int xtradb_read_view_init(void* p) +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("xtradb_read_view_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = xtradb_read_view_fields_info; + schema->fill_table = xtradb_read_view_fill_table; + + DBUG_RETURN(0); +} + +static struct st_mysql_information_schema i_s_info = +{ + MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION +}; + +UNIV_INTERN 
struct st_mysql_plugin i_s_xtradb_read_view = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "XTRADB_READ_VIEW"), + STRUCT_FLD(author, PLUGIN_AUTHOR), + STRUCT_FLD(descr, "InnoDB Read View information"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, xtradb_read_view_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, INNODB_VERSION_SHORT), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + STRUCT_FLD(__reserved1, NULL), + STRUCT_FLD(flags, 0UL), +}; + +static ST_FIELD_INFO xtradb_internal_hash_tables_fields_info[] = +{ +#define INT_HASH_TABLES_NAME 0 + {STRUCT_FLD(field_name, "INTERNAL_HASH_TABLE_NAME"), + STRUCT_FLD(field_length, 100), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define INT_HASH_TABLES_TOTAL 1 + {STRUCT_FLD(field_name, "TOTAL_MEMORY"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define INT_HASH_TABLES_CONSTANT 2 + {STRUCT_FLD(field_name, "CONSTANT_MEMORY"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define INT_HASH_TABLES_VARIABLE 3 + {STRUCT_FLD(field_name, "VARIABLE_MEMORY"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static int xtradb_internal_hash_tables_fill_table(THD* thd, TABLE_LIST* tables, Item*) +{ + const char* table_name; + Field** fields; + TABLE* table; + ulong btr_search_sys_constant; + ulong btr_search_sys_variable; + + DBUG_ENTER("xtradb_internal_hash_tables_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + table_name = tables->schema_table_name; + table = tables->table; + fields = table->field; + + RETURN_IF_INNODB_NOT_STARTED(table_name); + + /* Calculate AHI constant and variable memory allocations */ + + btr_search_sys_constant = 0; + btr_search_sys_variable = 0; + + ut_ad(btr_search_sys->hash_tables); + + for (ulint i = 0; i < btr_search_index_num; i++) { + hash_table_t* ht = btr_search_sys->hash_tables[i]; + + ut_ad(ht); + ut_ad(ht->heap); + + /* Multiple mutexes/heaps are currently never used for adaptive + hash index tables. 
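+ Hence each table's memory is the n_cells array (counted below as the
+ constant part) plus a single heap (counted as the variable part).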
*/ + ut_ad(!ht->n_sync_obj); + ut_ad(!ht->heaps); + + btr_search_sys_variable += mem_heap_get_size(ht->heap); + btr_search_sys_constant += ht->n_cells * sizeof(hash_cell_t); + } + + OK(field_store_string(fields[INT_HASH_TABLES_NAME], + "Adaptive hash index")); + OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL], + btr_search_sys_variable + btr_search_sys_constant)); + OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT], + btr_search_sys_constant)); + OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE], + btr_search_sys_variable)); + OK(schema_table_store_record(thd, table)); + + { + OK(field_store_string(fields[INT_HASH_TABLES_NAME], + "Page hash (buffer pool 0 only)")); + OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL], + (ulong) (buf_pool_from_array(0)->page_hash->n_cells * sizeof(hash_cell_t)))); + OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT], + (ulong) (buf_pool_from_array(0)->page_hash->n_cells * sizeof(hash_cell_t)))); + OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE], 0)); + OK(schema_table_store_record(thd, table)); + + } + + if (dict_sys) + { + OK(field_store_string(fields[INT_HASH_TABLES_NAME], + "Dictionary Cache")); + OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL], + ((dict_sys->table_hash->n_cells + + dict_sys->table_id_hash->n_cells + ) * sizeof(hash_cell_t) + + dict_sys->size))); + OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT], + ((dict_sys->table_hash->n_cells + + dict_sys->table_id_hash->n_cells + ) * sizeof(hash_cell_t)))); + OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE], + dict_sys->size)); + OK(schema_table_store_record(thd, table)); + } + + { + OK(field_store_string(fields[INT_HASH_TABLES_NAME], + "File system")); + OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL], + (ulong) (fil_system_hash_cells() + * sizeof(hash_cell_t) + + fil_system_hash_nodes()))); + OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT], + (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)))); + OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE], + (ulong) fil_system_hash_nodes())); + OK(schema_table_store_record(thd, table)); + + } + + { + ulint lock_sys_constant, lock_sys_variable; + + trx_i_s_get_lock_sys_memory_usage(&lock_sys_constant, + &lock_sys_variable); + + OK(field_store_string(fields[INT_HASH_TABLES_NAME], "Lock System")); + OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL], + lock_sys_constant + lock_sys_variable)); + OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT], + lock_sys_constant)); + OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE], + lock_sys_variable)); + OK(schema_table_store_record(thd, table)); + } + + if (recv_sys) + { + ulint recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash) + ? mem_heap_get_size(recv_sys->heap) : 0); + + OK(field_store_string(fields[INT_HASH_TABLES_NAME], "Recovery System")); + OK(field_store_ulint(fields[INT_HASH_TABLES_TOTAL], + ((recv_sys->addr_hash) ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0) + recv_sys_subtotal)); + OK(field_store_ulint(fields[INT_HASH_TABLES_CONSTANT], + ((recv_sys->addr_hash) ? 
(recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0))); + OK(field_store_ulint(fields[INT_HASH_TABLES_VARIABLE], + recv_sys_subtotal)); + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +static int xtradb_internal_hash_tables_init(void* p) +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("xtradb_internal_hash_tables_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = xtradb_internal_hash_tables_fields_info; + schema->fill_table = xtradb_internal_hash_tables_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_xtradb_internal_hash_tables = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "XTRADB_INTERNAL_HASH_TABLES"), + STRUCT_FLD(author, PLUGIN_AUTHOR), + STRUCT_FLD(descr, "InnoDB internal hash tables information"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, xtradb_internal_hash_tables_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, INNODB_VERSION_SHORT), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + STRUCT_FLD(__reserved1, NULL), + STRUCT_FLD(flags, 0UL), +}; + + +/*********************************************************************** +*/ +static ST_FIELD_INFO i_s_xtradb_rseg_fields_info[] = +{ + {STRUCT_FLD(field_name, "rseg_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "space_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "zip_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "page_no"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "max_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "curr_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static +int +i_s_xtradb_rseg_fill( +/*=================*/ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + Item* ) /* in: condition (ignored) */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + trx_rseg_t* rseg; + + DBUG_ENTER("i_s_xtradb_rseg_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + for(int i=0; i < 
TRX_SYS_N_RSEGS; i++) + { + rseg = trx_sys->rseg_array[i]; + if (!rseg) + continue; + + table->field[0]->store(rseg->id); + table->field[1]->store(rseg->space); + table->field[2]->store(rseg->zip_size); + table->field[3]->store(rseg->page_no); + table->field[4]->store(rseg->max_size); + table->field[5]->store(rseg->curr_size); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + } + + DBUG_RETURN(status); +} + +static +int +i_s_xtradb_rseg_init( +/*=================*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_xtradb_rseg_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_xtradb_rseg_fields_info; + schema->fill_table = i_s_xtradb_rseg_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_xtradb_rseg = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "XTRADB_RSEG"), + STRUCT_FLD(author, PLUGIN_AUTHOR), + STRUCT_FLD(descr, "InnoDB rollback segment information"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, i_s_xtradb_rseg_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, INNODB_VERSION_SHORT), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + STRUCT_FLD(__reserved1, NULL), + STRUCT_FLD(flags, 0UL), +}; diff --git a/storage/xtradb/handler/xtradb_i_s.h b/storage/xtradb/handler/xtradb_i_s.h new file mode 100644 index 00000000000..2f7552c565a --- /dev/null +++ b/storage/xtradb/handler/xtradb_i_s.h @@ -0,0 +1,26 @@ +/***************************************************************************** + +Copyright (c) 2010-2012, Percona Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef XTRADB_I_S_H +#define XTRADB_I_S_H + +extern struct st_mysql_plugin i_s_xtradb_read_view; +extern struct st_mysql_plugin i_s_xtradb_internal_hash_tables; +extern struct st_mysql_plugin i_s_xtradb_rseg; + +#endif /* XTRADB_I_S_H */ diff --git a/storage/xtradb/ibuf/ibuf0ibuf.cc b/storage/xtradb/ibuf/ibuf0ibuf.cc new file mode 100644 index 00000000000..ef6c9c74558 --- /dev/null +++ b/storage/xtradb/ibuf/ibuf0ibuf.cc @@ -0,0 +1,5269 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.cc
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+UNIV_INTERN my_bool srv_ibuf_disable_background_merge;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE 4
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE must be an even number!"
+#endif
+/** The start address for an insert buffer bitmap page bitmap */
+#define IBUF_BITMAP PAGE_DATA
+
+#ifdef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "row0upd.h"
+#include "sync0sync.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+#include "srv0start.h" /* srv_shutdown_state */
+#include "ha_prototypes.h"
+#include "rem0cmp.h"
+
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+ field. We store the information which affects the ordering of records, and
+ also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+ is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990's, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+ the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+ store the charset. In the compressed table format of 5.0.x we must add more
+ information here so that we can build a dummy 'index' struct which 5.0.x
+ can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format. The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field.
+
+In versions >= 5.5:
+
+The optional marker byte at the start of the fourth field is replaced by
+3 mandatory fields, totaling 4 bytes:
+
+ 1. 2 bytes: Counter field, used to sort records within a (space id, page
+ no) in the order they were added. This is needed so that for example the
+ sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
+ correctly.
+
+ 2. 1 byte: Operation type (see ibuf_op_t).
+
+ 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
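+
+ As an illustration (example values; this assumes the ibuf_op_t encoding
+ where IBUF_OP_INSERT == 0), the 4-byte info prefix of a buffered INSERT
+ of a compact-format record with counter 7 would be
+
+ 00 07 | 00 | 01
+
+ i.e. counter = 7, type = IBUF_OP_INSERT, flags = IBUF_REC_COMPACT.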
+
+To ensure older records, which do not have counters to enforce correct
+sorting, are merged before any new records, ibuf_insert checks if we're
+trying to insert to a position that contains old-style records, and if so,
+refuses the insert. Thus, ibuf pages are gradually converted to the new
+format as their corresponding buffer pool pages are read into memory.
+*/
+
+
+/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access the insert
+buffer tree recursively and thus obeys the latching order. On the other hand,
+other i/o-handlers for other tablespaces may require access to the insert
+buffer, but because all the latches they need there are later in the
+latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive use of the
+insert buffer when adding pages to the tree, we must first load these pages
+to memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. Removing pages from the free list is
+more difficult. If there is an excess of pages in the free list of the
+ibuf tree, they might be needed if some thread reserves the fsp latch,
+intending to allocate more file space. So we do the following: if a thread
+reserves the fsp latch, we check the writer count field of the latch. If
+this field has value 1, it means that the thread did not own the latch
+before entering the fsp system, and the mtr of the thread contains no
+modifications to the fsp pages. Now we are free to reserve the ibuf latch,
+and check if there is an excess of pages in the free list.
We can then, in a +separate mini-transaction, take them out of the free list and free them to +the fsp system. + +To avoid deadlocks in the ibuf system, we divide file pages into three levels: + +(1) non-ibuf pages, +(2) ibuf tree pages and the pages in the ibuf tree free list, and +(3) ibuf bitmap pages. + +No OS thread is allowed to access higher level pages if it has latches to +lower level pages; even if the thread owns a B-tree latch it must not access +the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead +is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle +exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively +level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e., +it uses synchronous aio, it can access any pages, as long as it obeys the +access order rules. */ + +/** Table name for the insert buffer. */ +#define IBUF_TABLE_NAME "SYS_IBUF_TABLE" + +/** Operations that can currently be buffered. */ +UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_ALL; + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Flag to control insert buffer debugging. */ +UNIV_INTERN uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +/** The insert buffer control structure */ +UNIV_INTERN ibuf_t* ibuf = NULL; + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +UNIV_INTERN mysql_pfs_key_t ibuf_mutex_key; +UNIV_INTERN mysql_pfs_key_t ibuf_bitmap_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_IBUF_COUNT_DEBUG +/** Number of tablespaces in the ibuf_counts array */ +#define IBUF_COUNT_N_SPACES 4 +/** Number of pages within each tablespace in the ibuf_counts array */ +#define IBUF_COUNT_N_PAGES 130000 + +/** Buffered entry counts for file pages, used in debugging */ +static ulint ibuf_counts[IBUF_COUNT_N_SPACES][IBUF_COUNT_N_PAGES]; + +/******************************************************************//** +Checks that the indexes to ibuf_counts[][] are within limits. */ +UNIV_INLINE +void +ibuf_count_check( +/*=============*/ + ulint space_id, /*!< in: space identifier */ + ulint page_no) /*!< in: page number */ +{ + if (space_id < IBUF_COUNT_N_SPACES && page_no < IBUF_COUNT_N_PAGES) { + return; + } + + fprintf(stderr, + "InnoDB: UNIV_IBUF_COUNT_DEBUG limits space_id and page_no\n" + "InnoDB: and breaks crash recovery.\n" + "InnoDB: space_id=%lu, should be 0<=space_id<%lu\n" + "InnoDB: page_no=%lu, should be 0<=page_no<%lu\n", + (ulint) space_id, (ulint) IBUF_COUNT_N_SPACES, + (ulint) page_no, (ulint) IBUF_COUNT_N_PAGES); + ut_error; +} +#endif + +/** @name Offsets to the per-page bits in the insert buffer bitmap */ +/* @{ */ +#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the + amount of free space */ +#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered + changes for the page */ +#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of + the ibuf tree, excluding the + root page, or is in the free + list of the ibuf */ +/* @} */ + +#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format, + the page number. later, the space_id */ +#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker + consisting of 1 byte that is 0 */ +#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the + page number */ +#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */ +#define IBUF_REC_FIELD_USER 4 /* first user field */ + +/* Various constants for checking the type of an ibuf record and extracting +data from it. 
For details, see the description of the record format at the +top of this file. */ + +/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record +The fourth column in the MySQL 5.5 format contains an operation +type, counter, and some flags. */ +/* @{ */ +#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at + the beginning of the fourth field */ +#if IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +# error "IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + +/* Offsets for the fields at the beginning of the fourth field */ +#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */ +#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */ +#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */ + +/* Record flag masks */ +#define IBUF_REC_COMPACT 0x1 /*!< Set in + IBUF_REC_OFFSET_FLAGS if the + user index is in COMPACT + format or later */ + + +/** The mutex used to block pessimistic inserts to ibuf trees */ +static ib_mutex_t ibuf_pessimistic_insert_mutex; + +/** The mutex protecting the insert buffer structs */ +static ib_mutex_t ibuf_mutex; + +/** The mutex protecting the insert buffer bitmaps */ +static ib_mutex_t ibuf_bitmap_mutex; + +/** The area in pages from which contract looks for page numbers for merge */ +#define IBUF_MERGE_AREA 8UL + +/** Inside the merge area, pages which have at most 1 per this number less +buffered entries compared to maximum volume that can buffered for a single +page are merged along with the page whose buffer became full */ +#define IBUF_MERGE_THRESHOLD 4 + +/** In ibuf_contract at most this number of pages is read to memory in one +batch, in order to merge the entries for them in the insert buffer */ +#define IBUF_MAX_N_PAGES_MERGED IBUF_MERGE_AREA + +/** If the combined size of the ibuf trees exceeds ibuf->max_size by this +many pages, we start to contract it in connection to inserts there, using +non-synchronous contract */ +#define IBUF_CONTRACT_ON_INSERT_NON_SYNC 0 + +/** If the combined size of the ibuf trees exceeds ibuf->max_size by this +many pages, we start to contract it in connection to inserts there, using +synchronous contract */ +#define IBUF_CONTRACT_ON_INSERT_SYNC 5 + +/** If the combined size of the ibuf trees exceeds ibuf->max_size by +this many pages, we start to contract it synchronous contract, but do +not insert */ +#define IBUF_CONTRACT_DO_NOT_INSERT 10 + +/* TODO: how to cope with drop table if there are records in the insert +buffer for the indexes of the table? Is there actually any problem, +because ibuf merge is done to a page when it is read in, and it is +still physically like the index page even if the index would have been +dropped! So, there seems to be no problem. */ + +/******************************************************************//** +Sets the flag in the current mini-transaction record indicating we're +inside an insert buffer routine. */ +UNIV_INLINE +void +ibuf_enter( +/*=======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(!mtr->inside_ibuf); + mtr->inside_ibuf = TRUE; +} + +/******************************************************************//** +Sets the flag in the current mini-transaction record indicating we're +exiting an insert buffer routine. 
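+Pairs with ibuf_enter() above; the debug assertions enforce correct nesting.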
*/ +UNIV_INLINE +void +ibuf_exit( +/*======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->inside_ibuf); + mtr->inside_ibuf = FALSE; +} + +/**************************************************************//** +Commits an insert buffer mini-transaction and sets the persistent +cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */ +UNIV_INLINE +void +ibuf_btr_pcur_commit_specify_mtr( +/*=============================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_d(ibuf_exit(mtr)); + btr_pcur_commit_specify_mtr(pcur, mtr); +} + +/******************************************************************//** +Gets the ibuf header page and x-latches it. +@return insert buffer header page */ +static +page_t* +ibuf_header_page_get( +/*=================*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block; + + ut_ad(!ibuf_inside(mtr)); + + block = buf_page_get( + IBUF_SPACE_ID, 0, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_IBUF_HEADER); + + return(buf_block_get_frame(block)); +} + +/******************************************************************//** +Gets the root page and x-latches it. +@return insert buffer tree root page */ +static +page_t* +ibuf_tree_root_get( +/*===============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* root; + + ut_ad(ibuf_inside(mtr)); + ut_ad(mutex_own(&ibuf_mutex)); + + mtr_x_lock(dict_index_get_lock(ibuf->index), mtr); + + block = buf_page_get( + IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); + + root = buf_block_get_frame(block); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO); + ut_ad(ibuf->empty == page_is_empty(root)); + + return(root); +} + +#ifdef UNIV_IBUF_COUNT_DEBUG +/******************************************************************//** +Gets the ibuf count for a given page. +@return number of entries in the insert buffer currently buffered for +this page */ +UNIV_INTERN +ulint +ibuf_count_get( +/*===========*/ + ulint space, /*!< in: space id */ + ulint page_no)/*!< in: page number */ +{ + ibuf_count_check(space, page_no); + + return(ibuf_counts[space][page_no]); +} + +/******************************************************************//** +Sets the ibuf count for a given page. */ +static +void +ibuf_count_set( +/*===========*/ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: page number */ + ulint val) /*!< in: value to set */ +{ + ibuf_count_check(space, page_no); + ut_a(val < UNIV_PAGE_SIZE); + + ibuf_counts[space][page_no] = val; +} +#endif + +/******************************************************************//** +Closes insert buffer and frees the data structures. 
*/ +UNIV_INTERN +void +ibuf_close(void) +/*============*/ +{ + mutex_free(&ibuf_pessimistic_insert_mutex); + memset(&ibuf_pessimistic_insert_mutex, + 0x0, sizeof(ibuf_pessimistic_insert_mutex)); + + mutex_free(&ibuf_mutex); + memset(&ibuf_mutex, 0x0, sizeof(ibuf_mutex)); + + mutex_free(&ibuf_bitmap_mutex); + memset(&ibuf_bitmap_mutex, 0x0, sizeof(ibuf_mutex)); + + mem_free(ibuf); + ibuf = NULL; +} + +/******************************************************************//** +Function to pass ibuf status variables */ +UNIV_INTERN +void +ibuf_export_ibuf_status( +/*====================*/ + ulint* size, + ulint* free_list, + ulint* segment_size, + ulint* merges, + ulint* merged_inserts, + ulint* merged_delete_marks, + ulint* merged_deletes, + ulint* discarded_inserts, + ulint* discarded_delete_marks, + ulint* discarded_deletes) +{ + *size + = ibuf->size; + *free_list + = ibuf->free_list_len; + *segment_size + = ibuf->seg_size; + *merges + = ibuf->n_merges; + *merged_inserts + = ibuf->n_merged_ops[IBUF_OP_INSERT]; + *merged_delete_marks + = ibuf->n_merged_ops[IBUF_OP_DELETE_MARK]; + *merged_deletes + = ibuf->n_merged_ops[IBUF_OP_DELETE]; + *discarded_inserts + = ibuf->n_discarded_ops[IBUF_OP_INSERT]; + *discarded_delete_marks + = ibuf->n_discarded_ops[IBUF_OP_DELETE_MARK]; + *discarded_deletes + = ibuf->n_discarded_ops[IBUF_OP_DELETE]; +} + +/******************************************************************//** +Updates the size information of the ibuf, assuming the segment size has not +changed. */ +static +void +ibuf_size_update( +/*=============*/ + const page_t* root, /*!< in: ibuf tree root */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mutex_own(&ibuf_mutex)); + + ibuf->free_list_len = flst_get_len(root + PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST, mtr); + + ibuf->height = 1 + btr_page_get_level(root, mtr); + + /* the '1 +' is the ibuf header page */ + ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len); +} + +/******************************************************************//** +Creates the insert buffer data structure at a database startup and initializes +the data structures for the insert buffer. */ +UNIV_INTERN +void +ibuf_init_at_db_start(void) +/*=======================*/ +{ + page_t* root; + mtr_t mtr; + dict_table_t* table; + mem_heap_t* heap; + dict_index_t* index; + ulint n_used; + page_t* header_page; + dberr_t error; + + ibuf = static_cast<ibuf_t*>(mem_zalloc(sizeof(ibuf_t))); + + /* At startup we intialize ibuf to have a maximum of + CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the + buffer pool size. Once ibuf struct is initialized this + value is updated with the user supplied size by calling + ibuf_max_size_update(). 
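+ For example, assuming 16KB pages and the default
+ CHANGE_BUFFER_DEFAULT_SIZE of 25 (percent), a 1GB buffer pool
+ has 65536 pages, so max_size starts out at 16384 pages.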
*/ + ibuf->max_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE) + * CHANGE_BUFFER_DEFAULT_SIZE) / 100; + + mutex_create(ibuf_pessimistic_insert_mutex_key, + &ibuf_pessimistic_insert_mutex, + SYNC_IBUF_PESS_INSERT_MUTEX); + + mutex_create(ibuf_mutex_key, + &ibuf_mutex, SYNC_IBUF_MUTEX); + + mutex_create(ibuf_bitmap_mutex_key, + &ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX); + + mtr_start(&mtr); + + mutex_enter(&ibuf_mutex); + + mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, NULL), &mtr); + + header_page = ibuf_header_page_get(&mtr); + + fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + &n_used, &mtr); + ibuf_enter(&mtr); + + ut_ad(n_used >= 2); + + ibuf->seg_size = n_used; + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + root = buf_block_get_frame(block); + } + + ibuf_size_update(root, &mtr); + mutex_exit(&ibuf_mutex); + + ibuf->empty = page_is_empty(root); + ibuf_mtr_commit(&mtr); + + heap = mem_heap_create(450); + + /* Use old-style record format for the insert buffer. */ + table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0, 0, + false); + + dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0); + + table->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; + + dict_table_add_to_cache(table, FALSE, heap); + mem_heap_free(heap); + + index = dict_mem_index_create( + IBUF_TABLE_NAME, "CLUST_IND", + IBUF_SPACE_ID, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, 1); + + dict_mem_index_add_field(index, "DUMMY_COLUMN", 0); + + index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; + btr_search_index_init(index); + + error = dict_index_add_to_cache(table, index, + FSP_IBUF_TREE_ROOT_PAGE_NO, FALSE); + ut_a(error == DB_SUCCESS); + + ibuf->index = dict_table_get_first_index(table); +} + +/*********************************************************************//** +Updates the max_size value for ibuf. */ +UNIV_INTERN +void +ibuf_max_size_update( +/*=================*/ + ulint new_val) /*!< in: new value in terms of + percentage of the buffer pool size */ +{ + ulint new_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE) + * new_val) / 100; + mutex_enter(&ibuf_mutex); + ibuf->max_size = new_size; + mutex_exit(&ibuf_mutex); +} + + +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Initializes an ibuf bitmap page. */ +UNIV_INTERN +void +ibuf_bitmap_page_init( +/*==================*/ + buf_block_t* block, /*!< in: bitmap page */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + ulint byte_offset; + ulint zip_size = buf_block_get_zip_size(block); + + ut_a(ut_is_2pow(zip_size)); + + page = buf_block_get_frame(block); + fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP); + + /* Write all zeros to the bitmap */ + + if (!zip_size) { + byte_offset = UT_BITS_IN_BYTES(UNIV_PAGE_SIZE + * IBUF_BITS_PER_PAGE); + } else { + byte_offset = UT_BITS_IN_BYTES(zip_size * IBUF_BITS_PER_PAGE); + } + + memset(page + IBUF_BITMAP, 0, byte_offset); + + /* The remaining area (up to the page trailer) is uninitialized. */ + +#ifndef UNIV_HOTBACKUP + mlog_write_initial_log_record(page, MLOG_IBUF_BITMAP_INIT, mtr); +#endif /* !UNIV_HOTBACKUP */ +} + +/*********************************************************************//** +Parses a redo log record of an ibuf bitmap page init. 
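+The MLOG_IBUF_BITMAP_INIT record has no payload, so ptr is returned unchanged.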
+@return end of log record or NULL */ +UNIV_INTERN +byte* +ibuf_parse_bitmap_init( +/*===================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), /*!< in: buffer end */ + buf_block_t* block, /*!< in: block or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + if (block) { + ibuf_bitmap_page_init(block, mtr); + } + + return(ptr); +} +#ifndef UNIV_HOTBACKUP +# ifdef UNIV_DEBUG +/** Gets the desired bits for a given page from a bitmap page. +@param page in: bitmap page +@param offset in: page whose bits to get +@param zs in: compressed page size in bytes; 0 for uncompressed pages +@param bit in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param mtr in: mini-transaction holding an x-latch on the bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, offset, zs, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, offset, zs, \ + MTR_MEMO_PAGE_X_FIX, mtr, bit) +# else /* UNIV_DEBUG */ +/** Gets the desired bits for a given page from a bitmap page. +@param page in: bitmap page +@param offset in: page whose bits to get +@param zs in: compressed page size in bytes; 0 for uncompressed pages +@param bit in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param mtr in: mini-transaction holding an x-latch on the bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, offset, zs, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, offset, zs, bit) +# endif /* UNIV_DEBUG */ + +/********************************************************************//** +Gets the desired bits for a given page from a bitmap page. +@return value of bits */ +UNIV_INLINE +ulint +ibuf_bitmap_page_get_bits_low( +/*==========================*/ + const page_t* page, /*!< in: bitmap page */ + ulint page_no,/*!< in: page whose bits to get */ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ +#ifdef UNIV_DEBUG + ulint latch_type, + /*!< in: MTR_MEMO_PAGE_X_FIX, + MTR_MEMO_BUF_FIX, ... */ + mtr_t* mtr, /*!< in: mini-transaction holding latch_type + on the bitmap page */ +#endif /* UNIV_DEBUG */ + ulint bit) /*!< in: IBUF_BITMAP_FREE, + IBUF_BITMAP_BUFFERED, ... */ +{ + ulint byte_offset; + ulint bit_offset; + ulint map_byte; + ulint value; + + ut_ad(bit < IBUF_BITS_PER_PAGE); +#if IBUF_BITS_PER_PAGE % 2 +# error "IBUF_BITS_PER_PAGE % 2 != 0" +#endif + ut_ad(ut_is_2pow(zip_size)); + ut_ad(mtr_memo_contains_page(mtr, page, latch_type)); + + if (!zip_size) { + bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE + + bit; + } else { + bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE + + bit; + } + + byte_offset = bit_offset / 8; + bit_offset = bit_offset % 8; + + ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE); + + map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset); + + value = ut_bit_get_nth(map_byte, bit_offset); + + if (bit == IBUF_BITMAP_FREE) { + ut_ad(bit_offset + 1 < 8); + + value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1); + } + + return(value); +} + +/********************************************************************//** +Sets the desired bit for a given page in a bitmap page. */ +static +void +ibuf_bitmap_page_set_bits( +/*======================*/ + page_t* page, /*!< in: bitmap page */ + ulint page_no,/*!< in: page whose bits to set */ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint bit, /*!< in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... 
*/ + ulint val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr containing an x-latch to the bitmap page */ +{ + ulint byte_offset; + ulint bit_offset; + ulint map_byte; + + ut_ad(bit < IBUF_BITS_PER_PAGE); +#if IBUF_BITS_PER_PAGE % 2 +# error "IBUF_BITS_PER_PAGE % 2 != 0" +#endif + ut_ad(ut_is_2pow(zip_size)); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a((bit != IBUF_BITMAP_BUFFERED) || (val != FALSE) + || (0 == ibuf_count_get(page_get_space_id(page), + page_no))); +#endif + if (!zip_size) { + bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE + + bit; + } else { + bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE + + bit; + } + + byte_offset = bit_offset / 8; + bit_offset = bit_offset % 8; + + ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE); + + map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset); + + if (bit == IBUF_BITMAP_FREE) { + ut_ad(bit_offset + 1 < 8); + ut_ad(val <= 3); + + map_byte = ut_bit_set_nth(map_byte, bit_offset, val / 2); + map_byte = ut_bit_set_nth(map_byte, bit_offset + 1, val % 2); + } else { + ut_ad(val <= 1); + map_byte = ut_bit_set_nth(map_byte, bit_offset, val); + } + + mlog_write_ulint(page + IBUF_BITMAP + byte_offset, map_byte, + MLOG_1BYTE, mtr); +} + +/********************************************************************//** +Calculates the bitmap page number for a given page number. +@return the bitmap page number where the file page is mapped */ +UNIV_INLINE +ulint +ibuf_bitmap_page_no_calc( +/*=====================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no) /*!< in: tablespace page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return(FSP_IBUF_BITMAP_OFFSET + + (page_no & ~(UNIV_PAGE_SIZE - 1))); + } else { + return(FSP_IBUF_BITMAP_OFFSET + + (page_no & ~(zip_size - 1))); + } +} + +/********************************************************************//** +Gets the ibuf bitmap page where the bits describing a given file page are +stored. +@return bitmap page where the file page is mapped, that is, the bitmap +page containing the descriptor bits for the file page; the bitmap page +is x-latched */ +static +page_t* +ibuf_bitmap_get_map_page_func( +/*==========================*/ + ulint space, /*!< in: space id of the file page */ + ulint page_no,/*!< in: page number of the file page */ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + + block = buf_page_get_gen(space, zip_size, + ibuf_bitmap_page_no_calc(zip_size, page_no), + RW_X_LATCH, NULL, BUF_GET, + file, line, mtr); + buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP); + + return(buf_block_get_frame(block)); +} + +/********************************************************************//** +Gets the ibuf bitmap page where the bits describing a given file page are +stored. 
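+For example, assuming 16KB uncompressed pages, page 70000 is described on
+the bitmap page numbered FSP_IBUF_BITMAP_OFFSET + (70000 & ~16383), that is,
+FSP_IBUF_BITMAP_OFFSET + 65536 (see ibuf_bitmap_page_no_calc() above).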
+@return bitmap page where the file page is mapped, that is, the bitmap +page containing the descriptor bits for the file page; the bitmap page +is x-latched +@param space in: space id of the file page +@param page_no in: page number of the file page +@param zip_size in: compressed page size in bytes; 0 for uncompressed pages +@param mtr in: mini-transaction */ +#define ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr) \ + ibuf_bitmap_get_map_page_func(space, page_no, zip_size, \ + __FILE__, __LINE__, mtr) + +/************************************************************************//** +Sets the free bits of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. */ +UNIV_INLINE +void +ibuf_set_free_bits_low( +/*===================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + const buf_block_t* block, /*!< in: index page; free bits are set if + the index is non-clustered and page + level is 0 */ + ulint val, /*!< in: value to set: < 4 */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + page_t* bitmap_page; + ulint space; + ulint page_no; + + if (!page_is_leaf(buf_block_get_frame(block))) { + + return; + } + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); +#ifdef UNIV_IBUF_DEBUG +# if 0 + fprintf(stderr, + "Setting space %lu page %lu free bits to %lu should be %lu\n", + space, page_no, val, + ibuf_index_page_calc_free(zip_size, block)); +# endif + + ut_a(val <= ibuf_index_page_calc_free(zip_size, block)); +#endif /* UNIV_IBUF_DEBUG */ + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, val, mtr); +} + +/************************************************************************//** +Sets the free bit of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. 
*/ +UNIV_INTERN +void +ibuf_set_free_bits_func( +/*====================*/ + buf_block_t* block, /*!< in: index page of a non-clustered index; + free bit is reset if page level is 0 */ +#ifdef UNIV_IBUF_DEBUG + ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum + value which the bits must have before + setting; this is for debugging */ +#endif /* UNIV_IBUF_DEBUG */ + ulint val) /*!< in: value to set: < 4 */ +{ + mtr_t mtr; + page_t* page; + page_t* bitmap_page; + ulint space; + ulint page_no; + ulint zip_size; + + page = buf_block_get_frame(block); + + if (!page_is_leaf(page)) { + + return; + } + + mtr_start(&mtr); + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + zip_size = buf_block_get_zip_size(block); + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr); + +#ifdef UNIV_IBUF_DEBUG + if (max_val != ULINT_UNDEFINED) { + ulint old_val; + + old_val = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, &mtr); +# if 0 + if (old_val != max_val) { + fprintf(stderr, + "Ibuf: page %lu old val %lu max val %lu\n", + page_get_page_no(page), + old_val, max_val); + } +# endif + + ut_a(old_val <= max_val); + } +# if 0 + fprintf(stderr, "Setting page no %lu free bits to %lu should be %lu\n", + page_get_page_no(page), val, + ibuf_index_page_calc_free(zip_size, block)); +# endif + + ut_a(val <= ibuf_index_page_calc_free(zip_size, block)); +#endif /* UNIV_IBUF_DEBUG */ + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, val, &mtr); + mtr_commit(&mtr); +} + +/************************************************************************//** +Resets the free bits of the page in the ibuf bitmap. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to decrement or reset the bits in the bitmap in a mini-transaction +that is committed before the mini-transaction that affects the free +space. */ +UNIV_INTERN +void +ibuf_reset_free_bits( +/*=================*/ + buf_block_t* block) /*!< in: index page; free bits are set to 0 + if the index is a non-clustered + non-unique, and page level is 0 */ +{ + ibuf_set_free_bits(block, 0, ULINT_UNDEFINED); +} + +/**********************************************************************//** +Updates the free bits for an uncompressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_low( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + ulint max_ins_size, /*!< in: value of + maximum insert size + with reorganize before + the latest operation + performed to the page */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ulint before; + ulint after; + + ut_a(!buf_block_get_page_zip(block)); + + before = ibuf_index_page_calc_free_bits(0, max_ins_size); + + after = ibuf_index_page_calc_free(0, block); + + /* This approach cannot be used on compressed pages, since the + computed value of "before" often does not match the current + state of the bitmap. 
This is because the free space may + increase or decrease when a compressed page is reorganized. */ + if (before != after) { + ibuf_set_free_bits_low(0, block, after, mtr); + } +} + +/**********************************************************************//** +Updates the free bits for a compressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_zip( +/*======================*/ + buf_block_t* block, /*!< in/out: index page */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + page_t* bitmap_page; + ulint space; + ulint page_no; + ulint zip_size; + ulint after; + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + zip_size = buf_block_get_zip_size(block); + + ut_a(page_is_leaf(buf_block_get_frame(block))); + ut_a(zip_size); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); + + after = ibuf_index_page_calc_free_zip(zip_size, block); + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, after, mtr); +} + +/**********************************************************************//** +Updates the free bits for the two pages to reflect the present state. +Does this in the mtr given, which means that the latching order rules +virtually prevent any further operations until mtr is committed. +NOTE: The free bits in the insert buffer bitmap must never exceed the +free space on a page. It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +UNIV_INTERN +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint state; + + /* As we have to x-latch two random bitmap pages, we have to acquire + the bitmap mutex to prevent a deadlock with a similar operation + performed by another OS thread. */ + + mutex_enter(&ibuf_bitmap_mutex); + + state = ibuf_index_page_calc_free(zip_size, block1); + + ibuf_set_free_bits_low(zip_size, block1, state, mtr); + + state = ibuf_index_page_calc_free(zip_size, block2); + + ibuf_set_free_bits_low(zip_size, block2, state, mtr); + + mutex_exit(&ibuf_bitmap_mutex); +} + +/**********************************************************************//** +Returns TRUE if the page is one of the fixed address ibuf pages. 
+@return TRUE if a fixed address ibuf i/o page */ +UNIV_INLINE +ibool +ibuf_fixed_addr_page( +/*=================*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/*!< in: page number */ +{ + return((space == IBUF_SPACE_ID && page_no == IBUF_TREE_ROOT_PAGE_NO) + || ibuf_bitmap_page(zip_size, page_no)); +} + +/***********************************************************************//** +Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==TRUE. +@return TRUE if level 2 or level 3 page */ +UNIV_INTERN +ibool +ibuf_page_low( +/*==========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint page_no,/*!< in: page number */ +#ifdef UNIV_DEBUG + ibool x_latch,/*!< in: FALSE if relaxed check + (avoid latching the bitmap page) */ +#endif /* UNIV_DEBUG */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr which will contain an + x-latch to the bitmap page if the page + is not one of the fixed address ibuf + pages, or NULL, in which case a new + transaction is created. */ +{ + ibool ret; + mtr_t local_mtr; + page_t* bitmap_page; + + ut_ad(!recv_no_ibuf_operations); + ut_ad(x_latch || mtr == NULL); + + if (ibuf_fixed_addr_page(space, zip_size, page_no)) { + + return(TRUE); + } else if (space != IBUF_SPACE_ID) { + + return(FALSE); + } + + ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TABLESPACE); + +#ifdef UNIV_DEBUG + if (!x_latch) { + mtr_start(&local_mtr); + + /* Get the bitmap page without a page latch, so that + we will not be violating the latching order when + another bitmap page has already been latched by this + thread. The page will be buffer-fixed, and thus it + cannot be removed or relocated while we are looking at + it. The contents of the page could change, but the + IBUF_BITMAP_IBUF bit that we are interested in should + not be modified by any other thread. Nobody should be + calling ibuf_add_free_page() or ibuf_remove_free_page() + while the page is linked to the insert buffer b-tree. */ + + bitmap_page = buf_block_get_frame( + buf_page_get_gen( + space, zip_size, + ibuf_bitmap_page_no_calc(zip_size, page_no), + RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, + file, line, &local_mtr)); + + ret = ibuf_bitmap_page_get_bits_low( + bitmap_page, page_no, zip_size, + MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF); + + mtr_commit(&local_mtr); + return(ret); + } +#endif /* UNIV_DEBUG */ + + if (mtr == NULL) { + mtr = &local_mtr; + mtr_start(mtr); + } + + bitmap_page = ibuf_bitmap_get_map_page_func(space, page_no, zip_size, + file, line, mtr); + + ret = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_IBUF, mtr); + + if (mtr == &local_mtr) { + mtr_commit(mtr); + } + + return(ret); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec) +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the page number field of an ibuf record. 
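+In the >= 4.1.x record format the fields are ordered (space id, marker,
+page number, metadata, user fields); IBUF_REC_FIELD_PAGE is thus the
+third field. The value is stored big-endian, so the read below is
+roughly equivalent to (a sketch only):
+
+	page_no = ((ulint) field[0] << 24) | ((ulint) field[1] << 16)
+		| ((ulint) field[2] << 8) | ((ulint) field[3]);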
+@return page number */
+static
+ulint
+ibuf_rec_get_page_no_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	rec)	/*!< in: ibuf record */
+{
+	const byte*	field;
+	ulint		len;
+
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+	ut_ad(rec_get_n_fields_old(rec) > 2);
+
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+	ut_a(len == 1);
+
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
+
+	ut_a(len == 4);
+
+	return(mach_read_from_4(field));
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec)
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Returns the space id field of an ibuf record. For < 4.1.x format records
+returns 0.
+@return space id */
+static
+ulint
+ibuf_rec_get_space_func(
+/*====================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	rec)	/*!< in: ibuf record */
+{
+	const byte*	field;
+	ulint		len;
+
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+	ut_ad(rec_get_n_fields_old(rec) > 2);
+
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+	ut_a(len == 1);
+
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
+	ut_a(len == 4);
+
+	return(mach_read_from_4(field));
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
+	ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
+	ibuf_rec_get_info_func(rec,op,comp,info_len,counter)
+#endif
+/****************************************************************//**
+Get various information about an ibuf record in >= 4.1.x format. */
+static
+void
+ibuf_rec_get_info_func(
+/*===================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	rec,		/*!< in: ibuf record */
+	ibuf_op_t*	op,		/*!< out: operation type, or NULL */
+	ibool*		comp,		/*!< out: compact flag, or NULL */
+	ulint*		info_len,	/*!< out: length of info fields at the
+					start of the fourth field, or
+					NULL */
+	ulint*		counter)	/*!< out: counter value, or NULL */
+{
+	const byte*	types;
+	ulint		fields;
+	ulint		len;
+
+	/* Local variables to shadow arguments.
*/ + ibuf_op_t op_local; + ibool comp_local; + ulint info_len_local; + ulint counter_local; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + fields = rec_get_n_fields_old(rec); + ut_a(fields > IBUF_REC_FIELD_USER); + + types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + + switch (info_len_local) { + case 0: + case 1: + op_local = IBUF_OP_INSERT; + comp_local = info_len_local; + ut_ad(!counter); + counter_local = ULINT_UNDEFINED; + break; + + case IBUF_REC_INFO_SIZE: + op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE]; + comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT; + counter_local = mach_read_from_2( + types + IBUF_REC_OFFSET_COUNTER); + break; + + default: + ut_error; + } + + ut_a(op_local < IBUF_OP_COUNT); + ut_a((len - info_len_local) == + (fields - IBUF_REC_FIELD_USER) + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + if (op) { + *op = op_local; + } + + if (comp) { + *comp = comp_local; + } + + if (info_len) { + *info_len = info_len_local; + } + + if (counter) { + *counter = counter_local; + } +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec) +#endif + +/****************************************************************//** +Returns the operation type field of an ibuf record. +@return operation type */ +static +ibuf_op_t +ibuf_rec_get_op_type_func( +/*======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + ulint len; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + (void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + if (len > 1) { + /* This is a < 4.1.x format record */ + + return(IBUF_OP_INSERT); + } else { + ibuf_op_t op; + + ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL); + + return(op); + } +} + +/****************************************************************//** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it +can't be read */ +UNIV_INTERN +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* ptr; + ulint len; + + if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) { + + return(ULINT_UNDEFINED); + } + + ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + if (len >= 2) { + + return(mach_read_from_2(ptr)); + } else { + + return(ULINT_UNDEFINED); + } +} + +/****************************************************************//** +Add accumulated operation counts to a permanent array. Both arrays must be +of size IBUF_OP_COUNT. 
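+When atomic builtins are available the update is lock-free; on GCC-style
+compilers os_atomic_increment_ulint() is expected to reduce to the
+__sync_add_and_fetch() builtin (a sketch, assuming that mapping):
+
+	__sync_add_and_fetch(&arr[i], ops[i]);
+
+Without atomic builtins the caller must hold ibuf_mutex, which the debug
+assertion below checks.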
*/ +static +void +ibuf_add_ops( +/*=========*/ + ulint* arr, /*!< in/out: array to modify */ + const ulint* ops) /*!< in: operation counts */ + +{ + ulint i; + +#ifndef HAVE_ATOMIC_BUILTINS + ut_ad(mutex_own(&ibuf_mutex)); +#endif /* !HAVE_ATOMIC_BUILTINS */ + + for (i = 0; i < IBUF_OP_COUNT; i++) { +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_increment_ulint(&arr[i], ops[i]); +#else /* HAVE_ATOMIC_BUILTINS */ + arr[i] += ops[i]; +#endif /* HAVE_ATOMIC_BUILTINS */ + } +} + +/****************************************************************//** +Print operation counts. The array must be of size IBUF_OP_COUNT. */ +static +void +ibuf_print_ops( +/*===========*/ + const ulint* ops, /*!< in: operation counts */ + FILE* file) /*!< in: file where to print */ +{ + static const char* op_names[] = { + "insert", + "delete mark", + "delete" + }; + ulint i; + + ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT); + + for (i = 0; i < IBUF_OP_COUNT; i++) { + fprintf(file, "%s %lu%s", op_names[i], + (ulong) ops[i], (i < (IBUF_OP_COUNT - 1)) ? ", " : ""); + } + + putc('\n', file); +} + +/********************************************************************//** +Creates a dummy index for inserting a record to a non-clustered index. +@return dummy index */ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + ulint n, /*!< in: number of fields */ + ibool comp) /*!< in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + + table = dict_mem_table_create("IBUF_DUMMY", + DICT_HDR_SPACE, n, + comp ? DICT_TF_COMPACT : 0, 0, true); + + index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY", + DICT_HDR_SPACE, 0, n); + + index->table = table; + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + return(index); +} +/********************************************************************//** +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*=====================*/ + dict_index_t* index, /*!< in: dummy index */ + const dtype_t* type, /*!< in: the data type of the column */ + ulint len) /*!< in: length of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, NULL, NULL, + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type)); + dict_index_add_col(index, index->table, + dict_table_get_nth_col(index->table, i), len); +} +/********************************************************************//** +Deallocates a dummy index for inserting a record to a non-clustered index. */ +static +void +ibuf_dummy_index_free( +/*==================*/ + dict_index_t* index) /*!< in, own: dummy index */ +{ + dict_table_t* table = index->table; + + dict_mem_index_free(index); + dict_mem_table_free(table); +} + +#ifdef UNIV_DEBUG +# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \ + ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex) +#else /* UNIV_DEBUG */ +# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \ + ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex) +#endif + +/*********************************************************************//** +Builds the entry used to + +1) IBUF_OP_INSERT: insert into a non-clustered index + +2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to + activate + +3) IBUF_OP_DELETE: find the record we need to delete + +when we have the corresponding record in an ibuf index. 
+ +NOTE that as we copy pointers to fields in ibuf_rec, the caller must +hold a latch to the ibuf_rec page as long as the entry is used! + +@return own: entry to insert to a non-clustered index */ +static +dtuple_t* +ibuf_build_entry_from_ibuf_rec_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* ibuf_rec, /*!< in: record in an insert buffer */ + mem_heap_t* heap, /*!< in: heap where built */ + dict_index_t** pindex) /*!< out, own: dummy index that + describes the entry */ +{ + dtuple_t* tuple; + dfield_t* field; + ulint n_fields; + const byte* types; + const byte* data; + ulint len; + ulint info_len; + ulint i; + ulint comp; + dict_index_t* index; + + ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + ut_a(*data == 0); + ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER); + + n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER; + + tuple = dtuple_create(heap, n_fields); + + types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len); + + ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL); + + index = ibuf_dummy_index_create(n_fields, comp); + + len -= info_len; + types += info_len; + + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old( + ibuf_rec, i + IBUF_REC_FIELD_USER, &len); + + dfield_set_data(field, data, len); + + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + ibuf_dummy_index_add_col(index, dfield_get_type(field), len); + } + + /* Prevent an ut_ad() failure in page_zip_write_rec() by + adding system columns to the dummy table pointed to by the + dummy secondary index. The insert buffer is only used for + secondary indexes, whose records never contain any system + columns, such as DB_TRX_ID. */ + ut_d(dict_table_add_system_columns(index->table, index->table->heap)); + + *pindex = index; + + return(tuple); +} + +/******************************************************************//** +Get the data size. 
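+The size is the sum of the stored lengths of the user fields; a field
+stored as SQL NULL contributes the SQL NULL size of its type,
+dtype_get_sql_null_size(), instead. Worked example (illustration only):
+fields stored with lengths 4 and 10 plus one NULL field whose type has a
+4-byte SQL NULL size add up to 4 + 10 + 4 = 18 bytes.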
+@return size of fields */ +UNIV_INLINE +ulint +ibuf_rec_get_size( +/*==============*/ + const rec_t* rec, /*!< in: ibuf record */ + const byte* types, /*!< in: fields */ + ulint n_fields, /*!< in: number of fields */ + ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT, + nonzero=ROW_FORMAT=COMPACT */ +{ + ulint i; + ulint field_offset; + ulint types_offset; + ulint size = 0; + + field_offset = IBUF_REC_FIELD_USER; + types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + + for (i = 0; i < n_fields; i++) { + ulint len; + dtype_t dtype; + + rec_get_nth_field_offs_old(rec, i + field_offset, &len); + + if (len != UNIV_SQL_NULL) { + size += len; + } else { + dtype_new_read_for_order_and_null_size(&dtype, types); + + size += dtype_get_sql_null_size(&dtype, comp); + } + + types += types_offset; + } + + return(size); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec) +#endif + +/********************************************************************//** +Returns the space taken by a stored non-clustered index entry if converted to +an index record. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_rec_get_volume_func( +/*=====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* ibuf_rec)/*!< in: ibuf record */ +{ + ulint len; + const byte* data; + const byte* types; + ulint n_fields; + ulint data_size; + ulint comp; + ibuf_op_t op; + ulint info_len; + + ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len); + ut_a(len == 1); + ut_a(*data == 0); + + types = rec_get_nth_field_old( + ibuf_rec, IBUF_REC_FIELD_METADATA, &len); + + ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL); + + if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) { + /* Delete-marking a record doesn't take any + additional space, and while deleting a record + actually frees up space, we have to play it safe and + pretend it takes no additional space (the record + might not exist, etc.). */ + + return(0); + } else if (comp) { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec, + heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } + + types += info_len; + n_fields = rec_get_n_fields_old(ibuf_rec) + - IBUF_REC_FIELD_USER; + + data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp); + + return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0) + + page_dir_calc_reserved_space(1)); +} + +/*********************************************************************//** +Builds the tuple to insert to an ibuf tree when we have an entry for a +non-clustered index. + +NOTE that the original entry must be kept because we copy pointers to +its fields. 
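+The tuple built below is laid out as follows (a summary matching the
+IBUF_REC_FIELD_* constants used in the code):
+
+	field 0 (IBUF_REC_FIELD_SPACE):    space id, 4 bytes, big-endian
+	field 1 (IBUF_REC_FIELD_MARKER):   one zero byte, marking the
+					   >= 4.1.x record format
+	field 2 (IBUF_REC_FIELD_PAGE):     page number, 4 bytes, big-endian
+	field 3 (IBUF_REC_FIELD_METADATA): optional counter/type/flags info
+					   followed by the column type info
+	field 4.. (IBUF_REC_FIELD_USER):   the entry columns themselves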
+ +@return own: entry to insert into an ibuf index tree */ +static +dtuple_t* +ibuf_entry_build( +/*=============*/ + ibuf_op_t op, /*!< in: operation type */ + dict_index_t* index, /*!< in: non-clustered index */ + const dtuple_t* entry, /*!< in: entry for a non-clustered index */ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: index page number where entry should + be inserted */ + ulint counter,/*!< in: counter value; + ULINT_UNDEFINED=not used */ + mem_heap_t* heap) /*!< in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + const dfield_t* entry_field; + ulint n_fields; + byte* buf; + byte* ti; + byte* type_info; + ulint i; + + ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT); + ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF); + ut_ad(op < IBUF_OP_COUNT); + + /* We have to build a tuple with the following fields: + + 1-4) These are described at the top of this file. + + 5) The rest of the fields are copied from the entry. + + All fields in the tuple are ordered like the type binary in our + insert buffer tree. */ + + n_fields = dtuple_get_n_fields(entry); + + tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER); + + /* 1) Space Id */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* 2) Marker byte */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 1)); + + /* We set the marker byte zero */ + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* 3) Page number */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + /* 4) Type info, part #1 */ + + if (counter == ULINT_UNDEFINED) { + i = dict_table_is_comp(index->table) ? 1 : 0; + } else { + ut_ad(counter <= 0xFFFF); + i = IBUF_REC_INFO_SIZE; + } + + ti = type_info = static_cast<byte*>( + mem_heap_alloc( + heap, + i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE)); + + switch (i) { + default: + ut_error; + break; + case 1: + /* set the flag for ROW_FORMAT=COMPACT */ + *ti++ = 0; + /* fall through */ + case 0: + /* the old format does not allow delete buffering */ + ut_ad(op == IBUF_OP_INSERT); + break; + case IBUF_REC_INFO_SIZE: + mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter); + + ti[IBUF_REC_OFFSET_TYPE] = (byte) op; + ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table) + ? IBUF_REC_COMPACT : 0; + ti += IBUF_REC_INFO_SIZE; + break; + } + + /* 5+) Fields from the entry */ + + for (i = 0; i < n_fields; i++) { + ulint fixed_len; + const dict_field_t* ifield; + + field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER); + entry_field = dtuple_get_nth_field(entry, i); + dfield_copy(field, entry_field); + + ifield = dict_index_get_nth_field(index, i); + /* Prefix index columns of fixed-length columns are of + fixed length. However, in the function call below, + dfield_get_type(entry_field) contains the fixed length + of the column in the clustered index. Replace it with + the fixed length of the secondary index column. 
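+For example (illustration only): for a CHAR(8) column in a single-byte
+character set, dfield_get_type(entry_field)->len is 8, but if the
+secondary index is defined on a 4-byte prefix of that column,
+ifield->fixed_len is 4, and 4 is what must be stored in the type info.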
*/ + fixed_len = ifield->fixed_len; + +#ifdef UNIV_DEBUG + if (fixed_len) { + /* dict_index_add_col() should guarantee these */ + ut_ad(fixed_len <= (ulint) + dfield_get_type(entry_field)->len); + if (ifield->prefix_len) { + ut_ad(ifield->prefix_len == fixed_len); + } else { + ut_ad(fixed_len == (ulint) + dfield_get_type(entry_field)->len); + } + } +#endif /* UNIV_DEBUG */ + + dtype_new_store_for_order_and_null_size( + ti, dfield_get_type(entry_field), fixed_len); + ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + } + + /* 4) Type info, part #2 */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA); + + dfield_set_data(field, type_info, ti - type_info); + + /* Set all the types in the new tuple binary */ + + dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER); + + return(tuple); +} + +/*********************************************************************//** +Builds a search tuple used to search buffered inserts for an index page. +This is for >= 4.1.x format records. +@return own: search tuple */ +static +dtuple_t* +ibuf_search_tuple_build( +/*====================*/ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: index page number */ + mem_heap_t* heap) /*!< in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + + tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA); + + /* Store the space id in tuple */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* Store the new format record marker byte */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 1)); + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA); + + return(tuple); +} + +/*********************************************************************//** +Checks if there are enough pages in the free list of the ibuf tree that we +dare to start a pessimistic insert to the insert buffer. +@return TRUE if enough free pages in list */ +UNIV_INLINE +ibool +ibuf_data_enough_free_for_insert(void) +/*==================================*/ +{ + ut_ad(mutex_own(&ibuf_mutex)); + + /* We want a big margin of free pages, because a B-tree can sometimes + grow in size also if records are deleted from it, as the node pointers + can change, and we must make sure that we are able to delete the + inserts buffered for pages that we read to the buffer pool, without + any risk of running out of free space in the insert buffer. */ + + return(ibuf->free_list_len >= (ibuf->size / 2) + 3 * ibuf->height); +} + +/*********************************************************************//** +Checks if there are enough pages in the free list of the ibuf tree that we +should remove them and free to the file space management. 
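+This threshold is three pages above the one in
+ibuf_data_enough_free_for_insert(), which gives the free list a small
+hysteresis band. Worked example (illustration only): with
+ibuf->size == 100 pages and ibuf->height == 3, pessimistic inserts
+require free_list_len >= 100 / 2 + 3 * 3 = 59 pages, while shrinking
+starts only at free_list_len >= 3 + 59 = 62 pages.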
+@return TRUE if enough free pages in list */ +UNIV_INLINE +ibool +ibuf_data_too_much_free(void) +/*=========================*/ +{ + ut_ad(mutex_own(&ibuf_mutex)); + + return(ibuf->free_list_len >= 3 + (ibuf->size / 2) + 3 * ibuf->height); +} + +/*********************************************************************//** +Allocates a new page from the ibuf file segment and adds it to the free +list. +@return TRUE on success, FALSE if no space left */ +static +ibool +ibuf_add_free_page(void) +/*====================*/ +{ + mtr_t mtr; + page_t* header_page; + ulint flags; + ulint zip_size; + buf_block_t* block; + page_t* page; + page_t* root; + page_t* bitmap_page; + + mtr_start(&mtr); + + /* Acquire the fsp latch before the ibuf header, obeying the latching + order */ + mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr); + zip_size = fsp_flags_get_zip_size(flags); + + header_page = ibuf_header_page_get(&mtr); + + /* Allocate a new page: NOTE that if the page has been a part of a + non-clustered index which has subsequently been dropped, then the + page may have buffered inserts in the insert buffer, and these + should be deleted from there. These get deleted when the page + allocation creates the page in buffer. Thus the call below may end + up calling the insert buffer routines and, as we yet have no latches + to insert buffer tree pages, these routines can run without a risk + of a deadlock. This is the reason why we created a special ibuf + header page apart from the ibuf tree. */ + + block = fseg_alloc_free_page( + header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP, + &mtr); + + if (block == NULL) { + mtr_commit(&mtr); + + return(FALSE); + } + + ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1); + ibuf_enter(&mtr); + mutex_enter(&ibuf_mutex); + root = ibuf_tree_root_get(&mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); + page = buf_block_get_frame(block); + + /* Add the page to the free list and update the ibuf size data */ + + flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST, + MLOG_2BYTES, &mtr); + + ibuf->seg_size++; + ibuf->free_list_len++; + + /* Set the bit indicating that this page is now an ibuf tree page + (level 2 page) */ + + bitmap_page = ibuf_bitmap_get_map_page( + IBUF_SPACE_ID, buf_block_get_page_no(block), zip_size, &mtr); + + mutex_exit(&ibuf_mutex); + + ibuf_bitmap_page_set_bits( + bitmap_page, buf_block_get_page_no(block), zip_size, + IBUF_BITMAP_IBUF, TRUE, &mtr); + + ibuf_mtr_commit(&mtr); + + return(TRUE); +} + +/*********************************************************************//** +Removes a page from the free list and frees it to the fsp system. 
*/ +static +void +ibuf_remove_free_page(void) +/*=======================*/ +{ + mtr_t mtr; + mtr_t mtr2; + page_t* header_page; + ulint flags; + ulint zip_size; + ulint page_no; + page_t* page; + page_t* root; + page_t* bitmap_page; + + mtr_start(&mtr); + + /* Acquire the fsp latch before the ibuf header, obeying the latching + order */ + mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr); + zip_size = fsp_flags_get_zip_size(flags); + + header_page = ibuf_header_page_get(&mtr); + + /* Prevent pessimistic inserts to insert buffer trees for a while */ + ibuf_enter(&mtr); + mutex_enter(&ibuf_pessimistic_insert_mutex); + mutex_enter(&ibuf_mutex); + + if (!ibuf_data_too_much_free()) { + + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + + ibuf_mtr_commit(&mtr); + + return; + } + + ibuf_mtr_start(&mtr2); + + root = ibuf_tree_root_get(&mtr2); + + mutex_exit(&ibuf_mutex); + + page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + &mtr2).page; + + /* NOTE that we must release the latch on the ibuf tree root + because in fseg_free_page we access level 1 pages, and the root + is a level 2 page. */ + + ibuf_mtr_commit(&mtr2); + ibuf_exit(&mtr); + + /* Since pessimistic inserts were prevented, we know that the + page is still in the free list. NOTE that also deletes may take + pages from the free list, but they take them from the start, and + the free list was so long that they cannot have taken the last + page from it. */ + + fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_SPACE_ID, page_no, &mtr); + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no); +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ + + ibuf_enter(&mtr); + + mutex_enter(&ibuf_mutex); + + root = ibuf_tree_root_get(&mtr); + + ut_ad(page_no == flst_get_last(root + PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST, &mtr).page); + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + page = buf_block_get_frame(block); + } + + /* Remove the page from the free list and update the ibuf size data */ + + flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + + mutex_exit(&ibuf_pessimistic_insert_mutex); + + ibuf->seg_size--; + ibuf->free_list_len--; + + /* Set the bit indicating that this page is no more an ibuf tree page + (level 2 page) */ + + bitmap_page = ibuf_bitmap_get_map_page( + IBUF_SPACE_ID, page_no, zip_size, &mtr); + + mutex_exit(&ibuf_mutex); + + ibuf_bitmap_page_set_bits( + bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr); + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no); +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ + ibuf_mtr_commit(&mtr); +} + +/***********************************************************************//** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call. 
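+At most four pages are freed per call (see the loop below), so that the
+file space operation which triggered the call is not delayed for long.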
*/
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void)
+/*========================*/
+{
+	ulint	i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(fil_space_get_latch(IBUF_SPACE_ID, NULL),
+			  RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(rw_lock_get_x_lock_count(
+		fil_space_get_latch(IBUF_SPACE_ID, NULL)) == 1);
+
+	/* NOTE: We require that the thread did not own the latch before,
+	because then we know that we can obey the correct latching order
+	for ibuf latches */
+
+	if (!ibuf) {
+		/* Not yet initialized; not sure if this is possible, but
+		does no harm to check for it. */
+
+		return;
+	}
+
+	/* Free at most a few pages at a time, so that we do not delay the
+	requested service too much */
+
+	for (i = 0; i < 4; i++) {
+
+		ibool	too_much_free;
+
+		mutex_enter(&ibuf_mutex);
+		too_much_free = ibuf_data_too_much_free();
+		mutex_exit(&ibuf_mutex);
+
+		if (!too_much_free) {
+			return;
+		}
+
+		ibuf_remove_free_page();
+	}
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,vers,pages,n_stored) \
+	ibuf_get_merge_page_nos_func(contract,rec,mtr,ids,vers,pages,n_stored)
+#else /* UNIV_DEBUG */
+# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,vers,pages,n_stored) \
+	ibuf_get_merge_page_nos_func(contract,rec,ids,vers,pages,n_stored)
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Reads page numbers from a leaf in an ibuf tree.
+@return a lower limit for the combined volume of records which will be
+merged */
+static
+ulint
+ibuf_get_merge_page_nos_func(
+/*=========================*/
+	ibool		contract,/*!< in: TRUE if this function is called to
+			contract the tree, FALSE if this is called
+			when a single page becomes full and we look
+			if it pays to read also nearby pages */
+	const rec_t*	rec,	/*!< in: insert buffer record */
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction holding rec */
+#endif /* UNIV_DEBUG */
+	ulint*		space_ids,/*!< in/out: space id's of the pages */
+	ib_int64_t*	space_versions,/*!< in/out: tablespace version
+			timestamps; used to prevent reading in old
+			pages after DISCARD + IMPORT tablespace */
+	ulint*		page_nos,/*!< in/out: buffer for at least
+			IBUF_MAX_N_PAGES_MERGED many page numbers;
+			the page numbers are in an ascending order */
+	ulint*		n_stored)/*!< out: number of page numbers stored to
+			page_nos in this function */
+{
+	ulint	prev_page_no;
+	ulint	prev_space_id;
+	ulint	first_page_no;
+	ulint	first_space_id;
+	ulint	rec_page_no;
+	ulint	rec_space_id;
+	ulint	sum_volumes;
+	ulint	volume_for_page;
+	ulint	rec_volume;
+	ulint	limit;
+	ulint	n_pages;
+
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+
+	*n_stored = 0;
+
+	limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool_get_curr_size() / 4);
+
+	if (page_rec_is_supremum(rec)) {
+
+		rec = page_rec_get_prev_const(rec);
+	}
+
+	if (page_rec_is_infimum(rec)) {
+
+		rec = page_rec_get_next_const(rec);
+	}
+
+	if (page_rec_is_supremum(rec)) {
+
+		return(0);
+	}
+
+	first_page_no = ibuf_rec_get_page_no(mtr, rec);
+	first_space_id = ibuf_rec_get_space(mtr, rec);
+	n_pages = 0;
+	prev_page_no = 0;
+	prev_space_id = 0;
+
+	/* Go backwards from the first rec until we reach the border of the
+	'merge area', or the page start or the limit of storable pages is
+	reached */
+
+	while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
+
+		rec_page_no = ibuf_rec_get_page_no(mtr, rec);
+		rec_space_id =
ibuf_rec_get_space(mtr, rec); + + if (rec_space_id != first_space_id + || (rec_page_no / IBUF_MERGE_AREA) + != (first_page_no / IBUF_MERGE_AREA)) { + + break; + } + + if (rec_page_no != prev_page_no + || rec_space_id != prev_space_id) { + n_pages++; + } + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + rec = page_rec_get_prev_const(rec); + } + + rec = page_rec_get_next_const(rec); + + /* At the loop start there is no prev page; we mark this with a pair + of space id, page no (0, 0) for which there can never be entries in + the insert buffer */ + + prev_page_no = 0; + prev_space_id = 0; + sum_volumes = 0; + volume_for_page = 0; + + while (*n_stored < limit) { + if (page_rec_is_supremum(rec)) { + /* When no more records available, mark this with + another 'impossible' pair of space id, page no */ + rec_page_no = 1; + rec_space_id = 0; + } else { + rec_page_no = ibuf_rec_get_page_no(mtr, rec); + rec_space_id = ibuf_rec_get_space(mtr, rec); + /* In the system tablespace, the smallest + possible secondary index leaf page number is + bigger than IBUF_TREE_ROOT_PAGE_NO (4). In + other tablespaces, the clustered index tree is + created at page 3, which makes page 4 the + smallest possible secondary index leaf page + (and that only after DROP INDEX). */ + ut_ad(rec_page_no + > (ulint) IBUF_TREE_ROOT_PAGE_NO + - (rec_space_id != 0)); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED); +#endif + if ((rec_space_id != prev_space_id + || rec_page_no != prev_page_no) + && (prev_space_id != 0 || prev_page_no != 0)) { + + if (contract + || (prev_page_no == first_page_no + && prev_space_id == first_space_id) + || (volume_for_page + > ((IBUF_MERGE_THRESHOLD - 1) + * 4 * UNIV_PAGE_SIZE + / IBUF_PAGE_SIZE_PER_FREE_SPACE) + / IBUF_MERGE_THRESHOLD)) { + + space_ids[*n_stored] = prev_space_id; + space_versions[*n_stored] + = fil_space_get_version(prev_space_id); + page_nos[*n_stored] = prev_page_no; + + (*n_stored)++; + + sum_volumes += volume_for_page; + } + + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { + + break; + } + + volume_for_page = 0; + } + + if (rec_page_no == 1 && rec_space_id == 0) { + /* Supremum record */ + + break; + } + + rec_volume = ibuf_rec_get_volume(mtr, rec); + + volume_for_page += rec_volume; + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + rec = page_rec_get_next_const(rec); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif +#if 0 + fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n", + *n_stored, sum_volumes); +#endif + return(sum_volumes); +} + +/*******************************************************************//** +Get the matching records for space id. +@return current rec or NULL */ +static __attribute__((nonnull, warn_unused_result)) +const rec_t* +ibuf_get_user_rec( +/*===============*/ + btr_pcur_t* pcur, /*!< in: the current cursor */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + do { + const rec_t* rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_user_rec(rec)) { + return(rec); + } + } while (btr_pcur_move_to_next(pcur, mtr)); + + return(NULL); +} + +/*********************************************************************//** +Reads page numbers for a space id from an ibuf tree. 
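+Because the ibuf tree is ordered by (space id, page number), all
+buffered changes for one page are adjacent, and the
+pages[*n_pages - 1] != page_no test in the loop below suffices to store
+each page number only once.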
+@return a lower limit for the combined volume of records which will be +merged */ +static __attribute__((nonnull, warn_unused_result)) +ulint +ibuf_get_merge_pages( +/*=================*/ + btr_pcur_t* pcur, /*!< in/out: cursor */ + ulint space, /*!< in: space for which to merge */ + ulint limit, /*!< in: max page numbers to read */ + ulint* pages, /*!< out: pages read */ + ulint* spaces, /*!< out: spaces read */ + ib_int64_t* versions,/*!< out: space versions read */ + ulint* n_pages,/*!< out: number of pages read */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + const rec_t* rec; + ulint volume = 0; + ib_int64_t version = fil_space_get_version(space); + + ut_a(space != ULINT_UNDEFINED); + + *n_pages = 0; + + while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0 + && ibuf_rec_get_space(mtr, rec) == space + && *n_pages < limit) { + + ulint page_no = ibuf_rec_get_page_no(mtr, rec); + + if (*n_pages == 0 || pages[*n_pages - 1] != page_no) { + spaces[*n_pages] = space; + pages[*n_pages] = page_no; + versions[*n_pages] = version; + ++*n_pages; + } + + volume += ibuf_rec_get_volume(mtr, rec); + + btr_pcur_move_to_next(pcur, mtr); + } + + return(volume); +} + +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +static +ulint +ibuf_merge_pages( +/*=============*/ + ulint* n_pages, /*!< out: number of pages to which merged */ + bool sync) /*!< in: true if the caller wants to wait for + the issued read with the highest tablespace + address to complete */ +{ + mtr_t mtr; + btr_pcur_t pcur; + ulint sum_sizes; + ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; + + *n_pages = 0; + + ibuf_mtr_start(&mtr); + + /* Open a cursor to a randomly chosen leaf of the tree, at a random + position within the leaf */ + + btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr); + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); + + if (page_is_empty(btr_pcur_get_page(&pcur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + ut_ad(ibuf->empty); + ut_ad(page_get_space_id(btr_pcur_get_page(&pcur)) + == IBUF_SPACE_ID); + ut_ad(page_get_page_no(btr_pcur_get_page(&pcur)) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + return(0); + } + + sum_sizes = ibuf_get_merge_page_nos(TRUE, + btr_pcur_get_rec(&pcur), &mtr, + space_ids, space_versions, + page_nos, n_pages); +#if 0 /* defined UNIV_IBUF_DEBUG */ + fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n", + sync, *n_pages, sum_sizes); +#endif + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + buf_read_ibuf_merge_pages( + sync, space_ids, space_versions, page_nos, *n_pages); + + return(sum_sizes + 1); +} + +/*********************************************************************//** +Get the table instance from the table id. 
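+The dict_operation_lock s-latch protects only the
+dict_table_open_on_id() call itself; the returned table carries its own
+reference, which the caller must release with dict_table_close(), as
+ibuf_merge() below does. Usage sketch (illustration only):
+
+	dict_table_t*	table = ibuf_get_table(table_id);
+
+	if (table != NULL) {
+		/* ... use table->space ... */
+		dict_table_close(table, FALSE, FALSE);
+	}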
+@return table instance */ +static __attribute__((warn_unused_result)) +dict_table_t* +ibuf_get_table( +/*===========*/ + table_id_t table_id) /*!< in: valid table id */ +{ + rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + + dict_table_t* table = dict_table_open_on_id( + table_id, FALSE, DICT_TABLE_OP_NORMAL); + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + return(table); +} + +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +static +ulint +ibuf_merge_space( +/*=============*/ + ulint space, /*!< in: tablespace id to merge */ + ulint* n_pages)/*!< out: number of pages to which merged */ +{ + mtr_t mtr; + btr_pcur_t pcur; + mem_heap_t* heap = mem_heap_create(512); + dtuple_t* tuple = ibuf_search_tuple_build(space, 0, heap); + + ibuf_mtr_start(&mtr); + + /* Position the cursor on the first matching record. */ + + btr_pcur_open( + ibuf->index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, + &mtr); + + mem_heap_free(heap); + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); + + ulint sum_sizes = 0; + ulint pages[IBUF_MAX_N_PAGES_MERGED]; + ulint spaces[IBUF_MAX_N_PAGES_MERGED]; + ib_int64_t versions[IBUF_MAX_N_PAGES_MERGED]; + + if (page_is_empty(btr_pcur_get_page(&pcur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + ut_ad(ibuf->empty); + ut_ad(page_get_space_id(btr_pcur_get_page(&pcur)) + == IBUF_SPACE_ID); + ut_ad(page_get_page_no(btr_pcur_get_page(&pcur)) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + } else { + + sum_sizes = ibuf_get_merge_pages( + &pcur, space, IBUF_MAX_N_PAGES_MERGED, + &pages[0], &spaces[0], &versions[0], n_pages, + &mtr); + + ++sum_sizes; + } + + ibuf_mtr_commit(&mtr); + + btr_pcur_close(&pcur); + + if (sum_sizes > 0) { + + ut_a(*n_pages > 0 || sum_sizes == 1); + +#ifdef UNIV_DEBUG + ut_ad(*n_pages <= UT_ARR_SIZE(pages)); + + for (ulint i = 0; i < *n_pages; ++i) { + ut_ad(spaces[i] == space); + ut_ad(i == 0 || versions[i] == versions[i - 1]); + } +#endif /* UNIV_DEBUG */ + + buf_read_ibuf_merge_pages( + true, spaces, versions, pages, *n_pages); + } + + return(sum_sizes); +} + +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +static __attribute__((nonnull, warn_unused_result)) +ulint +ibuf_merge( +/*=======*/ + table_id_t table_id, /*!< in: if merge should be + done only for a specific + table, for all tables this + should be 0 */ + ulint* n_pages, /*!< out: number of pages to + which merged */ + bool sync) /*!< in: TRUE if the caller + wants to wait for the issued + read with the highest + tablespace address to complete */ +{ + dict_table_t* table; + + *n_pages = 0; + + /* We perform a dirty read of ibuf->empty, without latching + the insert buffer root page. We trust this dirty read except + when a slow shutdown is being executed. During a slow + shutdown, the insert buffer merge must be completed. 
*/
+
+	if (ibuf->empty && !srv_shutdown_state) {
+		return(0);
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+	} else if (ibuf_debug) {
+		return(0);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+	} else if (table_id == 0) {
+		return(ibuf_merge_pages(n_pages, sync));
+	} else if ((table = ibuf_get_table(table_id)) == 0) {
+		/* Table has been dropped. */
+		return(0);
+	}
+
+	ulint	volume = ibuf_merge_space(table->space, n_pages);
+
+	dict_table_close(table, FALSE, FALSE);
+
+	return(volume);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
+ulint
+ibuf_contract(
+/*==========*/
+	ibool	sync)	/*!< in: TRUE if the caller wants to wait for the
+			issued read with the highest tablespace address
+			to complete */
+{
+	ulint	n_pages;
+
+	return(ibuf_merge(0, &n_pages, sync));
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract_in_background(
+/*========================*/
+	table_id_t	table_id,	/*!< in: if merge should be done only
+					for a specific table, for all tables
+					this should be 0 */
+	ibool		full)		/*!< in: TRUE if the caller wants to
+					do a full contract based on PCT_IO(100).
+					If FALSE then the size of contract
+					batch is determined based on the
+					current size of the ibuf tree. */
+{
+	ulint	sum_bytes = 0;
+	ulint	sum_pages = 0;
+	ulint	n_pag2;
+	ulint	n_pages;
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+	if (srv_ibuf_disable_background_merge && table_id == 0) {
+		return(0);
+	}
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+	if (full) {
+		/* Caller has requested a full batch */
+		n_pages = PCT_IO(100);
+	} else {
+		/* By default we do a batch of 5% of the io_capacity */
+		n_pages = PCT_IO(5);
+
+		mutex_enter(&ibuf_mutex);
+
+		/* If ibuf->size is more than half the max_size
+		then we make the contraction more aggressive.
+		+1 is to avoid division by zero. */
+		if (ibuf->size > ibuf->max_size / 2) {
+			ulint diff = ibuf->size - ibuf->max_size / 2;
+			n_pages += PCT_IO((diff * 100)
+					   / (ibuf->max_size + 1));
+		}
+
+		mutex_exit(&ibuf_mutex);
+	}
+
+	while (sum_pages < n_pages) {
+		ulint	n_bytes;
+
+		n_bytes = ibuf_merge(table_id, &n_pag2, FALSE);
+
+		if (n_bytes == 0) {
+			return(sum_bytes);
+		}
+
+		sum_bytes += n_bytes;
+		sum_pages += n_pag2;
+
+		srv_inc_activity_count();
+	}
+
+	return(sum_bytes);
+}
+
+/*********************************************************************//**
+Contract insert buffer trees after insert if they are too big. */
+UNIV_INLINE
+void
+ibuf_contract_after_insert(
+/*=======================*/
+	ulint	entry_size)	/*!< in: size of a record which was inserted
+				into an ibuf tree */
+{
+	ibool	sync;
+	ulint	sum_sizes;
+	ulint	size;
+	ulint	max_size;
+
+	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
+	reduce ibuf_mutex contention. ibuf->max_size remains constant
+	after ibuf_init_at_db_start(), but ibuf->size should be
+	protected by ibuf_mutex. Given that ibuf->size fits in a
+	machine word, this should be OK; at worst we are doing some
+	excessive ibuf_contract() or occasionally skipping an
+	ibuf_contract().
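+The thresholds below then work as follows (worked example, illustration
+only): with max_size = N pages, nothing is done while
+size < N + IBUF_CONTRACT_ON_INSERT_NON_SYNC; past that an asynchronous
+contraction is triggered, and once
+size >= N + IBUF_CONTRACT_ON_INSERT_SYNC the contraction is performed
+synchronously, i.e. this thread itself waits for the read requests to
+complete.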
*/ + size = ibuf->size; + max_size = ibuf->max_size; + + if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) { + return; + } + + sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC); + + /* Contract at least entry_size many bytes */ + sum_sizes = 0; + size = 1; + + do { + + size = ibuf_contract(sync); + sum_sizes += size; + } while (size > 0 && sum_sizes < entry_size); +} + +/*********************************************************************//** +Determine if an insert buffer record has been encountered already. +@return TRUE if a new record, FALSE if possible duplicate */ +static +ibool +ibuf_get_volume_buffered_hash( +/*==========================*/ + const rec_t* rec, /*!< in: ibuf record in post-4.1 format */ + const byte* types, /*!< in: fields */ + const byte* data, /*!< in: start of user record data */ + ulint comp, /*!< in: 0=ROW_FORMAT=REDUNDANT, + nonzero=ROW_FORMAT=COMPACT */ + ulint* hash, /*!< in/out: hash array */ + ulint size) /*!< in: number of elements in hash array */ +{ + ulint len; + ulint fold; + ulint bitmask; + + len = ibuf_rec_get_size( + rec, types, + rec_get_n_fields_old(rec) - IBUF_REC_FIELD_USER, comp); + fold = ut_fold_binary(data, len); + + hash += (fold / (CHAR_BIT * sizeof *hash)) % size; + bitmask = static_cast<ulint>( + 1 << (fold % (CHAR_BIT * sizeof(*hash)))); + + if (*hash & bitmask) { + + return(FALSE); + } + + /* We have not seen this record yet. Insert it. */ + *hash |= bitmask; + + return(TRUE); +} + +#ifdef UNIV_DEBUG +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs) +#else /* UNIV_DEBUG */ +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs) +#endif +/*********************************************************************//** +Update the estimate of the number of records on a page, and +get the space taken by merging the buffered record to the index page. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_get_volume_buffered_count_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint* hash, /*!< in/out: hash array */ + ulint size, /*!< in: number of elements in hash array */ + lint* n_recs) /*!< in/out: estimated number of records + on the page that rec points to */ +{ + ulint len; + ibuf_op_t ibuf_op; + const byte* types; + ulint n_fields; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + n_fields = rec_get_n_fields_old(rec); + ut_ad(n_fields > IBUF_REC_FIELD_USER); + n_fields -= IBUF_REC_FIELD_USER; + + rec_get_nth_field_offs_old(rec, 1, &len); + /* This function is only invoked when buffering new + operations. All pre-4.1 records should have been merged + when the database was started up. */ + ut_a(len == 1); + + if (rec_get_deleted_flag(rec, 0)) { + /* This record has been merged already, + but apparently the system crashed before + the change was discarded from the buffer. + Pretend that the record does not exist. 
*/ + return(0); + } + + types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (UNIV_EXPECT(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE, + IBUF_REC_INFO_SIZE)) { + default: + ut_error; + case 0: + /* This ROW_TYPE=REDUNDANT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + + len = ibuf_rec_get_size(rec, types, n_fields, 0); + + return(len + + rec_get_converted_extra_size(len, n_fields, 0) + + page_dir_calc_reserved_space(1)); + case 1: + /* This ROW_TYPE=COMPACT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + goto get_volume_comp; + + case IBUF_REC_INFO_SIZE: + ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE]; + break; + } + + switch (ibuf_op) { + case IBUF_OP_INSERT: + /* Inserts can be done by updating a delete-marked record. + Because delete-mark and insert operations can be pointing to + the same records, we must not count duplicates. */ + case IBUF_OP_DELETE_MARK: + /* There must be a record to delete-mark. + See if this record has been already buffered. */ + if (n_recs && ibuf_get_volume_buffered_hash( + rec, types + IBUF_REC_INFO_SIZE, + types + len, + types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT, + hash, size)) { + (*n_recs)++; + } + + if (ibuf_op == IBUF_OP_DELETE_MARK) { + /* Setting the delete-mark flag does not + affect the available space on the page. */ + return(0); + } + break; + case IBUF_OP_DELETE: + /* A record will be removed from the page. */ + if (n_recs) { + (*n_recs)--; + } + /* While deleting a record actually frees up space, + we have to play it safe and pretend that it takes no + additional space (the record might not exist, etc.). */ + return(0); + default: + ut_error; + } + + ut_ad(ibuf_op == IBUF_OP_INSERT); + +get_volume_comp: + { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec( + mtr, rec, heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } +} + +/*********************************************************************//** +Gets an upper limit for the combined size of entries buffered in the insert +buffer for a given page. 
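+Only the leaf page that pcur is positioned on and its two neighbours are
+scanned; if matching entries continue beyond them, the function gives up
+and returns UNIV_PAGE_SIZE. The hash_bitmap array used to de-duplicate
+buffered records is 128 bytes, i.e. 1024 bits, indexed by
+ut_fold_binary() of the record data: a single-hash Bloom-style filter,
+so a duplicate is never counted twice, though distinct records may
+occasionally collide and leave n_recs undercounted.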
+@return upper limit for the volume of buffered inserts for the index +page, in bytes; UNIV_PAGE_SIZE, if the entries for the index page span +several pages in the insert buffer */ +static +ulint +ibuf_get_volume_buffered( +/*=====================*/ + const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an + insert buffer tree where we would insert an + entry for the index page whose number is + page_no, latch mode has to be BTR_MODIFY_PREV + or BTR_MODIFY_TREE */ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: page number of an index page */ + lint* n_recs, /*!< in/out: minimum number of records on the + page after the buffered changes have been + applied, or NULL to disable the counting */ + mtr_t* mtr) /*!< in: mini-transaction of pcur */ +{ + ulint volume; + const rec_t* rec; + const page_t* page; + ulint prev_page_no; + const page_t* prev_page; + ulint next_page_no; + const page_t* next_page; + /* bitmap of buffered recs */ + ulint hash_bitmap[128 / sizeof(ulint)]; + + ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) + || (pcur->latch_mode == BTR_MODIFY_TREE)); + + /* Count the volume of inserts earlier in the alphabetical order than + pcur */ + + volume = 0; + + if (n_recs) { + memset(hash_bitmap, 0, sizeof hash_bitmap); + } + + rec = btr_pcur_get_rec(pcur); + page = page_align(rec); + ut_ad(page_validate(page, ibuf->index)); + + if (page_rec_is_supremum(rec)) { + rec = page_rec_get_prev_const(rec); + } + + for (; !page_rec_is_infimum(rec); + rec = page_rec_get_prev_const(rec)) { + ut_ad(page_align(rec) == page); + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + + /* Look at the previous page */ + + prev_page_no = btr_page_get_prev(page, mtr); + + if (prev_page_no == FIL_NULL) { + + goto count_later; + } + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH, + mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + + prev_page = buf_block_get_frame(block); + ut_ad(page_validate(prev_page, ibuf->index)); + } + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(prev_page, mtr) == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + rec = page_get_supremum_rec(prev_page); + rec = page_rec_get_prev_const(rec); + + for (;; rec = page_rec_get_prev_const(rec)) { + ut_ad(page_align(rec) == prev_page); + + if (page_rec_is_infimum(rec)) { + + /* We cannot go to yet a previous page, because we + do not have the x-latch on it, and cannot acquire one + because of the latching order: we have to give up */ + + return(UNIV_PAGE_SIZE); + } + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + +count_later: + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_supremum(rec)) { + rec = page_rec_get_next_const(rec); + } + + for (; !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + + /* Look at the next page */ + + next_page_no = btr_page_get_next(page, mtr); + + if (next_page_no == FIL_NULL) { + + return(volume); + } + + { + 
buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH, + mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + + next_page = buf_block_get_frame(block); + ut_ad(page_validate(next_page, ibuf->index)); + } + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(next_page, mtr) == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + rec = page_get_infimum_rec(next_page); + rec = page_rec_get_next_const(rec); + + for (;; rec = page_rec_get_next_const(rec)) { + ut_ad(page_align(rec) == next_page); + + if (page_rec_is_supremum(rec)) { + + /* We give up */ + + return(UNIV_PAGE_SIZE); + } + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } +} + +/*********************************************************************//** +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ +UNIV_INTERN +void +ibuf_update_max_tablespace_id(void) +/*===============================*/ +{ + ulint max_space_id; + const rec_t* rec; + const byte* field; + ulint len; + btr_pcur_t pcur; + mtr_t mtr; + + ut_a(!dict_table_is_comp(ibuf->index->table)); + + ibuf_mtr_start(&mtr); + + btr_pcur_open_at_index_side( + false, ibuf->index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); + + btr_pcur_move_to_prev(&pcur, &mtr); + + if (btr_pcur_is_before_first_on_page(&pcur)) { + /* The tree is empty */ + + max_space_id = 0; + } else { + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + max_space_id = mach_read_from_4(field); + } + + ibuf_mtr_commit(&mtr); + + /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ + + fil_set_max_space_id_if_bigger(max_space_id); +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(mtr,rec,space,page_no) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(rec,space,page_no) +#endif +/****************************************************************//** +Helper function for ibuf_get_entry_counter_func. Checks if rec is for +(space, page_no), and if so, reads counter value from it and returns +that + 1. +@retval ULINT_UNDEFINED if the record does not contain any counter +@retval 0 if the record is not for (space, page_no) +@retval 1 + previous counter value, otherwise */ +static +ulint +ibuf_get_entry_counter_low_func( +/*============================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction of rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint space, /*!< in: space id */ + ulint page_no) /*!< in: page number */ +{ + ulint counter; + const byte* field; + ulint len; + + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + /* Check the tablespace identifier. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + if (mach_read_from_4(field) != space) { + + return(0); + } + + /* Check the page offset. 
*/ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len); + ut_a(len == 4); + + if (mach_read_from_4(field) != page_no) { + + return(0); + } + + /* Check if the record contains a counter field. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + default: + ut_error; + case 0: /* ROW_FORMAT=REDUNDANT */ + case 1: /* ROW_FORMAT=COMPACT */ + return(ULINT_UNDEFINED); + + case IBUF_REC_INFO_SIZE: + counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER); + ut_a(counter < 0xFFFF); + return(counter + 1); + } +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf) +#endif + +/****************************************************************//** +Calculate the counter field for an entry based on the current +last record in ibuf for (space, page_no). +@return the counter field, or ULINT_UNDEFINED +if we should abort this insertion to ibuf */ +static +ulint +ibuf_get_entry_counter_func( +/*========================*/ + ulint space, /*!< in: space id of entry */ + ulint page_no, /*!< in: page number of entry */ + const rec_t* rec, /*!< in: the record preceding the + insertion point */ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction */ +#endif /* UNIV_DEBUG */ + ibool only_leaf) /*!< in: TRUE if this is the only + leaf page that can contain entries + for (space,page_no), that is, there + was no exact match for (space,page_no) + in the node pointer */ +{ + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_validate(page_align(rec), ibuf->index)); + + if (page_rec_is_supremum(rec)) { + /* This is just for safety. The record should be a + page infimum or a user record. */ + ut_ad(0); + return(ULINT_UNDEFINED); + } else if (!page_rec_is_infimum(rec)) { + return(ibuf_get_entry_counter_low(mtr, rec, space, page_no)); + } else if (only_leaf + || fil_page_get_prev(page_align(rec)) == FIL_NULL) { + /* The parent node pointer did not contain the + searched for (space, page_no), which means that the + search ended on the correct page regardless of the + counter value, and since we're at the infimum record, + there are no existing records. */ + return(0); + } else { + /* We used to read the previous page here. It would + break the latching order, because the caller has + buffer-fixed an insert buffer bitmap page. */ + return(ULINT_UNDEFINED); + } +} + +/*********************************************************************//** +Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. 
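(A hedged sketch of the counter scheme implemented by the helpers above, for orientation only; all identifiers are the ones used in this file. Every buffered record for (space, page_no) carries a 16-bit counter in its metadata field. The new entry is built with the sentinel counter 0xFFFF so that a PAGE_CUR_LE search positions the cursor on the last existing record for the page; that record's counter N is read and N + 1 is patched into the entry just before the actual insert:

	counter = ibuf_get_entry_counter(
		space, page_no, btr_pcur_get_rec(&pcur), &mtr,
		btr_pcur_get_btr_cur(&pcur)->low_match
		< IBUF_REC_FIELD_METADATA);

	if (counter == ULINT_UNDEFINED) {
		goto fail_exit;
	}

	mach_write_to_2((byte*) dfield_get_data(field)
			+ IBUF_REC_OFFSET_COUNTER, counter);

ULINT_UNDEFINED means an old-style record without a counter was found, and the buffering attempt must be aborted.)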
+@return DB_SUCCESS, DB_STRONG_FAIL or other error */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +ibuf_insert_low( +/*============*/ + ulint mode, /*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */ + ibuf_op_t op, /*!< in: operation type */ + ibool no_counter, + /*!< in: TRUE=use 5.0.3 format; + FALSE=allow delete buffering */ + const dtuple_t* entry, /*!< in: index entry to insert */ + ulint entry_size, + /*!< in: rec_get_converted_size(index, entry) */ + dict_index_t* index, /*!< in: index where to insert; must not be + unique or clustered */ + ulint space, /*!< in: space id where to insert */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint page_no,/*!< in: page number where to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + big_rec_t* dummy_big_rec; + btr_pcur_t pcur; + btr_cur_t* cursor; + dtuple_t* ibuf_entry; + mem_heap_t* offsets_heap = NULL; + mem_heap_t* heap; + ulint* offsets = NULL; + ulint buffered; + lint min_n_recs; + rec_t* ins_rec; + ibool old_bit_value; + page_t* bitmap_page; + buf_block_t* block; + page_t* root; + dberr_t err; + ibool do_merge; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; + ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; + ulint n_stored; + mtr_t mtr; + mtr_t bitmap_mtr; + + ut_a(!dict_index_is_clust(index)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(!no_counter || op == IBUF_OP_INSERT); + ut_a(op < IBUF_OP_COUNT); + + ut_ad(!(thr_get_trx(thr)->fake_changes)); + + do_merge = FALSE; + + /* Perform dirty reads of ibuf->size and ibuf->max_size, to + reduce ibuf_mutex contention. Given that ibuf->max_size and + ibuf->size fit in a machine word, this should be OK; at worst + we are doing some excessive ibuf_contract() or occasionally + skipping an ibuf_contract(). */ + if (ibuf->max_size == 0) { + return(DB_STRONG_FAIL); + } + + if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) { + /* Insert buffer is now too big, contract it but do not try + to insert */ + + +#ifdef UNIV_IBUF_DEBUG + fputs("Ibuf too big\n", stderr); +#endif + /* Use synchronous contract (== TRUE) */ + ibuf_contract(TRUE); + + return(DB_STRONG_FAIL); + } + + heap = mem_heap_create(1024); + + /* Build the entry which contains the space id and the page number + as the first fields and the type information for other fields, and + which will be inserted to the insert buffer. Using a counter value + of 0xFFFF we find the last record for (space, page_no), from which + we can then read the counter value N and use N + 1 in the record we + insert. (We patch the ibuf_entry's counter field to the correct + value just before actually inserting the entry.) */ + + ibuf_entry = ibuf_entry_build( + op, index, entry, space, page_no, + no_counter ? ULINT_UNDEFINED : 0xFFFF, heap); + + /* Open a cursor to the insert buffer tree to calculate if we can add + the new entry to it without exceeding the free space limit for the + page. 
*/ + + if (mode == BTR_MODIFY_TREE) { + for (;;) { + mutex_enter(&ibuf_pessimistic_insert_mutex); + mutex_enter(&ibuf_mutex); + + if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) { + + break; + } + + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + + if (UNIV_UNLIKELY(!ibuf_add_free_page())) { + + mem_heap_free(heap); + return(DB_STRONG_FAIL); + } + } + } + + ibuf_mtr_start(&mtr); + + btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr); + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index)); + + /* Find out the volume of already buffered inserts for the same index + page */ + min_n_recs = 0; + buffered = ibuf_get_volume_buffered(&pcur, space, page_no, + op == IBUF_OP_DELETE + ? &min_n_recs + : NULL, &mtr); + + if (op == IBUF_OP_DELETE + && (min_n_recs < 2 + || buf_pool_watch_occurred(space, page_no))) { + /* The page could become empty after the record is + deleted, or the page has been read in to the buffer + pool. Refuse to buffer the operation. */ + + /* The buffer pool watch is needed for IBUF_OP_DELETE + because of latching order considerations. We can + check buf_pool_watch_occurred() only after latching + the insert buffer B-tree pages that contain buffered + changes for the page. We never buffer IBUF_OP_DELETE, + unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have + been previously buffered for the page. Because there + are buffered operations for the page, the insert + buffer B-tree page latches held by mtr will guarantee + that no changes for the user page will be merged + before mtr_commit(&mtr). We must not mtr_commit(&mtr) + until after the IBUF_OP_DELETE has been buffered. */ + +fail_exit: + if (mode == BTR_MODIFY_TREE) { + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + } + + err = DB_STRONG_FAIL; + goto func_exit; + } + + /* After this point, the page could still be loaded to the + buffer pool, but we do not have to care about it, since we are + holding a latch on the insert buffer leaf page that contains + buffered changes for (space, page_no). If the page enters the + buffer pool, buf_page_io_complete() for (space, page_no) will + have to acquire a latch on the same insert buffer leaf page, + which it cannot do until we have buffered the IBUF_OP_DELETE + and done mtr_commit(&mtr) to release the latch. */ + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a((buffered == 0) || ibuf_count_get(space, page_no)); +#endif + ibuf_mtr_start(&bitmap_mtr); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, + zip_size, &bitmap_mtr); + + /* We check if the index page is suitable for buffered entries */ + + if (buf_page_peek(space, page_no) + || lock_rec_expl_exist_on_page(space, page_no)) { + + ibuf_mtr_commit(&bitmap_mtr); + goto fail_exit; + } + + if (op == IBUF_OP_INSERT) { + ulint bits = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, IBUF_BITMAP_FREE, + &bitmap_mtr); + + if (buffered + entry_size + page_dir_calc_reserved_space(1) + > ibuf_index_page_calc_free_from_bits(zip_size, bits)) { + /* Release the bitmap page latch early. */ + ibuf_mtr_commit(&bitmap_mtr); + + /* It may not fit */ + do_merge = TRUE; + + ibuf_get_merge_page_nos(FALSE, + btr_pcur_get_rec(&pcur), &mtr, + space_ids, space_versions, + page_nos, &n_stored); + + goto fail_exit; + } + } + + if (!no_counter) { + /* Patch correct counter value to the entry to + insert. This can change the insert position, which can + result in the need to abort in some cases. 
*/ + ulint counter = ibuf_get_entry_counter( + space, page_no, btr_pcur_get_rec(&pcur), &mtr, + btr_pcur_get_btr_cur(&pcur)->low_match + < IBUF_REC_FIELD_METADATA); + dfield_t* field; + + if (counter == ULINT_UNDEFINED) { + ibuf_mtr_commit(&bitmap_mtr); + goto fail_exit; + } + + field = dtuple_get_nth_field( + ibuf_entry, IBUF_REC_FIELD_METADATA); + mach_write_to_2( + (byte*) dfield_get_data(field) + + IBUF_REC_OFFSET_COUNTER, counter); + } + + /* Set the bitmap bit denoting that the insert buffer contains + buffered entries for this index page, if the bit is not set yet */ + + old_bit_value = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, &bitmap_mtr); + + if (!old_bit_value) { + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, TRUE, + &bitmap_mtr); + } + + ibuf_mtr_commit(&bitmap_mtr); + + cursor = btr_pcur_get_btr_cur(&pcur); + + if (mode == BTR_MODIFY_PREV) { + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + block = btr_cur_get_block(cursor); + ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID); + + /* If this is the root page, update ibuf->empty. */ + if (UNIV_UNLIKELY(buf_block_get_page_no(block) + == FSP_IBUF_TREE_ROOT_PAGE_NO)) { + const page_t* root = buf_block_get_frame(block); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + ibuf->empty = page_is_empty(root); + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* We acquire an x-latch to the root page before the insert, + because a pessimistic insert releases the tree x-latch, + which would cause the x-latching of the root after that to + break the latching order. 
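(For orientation, the insert below follows InnoDB's usual two-step pattern, condensed here; the functions and flags are exactly the ones invoked in the code that follows. An optimistic insert modifies only the leaf page; only if it reports DB_FAIL, meaning the leaf is full, is the pessimistic variant tried, which may split pages and consume reserved free pages:

	err = btr_cur_optimistic_insert(
		BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
		cursor, &offsets, &offsets_heap,
		ibuf_entry, &ins_rec, &dummy_big_rec, 0, thr, &mtr);

	if (err == DB_FAIL) {
		err = btr_cur_pessimistic_insert(
			BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
			cursor, &offsets, &offsets_heap,
			ibuf_entry, &ins_rec, &dummy_big_rec, 0, thr, &mtr);
	}
)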
*/ + + root = ibuf_tree_root_get(&mtr); + + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + } + + mutex_exit(&ibuf_pessimistic_insert_mutex); + ibuf_size_update(root, &mtr); + mutex_exit(&ibuf_mutex); + ibuf->empty = page_is_empty(root); + + block = btr_cur_get_block(cursor); + ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID); + } + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + if (err == DB_SUCCESS && op != IBUF_OP_DELETE) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, NULL, + thr_get_trx(thr)->id, &mtr); + } + +func_exit: +#ifdef UNIV_IBUF_COUNT_DEBUG + if (err == DB_SUCCESS) { + fprintf(stderr, + "Incrementing ibuf count of space %lu page %lu\n" + "from %lu by 1\n", space, page_no, + ibuf_count_get(space, page_no)); + + ibuf_count_set(space, page_no, + ibuf_count_get(space, page_no) + 1); + } +#endif + + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + mem_heap_free(heap); + + if (err == DB_SUCCESS && mode == BTR_MODIFY_TREE) { + ibuf_contract_after_insert(entry_size); + } + + if (do_merge) { +#ifdef UNIV_IBUF_DEBUG + ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif + buf_read_ibuf_merge_pages(false, space_ids, space_versions, + page_nos, n_stored); + } + + return(err); +} + +/*********************************************************************//** +Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. Does not do it if the index +is clustered or unique. +@return TRUE if success */ +UNIV_INTERN +ibool +ibuf_insert( +/*========*/ + ibuf_op_t op, /*!< in: operation type */ + const dtuple_t* entry, /*!< in: index entry to insert */ + dict_index_t* index, /*!< in: index where to insert */ + ulint space, /*!< in: space id where to insert */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint page_no,/*!< in: page number where to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ulint entry_size; + ibool no_counter; + /* Read the settable global variable ibuf_use only once in + this function, so that we will have a consistent view of it. 
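(This is a deliberately unsynchronized read: the variable is settable at runtime, exposed to users as innodb_change_buffering, so taking one local snapshot keeps the decisions derived from it consistent with each other even if the global changes concurrently. A minimal sketch of the pattern, using the names below:

	ibuf_use_t	use = ibuf_use;			single racy read
	no_counter = use <= IBUF_USE_INSERT;		every later test reads 'use'
)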
*/ + ibuf_use_t use = ibuf_use; + DBUG_ENTER("ibuf_insert"); + + DBUG_PRINT("ibuf", ("op: %d, space: %ld, page_no: %ld", + op, space, page_no)); + + ut_ad(dtuple_check_typed(entry)); + ut_ad(ut_is_2pow(zip_size)); + + ut_a(!dict_index_is_clust(index)); + + no_counter = use <= IBUF_USE_INSERT; + + switch (op) { + case IBUF_OP_INSERT: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_DELETE: + case IBUF_USE_DELETE_MARK: + DBUG_RETURN(FALSE); + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + goto check_watch; + case IBUF_USE_COUNT: + break; + } + break; + case IBUF_OP_DELETE_MARK: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + DBUG_RETURN(FALSE); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto check_watch; + case IBUF_USE_COUNT: + break; + } + break; + case IBUF_OP_DELETE: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + DBUG_RETURN(FALSE); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto skip_watch; + case IBUF_USE_COUNT: + break; + } + break; + case IBUF_OP_COUNT: + break; + } + + /* unknown op or use */ + ut_error; + +check_watch: + /* If a thread attempts to buffer an insert on a page while a + purge is in progress on the same page, the purge must not be + buffered, because it could remove a record that was + re-inserted later. For simplicity, we block the buffering of + all operations on a page that has a purge pending. + + We do not check this in the IBUF_OP_DELETE case, because that + would always trigger the buffer pool watch during purge and + thus prevent the buffering of delete operations. We assume + that the issuer of IBUF_OP_DELETE has called + buf_pool_watch_set(space, page_no). */ + + { + buf_page_t* bpage; + buf_pool_t* buf_pool = buf_pool_get(space, page_no); + bpage = buf_page_hash_get(buf_pool, space, page_no); + + if (UNIV_LIKELY_NULL(bpage)) { + /* A buffer pool watch has been set or the + page has been read into the buffer pool. + Do not buffer the request. If a purge operation + is being buffered, have this request executed + directly on the page in the buffer pool after the + buffered entries for this page have been merged. */ + DBUG_RETURN(FALSE); + } + } + +skip_watch: + entry_size = rec_get_converted_size(index, entry, 0); + + if (entry_size + >= page_get_free_space_of_empty(dict_table_is_comp(index->table)) + / 2) { + + DBUG_RETURN(FALSE); + } + + err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter, + entry, entry_size, + index, space, zip_size, page_no, thr); + if (err == DB_FAIL) { + err = ibuf_insert_low(BTR_MODIFY_TREE, op, no_counter, + entry, entry_size, + index, space, zip_size, page_no, thr); + } + + if (err == DB_SUCCESS) { +#ifdef UNIV_IBUF_DEBUG + /* fprintf(stderr, "Ibuf insert for page no %lu of index %s\n", + page_no, index->name); */ +#endif + DBUG_RETURN(TRUE); + + } else { + ut_a(err == DB_STRONG_FAIL || err == DB_TOO_BIG_RECORD); + + DBUG_RETURN(FALSE); + } +} + +/********************************************************************//** +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. 
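(A note on the size gate at the end of ibuf_insert() above: an entry is never buffered once its converted size reaches half the free space of an empty page, since such a record could not be guaranteed to fit on a page together with other records; with the default 16k UNIV_PAGE_SIZE the cutoff is on the order of 8k. The gate, as written above:

	if (entry_size
	    >= page_get_free_space_of_empty(
		    dict_table_is_comp(index->table)) / 2) {
		DBUG_RETURN(FALSE);
	}
)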
+@return newly inserted record */ +static __attribute__((nonnull)) +rec_t* +ibuf_insert_to_index_page_low( +/*==========================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + buf_block_t* block, /*!< in/out: index page where the buffered + entry should be placed */ + dict_index_t* index, /*!< in: record descriptor */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr, /*!< in/out: mtr */ + page_cur_t* page_cur)/*!< in/out: cursor positioned on the record + after which to insert the buffered entry */ +{ + const page_t* page; + ulint space; + ulint page_no; + ulint zip_size; + const page_t* bitmap_page; + ulint old_bits; + rec_t* rec; + DBUG_ENTER("ibuf_insert_to_index_page_low"); + + rec = page_cur_tuple_insert(page_cur, entry, index, + offsets, &heap, 0, mtr); + if (rec != NULL) { + DBUG_RETURN(rec); + } + + /* Page reorganization or recompression should already have + been attempted by page_cur_tuple_insert(). Besides, per + ibuf_index_page_calc_free_zip() the page should not have been + recompressed or reorganized. */ + ut_ad(!buf_block_get_page_zip(block)); + + /* If the record did not fit, reorganize */ + + btr_page_reorganize(page_cur, index, mtr); + + /* This time the record must fit */ + + rec = page_cur_tuple_insert(page_cur, entry, index, + offsets, &heap, 0, mtr); + if (rec != NULL) { + DBUG_RETURN(rec); + } + + page = buf_block_get_frame(block); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Insert buffer insert fails;" + " page free %lu, dtuple size %lu\n", + (ulong) page_get_max_insert_size(page, 1), + (ulong) rec_get_converted_size(index, entry, 0)); + fputs("InnoDB: Cannot insert index record ", stderr); + dtuple_print(stderr, entry); + fputs("\nInnoDB: The table where this index record belongs\n" + "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n" + "InnoDB: that table.\n", stderr); + + space = page_get_space_id(page); + zip_size = buf_block_get_zip_size(block); + page_no = page_get_page_no(page); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); + old_bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, mtr); + + fprintf(stderr, + "InnoDB: space %lu, page %lu, zip_size %lu, bitmap bits %lu\n", + (ulong) space, (ulong) page_no, + (ulong) zip_size, (ulong) old_bits); + + fputs("InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + ut_ad(0); + DBUG_RETURN(NULL); +} + +/************************************************************************ +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. 
 */ +static +void +ibuf_insert_to_index_page( +/*======================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + buf_block_t* block, /*!< in/out: index page where the buffered entry + should be placed */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + ulint low_match; + page_t* page = buf_block_get_frame(block); + rec_t* rec; + ulint* offsets; + mem_heap_t* heap; + + DBUG_ENTER("ibuf_insert_to_index_page"); + + DBUG_PRINT("ibuf", ("page_no: %ld", buf_block_get_page_no(block))); + DBUG_PRINT("ibuf", ("index name: %s", index->name)); + DBUG_PRINT("ibuf", ("online status: %d", + dict_index_get_online_status(index))); + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(!buf_block_align(page)->index); + + if (UNIV_UNLIKELY(dict_table_is_comp(index->table) + != (ibool)!!page_is_comp(page))) { + fputs("InnoDB: Trying to insert a record from" + " the insert buffer to an index page\n" + "InnoDB: but the 'compact' flag does not match!\n", + stderr); + goto dump; + } + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (page_rec_is_supremum(rec)) { + fputs("InnoDB: Trying to insert a record from" + " the insert buffer to an index page\n" + "InnoDB: but the index page is empty!\n", + stderr); + goto dump; + } + + if (UNIV_UNLIKELY(rec_get_n_fields(rec, index) + != dtuple_get_n_fields(entry))) { + fputs("InnoDB: Trying to insert a record from" + " the insert buffer to an index page\n" + "InnoDB: but the number of fields does not match!\n", + stderr); +dump: + buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH); + + dtuple_print(stderr, entry); + ut_ad(0); + + fputs("InnoDB: The table where" + " this index record belongs\n" + "InnoDB: is now probably corrupt." + " Please run CHECK TABLE on\n" + "InnoDB: your tables.\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com!\n", stderr); + + DBUG_VOID_RETURN; + } + + low_match = page_cur_search(block, index, entry, + PAGE_CUR_LE, &page_cur); + + heap = mem_heap_create( + sizeof(upd_t) + + REC_OFFS_HEADER_SIZE * sizeof(*offsets) + + dtuple_get_n_fields(entry) + * (sizeof(upd_field_t) + sizeof *offsets)); + + if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) { + upd_t* update; + page_zip_des_t* page_zip; + + rec = page_cur_get_rec(&page_cur); + + /* This is based on + row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */ + ut_ad(rec_get_deleted_flag(rec, page_is_comp(page))); + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, + &heap); + update = row_upd_build_sec_rec_difference_binary( + rec, index, offsets, entry, heap); + + page_zip = buf_block_get_page_zip(block); + + if (update->n_fields == 0) { + /* The records only differ in the delete-mark. + Clear the delete-mark, like we did before + Bug #56680 was fixed. */ + btr_cur_set_deleted_flag_for_ibuf( + rec, page_zip, FALSE, mtr); + goto updated_in_place; + } + + /* Copy the info bits. Clear the delete-mark. */ + update->info_bits = rec_get_info_bits(rec, page_is_comp(page)); + update->info_bits &= ~REC_INFO_DELETED_FLAG; + + /* We cannot invoke btr_cur_optimistic_update() here, + because we do not have a btr_cur_t or que_thr_t, + as the insert buffer merge occurs at a very low level. */ + if (!row_upd_changes_field_size_or_external(index, offsets, + update) + && (!page_zip || btr_cur_update_alloc_zip( + page_zip, &page_cur, index, offsets, + rec_offs_size(offsets), false, mtr, NULL))) { + /* This is the easy case.
Do something similar + to btr_cur_update_in_place(). */ + rec = page_cur_get_rec(&page_cur); + row_upd_rec_in_place(rec, index, offsets, + update, page_zip); + + /* Log the update in place operation. During recovery + MLOG_COMP_REC_UPDATE_IN_PLACE/MLOG_REC_UPDATE_IN_PLACE + expects trx_id, roll_ptr for secondary indexes. So we + just write dummy trx_id(0), roll_ptr(0) */ + btr_cur_update_in_place_log(BTR_KEEP_SYS_FLAG, rec, + index, update, 0, 0, mtr); + DBUG_EXECUTE_IF( + "crash_after_log_ibuf_upd_inplace", + log_buffer_flush_to_disk(); + ib_logf(IB_LOG_LEVEL_INFO, + "Wrote log record for ibuf update in " + "place operation"); + DBUG_SUICIDE(); + ); + + goto updated_in_place; + } + + /* btr_cur_update_alloc_zip() may have changed this */ + rec = page_cur_get_rec(&page_cur); + + /* A collation may identify values that differ in + storage length. + Some examples (1 or 2 bytes): + utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I + utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S + utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + + latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S + + Examples of a character (3-byte UTF-8 sequence) + identified with 2 or 4 characters (1-byte UTF-8 sequences): + + utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO + utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN + */ + + /* Delete the different-length record, and insert the + buffered one. */ + + lock_rec_store_on_page_infimum(block, rec); + page_cur_delete_rec(&page_cur, index, offsets, mtr); + page_cur_move_to_prev(&page_cur); + rec = ibuf_insert_to_index_page_low(entry, block, index, + &offsets, heap, mtr, + &page_cur); + + ut_ad(!cmp_dtuple_rec(entry, rec, offsets)); + lock_rec_restore_from_page_infimum(block, rec, block); + } else { + offsets = NULL; + ibuf_insert_to_index_page_low(entry, block, index, + &offsets, heap, mtr, + &page_cur); + } +updated_in_place: + mem_heap_free(heap); + + DBUG_VOID_RETURN; +} + +/****************************************************************//** +During merge, sets the delete mark on a record for a secondary index +entry. */ +static +void +ibuf_set_del_mark( +/*==============*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + const dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + ulint low_match; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + + low_match = page_cur_search( + block, index, entry, PAGE_CUR_LE, &page_cur); + + if (low_match == dtuple_get_n_fields(entry)) { + rec_t* rec; + page_zip_des_t* page_zip; + + rec = page_cur_get_rec(&page_cur); + page_zip = page_cur_get_page_zip(&page_cur); + + /* Delete mark the old index record. According to a + comment in row_upd_sec_index_entry(), it can already + have been delete marked if a lock wait occurred in + row_ins_sec_index_entry() in a previous invocation of + row_upd_sec_index_entry(). 
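(Whether because of the lock-wait scenario just described or because a merge is replayed after a crash, the record may already carry the delete mark; the guard below makes the operation an idempotent no-op in that case:

	if (!rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
		btr_cur_set_deleted_flag_for_ibuf(rec, page_zip, TRUE, mtr);
	}
)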
*/ + + if (UNIV_LIKELY + (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table)))) { + btr_cur_set_deleted_flag_for_ibuf(rec, page_zip, + TRUE, mtr); + } + } else { + const page_t* page + = page_cur_get_page(&page_cur); + const buf_block_t* block + = page_cur_get_block(&page_cur); + + ut_print_timestamp(stderr); + fputs(" InnoDB: unable to find a record to delete-mark\n", + stderr); + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, page_cur_get_rec(&page_cur), index); + fprintf(stderr, "\nspace %u offset %u" + " (%u records, index id %llu)\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + (unsigned) buf_block_get_space(block), + (unsigned) buf_block_get_page_no(block), + (unsigned) page_get_n_recs(page), + (ulonglong) btr_page_get_index_id(page)); + ut_ad(0); + } +} + +/****************************************************************//** +During merge, delete a record for a secondary index entry. */ +static +void +ibuf_delete( +/*========*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in/out: mtr; must be committed + before latching any further pages */ +{ + page_cur_t page_cur; + ulint low_match; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + + low_match = page_cur_search( + block, index, entry, PAGE_CUR_LE, &page_cur); + + if (low_match == dtuple_get_n_fields(entry)) { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + rec_t* rec = page_cur_get_rec(&page_cur); + + /* TODO: the below should probably be a separate function, + it's a bastardized version of btr_cur_optimistic_delete. */ + + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + ulint max_ins_size = 0; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (page_get_n_recs(page) <= 1 + || !(REC_INFO_DELETED_FLAG + & rec_get_info_bits(rec, page_is_comp(page)))) { + /* Refuse to purge the last record or a + record that has not been marked for deletion. */ + ut_print_timestamp(stderr); + fputs(" InnoDB: unable to purge a record\n", + stderr); + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + fprintf(stderr, "\nspace %u offset %u" + " (%u records, index id %llu)\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + (unsigned) buf_block_get_space(block), + (unsigned) buf_block_get_page_no(block), + (unsigned) page_get_n_recs(page), + (ulonglong) btr_page_get_index_id(page)); + + ut_ad(0); + return; + } + + lock_update_delete(block, rec); + + if (!page_zip) { + max_ins_size + = page_get_max_insert_size_after_reorganize( + page, 1); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&page_cur, index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { + /* The record must have been purged already. 
*/ + } +} + +/*********************************************************************//** +Restores insert buffer tree cursor position +@return TRUE if the position was restored; FALSE if not */ +static __attribute__((nonnull)) +ibool +ibuf_restore_pos( +/*=============*/ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: index page number where the record + should belong */ + const dtuple_t* search_tuple, + /*!< in: search tuple for entries of page_no */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor whose + position is to be restored */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE); + + if (btr_pcur_restore_position(mode, pcur, mtr)) { + + return(TRUE); + } + + if (fil_space_get_flags(space) == ULINT_UNDEFINED) { + /* The tablespace has been dropped. It is possible + that another thread has deleted the insert buffer + entry. Do not complain. */ + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + } else { + fprintf(stderr, + "InnoDB: ERROR: Submit the output to" + " http://bugs.mysql.com\n" + "InnoDB: ibuf cursor restoration fails!\n" + "InnoDB: ibuf record inserted to page %lu:%lu\n", + (ulong) space, (ulong) page_no); + fflush(stderr); + + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); + dtuple_print(stderr, search_tuple); + + rec_print_old(stderr, + page_rec_get_next(btr_pcur_get_rec(pcur))); + fflush(stderr); + + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + ut_ad(0); + } + + return(FALSE); +} + +/*********************************************************************//** +Deletes from ibuf the record on which pcur is positioned. If we have to +resort to a pessimistic delete, this function commits mtr and closes +the cursor. +@return TRUE if mtr was committed and pcur closed in this operation */ +static __attribute__((warn_unused_result)) +ibool +ibuf_delete_rec( +/*============*/ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: index page number that the record + should belong to */ + btr_pcur_t* pcur, /*!< in: pcur positioned on the record to + delete, having latch mode BTR_MODIFY_LEAF */ + const dtuple_t* search_tuple, + /*!< in: search tuple for entries of page_no */ + mtr_t* mtr) /*!< in: mtr */ +{ + ibool success; + page_t* root; + dberr_t err; + + ut_ad(ibuf_inside(mtr)); + ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); + ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no); + ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + if (ibuf_debug == 2) { + /* Inject a fault (crash). We do this before trying + optimistic delete, because a pessimistic delete in the + change buffer would require a larger test case. */ + + /* Flag the buffered record as processed, to avoid + an assertion failure after crash recovery. */ + btr_cur_set_deleted_flag_for_ibuf( + btr_pcur_get_rec(pcur), NULL, TRUE, mtr); + ibuf_mtr_commit(mtr); + log_write_up_to(LSN_MAX, LOG_WAIT_ALL_GROUPS, TRUE); + DBUG_SUICIDE(); + } +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + + success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), + 0, mtr); + + if (success) { + if (page_is_empty(btr_pcur_get_page(pcur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. 
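(The ibuf->empty flag is kept consistent under the root page latch: it is set here when the optimistic delete leaves the root empty, and it is recomputed in ibuf_insert_low() after a successful insert. A sketch of the invariant checked below:

	root = btr_pcur_get_page(pcur);
	ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO);
	ut_ad(!ibuf->empty);
	ibuf->empty = true;
)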
*/ + root = btr_pcur_get_page(pcur); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + /* ibuf->empty is protected by the root page latch. + Before the deletion, it had to be FALSE. */ + ut_ad(!ibuf->empty); + ibuf->empty = true; + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + fprintf(stderr, + "Decrementing ibuf count of space %lu page %lu\n" + "from %lu by 1\n", space, page_no, + ibuf_count_get(space, page_no)); + ibuf_count_set(space, page_no, + ibuf_count_get(space, page_no) - 1); +#endif + return(FALSE); + } + + ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); + ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no); + ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space); + + /* We have to resort to a pessimistic delete from ibuf. + Delete-mark the record so that it will not be applied again, + in case the server crashes before the pessimistic delete is + made persistent. */ + btr_cur_set_deleted_flag_for_ibuf( + btr_pcur_get_rec(pcur), NULL, TRUE, mtr); + + btr_pcur_store_position(pcur, mtr); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + + ibuf_mtr_start(mtr); + mutex_enter(&ibuf_mutex); + + if (!ibuf_restore_pos(space, page_no, search_tuple, + BTR_MODIFY_TREE, pcur, mtr)) { + + mutex_exit(&ibuf_mutex); + ut_ad(mtr->state == MTR_COMMITTED); + goto func_exit; + } + + root = ibuf_tree_root_get(mtr); + + btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0, + RB_NONE, mtr); + ut_a(err == DB_SUCCESS); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1); +#endif + ibuf_size_update(root, mtr); + mutex_exit(&ibuf_mutex); + + ibuf->empty = page_is_empty(root); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + +func_exit: + ut_ad(mtr->state == MTR_COMMITTED); + btr_pcur_close(pcur); + + return(TRUE); +} + +/*********************************************************************//** +When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. */ +UNIV_INTERN +void +ibuf_merge_or_delete_for_page( +/*==========================*/ + buf_block_t* block, /*!< in: if page has been read from + disk, pointer to the page x-latched, + else NULL */ + ulint space, /*!< in: space id of the index page */ + ulint page_no,/*!< in: page number of the index page */ + ulint zip_size,/*!< in: compressed page size in bytes, + or 0 */ + ibool update_ibuf_bitmap)/*!< in: normally this is set + to TRUE, but if we have deleted or are + deleting the tablespace, then we + naturally do not want to update a + non-existent bitmap page */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + dtuple_t* search_tuple; +#ifdef UNIV_IBUF_DEBUG + ulint volume = 0; +#endif + page_zip_des_t* page_zip = NULL; + ibool tablespace_being_deleted = FALSE; + ibool corruption_noticed = FALSE; + mtr_t mtr; + + /* Counts for merged & discarded operations. 
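(These arrays are indexed by ibuf_op_t: mops[] counts buffered operations actually applied to the index page, dops[] counts operations discarded, e.g. when the page is corrupt or the record was already applied. At the end of the merge they are folded into the global statistics that ibuf_print() reports; condensed from the code below:

	memset(mops, 0, sizeof(mops));
	memset(dops, 0, sizeof(dops));
	...
	mops[op]++;					applied to the page
	dops[ibuf_rec_get_op_type(&mtr, rec)]++;	discarded
	...
	ibuf_add_ops(ibuf->n_merged_ops, mops);
	ibuf_add_ops(ibuf->n_discarded_ops, dops);
)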
*/ + ulint mops[IBUF_OP_COUNT]; + ulint dops[IBUF_OP_COUNT]; + + ut_ad(!block || buf_block_get_space(block) == space); + ut_ad(!block || buf_block_get_page_no(block) == page_no); + ut_ad(!block || buf_block_get_zip_size(block) == zip_size); + ut_ad(!block || buf_block_get_io_fix_unlocked(block) == BUF_IO_READ); + + if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE + || trx_sys_hdr_page(space, page_no)) { + return; + } + + /* We cannot refer to zip_size in the following, because + zip_size is passed as ULINT_UNDEFINED (it is unknown) when + buf_read_ibuf_merge_pages() is merging (discarding) changes + for a dropped tablespace. When block != NULL or + update_ibuf_bitmap is specified, the zip_size must be known. + That is why we will repeat the check below, with zip_size in + place of 0. Passing zip_size as 0 assumes that the + uncompressed page size always is a power-of-2 multiple of the + compressed page size. */ + + if (ibuf_fixed_addr_page(space, 0, page_no) + || fsp_descr_page(0, page_no)) { + return; + } + + if (UNIV_LIKELY(update_ibuf_bitmap)) { + ut_a(ut_is_2pow(zip_size)); + + if (ibuf_fixed_addr_page(space, zip_size, page_no) + || fsp_descr_page(zip_size, page_no)) { + return; + } + + /* If the following returns FALSE, we get the counter + incremented, and must decrement it when we leave this + function. When the counter is > 0, that prevents tablespace + from being dropped. */ + + tablespace_being_deleted = fil_inc_pending_ops(space, true); + + if (UNIV_UNLIKELY(tablespace_being_deleted)) { + /* Do not try to read the bitmap page from space; + just delete the ibuf records for the page */ + + block = NULL; + update_ibuf_bitmap = FALSE; + } else { + page_t* bitmap_page; + ulint bitmap_bits; + + ibuf_mtr_start(&mtr); + + bitmap_page = ibuf_bitmap_get_map_page( + space, page_no, zip_size, &mtr); + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, &mtr); + + ibuf_mtr_commit(&mtr); + + if (!bitmap_bits) { + /* No inserts buffered for this page */ + + if (!tablespace_being_deleted) { + fil_decr_pending_ops(space); + } + + return; + } + } + } else if (block + && (ibuf_fixed_addr_page(space, zip_size, page_no) + || fsp_descr_page(zip_size, page_no))) { + + return; + } + + heap = mem_heap_create(512); + + search_tuple = ibuf_search_tuple_build(space, page_no, heap); + + if (block) { + /* Move the ownership of the x-latch on the page to this OS + thread, so that we can acquire a second x-latch on it. This + is needed for the insert operations to the index page to pass + the debug checks. */ + + rw_lock_x_lock_move_ownership(&(block->lock)); + page_zip = buf_block_get_page_zip(block); + + if (UNIV_UNLIKELY(fil_page_get_type(block->frame) + != FIL_PAGE_INDEX) + || UNIV_UNLIKELY(!page_is_leaf(block->frame))) { + + page_t* bitmap_page; + + corruption_noticed = TRUE; + + ut_print_timestamp(stderr); + + ibuf_mtr_start(&mtr); + + fputs(" InnoDB: Dump of the ibuf bitmap page:\n", + stderr); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, + zip_size, &mtr); + buf_page_print(bitmap_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + ibuf_mtr_commit(&mtr); + + fputs("\nInnoDB: Dump of the page:\n", stderr); + + buf_page_print(block->frame, 0, + BUF_PAGE_PRINT_NO_CRASH); + + fprintf(stderr, + "InnoDB: Error: corruption in the tablespace." 
+ " Bitmap shows insert\n" + "InnoDB: buffer records to page n:o %lu" + " though the page\n" + "InnoDB: type is %lu, which is" + " not an index leaf page!\n" + "InnoDB: We try to resolve the problem" + " by skipping the insert buffer\n" + "InnoDB: merge for this page." + " Please run CHECK TABLE on your tables\n" + "InnoDB: to determine if they are corrupt" + " after this.\n\n" + "InnoDB: Please submit a detailed bug report" + " to http://bugs.mysql.com\n\n", + (ulong) page_no, + (ulong) + fil_page_get_type(block->frame)); + ut_ad(0); + } + } + + memset(mops, 0, sizeof(mops)); + memset(dops, 0, sizeof(dops)); + +loop: + ibuf_mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for this + index page */ + btr_pcur_open_on_user_rec( + ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (block) { + ibool success; + + success = buf_page_get_known_nowait( + RW_X_LATCH, block, + BUF_KEEP_OLD, __FILE__, __LINE__, &mtr); + + ut_a(success); + + /* This is a user page (secondary index leaf page), + but we pretend that it is a change buffer page in + order to obey the latching order. This should be OK, + because buffered changes are applied immediately while + the block is io-fixed. Other threads must not try to + latch an io-fixed block. */ + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + } + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); + + goto reset_bit; + } + + for (;;) { + rec_t* rec; + + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this index page */ + if (ibuf_rec_get_page_no(&mtr, rec) != page_no + || ibuf_rec_get_space(&mtr, rec) != space) { + + if (block) { + page_header_reset_last_insert( + block->frame, page_zip, &mtr); + } + + goto reset_bit; + } + + if (UNIV_UNLIKELY(corruption_noticed)) { + fputs("InnoDB: Discarding record\n ", stderr); + rec_print_old(stderr, rec); + fputs("\nInnoDB: from the insert buffer!\n\n", stderr); + } else if (block && !rec_get_deleted_flag(rec, 0)) { + /* Now we have at pcur a record which should be + applied on the index page; NOTE that the call below + copies pointers to fields in rec, and we must + keep the latch to the rec page until the + insertion is finished! */ + dtuple_t* entry; + trx_id_t max_trx_id; + dict_index_t* dummy_index; + ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec); + + max_trx_id = page_get_max_trx_id(page_align(rec)); + page_update_max_trx_id(block, page_zip, max_trx_id, + &mtr); + + ut_ad(page_validate(page_align(rec), ibuf->index)); + + entry = ibuf_build_entry_from_ibuf_rec( + &mtr, rec, heap, &dummy_index); + + ut_ad(page_validate(block->frame, dummy_index)); + + switch (op) { + ibool success; + case IBUF_OP_INSERT: +#ifdef UNIV_IBUF_DEBUG + volume += rec_get_converted_size( + dummy_index, entry, 0); + + volume += page_dir_calc_reserved_space(1); + + ut_a(volume <= 4 * UNIV_PAGE_SIZE + / IBUF_PAGE_SIZE_PER_FREE_SPACE); +#endif + ibuf_insert_to_index_page( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE_MARK: + ibuf_set_del_mark( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE: + ibuf_delete(entry, block, dummy_index, &mtr); + /* Because ibuf_delete() will latch an + insert buffer bitmap page, commit mtr + before latching any further pages. + Store and restore the cursor position. 
*/ + ut_ad(rec == btr_pcur_get_rec(&pcur)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(ibuf_rec_get_page_no(&mtr, rec) + == page_no); + ut_ad(ibuf_rec_get_space(&mtr, rec) == space); + + /* Mark the change buffer record processed, + so that it will not be merged again in case + the server crashes between the following + mtr_commit() and the subsequent mtr_commit() + of deleting the change buffer record. */ + + btr_cur_set_deleted_flag_for_ibuf( + btr_pcur_get_rec(&pcur), NULL, + TRUE, &mtr); + + btr_pcur_store_position(&pcur, &mtr); + ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr); + + ibuf_mtr_start(&mtr); + + success = buf_page_get_known_nowait( + RW_X_LATCH, block, + BUF_KEEP_OLD, + __FILE__, __LINE__, &mtr); + ut_a(success); + + /* This is a user page (secondary + index leaf page), but it should be OK + to use too low latching order for it, + as the block is io-fixed. */ + buf_block_dbg_add_level( + block, SYNC_IBUF_TREE_NODE); + + if (!ibuf_restore_pos(space, page_no, + search_tuple, + BTR_MODIFY_LEAF, + &pcur, &mtr)) { + + ut_ad(mtr.state == MTR_COMMITTED); + mops[op]++; + ibuf_dummy_index_free(dummy_index); + goto loop; + } + + break; + default: + ut_error; + } + + mops[op]++; + + ibuf_dummy_index_free(dummy_index); + } else { + dops[ibuf_rec_get_op_type(&mtr, rec)]++; + } + + /* Delete the record from ibuf */ + if (ibuf_delete_rec(space, page_no, &pcur, search_tuple, + &mtr)) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + ut_ad(mtr.state == MTR_COMMITTED); + goto loop; + } else if (btr_pcur_is_after_last_on_page(&pcur)) { + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + goto loop; + } + } + +reset_bit: + if (UNIV_LIKELY(update_ibuf_bitmap)) { + page_t* bitmap_page; + + bitmap_page = ibuf_bitmap_get_map_page( + space, page_no, zip_size, &mtr); + + ibuf_bitmap_page_set_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, FALSE, &mtr); + + if (block) { + ulint old_bits = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, &mtr); + + ulint new_bits = ibuf_index_page_calc_free( + zip_size, block); + + if (old_bits != new_bits) { + ibuf_bitmap_page_set_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, new_bits, &mtr); + } + } + } + + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + mem_heap_free(heap); + +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_increment_ulint(&ibuf->n_merges, 1); + ibuf_add_ops(ibuf->n_merged_ops, mops); + ibuf_add_ops(ibuf->n_discarded_ops, dops); +#else /* HAVE_ATOMIC_BUILTINS */ + /* Protect our statistics keeping from race conditions */ + mutex_enter(&ibuf_mutex); + + ibuf->n_merges++; + ibuf_add_ops(ibuf->n_merged_ops, mops); + ibuf_add_ops(ibuf->n_discarded_ops, dops); + + mutex_exit(&ibuf_mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + if (update_ibuf_bitmap && !tablespace_being_deleted) { + + fil_decr_pending_ops(space); + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(space, page_no) == 0); +#endif +} + +/*********************************************************************//** +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! 
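(A condensed pseudocode sketch of the scan below, for orientation; building the search tuple with page number 0 positions the cursor at the first change buffer entry for the space, and the loop stops at the first record belonging to another space:

	search_tuple = ibuf_search_tuple_build(space, 0, heap);
	btr_pcur_open_on_user_rec(ibuf->index, search_tuple, PAGE_CUR_GE,
				  BTR_MODIFY_LEAF, &pcur, &mtr);
	while (positioned on a user record
	       && ibuf_rec_get_space(&mtr, ibuf_rec) == space) {
		dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
		ibuf_delete_rec(space, page_no, &pcur, search_tuple, &mtr);
	}
)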
*/ +UNIV_INTERN +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space) /*!< in: space id */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + dtuple_t* search_tuple; + const rec_t* ibuf_rec; + ulint page_no; + mtr_t mtr; + + /* Counts for discarded operations. */ + ulint dops[IBUF_OP_COUNT]; + + heap = mem_heap_create(512); + + /* Use page number 0 to build the search tuple so that we get the + cursor positioned at the first entry for this space id */ + + search_tuple = ibuf_search_tuple_build(space, 0, heap); + + memset(dops, 0, sizeof(dops)); +loop: + ibuf_mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for the + space */ + btr_pcur_open_on_user_rec( + ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); + + goto leave_loop; + } + + for (;;) { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + ibuf_rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this space */ + if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) { + + goto leave_loop; + } + + page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec); + + dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++; + + /* Delete the record from ibuf */ + if (ibuf_delete_rec(space, page_no, &pcur, search_tuple, + &mtr)) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + ut_ad(mtr.state == MTR_COMMITTED); + goto loop; + } + + if (btr_pcur_is_after_last_on_page(&pcur)) { + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + goto loop; + } + } + +leave_loop: + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + +#ifdef HAVE_ATOMIC_BUILTINS + ibuf_add_ops(ibuf->n_discarded_ops, dops); +#else /* HAVE_ATOMIC_BUILTINS */ + /* Protect our statistics keeping from race conditions */ + mutex_enter(&ibuf_mutex); + ibuf_add_ops(ibuf->n_discarded_ops, dops); + mutex_exit(&ibuf_mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + mem_heap_free(heap); +} + +/******************************************************************//** +Looks if the insert buffer is empty. +@return true if empty */ +UNIV_INTERN +bool +ibuf_is_empty(void) +/*===============*/ +{ + bool is_empty; + const page_t* root; + mtr_t mtr; + + ibuf_mtr_start(&mtr); + + mutex_enter(&ibuf_mutex); + root = ibuf_tree_root_get(&mtr); + mutex_exit(&ibuf_mutex); + + is_empty = page_is_empty(root); + ut_a(is_empty == ibuf->empty); + ibuf_mtr_commit(&mtr); + + return(is_empty); +} + +/******************************************************************//** +Prints info of ibuf. 
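(This output appears under INSERT BUFFER AND ADAPTIVE HASH INDEX in SHOW ENGINE INNODB STATUS. An illustrative sample, with made-up numbers:

	Ibuf: size 1, free list len 5, seg size 7, 42 merges
	merged operations:
	 insert 40, delete mark 10, delete 2
	discarded operations:
	 insert 0, delete mark 0, delete 0
)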
*/ +UNIV_INTERN +void +ibuf_print( +/*=======*/ + FILE* file) /*!< in: file where to print */ +{ +#ifdef UNIV_IBUF_COUNT_DEBUG + ulint i; + ulint j; +#endif + + mutex_enter(&ibuf_mutex); + + fprintf(file, + "Ibuf: size %lu, free list len %lu," + " seg size %lu, %lu merges\n", + (ulong) ibuf->size, + (ulong) ibuf->free_list_len, + (ulong) ibuf->seg_size, + (ulong) ibuf->n_merges); + + fputs("merged operations:\n ", file); + ibuf_print_ops(ibuf->n_merged_ops, file); + + fputs("discarded operations:\n ", file); + ibuf_print_ops(ibuf->n_discarded_ops, file); + +#ifdef UNIV_IBUF_COUNT_DEBUG + for (i = 0; i < IBUF_COUNT_N_SPACES; i++) { + for (j = 0; j < IBUF_COUNT_N_PAGES; j++) { + ulint count = ibuf_count_get(i, j); + + if (count > 0) { + fprintf(stderr, + "Ibuf count for space/page %lu/%lu" + " is %lu\n", + (ulong) i, (ulong) j, (ulong) count); + } + } + } +#endif /* UNIV_IBUF_COUNT_DEBUG */ + + mutex_exit(&ibuf_mutex); +} + +/******************************************************************//** +Checks the insert buffer bitmaps on IMPORT TABLESPACE. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +ibuf_check_bitmap_on_import( +/*========================*/ + const trx_t* trx, /*!< in: transaction */ + ulint space_id) /*!< in: tablespace identifier */ +{ + ulint zip_size; + ulint page_size; + ulint size; + ulint page_no; + + ut_ad(space_id); + ut_ad(trx->mysql_thd); + + zip_size = fil_space_get_zip_size(space_id); + + if (zip_size == ULINT_UNDEFINED) { + return(DB_TABLE_NOT_FOUND); + } + + size = fil_space_get_size(space_id); + + if (size == 0) { + return(DB_TABLE_NOT_FOUND); + } + + mutex_enter(&ibuf_mutex); + + page_size = zip_size ? zip_size : UNIV_PAGE_SIZE; + + for (page_no = 0; page_no < size; page_no += page_size) { + mtr_t mtr; + page_t* bitmap_page; + ulint i; + + if (trx_is_interrupted(trx)) { + mutex_exit(&ibuf_mutex); + return(DB_INTERRUPTED); + } + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + ibuf_enter(&mtr); + + bitmap_page = ibuf_bitmap_get_map_page( + space_id, page_no, zip_size, &mtr); + + for (i = FSP_IBUF_BITMAP_OFFSET + 1; i < page_size; i++) { + const ulint offset = page_no + i; + + if (ibuf_bitmap_page_get_bits( + bitmap_page, offset, zip_size, + IBUF_BITMAP_IBUF, &mtr)) { + + mutex_exit(&ibuf_mutex); + ibuf_exit(&mtr); + mtr_commit(&mtr); + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Space %u page %u" + " is wrongly flagged to belong to the" + " insert buffer", + (unsigned) space_id, + (unsigned) offset); + + return(DB_CORRUPTION); + } + + if (ibuf_bitmap_page_get_bits( + bitmap_page, offset, zip_size, + IBUF_BITMAP_BUFFERED, &mtr)) { + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Buffered changes" + " for space %u page %u are lost", + (unsigned) space_id, + (unsigned) offset); + + /* Tolerate this error, so that + slightly corrupted tables can be + imported and dumped. Clear the bit. */ + ibuf_bitmap_page_set_bits( + bitmap_page, offset, zip_size, + IBUF_BITMAP_BUFFERED, FALSE, &mtr); + } + } + + ibuf_exit(&mtr); + mtr_commit(&mtr); + } + + mutex_exit(&ibuf_mutex); + return(DB_SUCCESS); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/api0api.h b/storage/xtradb/include/api0api.h new file mode 100644 index 00000000000..d77d691becc --- /dev/null +++ b/storage/xtradb/include/api0api.h @@ -0,0 +1,1304 @@ +/***************************************************************************** + +Copyright (c) 2011, 2013, Oracle and/or its affiliates. 
All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/api0api.h +InnoDB Native API + +2008-08-01 Created by Sunny Bains. +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +*******************************************************/ + +#ifndef api0api_h +#define api0api_h + +#include "db0err.h" +#include <stdio.h> + +#ifdef _MSC_VER +#define strncasecmp _strnicmp +#define strcasecmp _stricmp +#endif + +#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER) +#define UNIV_NO_IGNORE __attribute__ ((warn_unused_result)) +#else +#define UNIV_NO_IGNORE +#endif /* __GNUC__ && __GNUC__ > 2 && !__INTEL_COMPILER */ + +/* See comment about ib_bool_t as to why the two macros are unsigned long. */ +/** The boolean value of "true" used internally within InnoDB */ +#define IB_TRUE 0x1UL +/** The boolean value of "false" used internally within InnoDB */ +#define IB_FALSE 0x0UL + +/* Basic types used by the InnoDB API. */ +/** All InnoDB error codes are represented by ib_err_t */ +typedef enum dberr_t ib_err_t; +/** Representation of a byte within InnoDB */ +typedef unsigned char ib_byte_t; +/** Representation of an unsigned long int within InnoDB */ +typedef unsigned long int ib_ulint_t; + +/* We assume C99 support except when using Visual Studio. */ +#if !defined(_MSC_VER) +#include <stdint.h> +#endif /* _MSC_VER */ + +/* Integer types used by the API. Microsoft VS defines its own types +and we use the Microsoft types when building with Visual Studio. */ +#if defined(_MSC_VER) +/** A signed 8 bit integral type. */ +typedef __int8 ib_i8_t; +#else +/** A signed 8 bit integral type. */ +typedef int8_t ib_i8_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 8 bit integral type. */ +typedef unsigned __int8 ib_u8_t; +#else +/** An unsigned 8 bit integral type. */ +typedef uint8_t ib_u8_t; +#endif + +#if defined(_MSC_VER) +/** A signed 16 bit integral type. */ +typedef __int16 ib_i16_t; +#else +/** A signed 16 bit integral type. */ +typedef int16_t ib_i16_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 16 bit integral type. */ +typedef unsigned __int16 ib_u16_t; +#else +/** An unsigned 16 bit integral type. */ +typedef uint16_t ib_u16_t; +#endif + +#if defined(_MSC_VER) +/** A signed 32 bit integral type. */ +typedef __int32 ib_i32_t; +#else +/** A signed 32 bit integral type. */ +typedef int32_t ib_i32_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 32 bit integral type. */ +typedef unsigned __int32 ib_u32_t; +#else +/** An unsigned 32 bit integral type. */ +typedef uint32_t ib_u32_t; +#endif + +#if defined(_MSC_VER) +/** A signed 64 bit integral type. */ +typedef __int64 ib_i64_t; +#else +/** A signed 64 bit integral type. */ +typedef int64_t ib_i64_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 64 bit integral type.
*/
+typedef unsigned __int64	ib_u64_t;
+#else
+/** An unsigned 64 bit integral type. */
+typedef uint64_t		ib_u64_t;
+#endif
+
+typedef void*			ib_opaque_t;
+typedef ib_opaque_t		ib_charset_t;
+typedef ib_ulint_t		ib_bool_t;
+typedef ib_u64_t		ib_id_u64_t;
+
+/** @enum ib_cfg_type_t Possible types for a configuration variable. */
+typedef enum {
+	IB_CFG_IBOOL,	/*!< The configuration parameter is
+			of type ibool */
+
+	/* XXX Can we avoid having different types for ulint and ulong?
+	- On Win64 "unsigned long" is 32 bits
+	- ulong is always defined as "unsigned long"
+	- On Win64 ulint is defined as 64 bit integer
+	=> On Win64 ulint != ulong.
+	If we typecast all ulong and ulint variables to the smaller type
+	ulong, then we will cut the range of the ulint variables.
+	This is not a problem for most ulint variables because their max
+	allowed values do not exceed 2^32-1 (e.g. log_groups is ulint
+	but its max allowed value is 10). BUT buffer_pool_size and
+	log_file_size allow up to 2^64-1. */
+
+	IB_CFG_ULINT,	/*!< The configuration parameter is
+			of type ulint */
+
+	IB_CFG_ULONG,	/*!< The configuration parameter is
+			of type ulong */
+
+	IB_CFG_TEXT,	/*!< The configuration parameter is
+			of type char* */
+
+	IB_CFG_CB	/*!< The configuration parameter is
+			a callback parameter */
+} ib_cfg_type_t;
+
+/** @enum ib_col_type_t column types that are supported. */
+typedef enum {
+	IB_VARCHAR = 1,	/*!< Character varying length. The
+			column is not padded. */
+
+	IB_CHAR = 2,	/*!< Fixed length character string. The
+			column is padded to the right. */
+
+	IB_BINARY = 3,	/*!< Fixed length binary, similar to
+			IB_CHAR but the column is not padded
+			to the right. */
+
+	IB_VARBINARY = 4,	/*!< Variable length binary */
+
+	IB_BLOB	= 5,	/*!< Binary large object, or
+			a TEXT type */
+
+	IB_INT = 6,	/*!< Integer: can be any size
+			from 1 - 8 bytes. If the size is
+			1, 2, 4, or 8 bytes then you can use
+			the typed read and write functions. For
+			other sizes you will need to use the
+			ib_col_get_value() function and do the
+			conversion yourself. */
+
+	IB_SYS = 8,	/*!< System column, this column can
+			be one of DATA_TRX_ID, DATA_ROLL_PTR
+			or DATA_ROW_ID. */
+
+	IB_FLOAT = 9,	/*!< C (float) floating point value. */
+
+	IB_DOUBLE = 10,	/*!< C (double) floating point value. */
+
+	IB_DECIMAL = 11,	/*!< Decimal stored as an ASCII
+				string */
+
+	IB_VARCHAR_ANYCHARSET = 12,	/*!< Any charset, varying length */
+
+	IB_CHAR_ANYCHARSET = 13	/*!< Any charset, fixed length */
+
+} ib_col_type_t;
+
+/** @enum ib_tbl_fmt_t InnoDB table format types */
+typedef enum {
+	IB_TBL_REDUNDANT,	/*!< Redundant row format, the column
+				type and length is stored in the row.*/
+
+	IB_TBL_COMPACT,		/*!< Compact row format, the column
+				type is not stored in the row. The
+				length is stored in the row but the
+				storage format uses a compact format
+				to store the length of the column data
+				and record data storage format also
+				uses less storage. */
+
+	IB_TBL_DYNAMIC,		/*!< Compact row format. BLOB prefixes
+				are not stored in the clustered index */
+
+	IB_TBL_COMPRESSED	/*!< Similar to dynamic format but
+				with pages compressed */
+} ib_tbl_fmt_t;
+
+/** @enum ib_col_attr_t InnoDB column attributes */
+typedef enum {
+	IB_COL_NONE = 0,	/*!< No special attributes. */
+
+	IB_COL_NOT_NULL = 1,	/*!< Column data can't be NULL. */
+
+	IB_COL_UNSIGNED = 2,	/*!< Column is IB_INT and unsigned. */
+
+	IB_COL_NOT_USED = 4,	/*!< Future use, reserved.
*/
+
+	IB_COL_CUSTOM1 = 8,	/*!< Custom precision type, this is
+				a bit that is ignored by InnoDB and so
+				can be set and queried by users. */
+
+	IB_COL_CUSTOM2 = 16,	/*!< Custom precision type, this is
+				a bit that is ignored by InnoDB and so
+				can be set and queried by users. */
+
+	IB_COL_CUSTOM3 = 32	/*!< Custom precision type, this is
+				a bit that is ignored by InnoDB and so
+				can be set and queried by users. */
+} ib_col_attr_t;
+
+/* Note: must match lock0types.h */
+/** @enum ib_lck_mode_t InnoDB lock modes. */
+typedef enum {
+	IB_LOCK_IS = 0,		/*!< Intention shared, an intention
+				lock should be used to lock tables */
+
+	IB_LOCK_IX,		/*!< Intention exclusive, an intention
+				lock should be used to lock tables */
+
+	IB_LOCK_S,		/*!< Shared locks should be used to
+				lock rows */
+
+	IB_LOCK_X,		/*!< Exclusive locks should be used to
+				lock rows */
+
+	IB_LOCK_TABLE_X,	/*!< exclusive table lock */
+
+	IB_LOCK_NONE,		/*!< This is used internally to note
+				consistent read */
+
+	IB_LOCK_NUM = IB_LOCK_NONE	/*!< number of lock modes */
+} ib_lck_mode_t;
+
+typedef enum {
+	IB_CLUSTERED = 1,	/*!< clustered index */
+	IB_UNIQUE = 2		/*!< unique index */
+} ib_index_type_t;
+
+/** @enum ib_srch_mode_t InnoDB cursor search modes for ib_cursor_moveto().
+Note: Values must match those found in page0cur.h */
+typedef enum {
+	IB_CUR_G = 1,	/*!< If the search key is not found then
+			position the cursor on the row that
+			is greater than the search key */
+
+	IB_CUR_GE = 2,	/*!< If the search key is not found then
+			position the cursor on the row that
+			is greater than or equal to the search
+			key */
+
+	IB_CUR_L = 3,	/*!< If the search key is not found then
+			position the cursor on the row that
+			is less than the search key */
+
+	IB_CUR_LE = 4	/*!< If the search key is not found then
+			position the cursor on the row that
+			is less than or equal to the search
+			key */
+} ib_srch_mode_t;
+
+/** @enum ib_match_mode_t Various match modes used by ib_cursor_moveto() */
+typedef enum {
+	IB_CLOSEST_MATCH,	/*!< Closest match possible */
+
+	IB_EXACT_MATCH,		/*!< Search using a complete key
+				value */
+
+	IB_EXACT_PREFIX		/*!< Search using a key prefix which
+				must match to rows: the prefix may
+				contain an incomplete field (the
+				last field in prefix may be just
+				a prefix of a fixed length column) */
+} ib_match_mode_t;
+
+/** @struct ib_col_meta_t InnoDB column meta data. */
+typedef struct {
+	ib_col_type_t	type;		/*!< Type of the column */
+
+	ib_col_attr_t	attr;		/*!< Column attributes */
+
+	ib_u32_t	type_len;	/*!< Length of type */
+
+	ib_u16_t	client_type;	/*!< 16 bits of data relevant only to
+					the client. InnoDB doesn't care */
+
+	ib_charset_t*	charset;	/*!< Column charset */
+} ib_col_meta_t;
+
+/* Note: Must be in sync with trx0trx.h */
+/** @enum ib_trx_state_t The transaction state can be queried using the
+ib_trx_state() function. The InnoDB deadlock monitor can roll back a
+transaction and users should be prepared for this, especially where there
+is high contention. The way to determine the state of the transaction is to
+query its state and check. */
+typedef enum {
+	IB_TRX_NOT_STARTED,	/*!< The transaction has not been
+				started yet. */
+
+	IB_TRX_ACTIVE,		/*!< The transaction is currently
+				active and needs to be either
+				committed or rolled back.
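+				A minimal state-check sketch
+				(illustrative only; error handling
+				omitted):
+
+				  if (ib_trx_state(trx)
+				      == IB_TRX_ACTIVE) {
+					err = ib_trx_commit(trx);
+				  }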
*/
+
+	IB_TRX_COMMITTED_IN_MEMORY,	/*!< Not committed to disk yet */
+
+	IB_TRX_PREPARED			/*!< Support for 2PC/XA */
+} ib_trx_state_t;
+
+/* Note: Must be in sync with trx0trx.h */
+/** @enum ib_trx_level_t Transaction isolation levels */
+typedef enum {
+	IB_TRX_READ_UNCOMMITTED = 0,	/*!< Dirty read: non-locking SELECTs are
+					performed so that we do not look at a
+					possible earlier version of a record;
+					thus they are not 'consistent' reads
+					under this isolation level; otherwise
+					like level 2 */
+
+	IB_TRX_READ_COMMITTED = 1,	/*!< Somewhat Oracle-like isolation,
+					except that in range UPDATE and DELETE
+					we must block phantom rows with
+					next-key locks; SELECT ... FOR UPDATE
+					and ... LOCK IN SHARE MODE only lock
+					the index records, NOT the gaps before
+					them, and thus allow free inserting;
+					each consistent read reads its own
+					snapshot */
+
+	IB_TRX_REPEATABLE_READ = 2,	/*!< All consistent reads in the same
+					trx read the same snapshot; full
+					next-key locking used in locking reads
+					to block insertions into gaps */
+
+	IB_TRX_SERIALIZABLE = 3		/*!< All plain SELECTs are converted to
+					LOCK IN SHARE MODE reads */
+} ib_trx_level_t;
+
+/** Generic InnoDB callback prototype. */
+typedef void (*ib_cb_t)(void);
+
+#define IB_CFG_BINLOG_ENABLED	0x1
+#define IB_CFG_MDL_ENABLED	0x2
+#define IB_CFG_DISABLE_ROWLOCK	0x4
+
+/** The first argument to the InnoDB message logging function. By default
+it's set to stderr. You should treat ib_msg_stream_t as a void*, since
+it will probably change in the future. */
+typedef FILE* ib_msg_stream_t;
+
+/** All log messages are written to this function. It should have the same
+behavior as fprintf(3). */
+typedef int (*ib_msg_log_t)(ib_msg_stream_t, const char*, ...);
+
+/* Note: This is to make it easy for API users to have type
+checking for arguments to our functions. Making it ib_opaque_t
+by itself will result in pointer decay resulting in subverting
+of the compiler's type checking. */
+
+/** InnoDB tuple handle. This handle can refer to either a cluster index
+tuple or a secondary index tuple. There are two types of tuples for each
+type of index, making a total of four types of tuple handles. There
+is a tuple for reading the entire row contents and another for searching
+on the index key. */
+typedef struct ib_tuple_t* ib_tpl_t;
+
+/** InnoDB transaction handle, all database operations need to be covered
+by transactions. This handle represents a transaction. The handle can be
+created with ib_trx_begin(), you commit your changes with ib_trx_commit()
+and undo your changes using ib_trx_rollback(). If the InnoDB deadlock
+monitor rolls back the transaction then you need to free the transaction
+using the function ib_trx_release(). You can query the state of an InnoDB
+transaction by calling ib_trx_state(). */
+typedef struct trx_t* ib_trx_t;
+
+/** InnoDB cursor handle */
+typedef struct ib_cursor_t* ib_crsr_t;
+
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use the client code to compare them.
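+
+A minimal sketch of a user-supplied comparator (illustrative only; the
+name my_cmp_cb is hypothetical and not part of this API):
+
+  static int
+  my_cmp_cb(const ib_col_meta_t* col_meta,
+            const ib_byte_t* p1, ib_ulint_t p1_len,
+            const ib_byte_t* p2, ib_ulint_t p2_len)
+  {
+    ib_ulint_t n = (p1_len < p2_len) ? p1_len : p2_len;
+    int cmp = memcmp(p1, p2, n);
+    if (cmp == 0) {
+      cmp = (p1_len > p2_len) - (p1_len < p2_len);
+    }
+    return(cmp);
+  }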
+
+@param col_meta column meta data
+@param p1 key
+@param p1_len key length
+@param p2 second key
+@param p2_len second key length
+@return 1, 0, -1 if p1 is greater than, equal to, or less than p2,
+respectively */
+
+typedef int (*ib_client_cmp_t)(
+	const ib_col_meta_t*	col_meta,
+	const ib_byte_t*	p1,
+	ib_ulint_t		p1_len,
+	const ib_byte_t*	p2,
+	ib_ulint_t		p2_len);
+
+/* This should be the same as univ.i */
+/** Represents SQL_NULL length */
+#define	IB_SQL_NULL		0xFFFFFFFF
+/** The number of system columns in a row. */
+#define	IB_N_SYS_COLS		3
+
+/** The maximum length of a text column. */
+#define	MAX_TEXT_LEN		4096
+
+/* MySQL uses 3 byte UTF-8 encoding. */
+/** The maximum length of a column name in a table schema. */
+#define	IB_MAX_COL_NAME_LEN	(64 * 3)
+
+/** The maximum length of a table name (plus database name). */
+#define	IB_MAX_TABLE_NAME_LEN	((64 * 3) * 2)
+
+/*****************************************************************//**
+Start a transaction that's been rolled back. This special function
+exists for the case when InnoDB's deadlock detector has rolled back
+a transaction. While the transaction has been rolled back the handle
+is still valid and can be reused by calling this function. If you
+don't want to reuse the transaction handle then you can free the handle
+by calling ib_trx_release().
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_trx_start(
+/*=========*/
+	ib_trx_t	ib_trx,		/*!< in: transaction to restart */
+	ib_trx_level_t	ib_trx_level,	/*!< in: trx isolation level */
+	ib_bool_t	read_write,	/*!< in: true if read write
+					transaction */
+	ib_bool_t	auto_commit,	/*!< in: auto commit after each
+					single DML */
+	void*		thd);		/*!< in: THD */
+
+/*****************************************************************//**
+Begin a transaction. This will allocate a new transaction handle and
+put the transaction in the active state.
+@return innobase txn handle */
+
+ib_trx_t
+ib_trx_begin(
+/*=========*/
+	ib_trx_level_t	ib_trx_level,	/*!< in: trx isolation level */
+	ib_bool_t	read_write,	/*!< in: true if read write
+					transaction */
+	ib_bool_t	auto_commit);	/*!< in: auto commit after each
+					single DML */
+
+/*****************************************************************//**
+Query the transaction's state. This function can be used to check for
+the state of the transaction in case it has been rolled back by the
+InnoDB deadlock detector. Note that when a transaction is selected as
+a victim for rollback, InnoDB will always return an appropriate error
+code indicating this. @see DB_DEADLOCK, @see DB_LOCK_TABLE_FULL and
+@see DB_LOCK_WAIT_TIMEOUT
+@return transaction state */
+
+ib_trx_state_t
+ib_trx_state(
+/*=========*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Release the resources of the transaction. If the transaction was
+selected as a victim by InnoDB and rolled back then use this function
+to free the transaction handle.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_release(
+/*===========*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Commit a transaction. This function will release the schema latches too.
+It will also free the transaction handle.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_commit(
+/*==========*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Rollback a transaction. This function will release the schema latches too.
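+
+A minimal lifecycle sketch for these transaction calls (illustrative
+only; note that ib_trx_commit() and ib_trx_rollback() also free the
+handle):
+
+  ib_trx_t trx = ib_trx_begin(IB_TRX_REPEATABLE_READ, IB_TRUE, IB_FALSE);
+  ... do work with cursors ...
+  if (ib_trx_state(trx) == IB_TRX_NOT_STARTED) {
+    err = ib_trx_release(trx);    <- rolled back by the deadlock monitor
+  } else {
+    err = ib_trx_commit(trx);     <- or ib_trx_rollback(trx)
+  }
+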
+It will also free the transaction handle.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_rollback(
+/*============*/
+	ib_trx_t	ib_trx);	/*!< in: trx handle */
+
+/*****************************************************************//**
+Open an InnoDB table and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_table_using_id(
+/*==========================*/
+	ib_id_u64_t	table_id,	/*!< in: table id of table to open */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr);	/*!< out,own: InnoDB cursor */
+
+/*****************************************************************//**
+Open an InnoDB index and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_index_using_id(
+/*==========================*/
+	ib_id_u64_t	index_id,	/*!< in: index id of index to open */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr);	/*!< out: InnoDB cursor */
+
+/*****************************************************************//**
+Open an InnoDB secondary index cursor and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_index_using_name(
+/*============================*/
+	ib_crsr_t	ib_open_crsr,	/*!< in: open/active cursor */
+	const char*	index_name,	/*!< in: secondary index name */
+	ib_crsr_t*	ib_crsr,	/*!< out,own: InnoDB index cursor */
+	int*		idx_type,	/*!< out: index type (e.g. whether
+					it is the cluster index) */
+	ib_id_u64_t*	idx_id);	/*!< out: index id */
+
+/*****************************************************************//**
+Open an InnoDB table by name and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_table(
+/*=================*/
+	const char*	name,		/*!< in: table name */
+	ib_trx_t	ib_trx,		/*!< in: Current transaction handle
+					can be NULL */
+	ib_crsr_t*	ib_crsr);	/*!< out,own: InnoDB cursor */
+
+/*****************************************************************//**
+Reset the cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_reset(
+/*============*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+
+/*****************************************************************//**
+Set a cursor trx to NULL. */
+
+void
+ib_cursor_clear_trx(
+/*================*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Close an InnoDB table and free the cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_close(
+/*============*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Close the table and decrement its n_ref_count.
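+
+A minimal open/use/close sketch for these cursor calls (illustrative
+only; "test/t1" is a hypothetical table name and error handling is
+omitted):
+
+  ib_crsr_t crsr;
+  ib_err_t  err = ib_cursor_open_table("test/t1", trx, &crsr);
+  ... read or modify rows through crsr ...
+  err = ib_cursor_close(crsr);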
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_close_table(
+/*==================*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Update the cursor with a new transaction and also reset the cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_new_trx(
+/*==============*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_trx_t	ib_trx);	/*!< in: transaction */
+
+/*****************************************************************//**
+Commit the transaction in a cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_commit_trx(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_trx_t	ib_trx);	/*!< in: transaction */
+
+/********************************************************************//**
+Open a table using the table name; if found, increment the table ref count.
+@return table instance if found */
+
+void*
+ib_open_table_by_name(
+/*==================*/
+	const char*	name);		/*!< in: table name to lookup */
+
+/*****************************************************************//**
+Insert a row to a table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_insert_row(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor instance */
+	const ib_tpl_t	ib_tpl);	/*!< in: tuple to insert */
+
+/*****************************************************************//**
+Update a row in a table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_update_row(
+/*=================*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	const ib_tpl_t	ib_old_tpl,	/*!< in: Old tuple in table */
+	const ib_tpl_t	ib_new_tpl);	/*!< in: New tuple to update */
+
+/*****************************************************************//**
+Delete a row in a table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_delete_row(
+/*=================*/
+	ib_crsr_t	ib_crsr);	/*!< in: cursor instance */
+
+/*****************************************************************//**
+Read current row.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_read_row(
+/*===============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_tpl_t	ib_tpl,		/*!< out: read cols into this tuple */
+	void**		row_buf,	/*!< in/out: row buffer */
+	ib_ulint_t*	row_len);	/*!< in/out: row buffer len */
+
+/*****************************************************************//**
+Move cursor to the first record in the table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_first(
+/*============*/
+	ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Move cursor to the last record in the table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_last(
+/*===========*/
+	ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Move cursor to the next record in the table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_next(
+/*===========*/
+	ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Search for key.
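+
+A minimal positioned-search sketch (illustrative only; assumes an
+integer first key column and omits error handling):
+
+  ib_tpl_t key = ib_clust_search_tuple_create(crsr);
+  ib_tuple_write_u32(key, 0, 42);
+  ib_cursor_set_match_mode(crsr, IB_CLOSEST_MATCH);
+  err = ib_cursor_moveto(crsr, key, IB_CUR_GE);
+  while (err == DB_SUCCESS) {
+    ... read the current row, then advance: ...
+    err = ib_cursor_next(crsr);
+  }
+  ib_tuple_delete(key);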
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_moveto(
+/*=============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_tpl_t	ib_tpl,		/*!< in: Key to search for */
+	ib_srch_mode_t	ib_srch_mode);	/*!< in: search mode */
+
+/*****************************************************************//**
+Set the match mode for ib_cursor_moveto(). */
+
+void
+ib_cursor_set_match_mode(
+/*=====================*/
+	ib_crsr_t	ib_crsr,	/*!< in: Cursor instance */
+	ib_match_mode_t	match_mode);	/*!< in: ib_cursor_moveto match mode */
+
+/*****************************************************************//**
+Set a column of the tuple. Make a copy using the tuple's heap.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_col_set_value(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	col_no,		/*!< in: column index in tuple */
+	const void*	src,		/*!< in: data value */
+	ib_ulint_t	len,		/*!< in: data value len */
+	ib_bool_t	need_cpy);	/*!< in: if the value needs to be
+					copied (memcpy) */
+
+
+/*****************************************************************//**
+Get the size of the data available in the column of the tuple.
+@return bytes avail or IB_SQL_NULL */
+
+ib_ulint_t
+ib_col_get_len(
+/*===========*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i);		/*!< in: column index in tuple */
+
+/*****************************************************************//**
+Copy a column value from the tuple.
+@return bytes copied or IB_SQL_NULL */
+
+ib_ulint_t
+ib_col_copy_value(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: tuple instance */
+	ib_ulint_t	i,		/*!< in: column index in tuple */
+	void*		dst,		/*!< out: copied data value */
+	ib_ulint_t	len);		/*!< in: max data value len to copy */
+
+/*************************************************************//**
+Read a signed int 8 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i8(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_i8_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 8 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u8(
+/*=============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_u8_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 16 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i16(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_i16_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 16 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u16(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	i,		/*!< in: column number */
+	ib_u16_t*	ival);		/*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 32 bit column from an InnoDB tuple.
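+
+A minimal typed-read sketch (illustrative only): check for SQL NULL
+first, then use the typed reader that matches the column size:
+
+  if (ib_col_get_len(tpl, 0) == IB_SQL_NULL) {
+    ... column 0 is NULL ...
+  } else {
+    ib_i32_t v;
+    err = ib_tuple_read_i32(tpl, 0, &v);
+  }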
+@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i32_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 32 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u32_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read a signed int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i64_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u64_t* ival); /*!< out: integer value */ + +/*****************************************************************//** +Get a column value pointer from the tuple. +@return NULL or pointer to buffer */ + +const void* +ib_col_get_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i); /*!< in: column number */ + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ + +ib_ulint_t +ib_col_get_meta( +/*============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_col_meta_t* ib_col_meta); /*!< out: column meta data */ + +/*****************************************************************//** +"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple. +@return new tuple, or NULL */ + +ib_tpl_t +ib_tuple_clear( +/*============*/ + ib_tpl_t ib_tpl); /*!< in: InnoDB tuple */ + +/*****************************************************************//** +Create a new cluster key search tuple and copy the contents of the +secondary index key tuple columns that refer to the cluster index record +to the cluster key. It does a deep copy of the column data. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_tuple_get_cluster_key( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: secondary index cursor */ + ib_tpl_t* ib_dst_tpl, /*!< out,own: destination tuple */ + const ib_tpl_t ib_src_tpl); /*!< in: source tuple */ + +/*****************************************************************//** +Copy the contents of source tuple to destination tuple. The tuples +must be of the same type and belong to the same table/index. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_tuple_copy( +/*==========*/ + ib_tpl_t ib_dst_tpl, /*!< in: destination tuple */ + const ib_tpl_t ib_src_tpl); /*!< in: source tuple */ + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return tuple for current index */ + +ib_tpl_t +ib_sec_search_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. 
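+
+A minimal tuple-lifecycle sketch (illustrative only): a tuple can be
+reused across reads via ib_tuple_clear(), which may return a new
+handle, and must finally be freed with ib_tuple_delete():
+
+  ib_tpl_t tpl = ib_clust_read_tuple_create(crsr);
+  while (...) {
+    ... read a row into tpl ...
+    tpl = ib_tuple_clear(tpl);
+  }
+  ib_tuple_delete(tpl);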
+@return tuple for current index */
+
+ib_tpl_t
+ib_sec_read_tuple_create(
+/*=====================*/
+	ib_crsr_t	ib_crsr);	/*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple used for table key operations.
+@return tuple for current table */
+
+ib_tpl_t
+ib_clust_search_tuple_create(
+/*=========================*/
+	ib_crsr_t	ib_crsr);	/*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple for table row operations.
+@return tuple for current table */
+
+ib_tpl_t
+ib_clust_read_tuple_create(
+/*=======================*/
+	ib_crsr_t	ib_crsr);	/*!< in: Cursor instance */
+
+/*****************************************************************//**
+Return the number of user columns in the tuple definition.
+@return number of user columns */
+
+ib_ulint_t
+ib_tuple_get_n_user_cols(
+/*=====================*/
+	const ib_tpl_t	ib_tpl);	/*!< in: Tuple for current table */
+
+/*****************************************************************//**
+Return the number of columns in the tuple definition.
+@return number of columns */
+
+ib_ulint_t
+ib_tuple_get_n_cols(
+/*================*/
+	const ib_tpl_t	ib_tpl);	/*!< in: Tuple for current table */
+
+/*****************************************************************//**
+Destroy an InnoDB tuple. */
+
+void
+ib_tuple_delete(
+/*============*/
+	ib_tpl_t	ib_tpl);	/*!< in,own: Tuple instance to delete */
+
+/*****************************************************************//**
+Truncate a table. The cursor handle will be closed and set to NULL
+on success.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_truncate(
+/*===============*/
+	ib_crsr_t*	ib_crsr,	/*!< in/out: cursor for table
+					to truncate */
+	ib_id_u64_t*	table_id);	/*!< out: new table id */
+
+/*****************************************************************//**
+Get a table id.
+@return DB_SUCCESS if found */
+
+ib_err_t
+ib_table_get_id(
+/*============*/
+	const char*	table_name,	/*!< in: table to find */
+	ib_id_u64_t*	table_id);	/*!< out: table id if found */
+
+/*****************************************************************//**
+Get an index id.
+@return DB_SUCCESS if found */
+
+ib_err_t
+ib_index_get_id(
+/*============*/
+	const char*	table_name,	/*!< in: find index for this table */
+	const char*	index_name,	/*!< in: index to find */
+	ib_id_u64_t*	index_id);	/*!< out: index id if found */
+
+/*****************************************************************//**
+Check if cursor is positioned.
+@return IB_TRUE if positioned */
+
+ib_bool_t
+ib_cursor_is_positioned(
+/*====================*/
+	const ib_crsr_t	ib_crsr);	/*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in exclusive mode by a
+user transaction.
+@return TRUE if exclusive latch */
+
+ib_bool_t
+ib_schema_lock_is_exclusive(
+/*========================*/
+	const ib_trx_t	ib_trx);	/*!< in: transaction */
+
+/*****************************************************************//**
+Lock an InnoDB cursor/table.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_lock(
+/*===========*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_lck_mode_t	ib_lck_mode);	/*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Lock an InnoDB table using the table id.
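+
+A minimal table-locking sketch (illustrative only; "test/t1" is a
+hypothetical table name and error handling is omitted):
+
+  ib_id_u64_t table_id;
+  err = ib_table_get_id("test/t1", &table_id);
+  err = ib_table_lock(trx, table_id, IB_LOCK_IX);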
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_table_lock(
+/*===========*/
+	ib_trx_t	ib_trx,		/*!< in/out: transaction */
+	ib_id_u64_t	table_id,	/*!< in: table id */
+	ib_lck_mode_t	ib_lck_mode);	/*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Set the lock mode of the cursor.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_set_lock_mode(
+/*====================*/
+	ib_crsr_t	ib_crsr,	/*!< in/out: InnoDB cursor */
+	ib_lck_mode_t	ib_lck_mode);	/*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Set need to access clustered index record flag. */
+
+void
+ib_cursor_set_cluster_access(
+/*=========================*/
+	ib_crsr_t	ib_crsr);	/*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i8(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i8_t		val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i16(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i16_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i32(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i32_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i64(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_i64_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u8(
+/*==============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u8_t		val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u16(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u16_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column.
Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u32(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u32_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u64(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	ib_u64_t	val);		/*!< in: value to write */
+
+/*****************************************************************//**
+Inform the cursor that it's the start of an SQL statement. */
+
+void
+ib_cursor_stmt_begin(
+/*=================*/
+	ib_crsr_t	ib_crsr);	/*!< in: cursor */
+
+/*****************************************************************//**
+Write a double value to a column.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_double(
+/*==================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	int		col_no,		/*!< in: column number */
+	double		val);		/*!< in: value to write */
+
+/*************************************************************//**
+Read a double column value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_double(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	col_no,		/*!< in: column number */
+	double*		dval);		/*!< out: double value */
+
+/*****************************************************************//**
+Write a float value to a column.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_float(
+/*=================*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+	int		col_no,		/*!< in: column number */
+	float		val);		/*!< in: value to write */
+
+/*************************************************************//**
+Read a float value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_float(
+/*================*/
+	ib_tpl_t	ib_tpl,		/*!< in: InnoDB tuple */
+	ib_ulint_t	col_no,		/*!< in: column number */
+	float*		fval);		/*!< out: float value */
+
+/*****************************************************************//**
+Get a column name from the cursor.
+@return name of the column */
+
+const char*
+ib_col_get_name(
+/*============*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_ulint_t	i);		/*!< in: column index in tuple */
+
+/*****************************************************************//**
+Get an index field name from the cursor.
+@return name of the field */
+
+const char*
+ib_get_idx_field_name(
+/*==================*/
+	ib_crsr_t	ib_crsr,	/*!< in: InnoDB cursor instance */
+	ib_ulint_t	i);		/*!< in: column index in tuple */
+
+/*****************************************************************//**
+Truncate a table.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_table_truncate(
+/*==============*/
+	const char*	table_name,	/*!< in: table name */
+	ib_id_u64_t*	table_id);	/*!< out: new table id */
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return DB_SUCCESS or error number */
+
+ib_err_t
+ib_close_thd(
+/*=========*/
+	void*		thd);		/*!< in: handle to the MySQL
+					thread of the user whose resources
+					should be freed */
+
+/*****************************************************************//**
+Get the generic configuration status.
+@return configuration status */
+
+int
+ib_cfg_get_cfg();
+/*============*/
+
+/*****************************************************************//**
+Increase/decrease the memcached sync count of table to sync memcached
+DML with SQL DDLs.
+@return DB_SUCCESS or error number */
+ib_err_t
+ib_cursor_set_memcached_sync(
+/*=========================*/
+	ib_crsr_t	ib_crsr,	/*!< in: cursor */
+	ib_bool_t	flag);		/*!< in: true for increasing */
+
+/*****************************************************************//**
+Check whether the table name conforms to our requirements. Currently
+we only do a simple check for the presence of a '/'.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_table_name_check(
+/*================*/
+	const char*	name);		/*!< in: table name to check */
+
+/*****************************************************************//**
+Return the isolation level configured by "innodb_api_trx_level".
+@return trx isolation level */
+
+ib_trx_state_t
+ib_cfg_trx_level();
+/*==============*/
+
+/*****************************************************************//**
+Return the configured value for the background commit interval (in seconds).
+@return background commit interval (in seconds) */
+
+ib_ulint_t
+ib_cfg_bk_commit_interval();
+/*=======================*/
+
+/*****************************************************************//**
+Get a trx start time.
+@return trx start_time */
+
+ib_u64_t
+ib_trx_get_start_time(
+/*==================*/
+	ib_trx_t	ib_trx);	/*!< in: transaction */
+
+#endif /* api0api_h */
diff --git a/storage/xtradb/include/api0misc.h b/storage/xtradb/include/api0misc.h
new file mode 100644
index 00000000000..fcd748390d1
--- /dev/null
+++ b/storage/xtradb/include/api0misc.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/api0misc.h
+InnoDB Native API
+
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+2008 Created by Sunny Bains
+*******************************************************/
+
+#ifndef api0misc_h
+#define api0misc_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "que0que.h"
+#include "trx0trx.h"
+
+/** Whether binlog is enabled for applications using InnoDB APIs */
+extern my_bool ib_binlog_enabled;
+
+/** Whether MySQL MDL is enabled for applications using InnoDB APIs */
+extern my_bool ib_mdl_enabled;
+
+/** Whether InnoDB row lock is disabled for applications using InnoDB APIs */
+extern my_bool ib_disable_row_lock;
+
+/** Configured value for the transaction isolation level */
+extern ulong ib_trx_level_setting;
+
+/** Configured value for the background commit interval (in seconds) */
+extern ulong ib_bk_commit_interval;
+
+/********************************************************************
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running
+the query thread */
+UNIV_INTERN
+ibool
+ib_handle_errors(
+/*=============*/
+	dberr_t*	new_err,	/*!< out: possible new error
+					encountered in lock wait, or if
+					no new error, the value of
+					trx->error_state at the entry of this
+					function */
+	trx_t*		trx,		/*!< in: transaction */
+	que_thr_t*	thr,		/*!< in: query thread */
+	trx_savept_t*	savept);	/*!< in: savepoint or NULL */
+
+/*************************************************************************
+Sets a lock on a table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+ib_trx_lock_table_with_retry(
+/*=========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode);		/*!< in: lock mode */
+
+#endif /* api0misc_h */
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
new file mode 100644
index 00000000000..a3f7cee2733
--- /dev/null
+++ b/storage/xtradb/include/btr0btr.h
@@ -0,0 +1,776 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.h +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0btr_h +#define btr0btr_h + +#include "univ.i" + +#include "dict0dict.h" +#include "data0data.h" +#include "page0cur.h" +#include "mtr0mtr.h" +#include "btr0types.h" + +#ifndef UNIV_HOTBACKUP +/** Maximum record size which can be stored on a page, without using the +special big record storage structure */ +#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200) + +/** @brief Maximum depth of a B-tree in InnoDB. + +Note that this isn't a maximum as such; none of the tree operations +avoid producing trees bigger than this. It is instead a "max depth +that other code must work with", useful for e.g. fixed-size arrays +that must store some information about each level in a tree. In other +words: if a B-tree with bigger depth than this is encountered, it is +not acceptable for it to lead to mysterious memory corruption, but it +is acceptable for the program to die with a clear assert failure. */ +#define BTR_MAX_LEVELS 100 + +/** Latching modes for btr_cur_search_to_nth_level(). */ +enum btr_latch_mode { + /** Search a record on a leaf page and S-latch it. */ + BTR_SEARCH_LEAF = RW_S_LATCH, + /** (Prepare to) modify a record on a leaf page and X-latch it. */ + BTR_MODIFY_LEAF = RW_X_LATCH, + /** Obtain no latches. */ + BTR_NO_LATCHES = RW_NO_LATCH, + /** Start modifying the entire B-tree. */ + BTR_MODIFY_TREE = 33, + /** Continue modifying the entire B-tree. */ + BTR_CONT_MODIFY_TREE = 34, + /** Search the previous record. */ + BTR_SEARCH_PREV = 35, + /** Modify the previous record. */ + BTR_MODIFY_PREV = 36, + /** Weaker BTR_MODIFY_TREE that does not lock the leaf page siblings, + used for fake changes. */ + BTR_SEARCH_TREE = 37 /* BTR_MODIFY_TREE | 4 */ +}; + +/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually exclusive. */ + +/** If this is ORed to btr_latch_mode, it means that the search tuple +will be inserted to the index, at the searched position. +When the record is not in the buffer pool, try to use the insert buffer. */ +#define BTR_INSERT 512 + +/** This flag ORed to btr_latch_mode says that we do the search in query +optimization */ +#define BTR_ESTIMATE 1024 + +/** This flag ORed to BTR_INSERT says that we can ignore possible +UNIQUE definition on secondary indexes when we decide if we can use +the insert buffer to speed up inserts */ +#define BTR_IGNORE_SEC_UNIQUE 2048 + +/** Try to delete mark the record at the searched position using the +insert/delete buffer when the record is not in the buffer pool. */ +#define BTR_DELETE_MARK 4096 + +/** Try to purge the record at the searched position using the insert/delete +buffer when the record is not in the buffer pool. 
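+
+As an illustrative sketch (not from the original header), these flags
+are ORed into a base btr_latch_mode and stripped again with
+BTR_LATCH_MODE_WITHOUT_FLAGS(), defined below, e.g.:
+
+  ulint latch_mode = BTR_MODIFY_LEAF | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE;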
*/ +#define BTR_DELETE 8192 + +/** In the case of BTR_SEARCH_LEAF or BTR_MODIFY_LEAF, the caller is +already holding an S latch on the index tree */ +#define BTR_ALREADY_S_LATCHED 16384 + +#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ + ((latch_mode) & ~(BTR_INSERT \ + | BTR_DELETE_MARK \ + | BTR_DELETE \ + | BTR_ESTIMATE \ + | BTR_IGNORE_SEC_UNIQUE \ + | BTR_ALREADY_S_LATCHED)) +#endif /* UNIV_HOTBACKUP */ + +/**************************************************************//** +Report that an index page is corrupted. */ +UNIV_INTERN +void +btr_corruption_report( +/*==================*/ + const buf_block_t* block, /*!< in: corrupted block */ + const dict_index_t* index) /*!< in: index tree */ + UNIV_COLD __attribute__((nonnull)); + +/** Assert that a B-tree page is not corrupted. +@param block buffer block containing a B-tree page +@param index the B-tree index */ +#define btr_assert_not_corrupted(block, index) \ + if ((ibool) !!page_is_comp(buf_block_get_frame(block)) \ + != dict_table_is_comp((index)->table)) { \ + btr_corruption_report(block, index); \ + ut_error; \ + } + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +/** An index->blobs entry for keeping track of off-page column references */ +struct btr_blob_dbg_t +{ + unsigned blob_page_no:32; /*!< first BLOB page number */ + unsigned ref_page_no:32; /*!< referring page number */ + unsigned ref_heap_no:16; /*!< referring heap number */ + unsigned ref_field_no:10; /*!< referring field number */ + unsigned owner:1; /*!< TRUE if BLOB owner */ + unsigned always_owner:1; /*!< TRUE if always + has been the BLOB owner; + reset to TRUE on B-tree + page splits and merges */ + unsigned del:1; /*!< TRUE if currently + delete-marked */ +}; + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. */ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: number of off-page column */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull)); +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Modify the 'deleted' flag of a record. 
*/ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ + __attribute__((nonnull)); +/**************************************************************//** +Change the ownership of an off-page column. */ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ + __attribute__((nonnull)); +/** Assert that there are no BLOB references to or from the given page. */ +# define btr_blob_dbg_assert_empty(index, page_no) \ + ut_a(btr_blob_dbg_is_empty(index, page_no)) +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0) +# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0) +# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0) +# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + +/**************************************************************//** +Gets the root node of a tree and x-latches it. +@return root page, x-latched */ +UNIV_INTERN +page_t* +btr_root_get( +/*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +UNIV_INTERN +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. +@return tree height (level of the root) */ +UNIV_INTERN +ulint +btr_height_get( +/*===========*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +buf_block_t* +btr_block_get_func( +/*===============*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + ulint mode, /*!< in: latch mode */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ +# ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree, may be NULL + if it is not an insert buffer tree */ +# endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +# ifdef UNIV_SYNC_DEBUG +/** Gets a buffer page and declares its latching order level. 
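+
+A minimal call sketch (illustrative only; assumes the caller owns the
+mini-transaction mtr and the appropriate index latches):
+
+  buf_block_t* block = btr_block_get(
+          space, zip_size, page_no, RW_S_LATCH, index, mtr);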
+@param space tablespace identifier +@param zip_size compressed page size in bytes or 0 for uncompressed pages +@param page_no page number +@param mode latch mode +@param index index tree, may be NULL if not the insert buffer tree +@param mtr mini-transaction handle +@return the block descriptor */ +# define btr_block_get(space,zip_size,page_no,mode,index,mtr) \ + btr_block_get_func(space,zip_size,page_no,mode, \ + __FILE__,__LINE__,index,mtr) +# else /* UNIV_SYNC_DEBUG */ +/** Gets a buffer page and declares its latching order level. +@param space tablespace identifier +@param zip_size compressed page size in bytes or 0 for uncompressed pages +@param page_no page number +@param mode latch mode +@param idx index tree, may be NULL if not the insert buffer tree +@param mtr mini-transaction handle +@return the block descriptor */ +# define btr_block_get(space,zip_size,page_no,mode,idx,mtr) \ + btr_block_get_func(space,zip_size,page_no,mode,__FILE__,__LINE__,mtr) +# endif /* UNIV_SYNC_DEBUG */ +/** Gets a buffer page and declares its latching order level. +@param space tablespace identifier +@param zip_size compressed page size in bytes or 0 for uncompressed pages +@param page_no page number +@param mode latch mode +@param idx index tree, may be NULL if not the insert buffer tree +@param mtr mini-transaction handle +@return the uncompressed page frame */ +# define btr_page_get(space,zip_size,page_no,mode,idx,mtr) \ + buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,idx,mtr)) +#endif /* !UNIV_HOTBACKUP */ +/**************************************************************//** +Gets the index id field of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ + __attribute__((nonnull, pure, warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Gets the node level field in an index page. +@return level, leaf level == 0 */ +UNIV_INLINE +ulint +btr_page_get_level_low( +/*===================*/ + const page_t* page) /*!< in: index page */ + __attribute__((nonnull, pure, warn_unused_result)); +#define btr_page_get_level(page, mtr) btr_page_get_level_low(page) +/********************************************************//** +Gets the next index page number. +@return next page number */ +UNIV_INLINE +ulint +btr_page_get_next( +/*==============*/ + const page_t* page, /*!< in: index page */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************//** +Gets the previous index page number. +@return prev page number */ +UNIV_INLINE +ulint +btr_page_get_prev( +/*==============*/ + const page_t* page, /*!< in: index page */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Gets pointer to the previous user record in the tree. It is assumed +that the caller has appropriate latches on the page and its neighbor. +@return previous user record, NULL if there is none */ +UNIV_INTERN +rec_t* +btr_get_prev_user_rec( +/*==================*/ + rec_t* rec, /*!< in: record on leaf level */ + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if + needed, also to the previous page */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Gets pointer to the next user record in the tree. 
It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+	rec_t*	rec,	/*!< in: record on leaf level */
+	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
+			needed, also to the next page */
+	__attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+	buf_block_t*	block,		/*!< in: buffer block */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	mtr_t*		mtr)		/*!< in: mtr */
+	__attribute__((nonnull));
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+	const rec_t*	rec,	/*!< in: node pointer record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+	__attribute__((nonnull, pure, warn_unused_result));
+/************************************************************//**
+Creates the root node for a new index tree.
+@return page number of the created root, FIL_NULL if did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+	ulint		type,	/*!< in: type of the index */
+	ulint		space,	/*!< in: space where created */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	index_id_t	index_id,/*!< in: index id */
+	dict_index_t*	index,	/*!< in: index */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	__attribute__((nonnull));
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+	ulint	space,		/*!< in: space where created */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	root_page_no);	/*!< in: root page number */
+/************************************************************//**
+Frees the B-tree root page. The rest of the tree MUST already have
+been freed. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+	ulint	space,		/*!< in: space where created */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	root_page_no,	/*!< in: root page number */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
+/*************************************************************//**
+Makes the tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */ +UNIV_INTERN +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize_low( +/*====================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*************************************************************//** +Decides if the page should be split at the convergence point of +inserts converging to left. +@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_left( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec)/*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple should be first */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Decides if the page should be split at the convergence point of +inserts converging to right. 
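The IBUF_BITMAP_FREE contract stated above is easy to miss; a hedged
sketch of a compliant call site (dict_index_is_clust(), page_is_leaf()
and ibuf_reset_free_bits() are assumed from dict0dict.h, page0page.h
and ibuf0ibuf.h):

    if (btr_page_reorganize(cursor, index, mtr)) {
            buf_block_t*    block = page_cur_get_block(cursor);

            if (buf_block_get_page_zip(block)
                && !dict_index_is_clust(index)
                && page_is_leaf(buf_block_get_frame(block))) {
                    /* compressed secondary-index leaf page: correct
                    the insert buffer free bits before mtr_commit() */
                    ibuf_reset_free_bits(block);
            }
    }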
+@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_right( +/*============================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec)/*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple should be first */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. + +@return inserted record */ +UNIV_INTERN +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +UNIV_INTERN +void +btr_insert_on_non_leaf_level_func( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +# define btr_insert_on_non_leaf_level(f,i,l,t,m) \ + btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m) +#endif /* !UNIV_HOTBACKUP */ +/****************************************************************//** +Sets a record as the predefined minimum record. */ +UNIV_INTERN +void +btr_set_min_rec_mark( +/*=================*/ + rec_t* rec, /*!< in/out: record */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Deletes on the upper level the node pointer to a page. */ +UNIV_INTERN +void +btr_node_ptr_delete( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page whose node pointer is deleted */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. +@return TRUE */ +UNIV_INTERN +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the +brother reside on the same page. 
If the left brother does not satisfy these
+conditions, it looks at the right brother. If the page is the only one
+on that level, it lifts the records of the page to the father page, thus
+reducing the tree height. It is assumed that mtr holds an x-latch on the
+tree and on the page. If cursor is on the leaf level, mtr must also hold
+x-latches to the brothers, if they exist.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to merge
+				or lift; the page must not be empty:
+				when deleting records, use btr_discard_page()
+				if the page would become empty */
+	ibool		adjust,	/*!< in: TRUE if should adjust the
+				cursor position even if compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
+				the root page */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	ulint	comp,	/*!< in: nonzero=compact page format */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr)	/*!< in: mtr or NULL */
+	__attribute__((nonnull(1,2), warn_unused_result));
+/***********************************************************//**
+Parses a redo log record of reorganizing a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	bool		compressed,/*!< in: true if compressed page */
+	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+	__attribute__((nonnull(1,2,3), warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		flag,	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
+				is s-latched */
+	__attribute__((nonnull, warn_unused_result));
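A short sketch of driving btr_get_size() under the required s-latch
(mtr_s_lock() and dict_index_get_lock() are assumed from mtr0mtr.h and
dict0dict.h; BTR_N_LEAF_PAGES and BTR_TOTAL_SIZE are defined near the
end of this header):

    mtr_t   mtr;
    mtr_start(&mtr);
    mtr_s_lock(dict_index_get_lock(index), &mtr);

    ulint   n_leaf  = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
    ulint   n_total = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);

    mtr_commit(&mtr);

    if (n_total == ULINT_UNDEFINED) {
            /* the index was unavailable */
    }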
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+	dict_index_t*	index,		/*!< in: index tree */
+	ulint		hint_page_no,	/*!< in: hint of a good page */
+	byte		file_direction,	/*!< in: direction where a possible
+					page split is made */
+	ulint		level,		/*!< in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+	__attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the page must contain info on its level. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+/**************************************************************//**
+Frees a file page used in an index tree. Can also be used for BLOB
+external storage pages, because the page level 0 can be given as an
+argument. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
+	ulint		level,	/*!< in: page level */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+	dict_index_t*	index)	/*!< in: index tree */
+	__attribute__((nonnull));
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		width)	/*!< in: print this many entries from start
+				and end */
+	__attribute__((nonnull));
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+	const rec_t*		rec,		/*!< in: index record */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool			dump_on_error)	/*!< in: TRUE if the function
+						should print hex dump of record
+						and page on error */
+	__attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return TRUE if ok */ +UNIV_INTERN +bool +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or 0 */ + __attribute__((nonnull(1), warn_unused_result)); + +#define BTR_N_LEAF_PAGES 1 +#define BTR_TOTAL_SIZE 2 +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "btr0btr.ic" +#endif + +#endif diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic new file mode 100644 index 00000000000..9cc611ee450 --- /dev/null +++ b/storage/xtradb/include/btr0btr.ic @@ -0,0 +1,292 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.ic +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "page0zip.h" +#include "srv0srv.h" +#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level + (not really a hard limit). + Used in debug assertions + in btr_page_set_level and + btr_page_get_level_low */ + +/**************************************************************//** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +buf_block_t* +btr_block_get_func( +/*===============*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + ulint mode, /*!< in: latch mode */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ +#ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree, may be NULL + if it is not an insert buffer tree */ +#endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + buf_block_t* block; + + block = buf_page_get_gen(space, zip_size, page_no, mode, + NULL, BUF_GET, file, line, mtr); + + SRV_CORRUPT_TABLE_CHECK(block, ; /* do nothing */); + + if (block && mode != RW_NO_LATCH) { + + buf_block_dbg_add_level( + block, index != NULL && dict_index_is_ibuf(index) + ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + } + + return(block); +} + +/**************************************************************//** +Sets the index id field of a page. 
*/ +UNIV_INLINE +void +btr_page_set_index_id( +/*==================*/ + page_t* page, /*!< in: page to be created */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + index_id_t id, /*!< in: index id */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (page_zip) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_INDEX_ID), + 8, mtr); + } else { + mlog_write_ull(page + (PAGE_HEADER + PAGE_INDEX_ID), id, mtr); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/**************************************************************//** +Gets the index id field of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ +{ + return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Gets the node level field in an index page. +@return level, leaf level == 0 */ +UNIV_INLINE +ulint +btr_page_get_level_low( +/*===================*/ + const page_t* page) /*!< in: index page */ +{ + ulint level; + + ut_ad(page); + + level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); + + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + return(level); +} + +/********************************************************//** +Sets the node level field in an index page. */ +UNIV_INLINE +void +btr_page_set_level( +/*===============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint level, /*!< in: level, leaf level == 0 */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + if (page_zip) { + mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_LEVEL), + 2, mtr); + } else { + mlog_write_ulint(page + (PAGE_HEADER + PAGE_LEVEL), level, + MLOG_2BYTES, mtr); + } +} + +/********************************************************//** +Gets the next index page number. +@return next page number */ +UNIV_INLINE +ulint +btr_page_get_next( +/*==============*/ + const page_t* page, /*!< in: index page */ + mtr_t* mtr __attribute__((unused))) + /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX)); + + return(mach_read_from_4(page + FIL_PAGE_NEXT)); +} + +/********************************************************//** +Sets the next index page field. */ +UNIV_INLINE +void +btr_page_set_next( +/*==============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint next, /*!< in: next page number */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + if (page_zip) { + mach_write_to_4(page + FIL_PAGE_NEXT, next); + page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr); + } else { + mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr); + } +} + +/********************************************************//** +Gets the previous index page number. 
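The next/prev accessors above are what page-chain walks are built on.
A hedged sketch that counts records along one level of the tree
(page_get_n_recs() is assumed from page0page.h, first_page_no is a
placeholder, and a real walk would latch pages more carefully):

    ulint   n_recs  = 0;
    ulint   page_no = first_page_no;    /* hypothetical starting page */

    while (page_no != FIL_NULL) {
            buf_block_t*    block = btr_block_get(space, zip_size,
                                                  page_no, RW_S_LATCH,
                                                  index, mtr);
            const page_t*   page = buf_block_get_frame(block);

            n_recs += page_get_n_recs(page);
            page_no = btr_page_get_next(page, mtr);
    }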
+@return prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+	const page_t*	page,	/*!< in: index page */
+	mtr_t*		mtr __attribute__((unused))) /*!< in: mini-transaction handle */
+{
+	ut_ad(page && mtr);
+
+	return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/********************************************************//**
+Sets the previous index page field. */
+UNIV_INLINE
+void
+btr_page_set_prev(
+/*==============*/
+	page_t*		page,	/*!< in: index page */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	ulint		prev,	/*!< in: previous page number */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	ut_ad(page && mtr);
+
+	if (page_zip) {
+		mach_write_to_4(page + FIL_PAGE_PREV, prev);
+		page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
+	} else {
+		mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr);
+	}
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+	const rec_t*	rec,	/*!< in: node pointer record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	const byte*	field;
+	ulint		len;
+	ulint		page_no;
+
+	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+	/* The child address is in the last field */
+	field = rec_get_nth_field(rec, offsets,
+				  rec_offs_n_fields(offsets) - 1, &len);
+
+	ut_ad(len == 4);
+
+	page_no = mach_read_from_4(field);
+
+	if (page_no == 0) {
+		fprintf(stderr,
+			"InnoDB: a nonsensical page number 0"
+			" in a node ptr record at offset %lu\n",
+			(ulong) page_offset(rec));
+		buf_page_print(page_align(rec), 0, 0);
+		ut_ad(0);
+	}
+
+	return(page_no);
+}
+
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+	buf_block_t*	block,		/*!< in: buffer block */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+	ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY));
+
+	mtr_memo_release(mtr, block,
+			 latch_mode == BTR_SEARCH_LEAF
+			 ? MTR_MEMO_PAGE_S_FIX
+			 : MTR_MEMO_PAGE_X_FIX);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h
new file mode 100644
index 00000000000..4ed66e76fe0
--- /dev/null
+++ b/storage/xtradb/include/btr0cur.h
@@ -0,0 +1,943 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.h +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0cur_h +#define btr0cur_h + +#include "univ.i" +#include "dict0dict.h" +#include "page0cur.h" +#include "btr0types.h" + +/** Mode flags for btr_cur operations; these can be ORed */ +enum { + /** do no undo logging */ + BTR_NO_UNDO_LOG_FLAG = 1, + /** do no record lock checking */ + BTR_NO_LOCKING_FLAG = 2, + /** sys fields will be found in the update vector or inserted + entry */ + BTR_KEEP_SYS_FLAG = 4, + /** btr_cur_pessimistic_update() must keep cursor position + when moving columns to big_rec */ + BTR_KEEP_POS_FLAG = 8, + /** the caller is creating the index or wants to bypass the + index->info.online creation log */ + BTR_CREATE_FLAG = 16, + /** the caller of btr_cur_optimistic_update() or + btr_cur_update_in_place() will take care of + updating IBUF_BITMAP_FREE */ + BTR_KEEP_IBUF_BITMAP = 32 +}; + +#ifndef UNIV_HOTBACKUP +#include "que0types.h" +#include "row0types.h" +#include "ha0ha.h" + +#define BTR_CUR_ADAPT +#define BTR_CUR_HASH_ADAPT + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the page cursor component of a tree cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the buffer block on which the tree cursor is positioned. +@return pointer to buffer block */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the record pointer of a tree cursor. +@return pointer to record */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +#else /* UNIV_DEBUG */ +# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur) +# define btr_cur_get_block(cursor) ((cursor)->page_cur.block) +# define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec) +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Invalidates a tree cursor by setting record pointer to NULL. */ +UNIV_INLINE +void +btr_cur_invalidate( +/*===============*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the index of a cursor. 
+@param cursor b-tree cursor
+@return index */
+#define btr_cur_get_index(cursor) ((cursor)->index)
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+	dict_index_t*	index,	/*!< in: index */
+	rec_t*		rec,	/*!< in: record in tree */
+	buf_block_t*	block,	/*!< in: buffer block of rec */
+	btr_cur_t*	cursor);/*!< in: cursor */
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: the tree level of search */
+	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
+				tuple must be set so that it cannot get
+				compared to the node ptr page number field! */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be PAGE_CUR_LE,
+				not PAGE_CUR_GE, as the latter may end up on
+				the previous page of the record! Inserts
+				should always be made using PAGE_CUR_LE to
+				search the position! */
+	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+				at most one of BTR_INSERT, BTR_DELETE_MARK,
+				BTR_DELETE, or BTR_ESTIMATE;
+				cursor->left_block is used to store a pointer
+				to the left neighbor page, in the cases
+				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+				NOTE that if has_search_latch
+				is != 0, we may not have a latch set
+				on the cursor page, we assume
+				the caller uses the search latch
+				to protect the record! */
+	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
+				s- or x-latched, but see also above! */
+	ulint		has_search_latch,/*!< in: latch mode the caller
+				currently has on btr_search_latch:
+				RW_S_LATCH, or 0 */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side_func(
+/*============================*/
+	bool		from_left,	/*!< in: true if open to the low end,
+					false if to the high end */
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: latch mode */
+	btr_cur_t*	cursor,		/*!< in/out: cursor */
+	ulint		level,		/*!< in: level to search for
+					(0=leaf) */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
+#define btr_cur_open_at_index_side(f,i,l,c,lv,m)			\
+	btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_cur_open_at_rnd_pos(i,l,c,m)				\
+	btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameters index and thr should be
+				specified */
+	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
+				cursor stays valid */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				successful */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or
+				NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in: query thread or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction;
+				if this function returns DB_SUCCESS on
+				a leaf page of a secondary index in a
+				compressed tablespace, the caller must
+				mtr_commit(mtr) before latching
+				any further pages */
+	__attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameter thr should be
+				specified; if no undo logging is specified,
+				then the caller must have reserved enough
+				free extents in the file space so that the
+				insertion will certainly succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
+				cursor stays valid */
+	ulint**		offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				successful */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or
+				NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in: query thread or NULL */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result));
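The two insert entry points above are normally tried in order; a hedged
sketch of the usual escalation (before the pessimistic retry the caller
must reserve free extents, which is elided here):

    dberr_t err = btr_cur_optimistic_insert(flags, cursor, &offsets,
                                            &heap, entry, &rec, &big_rec,
                                            n_ext, thr, &mtr);
    if (err == DB_FAIL) {
            /* a page split is needed */
            err = btr_cur_pessimistic_insert(flags, cursor, &offsets,
                                             &heap, entry, &rec, &big_rec,
                                             n_ext, thr, &mtr);
    }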
+/*************************************************************//**
+See if there is enough space in the page modification log to log
+an update-in-place.
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if enough space;
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+UNIV_INTERN
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
+	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
+#ifdef UNIV_DEBUG
+	ulint*		offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+	ulint		length,	/*!< in: size needed */
+	bool		create,	/*!< in: true=delete-and-insert,
+				false=update-in-place */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	trx_t*		trx)	/*!< in: NULL or transaction */
+#ifdef UNIV_DEBUG
+	__attribute__((nonnull (1, 2, 3, 4, 7), warn_unused_result));
+#else
+	__attribute__((nonnull (1, 2, 3, 6), warn_unused_result));
+#endif
+
+#ifdef UNIV_DEBUG
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr,trx) \
+	btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr,trx)
+#else /* UNIV_DEBUG */
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr,trx) \
+	btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr,trx)
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+UNIV_INTERN
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
+	__attribute__((warn_unused_result, nonnull));
+/***********************************************************//**
+Writes a redo log record of updating a record in-place. */
+UNIV_INTERN
+void
+btr_cur_update_in_place_log(
+/*========================*/
+	ulint		flags,		/*!< in: flags */
+	const rec_t*	rec,		/*!< in: record */
+	dict_index_t*	index,		/*!< in: index of the record */
+	const upd_t*	update,		/*!< in: update vector */
+	trx_id_t	trx_id,		/*!< in: transaction id */
+	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
+	mtr_t*		mtr)		/*!< in: mtr */
+	__attribute__((nonnull));
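Callers typically escalate through the update entry points declared
next; a hedged sketch of reacting to their documented return codes:

    dberr_t err = btr_cur_optimistic_update(flags, cursor, &offsets,
                                            &heap, update, cmpl_info,
                                            thr, trx_id, mtr);
    if (err == DB_OVERFLOW || err == DB_UNDERFLOW
        || err == DB_ZIP_OVERFLOW) {
            /* commit this mtr, restore the cursor position under a
            tree x-latch, and retry with btr_cur_pessimistic_update() */
    }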
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page */
+UNIV_INTERN
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
+	const upd_t*	update,	/*!< in: update vector; this must also
+				contain trx id and roll ptr fields */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
+				is a secondary index, the caller must
+				mtr_commit(mtr) before latching any
+				further pages */
+	__attribute__((warn_unused_result, nonnull));
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging, locking, and rollback
+				flags */
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
+				cursor may become invalid if *big_rec == NULL
+				|| !(flags & BTR_KEEP_POS_FLAG) */
+	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
+	mem_heap_t**	offsets_heap,
+				/*!< in/out: pointer to memory heap
+				that can be emptied, or NULL */
+	mem_heap_t*	entry_heap,
+				/*!< in/out: memory heap for allocating
+				big_rec and the index tuple */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is also allowed
+				to contain trx id and roll ptr fields, but
+				the values in the update vector have no effect */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be committed
+				before latching any further pages */
+	__attribute__((warn_unused_result, nonnull));
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Sets a secondary index record delete mark to TRUE or FALSE. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +dberr_t +btr_cur_del_mark_set_sec_rec( +/*=========================*/ + ulint flags, /*!< in: locking flag */ + btr_cur_t* cursor, /*!< in: cursor */ + ibool val, /*!< in: value to set */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return TRUE if compression occurred */ +UNIV_INTERN +ibool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if compression + occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*******************************************************//** +Removes the record on which the tree cursor is positioned. It is assumed +that the mtr has an x-latch on the page where the cursor is positioned, +but no latch on the whole tree. +@return TRUE if success, i.e., the page did not become too empty */ +UNIV_INTERN +ibool +btr_cur_optimistic_delete_func( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + cursor stays valid: if deletion succeeds, + on function exit it points to the successor + of the deleted record */ +# ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +# endif /* UNIV_DEBUG */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ + __attribute__((nonnull, warn_unused_result)); +# ifdef UNIV_DEBUG +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, flags, mtr) +# else /* UNIV_DEBUG */ +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, mtr) +# endif /* UNIV_DEBUG */ +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. 
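When the pessimistic delete declared just below reports
DB_OUT_OF_FILE_SPACE, callers may wrap it in a retry loop using the
BTR_CUR_RETRY_* constants defined later in this header. A hedged sketch
(os_thread_sleep() and the RB_NORMAL rollback context are assumed from
elsewhere in InnoDB):

    dberr_t err;
    ulint   n_tries = 0;

    for (;;) {
            /* start an mtr and reposition the cursor here (elided) */
            btr_cur_pessimistic_delete(&err, FALSE, cursor, 0,
                                       RB_NORMAL, &mtr);
            if (err == DB_SUCCESS
                || ++n_tries >= BTR_CUR_RETRY_DELETE_N_TIMES) {
                    break;
            }
            os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
    }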
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+				the latter may occur because we may have
+				to update node pointers on upper levels,
+				and in the case of variable length keys
+				these may actually grow in size */
+	ibool		has_reserved_extents, /*!< in: TRUE if the
+				caller has already reserved enough free
+				extents so that he knows that the operation
+				will succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				if compression does not occur, the cursor
+				stays valid: it points to successor of
+				deleted record on function exit */
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index);	/*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a clustered
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index);	/*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a secondary
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip);/*!< in/out: compressed page, or NULL */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+@return estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple1,	/*!< in: range start, may also be empty tuple */
+	ulint		mode1,	/*!< in: search mode for range start */
+	const dtuple_t*	tuple2,	/*!< in: range end, may also be empty tuple */
+	ulint		mode2);	/*!< in: search mode for range end */
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and store the estimates in the
+array index->stat_n_non_null_key_vals. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+	dict_index_t*	index);	/*!< in: index */
+
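A one-call sketch of the range estimate declared above, counting rows
in a half-open range (range_start and range_end are placeholder tuples;
PAGE_CUR_GE gives an inclusive bound, PAGE_CUR_L an exclusive one):

    ib_int64_t  n_rows = btr_estimate_n_rows_in_range(
            index, range_start, PAGE_CUR_GE, range_end, PAGE_CUR_L);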
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in]	rec	record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+
+ulint
+btr_rec_get_externally_stored_len(
+	const rec_t*	rec,
+	const ulint*	offsets);
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of an externally stored field is allowed
+to free the field. */
+UNIV_INTERN
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
+				part will be updated, or NULL */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	__attribute__((nonnull(2,3,4,5,6)));
+
+/** Operation code for btr_store_big_rec_extern_fields(). */
+enum blob_op {
+	/** Store off-page columns for a freshly inserted record */
+	BTR_STORE_INSERT = 0,
+	/** Store off-page columns for an insert by update */
+	BTR_STORE_INSERT_UPDATE,
+	/** Store off-page columns for an update */
+	BTR_STORE_UPDATE
+};
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+	enum blob_op	op)	/*!< in: operation */
+	__attribute__((warn_unused_result));
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+	dict_index_t*	index,		/*!< in: index of rec; the index tree
+					MUST be X-latched */
+	buf_block_t*	rec_block,	/*!< in/out: block containing rec */
+	rec_t*		rec,		/*!< in/out: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index);
+					the "external storage" flags in offsets
+					will not correspond to rec when
+					this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+	mtr_t*		btr_mtr,	/*!< in: mtr containing the
+					latches to the clustered index */
+	enum blob_op	op)		/*!< in: operation code */
+	__attribute__((nonnull, warn_unused_result));
+
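A hedged sketch of the usual caller pattern: an insert that returned a
big_rec vector must store the off-page columns afterwards (rec_block,
rec and offsets come from the insert call, as with
btr_cur_optimistic_insert() above):

    if (big_rec != NULL) {
            dberr_t err = btr_store_big_rec_extern_fields(
                    index, rec_block, rec, offsets, big_rec,
                    &mtr, BTR_STORE_INSERT);

            if (err == DB_OUT_OF_FILE_SPACE) {
                    /* the record must be rolled back or its
                    BLOBs freed */
            }
    }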
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field;
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/*!< in: index of the data, the index
+					tree MUST be X-latched; if the tree
+					height is 1, then also the root page
+					must be X-latched! (this is relevant
+					in the case this function is called
+					from purge where 'data' is located on
+					an undo log page, not an index
+					page) */
+	byte*		field_ref,	/*!< in/out: field reference */
+	const rec_t*	rec,		/*!< in: record containing field_ref, for
+					page_zip_write_blob_ptr(), or NULL */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					or NULL */
+	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
+					to rec, or NULL if rec == NULL */
+	ulint		i,		/*!< in: field number of field_ref;
+					ignored if rec == NULL */
+	enum trx_rb_ctx	rb_ctx,		/*!< in: rollback context */
+	mtr_t*		local_mtr);	/*!< in: mtr containing the latch to
+					data and an X-latch to the index
+					tree */
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record must be protected by a lock or a page latch.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+	byte*		buf,	/*!< out: the field, or a prefix of it */
+	ulint		len,	/*!< in: length of buf, in bytes */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	const byte*	data,	/*!< in: 'internally' stored part of the
+				field containing also the reference to
+				the external part; must be protected by
+				a lock or a page latch */
+	ulint		local_len);/*!< in: length of data, in bytes */
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap. The
+clustered index record must be protected by a lock or a page latch.
+@return the whole field copied to heap */
+UNIV_INTERN
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+	ulint*		len,	/*!< out: length of the whole field */
+	const byte*	data,	/*!< in: 'internally' stored part of the
+				field containing also the reference to
+				the external part; must be protected by
+				a lock or a page latch */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		local_len,/*!< in: length of data */
+	mem_heap_t*	heap);	/*!< in: mem heap */
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.
+@return the field copied to heap, or NULL if the field is incomplete */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index;
+				must be protected by a lock or a page latch */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		no,	/*!< in: field number */
+	ulint*		len,	/*!< out: length of the field */
+	mem_heap_t*	heap);	/*!< in: mem heap */
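A hedged sketch of reading one externally stored column with the copy
helper above (the field number `no` and `zip_size` are placeholders):

    mem_heap_t* heap = mem_heap_create(1024);
    ulint       len;
    byte*       field = btr_rec_copy_externally_stored_field(
            rec, offsets, zip_size, no, &len, heap);

    if (field == NULL) {
            /* the column is incomplete (being written or deleted) */
    }
    /* ... use field[0..len-1], then: */
    mem_heap_free(heap);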
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector. We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+@return number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+	dtuple_t*	tuple,	/*!< in/out: data tuple */
+	const upd_t*	update,	/*!< in: update vector */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+	__attribute__((nonnull));
+/***********************************************************//**
+Sets a secondary index record's delete mark to the given value. This
+function is only used by the insert buffer merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_set_deleted_flag_for_ibuf(
+/*==============================*/
+	rec_t*		rec,		/*!< in/out: record */
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
+					corresponding to rec, or NULL
+					when the tablespace is
+					uncompressed */
+	ibool		val,		/*!< in: value to set */
+	mtr_t*		mtr);		/*!< in/out: mini-transaction */
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT	(UNIV_PAGE_SIZE / 2)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+
+struct btr_path_t{
+	ulint	nth_rec;	/*!< index of the record
+				where the page cursor stopped on
+				this level (index in alphabetical
+				order); value ULINT_UNDEFINED
+				denotes array end */
+	ulint	n_recs;		/*!< number of records on the page */
+	ulint	page_no;	/*!< no of the page containing the record */
+	ulint	page_level;	/*!< level of the page; if we later fetch
+				the page under page_no and it is at a
+				different level, then we know that the
+				tree has been reorganized */
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS	250	/*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+	BTR_CUR_HASH = 1,	/*!< successful shortcut using
+				the hash index */
+	BTR_CUR_HASH_FAIL,	/*!< failure using hash, success using
+				binary search: the misleading hash
+				reference is stored in the field
+				hash_node, and might be necessary to
+				update */
+	BTR_CUR_BINARY,		/*!< success using the binary search */
+	BTR_CUR_INSERT_TO_IBUF,	/*!< performed the intended insert to
+				the insert buffer */
+	BTR_CUR_DEL_MARK_IBUF,	/*!< performed the intended delete
+				mark in the insert/delete buffer */
+	BTR_CUR_DELETE_IBUF,	/*!< performed the intended delete in
+				the insert/delete buffer */
+	BTR_CUR_DELETE_REF	/*!< row_purge_poss_sec() failed */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_t {
+	dict_index_t*	index;		/*!< index where positioned */
+	page_cur_t	page_cur;	/*!< page cursor */
+	purge_node_t*	purge_node;	/*!< purge node, for BTR_DELETE */
+	buf_block_t*	left_block;	/*!< this field is used to store
+					a pointer to the left neighbor
+					page, in the cases
+					BTR_SEARCH_PREV and
+					BTR_MODIFY_PREV */
+	/*------------------------------*/
+	que_thr_t*	thr;		/*!< this field is only used
+					when btr_cur_search_to_nth_level
+					is called for an index entry
+					insertion: the calling query
+					thread is passed here to be
+					used in the insert buffer */
+	/*------------------------------*/
+	/** The following fields are used in
+	btr_cur_search_to_nth_level to pass information: */
+	/* @{ */
+	enum btr_cur_method	flag;	/*!< Search method used */
+	ulint		tree_height;	/*!< Tree height if the search is done
+					for a pessimistic insert or update
+					operation */
+	ulint		up_match;	/*!< If the search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record to the right of
+					the cursor record after
+					btr_cur_search_to_nth_level;
+					for the mode PAGE_CUR_GE, the matched
+					fields to the first user record AT THE
+					CURSOR or to the right of it;
+					NOTE that the up_match and low_match
+					values may exceed the correct values
+					for comparison to the adjacent user
+					record if that record is on a
+					different leaf page! (See the note in
+					row_ins_duplicate_error_in_clust.) */
+	ulint		up_bytes;	/*!< number of matched bytes to the
+					right at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		low_match;	/*!< if search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record AT THE CURSOR or
+					to the left of it after
+					btr_cur_search_to_nth_level;
+					NOT defined for PAGE_CUR_GE or any
+					other search modes; see also the NOTE
+					in up_match! */
+	ulint		low_bytes;	/*!< number of matched bytes to the
+					right at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		n_fields;	/*!< prefix length used in a hash
+					search if hash_node != NULL */
+	ulint		n_bytes;	/*!< hash prefix bytes if hash_node !=
+					NULL */
+	ulint		fold;		/*!< fold value used in the search if
+					flag is BTR_CUR_HASH */
+	/* @} */
+	btr_path_t*	path_arr;	/*!< in estimating the number of
+					rows in range, we store in this array
+					information of the path through
+					the tree */
+};
+
+/** If a pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES	100
+/** If a pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Sleep this many
+microseconds between retries. */
+#define BTR_CUR_RETRY_SLEEP_TIME	50000
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID		0	/*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO		4	/*!< page no where stored */
+#define BTR_EXTERN_OFFSET		8	/*!< offset of BLOB header
+						on that page */
+#define BTR_EXTERN_LEN			12	/*!< 8 bytes containing the
+						length of the externally
+						stored part of the BLOB.
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE	20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG		128
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row. In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG	64
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+extern ulint	btr_cur_n_non_sea;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+extern ulint	btr_cur_n_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint	btr_cur_n_non_sea_old;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint	btr_cur_n_sea_old;
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+extern uint	btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/btr0cur.ic b/storage/xtradb/include/btr0cur.ic
new file mode 100644
index 00000000000..43ee3304c0e
--- /dev/null
+++ b/storage/xtradb/include/btr0cur.ic
@@ -0,0 +1,223 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.ic
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0btr.h"
+
+#ifdef UNIV_DEBUG
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\
+if (btr_cur_limit_optimistic_insert_debug > 1\
+    && (NREC) >= (ulint)btr_cur_limit_optimistic_insert_debug) {\
+	CODE;\
+}
+#else
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
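
To make the external reference layout concrete, a sketch of decoding a 20-byte external field reference using the BTR_EXTERN_* constants above; field_ref is assumed to point at the reference, and mach_read_from_4() (from mach0data.h) reads a big-endian 32-bit integer:

	ulint	space_id  = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
	ulint	page_no	  = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
	ulint	offset	  = mach_read_from_4(field_ref + BTR_EXTERN_OFFSET);

	/* The flag bits live in the byte at the smallest address of the
	8-byte length field; the low 32 bits of the length are in the
	last 4 bytes. */
	ibool	owned	  = !(field_ref[BTR_EXTERN_LEN]
			      & BTR_EXTERN_OWNER_FLAG);
	ibool	inherited = (field_ref[BTR_EXTERN_LEN]
			     & BTR_EXTERN_INHERITED_FLAG) != 0;
	ulint	ext_len	  = mach_read_from_4(field_ref + BTR_EXTERN_LEN + 4);
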
+@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(&((btr_cur_t*) cursor)->page_cur); +} + +/*********************************************************//** +Returns the buffer block on which the tree cursor is positioned. +@return pointer to buffer block */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_cur_get_block(btr_cur_get_page_cur(cursor))); +} + +/*********************************************************//** +Returns the record pointer of a tree cursor. +@return pointer to record */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_cur_get_rec(btr_cur_get_page_cur(cursor))); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(buf_block_get_page_zip(btr_cur_get_block(cursor))); +} + +/*********************************************************//** +Invalidates a tree cursor by setting record pointer to NULL. */ +UNIV_INLINE +void +btr_cur_invalidate( +/*===============*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + page_cur_invalidate(&(cursor->page_cur)); +} + +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_align(page_cur_get_rec(&(cursor->page_cur)))); +} + +/*********************************************************//** +Positions a tree cursor at a given record. */ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor) /*!< out: cursor */ +{ + ut_ad(page_align(rec) == block->frame); + + page_cur_position(rec, block, btr_cur_get_page_cur(cursor)); + + cursor->index = index; +} + +/*********************************************************************//** +Checks if compressing an index page where a btr cursor is placed makes +sense. +@return TRUE if compression is recommended */ +UNIV_INLINE +ibool +btr_cur_compress_recommendation( +/*============================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page; + + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2, + return(FALSE)); + + if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT) + || ((btr_page_get_next(page, mtr) == FIL_NULL) + && (btr_page_get_prev(page, mtr) == FIL_NULL))) { + + /* The page fillfactor has dropped below a predefined + minimum value OR the level in the B-tree contains just + one page: we recommend compression if this is not the + root page. 
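(Here dict_index_get_page() returns the number of the index root page, so the comparison below evaluates to TRUE exactly when this page is not the root.)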
*/ + + return(dict_index_get_page(cursor->index) + != page_get_page_no(page)); + } + + return(FALSE); +} + +/*********************************************************************//** +Checks if the record on which the cursor is placed can be deleted without +making tree compression necessary (or, recommended). +@return TRUE if can be deleted without recommended compression */ +UNIV_INLINE +ibool +btr_cur_can_delete_without_compress( +/*================================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT) + || ((btr_page_get_next(page, mtr) == FIL_NULL) + && (btr_page_get_prev(page, mtr) == FIL_NULL)) + || (page_get_n_recs(page) < 2)) { + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + return(dict_index_get_page(cursor->index) + == page_get_page_no(page)); + } + + return(TRUE); +} + +/*******************************************************************//** +Determine if an operation on off-page columns is an update. +@return TRUE if op != BTR_STORE_INSERT */ +UNIV_INLINE +ibool +btr_blob_op_is_update( +/*==================*/ + enum blob_op op) /*!< in: operation */ +{ + switch (op) { + case BTR_STORE_INSERT: + return(FALSE); + case BTR_STORE_INSERT_UPDATE: + case BTR_STORE_UPDATE: + return(TRUE); + } + + ut_ad(0); + return(FALSE); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h new file mode 100644 index 00000000000..cfbaacf4de3 --- /dev/null +++ b/storage/xtradb/include/btr0pcur.h @@ -0,0 +1,548 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.h +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#ifndef btr0pcur_h +#define btr0pcur_h + +#include "univ.i" +#include "dict0dict.h" +#include "data0data.h" +#include "mtr0mtr.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0btr.h" +#include "btr0types.h" + +/* Relative positions for a stored cursor position */ +#define BTR_PCUR_ON 1 +#define BTR_PCUR_BEFORE 2 +#define BTR_PCUR_AFTER 3 +/* Note that if the tree is not empty, btr_pcur_store_position does not +use the following, but only uses the above three alternatives, where the +position is stored relative to a specific record: this makes implementation +of a scroll cursor easier */ +#define BTR_PCUR_BEFORE_FIRST_IN_TREE 4 /* in an empty tree */ +#define BTR_PCUR_AFTER_LAST_IN_TREE 5 /* in an empty tree */ + +/**************************************************************//** +Allocates memory for a persistent cursor object and initializes the cursor. +@return own: persistent cursor */ +UNIV_INTERN +btr_pcur_t* +btr_pcur_create_for_mysql(void); +/*============================*/ + +/**************************************************************//** +Resets a persistent cursor object, freeing ::old_rec_buf if it is +allocated and resetting the other members to their initial values. */ +UNIV_INTERN +void +btr_pcur_reset( +/*===========*/ + btr_pcur_t* cursor);/*!< in, out: persistent cursor */ + +/**************************************************************//** +Frees the memory for a persistent cursor object. */ +UNIV_INTERN +void +btr_pcur_free_for_mysql( +/*====================*/ + btr_pcur_t* cursor); /*!< in, own: persistent cursor */ +/**************************************************************//** +Copies the stored position of a pcur to another pcur. */ +UNIV_INTERN +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is + copied */ +/**************************************************************//** +Sets the old_rec_buf field to NULL. */ +UNIV_INLINE +void +btr_pcur_init( +/*==========*/ + btr_pcur_t* pcur); /*!< in: persistent cursor */ +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +UNIV_INLINE +void +btr_pcur_open_low( +/*==============*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level in the btree */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... 
*/
+	btr_pcur_t*	cursor,	/*!< in: memory buffer for persistent cursor */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m)				\
+	btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page of the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+				NOTE that if has_search_latch != 0 then
+				we might not acquire a latch on the cursor
+				page, but assume that the caller uses his
+				btr search latch to protect the record! */
+	btr_pcur_t*	cursor,	/*!< in: memory buffer for persistent cursor */
+	ulint		has_search_latch,/*!< in: latch mode the caller
+				currently has on btr_search_latch:
+				RW_S_LATCH, or 0 */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m)			\
+	btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m)
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+	bool		from_left,	/*!< in: true if open to the low end,
+					false if to the high end */
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: latch mode */
+	btr_pcur_t*	pcur,		/*!< in/out: cursor */
+	bool		init_pcur,	/*!< in: whether to initialize pcur */
+	ulint		level,		/*!< in: level to search for
+					(0=leaf) */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
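
A minimal usage sketch of a left-to-right scan over all user records with a persistent cursor (assumptions: the caller handles any required index locks, and mtr_start()/mtr_commit() are used as declared in mtr0mtr.h; a long scan would additionally have to store and restore the cursor position periodically, as sketched further below):

	btr_pcur_t	pcur;
	mtr_t		mtr;

	mtr_start(&mtr);

	/* Position before the first user record on the leaf level. */
	btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
				    &pcur, true, 0, &mtr);

	while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
		rec_t*	rec = btr_pcur_get_rec(&pcur);

		/* ... process rec ... */
	}

	btr_pcur_close(&pcur);
	mtr_commit(&mtr);
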
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	const btr_pcur_t* cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	const btr_pcur_t* cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition, in the case PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after the last in the tree, and in the
+latter case before the first in the tree. The latching mode must be
+BTR_SEARCH_LEAF or BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index */
+	const dtuple_t*	tuple,		/*!< in: tuple on which search done */
+	ulint		mode,		/*!< in: PAGE_CUR_L, ... */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	btr_pcur_t*	cursor,		/*!< in: memory buffer for persistent
+					cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_pcur_open_on_user_rec(i,t,md,l,c,m)				\
+	btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor,		/*!< in/out: B-tree pcur */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_pcur_open_at_rnd_pos(i,l,c,m)				\
+	btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by committing the mini-transaction right after btr_pcur_close().
+A subsequent attempt to crawl the same page in the same mtr would cause
+an assertion failure. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**************************************************************//**
+Restores the stored position of a persistent cursor, buffer-fixing the page
+and obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
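
A sketch of the canonical suspend/resume pattern built on these two functions (assuming pcur is positioned on a user record and mtr is the active mini-transaction):

	/* Remember the current position and release all latches. */
	btr_pcur_store_position(&pcur, &mtr);
	btr_pcur_commit_specify_mtr(&pcur, &mtr);

	/* ... do work that must not hold page latches ... */

	/* Resume: relatch and reposition the cursor. */
	mtr_start(&mtr);
	btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
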
+@return TRUE if the cursor position was stored when it was on a user +record and it can be restored on a user record whose ordering fields +are identical to the ones of the original user record */ +UNIV_INTERN +ibool +btr_pcur_restore_position_func( +/*===========================*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in: detached persistent cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_restore_position(l,cur,mtr) \ + btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr) +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/**************************************************************//** +Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES, +that is, the cursor becomes detached. +Function btr_pcur_store_position should be used before calling this, +if restoration of cursor is wanted later. */ +UNIV_INLINE +void +btr_pcur_commit_specify_mtr( +/*========================*/ + btr_pcur_t* pcur, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr to commit */ +/*********************************************************//** +Moves the persistent cursor to the next record in the tree. If no records are +left, the cursor stays 'after last in tree'. +@return TRUE if the cursor was not after last in tree */ +UNIV_INLINE +ibool +btr_pcur_move_to_next( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +UNIV_INTERN +ibool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the last record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_last_on_page( +/*==========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the next user record in the tree. If no user +records are left, the cursor ends up 'after last in tree'. +@return TRUE if the cursor moved forward, ending on a user record */ +UNIV_INLINE +ibool +btr_pcur_move_to_next_user_rec( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. +Releases the latch on the current page, and bufferunfixes it. +Note that there must not be modifications on the current page, +as then the x-latch can be released only in mtr_commit. 
*/ +UNIV_INTERN +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record +of the page. Releases the latch on the current page, and bufferunfixes +it. Note that to prevent a possible deadlock, the operation first +stores the position of the cursor, releases the leaf latch, acquires +necessary latches and restores the cursor position again before returning. +The alphabetical position of the cursor is guaranteed to be sensible +on return, but it may happen that the cursor is not positioned on the +last record of any page, because the structure of the tree may have +changed while the cursor had no latches. */ +UNIV_INTERN +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the + first record of the current page */ + mtr_t* mtr); /*!< in: mtr */ +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the btr cursor component of a persistent cursor. +@return pointer to btr cursor component */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/*********************************************************//** +Returns the page cursor component of a persistent cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/*********************************************************//** +Returns the page of a persistent cursor. +@return pointer to the page */ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Returns the buffer block of a persistent cursor. +@return pointer to the block */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Returns the record of a persistent cursor. +@return pointer to the record */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +#else /* UNIV_DEBUG */ +# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur) +# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur) +# define btr_pcur_get_page(cursor) ((cursor)->btr_cur.page_cur.block->frame) +# define btr_pcur_get_block(cursor) ((cursor)->btr_cur.page_cur.block) +# define btr_pcur_get_rec(cursor) ((cursor)->btr_cur.page_cur.rec) +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Checks if the persistent cursor is on a user record. */ +UNIV_INLINE +ibool +btr_pcur_is_on_user_rec( +/*====================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is after the last user record on +a page. 
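
Correspondingly, a right-to-left scan can be sketched with the functions above (same assumptions as the forward-scan sketch earlier):

	/* Position after the last user record on the leaf level. */
	btr_pcur_open_at_index_side(false, index, BTR_SEARCH_LEAF,
				    &pcur, true, 0, &mtr);

	while (btr_pcur_move_to_prev(&pcur, &mtr)) {

		if (btr_pcur_is_on_user_rec(&pcur)) {
			/* ... process btr_pcur_get_rec(&pcur) ... */
		}
	}
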
*/ +UNIV_INLINE +ibool +btr_pcur_is_after_last_on_page( +/*===========================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is before the first user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_on_page( +/*=============================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is before the first user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_in_tree( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Checks if the persistent cursor is after the last user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_in_tree( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the next record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_next_on_page( +/*==========================*/ + btr_pcur_t* cursor);/*!< in/out: persistent cursor */ +/*********************************************************//** +Moves the persistent cursor to the previous record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_prev_on_page( +/*==========================*/ + btr_pcur_t* cursor);/*!< in/out: persistent cursor */ +/*********************************************************//** +Moves the persistent cursor to the infimum record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor); /*!< in/out: persistent cursor */ + +/** Position state of persistent B-tree cursor. */ +enum pcur_pos_t { + /** The persistent cursor is not positioned. */ + BTR_PCUR_NOT_POSITIONED = 0, + /** The persistent cursor was previously positioned. + TODO: currently, the state can be BTR_PCUR_IS_POSITIONED, + though it really should be BTR_PCUR_WAS_POSITIONED, + because we have no obligation to commit the cursor with + mtr; similarly latch_mode may be out of date. This can + lead to problems if btr_pcur is not used the right way; + all current code should be ok. */ + BTR_PCUR_WAS_POSITIONED, + /** The persistent cursor is positioned by optimistic get to the same + record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON. + It may need adjustment depending on previous/current search direction + and rel_pos. */ + BTR_PCUR_IS_POSITIONED_OPTIMISTIC, + /** The persistent cursor is positioned by index search. + Or optimistic get for rel_pos == BTR_PCUR_ON. */ + BTR_PCUR_IS_POSITIONED +}; + +/* The persistent B-tree cursor structure. This is used mainly for SQL +selects, updates, and deletes. */ + +struct btr_pcur_t{ + btr_cur_t btr_cur; /*!< a B-tree cursor */ + ulint latch_mode; /*!< see TODO note below! 
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, + BTR_MODIFY_TREE, or BTR_NO_LATCHES, + depending on the latching state of + the page and tree where the cursor is + positioned; BTR_NO_LATCHES means that + the cursor is not currently positioned: + we say then that the cursor is + detached; it can be restored to + attached if the old position was + stored in old_rec */ + ulint old_stored; /*!< BTR_PCUR_OLD_STORED + or BTR_PCUR_OLD_NOT_STORED */ + rec_t* old_rec; /*!< if cursor position is stored, + contains an initial segment of the + latest record cursor was positioned + either on, before, or after */ + ulint old_n_fields; /*!< number of fields in old_rec */ + ulint rel_pos; /*!< BTR_PCUR_ON, BTR_PCUR_BEFORE, or + BTR_PCUR_AFTER, depending on whether + cursor was on, before, or after the + old_rec record */ + buf_block_t* block_when_stored;/* buffer block when the position was + stored */ + ib_uint64_t modify_clock; /*!< the modify clock value of the + buffer block when the cursor position + was stored */ + enum pcur_pos_t pos_state; /*!< btr_pcur_store_position() and + btr_pcur_restore_position() state. */ + ulint search_mode; /*!< PAGE_CUR_G, ... */ + trx_t* trx_if_known; /*!< the transaction, if we know it; + otherwise this field is not defined; + can ONLY BE USED in error prints in + fatal assertion failures! */ + /*-----------------------------*/ + /* NOTE that the following fields may possess dynamically allocated + memory which should be freed if not needed anymore! */ + + byte* old_rec_buf; /*!< NULL, or a dynamically allocated + buffer for old_rec */ + ulint buf_size; /*!< old_rec_buf size if old_rec_buf + is not NULL */ +}; + +#define BTR_PCUR_OLD_STORED 908467085 +#define BTR_PCUR_OLD_NOT_STORED 122766467 + +#ifndef UNIV_NONINL +#include "btr0pcur.ic" +#endif + +#endif diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic new file mode 100644 index 00000000000..7e355d3709d --- /dev/null +++ b/storage/xtradb/include/btr0pcur.ic @@ -0,0 +1,606 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.ic +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + + +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... 
*/ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor); + ut_ad(cursor->old_rec); + ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(cursor->rel_pos); +} + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the btr cursor component of a persistent cursor. +@return pointer to btr cursor component */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cur = &cursor->btr_cur; + return((btr_cur_t*) btr_cur); +} + +/*********************************************************//** +Returns the page cursor component of a persistent cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the page of a persistent cursor. +@return pointer to the page */ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the buffer block of a persistent cursor. +@return pointer to the block */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the record of a persistent cursor. +@return pointer to the record */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor))); +} +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Gets the up_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_GE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_up_match( +/*==================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cursor; + + ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED) + || (cursor->pos_state == BTR_PCUR_IS_POSITIONED)); + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + ut_ad(btr_cursor->up_match != ULINT_UNDEFINED); + + return(btr_cursor->up_match); +} + +/**************************************************************//** +Gets the low_match value for a pcur after a search. 
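
One typical use of low_match, sketched under the assumption that entry is a search tuple containing all unique fields of index (dict_index_get_n_unique() is from dict0dict.h): after a PAGE_CUR_LE search, a record with the same unique prefix already exists exactly when every unique field matched:

	btr_pcur_open(index, entry, PAGE_CUR_LE, BTR_SEARCH_LEAF,
		      &pcur, &mtr);

	if (btr_pcur_is_on_user_rec(&pcur)
	    && btr_pcur_get_low_match(&pcur)
	       == dict_index_get_n_unique(index)) {

		/* a record with the same unique prefix exists */
	}
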
+@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_LE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_low_match( +/*===================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cursor; + + ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED) + || (cursor->pos_state == BTR_PCUR_IS_POSITIONED)); + + btr_cursor = btr_pcur_get_btr_cur(cursor); + ut_ad(btr_cursor->low_match != ULINT_UNDEFINED); + + return(btr_cursor->low_match); +} + +/*********************************************************//** +Checks if the persistent cursor is after the last user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_on_page( +/*===========================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Checks if the persistent cursor is before the first user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_on_page( +/*=============================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Checks if the persistent cursor is on a user record. */ +UNIV_INLINE +ibool +btr_pcur_is_on_user_rec( +/*====================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_pcur_is_before_first_on_page(cursor) + || btr_pcur_is_after_last_on_page(cursor)) { + + return(FALSE); + } + + return(TRUE); +} + +/*********************************************************//** +Checks if the persistent cursor is before the first user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_in_tree( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { + + return(FALSE); + } + + return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Checks if the persistent cursor is after the last user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_in_tree( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { + + return(FALSE); + } + + return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Moves the persistent cursor to the next record on the same page. 
*/ +UNIV_INLINE +void +btr_pcur_move_to_next_on_page( +/*==========================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_move_to_next(btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_prev_on_page( +/*==========================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_move_to_prev(btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the last record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_last_on_page( +/*==========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + UT_NOT_USED(mtr); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_after_last(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the next user record in the tree. If no user +records are left, the cursor ends up 'after last in tree'. +@return TRUE if the cursor moved forward, ending on a user record */ +UNIV_INLINE +ibool +btr_pcur_move_to_next_user_rec( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +loop: + if (btr_pcur_is_after_last_on_page(cursor)) { + + if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_to_next_page(cursor, mtr); + } else { + btr_pcur_move_to_next_on_page(cursor); + } + + if (btr_pcur_is_on_user_rec(cursor)) { + + return(TRUE); + } + + goto loop; +} + +/*********************************************************//** +Moves the persistent cursor to the next record in the tree. If no records are +left, the cursor stays 'after last in tree'. +@return TRUE if the cursor was not after last in tree */ +UNIV_INLINE +ibool +btr_pcur_move_to_next( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + if (btr_pcur_is_after_last_on_page(cursor)) { + + if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_to_next_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_next_on_page(cursor); + + return(TRUE); +} + +/**************************************************************//** +Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES, +that is, the cursor becomes detached. +Function btr_pcur_store_position should be used before calling this, +if restoration of cursor is wanted later. 
*/
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr to commit */
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur)	/*!< in: persistent cursor */
+{
+	pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+	pcur->old_rec_buf = NULL;
+	pcur->old_rec = NULL;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open_low(
+/*==============*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level in the btree */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page from the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor,	/*!< in: memory buffer for persistent cursor */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	btr_cur_t*	btr_cursor;
+
+	/* Initialize the cursor */
+
+	btr_pcur_init(cursor);
+
+	cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+	cursor->search_mode = mode;
+
+	/* Search with the tree cursor */
+
+	btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+	btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
+				    btr_cursor, 0, file, line, mtr);
+	cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+	cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page of the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+				NOTE that if has_search_latch != 0 then
+				we might not acquire a latch on the cursor
+				page, but assume that the caller uses his
+				btr search latch to protect the record!
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_cur_t* btr_cursor; + + cursor->latch_mode = latch_mode; + cursor->search_mode = mode; + + /* Search with the tree cursor */ + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, + btr_cursor, has_search_latch, + file, line, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->trx_if_known = NULL; +} + +/*****************************************************************//** +Opens a persistent cursor at either end of an index. */ +UNIV_INLINE +void +btr_pcur_open_at_index_side( +/*========================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_pcur_t* pcur, /*!< in/out: cursor */ + bool init_pcur, /*!< in: whether to initialize pcur */ + ulint level, /*!< in: level to search for + (0=leaf) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L; + + if (init_pcur) { + btr_pcur_init(pcur); + } + + btr_cur_open_at_index_side(from_left, index, latch_mode, + btr_pcur_get_btr_cur(pcur), level, mtr); + pcur->pos_state = BTR_PCUR_IS_POSITIONED; + + pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; + + pcur->trx_if_known = NULL; +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INLINE +void +btr_pcur_open_at_rnd_pos_func( +/*==========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in/out: B-tree pcur */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + /* Initialize the cursor */ + + cursor->latch_mode = latch_mode; + cursor->search_mode = PAGE_CUR_G; + + btr_pcur_init(cursor); + + btr_cur_open_at_rnd_pos_func(index, latch_mode, + btr_pcur_get_btr_cur(cursor), + file, line, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->trx_if_known = NULL; +} + +/**************************************************************//** +Frees the possible memory heap of a persistent cursor and sets the latch +mode of the persistent cursor to BTR_NO_LATCHES. +WARNING: this function does not release the latch on the page where the +cursor is currently positioned. The latch is acquired by the +"move to next/previous" family of functions. Since recursive shared locks +are not allowed, you must take care (if using the cursor in S-mode) to +manually release the latch by either calling +btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr) +or by committing the mini-transaction right after btr_pcur_close(). +A subsequent attempt to crawl the same page in the same mtr would cause +an assertion failure. 
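
Given the warning above, the safe idiom is to commit the mini-transaction immediately after closing, e.g.:

	btr_pcur_close(&pcur);
	mtr_commit(&mtr);	/* releases the latch that close left behind */
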
*/ +UNIV_INLINE +void +btr_pcur_close( +/*===========*/ + btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + if (cursor->old_rec_buf != NULL) { + + mem_free(cursor->old_rec_buf); + + cursor->old_rec = NULL; + cursor->old_rec_buf = NULL; + } + + cursor->btr_cur.page_cur.rec = NULL; + cursor->btr_cur.page_cur.block = NULL; + cursor->old_rec = NULL; + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->latch_mode = BTR_NO_LATCHES; + cursor->pos_state = BTR_PCUR_NOT_POSITIONED; + + cursor->trx_if_known = NULL; +} + +/*********************************************************//** +Moves the persistent cursor to the infimum record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_before_first(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} diff --git a/storage/xtradb/include/btr0sea.h b/storage/xtradb/include/btr0sea.h new file mode 100644 index 00000000000..d40094461ff --- /dev/null +++ b/storage/xtradb/include/btr0sea.h @@ -0,0 +1,364 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.h +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0sea_h +#define btr0sea_h + +#include "univ.i" + +#include "rem0rec.h" +#include "dict0dict.h" +#include "btr0types.h" +#include "mtr0mtr.h" +#include "ha0ha.h" + +/*****************************************************************//** +Creates and initializes the adaptive search system at a database start. */ +UNIV_INTERN +void +btr_search_sys_create( +/*==================*/ + ulint hash_size); /*!< in: hash index hash table size */ +/*****************************************************************//** +Frees the adaptive search system at a database shutdown. */ +UNIV_INTERN +void +btr_search_sys_free(void); +/*=====================*/ + +/********************************************************************//** +Disable the adaptive hash search system and empty the index. */ +UNIV_INTERN +void +btr_search_disable(void); +/*====================*/ +/********************************************************************//** +Enable the adaptive hash search system. */ +UNIV_INTERN +void +btr_search_enable(void); +/*====================*/ + +/********************************************************************//** +Returns search info for an index. 
+@return search info; search mutex reserved */ +UNIV_INLINE +btr_search_t* +btr_search_get_info( +/*================*/ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull)); +/*****************************************************************//** +Creates and initializes a search info struct. +@return own: search info struct */ +UNIV_INTERN +btr_search_t* +btr_search_info_create( +/*===================*/ + mem_heap_t* heap); /*!< in: heap where created */ +/*****************************************************************//** +Returns the value of ref_count. The value is protected by +the latch of the AHI partition corresponding to this index. +@return ref_count value. */ +UNIV_INTERN +ulint +btr_search_info_get_ref_count( +/*==========================*/ + btr_search_t* info, /*!< in: search info. */ + dict_index_t* index); /*!< in: index */ +/*********************************************************************//** +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor);/*!< in: cursor which was just positioned */ +/******************************************************************//** +Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +btr_search_guess_on_hash( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + btr_search_t* info, /*!< in: index search info */ + const dtuple_t* tuple, /*!< in: logical record */ + ulint mode, /*!< in: PAGE_CUR_L, ... */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< out: tree cursor */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, RW_X_LATCH, or 0 */ + mtr_t* mtr); /*!< in: mtr */ +/********************************************************************//** +Moves or deletes hash entries for moved records. If new_page is already hashed, +then the hash index for page, if any, is dropped. If new_page is not hashed, +and page is hashed, then a new hash index is built to new_page with the same +parameters as page (this often happens when a page is split). */ +UNIV_INTERN +void +btr_search_move_or_delete_hash_entries( +/*===================================*/ + buf_block_t* new_block, /*!< in: records are copied + to this page */ + buf_block_t* block, /*!< in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index); /*!< in: record descriptor */ +/********************************************************************//** +Drops a page hash index. */ +UNIV_INTERN +void +btr_search_drop_page_hash_index( +/*============================*/ + buf_block_t* block); /*!< in: block containing index page, + s- or x-latched, or an index page + for which we know that + block->buf_fix_count == 0 */ +/********************************************************************//** +Drops a possible page hash index when a page is evicted from the buffer pool +or freed in a file segment. 
*/ +UNIV_INTERN +void +btr_search_drop_page_hash_when_freed( +/*=================================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no); /*!< in: page number */ +/********************************************************************//** +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_node_on_insert( +/*==================================*/ + btr_cur_t* cursor);/*!< in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +/********************************************************************//** +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_insert( +/*=============================*/ + btr_cur_t* cursor);/*!< in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +/********************************************************************//** +Updates the page hash index when a single record is deleted from a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_delete( +/*=============================*/ + btr_cur_t* cursor);/*!< in: cursor which was positioned on the + record to delete using btr_cur_search_..., + the record is not yet deleted */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/********************************************************************//** +Validates the search system. +@return TRUE if ok */ +UNIV_INTERN +ibool +btr_search_validate(void); +/*======================*/ +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ + +/********************************************************************//** +Returns the adaptive hash index table for a given index key. +@return the adaptive hash index table for a given index key */ +UNIV_INLINE +hash_table_t* +btr_search_get_hash_table( +/*======================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((pure,warn_unused_result)); + +/********************************************************************//** +Returns the adaptive hash index latch for a given index key. +@return the adaptive hash index latch for a given index key */ +UNIV_INLINE +prio_rw_lock_t* +btr_search_get_latch( +/*=================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((pure,warn_unused_result)); + +/*********************************************************************//** +Returns the AHI partition number corresponding to a given index ID. */ +UNIV_INLINE +ulint +btr_search_get_key( +/*===============*/ + index_id_t index_id) /*!< in: index ID */ + __attribute__((pure,warn_unused_result)); + +/*********************************************************************//** +Initializes AHI-related fields in a newly created index. */ +UNIV_INLINE +void +btr_search_index_init( +/*===============*/ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull)); + +/********************************************************************//** +Latches all adaptive hash index latches in exclusive mode. */ +UNIV_INLINE +void +btr_search_x_lock_all(void); +/*========================*/ + +/********************************************************************//** +Unlatches all adaptive hash index latches in exclusive mode. 
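
These two functions are meant to bracket global AHI maintenance; a sketch:

	/* Take every AHI partition latch in X mode, perform the global
	operation (e.g. emptying the hash when disabling the AHI), then
	release them all. */
	btr_search_x_lock_all();

	/* ... operate on all adaptive hash index partitions ... */

	btr_search_x_unlock_all();
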
*/ +UNIV_INLINE +void +btr_search_x_unlock_all(void); +/*==========================*/ + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Checks if the thread has locked all the adaptive hash index latches in the +specified mode. + +@return true if all latches are locked by the current thread, false +otherwise. */ +UNIV_INLINE +bool +btr_search_own_all( +/*===============*/ + ulint lock_type) + __attribute__((warn_unused_result)); +/********************************************************************//** +Checks if the thread owns any adaptive hash latches in either S or X mode. +@return true if the thread owns at least one latch in any mode. */ +UNIV_INLINE +bool +btr_search_own_any(void) +/*=====================*/ + __attribute__((warn_unused_result)); +#endif + +/** The search info struct in an index */ +struct btr_search_t{ + ulint ref_count; /*!< Number of blocks in this index tree + that have search index built + i.e. block->index points to this index. + Protected by btr_search_latch except + when during initialization in + btr_search_info_create(). */ + + /* @{ The following fields are not protected by any latch. + Unfortunately, this means that they must be aligned to + the machine word, i.e., they cannot be turned into bit-fields. */ + buf_block_t* root_guess;/*!< the root page frame when it was last time + fetched, or NULL */ + ulint hash_analysis; /*!< when this exceeds + BTR_SEARCH_HASH_ANALYSIS, the hash + analysis starts; this is reset if no + success noticed */ + ibool last_hash_succ; /*!< TRUE if the last search would have + succeeded, or did succeed, using the hash + index; NOTE that the value here is not exact: + it is not calculated for every search, and the + calculation itself is not always accurate! */ + ulint n_hash_potential; + /*!< number of consecutive searches + which would have succeeded, or did succeed, + using the hash index; + the range is 0 .. 
BTR_SEARCH_BUILD_LIMIT + 5 */ + /* @} */ + /*---------------------- @{ */ + ulint n_fields; /*!< recommended prefix length for hash search: + number of full fields */ + ulint n_bytes; /*!< recommended prefix: number of bytes in + an incomplete field + @see BTR_PAGE_MAX_REC_SIZE */ + ibool left_side; /*!< TRUE or FALSE, depending on whether + the leftmost record of several records with + the same prefix should be indexed in the + hash index */ + /*---------------------- @} */ +#ifdef UNIV_SEARCH_PERF_STAT + ulint n_hash_succ; /*!< number of successful hash searches thus + far */ + ulint n_hash_fail; /*!< number of failed hash searches */ + ulint n_patt_succ; /*!< number of successful pattern searches thus + far */ + ulint n_searches; /*!< number of searches */ +#endif /* UNIV_SEARCH_PERF_STAT */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */ +/** value of btr_search_t::magic_n, used in assertions */ +# define BTR_SEARCH_MAGIC_N 1112765 +#endif /* UNIV_DEBUG */ +}; + +/** The hash index system */ +struct btr_search_sys_t{ + hash_table_t** hash_tables; /*!< the array of adaptive hash index + tables, mapping dtuple_fold values to + rec_t pointers on index pages */ +}; + +/** The adaptive hash index */ +extern btr_search_sys_t* btr_search_sys; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +extern ulint btr_search_n_succ; +/** Number of failed adaptive hash index lookups */ +extern ulint btr_search_n_hash_fail; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/** After change in n_fields or n_bytes in info, this many rounds are waited +before starting the hash analysis again: this is to save CPU time when there +is no hope in building a hash index. */ +#define BTR_SEARCH_HASH_ANALYSIS 17 + +/** Limit of consecutive searches for trying a search shortcut on the search +pattern */ +#define BTR_SEARCH_ON_PATTERN_LIMIT 3 + +/** Limit of consecutive searches for trying a search shortcut using +the hash index */ +#define BTR_SEARCH_ON_HASH_LIMIT 3 + +/** We do this many searches before trying to keep the search latch +over calls from MySQL. If we notice someone waiting for the latch, we +again set this much timeout. This is to reduce contention. */ +#define BTR_SEA_TIMEOUT 10000 + +#ifndef UNIV_NONINL +#include "btr0sea.ic" +#endif + +#endif diff --git a/storage/xtradb/include/btr0sea.ic b/storage/xtradb/include/btr0sea.ic new file mode 100644 index 00000000000..3cbcff75f31 --- /dev/null +++ b/storage/xtradb/include/btr0sea.ic @@ -0,0 +1,214 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.ic +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "dict0mem.h" +#include "btr0cur.h" +#include "buf0buf.h" + +/*********************************************************************//** +Updates the search info. */ +UNIV_INTERN +void +btr_search_info_update_slow( +/*========================*/ + btr_search_t* info, /*!< in/out: search info */ + btr_cur_t* cursor);/*!< in: cursor which was just positioned */ + +/********************************************************************//** +Returns search info for an index. +@return search info; search mutex reserved */ +UNIV_INLINE +btr_search_t* +btr_search_get_info( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + return(index->search_info); +} + +/*********************************************************************//** +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor) /*!< in: cursor which was just positioned */ +{ + btr_search_t* info; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(btr_search_get_latch(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + info = btr_search_get_info(index); + + info->hash_analysis++; + + if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) { + + /* Do nothing */ + + return; + + } + + ut_ad(cursor->flag != BTR_CUR_HASH); + + btr_search_info_update_slow(info, cursor); +} + +/********************************************************************//** +Returns the adaptive hash index table for a given index key. +@return the adaptive hash index table for a given index key */ +UNIV_INLINE +hash_table_t* +btr_search_get_hash_table( +/*======================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->search_table); + + return(index->search_table); +} + +/********************************************************************//** +Returns the adaptive hash index latch for a given index key. +@return the adaptive hash index latch for a given index key */ +UNIV_INLINE +prio_rw_lock_t* +btr_search_get_latch( +/*=================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->search_latch >= btr_search_latch_arr && + index->search_latch < btr_search_latch_arr + + btr_search_index_num); + + return(index->search_latch); +} + +/*********************************************************************//** +Returns the AHI partition number corresponding to a given index ID. */ +UNIV_INLINE +ulint +btr_search_get_key( +/*===============*/ + index_id_t index_id) /*!< in: index ID */ +{ + return(index_id % btr_search_index_num); +} + +/*********************************************************************//** +Initializes AHI-related fields in a newly created index. 
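+
+For illustration, assuming btr_search_index_num == 8: an index with
+id == 123 hashes to partition btr_search_get_key(123) == 123 % 8 == 3,
+so search_latch and search_table below both point into slot 3 of the
+global arrays.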
*/ +UNIV_INLINE +void +btr_search_index_init( +/*===============*/ + dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + + index->search_latch = + &btr_search_latch_arr[btr_search_get_key(index->id)]; + index->search_table = + btr_search_sys->hash_tables[btr_search_get_key(index->id)]; +} + +/********************************************************************//** +Latches all adaptive hash index latches in exclusive mode. */ +UNIV_INLINE +void +btr_search_x_lock_all(void) +/*=======================*/ +{ + ulint i; + + for (i = 0; i < btr_search_index_num; i++) { + rw_lock_x_lock(&btr_search_latch_arr[i]); + } +} + +/********************************************************************//** +Unlatches all adaptive hash index latches in exclusive mode. */ +UNIV_INLINE +void +btr_search_x_unlock_all(void) +/*==========================*/ +{ + ulint i; + + for (i = 0; i < btr_search_index_num; i++) { + rw_lock_x_unlock(&btr_search_latch_arr[i]); + } +} + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Checks if the thread has locked all the adaptive hash index latches in the +specified mode. + +@return true if all latches are locked by the current thread, false +otherwise. */ +UNIV_INLINE +bool +btr_search_own_all( +/*===============*/ + ulint lock_type) +{ + ulint i; + + for (i = 0; i < btr_search_index_num; i++) { + if (!rw_lock_own(&btr_search_latch_arr[i], lock_type)) { + return(false); + } + } + + return(true); +} + +/********************************************************************//** +Checks if the thread owns any adaptive hash latches in either S or X mode. +@return true if the thread owns at least one latch in any mode. */ +UNIV_INLINE +bool +btr_search_own_any(void) +/*====================*/ +{ + ulint i; + + for (i = 0; i < btr_search_index_num; i++) { + if (rw_lock_own(&btr_search_latch_arr[i], RW_LOCK_SHARED) || + rw_lock_own(&btr_search_latch_arr[i], RW_LOCK_EX)) { + return(true); + } + } + + return(false); +} +#endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/xtradb/include/btr0types.h b/storage/xtradb/include/btr0types.h new file mode 100644 index 00000000000..cd0392e7951 --- /dev/null +++ b/storage/xtradb/include/btr0types.h @@ -0,0 +1,204 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0types.h +The index tree general types + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0types_h +#define btr0types_h + +#include "univ.i" + +#include "rem0types.h" +#include "page0types.h" +#include "sync0rw.h" + +/** Persistent cursor */ +struct btr_pcur_t; +/** B-tree cursor */ +struct btr_cur_t; +/** B-tree search information for the adaptive hash index */ +struct btr_search_t; + +#ifndef UNIV_HOTBACKUP + +/** @brief The array of latches protecting the adaptive search partitions + +These latches protect the +(1) hash index from the corresponding AHI partition; +(2) columns of a record to which we have a pointer in the hash index; + +but do NOT protect: + +(3) next record offset field in a record; +(4) next or previous records on the same page. + +Bear in mind (3) and (4) when using the hash indexes. +*/ + +extern prio_rw_lock_t* btr_search_latch_arr; + +#endif /* UNIV_HOTBACKUP */ + +/** Flag: has the search system been enabled? +Protected by btr_search_latch. */ +extern char btr_search_enabled; + +/** Number of adaptive hash index partitions */ +extern ulint btr_search_index_num; + +#ifdef UNIV_BLOB_DEBUG +# include "buf0types.h" +/** An index->blobs entry for keeping track of off-page column references */ +struct btr_blob_dbg_t; + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Remove from index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. 
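+
+A hypothetical call, with offsets obtained from rec_get_offsets() as
+elsewhere in the B-tree code:
+
+	btr_blob_dbg_remove_rec(rec, index, offsets, "ctx");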
+@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Operation that processes the BLOB references of an index record +@param[in] rec record on index page +@param[in/out] index the index tree of the record +@param[in] offsets rec_get_offsets(rec,index) +@param[in] ctx context (for logging) +@return number of BLOB references processed */ +typedef ulint (*btr_blob_dbg_op_f) +(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx); + +/**************************************************************//** +Count and process all references to off-page columns on a page. +@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ + __attribute__((nonnull(1,3,4,5))); +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_add(page, index, ctx) ((void) 0) +# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_remove(page, index, ctx) ((void) 0) +# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0) +# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + +/** The size of a reference to data stored on a different page. +The reference is stored at the end of the prefix of the field +in the index record. */ +#define BTR_EXTERN_FIELD_REF_SIZE 20 + +/** A BLOB field reference full of zero, for use in assertions and tests. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). 
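+
+A typical assertion sketch, where field_ref stands for a pointer to
+the 20-byte reference within a record:
+
+	ut_ad(!memcmp(field_ref, field_ref_zero,
+		      BTR_EXTERN_FIELD_REF_SIZE));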
*/ +extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]; + +#endif diff --git a/storage/xtradb/include/buf0buddy.h b/storage/xtradb/include/buf0buddy.h new file mode 100644 index 00000000000..a86fc87e3d3 --- /dev/null +++ b/storage/xtradb/include/buf0buddy.h @@ -0,0 +1,77 @@ +/***************************************************************************** + +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buddy.h +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#ifndef buf0buddy_h +#define buf0buddy_h + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "univ.i" +#include "buf0types.h" + +/**********************************************************************//** +Allocate a block. The thread calling this function must hold +buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any +block->mutex. The buf_pool->LRU_list_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. +@return allocated block, never NULL */ +UNIV_INLINE +byte* +buf_buddy_alloc( +/*============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the page resides */ + ulint size, /*!< in: compressed page size + (between UNIV_ZIP_SIZE_MIN and + UNIV_PAGE_SIZE) */ + ibool* lru) /*!< in: pointer to a variable + that will be assigned TRUE if + storage was allocated from the + LRU list and buf_pool->LRU_list_mutex + was temporarily released */ + __attribute__((malloc, nonnull)); + +/**********************************************************************//** +Deallocate a block. */ +UNIV_INLINE +void +buf_buddy_free( +/*===========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the block resides */ + void* buf, /*!< in: block to be freed, must not + be pointed to by the buffer pool */ + ulint size) /*!< in: block size, + up to UNIV_PAGE_SIZE */ + __attribute__((nonnull)); + +#ifndef UNIV_NONINL +# include "buf0buddy.ic" +#endif + +#endif /* buf0buddy_h */ diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic new file mode 100644 index 00000000000..020442016d0 --- /dev/null +++ b/storage/xtradb/include/buf0buddy.ic @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buddy.ic +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "buf0buf.h" +#include "buf0buddy.h" +#include "ut0ut.h" +#include "sync0sync.h" + +/**********************************************************************//** +Allocate a block. The thread calling this function must hold +buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any +block->mutex. The buf_pool->LRU_list_mutex may be released and reacquired. +@return allocated block, never NULL */ +UNIV_INTERN +void* +buf_buddy_alloc_low( +/*================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint i, /*!< in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + ibool* lru) /*!< in: pointer to a variable that + will be assigned TRUE if storage was + allocated from the LRU list and + buf_pool->LRU_list_mutex was + temporarily released */ + __attribute__((malloc, nonnull)); + +/**********************************************************************//** +Deallocate a block. */ +UNIV_INTERN +void +buf_buddy_free_low( +/*===============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* buf, /*!< in: block to be freed, must not be + pointed to by the buffer pool */ + ulint i) /*!< in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + __attribute__((nonnull)); + +/**********************************************************************//** +Get the index of buf_pool->zip_free[] for a given block size. +@return index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ +UNIV_INLINE +ulint +buf_buddy_get_slot( +/*===============*/ + ulint size) /*!< in: block size */ +{ + ulint i; + ulint s; + + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + + for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) { + } + + ut_ad(i <= BUF_BUDDY_SIZES); + return(i); +} + +/**********************************************************************//** +Allocate a block. The thread calling this function must hold +buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any +block->mutex. The buf_pool->LRU_list_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. 
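+
+For example, assuming BUF_BUDDY_LOW is 1024 (its usual value), a 4 KiB
+compressed page gives buf_buddy_get_slot(4096) == 2 above: the loop
+walks s = 1024, 2048, 4096 and stops at i == 2, i.e. the third
+buf_pool->zip_free[] list.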
+@return allocated block, never NULL */ +UNIV_INLINE +byte* +buf_buddy_alloc( +/*============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the page resides */ + ulint size, /*!< in: compressed page size + (between UNIV_ZIP_SIZE_MIN and + UNIV_PAGE_SIZE) */ + ibool* lru) /*!< in: pointer to a variable + that will be assigned TRUE if + storage was allocated from the + LRU list and buf_pool->LRU_list_mutex + was temporarily released */ +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + ut_ad(size <= UNIV_PAGE_SIZE); + + return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), + lru)); +} + +/**********************************************************************//** +Deallocate a block. */ +UNIV_INLINE +void +buf_buddy_free( +/*===========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the block resides */ + void* buf, /*!< in: block to be freed, must not + be pointed to by the buffer pool */ + ulint size) /*!< in: block size, + up to UNIV_PAGE_SIZE */ +{ + ut_ad(ut_is_2pow(size)); + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + ut_ad(size <= UNIV_PAGE_SIZE); + + buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); +} + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h new file mode 100644 index 00000000000..50012933448 --- /dev/null +++ b/storage/xtradb/include/buf0buf.h @@ -0,0 +1,2166 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buf.h +The database buffer pool high-level routines + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0buf_h +#define buf0buf_h + +#include "univ.i" +#include "fil0fil.h" +#include "mtr0types.h" +#include "buf0types.h" +#include "hash0hash.h" +#include "ut0byte.h" +#include "page0types.h" +#ifndef UNIV_HOTBACKUP +#include "ut0rbt.h" +#include "os0proc.h" +#include "log0log.h" + +/** @name Modes for buf_page_get_gen */ +/* @{ */ +#define BUF_GET 10 /*!< get always */ +#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ +#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make + the block young in the LRU list */ +#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but + set no latch; we have + separated this case, because + it is error-prone programming + not to set a latch, and it + should be used with care */ +#define BUF_GET_IF_IN_POOL_OR_WATCH 15 + /*!< Get the page only if it's in the + buffer pool, if not then set a watch + on the page. 
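+					A sketch of the intent: a caller
+					such as purge can probe for a page
+					without forcing a read; if the page
+					is absent, the watch lets it detect
+					later whether the page was read in
+					meanwhile.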
*/ +#define BUF_GET_POSSIBLY_FREED 16 + /*!< Like BUF_GET, but do not mind + if the file page has been freed. */ +/* @} */ +/** @name Modes for buf_page_get_known_nowait */ +/* @{ */ +#define BUF_MAKE_YOUNG 51 /*!< Move the block to the + start of the LRU list if there + is a danger that the block + would drift out of the buffer + pool*/ +#define BUF_KEEP_OLD 52 /*!< Preserve the current LRU + position of the block. */ +/* @} */ + +#define MAX_BUFFER_POOLS_BITS 6 /*!< Number of bits to representing + a buffer pool ID */ + +#define MAX_BUFFER_POOLS (1 << MAX_BUFFER_POOLS_BITS) + /*!< The maximum number of buffer + pools that can be defined */ + +#define BUF_POOL_WATCH_SIZE (srv_n_purge_threads + 1) + /*!< Maximum number of concurrent + buffer pool watches */ +#define MAX_PAGE_HASH_LOCKS 1024 /*!< The maximum number of + page_hash locks */ + +extern buf_pool_t* buf_pool_ptr; /*!< The buffer pools + of the database */ +#ifdef UNIV_DEBUG +extern ibool buf_debug_prints;/*!< If this is set TRUE, the program + prints info whenever read or flush + occurs */ +#endif /* UNIV_DEBUG */ +extern ulint srv_buf_pool_instances; +extern ulint srv_buf_pool_curr_size; +#else /* !UNIV_HOTBACKUP */ +extern buf_block_t* back_block1; /*!< first block, for --apply-log */ +extern buf_block_t* back_block2; /*!< second block, for page reorganize */ +#endif /* !UNIV_HOTBACKUP */ + +/** Magic value to use instead of checksums when they are disabled */ +#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL + +/** @brief States of a control block +@see buf_page_t + +The enumeration values must be 0..7. */ +enum buf_page_state { + BUF_BLOCK_POOL_WATCH, /*!< a sentinel for the buffer pool + watch, element of buf_pool->watch[] */ + BUF_BLOCK_ZIP_PAGE, /*!< contains a clean + compressed page */ + BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed + page that is in the + buf_pool->flush_list */ + + BUF_BLOCK_NOT_USED, /*!< is in the free list; + must be after the BUF_BLOCK_ZIP_ + constants for compressed-only pages + @see buf_block_state_valid() */ + BUF_BLOCK_READY_FOR_USE, /*!< when buf_LRU_get_free_block + returns a block, it is in this state */ + BUF_BLOCK_FILE_PAGE, /*!< contains a buffered file page */ + BUF_BLOCK_MEMORY, /*!< contains some main memory + object */ + BUF_BLOCK_REMOVE_HASH /*!< hash index should be removed + before putting to the free list */ +}; + + +/** This structure defines information we will fetch from each buffer pool. 
It +will be used to print table IO stats */ +struct buf_pool_info_t{ + /* General buffer pool info */ + ulint pool_unique_id; /*!< Buffer Pool ID */ + ulint pool_size; /*!< Buffer Pool size in pages */ + ulint pool_size_bytes; + ulint lru_len; /*!< Length of buf_pool->LRU */ + ulint old_lru_len; /*!< buf_pool->LRU_old_len */ + ulint free_list_len; /*!< Length of buf_pool->free list */ + ulint flush_list_len; /*!< Length of buf_pool->flush_list */ + ulint n_pend_unzip; /*!< buf_pool->n_pend_unzip, pages + pending decompress */ + ulint n_pend_reads; /*!< buf_pool->n_pend_reads, pages + pending read */ + ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */ + ulint n_pending_flush_single_page;/*!< Pages pending to be + flushed as part of single page + flushes issued by various user + threads */ + ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH + LIST */ + ulint n_pages_made_young; /*!< number of pages made young */ + ulint n_pages_not_made_young; /*!< number of pages not made young */ + ulint n_pages_read; /*!< buf_pool->n_pages_read */ + ulint n_pages_created; /*!< buf_pool->n_pages_created */ + ulint n_pages_written; /*!< buf_pool->n_pages_written */ + ulint n_page_gets; /*!< buf_pool->n_page_gets */ + ulint n_ra_pages_read_rnd; /*!< buf_pool->n_ra_pages_read_rnd, + number of pages readahead */ + ulint n_ra_pages_read; /*!< buf_pool->n_ra_pages_read, number + of pages readahead */ + ulint n_ra_pages_evicted; /*!< buf_pool->n_ra_pages_evicted, + number of readahead pages evicted + without access */ + ulint n_page_get_delta; /*!< num of buffer pool page gets since + last printout */ + + /* Buffer pool access stats */ + double page_made_young_rate; /*!< page made young rate in pages + per second */ + double page_not_made_young_rate;/*!< page not made young rate + in pages per second */ + double pages_read_rate; /*!< num of pages read per second */ + double pages_created_rate; /*!< num of pages create per second */ + double pages_written_rate; /*!< num of pages written per second */ + ulint page_read_delta; /*!< num of pages read since last + printout */ + ulint young_making_delta; /*!< num of pages made young since + last printout */ + ulint not_young_making_delta; /*!< num of pages not make young since + last printout */ + + /* Statistics about read ahead algorithm. */ + double pages_readahead_rnd_rate;/*!< random readahead rate in pages per + second */ + double pages_readahead_rate; /*!< readahead rate in pages per + second */ + double pages_evicted_rate; /*!< rate of readahead page evicted + without access, in pages per second */ + + /* Stats about LRU eviction */ + ulint unzip_lru_len; /*!< length of buf_pool->unzip_LRU + list */ + /* Counters for LRU policy */ + ulint io_sum; /*!< buf_LRU_stat_sum.io */ + ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO + for current interval */ + ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */ + ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num + pages decompressed in current + interval */ +}; + +/** The occupied bytes of lists in all buffer pools */ +struct buf_pools_list_size_t { + ulint LRU_bytes; /*!< LRU size in bytes */ + ulint unzip_LRU_bytes; /*!< unzip_LRU size in bytes */ + ulint flush_list_bytes; /*!< flush_list size in bytes */ +}; + +#ifndef UNIV_HOTBACKUP + +/********************************************************************//** +Creates the buffer pool. 
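+
+A startup-time sketch (srv_buf_pool_size, srv_buf_pool_populate and
+srv_buf_pool_instances are assumed to be the usual server globals):
+
+	if (buf_pool_init(srv_buf_pool_size, srv_buf_pool_populate,
+			  srv_buf_pool_instances) != DB_SUCCESS) {
+		/* out of memory: fail startup */
+	}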
+@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
+UNIV_INTERN
+dberr_t
+buf_pool_init(
+/*=========*/
+	ulint	size,		/*!< in: Size of the total pool in bytes */
+	ibool	populate,	/*!< in: Force virtual page preallocation */
+	ulint	n_instances);	/*!< in: Number of instances */
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(
+/*==========*/
+	ulint	n_instances);	/*!< in: number of instances to free */
+
+/********************************************************************//**
+Clears the adaptive hash index on all pages in the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_clear_hash_index(void);
+/*===========================*/
+
+/********************************************************************//**
+Relocate a buffer control block. Relocates the block on the LRU list
+and in buf_pool->page_hash. Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+	buf_page_t*	bpage,	/*!< in/out: control block being relocated;
+				buf_page_get_state(bpage) must be
+				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+	buf_page_t*	dpage)	/*!< in/out: destination control block */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void);
+/*========================*/
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in frames.
+@return size in pages */
+UNIV_INLINE
+ulint
+buf_pool_get_n_pages(void);
+/*=======================*/
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INTERN
+lsn_t
+buf_pool_get_oldest_modification(void);
+/*==================================*/
+
+/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function. */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+	__attribute__((malloc));
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+	buf_page_t*	bpage)	/*!< in: bpage descriptor to free. */
+	__attribute__((nonnull));
+
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INTERN
+buf_block_t*
+buf_block_alloc(
+/*============*/
+	buf_pool_t*	buf_pool);	/*!< in: buffer pool instance,
+					or NULL for round-robin selection
+					of the buffer pool */
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block);	/*!< in, own: block to be freed */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
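+
+A sketch, where dst is assumed to point at UNIV_PAGE_SIZE writable
+bytes:
+
+	buf_frame_copy(dst, buf_block_get_frame(block));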
+@return buf */ +UNIV_INLINE +byte* +buf_frame_copy( +/*===========*/ + byte* buf, /*!< in: buffer to copy to */ + const buf_frame_t* frame); /*!< in: buffer frame */ +#ifndef UNIV_HOTBACKUP +/**************************************************************//** +NOTE! The following macros should be used instead of buf_page_get_gen, +to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed +in LA! */ +#define buf_page_get(SP, ZS, OF, LA, MTR) buf_page_get_gen(\ + SP, ZS, OF, LA, NULL,\ + BUF_GET, __FILE__, __LINE__, MTR) +/**************************************************************//** +Use these macros to bufferfix a page with no latching. Remember not to +read the contents of the page unless you know it is safe. Do not modify +the contents of the page! We have separated this case, because it is +error-prone programming not to set a latch, and it should be used +with care. */ +#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\ + SP, ZS, OF, RW_NO_LATCH, NULL,\ + BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR) +/********************************************************************//** +This is the general function used to get optimistic access to a database +page. +@return TRUE if success */ +UNIV_INTERN +ibool +buf_page_optimistic_get( +/*====================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: guessed block */ + ib_uint64_t modify_clock,/*!< in: modify clock value */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ +/********************************************************************//** +This is used to get access to a known database page, when no waiting can be +done. +@return TRUE if success */ +UNIV_INTERN +ibool +buf_page_get_known_nowait( +/*======================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: the known page */ + ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/*******************************************************************//** +Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the lock_sys_t::mutex. */ +UNIV_INTERN +const buf_block_t* +buf_page_try_get_func( +/*==================*/ + ulint space_id,/*!< in: tablespace id */ + ulint page_no,/*!< in: page number */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/** Tries to get a page. If the page is not in the buffer pool it is +not loaded. Suitable for using when holding the lock_sys_t::mutex. +@param space_id in: tablespace id +@param page_no in: page number +@param mtr in: mini-transaction +@return the page if in buffer pool, NULL if not */ +#define buf_page_try_get(space_id, page_no, mtr) \ + buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr); + +/********************************************************************//** +Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with buf_page_release_zip(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. 
In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. +@return pointer to the block, or NULL if not compressed */ +UNIV_INTERN +buf_page_t* +buf_page_get_zip( +/*=============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +This is the general function used to get access to a database page. +@return pointer to the block or NULL */ +UNIV_INTERN +buf_block_t* +buf_page_get_gen( +/*=============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint offset, /*!< in: page number */ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ + buf_block_t* guess, /*!< in: guessed block or NULL */ + ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH or + BUF_GET_IF_IN_POOL_OR_WATCH */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ +/********************************************************************//** +Initializes a page to the buffer buf_pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_get_gen). +@return pointer to the block, page bufferfixed */ +UNIV_INTERN +buf_block_t* +buf_page_create( +/*============*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space in units of + a page */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +#else /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Inits a page to the buffer buf_pool, for use in mysqlbackup --restore. */ +UNIV_INTERN +void +buf_page_init_for_backup_restore( +/*=============================*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space + in units of a page */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + buf_block_t* block); /*!< in: block to init */ +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Releases a compressed-only page acquired with buf_page_get_zip(). */ +UNIV_INLINE +void +buf_page_release_zip( +/*=================*/ + buf_page_t* bpage); /*!< in: buffer block */ +/********************************************************************//** +Decrements the bufferfix count of a buffer control block and releases +a latch, if specified. */ +UNIV_INLINE +void +buf_page_release( +/*=============*/ + buf_block_t* block, /*!< in: buffer block */ + ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH, + RW_NO_LATCH */ +/********************************************************************//** +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from slipping out of +the buffer pool. */ +UNIV_INTERN +void +buf_page_make_young( +/*================*/ + buf_page_t* bpage); /*!< in: buffer block of a file page */ +/********************************************************************//** +Returns TRUE if the page can be found in the buffer pool hash table. 
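+
+A probe sketch:
+
+	if (buf_page_peek(space, offset)) {
+		/* the page has a buffer pool entry */
+	}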
+ +NOTE that it is possible that the page is not yet read from disk, +though. + +@return TRUE if found in the page hash table */ +UNIV_INLINE +ibool +buf_page_peek( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: page number */ +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG +/********************************************************************//** +Sets file_page_was_freed TRUE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. +@return control block if found in page hash table, otherwise NULL */ +UNIV_INTERN +buf_page_t* +buf_page_set_file_page_was_freed( +/*=============================*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +Sets file_page_was_freed FALSE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. +@return control block if found in page hash table, otherwise NULL */ +UNIV_INTERN +buf_page_t* +buf_page_reset_file_page_was_freed( +/*===============================*/ + ulint space, /*!< in: space id */ + ulint offset); /*!< in: page number */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +ulint +buf_page_get_freed_page_clock( +/*==========================*/ + const buf_page_t* bpage) /*!< in: block */ + __attribute__((pure)); +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +ulint +buf_block_get_freed_page_clock( +/*===========================*/ + const buf_block_t* block) /*!< in: block */ + __attribute__((pure)); + +/********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage); /*!< in: block */ +/********************************************************************//** +Recommends a move of a block to the start of the LRU list if there is danger +of dropping from the buffer pool. NOTE: does not reserve the buffer pool +mutex. +@return TRUE if should be made younger */ +UNIV_INLINE +ibool +buf_page_peek_if_too_old( +/*=====================*/ + const buf_page_t* bpage); /*!< in: block to make younger */ +/********************************************************************//** +Gets the youngest modification log sequence number for a frame. +Returns zero if not file page or no modification occurred yet. +@return newest modification to page */ +UNIV_INLINE +lsn_t +buf_page_get_newest_modification( +/*=============================*/ + const buf_page_t* bpage); /*!< in: block containing the + page frame */ +/********************************************************************//** +Increments the modify clock of a frame by 1. 
The caller must (1) own the
+LRU list mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+# ifdef UNIV_SYNC_DEBUG
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line */
+# endif /* UNIV_SYNC_DEBUG */
+	buf_block_t*	block)	/*!< in/out: block to bufferfix */
+	__attribute__((nonnull));
+
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_fix(
+/*===========*/
+	buf_block_t*	block);	/*!< in/out: block to bufferfix */
+
+/*******************************************************************//**
+Decrements the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_unfix(
+/*===========*/
+	buf_block_t*	block);	/*!< in/out: block to unfix */
+
+# ifdef UNIV_SYNC_DEBUG
+/** Increments the bufferfix count.
+@param b	in/out: block to bufferfix
+@param f	in: file name where requested
+@param l	in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
+# else /* UNIV_SYNC_DEBUG */
+/** Increments the bufferfix count.
+@param b	in/out: block to bufferfix
+@param f	in: file name where requested
+@param l	in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
+# endif /* UNIV_SYNC_DEBUG */
+#else /* !UNIV_HOTBACKUP */
+# define buf_block_modify_clock_inc(block) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Checks if a page is corrupt.
+@return TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+	bool		check_lsn,	/*!< in: true if we need to check
+					and complain about the LSN */
+	const byte*	read_buf,	/*!< in: a database page */
+	ulint		zip_size)	/*!< in: size of compressed page;
+					0 for uncompressed pages */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Checks if a page is all zeroes.
+@return TRUE if the page is all zeroes */
+bool
+buf_page_is_zeroes(
+/*===============*/
+	const byte*	read_buf,	/*!< in: a database page */
+	const ulint	zip_size);	/*!< in: size of compressed page;
+					0 for uncompressed pages */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+	const void*	ptr,	/*!< in: pointer to a buffer frame */
+	ulint*		space,	/*!< out: space id */
+	fil_addr_t*	addr);	/*!< out: page offset and byte offset */
+/**********************************************************************//**
+Gets the hash value of a block. This can be used in searches in the
+lock hash table.
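+
+(Sketch of intended use: the lock system can feed this value to
+hash_calc_hash() to locate the bucket of record locks for the block's
+page; the exact call site is assumed.)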
+@return lock hash value */ +UNIV_INLINE +ulint +buf_block_get_lock_hash_val( +/*========================*/ + const buf_block_t* block) /*!< in: block */ + __attribute__((pure)); +#ifdef UNIV_DEBUG +/*********************************************************************//** +Finds a block in the buffer pool that points to a +given compressed page. +@return buffer block pointing to the compressed page, or NULL */ +UNIV_INTERN +buf_block_t* +buf_pool_contains_zip( +/*==================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const void* data); /*!< in: pointer to compressed page */ +#endif /* UNIV_DEBUG */ + +/*********************************************************************** +FIXME_FTS: Gets the frame the pointer is pointing to. */ +UNIV_INLINE +buf_frame_t* +buf_frame_align( +/*============*/ + /* out: pointer to frame */ + byte* ptr); /* in: pointer to a frame */ + + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/*********************************************************************//** +Validates the buffer pool data structure. +@return TRUE */ +UNIV_INTERN +ibool +buf_validate(void); +/*==============*/ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/*********************************************************************//** +Prints info of the buffer pool data structure. */ +UNIV_INTERN +void +buf_print(void); +/*============*/ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ +enum buf_page_print_flags { + /** Do not crash at the end of buf_page_print(). */ + BUF_PAGE_PRINT_NO_CRASH = 1, + /** Do not print the full page dump. */ + BUF_PAGE_PRINT_NO_FULL = 2 +}; + +/********************************************************************//** +Prints a page to stderr. */ +UNIV_INTERN +void +buf_page_print( +/*===========*/ + const byte* read_buf, /*!< in: a database page */ + ulint zip_size, /*!< in: compressed page size, or + 0 for uncompressed pages */ + ulint flags); /*!< in: 0 or + BUF_PAGE_PRINT_NO_CRASH or + BUF_PAGE_PRINT_NO_FULL */ + +/********************************************************************//** +Decompress a block. +@return TRUE if successful */ +UNIV_INTERN +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check); /*!< in: TRUE=verify the page checksum */ +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the number of latched pages in the buffer pool. +@return number of latched pages */ +UNIV_INTERN +ulint +buf_get_latched_pages_number(void); +/*==============================*/ +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Returns the number of pending buf pool read ios. +@return number of pending read I/O operations */ +UNIV_INTERN +ulint +buf_get_n_pending_read_ios(void); +/*============================*/ +/*********************************************************************//** +Prints info of the buffer i/o. */ +UNIV_INTERN +void +buf_print_io( +/*=========*/ + FILE* file); /*!< in: file where to print */ +/*******************************************************************//** +Collect buffer pool stats information for a buffer pool. 
Also
+record aggregated stats if there is more than one buffer pool
+in the server */
+UNIV_INTERN
+void
+buf_stats_get_pool_info(
+/*====================*/
+	buf_pool_t*		buf_pool,	/*!< in: buffer pool */
+	ulint			pool_id,	/*!< in: buffer pool ID */
+	buf_pool_info_t*	all_pool_info);	/*!< in/out: buffer pool info
+						to fill */
+/*********************************************************************//**
+Returns the ratio in percents of modified pages in the buffer pool /
+database pages in the buffer pool.
+@return modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void);
+/*============================*/
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(
+/*=================*/
+	buf_pool_t*	buf_pool);	/*!< buffer pool instance */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats_all(void);
+/*==========================*/
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_all_freed(void);
+/*===============*/
+/*********************************************************************//**
+Checks that there currently are no pending i/o-operations for the buffer
+pool.
+@return number of pending i/o operations */
+UNIV_INTERN
+ulint
+buf_pool_check_no_pending_io(void);
+/*==============================*/
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void);
+/*=====================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+	buf_block_t*	block,	/*!< in: buffer page
+				where we have acquired latch */
+	ulint		level);	/*!< in: latching order level */
+#else /* UNIV_SYNC_DEBUG */
+# define buf_block_dbg_add_level(block, level) /* nothing */
+#endif /* UNIV_SYNC_DEBUG */
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+	const buf_page_t*	bpage);	/*!< in: pointer to the control block */
+/*********************************************************************//**
+Gets the state of a block.
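+
+A dispatch sketch over the states defined earlier in this file:
+
+	switch (buf_block_get_state(block)) {
+	case BUF_BLOCK_FILE_PAGE:
+		/* block holds a file page */
+		break;
+	case BUF_BLOCK_MEMORY:
+		/* block holds a scratch buffer */
+		break;
+	default:
+		break;
+	}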
+@return state */ +UNIV_INLINE +enum buf_page_state +buf_block_get_state( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Sets the state of a block. */ +UNIV_INLINE +void +buf_page_set_state( +/*===============*/ + buf_page_t* bpage, /*!< in/out: pointer to control block */ + enum buf_page_state state); /*!< in: state */ +/*********************************************************************//** +Sets the state of a block. */ +UNIV_INLINE +void +buf_block_set_state( +/*================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + enum buf_page_state state); /*!< in: state */ +/*********************************************************************//** +Determines if a block is mapped to a tablespace. +@return TRUE if mapped */ +UNIV_INLINE +ibool +buf_page_in_file( +/*=============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((pure)); +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Determines if a block should be on unzip_LRU list. +@return TRUE if block belongs to unzip_LRU */ +UNIV_INLINE +ibool +buf_page_belongs_to_unzip_LRU( +/*==========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((pure)); + +/*********************************************************************//** +Gets the mutex of a block. +@return pointer to mutex protecting bpage */ +UNIV_INLINE +ib_mutex_t* +buf_page_get_mutex( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((pure)); + +/*********************************************************************//** +Get the flush type of a page. +@return flush type */ +UNIV_INLINE +buf_flush_t +buf_page_get_flush_type( +/*====================*/ + const buf_page_t* bpage) /*!< in: buffer page */ + __attribute__((pure)); +/*********************************************************************//** +Set the flush type of a page. */ +UNIV_INLINE +void +buf_page_set_flush_type( +/*====================*/ + buf_page_t* bpage, /*!< in: buffer page */ + buf_flush_t flush_type); /*!< in: flush type */ +/*********************************************************************//** +Map a block to a file page. */ +UNIV_INLINE +void +buf_block_set_file_page( +/*====================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + ulint space, /*!< in: tablespace id */ + ulint page_no);/*!< in: page number */ +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix( +/*================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the io_fix state of a block. Does not assert that the +buf_page_get_mutex() mutex is held, to be used in the cases where it is safe +not to hold it. 
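+
+Because no mutex is held, the returned value may be stale by the time the
+caller inspects it; it is only suitable for heuristics or for states the
+caller knows cannot change concurrently. An illustrative sketch:
+@code
+	// heuristic check only: the state may change at any moment
+	if (buf_page_get_io_fix_unlocked(bpage) == BUF_IO_NONE) {
+		// no i/o appeared to be pending at the time of the read
+	}
+@endcode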
+@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix_unlocked( +/*=========================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_page_set_io_fix( +/*================*/ + buf_page_t* bpage, /*!< in/out: control block */ + enum buf_io_fix io_fix);/*!< in: io_fix state */ +/*********************************************************************//** +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_block_set_io_fix( +/*=================*/ + buf_block_t* block, /*!< in/out: control block */ + enum buf_io_fix io_fix);/*!< in: io_fix state */ +/*********************************************************************//** +Makes a block sticky. A sticky block implies that even after we release +the buf_pool->LRU_list_mutex and the block->mutex: +* it cannot be removed from the flush_list +* the block descriptor cannot be relocated +* it cannot be removed from the LRU list +Note that: +* the block can still change its position in the LRU list +* the next and previous pointers can change. */ +UNIV_INLINE +void +buf_page_set_sticky( +/*================*/ + buf_page_t* bpage); /*!< in/out: control block */ +/*********************************************************************//** +Removes stickiness of a block. */ +UNIV_INLINE +void +buf_page_unset_sticky( +/*==================*/ + buf_page_t* bpage); /*!< in/out: control block */ +/********************************************************************//** +Determine if a buffer block can be relocated in memory. The block +can be dirty, but it must not be I/O-fixed or bufferfixed. */ +UNIV_INLINE +ibool +buf_page_can_relocate( +/*==================*/ + const buf_page_t* bpage) /*!< control block being relocated */ + __attribute__((pure)); + +/*********************************************************************//** +Determine if a block has been flagged old. +@return TRUE if old */ +UNIV_INLINE +ibool +buf_page_is_old( +/*============*/ + const buf_page_t* bpage) /*!< in: control block */ + __attribute__((pure)); +/*********************************************************************//** +Flag a block old. */ +UNIV_INLINE +void +buf_page_set_old( +/*=============*/ + buf_page_t* bpage, /*!< in/out: control block */ + ibool old); /*!< in: old */ +/*********************************************************************//** +Determine the time of first access of a block in the buffer pool. +@return ut_time_ms() at the time of first access, 0 if not accessed */ +UNIV_INLINE +unsigned +buf_page_is_accessed( +/*=================*/ + const buf_page_t* bpage) /*!< in: control block */ + __attribute__((nonnull, pure)); +/*********************************************************************//** +Flag a block accessed. */ +UNIV_INLINE +void +buf_page_set_accessed( +/*==================*/ + buf_page_t* bpage) /*!< in/out: control block */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets the buf_block_t handle of a buffered file block if an uncompressed +page frame exists, or NULL. Note: even though bpage is not declared a +const we don't update its value. It is safe to make this pure. 
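+
+An illustrative sketch (assumes the caller holds a latch that pins bpage,
+as asserted in the definition below):
+@code
+	buf_block_t*	block = buf_page_get_block(bpage);
+
+	if (block != NULL) {
+		// uncompressed frame available: block->frame is usable
+	} else {
+		// compressed-only page: only bpage->zip.data exists
+	}
+@endcode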
+@return control block, or NULL */ +UNIV_INLINE +buf_block_t* +buf_page_get_block( +/*===============*/ + buf_page_t* bpage) /*!< in: control block, or NULL */ + __attribute__((pure)); +#endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets a pointer to the memory frame of a block. +@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +#else /* UNIV_DEBUG */ +# define buf_block_get_frame(block) (block ? (block)->frame : 0) +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Gets the space id of a block. +@return space id */ +UNIV_INLINE +ulint +buf_page_get_space( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the space id of a block. +@return space id */ +UNIV_INLINE +ulint +buf_block_get_space( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_page_get_page_no( +/*=================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_block_get_page_no( +/*==================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_page_get_zip_size( +/*==================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_block_get_zip_size( +/*===================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. */ +#define buf_block_get_page_zip(block) \ + ((block)->page.zip.data ? &(block)->page.zip : NULL) +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Gets the block to whose frame the pointer is pointing to. +@return pointer to block, never NULL */ +UNIV_INTERN +buf_block_t* +buf_block_align( +/*============*/ + const byte* ptr); /*!< in: pointer to a frame */ +/********************************************************************//** +Find out if a pointer belongs to a buf_block_t. It can be a pointer to +the buf_block_t itself or a member of it +@return TRUE if ptr belongs to a buf_block_t struct */ +UNIV_INTERN +ibool +buf_pointer_is_block_field( +/*=======================*/ + const void* ptr); /*!< in: pointer not + dereferenced */ +/** Find out if a pointer corresponds to a buf_block_t::mutex. 
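+
+This is primarily a sync-debug aid: given an arbitrary latch address it
+answers whether that latch is embedded in some buf_block_t. A sketch with
+a hypothetical helper name:
+@code
+	void
+	latch_owner_hint(const ib_mutex_t* m)	// hypothetical
+	{
+		if (buf_pool_is_block_mutex(m)) {
+			// m is the mutex of some buffer block
+		}
+	}
+@endcode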
+@param m in: mutex candidate +@return TRUE if m is a buf_block_t::mutex */ +#define buf_pool_is_block_mutex(m) \ + buf_pointer_is_block_field((const void*)(m)) +/** Find out if a pointer corresponds to a buf_block_t::lock. +@param l in: rw-lock candidate +@return TRUE if l is a buf_block_t::lock */ +#define buf_pool_is_block_lock(l) \ + buf_pointer_is_block_field((const void*)(l)) + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. +@return compressed page descriptor, or NULL */ +UNIV_INLINE +const page_zip_des_t* +buf_frame_get_page_zip( +/*===================*/ + const byte* ptr); /*!< in: pointer to the page */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +/********************************************************************//** +Function which inits a page for read to the buffer buf_pool. If the page is +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. +@return pointer to the block or NULL */ +UNIV_INTERN +buf_page_t* +buf_page_init_for_read( +/*===================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + ibool unzip, /*!< in: TRUE=request uncompressed page */ + ib_int64_t tablespace_version,/*!< in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +Completes an asynchronous read or write request of a file page to or from +the buffer pool. +@return true if successful */ +UNIV_INTERN +bool +buf_page_io_complete( +/*=================*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +/********************************************************************//** +Calculates a folded value of a file page address to use in the page hash +table. +@return the folded value */ +UNIV_INLINE +ulint +buf_page_address_fold( +/*==================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page within space */ + __attribute__((const)); +/********************************************************************//** +Calculates the index of a buffer pool to the buf_pool[] array. 
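+
+Background for the mapping: with multiple instances a page is assigned to
+a pool by hashing its address; buf_pool_get() below is expected to reduce
+the fold of (space, offset) modulo srv_buf_pool_instances, making this
+function the inverse lookup from a descriptor back to its array slot. A
+sketch of that assumed mapping:
+@code
+	ulint	fold  = buf_page_address_fold(space, offset);
+	ulint	index = fold % srv_buf_pool_instances;
+@endcode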
+@return the position of the buffer pool in buf_pool[] */ +UNIV_INLINE +ulint +buf_pool_index( +/*===========*/ + const buf_pool_t* buf_pool) /*!< in: buffer pool */ + __attribute__((nonnull, const)); +/******************************************************************//** +Returns the buffer pool instance given a page instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_bpage( +/*================*/ + const buf_page_t* bpage); /*!< in: buffer pool page */ +/******************************************************************//** +Returns the buffer pool instance given a block instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_block( +/*================*/ + const buf_block_t* block); /*!< in: block */ +/******************************************************************//** +Returns the buffer pool instance given space and offset of page +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_get( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: offset of the page within space */ +/******************************************************************//** +Returns the buffer pool instance given its array index +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_array( +/*================*/ + ulint index); /*!< in: array index to get + buffer pool instance from */ +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +@return block, NULL if not found */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_low( +/*==================*/ + buf_pool_t* buf_pool,/*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space */ + ulint fold); /*!< in: buf_page_address_fold(space, offset) */ +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + prio_rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode); /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found */ +UNIV_INLINE +buf_block_t* +buf_block_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. 
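+					A usage sketch (illustrative; the
+					wrapper macro is defined just below
+					and the unlock call shown is an
+					assumption):
+@code
+	prio_rw_lock_t*	hash_lock;
+	buf_page_t*	bpage;
+
+	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+					   &hash_lock);
+	if (bpage != NULL) {
+		// bpage stable while hash_lock is held in S mode
+		rw_lock_s_unlock(hash_lock);
+	}
+@endcode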
*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + prio_rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode); /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ +/* There are four different ways we can try to get a bpage or block +from the page hash: +1) Caller already holds the appropriate page hash lock: in the case call +buf_page_hash_get_low() function. +2) Caller wants to hold page hash lock in x-mode +3) Caller wants to hold page hash lock in s-mode +4) Caller doesn't want to hold page hash lock */ +#define buf_page_hash_get_s_locked(b, s, o, l) \ + buf_page_hash_get_locked(b, s, o, l, RW_LOCK_SHARED) +#define buf_page_hash_get_x_locked(b, s, o, l) \ + buf_page_hash_get_locked(b, s, o, l, RW_LOCK_EX) +#define buf_page_hash_get(b, s, o) \ + buf_page_hash_get_locked(b, s, o, NULL, 0) + +#define buf_block_hash_get_s_locked(b, s, o, l) \ + buf_block_hash_get_locked(b, s, o, l, RW_LOCK_SHARED) +#define buf_block_hash_get_x_locked(b, s, o, l) \ + buf_block_hash_get_locked(b, s, o, l, RW_LOCK_EX) +#define buf_block_hash_get(b, s, o) \ + buf_block_hash_get_locked(b, s, o, NULL, 0) + +/*********************************************************************//** +Gets the current length of the free list of buffer blocks. +@return length of the free list */ +UNIV_INTERN +ulint +buf_get_free_list_len(void); +/*=======================*/ + +/********************************************************************//** +Determine if a block is a sentinel for a buffer pool watch. +@return TRUE if a sentinel for a buffer pool watch, FALSE if not */ +UNIV_INTERN +ibool +buf_pool_watch_is_sentinel( +/*=======================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + const buf_page_t* bpage) /*!< in: block */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Add watch for the given page to be read in. Caller must have +appropriate hash_lock for the bpage and hold the LRU list mutex to avoid a race +condition with buf_LRU_free_page inserting the same page into the page hash. +This function may release the hash_lock and reacquire it. +@return NULL if watch set, block if the page is in the buffer pool */ +UNIV_INTERN +buf_page_t* +buf_pool_watch_set( +/*===============*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + ulint fold) /*!< in: buf_page_address_fold(space, offset) */ + __attribute__((warn_unused_result)); +/****************************************************************//** +Stop watching if the page has been read in. +buf_pool_watch_set(space,offset) must have returned NULL before. */ +UNIV_INTERN +void +buf_pool_watch_unset( +/*=================*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: page number */ +/****************************************************************//** +Check if the page has been read in. +This may only be called after buf_pool_watch_set(space,offset) +has returned NULL and before invoking buf_pool_watch_unset(space,offset). 
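+
+Sketch of the full watch protocol (illustrative; the locking
+preconditions documented at each function are omitted here):
+@code
+	ulint		fold  = buf_page_address_fold(space, offset);
+	buf_page_t*	bpage = buf_pool_watch_set(space, offset, fold);
+
+	if (bpage == NULL) {
+		// watch armed; do other work, then:
+		if (buf_pool_watch_occurred(space, offset)) {
+			// the page was read in meanwhile
+		}
+		buf_pool_watch_unset(space, offset);
+	}
+@endcode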
+@return FALSE if the given page was not read in, TRUE if it was */ +UNIV_INTERN +ibool +buf_pool_watch_occurred( +/*====================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: page number */ + __attribute__((warn_unused_result)); +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_list_len( +/*===================*/ + ulint* LRU_len, /*!< out: length of all LRU lists */ + ulint* free_len, /*!< out: length of all free lists */ + ulint* flush_list_len);/*!< out: length of all flush lists */ +/********************************************************************//** +Get total list size in bytes from all buffer pools. */ +UNIV_INTERN +void +buf_get_total_list_size_in_bytes( +/*=============================*/ + buf_pools_list_size_t* buf_pools_list_size); /*!< out: list sizes + in all buffer pools */ +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_stat( +/*===============*/ + buf_pool_stat_t*tot_stat); /*!< out: buffer pool stats */ +/*********************************************************************//** +Get the nth chunk's buffer block in the specified buffer pool. +@return the nth chunk's buffer block. */ +UNIV_INLINE +buf_block_t* +buf_get_nth_chunk_block( +/*====================*/ + const buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint n, /*!< in: nth chunk in the buffer pool */ + ulint* chunk_size); /*!< in: chunk size */ + +/********************************************************************//** +Calculate the checksum of a page from compressed table and update the page. */ +UNIV_INTERN +void +buf_flush_update_zip_checksum( +/*==========================*/ + buf_frame_t* page, /*!< in/out: Page to update */ + ulint zip_size, /*!< in: Compressed page size */ + lsn_t lsn); /*!< in: Lsn to stamp on the page */ + +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/********************************************************************//** +Checks if buf_pool->zip_mutex is owned and is serving for a given page as its +block mutex. +@return true if buf_pool->zip_mutex is owned. */ +UNIV_INLINE +bool +buf_own_zip_mutex_for_page( +/*=======================*/ + const buf_page_t* bpage) + __attribute__((nonnull,warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/** The common buffer control block structure +for compressed and uncompressed frames */ + +/** Number of bits used for buffer page states. */ +#define BUF_PAGE_STATE_BITS 3 + +struct buf_page_t{ + /** @name General fields + None of these bit-fields must be modified without holding + buf_page_get_mutex() [buf_block_t::mutex or + buf_pool->zip_mutex], since they can be stored in the same + machine word. */ + /* @{ */ + + ib_uint32_t space; /*!< tablespace id. */ + ib_uint32_t offset; /*!< page number. */ + /** count of how manyfold this block is currently bufferfixed */ +#ifdef PAGE_ATOMIC_REF_COUNT + ib_uint32_t buf_fix_count; + + /** type of pending I/O operation; Transitions from BUF_IO_NONE to + BUF_IO_WRITE and back are protected by the buf_page_get_mutex() mutex + and the corresponding flush state mutex. The flush state mutex + protection for io_fix and flush_type is not strictly required, but it + ensures consistent buffer pool instance state snapshots in + buf_pool_validate_instance(). 
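+	Under PAGE_ATOMIC_REF_COUNT the adjacent buf_fix_count is widened
+	to a full 32-bit word so it can be adjusted with atomic built-ins
+	rather than under a mutex; a sketch of the intended fixing step
+	(the exact primitive is an assumption):
+@code
+	os_atomic_increment_uint32(&bpage->buf_fix_count, 1);
+@endcode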
@see enum buf_io_fix */ + byte io_fix; + + byte state; +#else + unsigned buf_fix_count:19; + + /** type of pending I/O operation; also protected by + buf_pool->mutex for writes only @see enum buf_io_fix */ + unsigned io_fix:2; + + /*!< state of the control block. + State transitions from BUF_BLOCK_READY_FOR_USE to BUF_BLOCK_MEMORY + need not be protected by buf_page_get_mutex(). @see enum buf_page_state. + State changes that are relevant to page_hash are additionally protected + by the appropriate page_hash mutex i.e.: if a page is in page_hash or + is being added to/removed from page_hash then the corresponding changes + must also be protected by page_hash mutex. */ + unsigned state:BUF_PAGE_STATE_BITS; + +#endif /* PAGE_ATOMIC_REF_COUNT */ + +#ifndef UNIV_HOTBACKUP + unsigned flush_type:2; /*!< if this block is currently being + flushed to disk, this tells the + flush_type. Writes during flushing + protected by buf_page_get_mutex_enter() + mutex and the corresponding flush state + mutex. + @see buf_flush_t */ + unsigned buf_pool_index:6;/*!< index number of the buffer pool + that this block belongs to */ +# if MAX_BUFFER_POOLS > 64 +# error "MAX_BUFFER_POOLS > 64; redefine buf_pool_index:6" +# endif + /* @} */ +#endif /* !UNIV_HOTBACKUP */ + page_zip_des_t zip; /*!< compressed page; zip.data + (but not the data it points to) is + protected by buf_pool->zip_mutex; + state == BUF_BLOCK_ZIP_PAGE and + zip.data == NULL means an active + buf_pool->watch */ +#ifndef UNIV_HOTBACKUP + buf_page_t* hash; /*!< node used in chaining to + buf_pool->page_hash or + buf_pool->zip_hash */ +#ifdef UNIV_DEBUG + ibool in_page_hash; /*!< TRUE if in buf_pool->page_hash */ + ibool in_zip_hash; /*!< TRUE if in buf_pool->zip_hash */ +#endif /* UNIV_DEBUG */ + + /** @name Page flushing fields */ + /* @{ */ + + UT_LIST_NODE_T(buf_page_t) list; + /*!< based on state, this is a + list node, protected either by + a corresponding list mutex, + in one of the following lists in + buf_pool: + + - BUF_BLOCK_NOT_USED: free + - BUF_BLOCK_FILE_PAGE: flush_list + - BUF_BLOCK_ZIP_DIRTY: flush_list + - BUF_BLOCK_ZIP_PAGE: zip_clean + + If bpage is part of flush_list + then the node pointers are + covered by buf_pool->flush_list_mutex. + Otherwise these pointers are + protected by a corresponding list + mutex. + + The contents of the list node + is undefined if !in_flush_list + && state == BUF_BLOCK_FILE_PAGE, + or if state is one of + BUF_BLOCK_MEMORY, + BUF_BLOCK_REMOVE_HASH or + BUF_BLOCK_READY_IN_USE. */ + +#ifdef UNIV_DEBUG + ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list; + when buf_pool->flush_list_mutex is + free, the following should hold: + in_flush_list + == (state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_ZIP_DIRTY) + Writes to this field must be + covered by both block->mutex + and buf_pool->flush_list_mutex. Hence + reads can happen while holding + any one of the two mutexes */ + ibool in_free_list; /*!< TRUE if in buf_pool->free; when + buf_pool->free_list_mutex is free, the + following should hold: in_free_list + == (state == BUF_BLOCK_NOT_USED) */ +#endif /* UNIV_DEBUG */ + lsn_t newest_modification; + /*!< log sequence number of + the youngest modification to + this block, zero if not + modified. Protected by block + mutex */ + lsn_t oldest_modification; + /*!< log sequence number of + the START of the log entry + written of the oldest + modification to this block + which has not yet been flushed + on disk; zero if all + modifications are on disk. 
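+				Across all buffer pool instances the
+				minimum nonzero oldest_modification
+				also bounds how far the log checkpoint
+				may advance (the usual InnoDB
+				checkpoint rule).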
+ Writes to this field must be + covered by both block->mutex + and buf_pool->flush_list_mutex. Hence + reads can happen while holding + any one of the two mutexes */ + /* @} */ + /** @name LRU replacement algorithm fields */ + /* @{ */ + + UT_LIST_NODE_T(buf_page_t) LRU; + /*!< node of the LRU list */ +#ifdef UNIV_DEBUG + ibool in_LRU_list; /*!< TRUE if the page is in + the LRU list; used in + debugging */ +#endif /* UNIV_DEBUG */ + unsigned old:1; /*!< TRUE if the block is in the old + blocks in buf_pool->LRU_old. Protected + by the LRU list mutex. May be read for + heuristics purposes under the block + mutex instead. */ + unsigned freed_page_clock:31;/*!< the value of + buf_pool->freed_page_clock + when this block was the last + time put to the head of the + LRU list; a thread is allowed + to read this for heuristic + purposes without holding any + mutex or latch */ + /* @} */ + unsigned access_time; /*!< time of first access, or + 0 if the block was never accessed + in the buffer pool. Protected by + block mutex */ + ibool is_corrupt; +# if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + ibool file_page_was_freed; + /*!< this is set to TRUE when + fsp frees a page in buffer pool; + protected by buf_pool->zip_mutex + or buf_block_t::mutex. */ +# endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ +}; + +/** The buffer control block structure */ + +struct buf_block_t{ + + /** @name General fields */ + /* @{ */ + + buf_page_t page; /*!< page information; this must + be the first field, so that + buf_pool->page_hash can point + to buf_page_t or buf_block_t */ + byte* frame; /*!< pointer to buffer frame which + is of size UNIV_PAGE_SIZE, and + aligned to an address divisible by + UNIV_PAGE_SIZE */ +#ifndef UNIV_HOTBACKUP + UT_LIST_NODE_T(buf_block_t) unzip_LRU; + /*!< node of the decompressed LRU list; + a block is in the unzip_LRU list + if page.state == BUF_BLOCK_FILE_PAGE + and page.zip.data != NULL */ +#ifdef UNIV_DEBUG + ibool in_unzip_LRU_list;/*!< TRUE if the page is in the + decompressed LRU list; + used in debugging */ +#endif /* UNIV_DEBUG */ + ib_mutex_t mutex; /*!< mutex protecting this block: + state, io_fix, buf_fix_count, + and accessed; we introduce this new + mutex in InnoDB-5.1 to relieve + contention on the buffer pool mutex */ + rw_lock_t lock; /*!< read-write lock of the buffer + frame */ + unsigned lock_hash_val:32;/*!< hashed value of the page address + in the record lock hash table; + protected by buf_block_t::lock + (or buf_block_t::mutex in + buf_page_get_gen(), + buf_page_init_for_read() + and buf_page_create()) */ + ibool check_index_page_at_flush; + /*!< TRUE if we know that this is + an index page, and want the database + to check its consistency before flush; + note that there may be pages in the + buffer pool which are index pages, + but this flag is not set because + we do not keep track of all pages; + NOT protected by any mutex */ + /* @} */ + /** @name Optimistic search field */ + /* @{ */ + + ib_uint64_t modify_clock; /*!< this clock is incremented every + time a pointer to a record on the + page may become obsolete; this is + used in the optimistic cursor + positioning: if the modify clock has + not changed, we know that the pointer + is still valid; this field may be + changed if the thread (1) owns the LRU + list mutex and the page is not + bufferfixed, or (2) the thread has an + x-latch on the block */ + /* @} */ + /** @name Hash search fields (unprotected) + NOTE that these fields are NOT protected by any 
semaphore! */ + /* @{ */ + + ulint n_hash_helps; /*!< counter which controls building + of a new hash index for the page */ + ulint n_fields; /*!< recommended prefix length for hash + search: number of full fields */ + ulint n_bytes; /*!< recommended prefix: number of bytes + in an incomplete field */ + ibool left_side; /*!< TRUE or FALSE, depending on + whether the leftmost record of several + records with the same prefix should be + indexed in the hash index */ + /* @} */ + + /** @name Hash search fields + These 5 fields may only be modified when we have + an x-latch on btr_search_latch AND + - we are holding an s-latch or x-latch on buf_block_t::lock or + - we know that buf_block_t::buf_fix_count == 0. + + An exception to this is when we init or create a page + in the buffer pool in buf0buf.cc. + + Another exception is that assigning block->index = NULL + is allowed whenever holding an x-latch on btr_search_latch. */ + + /* @{ */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ulint n_pointers; /*!< used in debugging: the number of + pointers in the adaptive hash index + pointing to this frame */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + unsigned curr_n_fields:10;/*!< prefix length for hash indexing: + number of full fields */ + unsigned curr_n_bytes:15;/*!< number of bytes in hash + indexing */ + unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */ + dict_index_t* index; /*!< Index for which the + adaptive hash index has been + created, or NULL if the page + does not exist in the + index. Note that it does not + guarantee that the index is + complete, though: there may + have been hash collisions, + record deletions, etc. */ + /* @} */ +# ifdef UNIV_SYNC_DEBUG + /** @name Debug fields */ + /* @{ */ + rw_lock_t debug_latch; /*!< in the debug version, each thread + which bufferfixes the block acquires + an s-latch here; so we can use the + debug utilities in sync0rw */ + /* @} */ +# endif +#endif /* !UNIV_HOTBACKUP */ +}; + +/** Check if a buf_block_t object is in a valid state +@param block buffer block +@return TRUE if valid */ +#define buf_block_state_valid(block) \ +(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED \ + && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH)) + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Compute the hash fold value for blocks in buf_pool->zip_hash. */ +/* @{ */ +#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE) +#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame) +#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) +/* @} */ + +/** Struct that is embedded in the free zip blocks */ +struct buf_buddy_free_t { + union { + ulint size; /*!< size of the block */ + byte bytes[FIL_PAGE_DATA]; + /*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID] + == BUF_BUDDY_FREE_STAMP denotes a free + block. If the space_id field of buddy + block != BUF_BUDDY_FREE_STAMP, the block + is not in any zip_free list. If the + space_id is BUF_BUDDY_FREE_STAMP then + stamp[0] will contain the + buddy block size. */ + } stamp; + + buf_page_t bpage; /*!< Embedded bpage descriptor */ + UT_LIST_NODE_T(buf_buddy_free_t) list; + /*!< Node of zip_free list */ +}; + +/** @brief The buffer pool statistics structure. */ +struct buf_pool_stat_t{ + ulint n_page_gets; /*!< number of page gets performed; + also successful searches through + the adaptive hash index are + counted as page gets. */ + ulint n_pages_read; /*!< number read operations. Accessed + atomically. 
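+				Together with n_page_gets this gives
+				the reported buffer pool hit rate,
+				roughly
+				1000 * (1 - n_pages_read / n_page_gets)
+				per mille: e.g. 250 physical reads out
+				of 10000 gets is about 975 / 1000.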
 */
+	ulint	n_pages_written;/*!< number of write operations. Accessed
+				atomically.*/
+	ulint	n_pages_created;/*!< number of pages created
+				in the pool with no read */
+	ulint	n_ra_pages_read_rnd;/*!< number of pages read in
+				as part of random read ahead */
+	ulint	n_ra_pages_read;/*!< number of pages read in
+				as part of read ahead */
+	ulint	n_ra_pages_evicted;/*!< number of read ahead
+				pages that are evicted without
+				being accessed */
+	ulint	n_pages_made_young; /*!< number of pages made young, in
+				calls to buf_LRU_make_block_young() */
+	ulint	n_pages_not_made_young; /*!< number of pages not made
+				young because the first access
+				was not long enough ago, in
+				buf_page_peek_if_too_old() */
+	ulint	LRU_bytes;	/*!< LRU size in bytes */
+	ulint	flush_list_bytes;/*!< flush_list size in bytes */
+	ulint	buf_lru_flush_page_count;
+};
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_t {
+	/** Number of blocks allocated from the buddy system. */
+	ulint		used;
+	/** Number of blocks relocated by the buddy system. */
+	ib_uint64_t	relocated;
+	/** Total duration of block relocations, in microseconds. */
+	ib_uint64_t	relocated_usec;
+};
+
+/** @brief The buffer pool structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+struct buf_pool_t{
+
+	/** @name General fields */
+	/* @{ */
+	ib_mutex_t	zip_mutex;	/*!< Zip mutex of this buffer
+					pool instance, protects compressed
+					only pages (of type buf_page_t, not
+					buf_block_t) */
+	ib_prio_mutex_t	LRU_list_mutex;
+	ib_prio_mutex_t	free_list_mutex;
+	ib_mutex_t	zip_free_mutex;
+	ib_mutex_t	zip_hash_mutex;
+	ib_mutex_t	flush_state_mutex;	/*!< Flush state protection
+					mutex */
+	ulint		instance_no;	/*!< Array index of this buffer
+					pool instance */
+	ulint		old_pool_size;	/*!< Old pool size in bytes */
+	ulint		curr_pool_size;	/*!< Current pool size in bytes */
+	ulint		LRU_old_ratio;	/*!< Reserve this much of the buffer
+					pool for "old" blocks */
+#ifdef UNIV_DEBUG
+	ulint		buddy_n_frames;	/*!< Number of frames allocated from
+					the buffer pool to the buddy system */
+#endif
+	ulint		n_chunks;	/*!< number of buffer pool chunks */
+	buf_chunk_t*	chunks;		/*!< buffer pool chunks */
+	ulint		curr_size;	/*!< current pool size in pages */
+	ulint		read_ahead_area;/*!< size in pages of the area which
+					the read-ahead algorithms read if
+					invoked */
+	hash_table_t*	page_hash;	/*!< hash table of buf_page_t or
+					buf_block_t file pages,
+					buf_page_in_file() == TRUE,
+					indexed by (space_id, offset).
+					page_hash is protected by an
+					array of mutexes. */
+	hash_table_t*	zip_hash;	/*!< hash table of buf_block_t blocks
+					whose frames are allocated to the
+					zip buddy system,
+					indexed by block->frame */
+	ulint		n_pend_reads;	/*!< number of pending read
+					operations. Accessed atomically */
+	ulint		n_pend_unzip;	/*!< number of pending decompressions.
+					Accessed atomically */
+
+	time_t		last_printout_time;
+					/*!< when buf_print_io was last
+					called. Accesses not protected */
+	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+					/*!< Statistics of buddy system,
+					indexed by block size. Protected by
+					zip_free_mutex. */
+	buf_pool_stat_t	stat;		/*!< current statistics */
+	buf_pool_stat_t	old_stat;	/*!< old statistics */
+
+	/* @} */
+
+	/** @name Page flushing algorithm fields */
+
+	/* @{ */
+
+	ib_mutex_t	flush_list_mutex;/*!< mutex protecting the
+					flush list access.
This mutex + protects flush_list, flush_rbt + and bpage::list pointers when + the bpage is on flush_list. It + also protects writes to + bpage::oldest_modification and + flush_list_hp */ + const buf_page_t* flush_list_hp;/*!< "hazard pointer" + used during scan of flush_list + while doing flush list batch. + Protected by flush_list_mutex */ + UT_LIST_BASE_NODE_T(buf_page_t) flush_list; + /*!< base node of the modified block + list */ + ibool init_flush[BUF_FLUSH_N_TYPES]; + /*!< this is TRUE when a flush of the + given type is being initialized. + Protected by flush_state_mutex. */ + ulint n_flush[BUF_FLUSH_N_TYPES]; + /*!< this is the number of pending + writes in the given flush type. + Protected by flush_state_mutex. */ + os_event_t no_flush[BUF_FLUSH_N_TYPES]; + /*!< this is in the set state + when there is no flush batch + of the given type running */ + ib_rbt_t* flush_rbt; /*!< a red-black tree is used + exclusively during recovery to + speed up insertions in the + flush_list. This tree contains + blocks in order of + oldest_modification LSN and is + kept in sync with the + flush_list. + Each member of the tree MUST + also be on the flush_list. + This tree is relevant only in + recovery and is set to NULL + once the recovery is over. + Protected by flush_list_mutex */ + ulint freed_page_clock;/*!< a sequence number used + to count the number of buffer + blocks removed from the end of + the LRU list; NOTE that this + counter may wrap around at 4 + billion! A thread is allowed + to read this for heuristic + purposes without holding any + mutex or latch. For non-heuristic + purposes protected by LRU_list_mutex */ + ibool try_LRU_scan; /*!< Set to FALSE when an LRU + scan for free block fails. This + flag is used to avoid repeated + scans of LRU list when we know + that there is no free block + available in the scan depth for + eviction. Set to TRUE whenever + we flush a batch from the + buffer pool. Accessed atomically. */ + /* @} */ + + /** @name LRU replacement algorithm fields */ + /* @{ */ + + UT_LIST_BASE_NODE_T(buf_page_t) free; + /*!< base node of the free + block list */ + UT_LIST_BASE_NODE_T(buf_page_t) LRU; + /*!< base node of the LRU list */ + buf_page_t* LRU_old; /*!< pointer to the about + LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV + oldest blocks in the LRU list; + NULL if LRU length less than + BUF_LRU_OLD_MIN_LEN; + NOTE: when LRU_old != NULL, its length + should always equal LRU_old_len */ + ulint LRU_old_len; /*!< length of the LRU list from + the block to which LRU_old points + onward, including that block; + see buf0lru.cc for the restrictions + on this value; 0 if LRU_old == NULL; + NOTE: LRU_old_len must be adjusted + whenever LRU_old shrinks or grows! */ + + UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU; + /*!< base node of the + unzip_LRU list. The list is protected + by LRU list mutex. */ + + /* @} */ + /** @name Buddy allocator fields + The buddy allocator is used for allocating compressed page + frames and buf_page_t descriptors of blocks that exist + in the buffer pool only in compressed form. */ + /* @{ */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + UT_LIST_BASE_NODE_T(buf_page_t) zip_clean; + /*!< unmodified compressed pages */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; + /*!< buddy free lists */ + + buf_page_t* watch; + /*!< Sentinel records for buffer + pool watches. 
 */
+
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
+#endif
+	/* @} */
+};
+
+/** @name Accessors for buffer pool mutexes
+Use these instead of accessing buffer pool mutexes directly. */
+/* @{ */
+
+/** Test if flush list mutex is owned. */
+#define buf_flush_list_mutex_own(b) mutex_own(&b->flush_list_mutex)
+
+/** Acquire the flush list mutex. */
+#define buf_flush_list_mutex_enter(b) do {		\
+	mutex_enter(&b->flush_list_mutex);		\
+} while (0)
+/** Release the flush list mutex. */
+# define buf_flush_list_mutex_exit(b) do {		\
+	mutex_exit(&b->flush_list_mutex);		\
+} while (0)
+
+/** Test if block->mutex is owned. */
+#define buf_block_mutex_own(b) mutex_own(&(b)->mutex)
+
+/** Acquire the block->mutex. */
+#define buf_block_mutex_enter(b) do {			\
+	mutex_enter(&(b)->mutex);		\
+} while (0)
+
+/** Release the block->mutex. */
+#define buf_block_mutex_exit(b) do {			\
+	mutex_exit(&(b)->mutex);		\
+} while (0)
+
+
+/** Get appropriate page_hash_lock. */
+# define buf_page_hash_lock_get(b, f)	\
+	hash_get_lock(b->page_hash, f)
+
+#ifdef UNIV_SYNC_DEBUG
+/** Test if page_hash lock is held in s-mode. */
+# define buf_page_hash_lock_held_s(b, p)	\
+	rw_lock_own(buf_page_hash_lock_get(b,	\
+		buf_page_address_fold(p->space,	\
+			p->offset)),	\
+		RW_LOCK_SHARED)
+
+/** Test if page_hash lock is held in x-mode. */
+# define buf_page_hash_lock_held_x(b, p)	\
+	rw_lock_own(buf_page_hash_lock_get(b,	\
+		buf_page_address_fold(p->space,	\
+			p->offset)),	\
+		RW_LOCK_EX)
+
+/** Test if page_hash lock is held in x or s-mode. */
+# define buf_page_hash_lock_held_s_or_x(b, p)	\
+	(buf_page_hash_lock_held_s(b, p)	\
+	|| buf_page_hash_lock_held_x(b, p))
+
+# define buf_block_hash_lock_held_s(b, p)	\
+	buf_page_hash_lock_held_s(b, &(p->page))
+
+# define buf_block_hash_lock_held_x(b, p)	\
+	buf_page_hash_lock_held_x(b, &(p->page))
+
+# define buf_block_hash_lock_held_s_or_x(b, p)	\
+	buf_page_hash_lock_held_s_or_x(b, &(p->page))
+#else /* UNIV_SYNC_DEBUG */
+# define buf_page_hash_lock_held_s(b, p) (TRUE)
+# define buf_page_hash_lock_held_x(b, p) (TRUE)
+# define buf_page_hash_lock_held_s_or_x(b, p) (TRUE)
+# define buf_block_hash_lock_held_s(b, p) (TRUE)
+# define buf_block_hash_lock_held_x(b, p) (TRUE)
+# define buf_block_hash_lock_held_s_or_x(b, p) (TRUE)
+#endif /* UNIV_SYNC_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
+/* @} */
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+ +NOT_USED: is in free list, not in LRU list, not in flush list, nor + page hash table +READY_FOR_USE: is not in free list, LRU list, or flush list, nor page + hash table +MEMORY: is not in free list, LRU list, or flush list, nor page + hash table +FILE_PAGE: space and offset are defined, is in page hash table + if io_fix == BUF_IO_WRITE, + pool: no_flush[flush_type] is in reset state, + pool: n_flush[flush_type] > 0 + + (1) if buf_fix_count == 0, then + is in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + is x-locked, + if and only if io_fix == BUF_IO_READ + is s-locked, + if and only if io_fix == BUF_IO_WRITE + + (2) if buf_fix_count > 0, then + is not in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + if io_fix == BUF_IO_READ, + is x-locked + if io_fix == BUF_IO_WRITE, + is s-locked + +State transitions: + +NOT_USED => READY_FOR_USE +READY_FOR_USE => MEMORY +READY_FOR_USE => FILE_PAGE +MEMORY => NOT_USED +FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if + (1) buf_fix_count == 0, + (2) oldest_modification == 0, and + (3) io_fix == 0. +*/ + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/** Functor to validate the LRU list. */ +struct CheckInLRUList { + void operator()(const buf_page_t* elem) const + { + ut_a(elem->in_LRU_list); + } +}; + +/** Functor to validate the LRU list. */ +struct CheckInFreeList { + void operator()(const buf_page_t* elem) const + { + ut_a(elem->in_free_list); + } +}; + +struct CheckUnzipLRUAndLRUList { + void operator()(const buf_block_t* elem) const + { + ut_a(elem->page.in_LRU_list); + ut_a(elem->in_unzip_LRU_list); + } +}; +#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */ + +#ifndef UNIV_NONINL +#include "buf0buf.ic" +#endif + +#endif diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic new file mode 100644 index 00000000000..10f0e02cb8f --- /dev/null +++ b/storage/xtradb/include/buf0buf.ic @@ -0,0 +1,1466 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buf.ic +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "mtr0mtr.h" +#ifndef UNIV_HOTBACKUP +#include "buf0flu.h" +#include "buf0lru.h" +#include "buf0rea.h" + +/** A chunk of buffers. The buffer pool is allocated in chunks. */ +struct buf_chunk_t{ + ulint mem_size; /*!< allocated size of the chunk */ + ulint size; /*!< size of frames[] and blocks[] */ + void* mem; /*!< pointer to the memory area which + was allocated for the frames */ + buf_block_t* blocks; /*!< array of buffer control blocks */ +}; + + +#include "srv0srv.h" + +/*********************************************************************//** +Gets the current size of buffer buf_pool in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void) +/*========================*/ +{ + return(srv_buf_pool_curr_size); +} + +/********************************************************************//** +Calculates the index of a buffer pool to the buf_pool[] array. +@return the position of the buffer pool in buf_pool[] */ +UNIV_INLINE +ulint +buf_pool_index( +/*===========*/ + const buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ulint i = buf_pool - buf_pool_ptr; + ut_ad(i < MAX_BUFFER_POOLS); + ut_ad(i < srv_buf_pool_instances); + return(i); +} + +/******************************************************************//** +Returns the buffer pool instance given a page instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_bpage( +/*================*/ + const buf_page_t* bpage) /*!< in: buffer pool page */ +{ + ulint i; + i = bpage->buf_pool_index; + ut_ad(i < srv_buf_pool_instances); + return(&buf_pool_ptr[i]); +} + +/******************************************************************//** +Returns the buffer pool instance given a block instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_block( +/*================*/ + const buf_block_t* block) /*!< in: block */ +{ + return(buf_pool_from_bpage(&block->page)); +} + +/*********************************************************************//** +Gets the current size of buffer buf_pool in pages. +@return size in pages*/ +UNIV_INLINE +ulint +buf_pool_get_n_pages(void) +/*======================*/ +{ + return(buf_pool_get_curr_size() / UNIV_PAGE_SIZE); +} + +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +ulint +buf_page_get_freed_page_clock( +/*==========================*/ + const buf_page_t* bpage) /*!< in: block */ +{ + /* This is sometimes read without holding any buffer pool mutex. */ + return(bpage->freed_page_clock); +} + +/********************************************************************//** +Reads the freed_page_clock of a buffer block. 
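+
+The per-pool freed_page_clock minus the value stamped on a block estimates
+how many pages have been evicted since the block was last moved to the MRU
+end; buf_page_peek_if_young() below turns that into a threshold. For
+illustration with assumed numbers: in a pool of 80000 pages with an old
+ratio of 3/8, a block still counts as young while fewer than about
+80000 * (5/8) / 4 = 12500 pages have been evicted since it was stamped.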
+@return freed_page_clock */ +UNIV_INLINE +ulint +buf_block_get_freed_page_clock( +/*===========================*/ + const buf_block_t* block) /*!< in: block */ +{ + return(buf_page_get_freed_page_clock(&block->page)); +} + +/********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage) /*!< in: block */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + /* FIXME: bpage->freed_page_clock is 31 bits */ + return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) + < ((ulint) bpage->freed_page_clock + + (buf_pool->curr_size + * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio) + / (BUF_LRU_OLD_RATIO_DIV * 4)))); +} + +/********************************************************************//** +Recommends a move of a block to the start of the LRU list if there is danger +of dropping from the buffer pool. NOTE: does not reserve the buffer pool +mutex. +@return TRUE if should be made younger */ +UNIV_INLINE +ibool +buf_page_peek_if_too_old( +/*=====================*/ + const buf_page_t* bpage) /*!< in: block to make younger */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + if (buf_pool->freed_page_clock == 0) { + /* If eviction has not started yet, do not update the + statistics or move blocks in the LRU list. This is + either the warm-up phase or an in-memory workload. */ + return(FALSE); + } else if (buf_LRU_old_threshold_ms && bpage->old) { + unsigned access_time = buf_page_is_accessed(bpage); + + if (access_time > 0 + && ((ib_uint32_t) (ut_time_ms() - access_time)) + >= buf_LRU_old_threshold_ms) { + return(TRUE); + } + + buf_pool->stat.n_pages_not_made_young++; + return(FALSE); + } else { + return(!buf_page_peek_if_young(bpage)); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Gets the state of a block. +@return state */ +UNIV_INLINE +enum buf_page_state +buf_page_get_state( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + enum buf_page_state state = (enum buf_page_state) bpage->state; + +#ifdef UNIV_DEBUG + switch (state) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_FILE_PAGE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + return(state); +} +/*********************************************************************//** +Gets the state of a block. +@return state */ +UNIV_INLINE +enum buf_page_state +buf_block_get_state( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + return(buf_page_get_state(&block->page)); +} +/*********************************************************************//** +Sets the state of a block. 
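+
+A page being read into the pool typically moves
+BUF_BLOCK_NOT_USED -> BUF_BLOCK_READY_FOR_USE -> BUF_BLOCK_FILE_PAGE and
+back to BUF_BLOCK_NOT_USED on eviction; the debug switch in the body
+below enforces exactly the legal transitions.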
*/ +UNIV_INLINE +void +buf_page_set_state( +/*===============*/ + buf_page_t* bpage, /*!< in/out: pointer to control block */ + enum buf_page_state state) /*!< in: state */ +{ +#ifdef UNIV_DEBUG + enum buf_page_state old_state = buf_page_get_state(bpage); + + switch (old_state) { + case BUF_BLOCK_POOL_WATCH: + ut_error; + break; + case BUF_BLOCK_ZIP_PAGE: + ut_a(state == BUF_BLOCK_ZIP_DIRTY); + break; + case BUF_BLOCK_ZIP_DIRTY: + ut_a(state == BUF_BLOCK_ZIP_PAGE); + break; + case BUF_BLOCK_NOT_USED: + ut_a(state == BUF_BLOCK_READY_FOR_USE); + break; + case BUF_BLOCK_READY_FOR_USE: + ut_a(state == BUF_BLOCK_MEMORY + || state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_NOT_USED); + break; + case BUF_BLOCK_MEMORY: + ut_a(state == BUF_BLOCK_NOT_USED); + break; + case BUF_BLOCK_FILE_PAGE: + ut_a(state == BUF_BLOCK_NOT_USED + || state == BUF_BLOCK_REMOVE_HASH); + break; + case BUF_BLOCK_REMOVE_HASH: + ut_a(state == BUF_BLOCK_MEMORY); + break; + } +#endif /* UNIV_DEBUG */ + bpage->state = state; + ut_ad(buf_page_get_state(bpage) == state); +} + +/*********************************************************************//** +Sets the state of a block. */ +UNIV_INLINE +void +buf_block_set_state( +/*================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + enum buf_page_state state) /*!< in: state */ +{ + buf_page_set_state(&block->page, state); +} + +/*********************************************************************//** +Determines if a block is mapped to a tablespace. +@return TRUE if mapped */ +UNIV_INLINE +ibool +buf_page_in_file( +/*=============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + ut_error; + break; + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_FILE_PAGE: + return(TRUE); + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + return(FALSE); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Determines if a block should be on unzip_LRU list. +@return TRUE if block belongs to unzip_LRU */ +UNIV_INLINE +ibool +buf_page_belongs_to_unzip_LRU( +/*==========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + ut_ad(buf_page_in_file(bpage)); + + return(bpage->zip.data + && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); +} + +/*********************************************************************//** +Gets the mutex of a block. +@return pointer to mutex protecting bpage */ +UNIV_INLINE +ib_mutex_t* +buf_page_get_mutex( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + ut_error; + return(NULL); + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + return(&buf_pool->zip_mutex); + } + default: + return(&((buf_block_t*) bpage)->mutex); + } +} + +/*********************************************************************//** +Get the flush type of a page. 
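+
+For context: BUF_FLUSH_LRU writes pages from the LRU tail so they can be
+evicted, BUF_FLUSH_LIST serves flush-list (checkpoint) batches, and
+BUF_FLUSH_SINGLE_PAGE is a synchronous single-page write;
+BUF_FLUSH_N_TYPES is only an array bound, which the debug switch below
+treats as an error.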
+@return flush type */ +UNIV_INLINE +buf_flush_t +buf_page_get_flush_type( +/*====================*/ + const buf_page_t* bpage) /*!< in: buffer page */ +{ + buf_flush_t flush_type = (buf_flush_t) bpage->flush_type; + +#ifdef UNIV_DEBUG + switch (flush_type) { + case BUF_FLUSH_LRU: + case BUF_FLUSH_LIST: + case BUF_FLUSH_SINGLE_PAGE: + return(flush_type); + case BUF_FLUSH_N_TYPES: + ut_error; + } + ut_error; +#endif /* UNIV_DEBUG */ + return(flush_type); +} +/*********************************************************************//** +Set the flush type of a page. */ +UNIV_INLINE +void +buf_page_set_flush_type( +/*====================*/ + buf_page_t* bpage, /*!< in: buffer page */ + buf_flush_t flush_type) /*!< in: flush type */ +{ + bpage->flush_type = flush_type; + ut_ad(buf_page_get_flush_type(bpage) == flush_type); +} + +/*********************************************************************//** +Map a block to a file page. */ +UNIV_INLINE +void +buf_block_set_file_page( +/*====================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + ulint space, /*!< in: tablespace id */ + ulint page_no)/*!< in: page number */ +{ + buf_block_set_state(block, BUF_BLOCK_FILE_PAGE); + block->page.space = static_cast<ib_uint32_t>(space); + block->page.offset = static_cast<ib_uint32_t>(page_no); +} + +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix( +/*================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + return buf_page_get_io_fix_unlocked(bpage); +} + +/*********************************************************************//** +Gets the io_fix state of a block. Does not assert that the +buf_page_get_mutex() mutex is held, to be used in the cases where it is safe +not to hold it. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix_unlocked( +/*=========================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + enum buf_io_fix io_fix = (enum buf_io_fix) bpage->io_fix; +#ifdef UNIV_DEBUG + switch (io_fix) { + case BUF_IO_NONE: + case BUF_IO_READ: + case BUF_IO_WRITE: + case BUF_IO_PIN: + return(io_fix); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(io_fix); +} + +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix( +/*=================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + return(buf_page_get_io_fix(&block->page)); +} + +/*********************************************************************//** +Gets the io_fix state of a block. Does not assert that the +buf_page_get_mutex() mutex is held, to be used in the cases where it is safe +not to hold it. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix_unlocked( +/*==========================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + return(buf_page_get_io_fix_unlocked(&block->page)); +} + + +/*********************************************************************//** +Sets the io_fix state of a block. 
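+
+A typical write-out sequence, sketched for illustration (only the minimum
+locking asserted by this function is shown; the flush code holds more):
+@code
+	mutex_enter(buf_page_get_mutex(bpage));
+	buf_page_set_io_fix(bpage, BUF_IO_WRITE);
+	mutex_exit(buf_page_get_mutex(bpage));
+	// ... submit the write; the i/o handler later resets the state
+	// to BUF_IO_NONE in buf_page_io_complete() ...
+@endcode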
*/ +UNIV_INLINE +void +buf_page_set_io_fix( +/*================*/ + buf_page_t* bpage, /*!< in/out: control block */ + enum buf_io_fix io_fix) /*!< in: io_fix state */ +{ + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + bpage->io_fix = io_fix; + ut_ad(buf_page_get_io_fix(bpage) == io_fix); +} + +/*********************************************************************//** +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_block_set_io_fix( +/*=================*/ + buf_block_t* block, /*!< in/out: control block */ + enum buf_io_fix io_fix) /*!< in: io_fix state */ +{ + buf_page_set_io_fix(&block->page, io_fix); +} + +/*********************************************************************//** +Makes a block sticky. A sticky block implies that even after we release +the buf_pool->LRU_list_mutex and the block->mutex: +* it cannot be removed from the flush_list +* the block descriptor cannot be relocated +* it cannot be removed from the LRU list +Note that: +* the block can still change its position in the LRU list +* the next and previous pointers can change. */ +UNIV_INLINE +void +buf_page_set_sticky( +/*================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); +#endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + ut_ad(bpage->in_LRU_list); + + bpage->io_fix = BUF_IO_PIN; +} + +/*********************************************************************//** +Removes stickiness of a block. */ +UNIV_INLINE +void +buf_page_unset_sticky( +/*==================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN); + + bpage->io_fix = BUF_IO_NONE; +} + +/********************************************************************//** +Determine if a buffer block can be relocated in memory. The block +can be dirty, but it must not be I/O-fixed or bufferfixed. */ +UNIV_INLINE +ibool +buf_page_can_relocate( +/*==================*/ + const buf_page_t* bpage) /*!< control block being relocated */ +{ + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + return(buf_page_get_io_fix(bpage) == BUF_IO_NONE + && bpage->buf_fix_count == 0); +} + +/*********************************************************************//** +Determine if a block has been flagged old. +@return TRUE if old */ +UNIV_INLINE +ibool +buf_page_is_old( +/*============*/ + const buf_page_t* bpage) /*!< in: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); +#endif + /* Buffer page mutex is not strictly required here for heuristic + purposes even if LRU mutex is not being held. Keep the assertion + for now since all the callers hold it. */ + ut_ad(mutex_own(buf_page_get_mutex(bpage)) + || mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(buf_page_in_file(bpage)); + + return(bpage->old); +} + +/*********************************************************************//** +Flag a block old. 
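+
+For illustration only, a hypothetical call site: the caller must hold the
+LRU list mutex (the instance is obtainable via buf_pool_from_bpage(bpage))
+and the block must be in the LRU list:
+
+	mutex_enter(&buf_pool->LRU_list_mutex);
+	buf_page_set_old(bpage, TRUE);
+	mutex_exit(&buf_pool->LRU_list_mutex);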
*/ +UNIV_INLINE +void +buf_page_set_old( +/*=============*/ + buf_page_t* bpage, /*!< in/out: control block */ + ibool old) /*!< in: old */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); +#endif /* UNIV_DEBUG */ + ut_a(buf_page_in_file(bpage)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(bpage->in_LRU_list); + +#ifdef UNIV_LRU_DEBUG + ut_a((buf_pool->LRU_old_len == 0) == (buf_pool->LRU_old == NULL)); + /* If a block is flagged "old", the LRU_old list must exist. */ + ut_a(!old || buf_pool->LRU_old); + + if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)) { + const buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + const buf_page_t* next = UT_LIST_GET_NEXT(LRU, bpage); + if (prev->old == next->old) { + ut_a(prev->old == old); + } else { + ut_a(!prev->old); + ut_a(buf_pool->LRU_old == (old ? bpage : next)); + } + } +#endif /* UNIV_LRU_DEBUG */ + + bpage->old = old; +} + +/*********************************************************************//** +Determine the time of first access of a block in the buffer pool. +@return ut_time_ms() at the time of first access, 0 if not accessed */ +UNIV_INLINE +unsigned +buf_page_is_accessed( +/*=================*/ + const buf_page_t* bpage) /*!< in: control block */ +{ + ut_ad(buf_page_in_file(bpage)); + + return(bpage->access_time); +} + +/*********************************************************************//** +Flag a block accessed. */ +UNIV_INLINE +void +buf_page_set_accessed( +/*==================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + ut_a(buf_page_in_file(bpage)); + + if (bpage->access_time == 0) { + /* Make this the time of the first access. */ + bpage->access_time = static_cast<uint>(ut_time_ms()); + } +} + +/*********************************************************************//** +Gets the buf_block_t handle of a buffered file block if an uncompressed +page frame exists, or NULL. +@return control block, or NULL */ +UNIV_INLINE +buf_block_t* +buf_page_get_block( +/*===============*/ + buf_page_t* bpage) /*!< in: control block, or NULL */ +{ + if (bpage != NULL) { +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage) + || mutex_own(&buf_pool->LRU_list_mutex)); +#endif + ut_ad(buf_page_in_file(bpage)); + + if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { + return((buf_block_t*) bpage); + } + } + + return(NULL); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets a pointer to the memory frame of a block. +@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + SRV_CORRUPT_TABLE_CHECK(block, return(0);); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + ut_error; + break; + case BUF_BLOCK_FILE_PAGE: +# ifndef UNIV_HOTBACKUP + ut_a(block->page.buf_fix_count > 0); +# endif /* !UNIV_HOTBACKUP */ + /* fall through */ + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + goto ok; + } + ut_error; +ok: + return((buf_frame_t*) block->frame); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets the space id of a block. 
+@return space id */ +UNIV_INLINE +ulint +buf_page_get_space( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + ut_ad(bpage); + ut_a(buf_page_in_file(bpage)); + + return(bpage->space); +} + +/*********************************************************************//** +Gets the space id of a block. +@return space id */ +UNIV_INLINE +ulint +buf_block_get_space( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + ut_ad(block); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + return(block->page.space); +} + +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_page_get_page_no( +/*=================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + ut_ad(bpage); + ut_a(buf_page_in_file(bpage)); + + return(bpage->offset); +} +/*********************************************************************** +FIXME_FTS Gets the frame the pointer is pointing to. */ +UNIV_INLINE +buf_frame_t* +buf_frame_align( +/*============*/ + /* out: pointer to frame */ + byte* ptr) /* in: pointer to a frame */ +{ + buf_frame_t* frame; + + ut_ad(ptr); + + frame = (buf_frame_t*) ut_align_down(ptr, UNIV_PAGE_SIZE); + + return(frame); +} + +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_block_get_page_no( +/*==================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + ut_ad(block); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + return(block->page.offset); +} + +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_page_get_zip_size( +/*==================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + return(bpage->zip.ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << bpage->zip.ssize : 0); +} + +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_block_get_zip_size( +/*===================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + return(block->page.zip.ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << block->page.zip.ssize : 0); +} + +#ifndef UNIV_HOTBACKUP +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. +@return compressed page descriptor, or NULL */ +UNIV_INLINE +const page_zip_des_t* +buf_frame_get_page_zip( +/*===================*/ + const byte* ptr) /*!< in: pointer to the page */ +{ + return(buf_block_get_page_zip(buf_block_align(ptr))); +} +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Gets the space id, page offset, and byte offset within page of a +pointer pointing to a buffer frame containing a file page. 
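+
+For example (an illustrative sketch only), given any pointer ptr into a
+buffer frame:
+
+	ulint		space;
+	fil_addr_t	addr;
+
+	buf_ptr_get_fsp_addr(ptr, &space, &addr);
+	... addr.page is the page number, addr.boffset the byte offset ...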
*/ +UNIV_INLINE +void +buf_ptr_get_fsp_addr( +/*=================*/ + const void* ptr, /*!< in: pointer to a buffer frame */ + ulint* space, /*!< out: space id */ + fil_addr_t* addr) /*!< out: page offset and byte offset */ +{ + const page_t* page = (const page_t*) ut_align_down(ptr, + UNIV_PAGE_SIZE); + + *space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET); + addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Gets the hash value of the page the pointer is pointing to. This can be used +in searches in the lock hash table. +@return lock hash value */ +UNIV_INLINE +ulint +buf_block_get_lock_hash_val( +/*========================*/ + const buf_block_t* block) /*!< in: block */ +{ + ut_ad(block); + ut_ad(buf_page_in_file(&block->page)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_EXCLUSIVE) + || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + return(block->lock_hash_val); +} + +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. +@return: the allocated descriptor. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ +{ + buf_page_t* bpage; + + bpage = (buf_page_t*) ut_malloc(sizeof *bpage); + ut_d(memset(bpage, 0, sizeof *bpage)); + UNIV_MEM_ALLOC(bpage, sizeof *bpage); + + return(bpage); +} + +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ +{ + ut_free(bpage); +} + +/********************************************************************//** +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block) /*!< in, own: block to be freed */ +{ + mutex_enter(&block->mutex); + + ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); + + buf_LRU_block_free_non_file_page(block); + + mutex_exit(&block->mutex); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Copies contents of a buffer frame to a given buffer. +@return buf */ +UNIV_INLINE +byte* +buf_frame_copy( +/*===========*/ + byte* buf, /*!< in: buffer to copy to */ + const buf_frame_t* frame) /*!< in: buffer frame */ +{ + ut_ad(buf && frame); + + ut_memcpy(buf, frame, UNIV_PAGE_SIZE); + + return(buf); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Calculates a folded value of a file page address to use in the page hash +table. +@return the folded value */ +UNIV_INLINE +ulint +buf_page_address_fold( +/*==================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page within space */ +{ + return((space << 20) + space + offset); +} + +/********************************************************************//** +Gets the youngest modification log sequence number for a frame. +Returns zero if not file page or no modification occurred yet. 
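+
+For instance (an illustrative sketch only), a caller may use the returned
+value to test whether the frame was ever modified while in the pool:
+
+	lsn_t	lsn = buf_page_get_newest_modification(bpage);
+	if (lsn != 0) {
+		... the page was modified at least once ...
+	}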
+@return newest modification to page */ +UNIV_INLINE +lsn_t +buf_page_get_newest_modification( +/*=============================*/ + const buf_page_t* bpage) /*!< in: block containing the + page frame */ +{ + lsn_t lsn; + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_page_in_file(bpage)) { + lsn = bpage->newest_modification; + } else { + lsn = 0; + } + + mutex_exit(block_mutex); + + return(lsn); +} + +/********************************************************************//** +Increments the modify clock of a frame by 1. The caller must (1) own the +LRU list mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +void +buf_block_modify_clock_inc( +/*=======================*/ + buf_block_t* block) /*!< in: block */ +{ +#ifdef UNIV_SYNC_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*) block); + + ut_ad((mutex_own(&buf_pool->LRU_list_mutex) + && (block->page.buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); +#endif /* UNIV_SYNC_DEBUG */ + + block->modify_clock++; +} + +/********************************************************************//** +Returns the value of the modify clock. The caller must have an s-lock +or x-lock on the block. +@return value */ +UNIV_INLINE +ib_uint64_t +buf_block_get_modify_clock( +/*=======================*/ + buf_block_t* block) /*!< in: block */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); +#endif /* UNIV_SYNC_DEBUG */ + + return(block->modify_clock); +} + +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_fix( +/*===========*/ + buf_block_t* block) /*!< in/out: block to bufferfix */ +{ + ut_ad(!mutex_own(buf_page_get_mutex(&block->page))); +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32(&block->page.buf_fix_count, 1); +#else + ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page); + + mutex_enter(block_mutex); + ++block->page.buf_fix_count; + mutex_exit(block_mutex); +#endif /* PAGE_ATOMIC_REF_COUNT */ +} + +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +#ifdef UNIV_SYNC_DEBUG + const char* file, /*!< in: file name */ + ulint line, /*!< in: line */ +#endif /* UNIV_SYNC_DEBUG */ + buf_block_t* block) /*!< in/out: block to bufferfix */ +{ +#ifdef UNIV_SYNC_DEBUG + ibool ret; + + ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line); + ut_a(ret); +#endif /* UNIV_SYNC_DEBUG */ + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32(&block->page.buf_fix_count, 1); +#else + ut_ad(mutex_own(&block->mutex)); + + ++block->page.buf_fix_count; +#endif /* PAGE_ATOMIC_REF_COUNT */ +} + +/*******************************************************************//** +Decrements the bufferfix count. 
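+
+A bufferfix must pair with an earlier buf_block_fix() on the same block.
+A minimal illustrative pairing (hypothetical call site):
+
+	buf_block_fix(block);
+	... access block->frame; the block cannot be evicted meanwhile ...
+	buf_block_unfix(block);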
*/ +UNIV_INLINE +void +buf_block_unfix( +/*============*/ + buf_block_t* block) /*!< in/out: block to bufferunfix */ +{ + ut_ad(block->page.buf_fix_count > 0); + ut_ad(!mutex_own(buf_page_get_mutex(&block->page))); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_decrement_uint32(&block->page.buf_fix_count, 1); +#else + ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page); + + mutex_enter(block_mutex); + --block->page.buf_fix_count; + mutex_exit(block_mutex); +#endif /* PAGE_ATOMIC_REF_COUNT */ +} + +/*******************************************************************//** +Decrements the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_dec( +/*==================*/ + buf_block_t* block) /*!< in/out: block to bufferunfix */ +{ + ut_ad(block->page.buf_fix_count > 0); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_decrement_uint32(&block->page.buf_fix_count, 1); +#else + mutex_enter(&block->mutex); + --block->page.buf_fix_count; + mutex_exit(&block->mutex); +#endif /* PAGE_ATOMIC_REF_COUNT */ + +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&block->debug_latch); +#endif +} + +/******************************************************************//** +Returns the buffer pool instance given space and offset of page +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_get( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page within space */ +{ + ulint fold; + ulint index; + ulint ignored_offset; + + ignored_offset = offset >> 6; /* 2log of BUF_READ_AHEAD_AREA (64)*/ + fold = buf_page_address_fold(space, ignored_offset); + index = fold % srv_buf_pool_instances; + return(&buf_pool_ptr[index]); +} + +/******************************************************************//** +Returns the buffer pool instance given its array index +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_array( +/*================*/ + ulint index) /*!< in: array index to get + buffer pool instance from */ +{ + ut_ad(index < MAX_BUFFER_POOLS); + ut_ad(index < srv_buf_pool_instances); + return(&buf_pool_ptr[index]); +} + +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +@return block, NULL if not found */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_low( +/*==================*/ + buf_pool_t* buf_pool,/*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space */ + ulint fold) /*!< in: buf_page_address_fold(space, offset) */ +{ + buf_page_t* bpage; + +#ifdef UNIV_SYNC_DEBUG + ulint hash_fold; + prio_rw_lock_t* hash_lock; + + hash_fold = buf_page_address_fold(space, offset); + ut_ad(hash_fold == fold); + + hash_lock = hash_get_lock(buf_pool->page_hash, fold); + ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX) + || rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Look for the page in the hash table */ + + HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage, + ut_ad(bpage->in_page_hash && !bpage->in_zip_hash + && buf_page_in_file(bpage)), + bpage->space == space && bpage->offset == offset); + if (bpage) { + ut_a(buf_page_in_file(bpage)); + ut_ad(bpage->in_page_hash); + ut_ad(!bpage->in_zip_hash); + } + + return(bpage); +} + +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. 
Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + prio_rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode) /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ +{ + buf_page_t* bpage = NULL; + ulint fold; + prio_rw_lock_t* hash_lock; + ulint mode = RW_LOCK_SHARED; + + if (lock != NULL) { + *lock = NULL; + ut_ad(lock_mode == RW_LOCK_EX + || lock_mode == RW_LOCK_SHARED); + mode = lock_mode; + } + + fold = buf_page_address_fold(space, offset); + hash_lock = hash_get_lock(buf_pool->page_hash, fold); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX) + && !rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (mode == RW_LOCK_SHARED) { + rw_lock_s_lock(hash_lock); + } else { + rw_lock_x_lock(hash_lock); + } + + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + + if (!bpage || buf_pool_watch_is_sentinel(buf_pool, bpage)) { + bpage = NULL; + goto unlock_and_exit; + } + + ut_ad(buf_page_in_file(bpage)); + ut_ad(offset == bpage->offset); + ut_ad(space == bpage->space); + + if (lock == NULL) { + /* The caller wants us to release the page_hash lock */ + goto unlock_and_exit; + } else { + /* To be released by the caller */ + *lock = hash_lock; + goto exit; + } + +unlock_and_exit: + if (mode == RW_LOCK_SHARED) { + rw_lock_s_unlock(hash_lock); + } else { + rw_lock_x_unlock(hash_lock); + } +exit: + return(bpage); +} + +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found */ +UNIV_INLINE +buf_block_t* +buf_block_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + prio_rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode) /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ +{ + buf_page_t* bpage = buf_page_hash_get_locked(buf_pool, + space, + offset, + lock, + lock_mode); + buf_block_t* block = buf_page_get_block(bpage); + + if (block) { + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!lock || rw_lock_own(*lock, lock_mode)); +#endif /* UNIV_SYNC_DEBUG */ + return(block); + } else if (bpage) { + /* It is not a block. 
Just a bpage */
+		ut_ad(buf_page_in_file(bpage));
+
+		if (lock) {
+			if (lock_mode == RW_LOCK_SHARED) {
+				rw_lock_s_unlock(*lock);
+			} else {
+				rw_lock_x_unlock(*lock);
+			}
+		}
+		*lock = NULL;
+		return(NULL);
+	}
+
+	ut_ad(!bpage);
+	ut_ad(lock == NULL || *lock == NULL);
+	return(NULL);
+}
+
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that it is possible that the page is not yet read from disk,
+though.
+
+@return TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+
+	return(buf_page_hash_get(buf_pool, space, offset) != NULL);
+}
+
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+	buf_page_t*	bpage)		/*!< in: buffer block */
+{
+	buf_block_t*	block;
+
+	block = (buf_block_t*) bpage;
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_FILE_PAGE:
+#ifdef UNIV_SYNC_DEBUG
+		rw_lock_s_unlock(&block->debug_latch);
+#endif /* UNIV_SYNC_DEBUG */
+		/* Fall through */
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+		buf_block_unfix(block);
+		return;
+
+	case BUF_BLOCK_POOL_WATCH:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		break;
+	}
+
+	ut_error;
+}
+
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+	buf_block_t*	block,		/*!< in: buffer block */
+	ulint		rw_latch)	/*!< in: RW_S_LATCH, RW_X_LATCH,
+					RW_NO_LATCH */
+{
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_s_unlock(&(block->debug_latch));
+#endif
+	if (rw_latch == RW_S_LATCH) {
+		rw_lock_s_unlock(&(block->lock));
+	} else if (rw_latch == RW_X_LATCH) {
+		rw_lock_x_unlock(&(block->lock));
+	}
+
+	buf_block_unfix(block);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+	buf_block_t*	block,	/*!< in: buffer page
+				where we have acquired latch */
+	ulint		level)	/*!< in: latching order level */
+{
+	sync_thread_add_level(&block->lock, level, FALSE);
+}
+
+#endif /* UNIV_SYNC_DEBUG */
+/*********************************************************************//**
+Get the nth chunk's buffer block in the specified buffer pool.
+@return the nth chunk's buffer block.
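+
+An illustrative iteration over all chunks of one instance, assuming (as in
+buf0buf.h) that buf_pool->n_chunks holds the number of chunks:
+
+	ulint	chunk_size;
+
+	for (ulint n = 0; n < buf_pool->n_chunks; n++) {
+		buf_block_t*	blocks = buf_get_nth_chunk_block(
+			buf_pool, n, &chunk_size);
+		... blocks[0] .. blocks[chunk_size - 1] belong to chunk n ...
+	}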
*/
+UNIV_INLINE
+buf_block_t*
+buf_get_nth_chunk_block(
+/*====================*/
+	const buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint			n,		/*!< in: nth chunk in the buffer pool */
+	ulint*			chunk_size)	/*!< out: chunk size */
+{
+	const buf_chunk_t*	chunk;
+
+	chunk = buf_pool->chunks + n;
+	*chunk_size = chunk->size;
+	return(chunk->blocks);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Checks if buf_pool->zip_mutex is owned and is serving for a given page as its
+block mutex.
+@return true if buf_pool->zip_mutex is owned. */
+UNIV_INLINE
+bool
+buf_own_zip_mutex_for_page(
+/*=======================*/
+	const buf_page_t*	bpage)
+{
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+
+	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE
+	      || buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
+	ut_ad(buf_page_get_mutex(bpage) == &buf_pool->zip_mutex);
+
+	return(mutex_own(&buf_pool->zip_mutex));
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/buf0checksum.h b/storage/xtradb/include/buf0checksum.h
new file mode 100644
index 00000000000..cd21781dc6e
--- /dev/null
+++ b/storage/xtradb/include/buf0checksum.h
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0checksum_h
+#define buf0checksum_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "buf0types.h"
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+	const byte*	page);	/*!< in: buffer page */
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+	const byte*	page);	/*!< in: buffer page */
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page.
This calculates that old +checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! +@return checksum */ +UNIV_INTERN +ulint +buf_calc_page_old_checksum( +/*=======================*/ + const byte* page); /*!< in: buffer page */ + +#ifndef UNIV_INNOCHECKSUM + +/********************************************************************//** +Return a printable string describing the checksum algorithm. +@return algorithm name */ +UNIV_INTERN +const char* +buf_checksum_algorithm_name( +/*========================*/ + srv_checksum_algorithm_t algo); /*!< in: algorithm */ + +extern ulong srv_checksum_algorithm; + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif /* buf0checksum_h */ diff --git a/storage/xtradb/include/buf0dblwr.h b/storage/xtradb/include/buf0dblwr.h new file mode 100644 index 00000000000..a62a6400d97 --- /dev/null +++ b/storage/xtradb/include/buf0dblwr.h @@ -0,0 +1,162 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0dblwr.h +Doublewrite buffer module + +Created 2011/12/19 Inaam Rana +*******************************************************/ + +#ifndef buf0dblwr_h +#define buf0dblwr_h + +#include "univ.i" +#include "ut0byte.h" +#include "log0log.h" +#include "log0recv.h" + +#ifndef UNIV_HOTBACKUP + +/** Doublewrite system */ +extern buf_dblwr_t* buf_dblwr; +/** Set to TRUE when the doublewrite buffer is being created */ +extern ibool buf_dblwr_being_created; + +/****************************************************************//** +Creates the doublewrite buffer to a new InnoDB installation. The header of the +doublewrite buffer is placed on the trx system header page. */ +UNIV_INTERN +void +buf_dblwr_create(void); +/*==================*/ + +/****************************************************************//** +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function loads the pages from double write buffer into memory. */ +void +buf_dblwr_init_or_load_pages( +/*=========================*/ + os_file_t file, + char* path, + bool load_corrupt_pages); + +/****************************************************************//** +Process the double write buffer pages. 
*/
+void
+buf_dblwr_process(void);
+/*===================*/
+
+/****************************************************************//**
+Frees the doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void);
+/*================*/
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(
+/*=============*/
+	const buf_page_t*	bpage,	/*!< in: buffer block descriptor */
+	buf_flush_t		flush_type);/*!< in: flush type */
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+	ulint	page_no);	/*!< in: page number */
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+	buf_page_t*	bpage);	/*!< in: buffer block to write */
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void);
+/*=================================*/
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, syncs it, then writes
+the page to the datafile and syncs the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+	buf_page_t*	bpage,	/*!< in: buffer block to write */
+	bool		sync);	/*!< in: true if sync IO requested */
+
+/** Doublewrite control struct */
+struct buf_dblwr_t{
+	ib_mutex_t	mutex;	/*!< mutex protecting the first_free
+				field and write_buf */
+	ulint		block1;	/*!< the page number of the first
+				doublewrite block (64 pages) */
+	ulint		block2;	/*!< page number of the second block */
+	ulint		first_free;/*!< first free position in write_buf
+				measured in units of UNIV_PAGE_SIZE */
+	ulint		b_reserved;/*!< number of slots currently reserved
+				for batch flush. */
+	os_event_t	b_event;/*!< event where threads wait for a
+				batch flush to end. */
+	ulint		s_reserved;/*!< number of slots currently
+				reserved for single page flushes. */
+	os_event_t	s_event;/*!< event where threads wait for a
+				single page flush slot. */
+	bool*		in_use;	/*!< flag used to indicate if a slot is
+				in use. Only used for single page
+				flushes. */
+	bool		batch_running;/*!< set to TRUE if currently a batch
+				is being written from the doublewrite
+				buffer.
*/
+	byte*		write_buf;/*!< write buffer used in writing to the
+				doublewrite buffer, aligned to an
+				address divisible by UNIV_PAGE_SIZE
+				(which is required by Windows aio) */
+	byte*		write_buf_unaligned;/*!< pointer to write_buf,
+				but unaligned */
+	buf_page_t**	buf_block_arr;/*!< array to store pointers to
+				the buffer blocks which have been
+				cached to write_buf */
+};
+
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/buf0dump.h b/storage/xtradb/include/buf0dump.h
new file mode 100644
index 00000000000..c704a8e97e0
--- /dev/null
+++ b/storage/xtradb/include/buf0dump.h
@@ -0,0 +1,72 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+#include "univ.i"
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because the whole MySQL server is frozen
+during its execution. */
+UNIV_INTERN
+void
+buf_dump_start();
+/*============*/
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because the whole MySQL server is frozen
+during its execution. */
+UNIV_INTERN
+void
+buf_load_start();
+/*============*/
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because the whole MySQL server is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort();
+/*============*/
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and, when woken up, either performs a dump or a load, and sleeps
+again.
+@return this function does not return, it calls os_thread_exit() */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(buf_dump_thread)( +/*============================*/ + void* arg); /*!< in: a dummy parameter + required by os_thread_create */ + +#endif /* buf0dump_h */ diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h new file mode 100644 index 00000000000..56b0c314b5c --- /dev/null +++ b/storage/xtradb/include/buf0flu.h @@ -0,0 +1,311 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0flu.h +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0flu_h +#define buf0flu_h + +#include "univ.i" +#include "ut0byte.h" +#include "log0log.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0types.h" +#include "buf0types.h" + +/** Flag indicating if the page_cleaner is in active state. */ +extern ibool buf_page_cleaner_is_active; + +/** Flag indicating if the lru_manager is in active state. */ +extern bool buf_lru_manager_is_active; + +/********************************************************************//** +Remove a block from the flush list of modified blocks. */ +UNIV_INTERN +void +buf_flush_remove( +/*=============*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +/*******************************************************************//** +Relocates a buffer control block on the flush_list. +Note that it is assumed that the contents of bpage has already been +copied to dpage. */ +UNIV_INTERN +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage); /*!< in/out: destination block */ +/********************************************************************//** +Updates the flush system data structures when a write is completed. */ +UNIV_INTERN +void +buf_flush_write_complete( +/*=====================*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +#endif /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Initializes a page for writing to the tablespace. 
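+
+A minimal illustrative call, as a flush routine might issue it (hypothetical
+call site; the actual callers are the flush code in buf0flu.cc):
+
+	buf_flush_init_for_writing(
+		block->frame,
+		block->page.zip.data ? &block->page.zip : NULL,
+		block->page.newest_modification);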
*/
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+	byte*	page,		/*!< in/out: page */
+	void*	page_zip_,	/*!< in/out: compressed page, or NULL */
+	lsn_t	newest_lsn);	/*!< in: newest modification lsn
+				to the page */
+#ifndef UNIV_HOTBACKUP
+# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: block and LRU list mutexes must be held upon entering this function, and
+they will be released by this function after flushing. This is loosely based on
+buf_flush_batch() and buf_flush_page().
+@return TRUE if the page was flushed and the mutexes released */
+UNIV_INTERN
+ibool
+buf_flush_page_try(
+/*===============*/
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
+	buf_block_t*	block)		/*!< in/out: buffer control block */
+	__attribute__((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush list of
+all buffer pool instances.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully for each buffer pool
+instance. false if another batch of the same type was already running in
+at least one of the buffer pool instances */
+UNIV_INTERN
+bool
+buf_flush_list(
+/*===========*/
+	ulint	min_n,		/*!< in: wished minimum number of blocks
+				flushed (it is not guaranteed that the
+				actual number is that big, though) */
+	lsn_t	lsn_limit,	/*!< in the case BUF_FLUSH_LIST all
+				blocks whose oldest_modification is
+				smaller than this should be flushed
+				(if their number does not exceed
+				min_n), otherwise ignored */
+	ulint*	n_processed);	/*!< out: the number of pages
+				which were processed is passed
+				back to caller. Ignored if NULL */
+/******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list, i.e.
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE if successful. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+	buf_pool_t*	buf_pool);	/*!< in/out: buffer pool instance */
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_flush_t	type);		/*!< in: BUF_FLUSH_LRU
+					or BUF_FLUSH_LIST */
+/******************************************************************//**
+Waits until a flush batch of the given type ends. This is called by
+a thread that only wants to wait for a flush to end but doesn't do
+any flushing itself. */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end_wait_only(
+/*===============================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_flush_t	type);		/*!< in: BUF_FLUSH_LRU
+					or BUF_FLUSH_LIST */
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it.
Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+	buf_block_t*	block,	/*!< in: block which is modified */
+	mtr_t*		mtr);	/*!< in: mtr */
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+	buf_block_t*	block,		/*!< in: block which is modified */
+	lsn_t		start_lsn,	/*!< in: start lsn of the first mtr in a
+					set of mtr's */
+	lsn_t		end_lsn);	/*!< in: end lsn of the last mtr in the
+					set of mtr's */
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., transition FILE_PAGE => NOT_USED allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+	buf_page_t*	bpage);	/*!< in: buffer control block, must be
+				buf_page_in_file(bpage) and in the LRU list */
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pool flush lists. As of now we'll have only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/******************************************************************//**
+lru_manager thread tasked with performing LRU flushes and evictions to refill
+the buffer pool free lists. As of now we'll have only one instance of this
+thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_lru_manager_thread)(
+/*=========================================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/*********************************************************************//**
+Clears up the tail of the LRU lists:
+* Put replaceable pages at the tail of the LRU to the free list
+* Flush dirty pages at the tail of the LRU to the disk
+The depth to which we scan each buffer pool is controlled by the dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+buf_flush_LRU_tail(void);
+/*====================*/
+/*********************************************************************//**
+Waits for any possible LRU flushes that are in progress to end. */
+UNIV_INTERN
+void
+buf_flush_wait_LRU_batch_end(void);
+/*==============================*/
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(
+/*===============*/
+	buf_pool_t*	buf_pool);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/********************************************************************//**
+Initializes the red-black tree to speed up insertions into the flush_list
+during the recovery process. Should be called at the start of the recovery
+process before any page has been read or written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void);
+/*==========================*/
+
+/********************************************************************//**
+Frees up the red-black tree.
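+
+The expected pairing with buf_flush_init_flush_rbt() during recovery,
+sketched for illustration only:
+
+	buf_flush_init_flush_rbt();
+	... recovery notes page modifications via
+	buf_flush_recv_note_modification() ...
+	buf_flush_free_flush_rbt();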
*/ +UNIV_INTERN +void +buf_flush_free_flush_rbt(void); +/*==========================*/ + +/********************************************************************//** +Writes a flushable page asynchronously from the buffer pool to a file. +NOTE: in simulated aio we must call +os_aio_simulated_wake_handler_threads after we have posted a batch of +writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this +function, and they will be released by this function if it returns true. +LRU_list_mutex must be held iff performing a single page flush and will be +released by the function if it returns true. +@return TRUE if the page was flushed */ +UNIV_INTERN +bool +buf_flush_page( +/*===========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_page_t* bpage, /*!< in: buffer control block */ + buf_flush_t flush_type, /*!< in: type of flush */ + bool sync); /*!< in: true if sync IO request */ +/********************************************************************//** +Returns true if the block is modified and ready for flushing. +@return true if can flush immediately */ +UNIV_INTERN +bool +buf_flush_ready_for_flush( +/*======================*/ + buf_page_t* bpage, /*!< in: buffer control block, must be + buf_page_in_file(bpage) */ + buf_flush_t flush_type)/*!< in: type of flush */ + __attribute__((warn_unused_result)); + +#ifdef UNIV_DEBUG +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush +list in a particular buffer pool. +@return number of dirty pages present in a single buffer pool */ +UNIV_INTERN +ulint +buf_pool_get_dirty_pages_count( +/*===========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool */ + ulint id); /*!< in: space id to check */ +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush list. +@return count of dirty pages present in all the buffer pools */ +UNIV_INTERN +ulint +buf_flush_get_dirty_pages_count( +/*============================*/ + ulint id); /*!< in: space id to check */ +#endif /* UNIV_DEBUG */ + +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************************//** +Check if a flush list flush is in progress for any buffer pool instance, or if +all the instances are clean, for heuristic purposes. +@return true if flush list flush is in progress or buffer pool is clean */ +UNIV_INLINE +bool +buf_flush_flush_list_in_progress(void) +/*==================================*/ + __attribute__((warn_unused_result)); + +#ifndef UNIV_NONINL +#include "buf0flu.ic" +#endif + +#endif diff --git a/storage/xtradb/include/buf0flu.ic b/storage/xtradb/include/buf0flu.ic new file mode 100644 index 00000000000..06fa49754cd --- /dev/null +++ b/storage/xtradb/include/buf0flu.ic @@ -0,0 +1,167 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0flu.ic +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +#include "buf0buf.h" +#include "mtr0mtr.h" +#include "srv0srv.h" + +/********************************************************************//** +Inserts a modified block into the flush list. */ +UNIV_INTERN +void +buf_flush_insert_into_flush_list( +/*=============================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + lsn_t lsn); /*!< in: oldest modification */ +/********************************************************************//** +Inserts a modified block into the flush list in the right sorted position. +This function is used by recovery, because there the modifications do not +necessarily come in the order of lsn's. */ +UNIV_INTERN +void +buf_flush_insert_sorted_into_flush_list( +/*====================================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + lsn_t lsn); /*!< in: oldest modification */ + +/********************************************************************//** +This function should be called at a mini-transaction commit, if a page was +modified in it. Puts the block to the list of modified blocks, if it is not +already in it. */ +UNIV_INLINE +void +buf_flush_note_modification( +/*========================*/ + buf_block_t* block, /*!< in: block which is modified */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_pool_t* buf_pool = buf_pool_from_block(block); + + ut_ad(!srv_read_only_mode); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + ut_ad(!mtr->made_dirty || log_flush_order_mutex_own()); + + ut_ad(mtr->start_lsn != 0); + ut_ad(mtr->modifications); + + mutex_enter(&block->mutex); + ut_ad(block->page.newest_modification <= mtr->end_lsn); + + block->page.newest_modification = mtr->end_lsn; + + if (!block->page.oldest_modification) { + ut_a(mtr->made_dirty); + ut_ad(log_flush_order_mutex_own()); + buf_flush_insert_into_flush_list( + buf_pool, block, mtr->start_lsn); + } else { + ut_ad(block->page.oldest_modification <= mtr->start_lsn); + } + + mutex_exit(&block->mutex); + + srv_stats.buf_pool_write_requests.inc(); +} + +/********************************************************************//** +This function should be called when recovery has modified a buffer page. 
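+
+For instance (an illustrative sketch only), after applying the redo records
+of a set of mtr's to a block, recovery would call
+
+	buf_flush_recv_note_modification(block, start_lsn, end_lsn);
+
+where start_lsn and end_lsn delimit the applied records.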
*/ +UNIV_INLINE +void +buf_flush_recv_note_modification( +/*=============================*/ + buf_block_t* block, /*!< in: block which is modified */ + lsn_t start_lsn, /*!< in: start lsn of the first mtr in a + set of mtr's */ + lsn_t end_lsn) /*!< in: end lsn of the last mtr in the + set of mtr's */ +{ + buf_pool_t* buf_pool = buf_pool_from_block(block); + + ut_ad(!srv_read_only_mode); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + ut_ad(log_flush_order_mutex_own()); + + ut_ad(start_lsn != 0); + ut_ad(block->page.newest_modification <= end_lsn); + + mutex_enter(&block->mutex); + block->page.newest_modification = end_lsn; + + if (!block->page.oldest_modification) { + buf_flush_insert_sorted_into_flush_list( + buf_pool, block, start_lsn); + } else { + ut_ad(block->page.oldest_modification <= start_lsn); + } + + mutex_exit(&block->mutex); + +} +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************************//** +Check if a flush list flush is in progress for any buffer pool instance, or if +all the instances are clean, for heuristic purposes. +@return true if flush list flush is in progress or buffer pool is clean */ +UNIV_INLINE +bool +buf_flush_flush_list_in_progress(void) +/*==================================*/ +{ + bool all_clean = true; + + for (ulint i = 0; i < srv_buf_pool_instances; i++) { + + const buf_pool_t* buf_pool = buf_pool_from_array(i); + if (buf_pool->init_flush[BUF_FLUSH_LIST] + || buf_pool->n_flush[BUF_FLUSH_LIST]) { + + return(true); + } + + if (all_clean) { + + all_clean = (UT_LIST_GET_LEN(buf_pool->flush_list) + == 0); + } + + } + return(all_clean); +} diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h new file mode 100644 index 00000000000..6415540178c --- /dev/null +++ b/storage/xtradb/include/buf0lru.h @@ -0,0 +1,312 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0lru.h +The database buffer pool LRU replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0lru_h +#define buf0lru_h + +#include "univ.i" +#ifndef UNIV_HOTBACKUP +#include "ut0byte.h" +#include "buf0types.h" + +// Forward declaration +struct trx_t; + +/******************************************************************//** +Returns TRUE if less than 25 % of the buffer pool is available. 
This can be +used in heuristics to prevent huge transactions eating up the whole buffer +pool for their locks. +@return TRUE if less than 25 % of buffer pool left */ +UNIV_INTERN +ibool +buf_LRU_buf_pool_running_out(void); +/*==============================*/ + +/*####################################################################### +These are low-level functions +#########################################################################*/ + +/** Minimum LRU list length for which the LRU_old pointer is defined */ +#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ + +/******************************************************************//** +Flushes all dirty pages or removes all pages belonging +to a given tablespace. A PROBLEM: if readahead is being started, what +guarantees that it will not try to read in pages after this operation +has completed? */ +UNIV_INTERN +void +buf_LRU_flush_or_remove_pages( +/*==========================*/ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove, /*!< in: remove or flush strategy */ + const trx_t* trx); /*!< to check if the operation must + be interrupted */ + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/********************************************************************//** +Insert a compressed block into buf_pool->zip_clean in the LRU order. */ +UNIV_INTERN +void +buf_LRU_insert_zip_clean( +/*=====================*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/******************************************************************//** +Try to free a block. If bpage is a descriptor of a compressed-only +page, the descriptor object will be freed as well. + +NOTE: If this function returns true, it will release the LRU list mutex, +and temporarily release and relock the buf_page_get_mutex() mutex. +Furthermore, the page frame will no longer be accessible via bpage. If this +function returns false, the buf_page_get_mutex() might be temporarily released +and relocked too. + +The caller must hold the LRU list and buf_page_get_mutex() mutexes. + +@return true if freed, false otherwise. */ +UNIV_INTERN +bool +buf_LRU_free_page( +/*==============*/ + buf_page_t* bpage, /*!< in: block to be freed */ + bool zip) /*!< in: true if should remove also the + compressed page of an uncompressed page */ + __attribute__((nonnull)); +/******************************************************************//** +Try to free a replaceable block. +@return TRUE if found and freed */ +UNIV_INTERN +ibool +buf_LRU_scan_and_free_block( +/*========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ibool scan_all) /*!< in: scan whole LRU list + if TRUE, otherwise scan only + 'old' blocks. */ + __attribute__((nonnull,warn_unused_result)); +/******************************************************************//** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, returns NULL. +@return a free control block, or NULL if the buf_block->free list is empty */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_only( +/*==================*/ + buf_pool_t* buf_pool); /*!< buffer pool instance */ +/******************************************************************//** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, blocks are moved from the end of the +LRU list to the free list. +This function is called from a user thread when it needs a clean +block to read in a page. 
Note that we only ever get a block from +the free list. Even when we flush a page or find a page in LRU scan +we put it to free list to be used. +* iteration 0: + * get a block from free list, success:done + * if there is an LRU flush batch in progress: + * wait for batch to end: retry free list + * if buf_pool->try_LRU_scan is set + * scan LRU up to srv_LRU_scan_depth to find a clean block + * the above will put the block on free list + * success:retry the free list + * flush one dirty page from tail of LRU to disk + * the above will put the block on free list + * success: retry the free list +* iteration 1: + * same as iteration 0 except: + * scan whole LRU list + * scan LRU list even if buf_pool->try_LRU_scan is not set +* iteration > 1: + * same as iteration 1 but sleep 100ms +@return the free control block, in state BUF_BLOCK_READY_FOR_USE */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_block( +/*===================*/ + buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */ + __attribute__((nonnull,warn_unused_result)); +/******************************************************************//** +Determines if the unzip_LRU list should be used for evicting a victim +instead of the general LRU list. +@return TRUE if should use unzip_LRU */ +UNIV_INTERN +ibool +buf_LRU_evict_from_unzip_LRU( +/*=========================*/ + buf_pool_t* buf_pool); +/******************************************************************//** +Puts a block back to the free list. */ +UNIV_INTERN +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block); /*!< in: block, must not contain a file page */ +/******************************************************************//** +Adds a block to the LRU list. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ +UNIV_INTERN +void +buf_LRU_add_block( +/*==============*/ + buf_page_t* bpage, /*!< in: control block */ + ibool old); /*!< in: TRUE if should be put to the old + blocks in the LRU list, else put to the + start; if the LRU list is very short, added to + the start regardless of this parameter */ +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +UNIV_INTERN +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old); /*!< in: TRUE if should be put to the end + of the list, else put to the start */ +/******************************************************************//** +Moves a block to the start of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_young( +/*=====================*/ + buf_page_t* bpage); /*!< in: control block */ +/******************************************************************//** +Moves a block to the end of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_old( +/*===================*/ + buf_page_t* bpage); /*!< in: control block */ +/**********************************************************************//** +Updates buf_pool->LRU_old_ratio. +@return updated old_pct */ +UNIV_INTERN +ulint +buf_LRU_old_ratio_update( +/*=====================*/ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. 
*/ + ibool adjust);/*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_pool->LRU_old_ratio + during the initialization of InnoDB */ +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +UNIV_INTERN +void +buf_LRU_stat_update(void); +/*=====================*/ + +/******************************************************************//** +Remove one page from LRU list and put it to free list */ +UNIV_INTERN +void +buf_LRU_free_one_page( +/*==================*/ + buf_page_t* bpage) /*!< in/out: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ + __attribute__((nonnull)); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Validates the LRU list. +@return TRUE */ +UNIV_INTERN +ibool +buf_LRU_validate(void); +/*==================*/ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Prints the LRU list. */ +UNIV_INTERN +void +buf_LRU_print(void); +/*===============*/ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/** @name Heuristics for detecting index scan @{ */ +/** The denominator of buf_pool->LRU_old_ratio. */ +#define BUF_LRU_OLD_RATIO_DIV 1024 +/** Maximum value of buf_pool->LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool->LRU_old_ratio_update */ +#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV +/** Minimum value of buf_pool->LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool->LRU_old_ratio_update +The minimum must exceed +(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */ +#define BUF_LRU_OLD_RATIO_MIN 51 + +#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX +# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX" +#endif +#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV +# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV" +#endif + +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +extern uint buf_LRU_old_threshold_ms; +/* @} */ + +/** @brief Statistics for selecting the LRU list for eviction. + +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics we decide +if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */ +struct buf_LRU_stat_t +{ + ulint io; /**< Counter of buffer pool I/O operations. */ + ulint unzip; /**< Counter of page_zip_decompress operations. */ +}; + +/** Current operation counters. Not protected by any mutex. +Cleared by buf_LRU_stat_update(). */ +extern buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). */ +extern buf_LRU_stat_t buf_LRU_stat_sum; + +/********************************************************************//** +Increments the I/O counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++ +/********************************************************************//** +Increments the page_zip_decompress() counter in buf_LRU_stat_cur. 
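+The counters are bumped as side effects on the I/O and decompression paths
+and later compared to pick the eviction source; an illustrative sketch (the
+exact weighting used by buf_LRU_evict_from_unzip_LRU() is elided):
+
+	buf_LRU_stat_inc_io();		// after a buffer pool I/O
+	buf_LRU_stat_inc_unzip();	// after a page_zip_decompress()
+	// buf_LRU_stat_update() periodically folds buf_LRU_stat_cur into
+	// buf_LRU_stat_sum, which drives the unzip_LRU vs LRU decision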
*/ +#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++ + +#ifndef UNIV_NONINL +#include "buf0lru.ic" +#endif + +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/xtradb/include/buf0lru.ic b/storage/xtradb/include/buf0lru.ic new file mode 100644 index 00000000000..6e0da7a2588 --- /dev/null +++ b/storage/xtradb/include/buf0lru.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0lru.ic +The database buffer replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + diff --git a/storage/xtradb/include/buf0rea.h b/storage/xtradb/include/buf0rea.h new file mode 100644 index 00000000000..9adeaa7455a --- /dev/null +++ b/storage/xtradb/include/buf0rea.h @@ -0,0 +1,179 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0rea.h +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0rea_h +#define buf0rea_h + +#include "univ.i" +#include "buf0types.h" + +/********************************************************************//** +High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. 
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	offset,	/*!< in: page number */
+	trx_t*	trx);
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: page number */
+/********************************************************************//**
+Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+UNIV_INTERN
+ulint
+buf_read_ahead_random(
+/*==================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes,
+				or 0 */
+	ulint	offset,		/*!< in: page number of a page which
+				the current thread wants to access */
+	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf
+				routine */
+	trx_t*	trx);
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is +set to prevent unintended read-aheads performed by ibuf routines, a situation +which could result in a deadlock if the OS does not support asynchronous io. +@return number of page read requests issued */ +UNIV_INTERN +ulint +buf_read_ahead_linear( +/*==================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ + ulint offset, /*!< in: page number; see NOTE 3 above */ + ibool inside_ibuf, /*!< in: TRUE if we are inside ibuf routine */ + trx_t* trx); +/********************************************************************//** +Issues read requests for pages which the ibuf module wants to read in, in +order to contract the insert buffer tree. Technically, this function is like +a read-ahead function. */ +UNIV_INTERN +void +buf_read_ibuf_merge_pages( +/*======================*/ + bool sync, /*!< in: true if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + const ulint* space_ids, /*!< in: array of space ids */ + const ib_int64_t* space_versions,/*!< in: the spaces must have + this version number + (timestamp), otherwise we + discard the read; we use this + to cancel reads if DISCARD + + IMPORT may have changed the + tablespace size */ + const ulint* page_nos, /*!< in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored); /*!< in: number of elements + in the arrays */ +/********************************************************************//** +Issues read requests for pages which recovery wants to read in. */ +UNIV_INTERN +void +buf_read_recv_pages( +/*================*/ + ibool sync, /*!< in: TRUE if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in + bytes, or 0 */ + const ulint* page_nos, /*!< in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored); /*!< in: number of page numbers + in the array */ + +/** The size in pages of the area which the read-ahead algorithms read if +invoked */ +#define BUF_READ_AHEAD_AREA(b) ((b)->read_ahead_area) + +/** @name Modes used in read-ahead @{ */ +/** read only pages belonging to the insert buffer tree */ +#define BUF_READ_IBUF_PAGES_ONLY 131 +/** read any page */ +#define BUF_READ_ANY_PAGE 132 +/** read any page, but ignore (return an error) if a page does not exist +instead of crashing like BUF_READ_ANY_PAGE does */ +#define BUF_READ_IGNORE_NONEXISTENT_PAGES 1024 +/* @} */ + +#endif diff --git a/storage/xtradb/include/buf0types.h b/storage/xtradb/include/buf0types.h new file mode 100644 index 00000000000..4eb5ea18cef --- /dev/null +++ b/storage/xtradb/include/buf0types.h @@ -0,0 +1,157 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0types_h
+#define buf0types_h
+
+#if defined(INNODB_PAGE_ATOMIC_REF_COUNT) && defined(HAVE_ATOMIC_BUILTINS)
+#define PAGE_ATOMIC_REF_COUNT
+#endif /* INNODB_PAGE_ATOMIC_REF_COUNT && HAVE_ATOMIC_BUILTINS */
+
+/** Buffer page (uncompressed or compressed) */
+struct buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+struct buf_block_t;
+/** Buffer pool chunk comprising buf_block_t */
+struct buf_chunk_t;
+/** Buffer pool comprising buf_chunk_t */
+struct buf_pool_t;
+/** Buffer pool statistics struct */
+struct buf_pool_stat_t;
+/** Buffer pool buddy statistics struct */
+struct buf_buddy_stat_t;
+/** Doublewrite memory struct */
+struct buf_dblwr_t;
+
+/** A buffer frame. @see page_t */
+typedef	byte	buf_frame_t;
+
+/** Flags for flush types */
+enum buf_flush_t {
+	BUF_FLUSH_LRU = 0,		/*!< flush via the LRU list */
+	BUF_FLUSH_LIST,			/*!< flush via the flush list
+					of dirty blocks */
+	BUF_FLUSH_SINGLE_PAGE,		/*!< flush via the LRU list
+					but only a single page */
+	BUF_FLUSH_N_TYPES		/*!< index of last element + 1 */
+};
+
+/** Algorithm to remove the pages for a tablespace from the buffer pool.
+See buf_LRU_flush_or_remove_pages(). */
+enum buf_remove_t {
+	BUF_REMOVE_ALL_NO_WRITE,	/*!< Remove all pages from the buffer
+					pool, don't write or sync to disk */
+	BUF_REMOVE_FLUSH_NO_WRITE,	/*!< Remove only, from the flush list,
+					don't write or sync to disk */
+	BUF_REMOVE_FLUSH_WRITE		/*!< Flush dirty pages to disk only,
+					don't remove from the buffer pool */
+};
+
+/** Flags for io_fix types */
+enum buf_io_fix {
+	BUF_IO_NONE = 0,		/**< no pending I/O */
+	BUF_IO_READ,			/**< read pending */
+	BUF_IO_WRITE,			/**< write pending */
+	BUF_IO_PIN			/**< disallow relocation of
+					block and its removal from
+					the flush_list */
+};
+
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+	SRV_CHECKSUM_ALGORITHM_CRC32,		/*!< Write crc32, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,	/*!< Write crc32, allow crc32
+						when reading */
+	SRV_CHECKSUM_ALGORITHM_INNODB,		/*!< Write innodb, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_INNODB,	/*!< Write innodb, allow
+						innodb when reading */
+	SRV_CHECKSUM_ALGORITHM_NONE,		/*!< Write none, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_NONE	/*!< Write none, allow none
+						when reading */
+};
+
+/** Alternatives for srv_cleaner_lsn_age_factor, set through
+innodb_cleaner_lsn_age_factor variable */
+enum srv_cleaner_lsn_age_factor_t {
+	SRV_CLEANER_LSN_AGE_FACTOR_LEGACY,	/*!< Original Oracle MySQL 5.6
+						formula */
+	SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT
+						/*!< Percona Server 5.6 formula
+						that returns lower values than
+						legacy option for low
+						checkpoint ages, and higher
+						values for high ages. This has
+						the effect of stabilizing the
+						checkpoint age higher.
*/ +}; + +/** Alternatives for srv_foreground_preflush, set through +innodb_foreground_preflush variable */ +enum srv_foreground_preflush_t { + SRV_FOREGROUND_PREFLUSH_SYNC_PREFLUSH, /*!< Original Oracle MySQL 5.6 + behavior of performing a sync + flush list flush */ + SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF /*!< Exponential backoff wait + for the page cleaner to flush + for us */ +}; + +/** Alternatives for srv_empty_free_list_algorithm, set through +innodb_empty_free_list_algorithm variable */ +enum srv_empty_free_list_t { + SRV_EMPTY_FREE_LIST_LEGACY, /*!< Original Oracle MySQL 5.6 + algorithm */ + SRV_EMPTY_FREE_LIST_BACKOFF /*!< Percona Server 5.6 algorithm that + loops in a progressive backoff until a + free page is produced by the cleaner + thread */ +}; + +/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ +/* @{ */ +/** Zip shift value for the smallest page size */ +#define BUF_BUDDY_LOW_SHIFT UNIV_ZIP_SIZE_SHIFT_MIN + +/** Smallest buddy page size */ +#define BUF_BUDDY_LOW (1U << BUF_BUDDY_LOW_SHIFT) + +/** Actual number of buddy sizes based on current page size */ +#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT) + +/** Maximum number of buddy sizes based on the max page size */ +#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX \ + - BUF_BUDDY_LOW_SHIFT) + +/** twice the maximum block size of the buddy system; +the underlying memory is aligned by this amount: +this must be equal to UNIV_PAGE_SIZE */ +#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES) +/* @} */ + +#endif /* buf0types.h */ diff --git a/storage/xtradb/include/data0data.h b/storage/xtradb/include/data0data.h new file mode 100644 index 00000000000..a548c7b89b3 --- /dev/null +++ b/storage/xtradb/include/data0data.h @@ -0,0 +1,536 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0data.h +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef data0data_h +#define data0data_h + +#include "univ.i" + +#include "data0types.h" +#include "data0type.h" +#include "mem0mem.h" +#include "dict0types.h" + +/** Storage for overflow data in a big record, that is, a clustered +index record which needs external storage of data fields */ +struct big_rec_t; + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets pointer to the type struct of SQL data field. 
+@return pointer to the type struct */ +UNIV_INLINE +dtype_t* +dfield_get_type( +/*============*/ + const dfield_t* field) /*!< in: SQL data field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets pointer to the data in a field. +@return pointer to data */ +UNIV_INLINE +void* +dfield_get_data( +/*============*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dfield_get_type(field) (&(field)->type) +# define dfield_get_data(field) ((field)->data) +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /*!< in: SQL data field */ + const dtype_t* type) /*!< in: pointer to data type struct */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets length of field data. +@return length of data; UNIV_SQL_NULL if SQL null data */ +UNIV_INLINE +ulint +dfield_get_len( +/*===========*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets length in a field. */ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /*!< in: field */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + __attribute__((nonnull)); +/*********************************************************************//** +Determines if a field is SQL NULL +@return nonzero if SQL null data */ +UNIV_INLINE +ulint +dfield_is_null( +/*===========*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Determines if a field is externally stored +@return nonzero if externally stored */ +UNIV_INLINE +ulint +dfield_is_ext( +/*==========*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets the "external storage" flag */ +UNIV_INLINE +void +dfield_set_ext( +/*===========*/ + dfield_t* field) /*!< in/out: field */ + __attribute__((nonnull)); +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /*!< in: field */ + const void* data, /*!< in: data */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + __attribute__((nonnull(1))); +/*********************************************************************//** +Sets a data field to SQL NULL. */ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field) /*!< in/out: field */ + __attribute__((nonnull)); +/**********************************************************************//** +Writes an SQL null field full of zeros. */ +UNIV_INLINE +void +data_write_sql_null( +/*================*/ + byte* data, /*!< in: pointer to a buffer of size len */ + ulint len) /*!< in: SQL null size in bytes */ + __attribute__((nonnull)); +/*********************************************************************//** +Copies the data and len fields. 
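+A shallow copy shares the data pointer with the source; to make the copy
+own its bytes, pair it with dfield_dup() (illustrative sketch):
+
+	dfield_copy_data(&dst, &src);	// dst.data still aliases src.data
+	dfield_dup(&dst, heap);		// now dst holds a heap-allocated copy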
*/ +UNIV_INLINE +void +dfield_copy_data( +/*=============*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ + __attribute__((nonnull)); +/*********************************************************************//** +Copies a data field to another. */ +UNIV_INLINE +void +dfield_copy( +/*========*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ + __attribute__((nonnull)); +/*********************************************************************//** +Copies the data pointed to by a data field. */ +UNIV_INLINE +void +dfield_dup( +/*=======*/ + dfield_t* field, /*!< in/out: data field */ + mem_heap_t* heap) /*!< in: memory heap where allocated */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Tests if two data fields are equal. +If len==0, tests the data length and content for equality. +If len>0, tests the first len bytes of the content for equality. +@return TRUE if both fields are NULL or if they are equal */ +UNIV_INLINE +ibool +dfield_datas_are_binary_equal( +/*==========================*/ + const dfield_t* field1, /*!< in: field */ + const dfield_t* field2, /*!< in: field */ + ulint len) /*!< in: maximum prefix to compare, + or 0 to compare the whole field length */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Tests if dfield data length and content is equal to the given. +@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ + __attribute__((nonnull, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Gets number of fields in a data tuple. +@return number of fields */ +UNIV_INLINE +ulint +dtuple_get_n_fields( +/*================*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets nth field of a tuple. +@return nth field */ +UNIV_INLINE +dfield_t* +dtuple_get_nth_field( +/*=================*/ + const dtuple_t* tuple, /*!< in: tuple */ + ulint n); /*!< in: index of field */ +#else /* UNIV_DEBUG */ +# define dtuple_get_nth_field(tuple, n) ((tuple)->fields + (n)) +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Gets info bits in a data tuple. +@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets number of fields used in record comparisons. 
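+For example, a search tuple on a two-column index that constrains only the
+first column is built with both fields but compared on the first one only
+(illustrative sketch):
+
+	dtuple_t*	tuple = dtuple_create(heap, 2);
+	// ... set the first field from the search key ...
+	dtuple_set_n_fields_cmp(tuple, 1);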
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields_cmp)	/*!< in: number of fields used in
+					comparisons in rem0cmp.* */
+	__attribute__((nonnull));
+
+/* Estimate the number of bytes that are going to be allocated when
+creating a new dtuple_t object */
+#define DTUPLE_EST_ALLOC(n_fields)	\
+	(sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t))
+
+/**********************************************************//**
+Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+/*===================*/
+	void*	buf,		/*!< in, out: buffer to use */
+	ulint	buf_size,	/*!< in: buffer size */
+	ulint	n_fields)	/*!< in: number of fields */
+	__attribute__((nonnull, warn_unused_result));
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where the tuple
+				is created, DTUPLE_EST_ALLOC(n_fields)
+				bytes will be allocated from this heap */
+	ulint		n_fields)/*!< in: number of fields */
+	__attribute__((nonnull, malloc));
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you want later to set it smaller, you can use this. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields)	/*!< in: number of fields */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: tuple to copy from */
+	mem_heap_t*	heap)	/*!< in: memory heap
+				where the tuple is created */
+	__attribute__((nonnull, malloc));
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: typed data tuple */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull));
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields.
+@return 1, 0, -1 if tuple1 is greater, equal, less, respectively,
+than tuple2 */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+	const dtuple_t*	tuple1,	/*!< in: tuple 1 */
+	const dtuple_t*	tuple2)	/*!< in: tuple 2 */
+	__attribute__((nonnull, warn_unused_result));
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: the tuple */
+	ulint		n_fields,/*!< in: number of complete fields to fold */
+	ulint		n_bytes,/*!< in: number of bytes to fold in an
+				incomplete last field */
+	index_id_t	tree_id)/*!< in: index tree id */
+	__attribute__((nonnull, pure, warn_unused_result));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+	dtuple_t*	tuple,	/*!< in: data tuple */
+	ulint		n)	/*!< in: number of fields to set */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: dtuple */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+	const dfield_t*	field)	/*!< in: data field */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+	__attribute__((nonnull));
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters.
*/ +UNIV_INTERN +void +dfield_print_also_hex( +/*==================*/ + const dfield_t* dfield) /*!< in: dfield */ + __attribute__((nonnull)); +/**********************************************************//** +The following function prints the contents of a tuple. */ +UNIV_INTERN +void +dtuple_print( +/*=========*/ + FILE* f, /*!< in: output stream */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull)); +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. +@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +UNIV_INTERN +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ + __attribute__((nonnull, malloc, warn_unused_result)); +/**************************************************************//** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. */ +UNIV_INTERN +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: entry whose data was put to vector */ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + __attribute__((nonnull)); +/**************************************************************//** +Frees the memory in a big rec vector. 
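+Either dtuple_convert_back_big_rec(), which also frees the vector, or this
+function releases it; a hedged sketch of the convert-and-free path (error
+handling elided):
+
+	big_rec_t*	big = dtuple_convert_big_rec(index, entry, &n_ext);
+	if (big != NULL) {
+		// ... store the listed fields externally ...
+		dtuple_big_rec_free(big);
+	}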
*/ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + __attribute__((nonnull)); + +/*######################################################################*/ + +/** Structure for an SQL data field */ +struct dfield_t{ + void* data; /*!< pointer to data */ + unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */ + unsigned len:32; /*!< data length; UNIV_SQL_NULL if SQL null */ + dtype_t type; /*!< type of data */ +}; + +/** Structure for an SQL data tuple of fields (logical record) */ +struct dtuple_t { + ulint info_bits; /*!< info bits of an index record: + the default is 0; this field is used + if an index record is built from + a data tuple */ + ulint n_fields; /*!< number of fields in dtuple */ + ulint n_fields_cmp; /*!< number of fields which should + be used in comparison services + of rem0cmp.*; the index search + is performed by comparing only these + fields, others are ignored; the + default value in dtuple creation is + the same value as n_fields */ + dfield_t* fields; /*!< fields */ + UT_LIST_NODE_T(dtuple_t) tuple_list; + /*!< data tuples can be linked into a + list using this field */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number, used in + debug assertions */ +/** Value of dtuple_t::magic_n */ +# define DATA_TUPLE_MAGIC_N 65478679 +#endif /* UNIV_DEBUG */ +}; + +/** A slot for a field in a big rec vector */ +struct big_rec_field_t { + ulint field_no; /*!< field number in record */ + ulint len; /*!< stored data length, in bytes */ + const void* data; /*!< stored data */ +}; + +/** Storage format for overflow data in a big record, that is, a +clustered index record which needs external storage of data fields */ +struct big_rec_t { + mem_heap_t* heap; /*!< memory heap from which + allocated */ + ulint n_fields; /*!< number of stored fields */ + big_rec_field_t*fields; /*!< stored fields */ +}; + +#ifndef UNIV_NONINL +#include "data0data.ic" +#endif + +#endif diff --git a/storage/xtradb/include/data0data.ic b/storage/xtradb/include/data0data.ic new file mode 100644 index 00000000000..6937d55d211 --- /dev/null +++ b/storage/xtradb/include/data0data.ic @@ -0,0 +1,649 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0data.ic +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0mem.h" +#include "ut0rnd.h" + +#ifdef UNIV_DEBUG +/** Dummy variable to catch access to uninitialized fields. 
In the +debug version, dtuple_create() will make all fields of dtuple_t point +to data_error. */ +extern byte data_error; + +/*********************************************************************//** +Gets pointer to the type struct of SQL data field. +@return pointer to the type struct */ +UNIV_INLINE +dtype_t* +dfield_get_type( +/*============*/ + const dfield_t* field) /*!< in: SQL data field */ +{ + ut_ad(field); + + return((dtype_t*) &(field->type)); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /*!< in: SQL data field */ + const dtype_t* type) /*!< in: pointer to data type struct */ +{ + ut_ad(field && type); + + field->type = *type; +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets pointer to the data in a field. +@return pointer to data */ +UNIV_INLINE +void* +dfield_get_data( +/*============*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + ut_ad((field->len == UNIV_SQL_NULL) + || (field->data != &data_error)); + + return((void*) field->data); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets length of field data. +@return length of data; UNIV_SQL_NULL if SQL null data */ +UNIV_INLINE +ulint +dfield_get_len( +/*===========*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + ut_ad((field->len == UNIV_SQL_NULL) + || (field->data != &data_error)); + + return(field->len); +} + +/*********************************************************************//** +Sets length in a field. */ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /*!< in: field */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ +{ + ut_ad(field); +#ifdef UNIV_VALGRIND_DEBUG + if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(field->data, len); +#endif /* UNIV_VALGRIND_DEBUG */ + + field->ext = 0; + field->len = len; +} + +/*********************************************************************//** +Determines if a field is SQL NULL +@return nonzero if SQL null data */ +UNIV_INLINE +ulint +dfield_is_null( +/*===========*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + + return(field->len == UNIV_SQL_NULL); +} + +/*********************************************************************//** +Determines if a field is externally stored +@return nonzero if externally stored */ +UNIV_INLINE +ulint +dfield_is_ext( +/*==========*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + + return(field->ext); +} + +/*********************************************************************//** +Sets the "external storage" flag */ +UNIV_INLINE +void +dfield_set_ext( +/*===========*/ + dfield_t* field) /*!< in/out: field */ +{ + ut_ad(field); + + field->ext = 1; +} + +/*********************************************************************//** +Sets pointer to the data and length in a field. 
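+Typical uses pair a buffer with its length, or mark the field SQL NULL
+(sketch; the buffer must stay valid for as long as the field is used):
+
+	dfield_set_data(field, buf, 4);			// 4-byte value
+	dfield_set_data(field, NULL, UNIV_SQL_NULL);	// same as dfield_set_null()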
*/ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /*!< in: field */ + const void* data, /*!< in: data */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ +{ + ut_ad(field); + +#ifdef UNIV_VALGRIND_DEBUG + if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(data, len); +#endif /* UNIV_VALGRIND_DEBUG */ + field->data = (void*) data; + field->ext = 0; + field->len = len; +} + +/*********************************************************************//** +Sets a data field to SQL NULL. */ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field) /*!< in/out: field */ +{ + dfield_set_data(field, NULL, UNIV_SQL_NULL); +} + +/*********************************************************************//** +Copies the data and len fields. */ +UNIV_INLINE +void +dfield_copy_data( +/*=============*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ +{ + ut_ad(field1 && field2); + + field1->data = field2->data; + field1->len = field2->len; + field1->ext = field2->ext; +} + +/*********************************************************************//** +Copies a data field to another. */ +UNIV_INLINE +void +dfield_copy( +/*========*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ +{ + *field1 = *field2; +} + +/*********************************************************************//** +Copies the data pointed to by a data field. */ +UNIV_INLINE +void +dfield_dup( +/*=======*/ + dfield_t* field, /*!< in/out: data field */ + mem_heap_t* heap) /*!< in: memory heap where allocated */ +{ + if (!dfield_is_null(field)) { + UNIV_MEM_ASSERT_RW(field->data, field->len); + field->data = mem_heap_dup(heap, field->data, field->len); + } +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Tests if two data fields are equal. +If len==0, tests the data length and content for equality. +If len>0, tests the first len bytes of the content for equality. +@return TRUE if both fields are NULL or if they are equal */ +UNIV_INLINE +ibool +dfield_datas_are_binary_equal( +/*==========================*/ + const dfield_t* field1, /*!< in: field */ + const dfield_t* field2, /*!< in: field */ + ulint len) /*!< in: maximum prefix to compare, + or 0 to compare the whole field length */ +{ + ulint len2 = len; + + if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) { + len = field1->len; + } + + if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) { + len2 = field2->len; + } + + return(len == len2 + && (len == UNIV_SQL_NULL + || !memcmp(field1->data, field2->data, len))); +} + +/*********************************************************************//** +Tests if dfield data length and content is equal to the given. +@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ +{ + return(len == dfield_get_len(field) + && (len == UNIV_SQL_NULL + || !memcmp(dfield_get_data(field), data, len))); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Gets info bits in a data tuple. 
+@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple); + + return(tuple->info_bits); +} + +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ +{ + ut_ad(tuple); + + tuple->info_bits = info_bits; +} + +/*********************************************************************//** +Gets number of fields used in record comparisons. +@return number of fields used in comparisons in rem0cmp.* */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple); + + return(tuple->n_fields_cmp); +} + +/*********************************************************************//** +Sets number of fields used in record comparisons. */ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields_cmp) /*!< in: number of fields used in + comparisons in rem0cmp.* */ +{ + ut_ad(tuple); + ut_ad(n_fields_cmp <= tuple->n_fields); + + tuple->n_fields_cmp = n_fields_cmp; +} + +/*********************************************************************//** +Gets number of fields in a data tuple. +@return number of fields */ +UNIV_INLINE +ulint +dtuple_get_n_fields( +/*================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple); + + return(tuple->n_fields); +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets nth field of a tuple. +@return nth field */ +UNIV_INLINE +dfield_t* +dtuple_get_nth_field( +/*=================*/ + const dtuple_t* tuple, /*!< in: tuple */ + ulint n) /*!< in: index of field */ +{ + ut_ad(tuple); + ut_ad(n < tuple->n_fields); + + return((dfield_t*) tuple->fields + n); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************//** +Creates a data tuple from an already allocated chunk of memory. +The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields). +The default value for number of fields used in record comparisons +for this tuple is n_fields. +@return created tuple (inside buf) */ +UNIV_INLINE +dtuple_t* +dtuple_create_from_mem( +/*===================*/ + void* buf, /*!< in, out: buffer to use */ + ulint buf_size, /*!< in: buffer size */ + ulint n_fields) /*!< in: number of fields */ +{ + dtuple_t* tuple; + + ut_ad(buf != NULL); + ut_a(buf_size >= DTUPLE_EST_ALLOC(n_fields)); + + tuple = (dtuple_t*) buf; + tuple->info_bits = 0; + tuple->n_fields = n_fields; + tuple->n_fields_cmp = n_fields; + tuple->fields = (dfield_t*) &tuple[1]; + +#ifdef UNIV_DEBUG + tuple->magic_n = DATA_TUPLE_MAGIC_N; + + { /* In the debug version, initialize fields to an error value */ + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_t* field; + + field = dtuple_get_nth_field(tuple, i); + + dfield_set_len(field, UNIV_SQL_NULL); + field->data = &data_error; + dfield_get_type(field)->mtype = DATA_ERROR; + } + } +#endif + UNIV_MEM_ASSERT_W(tuple->fields, n_fields * sizeof *tuple->fields); + UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields); + return(tuple); +} + +/**********************************************************//** +Creates a data tuple to a memory heap. The default value for number +of fields used in record comparisons for this tuple is n_fields. 
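+A minimal construction sketch (the heap size and field contents are
+arbitrary illustrations):
+
+	mem_heap_t*	heap = mem_heap_create(256);
+	dtuple_t*	tuple = dtuple_create(heap, 1);
+	dfield_set_data(dtuple_get_nth_field(tuple, 0), "key", 3);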
+@return own: created tuple */ +UNIV_INLINE +dtuple_t* +dtuple_create( +/*==========*/ + mem_heap_t* heap, /*!< in: memory heap where the tuple + is created, DTUPLE_EST_ALLOC(n_fields) + bytes will be allocated from this heap */ + ulint n_fields) /*!< in: number of fields */ +{ + void* buf; + ulint buf_size; + dtuple_t* tuple; + + ut_ad(heap); + + buf_size = DTUPLE_EST_ALLOC(n_fields); + buf = mem_heap_alloc(heap, buf_size); + + tuple = dtuple_create_from_mem(buf, buf_size, n_fields); + + return(tuple); +} + +/*********************************************************************//** +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. +@return own: copy of tuple */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + const dtuple_t* tuple, /*!< in: tuple to copy from */ + mem_heap_t* heap) /*!< in: memory heap + where the tuple is created */ +{ + ulint n_fields = dtuple_get_n_fields(tuple); + dtuple_t* new_tuple = dtuple_create(heap, n_fields); + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_copy(dtuple_get_nth_field(new_tuple, i), + dtuple_get_nth_field(tuple, i)); + } + + return(new_tuple); +} + +/**********************************************************//** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. Neither +is possible space in externally stored parts of the field. +@return sum of data lengths */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + const dtuple_t* tuple, /*!< in: typed data tuple */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + ulint sum = 0; + + ut_ad(tuple); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = tuple->n_fields; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (len == UNIV_SQL_NULL) { + len = dtype_get_sql_null_size(dfield_get_type(field), + comp); + } + + sum += len; + } + + return(sum); +} + +/*********************************************************************//** +Computes the number of externally stored fields in a data tuple. +@return number of externally stored fields */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_ext = 0; + ulint n_fields = tuple->n_fields; + ulint i; + + ut_ad(tuple); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + for (i = 0; i < n_fields; i++) { + n_ext += dtuple_get_nth_field(tuple, i)->ext; + } + + return(n_ext); +} + +/*******************************************************************//** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /*!< in: data tuple */ + ulint n) /*!< in: number of fields to set */ +{ + dtype_t* dfield_type; + ulint i; + + for (i = 0; i < n; i++) { + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dtype_set(dfield_type, DATA_BINARY, 0, 0); + } +} + +/************************************************************//** +Folds a prefix given as the number of fields of a tuple. 
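+To fold complete fields only, a caller passes n_bytes == 0 (sketch; index
+is assumed to be the dict_index_t being probed):
+
+	fold = dtuple_fold(tuple, dtuple_get_n_fields(tuple), 0, index->id);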
+@return the folded value */ +UNIV_INLINE +ulint +dtuple_fold( +/*========*/ + const dtuple_t* tuple, /*!< in: the tuple */ + ulint n_fields,/*!< in: number of complete fields to fold */ + ulint n_bytes,/*!< in: number of bytes to fold in an + incomplete last field */ + index_id_t tree_id)/*!< in: index tree id */ +{ + const dfield_t* field; + ulint i; + const byte* data; + ulint len; + ulint fold; + + ut_ad(tuple); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(tuple)); + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/**********************************************************************//** +Writes an SQL null field full of zeros. */ +UNIV_INLINE +void +data_write_sql_null( +/*================*/ + byte* data, /*!< in: pointer to a buffer of size len */ + ulint len) /*!< in: SQL null size in bytes */ +{ + memset(data, 0, len); +} + +/**********************************************************************//** +Checks if a dtuple contains an SQL null value. +@return TRUE if some field is SQL null */ +UNIV_INLINE +ibool +dtuple_contains_null( +/*=================*/ + const dtuple_t* tuple) /*!< in: dtuple */ +{ + ulint n; + ulint i; + + n = dtuple_get_n_fields(tuple); + + for (i = 0; i < n; i++) { + if (dfield_is_null(dtuple_get_nth_field(tuple, i))) { + + return(TRUE); + } + } + + return(FALSE); +} + +/**************************************************************//** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ +{ + mem_heap_free(vector->heap); +} diff --git a/storage/xtradb/include/data0type.h b/storage/xtradb/include/data0type.h new file mode 100644 index 00000000000..111664b0b52 --- /dev/null +++ b/storage/xtradb/include/data0type.h @@ -0,0 +1,544 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/data0type.h +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#ifndef data0type_h +#define data0type_h + +#include "univ.i" + +extern ulint data_mysql_default_charset_coll; +#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8 +#define DATA_MYSQL_BINARY_CHARSET_COLL 63 + +/* SQL data type struct */ +struct dtype_t; + +/* SQL Like operator comparison types */ +enum ib_like_t { + IB_LIKE_EXACT, /* e.g. STRING */ + IB_LIKE_PREFIX, /* e.g., STRING% */ + IB_LIKE_SUFFIX, /* e.g., %STRING */ + IB_LIKE_SUBSTR, /* e.g., %STRING% */ + IB_LIKE_REGEXP /* Future */ +}; + +/*-------------------------------------------*/ +/* The 'MAIN TYPE' of a column */ +#define DATA_MISSING 0 /* missing column */ +#define DATA_VARCHAR 1 /* character varying of the + latin1_swedish_ci charset-collation; note + that the MySQL format for this, DATA_BINARY, + DATA_VARMYSQL, is also affected by whether the + 'precise type' contains + DATA_MYSQL_TRUE_VARCHAR */ +#define DATA_CHAR 2 /* fixed length character of the + latin1_swedish_ci charset-collation */ +#define DATA_FIXBINARY 3 /* binary string of fixed length */ +#define DATA_BINARY 4 /* binary string */ +#define DATA_BLOB 5 /* binary large object, or a TEXT type; + if prtype & DATA_BINARY_TYPE == 0, then this is + actually a TEXT column (or a BLOB created + with < 4.0.14; since column prefix indexes + came only in 4.0.14, the missing flag in BLOBs + created before that does not cause any harm) */ +#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */ +#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */ +#define DATA_SYS 8 /* system column */ + +/* Data types >= DATA_FLOAT must be compared using the whole field, not as +binary strings */ + +#define DATA_FLOAT 9 +#define DATA_DOUBLE 10 +#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */ +#define DATA_VARMYSQL 12 /* any charset varying length char */ +#define DATA_MYSQL 13 /* any charset fixed length char */ + /* NOTE that 4.1.1 used DATA_MYSQL and + DATA_VARMYSQL for all character sets, and the + charset-collation for tables created with it + can also be latin1_swedish_ci */ +#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size() + requires the values are <= 63 */ +/*-------------------------------------------*/ +/* The 'PRECISE TYPE' of a column */ +/* +Tables created by a MySQL user have the following convention: + +- In the least significant byte in the precise type we store the MySQL type +code (not applicable for system columns). + +- In the second least significant byte we OR flags DATA_NOT_NULL, +DATA_UNSIGNED, DATA_BINARY_TYPE. + +- In the third least significant byte of the precise type of string types we +store the MySQL charset-collation code. In DATA_BLOB columns created with +< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there +are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no +problem, though. + +Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the +precise type, since the charset was always the default charset of the MySQL +installation. 
If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define DATA_ROW_ID 0 /* row id: a 48-bit integer */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
+
+#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */
+
+#define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL 256 /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED 512 /* this is ORed to the precise type when
+ we have an unsigned integer type */
+#define DATA_BINARY_TYPE 1024 /* if the data type is a binary character
+ string, this is ORed to the precise type:
+ this only holds for tables created with
+ >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/ +#define DATA_LONG_TRUE_VARCHAR 4096 /* this is ORed to the precise data + type when the column is true VARCHAR where + MySQL uses 2 bytes to store the data len; + for shorter VARCHARs MySQL uses only 1 byte */ +/*-------------------------------------------*/ + +/* This many bytes we need to store the type information affecting the +alphabetical order for a single field and decide the storage size of an +SQL null*/ +#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +/* In the >= 4.1.x storage format we add 2 bytes more so that we can also +store the charset-collation number; one byte is left unused, though */ +#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 + +/* Maximum multi-byte character length in bytes, plus 1 */ +#define DATA_MBMAX 5 + +/* Pack mbminlen, mbmaxlen to mbminmaxlen. */ +#define DATA_MBMINMAXLEN(mbminlen, mbmaxlen) \ + ((mbmaxlen) * DATA_MBMAX + (mbminlen)) +/* Get mbminlen from mbminmaxlen. Cast the result of UNIV_EXPECT to ulint +because in GCC it returns a long. */ +#define DATA_MBMINLEN(mbminmaxlen) ((ulint) \ + UNIV_EXPECT(((mbminmaxlen) % DATA_MBMAX), \ + 1)) +/* Get mbmaxlen from mbminmaxlen. */ +#define DATA_MBMAXLEN(mbminmaxlen) ((ulint) ((mbminmaxlen) / DATA_MBMAX)) + +/* We now support 15 bits (up to 32767) collation number */ +#define MAX_CHAR_COLL_NUM 32767 + +/* Mask to get the Charset Collation number (0x7fff) */ +#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type); /*!< in: type struct */ +/*********************************************************************//** +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. +@return length of the prefix, in bytes */ +UNIV_INTERN +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of + a multi-byte character */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str); /*!< in: the string whose prefix + length is being determined */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Checks if a data main type is a string type. Also a BLOB is considered a +string type. +@return TRUE if string type */ +UNIV_INTERN +ibool +dtype_is_string_type( +/*=================*/ + ulint mtype); /*!< in: InnoDB main data type code: DATA_CHAR, ... */ +/*********************************************************************//** +Checks if a type is a binary string type. Note that for tables created with +< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For +those DATA_BLOB columns this function currently returns FALSE. +@return TRUE if binary string type */ +UNIV_INTERN +ibool +dtype_is_binary_string_type( +/*========================*/ + ulint mtype, /*!< in: main data type */ + ulint prtype);/*!< in: precise type */ +/*********************************************************************//** +Checks if a type is a non-binary string type. 
That is, dtype_is_string_type is +TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created +with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. +For those DATA_BLOB columns this function currently returns TRUE. +@return TRUE if non-binary string type */ +UNIV_INTERN +ibool +dtype_is_non_binary_string_type( +/*============================*/ + ulint mtype, /*!< in: main data type */ + ulint prtype);/*!< in: precise type */ +/*********************************************************************//** +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len); /*!< in: precision of type */ +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2); /*!< in: type struct to copy from */ +/*********************************************************************//** +Gets the SQL main data type. +@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type); /*!< in: data type */ +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type); /*!< in: data type */ +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + ulint* mbminlen, /*!< out: minimum length of a + multi-byte character */ + ulint* mbmaxlen); /*!< out: maximum length of a + multi-byte character */ +/*********************************************************************//** +Gets the MySQL charset-collation code for MySQL string types. +@return MySQL charset-collation code */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype);/*!< in: precise data type */ +/*********************************************************************//** +Forms a precise type from the < 4.1.2 format precise type plus the +charset-collation code. +@return precise type, including the charset-collation code */ +UNIV_INTERN +ulint +dtype_form_prtype( +/*==============*/ + ulint old_prtype, /*!< in: the MySQL type code and the flags + DATA_BINARY_TYPE etc. */ + ulint charset_coll); /*!< in: MySQL charset-collation code */ +/*********************************************************************//** +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. +@return TRUE if a subset of UTF-8 */ +UNIV_INLINE +ibool +dtype_is_utf8( +/*==========*/ + ulint prtype);/*!< in: precise data type */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Gets the type length. 
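+(Editor's illustration: after dtype_set(type, DATA_INT, 0, 4) this
+returns 4.)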
+@return fixed length of the type, in bytes, or 0 if variable-length */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type); /*!< in: data type */ +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Gets the minimum length of a character, in bytes. +@return minimum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + const dtype_t* type); /*!< in: type */ +/*********************************************************************//** +Gets the maximum length of a character, in bytes. +@return maximum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + const dtype_t* type); /*!< in: type */ +/*********************************************************************//** +Sets the minimum and maximum length of a character, in bytes. */ +UNIV_INLINE +void +dtype_set_mbminmaxlen( +/*==================*/ + dtype_t* type, /*!< in/out: type */ + ulint mbminlen, /*!< in: minimum length of a char, + in bytes, or 0 if this is not + a character type */ + ulint mbmaxlen); /*!< in: maximum length of a char, + in bytes, or 0 if this is not + a character type */ +/*********************************************************************//** +Gets the padding character code for the type. +@return padding character code, or ULINT_UNDEFINED if no padding specified */ +UNIV_INLINE +ulint +dtype_get_pad_char( +/*===============*/ + ulint mtype, /*!< in: main type */ + ulint prtype); /*!< in: precise type */ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************************//** +Returns the size of a fixed size data type, 0 if not a fixed size type. +@return fixed size, or 0 */ +UNIV_INLINE +ulint +dtype_get_fixed_size_low( +/*=====================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of a + multibyte character, in bytes */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Returns the minimum size of a data type. +@return minimum size */ +UNIV_INLINE +ulint +dtype_get_min_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen); /*!< in: minimum and maximum length of a + multibyte character */ +/***********************************************************************//** +Returns the maximum size of a data type. Note: types in system tables may be +incomplete and return incorrect information. +@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len); /*!< in: length */ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. 
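+(Editor's illustration: a stored SQL NULL in a latin1 CHAR(10) column
+occupies 10 bytes in ROW_FORMAT=REDUNDANT, while a NULL in a VARCHAR
+column occupies 0 bytes, as VARCHAR has no fixed length.)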
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for the stored order info */ +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len);/*!< in: prefix length to + replace type->len, or 0 */ +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for stored type order info */ + +/*********************************************************************//** +Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len +@return the SQL type name */ +UNIV_INLINE +char* +dtype_sql_name( +/*===========*/ + unsigned mtype, /*!< in: mtype */ + unsigned prtype, /*!< in: prtype */ + unsigned len, /*!< in: len */ + char* name, /*!< out: SQL name */ + unsigned name_sz);/*!< in: size of the name buffer */ + +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Validates a data type structure. +@return TRUE if ok */ +UNIV_INTERN +ibool +dtype_validate( +/*===========*/ + const dtype_t* type); /*!< in: type struct to validate */ +/*********************************************************************//** +Prints a data type structure. */ +UNIV_INTERN +void +dtype_print( +/*========*/ + const dtype_t* type); /*!< in: type */ + +/* Structure for an SQL data type. +If you add fields to this structure, be sure to initialize them everywhere. 
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_t{
+ unsigned prtype:32; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+ unsigned mtype:8; /*!< main data type */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+#ifndef UNIV_HOTBACKUP
+ unsigned mbminmaxlen:5; /*!< minimum and maximum length of a
+ character, in bytes;
+ DATA_MBMINMAXLEN(mbminlen,mbmaxlen);
+ mbminlen=DATA_MBMINLEN(mbminmaxlen);
+ mbmaxlen=DATA_MBMAXLEN(mbminmaxlen) */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+#ifndef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/data0type.ic b/storage/xtradb/include/data0type.ic
new file mode 100644
index 00000000000..d489bef89a8
--- /dev/null
+++ b/storage/xtradb/include/data0type.ic
@@ -0,0 +1,711 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.ic
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include <string.h> /* strlen() */
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype) /*!< in: precise data type */
+{
+ return((prtype >> 16) & CHAR_COLL_MASK);
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype) /*!< in: precise data type */
+{
+ /* These codes have been copied from strings/ctype-extra.c
+ and strings/ctype-utf8.c.
*/ + switch (dtype_get_charset_coll(prtype)) { + case 11: /* ascii_general_ci */ + case 65: /* ascii_bin */ + case 33: /* utf8_general_ci */ + case 83: /* utf8_bin */ + case 254: /* utf8_general_cs */ + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type) /*!< in: type struct */ +{ + return(type->prtype & 0xFFUL); +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + ulint* mbminlen, /*!< out: minimum length of a + multi-byte character */ + ulint* mbmaxlen) /*!< out: maximum length of a + multi-byte character */ +{ + if (dtype_is_string_type(mtype)) { + innobase_get_cset_width(dtype_get_charset_coll(prtype), + mbminlen, mbmaxlen); + ut_ad(*mbminlen <= *mbmaxlen); + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + *mbminlen = *mbmaxlen = 0; + } +} + +/*********************************************************************//** +Sets the minimum and maximum length of a character, in bytes. */ +UNIV_INLINE +void +dtype_set_mbminmaxlen( +/*==================*/ + dtype_t* type, /*!< in/out: type */ + ulint mbminlen, /*!< in: minimum length of a char, + in bytes, or 0 if this is not + a character type */ + ulint mbmaxlen) /*!< in: maximum length of a char, + in bytes, or 0 if this is not + a character type */ +{ + ut_ad(mbminlen < DATA_MBMAX); + ut_ad(mbmaxlen < DATA_MBMAX); + ut_ad(mbminlen <= mbmaxlen); + + type->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen); +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_set_mblen( +/*============*/ + dtype_t* type) /*!< in/out: type */ +{ + ulint mbminlen; + ulint mbmaxlen; + + dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen); + dtype_set_mbminmaxlen(type, mbminlen, mbmaxlen); + + ut_ad(dtype_validate(type)); +} +#else /* !UNIV_HOTBACKUP */ +# define dtype_set_mblen(type) (void) 0 +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision of type */ +{ + ut_ad(type); + ut_ad(mtype <= DATA_MTYPE_MAX); + + type->mtype = mtype; + type->prtype = prtype; + type->len = len; + + dtype_set_mblen(type); +} + +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2) /*!< in: type struct to copy from */ +{ + *type1 = *type2; + + ut_ad(dtype_validate(type1)); +} + +/*********************************************************************//** +Gets the SQL main data type. 
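+(Editor's illustration: e.g. DATA_INT for integer columns and DATA_BLOB
+for BLOB or TEXT columns.)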
+@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->mtype); +} + +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->prtype); +} + +/*********************************************************************//** +Gets the type length. +@return fixed length of the type, in bytes, or 0 if variable-length */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->len); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Gets the minimum length of a character, in bytes. +@return minimum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + const dtype_t* type) /*!< in: type */ +{ + ut_ad(type); + return(DATA_MBMINLEN(type->mbminmaxlen)); +} +/*********************************************************************//** +Gets the maximum length of a character, in bytes. +@return maximum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + const dtype_t* type) /*!< in: type */ +{ + ut_ad(type); + return(DATA_MBMAXLEN(type->mbminmaxlen)); +} + +/*********************************************************************//** +Gets the padding character code for a type. +@return padding character code, or ULINT_UNDEFINED if no padding specified */ +UNIV_INLINE +ulint +dtype_get_pad_char( +/*===============*/ + ulint mtype, /*!< in: main type */ + ulint prtype) /*!< in: precise type */ +{ + switch (mtype) { + case DATA_FIXBINARY: + case DATA_BINARY: + if (dtype_get_charset_coll(prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL) { + /* Starting from 5.0.18, do not pad + VARBINARY or BINARY columns. */ + return(ULINT_UNDEFINED); + } + /* Fall through */ + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + /* Space is the padding character for all char and binary + strings, and starting from 5.0.3, also for TEXT strings. */ + + return(0x20); + case DATA_BLOB: + if (!(prtype & DATA_BINARY_TYPE)) { + return(0x20); + } + /* Fall through */ + default: + /* No padding specified */ + return(ULINT_UNDEFINED); + } +} + +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. 
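+
+(Editor's note: the following byte layout is derived from the code below
+and is not part of the original comment.
+
+	byte 0		mtype, with bit 7 set if the type has DATA_BINARY_TYPE
+	byte 1		the least significant byte of prtype, i.e. the
+			MySQL type code
+	bytes 2-3	the length, or the given prefix_len, big-endian
+	bytes 4-5	the charset-collation code, big-endian, with bit 7
+			of byte 4 set if the column is DATA_NOT_NULL)
+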
*/ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len)/*!< in: prefix length to + replace type->len, or 0 */ +{ +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + ulint len; + + ut_ad(type); + ut_ad(type->mtype >= DATA_VARCHAR); + ut_ad(type->mtype <= DATA_MYSQL); + + buf[0] = (byte)(type->mtype & 0xFFUL); + + if (type->prtype & DATA_BINARY_TYPE) { + buf[0] |= 128; + } + + /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) { + buf[0] |= 64; + } + */ + + buf[1] = (byte)(type->prtype & 0xFFUL); + + len = prefix_len ? prefix_len : type->len; + + mach_write_to_2(buf + 2, len & 0xFFFFUL); + + ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM); + mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); + + if (type->prtype & DATA_NOT_NULL) { + buf[4] |= 128; + } +} + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the < 4.1.x +storage format. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf) /*!< in: buffer for stored type order info */ +{ +#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE +# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE" +#endif + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype |= DATA_BINARY_TYPE; + } + + type->len = mach_read_from_2(buf + 2); + + type->prtype = dtype_form_prtype(type->prtype, + data_mysql_default_charset_coll); + dtype_set_mblen(type); +} + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the >= 4.1.x +storage format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf) /*!< in: buffer for stored type order info */ +{ + ulint charset_coll; + +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype |= DATA_BINARY_TYPE; + } + + if (buf[4] & 128) { + type->prtype |= DATA_NOT_NULL; + } + + type->len = mach_read_from_2(buf + 2); + + charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK; + + if (dtype_is_string_type(type->mtype)) { + ut_a(charset_coll <= MAX_CHAR_COLL_NUM); + + if (charset_coll == 0) { + /* This insert buffer record was inserted with MySQL + version < 4.1.2, and the charset-collation code was not + explicitly stored to dtype->prtype at that time. It + must be the default charset-collation of this MySQL + installation. */ + + charset_coll = data_mysql_default_charset_coll; + } + + type->prtype = dtype_form_prtype(type->prtype, charset_coll); + } + dtype_set_mblen(type); +} + +/*********************************************************************//** +Returns the type's SQL name (e.g. 
BIGINT UNSIGNED) from mtype,prtype,len +@return the SQL type name */ +UNIV_INLINE +char* +dtype_sql_name( +/*===========*/ + unsigned mtype, /*!< in: mtype */ + unsigned prtype, /*!< in: prtype */ + unsigned len, /*!< in: len */ + char* name, /*!< out: SQL name */ + unsigned name_sz)/*!< in: size of the name buffer */ +{ + +#define APPEND_UNSIGNED() \ + do { \ + if (prtype & DATA_UNSIGNED) { \ + ut_snprintf(name + strlen(name), \ + name_sz - strlen(name), \ + " UNSIGNED"); \ + } \ + } while (0) + + ut_snprintf(name, name_sz, "UNKNOWN"); + + switch (mtype) { + case DATA_INT: + switch (len) { + case 1: + ut_snprintf(name, name_sz, "TINYINT"); + break; + case 2: + ut_snprintf(name, name_sz, "SMALLINT"); + break; + case 3: + ut_snprintf(name, name_sz, "MEDIUMINT"); + break; + case 4: + ut_snprintf(name, name_sz, "INT"); + break; + case 8: + ut_snprintf(name, name_sz, "BIGINT"); + break; + } + APPEND_UNSIGNED(); + break; + case DATA_FLOAT: + ut_snprintf(name, name_sz, "FLOAT"); + APPEND_UNSIGNED(); + break; + case DATA_DOUBLE: + ut_snprintf(name, name_sz, "DOUBLE"); + APPEND_UNSIGNED(); + break; + case DATA_FIXBINARY: + ut_snprintf(name, name_sz, "BINARY(%u)", len); + break; + case DATA_CHAR: + case DATA_MYSQL: + ut_snprintf(name, name_sz, "CHAR(%u)", len); + break; + case DATA_VARCHAR: + case DATA_VARMYSQL: + ut_snprintf(name, name_sz, "VARCHAR(%u)", len); + break; + case DATA_BINARY: + ut_snprintf(name, name_sz, "VARBINARY(%u)", len); + break; + case DATA_BLOB: + switch (len) { + case 9: + ut_snprintf(name, name_sz, "TINYBLOB"); + break; + case 10: + ut_snprintf(name, name_sz, "BLOB"); + break; + case 11: + ut_snprintf(name, name_sz, "MEDIUMBLOB"); + break; + case 12: + ut_snprintf(name, name_sz, "LONGBLOB"); + break; + } + } + + if (prtype & DATA_NOT_NULL) { + ut_snprintf(name + strlen(name), + name_sz - strlen(name), + " NOT NULL"); + } + + return(name); +} + +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +Returns the size of a fixed size data type, 0 if not a fixed size type. 
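+
+(Editor's illustration, derived from the cases below: DATA_INT with len 4
+yields 4; DATA_VARCHAR, DATA_BLOB and DATA_DECIMAL always yield 0; a
+fixed-length DATA_MYSQL string yields its byte length len, except that in
+ROW_FORMAT=COMPACT a variable-width charset such as utf8 makes the stored
+size variable, so 0 is returned.)
+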
+@return fixed size, or 0 */ +UNIV_INLINE +ulint +dtype_get_fixed_size_low( +/*=====================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of + a multibyte character, in bytes */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return(len); + case DATA_MYSQL: +#ifndef UNIV_HOTBACKUP + if (prtype & DATA_BINARY_TYPE) { + return(len); + } else if (!comp) { + return(len); + } else { +#ifdef UNIV_DEBUG + ulint i_mbminlen, i_mbmaxlen; + + innobase_get_cset_width( + dtype_get_charset_coll(prtype), + &i_mbminlen, &i_mbmaxlen); + + ut_ad(DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen) + == mbminmaxlen); +#endif /* UNIV_DEBUG */ + if (DATA_MBMINLEN(mbminmaxlen) + == DATA_MBMAXLEN(mbminmaxlen)) { + return(len); + } + } +#else /* !UNIV_HOTBACKUP */ + return(len); +#endif /* !UNIV_HOTBACKUP */ + /* fall through for variable-length charsets */ + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Returns the minimum size of a data type. +@return minimum size */ +UNIV_INLINE +ulint +dtype_get_min_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen) /*!< in: minimum and maximum length of a + multi-byte character */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return(len); + case DATA_MYSQL: + if (prtype & DATA_BINARY_TYPE) { + return(len); + } else { + ulint mbminlen = DATA_MBMINLEN(mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN(mbminmaxlen); + + if (mbminlen == mbmaxlen) { + return(len); + } + + /* this is a variable-length character set */ + ut_a(mbminlen > 0); + ut_a(mbmaxlen > mbminlen); + ut_a(len % mbmaxlen == 0); + return(len * mbminlen / mbmaxlen); + } + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +/***********************************************************************//** +Returns the maximum size of a data type. Note: types in system tables may be +incomplete and return incorrect information. 
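+(Editor's illustration: every listed mtype simply reports len, while
+DATA_BLOB reports ULINT_MAX, as a BLOB has no fixed upper size here.)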
+@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len) /*!< in: length */ +{ + switch (mtype) { + case DATA_SYS: + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_MYSQL: + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + return(len); + case DATA_BLOB: + break; + default: + ut_error; + } + + return(ULINT_MAX); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ +#ifndef UNIV_HOTBACKUP + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + type->mbminmaxlen, comp)); +#else /* !UNIV_HOTBACKUP */ + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + 0, 0)); +#endif /* !UNIV_HOTBACKUP */ +} diff --git a/storage/xtradb/include/data0types.h b/storage/xtradb/include/data0types.h new file mode 100644 index 00000000000..bd2bb577611 --- /dev/null +++ b/storage/xtradb/include/data0types.h @@ -0,0 +1,36 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0types.h +Some type definitions + +Created 9/21/2000 Heikki Tuuri +*************************************************************************/ + +#ifndef data0types_h +#define data0types_h + +/* SQL data field struct */ +struct dfield_t; + +/* SQL data tuple struct */ +struct dtuple_t; + +#endif + diff --git a/storage/xtradb/include/db0err.h b/storage/xtradb/include/db0err.h new file mode 100644 index 00000000000..71916cb33f2 --- /dev/null +++ b/storage/xtradb/include/db0err.h @@ -0,0 +1,160 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+
+enum dberr_t {
+ DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ explicit record lock was created */
+ DB_SUCCESS = 10,
+
+ /* The following are error codes */
+ DB_ERROR,
+ DB_INTERRUPTED,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_QUE_THR_SUSPENDED,
+ DB_MISSING_HISTORY, /*!< required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_MUST_GET_MORE_FILE_SPACE, /*!< the database has to be stopped
+ and restarted with more file space */
+ DB_TABLE_IS_BEING_USED,
+ DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /*!< referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /*!< data structure corruption noticed */
+ DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /*!< no savepoint exists with the given
+ name */
+ DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is
+ being dropped right now */
+ DB_TABLESPACE_NOT_FOUND, /*!< Attempt to delete a tablespace
+ instance that was not found in the
+ tablespace hash table */
+ DB_LOCK_TABLE_FULL, /*!< lock structs have exhausted the
+ buffer pool (for big transactions,
+ InnoDB stores the lock structs in the
+ buffer pool) */
+ DB_FOREIGN_DUPLICATE_KEY, /*!< foreign key constraints
+ activated by the operation would
+ lead to a duplicate key in some
+ table */
+ DB_TOO_MANY_CONCURRENT_TRXS, /*!< when InnoDB runs out of the
+ preconfigured undo slots, this can
+ only happen when there are too many
+ concurrent transactions */
+ DB_UNSUPPORTED, /*!< when InnoDB sees any artefact or
+ a feature that it can't recognize or
+ work with e.g., FT indexes created by
+ a later version of the engine.
*/ + + DB_INVALID_NULL, /*!< a NOT NULL column was found to + be NULL during table rebuild */ + + DB_STATS_DO_NOT_EXIST, /*!< an operation that requires the + persistent storage, used for recording + table and index statistics, was + requested but this storage does not + exist itself or the stats for a given + table do not exist */ + DB_FOREIGN_EXCEED_MAX_CASCADE, /*!< Foreign key constraint related + cascading delete/update exceeds + maximum allowed depth */ + DB_CHILD_NO_INDEX, /*!< the child (foreign) table does + not have an index that contains the + foreign keys as its prefix columns */ + DB_PARENT_NO_INDEX, /*!< the parent table does not + have an index that contains the + foreign keys as its prefix columns */ + DB_TOO_BIG_INDEX_COL, /*!< index column size exceeds + maximum limit */ + DB_INDEX_CORRUPT, /*!< we have corrupted index */ + DB_UNDO_RECORD_TOO_BIG, /*!< the undo log record is too big */ + DB_READ_ONLY, /*!< Update operation attempted in + a read-only transaction */ + DB_FTS_INVALID_DOCID, /* FTS Doc ID cannot be zero */ + DB_TABLE_IN_FK_CHECK, /* table is being used in foreign + key check */ + DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big + during online index creation */ + + DB_IO_ERROR, /*!< Generic IO error */ + DB_IDENTIFIER_TOO_LONG, /*!< Identifier name too long */ + DB_FTS_EXCEED_RESULT_CACHE_LIMIT, /*!< FTS query memory + exceeds result cache limit */ + DB_TEMP_FILE_WRITE_FAILURE, /*!< Temp file write failure */ + DB_FTS_TOO_MANY_WORDS_IN_PHRASE, + /*< Too many words in a phrase */ + + /* The following are partial failure codes */ + DB_FAIL = 1000, + DB_OVERFLOW, + DB_UNDERFLOW, + DB_STRONG_FAIL, + DB_ZIP_OVERFLOW, + DB_RECORD_NOT_FOUND = 1500, + DB_END_OF_INDEX, + DB_DICT_CHANGED, /*!< Some part of table dictionary has + changed. Such as index dropped or + foreign key dropped */ + + + /* The following are API only error codes. */ + DB_DATA_MISMATCH = 2000, /*!< Column update or read failed + because the types mismatch */ + + DB_SCHEMA_NOT_LOCKED, /*!< If an API function expects the + schema to be locked in exclusive mode + and if it's not then that API function + will return this error code */ + + DB_NOT_FOUND /*!< Generic error code for "Not found" + type of errors */ +}; + +#endif diff --git a/storage/xtradb/include/dict0boot.h b/storage/xtradb/include/dict0boot.h new file mode 100644 index 00000000000..a994c9d8ff1 --- /dev/null +++ b/storage/xtradb/include/dict0boot.h @@ -0,0 +1,342 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0boot.h +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0boot_h +#define dict0boot_h + +#include "univ.i" + +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "buf0buf.h" +#include "fsp0fsp.h" +#include "dict0dict.h" + +typedef byte dict_hdr_t; + +/**********************************************************************//** +Gets a pointer to the dictionary header and x-latches its page. +@return pointer to the dictionary header, page x-latched */ +UNIV_INTERN +dict_hdr_t* +dict_hdr_get( +/*=========*/ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +Returns a new table, index, or space id. */ +UNIV_INTERN +void +dict_hdr_get_new_id( +/*================*/ + table_id_t* table_id, /*!< out: table id + (not assigned if NULL) */ + index_id_t* index_id, /*!< out: index id + (not assigned if NULL) */ + ulint* space_id); /*!< out: space id + (not assigned if NULL) */ +/**********************************************************************//** +Writes the current value of the row id counter to the dictionary header file +page. */ +UNIV_INTERN +void +dict_hdr_flush_row_id(void); +/*=======================*/ +/**********************************************************************//** +Returns a new row id. +@return the new id */ +UNIV_INLINE +row_id_t +dict_sys_get_new_row_id(void); +/*=========================*/ +/**********************************************************************//** +Reads a row id from a record or other 6-byte stored form. +@return row id */ +UNIV_INLINE +row_id_t +dict_sys_read_row_id( +/*=================*/ + const byte* field); /*!< in: record field */ +/**********************************************************************//** +Writes a row id to a record or other 6-byte stored form. */ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /*!< in: record field */ + row_id_t row_id);/*!< in: row id */ +/*****************************************************************//** +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +dict_boot(void) +/*===========*/ + __attribute__((warn_unused_result)); + +/*****************************************************************//** +Creates and initializes the data dictionary at the server bootstrap. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +dict_create(void) +/*=============*/ + __attribute__((warn_unused_result)); + +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. 
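+
+(Editor's illustration: the ids of the bootstrapped dictionary tables,
+e.g. DICT_TABLES_ID == 1 below, are smaller than DICT_HDR_FIRST_ID == 10,
+so this returns true for them; ordinary user tables are assigned ids
+starting from DICT_HDR_FIRST_ID and return false.)
+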
*/ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ + __attribute__((warn_unused_result)); + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ids for the basic system tables and their indexes */ +#define DICT_TABLES_ID 1 +#define DICT_COLUMNS_ID 2 +#define DICT_INDEXES_ID 3 +#define DICT_FIELDS_ID 4 +/* The following is a secondary index on SYS_TABLES */ +#define DICT_TABLE_IDS_ID 5 + +#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. start + from this number, except for basic + system tables and their above defined + indexes; ibuf tables and indexes are + assigned as the id the number + DICT_IBUF_ID_MIN plus the space id */ + +/* The offset of the dictionary header on the page */ +#define DICT_HDR FSEG_PAGE_DATA + +/*-------------------------------------------------------------*/ +/* Dictionary header offsets */ +#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */ +#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */ +#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */ +#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id,or 0*/ +#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete,always DICT_HDR_FIRST_ID*/ +#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */ +#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */ +#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */ +#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */ +#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */ + +#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace + segment into which the dictionary + header is created */ +/*-------------------------------------------------------------*/ + +/* The columns in SYS_TABLES */ +enum dict_col_sys_tables_enum { + DICT_COL__SYS_TABLES__NAME = 0, + DICT_COL__SYS_TABLES__ID = 1, + DICT_COL__SYS_TABLES__N_COLS = 2, + DICT_COL__SYS_TABLES__TYPE = 3, + DICT_COL__SYS_TABLES__MIX_ID = 4, + DICT_COL__SYS_TABLES__MIX_LEN = 5, + DICT_COL__SYS_TABLES__CLUSTER_ID = 6, + DICT_COL__SYS_TABLES__SPACE = 7, + DICT_NUM_COLS__SYS_TABLES = 8 +}; +/* The field numbers in the SYS_TABLES clustered index */ +enum dict_fld_sys_tables_enum { + DICT_FLD__SYS_TABLES__NAME = 0, + DICT_FLD__SYS_TABLES__DB_TRX_ID = 1, + DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_TABLES__ID = 3, + DICT_FLD__SYS_TABLES__N_COLS = 4, + DICT_FLD__SYS_TABLES__TYPE = 5, + DICT_FLD__SYS_TABLES__MIX_ID = 6, + DICT_FLD__SYS_TABLES__MIX_LEN = 7, + DICT_FLD__SYS_TABLES__CLUSTER_ID = 8, + DICT_FLD__SYS_TABLES__SPACE = 9, + DICT_NUM_FIELDS__SYS_TABLES = 10 +}; +/* The field numbers in the SYS_TABLE_IDS index */ +enum dict_fld_sys_table_ids_enum { + DICT_FLD__SYS_TABLE_IDS__ID = 0, + DICT_FLD__SYS_TABLE_IDS__NAME = 1, + DICT_NUM_FIELDS__SYS_TABLE_IDS = 2 +}; +/* The columns in SYS_COLUMNS */ +enum dict_col_sys_columns_enum { + DICT_COL__SYS_COLUMNS__TABLE_ID = 0, + DICT_COL__SYS_COLUMNS__POS = 1, + DICT_COL__SYS_COLUMNS__NAME = 2, + DICT_COL__SYS_COLUMNS__MTYPE = 3, + DICT_COL__SYS_COLUMNS__PRTYPE = 4, + DICT_COL__SYS_COLUMNS__LEN = 5, + DICT_COL__SYS_COLUMNS__PREC = 6, + DICT_NUM_COLS__SYS_COLUMNS = 7 +}; +/* The field numbers in the SYS_COLUMNS clustered index */ +enum dict_fld_sys_columns_enum { + DICT_FLD__SYS_COLUMNS__TABLE_ID = 0, + DICT_FLD__SYS_COLUMNS__POS = 1, + DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2, + 
DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_COLUMNS__NAME = 4, + DICT_FLD__SYS_COLUMNS__MTYPE = 5, + DICT_FLD__SYS_COLUMNS__PRTYPE = 6, + DICT_FLD__SYS_COLUMNS__LEN = 7, + DICT_FLD__SYS_COLUMNS__PREC = 8, + DICT_NUM_FIELDS__SYS_COLUMNS = 9 +}; +/* The columns in SYS_INDEXES */ +enum dict_col_sys_indexes_enum { + DICT_COL__SYS_INDEXES__TABLE_ID = 0, + DICT_COL__SYS_INDEXES__ID = 1, + DICT_COL__SYS_INDEXES__NAME = 2, + DICT_COL__SYS_INDEXES__N_FIELDS = 3, + DICT_COL__SYS_INDEXES__TYPE = 4, + DICT_COL__SYS_INDEXES__SPACE = 5, + DICT_COL__SYS_INDEXES__PAGE_NO = 6, + DICT_NUM_COLS__SYS_INDEXES = 7 +}; +/* The field numbers in the SYS_INDEXES clustered index */ +enum dict_fld_sys_indexes_enum { + DICT_FLD__SYS_INDEXES__TABLE_ID = 0, + DICT_FLD__SYS_INDEXES__ID = 1, + DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2, + DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3, + DICT_FLD__SYS_INDEXES__NAME = 4, + DICT_FLD__SYS_INDEXES__N_FIELDS = 5, + DICT_FLD__SYS_INDEXES__TYPE = 6, + DICT_FLD__SYS_INDEXES__SPACE = 7, + DICT_FLD__SYS_INDEXES__PAGE_NO = 8, + DICT_NUM_FIELDS__SYS_INDEXES = 9 +}; +/* The columns in SYS_FIELDS */ +enum dict_col_sys_fields_enum { + DICT_COL__SYS_FIELDS__INDEX_ID = 0, + DICT_COL__SYS_FIELDS__POS = 1, + DICT_COL__SYS_FIELDS__COL_NAME = 2, + DICT_NUM_COLS__SYS_FIELDS = 3 +}; +/* The field numbers in the SYS_FIELDS clustered index */ +enum dict_fld_sys_fields_enum { + DICT_FLD__SYS_FIELDS__INDEX_ID = 0, + DICT_FLD__SYS_FIELDS__POS = 1, + DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2, + DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FIELDS__COL_NAME = 4, + DICT_NUM_FIELDS__SYS_FIELDS = 5 +}; +/* The columns in SYS_FOREIGN */ +enum dict_col_sys_foreign_enum { + DICT_COL__SYS_FOREIGN__ID = 0, + DICT_COL__SYS_FOREIGN__FOR_NAME = 1, + DICT_COL__SYS_FOREIGN__REF_NAME = 2, + DICT_COL__SYS_FOREIGN__N_COLS = 3, + DICT_NUM_COLS__SYS_FOREIGN = 4 +}; +/* The field numbers in the SYS_FOREIGN clustered index */ +enum dict_fld_sys_foreign_enum { + DICT_FLD__SYS_FOREIGN__ID = 0, + DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1, + DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2, + DICT_FLD__SYS_FOREIGN__FOR_NAME = 3, + DICT_FLD__SYS_FOREIGN__REF_NAME = 4, + DICT_FLD__SYS_FOREIGN__N_COLS = 5, + DICT_NUM_FIELDS__SYS_FOREIGN = 6 +}; +/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */ +enum dict_fld_sys_foreign_for_name_enum { + DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0, + DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1, + DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2 +}; +/* The columns in SYS_FOREIGN_COLS */ +enum dict_col_sys_foreign_cols_enum { + DICT_COL__SYS_FOREIGN_COLS__ID = 0, + DICT_COL__SYS_FOREIGN_COLS__POS = 1, + DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2, + DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3, + DICT_NUM_COLS__SYS_FOREIGN_COLS = 4 +}; +/* The field numbers in the SYS_FOREIGN_COLS clustered index */ +enum dict_fld_sys_foreign_cols_enum { + DICT_FLD__SYS_FOREIGN_COLS__ID = 0, + DICT_FLD__SYS_FOREIGN_COLS__POS = 1, + DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2, + DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4, + DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5, + DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6 +}; +/* The columns in SYS_TABLESPACES */ +enum dict_col_sys_tablespaces_enum { + DICT_COL__SYS_TABLESPACES__SPACE = 0, + DICT_COL__SYS_TABLESPACES__NAME = 1, + DICT_COL__SYS_TABLESPACES__FLAGS = 2, + DICT_NUM_COLS__SYS_TABLESPACES = 3 +}; +/* The field numbers in the SYS_TABLESPACES clustered index */ +enum dict_fld_sys_tablespaces_enum { + 
DICT_FLD__SYS_TABLESPACES__SPACE	= 0,
+	DICT_FLD__SYS_TABLESPACES__DB_TRX_ID	= 1,
+	DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_TABLESPACES__NAME		= 3,
+	DICT_FLD__SYS_TABLESPACES__FLAGS	= 4,
+	DICT_NUM_FIELDS__SYS_TABLESPACES	= 5
+};
+/* The columns in SYS_DATAFILES */
+enum dict_col_sys_datafiles_enum {
+	DICT_COL__SYS_DATAFILES__SPACE		= 0,
+	DICT_COL__SYS_DATAFILES__PATH		= 1,
+	DICT_NUM_COLS__SYS_DATAFILES		= 2
+};
+/* The field numbers in the SYS_DATAFILES clustered index */
+enum dict_fld_sys_datafiles_enum {
+	DICT_FLD__SYS_DATAFILES__SPACE		= 0,
+	DICT_FLD__SYS_DATAFILES__DB_TRX_ID	= 1,
+	DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_DATAFILES__PATH		= 3,
+	DICT_NUM_FIELDS__SYS_DATAFILES		= 4
+};
+
+/* A number of the columns above occur in multiple tables. These are the
+lengths of those fields. */
+#define DICT_FLD_LEN_SPACE	4
+#define DICT_FLD_LEN_FLAGS	4
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN	256
+
+#ifndef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0boot.ic b/storage/xtradb/include/dict0boot.ic
new file mode 100644
index 00000000000..2b156a4f672
--- /dev/null
+++ b/storage/xtradb/include/dict0boot.ic
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.ic
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+row_id_t
+dict_sys_get_new_row_id(void)
+/*=========================*/
+{
+	row_id_t	id;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	id = dict_sys->row_id;
+
+	if (0 == (id % DICT_HDR_ROW_ID_WRITE_MARGIN)) {
+
+		dict_hdr_flush_row_id();
+	}
+
+	dict_sys->row_id++;
+
+	mutex_exit(&(dict_sys->mutex));
+
+	return(id);
+}
+
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return row id */
+UNIV_INLINE
+row_id_t
+dict_sys_read_row_id(
+/*=================*/
+	const byte*	field)	/*!< in: record field */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+	return(mach_read_from_6(field));
+}
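+
/* Editor's sketch (not part of this commit): dict_sys_get_new_row_id()
above only flushes DICT_HDR_ROW_ID to the header page once per
DICT_HDR_ROW_ID_WRITE_MARGIN (256) allocations, and the id itself is
stored in a 6-byte field (DATA_ROW_ID_LEN == 6). The stand-alone demo
below mimics the 6-byte round trip and the flush condition;
mimic_write_6()/mimic_read_6() are hypothetical stand-ins for
mach_write_to_6()/mach_read_from_6(), assuming big-endian byte order. */

#include <stdio.h>
#include <stdint.h>

#define WRITE_MARGIN 256	/* mirrors DICT_HDR_ROW_ID_WRITE_MARGIN */

static void mimic_write_6(unsigned char* b, uint64_t id)
{
	for (int i = 0; i < 6; i++) {	/* high byte first */
		b[i] = (unsigned char) (id >> (8 * (5 - i)));
	}
}

static uint64_t mimic_read_6(const unsigned char* b)
{
	uint64_t id = 0;
	for (int i = 0; i < 6; i++) {
		id = (id << 8) | b[i];
	}
	return id;
}

int main(void)
{
	unsigned char field[6];
	for (uint64_t id = 254; id < 258; id++) {
		mimic_write_6(field, id);
		printf("id=%llu read_back=%llu flush=%s\n",
		       (unsigned long long) id,
		       (unsigned long long) mimic_read_6(field),
		       (id % WRITE_MARGIN == 0) ? "yes" : "no");
	}
	return 0;	/* only id 256 would trigger the header flush */
}

+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form.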
*/ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /*!< in: record field */ + row_id_t row_id) /*!< in: row id */ +{ +#if DATA_ROW_ID_LEN != 6 +# error "DATA_ROW_ID_LEN != 6" +#endif + + mach_write_to_6(field, row_id); +} + +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. */ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ +{ + return(id < DICT_HDR_FIRST_ID); +} + + diff --git a/storage/xtradb/include/dict0crea.h b/storage/xtradb/include/dict0crea.h new file mode 100644 index 00000000000..67eab9058da --- /dev/null +++ b/storage/xtradb/include/dict0crea.h @@ -0,0 +1,246 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.h +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0crea_h +#define dict0crea_h + +#include "univ.i" +#include "dict0types.h" +#include "dict0dict.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +UNIV_INTERN +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as a memory data + structure */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit);/*!< in: true if the commit node should be + added to the query graph */ +/*********************************************************************//** +Creates an index create graph. +@return own: index create node */ +UNIV_INTERN +ind_node_t* +ind_create_graph_create( +/*====================*/ + dict_index_t* index, /*!< in: index to create, built as a memory data + structure */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit);/*!< in: true if the commit node should be + added to the query graph */ +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. 
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Truncates the index tree associated with a row in SYS_INDEXES table. +@return new root page number, or FIL_NULL on failure */ +UNIV_INTERN +ulint +dict_truncate_index_tree( +/*=====================*/ + dict_table_t* table, /*!< in: the table the index belongs to */ + ulint space, /*!< in: 0=truncate, + nonzero=create the index tree in the + given tablespace */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to + record in the clustered index of + SYS_INDEXES table. The cursor may be + repositioned in this call. */ + mtr_t* mtr); /*!< in: mtr having the latch + on the record page. The mtr may be + committed and restarted in this call. */ +/*******************************************************************//** +Drops the index tree associated with a row in SYS_INDEXES table. */ +UNIV_INTERN +void +dict_drop_index_tree( +/*=================*/ + rec_t* rec, /*!< in/out: record in the clustered index + of SYS_INDEXES table */ + mtr_t* mtr); /*!< in: mtr having the latch on the record page */ +/****************************************************************//** +Creates the foreign key constraints system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_create_or_check_foreign_constraint_tables(void); +/*================================================*/ +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. */ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id generation; + incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign)/*!< in/out: foreign key */ + __attribute__((nonnull)); + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Creates the tablespaces and datafiles system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_create_or_check_sys_tablespace(void); +/*=====================================*/ +/********************************************************************//** +Add a single tablespace definition to the data dictionary tables in the +database. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_tablespace_to_dictionary( +/*=====================================*/ + ulint space, /*!< in: tablespace id */ + const char* name, /*!< in: tablespace name */ + ulint flags, /*!< in: tablespace flags */ + const char* path, /*!< in: tablespace path */ + trx_t* trx, /*!< in: transaction */ + bool commit); /*!< in: if true then commit the + transaction */ +/********************************************************************//** +Add a foreign key definition to the data dictionary tables. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + const char* name, /*!< in: table name */ + const dict_foreign_t* foreign,/*!< in: foreign key */ + trx_t* trx) /*!< in/out: dictionary transaction */ + __attribute__((nonnull, warn_unused_result)); + +/* Table create node structure */ +struct tab_node_t{ + que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */ + dict_table_t* table; /*!< table to create, built as a memory data + structure with dict_mem_... functions */ + ins_node_t* tab_def; /* child node which does the insert of + the table definition; the row to be inserted + is built by the parent node */ + ins_node_t* col_def; /* child node which does the inserts of + the column definitions; the row to be inserted + is built by the parent node */ + commit_node_t* commit_node; + /* child node which performs a commit after + a successful table creation */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + ulint col_no; /*!< next column definition to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage */ +}; + +/* Table create node states */ +#define TABLE_BUILD_TABLE_DEF 1 +#define TABLE_BUILD_COL_DEF 2 +#define TABLE_COMMIT_WORK 3 +#define TABLE_ADD_TO_CACHE 4 +#define TABLE_COMPLETED 5 + +/* Index create node struct */ + +struct ind_node_t{ + que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */ + dict_index_t* index; /*!< index to create, built as a memory data + structure with dict_mem_... 
functions */ + ins_node_t* ind_def; /* child node which does the insert of + the index definition; the row to be inserted + is built by the parent node */ + ins_node_t* field_def; /* child node which does the inserts of + the field definitions; the row to be inserted + is built by the parent node */ + commit_node_t* commit_node; + /* child node which performs a commit after + a successful index creation */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + ulint page_no;/* root page number of the index */ + dict_table_t* table; /*!< table which owns the index */ + dtuple_t* ind_row;/* index definition row built */ + ulint field_no;/* next field definition to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage */ +}; + +/* Index create node states */ +#define INDEX_BUILD_INDEX_DEF 1 +#define INDEX_BUILD_FIELD_DEF 2 +#define INDEX_CREATE_INDEX_TREE 3 +#define INDEX_COMMIT_WORK 4 +#define INDEX_ADD_TO_CACHE 5 + +#ifndef UNIV_NONINL +#include "dict0crea.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0crea.ic b/storage/xtradb/include/dict0crea.ic new file mode 100644 index 00000000000..2d0d9dcb858 --- /dev/null +++ b/storage/xtradb/include/dict0crea.ic @@ -0,0 +1,98 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.ic +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "mem0mem.h" + +/*********************************************************************//** +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. +@return true if temporary table */ +UNIV_INTERN +bool +row_is_mysql_tmp_table_name( +/*========================*/ + const char* name) __attribute__((warn_unused_result)); + /*!< in: table name in the form + 'database/tablename' */ + + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. 
*/
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+	ulint*		id_nr,	/*!< in/out: number to use in id generation;
+				incremented if used */
+	const char*	name,	/*!< in: table name */
+	dict_foreign_t*	foreign)/*!< in/out: foreign key */
+{
+	if (foreign->id == NULL) {
+		/* Generate a new constraint id */
+		ulint	namelen	= strlen(name);
+		char*	id	= static_cast<char*>(
+			mem_heap_alloc(foreign->heap,
+				       namelen + 20));
+
+		if (row_is_mysql_tmp_table_name(name)) {
+
+			/* no overflow if number < 1e13 */
+			sprintf(id, "%s_ibfk_%lu", name,
+				(ulong) (*id_nr)++);
+		} else {
+			char	table_name[MAX_TABLE_NAME_LEN + 20] = "";
+			uint	errors = 0;
+
+			strncpy(table_name, name,
+				MAX_TABLE_NAME_LEN + 20);
+
+			innobase_convert_to_system_charset(
+				strchr(table_name, '/') + 1,
+				strchr(name, '/') + 1,
+				MAX_TABLE_NAME_LEN, &errors);
+
+			if (errors) {
+				strncpy(table_name, name,
+					MAX_TABLE_NAME_LEN + 20);
+			}
+
+			/* no overflow if number < 1e13 */
+			sprintf(id, "%s_ibfk_%lu", table_name,
+				(ulong) (*id_nr)++);
+
+			if (innobase_check_identifier_length(
+				strchr(id,'/') + 1)) {
+				return(DB_IDENTIFIER_TOO_LONG);
+			}
+		}
+		foreign->id = id;
+	}
+
+	return(DB_SUCCESS);
+}
+
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
new file mode 100644
index 00000000000..a52de9de11a
--- /dev/null
+++ b/storage/xtradb/include/dict0dict.h
@@ -0,0 +1,1849 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "univ.i"
+#include "db0err.h"
+#include "dict0types.h"
+#include "dict0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "hash0hash.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "trx0types.h"
+#include "row0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+	char*	a)	/*!< in/out: string to put in lower case */
+	__attribute__((nonnull));
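/* Editor's sketch (not part of this commit): the constraint names
produced by dict_create_add_foreign_id() in dict0crea.ic above, for a
table whose normalized name is hypothetically "test/child". Per the
comment on that function, the counter is local to the table. */

#include <stdio.h>

int main(void)
{
	const char*	name = "test/child";	/* database/table, normalized */
	unsigned long	id_nr = 1;		/* next free number for this table */
	char		id[64];

	for (int i = 0; i < 3; i++) {
		/* same format string as the source: "%s_ibfk_%lu" */
		snprintf(id, sizeof(id), "%s_ibfk_%lu", name, id_nr++);
		printf("%s\n", id);	/* test/child_ibfk_1, _2, _3 */
	}
	return 0;
}

+/********************************************************************//**
+Get the database name length in a table name.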
+@return database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+	const char*	name)	/*!< in: table name in the form
+				dbname '/' tablename */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Open a table from its database and table name; this is currently used by
+the foreign constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from
+heap memory passed in */
+UNIV_INTERN
+char*
+dict_get_referenced_table(
+/*======================*/
+	const char*	name,		/*!< in: foreign key table name */
+	const char*	database_name,	/*!< in: table db name */
+	ulint		database_name_len,/*!< in: db name length */
+	const char*	table_name,	/*!< in: table name */
+	ulint		table_name_len,	/*!< in: table name length */
+	dict_table_t**	table,		/*!< out: table object or NULL */
+	mem_heap_t*	heap);		/*!< in: heap memory */
+/*********************************************************************//**
+Frees a foreign key struct. */
+
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign);	/*!< in, own: foreign key struct */
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+UNIV_INTERN
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+	dict_table_t*	table);		/*!< in: table in the dictionary
+					memory cache */
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+UNIV_INTERN
+const char*
+dict_remove_db_name(
+/*================*/
+	const char*	name)	/*!< in: table name in the form
+				dbname '/' tablename */
+	__attribute__((nonnull, warn_unused_result));
+
+/** Operation to perform when opening a table */
+enum dict_table_op_t {
+	/** Expect the tablespace to exist. */
+	DICT_TABLE_OP_NORMAL = 0,
+	/** Drop any orphan indexes after an aborted online index creation */
+	DICT_TABLE_OP_DROP_ORPHAN,
+	/** Silently load the tablespace if it does not exist,
+	and do not load the definitions of incomplete indexes. */
+	DICT_TABLE_OP_LOAD_TABLESPACE
+};
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_id(
+/*==================*/
+	table_id_t	table_id,	/*!< in: table id */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	dict_table_op_t	table_op)	/*!< in: operation to perform */
+	__attribute__((warn_unused_result));
+/********************************************************************//**
+Decrements the count of open handles to a table. */
+UNIV_INTERN
+void
+dict_table_close(
+/*=============*/
+	dict_table_t*	table,		/*!< in/out: table */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	ibool		try_drop)	/*!< in: TRUE=try to drop any orphan
+					indexes after an aborted online
+					index creation */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Inits the data dictionary module.
*/
+UNIV_INTERN
+void
+dict_init(void);
+/*===========*/
+/********************************************************************//**
+Gets the space id of every table of the data dictionary and makes a linear
+list and a hash table of them to the data dictionary cache. This function
+can be called at database startup if we did not need to do a crash recovery.
+In crash recovery we must scan the space id's from the .ibd files in MySQL
+database directories. */
+UNIV_INTERN
+void
+dict_load_space_id_list(void);
+/*=========================*/
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+ulint
+dict_col_get_mbminlen(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+ulint
+dict_col_get_mbmaxlen(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets the minimum and maximum number of bytes per character. */
+UNIV_INLINE
+void
+dict_col_set_mbminmaxlen(
+/*=====================*/
+	dict_col_t*	col,		/*!< in/out: column */
+	ulint		mbminlen,	/*!< in: minimum multi-byte
+					character size, in bytes */
+	ulint		mbmaxlen)	/*!< in: maximum multi-byte
+					character size, in bytes */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+	const dict_col_t*	col,	/*!< in: column */
+	dtype_t*		type)	/*!< out: data type */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+	dict_table_t*		table,	/*!< in: table */
+	const dict_col_t*	col)	/*!< in: column which index prefix
+					is based on */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	const dtype_t*		type)	/*!< in: data type */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+	__attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the size of a fixed size column, 0 if not a fixed size column. +@return fixed size, or 0 */ +UNIV_INLINE +ulint +dict_col_get_fixed_size( +/*====================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +ulint +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +UNIV_INTERN +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Acquire the autoinc lock. */ +UNIV_INTERN +void +dict_table_autoinc_lock( +/*====================*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); +/********************************************************************//** +Unconditionally set the autoinc counter. */ +UNIV_INTERN +void +dict_table_autoinc_initialize( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + ib_uint64_t value) /*!< in: next value to assign to a row */ + __attribute__((nonnull)); +/********************************************************************//** +Reads the next autoinc value (== autoinc counter value), 0 if not yet +initialized. +@return value for a new row, or 0 */ +UNIV_INTERN +ib_uint64_t +dict_table_autoinc_read( +/*====================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Updates the autoinc counter if the value supplied is greater than the +current value. 
*/
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+
+	dict_table_t*	table,	/*!< in/out: table */
+	ib_uint64_t	value)	/*!< in: value which was assigned to a row */
+	__attribute__((nonnull));
+/********************************************************************//**
+Release the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Adds system columns to a table object. */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	mem_heap_t*	heap)	/*!< in: temporary heap */
+	__attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+	dict_table_t*	table,		/*!< in: table */
+	ibool		can_be_evicted,	/*!< in: TRUE if can be evicted */
+	mem_heap_t*	heap)		/*!< in: temporary heap */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table)	/*!< in, own: table */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Renames a table object.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	const char*	new_name,	/*!< in: new name */
+	ibool		rename_also_foreigns)
+					/*!< in: in ALTER TABLE we want
+					to preserve the original table name
+					in constraints which reference it */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index)	/*!< in, own: index */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table object already in cache */
+	table_id_t	new_id)	/*!< in: new id to set */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+UNIV_INTERN
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign constraint */
+	__attribute__((nonnull));
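/* Editor's sketch (not part of this commit): the dict_table_autoinc_*
functions above are meant to be called in the pattern
dict_table_autoinc_lock(); read/initialize/update; then
dict_table_autoinc_unlock(). The stand-alone demo below mimics only the
update-if-greater semantics documented above, on a hypothetical
mock_table_t; the counter never moves backwards. */

#include <stdio.h>
#include <stdint.h>

typedef struct { uint64_t autoinc; } mock_table_t;	/* hypothetical */

/* mirrors the documented behaviour of
dict_table_autoinc_update_if_greater() */
static void mock_autoinc_update_if_greater(mock_table_t* t, uint64_t v)
{
	if (v > t->autoinc) {
		t->autoinc = v;
	}
}

int main(void)
{
	mock_table_t t = { 100 };
	mock_autoinc_update_if_greater(&t, 90);		/* no-op: 90 < 100 */
	mock_autoinc_update_if_greater(&t, 250);	/* advances counter */
	printf("autoinc=%llu\n", (unsigned long long) t.autoinc);	/* 250 */
	return 0;
}

+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in the
+cache. At least one of foreign table or referenced table must already be in
+the dictionary cache!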
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_foreign_add_to_cache(
+/*======================*/
+	dict_foreign_t*		foreign,
+				/*!< in, own: foreign key constraint */
+	const char**		col_names,
+				/*!< in: column names, or NULL to use
+				foreign->foreign_table->col_names */
+	bool			check_charsets,
+				/*!< in: whether to check charset
+				compatibility */
+	dict_err_ignore_t	ignore_err)
+				/*!< in: error to be ignored */
+	__attribute__((nonnull(1), warn_unused_result));
+/*********************************************************************//**
+Check if the index is referenced by a foreign key; if so, return the
+matching instance, NULL otherwise.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+	dict_table_t*	table,	/*!< in: InnoDB table */
+	dict_index_t*	index)	/*!< in: InnoDB index */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+	const dict_table_t*	table)	/*!< in: InnoDB table */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
+UNIV_INTERN
+bool
+dict_foreign_replace_index(
+/*=======================*/
+	dict_table_t*		table,	/*!< in/out: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const dict_index_t*	index)	/*!< in: index to be replaced */
+	__attribute__((nonnull(1,3), warn_unused_result));
+/**********************************************************************//**
+Determines whether a string starts with the specified keyword.
+@return TRUE if str starts with keyword */
+UNIV_INTERN
+ibool
+dict_str_starts_with_keyword(
+/*=========================*/
+	THD*		thd,		/*!< in: MySQL thread handle */
+	const char*	str,		/*!< in: string to scan for keyword */
+	const char*	keyword)	/*!< in: keyword to look for */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Checks if an index is defined for a foreign key constraint. An index is
+part of a foreign key constraint if it is referenced by a foreign key
+or is a foreign key index.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+	dict_table_t*	table,	/*!< in: InnoDB table */
+	dict_index_t*	index)	/*!< in: InnoDB index */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied by indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_foreign_constraints(
+/*============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	sql_string,	/*!< in: table create statement where
+					foreign keys are declared like:
+					FOREIGN KEY (a, b) REFERENCES
+					table2(c, d), table2 can be written
+					also with the database
+					name before it: test.table2; the
+					default database is the
+					database of the parameter name */
+	size_t		sql_length,	/*!< in: length of sql_string */
+	const char*	name,		/*!< in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks)	/*!< in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+UNIV_INTERN
+dberr_t
+dict_foreign_parse_drop_constraints(
+/*================================*/
+	mem_heap_t*	heap,			/*!< in: heap from which we can
+						allocate memory */
+	trx_t*		trx,			/*!< in: transaction */
+	dict_table_t*	table,			/*!< in: table */
+	ulint*		n,			/*!< out: number of constraints
+						to drop */
+	const char***	constraints_to_drop)	/*!< out: id's of the
+						constraints to drop */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_name(
+/*====================*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	ibool		try_drop,	/*!< in: TRUE=try to drop any orphan
+					indexes after an aborted online
+					index creation */
+	dict_err_ignore_t
+			ignore_err)	/*!< in: error to be ignored when
+					loading the table */
+	__attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order and is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets. only has an effect
+					if types_idx != NULL */
+	ulint			check_null)
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
+	__attribute__((nonnull(1,3), warn_unused_result));
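/* Editor's sketch (not part of this commit): the shape of the
sql_string that dict_create_foreign_constraints() above scans. Per its
parameter comments, the referenced table may be written unqualified
(resolved against the default database, i.e. the database of the
parameter name) or database-qualified, e.g. test.table2. The table and
statement below are hypothetical. */

#include <stdio.h>

int main(void)
{
	const char* sql_string =
		"CREATE TABLE child (a INT, b INT,"
		" FOREIGN KEY (a, b) REFERENCES test.table2(c, d))";
	/* the name parameter would be the normalized form, e.g.
	"test/child" */
	printf("%s\n", sql_string);
	return 0;
}

+/**********************************************************************//**
+Returns a column's name.
+@return column name. NOTE: not guaranteed to stay valid if table is
+modified in any way (columns added, etc.).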
*/
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			col_nr)	/*!< in: column number */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Prints a table data. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+	ibool		create_table_format,	/*!< in: if TRUE then print in
+				a format suitable to be inserted into
+				a CREATE TABLE, otherwise in the format
+				of SHOW TABLE STATUS */
+	FILE*		file,	/*!< in: file where to print */
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+	FILE*		file,		/*!< in: file where to print */
+	trx_t*		trx,		/*!< in: transaction */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
+	ibool		add_newline)	/*!< in: whether to add a newline */
+	__attribute__((nonnull(1,3)));
+/********************************************************************//**
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+	FILE*			file,	/*!< in: output stream */
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to print */
+	__attribute__((nonnull(1,3)));
+/*********************************************************************//**
+Checks whether the given index's first fields are the columns in the
+array, in the same order, and the index is not marked for deletion and
+is not the same as types_idx.
+@return true if the index qualifies, false otherwise */
+UNIV_INTERN
+bool
+dict_foreign_qualify_index(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols,	/*!< in: number of columns */
+	const dict_index_t*	index,	/*!< in: index to check */
+	const dict_index_t*	types_idx,
+					/*!< in: NULL or an index
+					whose types the column types
+					must match */
+	bool			check_charsets,
+					/*!< in: whether to check
+					charsets. only has an effect
+					if types_idx != NULL */
+	ulint			check_null)
+					/*!< in: nonzero if none of
+					the columns must be declared
+					NOT NULL */
+	__attribute__((nonnull(1,3), warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/* Skip corrupted index */
+#define dict_table_skip_corrupt_index(index)			\
+	while (index && dict_index_is_corrupted(index)) {	\
+		index = dict_table_get_next_index(index);	\
+	}
+
+/* Get the next non-corrupt index */
+#define dict_table_next_uncorrupted_index(index)		\
+do {								\
+	index = dict_table_get_next_index(index);		\
+	dict_table_skip_corrupt_index(index);			\
+} while (0)
+
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Check whether the index is unique.
+@return nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, pure, warn_unused_result));
+
+/************************************************************************
+Gets all the FTS indexes for the table. NOTE: must not be called for
+tables which do not have an FTS-index. */
+UNIV_INTERN
+ulint
+dict_table_get_all_fts_indexes(
+/*===========================*/
+				/* out: number of indexes collected */
+	dict_table_t*	table,	/* in: table */
+	ib_vector_t*	indexes)/* out: vector for collecting FTS indexes */
+	__attribute__((nonnull));
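/* Editor's sketch (not part of this commit): how the two corrupt-index
macros above are meant to be used when walking a table's index list.
The mock_index struct and helpers below are hypothetical stand-ins for
dict_index_t, dict_index_is_corrupted() and
dict_table_get_next_index(). */

#include <stdio.h>

struct mock_index {
	const char*		name;
	int			corrupted;	/* stands in for dict_index_is_corrupted() */
	struct mock_index*	next;		/* stands in for dict_table_get_next_index() */
};

#define mock_skip_corrupt_index(index)		\
	while (index && (index)->corrupted) {	\
		index = (index)->next;		\
	}

#define mock_next_uncorrupted_index(index)	\
do {						\
	index = (index)->next;			\
	mock_skip_corrupt_index(index);		\
} while (0)

int main(void)
{
	struct mock_index i3 = { "k3", 0, NULL };
	struct mock_index i2 = { "k2", 1, &i3 };	/* corrupted: skipped */
	struct mock_index i1 = { "PRIMARY", 0, &i2 };

	struct mock_index* index = &i1;
	mock_skip_corrupt_index(index);
	while (index) {
		printf("visit %s\n", index->name);	/* PRIMARY, k3 */
		mock_next_uncorrupted_index(index);
	}
	return 0;
}

+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.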
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Gets the approximate number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			pos)	/*!< in: position of column */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	__attribute__((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos) \
+((table)->cols + (pos))
+#define dict_table_get_sys_col(table, sys) \
+((table)->cols + (table)->n_cols + (sys) - DATA_N_SYS_COLS)
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+	__attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Determine the file format of a table.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Determine the file format from dict_table_t::flags.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_tf_get_format(
+/*===============*/
+	ulint		flags)	/*!< in: dict_table_t::flags */
+	__attribute__((warn_unused_result));
+/********************************************************************//**
+Set the various values in a dict_table_t::flags pointer. */
+UNIV_INLINE
+void
+dict_tf_set(
+/*========*/
+	ulint*		flags,		/*!< in/out: table */
+	rec_format_t	format,		/*!< in: file format */
+	ulint		zip_ssize,	/*!< in: zip shift size */
+	bool		remote_path)	/*!< in: table uses DATA DIRECTORY */
+	__attribute__((nonnull));
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is
+written into the tablespace header at the offset FSP_SPACE_FLAGS and is
+also stored in the fil_space_t::flags field. The following chart shows
+the translation of the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags |     0     |    1    |     1      |    1
+fil_space_t::flags  |     0     |    0    |     1      |    1
+==================================================================
+@return tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(
+/*=================*/
+	ulint	flags)	/*!< in: dict_table_t::flags */
+	__attribute__((const));
+/********************************************************************//**
+Extract the compressed page size from table flags.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_zip_size(
+/*=================*/
+	ulint	flags)	/*!< in: flags */
+	__attribute__((const));
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
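/* Editor's sketch (not part of this commit): the low-order-bit
translation from the chart above, tabulated directly. Note that COMPACT
sets the low bit of dict_table_t::flags but not of fil_space_t::flags;
per the chart, only COMPRESSED and DYNAMIC set both. */

#include <stdio.h>

int main(void)
{
	struct {
		const char*	format;
		int		dict_low_bit;	/* dict_table_t::flags & 1 */
		int		fsp_low_bit;	/* fil_space_t::flags & 1 */
	} rows[] = {
		{ "REDUNDANT",  0, 0 },
		{ "COMPACT",    1, 0 },
		{ "COMPRESSED", 1, 1 },
		{ "DYNAMIC",    1, 1 },
	};
	for (unsigned i = 0; i < sizeof(rows) / sizeof(rows[0]); i++) {
		printf("%-10s dict=%d fsp=%d\n", rows[i].format,
		       rows[i].dict_low_bit, rows[i].fsp_low_bit);
	}
	return 0;
}

+/*********************************************************************//**
+Obtain exclusive locks on all index trees of the table. This is to prevent
+accessing index trees while InnoDB is updating internal metadata for
+operations such as truncating tables.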
*/
+UNIV_INLINE
+void
+dict_table_x_lock_indexes(
+/*======================*/
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Release the exclusive locks on all index trees. */
+UNIV_INLINE
+void
+dict_table_x_unlock_indexes(
+/*========================*/
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n)	/*!< in: column number */
+	__attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+	dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+	dtuple_t*		tuple,	/*!< in/out: data tuple */
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
+/********************************************************************
+Wait until all the background threads of the given table have exited, i.e.,
+bg_threads == 0. Note: bg_threads_mutex must be reserved when
+calling this. */
+UNIV_INTERN
+void
+dict_table_wait_for_bg_threads_to_exit(
+/*===================================*/
+	dict_table_t*	table,	/* in: table */
+	ulint		delay)	/* in: time in microseconds to wait between
+				checks of bg_threads. */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+	index_id_t	id)	/*!< in: index id */
+	__attribute__((warn_unused_result));
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of FK relationship and currently not used in any user
+transaction. There is no guarantee that it will remove a table.
+@return number of tables evicted. */
+UNIV_INTERN
+ulint
+dict_make_room_in_cache(
+/*====================*/
+	ulint		max_tables,	/*!< in: max tables allowed in cache */
+	ulint		pct_check);	/*!< in: max percent to check */
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
+UNIV_INTERN
+dberr_t
+dict_index_add_to_cache(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table on which the index is */
+	dict_index_t*	index,	/*!< in, own: index; NOTE! The index memory
+				object is freed in this function!
*/
+	ulint		page_no,/*!< in: root page number of the index */
+	ibool		strict)	/*!< in: TRUE=refuse to create the index
+				if records could be too big to fit in
+				a B-tree page */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index)	/*!< in, own: index */
+	__attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: an internal
+					representation of index (in
+					the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+	__attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of field */
+	__attribute__((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the column number of the nth field in an index. +@return column number */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INTERN +ulint +dict_index_get_nth_col_or_prefix_pos( +/*=================================*/ + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + ibool inc_prefix) /*!< in: TRUE=consider + column prefixes too */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Returns TRUE if the index contains a column or a prefix of that column. +@return TRUE if contains the column or its prefix */ +UNIV_INTERN +ibool +dict_index_contains_col_or_prefix( +/*==============================*/ + const dict_index_t* index, /*!< in: index */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INTERN +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + const dict_index_t* index, /*!< in: index from which to search */ + const dict_index_t* index2, /*!< in: index */ + ulint n) /*!< in: field number in index2 */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n position in the clustered index. +@return position in internal representation of the clustered index */ +UNIV_INTERN +ulint +dict_table_get_nth_col_pos( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Returns the position of a system column in an index. +@return position, ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_sys_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint type) /*!< in: DATA_ROW_ID, ... 
*/ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Adds a column to index. */ +UNIV_INTERN +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + const dict_table_t* table, /*!< in: table */ + dict_col_t* col, /*!< in: column */ + ulint prefix_len) /*!< in: column prefix length */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Copies types of fields contained in index to tuple. */ +UNIV_INTERN +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_index_t* index, /*!< in: index */ + ulint n_fields) /*!< in: number of + field types to copy */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Gets the field column. +@return field->col, pointer to the table column */ +UNIV_INLINE +const dict_col_t* +dict_field_get_col( +/*===============*/ + const dict_field_t* field) /*!< in: index field */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys->mutex is already being held. +@return index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + index_id_t index_id) /*!< in: index id */ + __attribute__((warn_unused_result)); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +@return index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + index_id_t index_id) /*!< in: index id */ + __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. +@return TRUE if ok */ +UNIV_INTERN +ibool +dict_index_check_search_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index tree */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ + __attribute__((nonnull, warn_unused_result)); +/** Whether and when to allow temporary index names */ +enum check_name { + /** Require all indexes to be complete. */ + CHECK_ALL_COMPLETE, + /** Allow aborted online index creation. */ + CHECK_ABORTED_OK, + /** Allow partial indexes to exist. */ + CHECK_PARTIAL_OK +}; +/**********************************************************************//** +Check for duplicate index entries in a table [using the index name] */ +UNIV_INTERN +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table, /*!< in: Check for dup indexes + in this table */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ + __attribute__((nonnull)); +#endif /* UNIV_DEBUG */ +/**********************************************************************//** +Builds a node pointer out of a physical record and a page number. 
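+For example, during a B-tree page split a caller could build the node
+pointer for the new page and insert it at the level above (an illustrative
+sketch only, not taken from this file; "rec" is assumed to point to the
+first user record on the page):
+
+	dtuple_t*	node_ptr = dict_index_build_node_ptr(
+		index, rec, page_no, heap, level);
+
+The returned tuple carries the unique record prefix plus the child page
+number, which is exactly what the non-leaf level above stores.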
+@return own: node pointer */ +UNIV_INTERN +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap, /*!< in: memory heap where pointer + created */ + ulint level) /*!< in: level of rec in tree: + 0 means leaf level */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Copies an initial segment of a physical record, long enough to specify an +index entry uniquely. +@return pointer to the prefix record */ +UNIV_INTERN +rec_t* +dict_index_copy_rec_order_prefix( +/*=============================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to + copy prefix */ + ulint* n_fields,/*!< out: number of fields copied */ + byte** buf, /*!< in/out: memory buffer for the + copied prefix, or NULL */ + ulint* buf_size)/*!< in/out: buffer size */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Builds a typed data tuple out of a physical record. +@return own: data tuple */ +UNIV_INTERN +dtuple_t* +dict_index_build_data_tuple( +/*========================*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record for which to build data tuple */ + ulint n_fields,/*!< in: number of data fields */ + mem_heap_t* heap) /*!< in: memory heap where tuple created */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the space id of the root of the index tree. +@return space id */ +UNIV_INLINE +ulint +dict_index_get_space( +/*=================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets the space id of the root of the index tree. */ +UNIV_INLINE +void +dict_index_set_space( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + ulint space) /*!< in: space id */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets the page number of the root of the index tree. +@return page number */ +UNIV_INLINE +ulint +dict_index_get_page( +/*================*/ + const dict_index_t* tree) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the read-write lock of the index tree. +@return read-write lock */ +UNIV_INLINE +prio_rw_lock_t* +dict_index_get_lock( +/*================*/ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. +@return number of free bytes on page, reserved for updates */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void); +/*==============================*/ + +/* Online index creation @{ */ +/********************************************************************//** +Gets the status of online index creation. 
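+Because the status can change asynchronously once log application is done,
+a hypothetical reader would re-check it under the index latch (illustrative
+sketch only):
+
+	rw_lock_s_lock(dict_index_get_lock(index));
+	if (dict_index_get_online_status(index) == ONLINE_INDEX_CREATION) {
+		... the index build is still in progress; log the change ...
+	}
+	rw_lock_s_unlock(dict_index_get_lock(index));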
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+	const dict_index_t*	index)	/*!< in: secondary index */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+	dict_index_t*			index,	/*!< in/out: index */
+	enum online_index_status	status)	/*!< in: status */
+	__attribute__((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void);
+/*============================*/
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void);
+/*===========================*/
+
+/** Create a dict_table_t's stats latch or delay it for lazy creation.
+This function is called only from a single-threaded environment
+or from a thread that has not yet shared the table object with other threads.
+@param[in,out]	table	table whose stats latch to create
+@param[in]	enabled	if false then the latch is disabled
+and dict_table_stats_lock()/unlock() become no-ops on this table. */
+
+void
+dict_table_stats_latch_create(
+	dict_table_t*	table,
+	bool		enabled);
+
+/** Destroy a dict_table_t's stats latch.
+This function is called only from a single-threaded environment
+or from a thread that has not yet shared the table object with other threads.
+@param[in,out]	table	table whose stats latch to destroy */
+
+void
+dict_table_stats_latch_destroy(
+	dict_table_t*	table);
+
+/**********************************************************************//**
+Lock the appropriate latch to protect a given table's statistics.
+table->id is used to pick the corresponding latch from a global array of
+latches.
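+A hypothetical reader of the statistics fields brackets the access like
+this (illustrative sketch only):
+
+	dict_table_stats_lock(table, RW_S_LATCH);
+	n_rows = table->stat_n_rows;
+	dict_table_stats_unlock(table, RW_S_LATCH);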
*/ +UNIV_INTERN +void +dict_table_stats_lock( +/*==================*/ + dict_table_t* table, /*!< in: table */ + ulint latch_mode); /*!< in: RW_S_LATCH or RW_X_LATCH */ +/**********************************************************************//** +Unlock the latch that has been locked by dict_table_stats_lock() */ +UNIV_INTERN +void +dict_table_stats_unlock( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ulint latch_mode); /*!< in: RW_S_LATCH or RW_X_LATCH */ +/********************************************************************//** +Checks if the database name in two table names is the same. +@return TRUE if same db name */ +UNIV_INTERN +ibool +dict_tables_have_same_db( +/*=====================*/ + const char* name1, /*!< in: table name in the form + dbname '/' tablename */ + const char* name2) /*!< in: table name in the form + dbname '/' tablename */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Removes an index from the cache */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index) /*!< in, own: index */ + __attribute__((nonnull)); +/**********************************************************************//** +Get index by name +@return index, NULL if does not exist */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name( +/*=========================*/ + dict_table_t* table, /*!< in: table */ + const char* name) /*!< in: name of the index to find */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +In case there is more than one index with the same name return the index +with the min(id). +@return index, NULL if does not exist */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name_and_min_id( +/*====================================*/ + dict_table_t* table, /*!< in: table */ + const char* name) /*!< in: name of the index to find */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************** +Check whether a column exists in an FTS index. */ +UNIV_INLINE +ulint +dict_table_is_fts_column( +/*=====================*/ + /* out: ULINT_UNDEFINED if no match else + the offset within the vector */ + ib_vector_t* indexes,/* in: vector containing only FTS indexes */ + ulint col_no) /* in: col number to search for */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Move a table to the non LRU end of the LRU list. */ +UNIV_INTERN +void +dict_table_move_from_lru_to_non_lru( +/*================================*/ + dict_table_t* table) /*!< in: table to move from LRU to non-LRU */ + __attribute__((nonnull)); +/**********************************************************************//** +Move a table to the LRU list from the non-LRU list. */ +UNIV_INTERN +void +dict_table_move_from_non_lru_to_lru( +/*================================*/ + dict_table_t* table) /*!< in: table to move from non-LRU to LRU */ + __attribute__((nonnull)); +/**********************************************************************//** +Move to the most recently used segment of the LRU list. */ +UNIV_INTERN +void +dict_move_to_mru( +/*=============*/ + dict_table_t* table) /*!< in: table to move to MRU */ + __attribute__((nonnull)); + +/** Maximum number of columns in a foreign key constraint. 
Please note that MySQL
+has a much lower limit on the number of columns allowed in a foreign key
+constraint */
+#define MAX_NUM_FK_COLUMNS		500
+
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE*		dict_foreign_err_file;
+extern ib_mutex_t	dict_foreign_err_mutex; /* mutex protecting the buffers */
+
+/** the dictionary system */
+extern dict_sys_t*	dict_sys;
+/** the data dictionary rw-latch protecting dict_sys */
+extern rw_lock_t	dict_operation_lock;
+
+/* Dictionary system struct */
+struct dict_sys_t{
+	ib_prio_mutex_t	mutex;		/*!< mutex protecting the data
+					dictionary; protects also the
+					disk-based dictionary system tables;
+					this mutex serializes CREATE TABLE
+					and DROP TABLE, as well as reading
+					the dictionary data for a table from
+					system tables */
+	row_id_t	row_id;		/*!< the next row id to assign;
+					NOTE that at a checkpoint this
+					must be written to the dict system
+					header and flushed to a file; in
+					recovery this must be derived from
+					the log records */
+	hash_table_t*	table_hash;	/*!< hash table of the tables, based
+					on name */
+	hash_table_t*	table_id_hash;	/*!< hash table of the tables, based
+					on id */
+	ulint		size;		/*!< varying space in bytes occupied
+					by the data dictionary table and
+					index objects */
+	dict_table_t*	sys_tables;	/*!< SYS_TABLES table */
+	dict_table_t*	sys_columns;	/*!< SYS_COLUMNS table */
+	dict_table_t*	sys_indexes;	/*!< SYS_INDEXES table */
+	dict_table_t*	sys_fields;	/*!< SYS_FIELDS table */
+
+	/*=============================*/
+	UT_LIST_BASE_NODE_T(dict_table_t)
+			table_LRU;	/*!< List of tables that can be evicted
+					from the cache */
+	UT_LIST_BASE_NODE_T(dict_table_t)
+			table_non_LRU;	/*!< List of tables that can't be
+					evicted from the cache */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
+extern dict_index_t*	dict_ind_redundant;
+/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */
+extern dict_index_t*	dict_ind_compact;
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact. */
+UNIV_INTERN
+void
+dict_ind_init(void);
+/*===============*/
+
+/* Auxiliary structs for checking a table definition @{ */
+
+/* This struct is used to specify the name and type that a column must
+have when checking a table's schema.
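+For example, a hypothetical check for a two-column table could be set up
+with the structs defined below and passed to dict_table_schema_check()
+(an illustrative sketch only; the column types are assumptions):
+
+	dict_col_meta_t		cols[2] = {
+		{"id", DATA_INT, DATA_NOT_NULL, 4},
+		{"name", DATA_VARCHAR, 0, 0}
+	};
+	dict_table_schema_t	schema = {"db/tbl", 2, cols, 0, 0};
+	dberr_t			err = dict_table_schema_check(
+		&schema, errstr, sizeof(errstr));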
*/ +struct dict_col_meta_t { + const char* name; /* column name */ + ulint mtype; /* required column main type */ + ulint prtype_mask; /* required column precise type mask; + if this is non-zero then all the + bits it has set must also be set + in the column's prtype */ + ulint len; /* required column length */ +}; + +/* This struct is used for checking whether a given table exists and +whether it has a predefined schema (number of columns and columns names +and types) */ +struct dict_table_schema_t { + const char* table_name; /* the name of the table whose + structure we are checking */ + ulint n_cols; /* the number of columns the + table must have */ + dict_col_meta_t* columns; /* metadata for the columns; + this array has n_cols + elements */ + ulint n_foreign; /* number of foreign keys this + table has, pointing to other + tables (where this table is + FK child) */ + ulint n_referenced; /* number of foreign keys other + tables have, pointing to this + table (where this table is + parent) */ +}; +/* @} */ + +/*********************************************************************//** +Checks whether a table exists and whether it has the given structure. +The table must have the same number of columns with the same names and +types. The order of the columns does not matter. +The caller must own the dictionary mutex. +dict_table_schema_check() @{ +@return DB_SUCCESS if the table exists and contains the necessary columns */ +UNIV_INTERN +dberr_t +dict_table_schema_check( +/*====================*/ + dict_table_schema_t* req_schema, /*!< in/out: required table + schema */ + char* errstr, /*!< out: human readable error + message if != DB_SUCCESS and + != DB_TABLE_NOT_FOUND is + returned */ + size_t errstr_sz) /*!< in: errstr size */ + __attribute__((nonnull, warn_unused_result)); +/* @} */ + +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +UNIV_INTERN +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g. aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ + __attribute__((nonnull)); + +/**********************************************************************//** +Closes the data dictionary module. */ +UNIV_INTERN +void +dict_close(void); +/*============*/ +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Check whether the table is corrupted. +@return nonzero for corrupted table, zero for valid tables */ +UNIV_INLINE +ulint +dict_table_is_corrupted( +/*====================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); + +/**********************************************************************//** +Check whether the index is corrupted. 
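+A hypothetical caller typically refuses to use a corrupted table or index
+at all (illustrative sketch only):
+
+	if (dict_table_is_corrupted(table)
+	    || dict_index_is_corrupted(index)) {
+		return(DB_CORRUPTION);
+	}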
+@return nonzero for corrupted index, zero for valid indexes */ +UNIV_INLINE +ulint +dict_index_is_corrupted( +/*====================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); + +#endif /* !UNIV_HOTBACKUP */ +/**********************************************************************//** +Flags an index and table corrupted both in the data dictionary cache +and in the system table SYS_INDEXES. */ +UNIV_INTERN +void +dict_set_corrupted( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx, /*!< in/out: transaction */ + const char* ctx) /*!< in: context */ + UNIV_COLD __attribute__((nonnull)); + +/**********************************************************************//** +Flags an index corrupted in the data dictionary cache only. This +is used mostly to mark a corrupted index when index's own dictionary +is corrupted, and we force to load such index for repair purpose */ +UNIV_INTERN +void +dict_set_corrupted_index_cache_only( +/*================================*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); + +/**********************************************************************//** +Flags a table with specified space_id corrupted in the table dictionary +cache. +@return TRUE if successful */ +UNIV_INTERN +ibool +dict_set_corrupted_by_space( +/*========================*/ + ulint space_id); /*!< in: space ID */ + +/********************************************************************//** +Validate the table flags. +@return true if valid. */ +UNIV_INLINE +bool +dict_tf_is_valid( +/*=============*/ + ulint flags) /*!< in: table flags */ + __attribute__((warn_unused_result)); + +/********************************************************************//** +Check if the tablespace for the table has been discarded. +@return true if the tablespace has been discarded. */ +UNIV_INLINE +bool +dict_table_is_discarded( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ + __attribute__((nonnull, pure, warn_unused_result)); + +/********************************************************************//** +Check if it is a temporary table. +@return true if temporary table flag is set. */ +UNIV_INLINE +bool +dict_table_is_temporary( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ + __attribute__((nonnull, pure, warn_unused_result)); + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + __attribute__((nonnull)); +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + __attribute__((nonnull)); +/*********************************************************************//** +Return the optimal page size, for which page will likely compress. 
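+Together with dict_index_zip_success() and dict_index_zip_failure()
+declared above, this forms an adaptive padding feedback loop (illustrative
+sketch only; "compressed_ok" is a hypothetical flag holding the result of
+a page compression attempt):
+
+	if (compressed_ok) {
+		dict_index_zip_success(index);
+	} else {
+		dict_index_zip_failure(index);
+	}
+	pad_limit = dict_index_zip_pad_optimal_page_size(index);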
+@return page size beyond which page may not compress*/ +UNIV_INTERN +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Convert table flag to row format string. +@return row format name */ +UNIV_INTERN +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag); /*!< in: row format setting */ +/*****************************************************************//** +Get index by first field of the index +@return index which is having first field matches +with the field present in field_index position of table */ +UNIV_INLINE +dict_index_t* +dict_table_get_index_on_first_col( +/*==============================*/ + const dict_table_t* table, /*!< in: table */ + ulint col_index); /*!< in: position of column + in table */ + +#endif /* !UNIV_HOTBACKUP */ +/************************************************************************* +set is_corrupt flag by space_id*/ + +void +dict_table_set_corrupt_by_space( +/*============================*/ + ulint space_id, + ibool need_mutex); + +#ifndef UNIV_NONINL +#include "dict0dict.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic new file mode 100644 index 00000000000..6bfd7f6cdae --- /dev/null +++ b/storage/xtradb/include/dict0dict.ic @@ -0,0 +1,1433 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0dict.ic +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "data0type.h" +#ifndef UNIV_HOTBACKUP +#include "dict0load.h" +#include "rem0types.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "sync0rw.h" /* RW_S_LATCH */ + +/*********************************************************************//** +Gets the minimum number of bytes per character. +@return minimum multi-byte char size, in bytes */ +UNIV_INLINE +ulint +dict_col_get_mbminlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return(DATA_MBMINLEN(col->mbminmaxlen)); +} +/*********************************************************************//** +Gets the maximum number of bytes per character. 
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+ulint
+dict_col_get_mbmaxlen(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+{
+	return(DATA_MBMAXLEN(col->mbminmaxlen));
+}
+/*********************************************************************//**
+Sets the minimum and maximum number of bytes per character. */
+UNIV_INLINE
+void
+dict_col_set_mbminmaxlen(
+/*=====================*/
+	dict_col_t*	col,		/*!< in/out: column */
+	ulint		mbminlen,	/*!< in: minimum multi-byte
+					character size, in bytes */
+	ulint		mbmaxlen)	/*!< in: maximum multi-byte
+					character size, in bytes */
+{
+	ut_ad(mbminlen < DATA_MBMAX);
+	ut_ad(mbmaxlen < DATA_MBMAX);
+	ut_ad(mbminlen <= mbmaxlen);
+
+	col->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen);
+}
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+	const dict_col_t*	col,	/*!< in: column */
+	dtype_t*		type)	/*!< out: data type */
+{
+	ut_ad(col && type);
+
+	type->mtype = col->mtype;
+	type->prtype = col->prtype;
+	type->len = col->len;
+	type->mbminmaxlen = col->mbminmaxlen;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	const dtype_t*		type)	/*!< in: data type */
+{
+	ut_ad(col);
+	ut_ad(type);
+
+	ut_ad(col->mtype == type->mtype);
+	ut_ad(col->prtype == type->prtype);
+	//ut_ad(col->len == type->len);
+# ifndef UNIV_HOTBACKUP
+	ut_ad(col->mbminmaxlen == type->mbminmaxlen);
+# endif /* !UNIV_HOTBACKUP */
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+{
+	return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+				      col->mbminmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+{
+	return(dtype_get_max_size_low(col->mtype, col->len));
+}
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+	const dict_col_t*	col,	/*!< in: column */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+	return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+					col->mbminmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dict_col_get_fixed_size(col, comp)); +} + +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +ulint +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ +{ + ut_ad(col); + + return(col->ind); +} + +/*********************************************************************//** +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ +{ + ulint i; + + ut_ad(col); + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + + for (i = 0; i < clust_index->n_def; i++) { + const dict_field_t* field = &clust_index->fields[i]; + + if (!field->prefix_len && field->col == col) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the first index on the table (the clustered index). +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes)); +} + +/********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table)) + ->indexes)); +} + +/********************************************************************//** +Gets the next index on the table. +@return index, NULL if none left */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index)); +} +#endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Check whether the index is the clustered index. +@return nonzero for clustered index, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_clust( +/*================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->type & DICT_CLUSTERED); +} +/********************************************************************//** +Check whether the index is unique. +@return nonzero for unique index, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_unique( +/*=================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->type & DICT_UNIQUE); +} + +/********************************************************************//** +Check whether the index is the insert buffer tree. 
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(index->type & DICT_IBUF);
+}
+
+/********************************************************************//**
+Check whether the index is a universal index tree.
+@return nonzero for universal tree, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_univ(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(index->type & DICT_UNIVERSAL);
+}
+
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ulint	type;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	type = index->type;
+
+	return(!(type & DICT_CLUSTERED) || (type & DICT_IBUF));
+}
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return(table->n_cols - DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+	const dict_table_t*	table __attribute__((unused)))	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(table->cached);
+
+	return(DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return(table->n_cols);
+}
+
+/********************************************************************//**
+Gets the approximate (estimated) number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table->stat_initialized);
+
+	return(table->stat_n_rows);
+}
+
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	if (table->stat_initialized) {
+		ib_uint64_t	n_rows = table->stat_n_rows;
+		if (n_rows < 0xFFFFFFFFFFFFFFFFULL) {
+			table->stat_n_rows = n_rows + 1;
+		}
+	}
+}
+
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ +{ + if (table->stat_initialized) { + ib_uint64_t n_rows = table->stat_n_rows; + if (n_rows > 0) { + table->stat_n_rows = n_rows - 1; + } + } +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint pos) /*!< in: position of column */ +{ + ut_ad(table); + ut_ad(pos < table->n_def); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return((dict_col_t*) (table->cols) + pos); +} + +/********************************************************************//** +Gets the given system column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ +{ + dict_col_t* col; + + ut_ad(table); + ut_ad(sys < DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + col = dict_table_get_nth_col(table, table->n_cols + - DATA_N_SYS_COLS + sys); + ut_ad(col->mtype == DATA_SYS); + ut_ad(col->prtype == (sys | DATA_NOT_NULL)); + + return(col); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Gets the given system column number of a table. +@return column number */ +UNIV_INLINE +ulint +dict_table_get_sys_col_no( +/*======================*/ + const dict_table_t* table, /*!< in: table */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ +{ + ut_ad(table); + ut_ad(sys < DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->n_cols - DATA_N_SYS_COLS + sys); +} + +/********************************************************************//** +Check whether the table uses the compact page format. +@return TRUE if table uses the compact page format */ +UNIV_INLINE +ibool +dict_table_is_comp( +/*===============*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + +#if DICT_TF_COMPACT != 1 +#error "DICT_TF_COMPACT must be 1" +#endif + + return(table->flags & DICT_TF_COMPACT); +} + +/************************************************************************ +Check if the table has an FTS index. */ +UNIV_INLINE +ibool +dict_table_has_fts_index( +/*=====================*/ + /* out: TRUE if table has an FTS index */ + dict_table_t* table) /* in: table */ +{ + ut_ad(table); + + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)); +} + +/********************************************************************//** +Validate the table flags. +@return true if valid. */ +UNIV_INLINE +bool +dict_tf_is_valid( +/*=============*/ + ulint flags) /*!< in: table flags */ +{ + ulint compact = DICT_TF_GET_COMPACT(flags); + ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); + ulint unused = DICT_TF_GET_UNUSED(flags); + + /* Make sure there are no bits that we do not know about. */ + if (unused != 0) { + + return(false); + + } else if (atomic_blobs) { + /* Barracuda row formats COMPRESSED and DYNAMIC build on + the page structure introduced for the COMPACT row format + by allowing keys in secondary indexes to be made from + data stored off-page in the clustered index. 
*/ + + if (!compact) { + return(false); + } + + } else if (zip_ssize) { + + /* Antelope does not support COMPRESSED row format. */ + return(false); + } + + if (zip_ssize) { + + /* COMPRESSED row format must have compact and atomic_blobs + bits set and validate the number is within allowed range. */ + + if (!compact + || !atomic_blobs + || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + + return(false); + } + } + + /* CREATE TABLE ... DATA DIRECTORY is supported for any row format, + so the DATA_DIR flag is compatible with all other table flags. */ + + return(true); +} + +/********************************************************************//** +Validate a SYS_TABLES TYPE field and return it. +@return Same as input after validating it as a SYS_TABLES TYPE field. +If there is an error, return ULINT_UNDEFINED. */ +UNIV_INLINE +ulint +dict_sys_tables_type_validate( +/*==========================*/ + ulint type, /*!< in: SYS_TABLES.TYPE */ + ulint n_cols) /*!< in: SYS_TABLES.N_COLS */ +{ + ulint low_order_bit = DICT_TF_GET_COMPACT(type); + ulint redundant = !(n_cols & DICT_N_COLS_COMPACT); + ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); + ulint unused = DICT_TF_GET_UNUSED(type); + + /* The low order bit of SYS_TABLES.TYPE is always set to 1. + If the format is UNIV_FORMAT_B or higher, this field is the same + as dict_table_t::flags. Zero is not allowed here. */ + if (!low_order_bit) { + return(ULINT_UNDEFINED); + } + + if (redundant) { + if (zip_ssize || atomic_blobs) { + return(ULINT_UNDEFINED); + } + } + + /* Make sure there are no bits that we do not know about. */ + if (unused) { + return(ULINT_UNDEFINED); + } + + if (atomic_blobs) { + /* Barracuda row formats COMPRESSED and DYNAMIC build on + the page structure introduced for the COMPACT row format + by allowing keys in secondary indexes to be made from + data stored off-page in the clustered index. + + The DICT_N_COLS_COMPACT flag should be in N_COLS, + but we already know that. */ + + } else if (zip_ssize) { + /* Antelope does not support COMPRESSED format. */ + return(ULINT_UNDEFINED); + } + + if (zip_ssize) { + /* COMPRESSED row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + if (!atomic_blobs) { + return(ULINT_UNDEFINED); + } + + /* Validate that the number is within allowed range. */ + if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + return(ULINT_UNDEFINED); + } + } + + /* There is nothing to validate for the data_dir field. + CREATE TABLE ... DATA DIRECTORY is supported for any row + format, so the DATA_DIR flag is compatible with any other + table flags. However, it is not used with TEMPORARY tables.*/ + + /* Return the validated SYS_TABLES.TYPE. */ + return(type); +} + +/********************************************************************//** +Determine the file format from dict_table_t::flags +The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any +other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set. 
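+The mapping implemented below is therefore:
+
+	COMPACT	ATOMIC_BLOBS	ZIP_SSIZE	row format
+	0	any		any		REC_FORMAT_REDUNDANT
+	1	0		0		REC_FORMAT_COMPACT
+	1	1		> 0		REC_FORMAT_COMPRESSED
+	1	1		0		REC_FORMAT_DYNAMIC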
+@return file format version */ +UNIV_INLINE +rec_format_t +dict_tf_get_rec_format( +/*===================*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + ut_a(dict_tf_is_valid(flags)); + + if (!DICT_TF_GET_COMPACT(flags)) { + return(REC_FORMAT_REDUNDANT); + } + + if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) { + return(REC_FORMAT_COMPACT); + } + + if (DICT_TF_GET_ZIP_SSIZE(flags)) { + return(REC_FORMAT_COMPRESSED); + } + + return(REC_FORMAT_DYNAMIC); +} + +/********************************************************************//** +Determine the file format from a dict_table_t::flags. +@return file format version */ +UNIV_INLINE +ulint +dict_tf_get_format( +/*===============*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + if (DICT_TF_HAS_ATOMIC_BLOBS(flags)) { + return(UNIV_FORMAT_B); + } + + return(UNIV_FORMAT_A); +} + +/********************************************************************//** +Determine the file format of a table. +@return file format version */ +UNIV_INLINE +ulint +dict_table_get_format( +/*==================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + + return(dict_tf_get_format(table->flags)); +} + +/********************************************************************//** +Set the file format and zip size in a dict_table_t::flags. If zip size +is not needed, it should be 0. */ +UNIV_INLINE +void +dict_tf_set( +/*========*/ + ulint* flags, /*!< in/out: table flags */ + rec_format_t format, /*!< in: file format */ + ulint zip_ssize, /*!< in: zip shift size */ + bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ +{ + switch (format) { + case REC_FORMAT_REDUNDANT: + *flags = 0; + ut_ad(zip_ssize == 0); + break; + case REC_FORMAT_COMPACT: + *flags = DICT_TF_COMPACT; + ut_ad(zip_ssize == 0); + break; + case REC_FORMAT_COMPRESSED: + *flags = DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (zip_ssize << DICT_TF_POS_ZIP_SSIZE); + break; + case REC_FORMAT_DYNAMIC: + *flags = DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS); + ut_ad(zip_ssize == 0); + break; + } + + if (use_data_dir) { + *flags |= (1 << DICT_TF_POS_DATA_DIR); + } +} + +/********************************************************************//** +Convert a 32 bit integer table flags to the 32 bit integer that is +written into the tablespace header at the offset FSP_SPACE_FLAGS and is +also stored in the fil_space_t::flags field. The following chart shows +the translation of the low order bit. Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC +dict_table_t::flags | 0 | 1 | 1 | 1 +fil_space_t::flags | 0 | 0 | 1 | 1 +================================================================== +@return tablespace flags (fil_space_t::flags) */ +UNIV_INLINE +ulint +dict_tf_to_fsp_flags( +/*=================*/ + ulint table_flags) /*!< in: dict_table_t::flags */ +{ + ulint fsp_flags; + + DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", + return(ULINT_UNDEFINED);); + + /* Adjust bit zero. */ + fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0; + + /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */ + fsp_flags |= table_flags & DICT_TF_MASK_ZIP_SSIZE; + fsp_flags |= table_flags & DICT_TF_MASK_ATOMIC_BLOBS; + + /* In addition, tablespace flags also contain the page size. */ + fsp_flags |= fsp_flags_set_page_size(fsp_flags, UNIV_PAGE_SIZE); + + /* The DATA_DIR flag is in a different position in fsp_flag */ + fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) + ? 
FSP_FLAGS_MASK_DATA_DIR : 0; + + ut_a(fsp_flags_is_valid(fsp_flags)); + + return(fsp_flags); +} + +/********************************************************************//** +Convert a 32 bit integer from SYS_TABLES.TYPE to dict_table_t::flags +The following chart shows the translation of the low order bit. +Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +SYS_TABLES.TYPE | 1 | 1 | 1 +dict_table_t::flags | 0 | 1 | 1 +================================================================== +@return ulint containing SYS_TABLES.TYPE */ +UNIV_INLINE +ulint +dict_sys_tables_type_to_tf( +/*=======================*/ + ulint type, /*!< in: SYS_TABLES.TYPE field */ + ulint n_cols) /*!< in: SYS_TABLES.N_COLS field */ +{ + ulint flags; + ulint redundant = !(n_cols & DICT_N_COLS_COMPACT); + + /* Adjust bit zero. */ + flags = redundant ? 0 : 1; + + /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + flags |= type & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR); + + return(flags); +} + +/********************************************************************//** +Convert a 32 bit integer table flags to the 32bit integer that is written +to a SYS_TABLES.TYPE field. The following chart shows the translation of +the low order bit. Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +dict_table_t::flags | 0 | 1 | 1 +SYS_TABLES.TYPE | 1 | 1 | 1 +================================================================== +@return ulint containing SYS_TABLES.TYPE */ +UNIV_INLINE +ulint +dict_tf_to_sys_tables_type( +/*=======================*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + ulint type; + + ut_a(dict_tf_is_valid(flags)); + + /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ + type = 1; + + /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + type |= flags & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR); + + return(type); +} + +/********************************************************************//** +Extract the compressed page size from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return compressed page size, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_zip_size( +/*=================*/ + ulint flags) /*!< in: flags */ +{ + ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); + ulint zip_size = (zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize + : 0); + + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + + return(zip_size); +} + +/********************************************************************//** +Check whether the table uses the compressed compact page format. +@return compressed page size, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_zip_size( +/*================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + + return(dict_tf_get_zip_size(table->flags)); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Obtain exclusive locks on all index trees of the table. This is to prevent +accessing index trees while InnoDB is updating internal metadata for +operations such as truncate tables. 
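+A hypothetical caller holds dict_sys->mutex and brackets the metadata
+change (illustrative sketch only):
+
+	dict_table_x_lock_indexes(table);
+	... update the table metadata, e.g. for TRUNCATE TABLE ...
+	dict_table_x_unlock_indexes(table);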
*/ +UNIV_INLINE +void +dict_table_x_lock_indexes( +/*======================*/ + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + + ut_a(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Loop through each index of the table and lock them */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_lock(dict_index_get_lock(index)); + } +} + +/*********************************************************************//** +Release the exclusive locks on all index tree. */ +UNIV_INLINE +void +dict_table_x_unlock_indexes( +/*========================*/ + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + + ut_a(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_unlock(dict_index_get_lock(index)); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_fields( +/*====================*/ + const dict_index_t* index) /*!< in: an internal + representation of index (in + the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->n_fields); +} + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_unique( +/*====================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + return(index->n_uniq); +} + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_unique_in_tree( +/*============================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + if (dict_index_is_clust(index)) { + + return(dict_index_get_n_unique(index)); + } + + return(dict_index_get_n_fields(index)); +} + +/********************************************************************//** +Gets the number of user-defined ordering fields in the index. In the internal +representation of clustered indexes we add the row id to the ordering fields +to make a clustered index unique, but this function returns the number of +fields the user defined in the index as ordering fields. 
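+For example, for a table t(a, b) with PRIMARY KEY(a), the clustered index
+internally stores a, DB_TRX_ID, DB_ROLL_PTR and b, so the expected
+relations are (an illustrative sketch, not asserted anywhere in this file):
+
+	dict_index_get_n_ordering_defined_by_user(index) == 1
+	dict_index_get_n_unique(index) == 1
+	dict_index_get_n_fields(index) == 4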
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+{
+	return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of field */
+{
+	ut_ad(index);
+	ut_ad(pos < index->n_def);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			type)	/*!< in: DATA_ROW_ID, ... */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(!dict_index_is_univ(index));
+
+	if (dict_index_is_clust(index)) {
+
+		return(dict_col_get_clust_pos(
+			       dict_table_get_sys_col(index->table, type),
+			       index));
+	}
+
+	return(dict_index_get_nth_col_pos(
+		       index, dict_table_get_sys_col_no(index->table, type)));
+}
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+	const dict_field_t*	field)	/*!< in: index field */
+{
+	ut_ad(field);
+
+	return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of the field */
+{
+	return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of the field */
+{
+	return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n)	/*!< in: column number */
+{
+	return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */ +UNIV_INLINE +ulint +dict_index_get_min_size( +/*====================*/ + const dict_index_t* index) /*!< in: index */ +{ + ulint n = dict_index_get_n_fields(index); + ulint size = 0; + + while (n--) { + size += dict_col_get_min_size(dict_index_get_nth_col(index, + n)); + } + + return(size); +} + +/*********************************************************************//** +Gets the space id of the root of the index tree. +@return space id */ +UNIV_INLINE +ulint +dict_index_get_space( +/*=================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->space); +} + +/*********************************************************************//** +Sets the space id of the root of the index tree. */ +UNIV_INLINE +void +dict_index_set_space( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + ulint space) /*!< in: space id */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->space = space; +} + +/*********************************************************************//** +Gets the page number of the root of the index tree. +@return page number */ +UNIV_INLINE +ulint +dict_index_get_page( +/*================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->page); +} + +/*********************************************************************//** +Gets the read-write lock of the index tree. +@return read-write lock */ +UNIV_INLINE +prio_rw_lock_t* +dict_index_get_lock( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(&(index->lock)); +} + +/********************************************************************//** +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. +@return number of free bytes on page, reserved for updates */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void) +/*==============================*/ +{ + return(UNIV_PAGE_SIZE / 16); +} + +/********************************************************************//** +Gets the status of online index creation. +@return the status */ +UNIV_INLINE +enum online_index_status +dict_index_get_online_status( +/*=========================*/ + const dict_index_t* index) /*!< in: secondary index */ +{ + enum online_index_status status; + + status = (enum online_index_status) index->online_status; + + /* Without the index->lock protection, the online + status can change from ONLINE_INDEX_CREATION to + ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in + row_log_apply() once log application is done. So to make + sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE + you should always do the recheck after acquiring index->lock */ + +#ifdef UNIV_DEBUG + switch (status) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + return(status); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(status); +} + +/********************************************************************//** +Sets the status of online index creation. 
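+
+Added usage sketch (hedged, not from the original source; it assumes the
+caller can take index->lock exclusively at this point):
+
+	rw_lock_x_lock(dict_index_get_lock(index));
+	dict_index_set_online_status(index, ONLINE_INDEX_CREATION);
+	rw_lock_x_unlock(dict_index_get_lock(index));
+
+This matches the rw_lock_own() debug assertion in the function body.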
*/ +UNIV_INLINE +void +dict_index_set_online_status( +/*=========================*/ + dict_index_t* index, /*!< in/out: index */ + enum online_index_status status) /*!< in: status */ +{ + ut_ad(!(index->type & DICT_FTS)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + break; + case ONLINE_INDEX_ABORTED: + ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED); + break; + case ONLINE_INDEX_ABORTED_DROPPED: + ut_error; + } +#endif /* UNIV_DEBUG */ + + index->online_status = status; + ut_ad(dict_index_get_online_status(index) == status); +} + +/********************************************************************//** +Determines if a secondary index is being or has been created online, +or if the table is being rebuilt online, allowing concurrent modifications +to the table. +@retval true if the index is being or has been built online, or +if this is a clustered index and the table is being or has been rebuilt online +@retval false if the index has been created or the table has been +rebuilt completely */ +UNIV_INLINE +bool +dict_index_is_online_ddl( +/*=====================*/ + const dict_index_t* index) /*!< in: index */ +{ +#ifdef UNIV_DEBUG + if (dict_index_is_clust(index)) { + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + return(true); + case ONLINE_INDEX_COMPLETE: + return(false); + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + ut_ad(0); + return(false); + } +#endif /* UNIV_DEBUG */ + + return(UNIV_UNLIKELY(dict_index_get_online_status(index) + != ONLINE_INDEX_COMPLETE)); +} + +/**********************************************************************//** +Check whether a column exists in an FTS index. +@return ULINT_UNDEFINED if no match else the offset within the vector */ +UNIV_INLINE +ulint +dict_table_is_fts_column( +/*=====================*/ + ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */ + ulint col_no) /*!< in: col number to search for */ + +{ + ulint i; + + for (i = 0; i < ib_vector_size(indexes); ++i) { + dict_index_t* index; + + index = (dict_index_t*) ib_vector_getp(indexes, i); + + if (dict_index_contains_col_or_prefix(index, col_no)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Determine bytes of column prefix to be stored in the undo log. Please +note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix +needs to be stored in the undo log. +@return bytes of column prefix to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_field_len_store_undo( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column which index prefix + is based on */ +{ + ulint prefix_len = 0; + + if (dict_table_get_format(table) >= UNIV_FORMAT_B) + { + prefix_len = col->max_prefix + ? col->max_prefix + : DICT_MAX_FIELD_LEN_BY_FORMAT(table); + } + + return(prefix_len); +} + +/********************************************************************//** +Check whether the table is corrupted. 
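+
+Added usage note (an assumption-labelled sketch, not in the original):
+callers typically bail out early, for instance
+
+	if (dict_table_is_corrupted(table)) {
+		return(DB_CORRUPTION);
+	}
+
+where DB_CORRUPTION is assumed here as the usual dberr_t error code.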
+@return nonzero for corrupted table, zero for valid tables */ +UNIV_INLINE +ulint +dict_table_is_corrupted( +/*====================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->corrupted); +} + +/********************************************************************//** +Check whether the index is corrupted. +@return nonzero for corrupted index, zero for valid indexes */ +UNIV_INLINE +ulint +dict_index_is_corrupted( +/*====================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return((index->type & DICT_CORRUPT) + || (index->table && index->table->corrupted)); +} + +/********************************************************************//** +Check if the tablespace for the table has been discarded. +@return true if the tablespace has been discarded. */ +UNIV_INLINE +bool +dict_table_is_discarded( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ +{ + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_DISCARDED)); +} + +/********************************************************************//** +Check if it is a temporary table. +@return true if temporary table flag is set. */ +UNIV_INLINE +bool +dict_table_is_temporary( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ +{ + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)); +} + +/**********************************************************************//** +Get index by first field of the index +@return index which is having first field matches +with the field present in field_index position of table */ +UNIV_INLINE +dict_index_t* +dict_table_get_index_on_first_col( +/*==============================*/ + const dict_table_t* table, /*!< in: table */ + ulint col_index) /*!< in: position of column + in table */ +{ + ut_ad(col_index < table->n_cols); + + dict_col_t* column = dict_table_get_nth_col(table, col_index); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; index = dict_table_get_next_index(index)) { + + if (index->fields[0].col == column) { + return(index); + } + } + ut_error; + return(0); +} + +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/dict0load.h b/storage/xtradb/include/dict0load.h new file mode 100644 index 00000000000..030190b1a8e --- /dev/null +++ b/storage/xtradb/include/dict0load.h @@ -0,0 +1,428 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0load.h +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0load_h +#define dict0load_h + +#include "univ.i" +#include "dict0types.h" +#include "trx0types.h" +#include "ut0byte.h" +#include "mem0mem.h" +#include "btr0types.h" + +/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */ +enum dict_system_id_t { + SYS_TABLES = 0, + SYS_INDEXES, + SYS_COLUMNS, + SYS_FIELDS, + SYS_FOREIGN, + SYS_FOREIGN_COLS, + SYS_TABLESPACES, + SYS_DATAFILES, + + /* This must be last item. Defines the number of system tables. */ + SYS_NUM_SYSTEM_TABLES +}; + +/** Status bit for dict_process_sys_tables_rec_and_mtr_commit() */ +enum dict_table_info_t { + DICT_TABLE_LOAD_FROM_RECORD = 0,/*!< Directly populate a dict_table_t + structure with information from + a SYS_TABLES record */ + DICT_TABLE_LOAD_FROM_CACHE = 1 /*!< Check first whether dict_table_t + is in the cache, if so, return it */ +}; + +/** Check type for dict_check_tablespaces_and_store_max_id() */ +enum dict_check_t { + /** No user tablespaces have been opened + (no crash recovery, no transactions recovered). */ + DICT_CHECK_NONE_LOADED = 0, + /** Some user tablespaces may have been opened + (no crash recovery; recovered table locks for transactions). */ + DICT_CHECK_SOME_LOADED, + /** All user tablespaces have been opened (crash recovery). */ + DICT_CHECK_ALL_LOADED +}; + +/********************************************************************//** +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). + +In a normal startup, we create the tablespace objects for every table in +InnoDB's data dictionary, if the corresponding .ibd file exists. +We also scan the biggest space id, and store it to fil_system. */ +UNIV_INTERN +void +dict_check_tablespaces_and_store_max_id( +/*====================================*/ + dict_check_t dict_check); /*!< in: how to check */ +/********************************************************************//** +Finds the first table name in the given database. +@return own: table name, NULL if does not exist; the caller must free +the memory in the string! */ +UNIV_INTERN +char* +dict_get_first_table_name_in_db( +/*============================*/ + const char* name); /*!< in: database name which ends to '/' */ + +/********************************************************************//** +Loads a table definition from a SYS_TABLES record to dict_table_t. +Does not load any columns or indexes. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_table_low( +/*================*/ + const char* name, /*!< in: table name */ + const rec_t* rec, /*!< in: SYS_TABLES record */ + dict_table_t** table); /*!< out,own: table, or NULL */ +/********************************************************************//** +Loads a table column definition from a SYS_COLUMNS record to +dict_table_t. 
+@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_column_low( +/*=================*/ + dict_table_t* table, /*!< in/out: table, could be NULL + if we just populate a dict_column_t + struct with information from + a SYS_COLUMNS record */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + dict_col_t* column, /*!< out: dict_column_t to fill, + or NULL if table != NULL */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name, /*!< out: column name */ + const rec_t* rec); /*!< in: SYS_COLUMNS record */ +/********************************************************************//** +Loads an index definition from a SYS_INDEXES record to dict_index_t. +If allocate=TRUE, we will create a dict_index_t structure and fill it +accordingly. If allocated=FALSE, the dict_index_t will be supplied by +the caller and filled with information read from the record. @return +error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_index_low( +/*================*/ + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if allocate=TRUE + and "out" when allocate=FALSE */ + const char* table_name, /*!< in: table name */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + ibool allocate, /*!< in: TRUE=allocate *index, + FALSE=fill in a pre-allocated + *index */ + dict_index_t** index); /*!< out,own: index, or NULL */ +/********************************************************************//** +Loads an index field definition from a SYS_FIELDS record to +dict_index_t. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_field_low( +/*================*/ + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + const rec_t* rec); /*!< in: SYS_FIELDS record */ +/********************************************************************//** +Using the table->heap, copy the null-terminated filepath into +table->data_dir_path and put a null byte before the extension. +This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path. +Make this data directory path only if it has not yet been saved. */ +UNIV_INTERN +void +dict_save_data_dir_path( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + char* filepath); /*!< in: filepath of tablespace */ +/*****************************************************************//** +Make sure the data_file_name is saved in dict_table_t if needed. Try to +read it from the file dictionary first, then from SYS_DATAFILES. */ +UNIV_INTERN +void +dict_get_and_save_data_dir_path( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + bool dict_mutex_own); /*!< in: true if dict_sys->mutex + is owned already */ +/********************************************************************//** +Loads a table definition and also all its index definitions, and also +the cluster definition if the table is a member in a cluster. 
Also loads +all foreign key constraints where the foreign key is in the table or where +a foreign key references columns in this table. +@return table, NULL if does not exist; if the table is stored in an +.ibd file, but the file does not exist, then we set the +ibd_file_missing flag TRUE in the table object we return */ +UNIV_INTERN +dict_table_t* +dict_load_table( +/*============*/ + const char* name, /*!< in: table name in the + databasename/tablename format */ + ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */ + dict_err_ignore_t ignore_err); + /*!< in: error to be ignored when loading + table and its indexes' definition */ +/***********************************************************************//** +Loads a table object based on the table id. +@return table; NULL if table does not exist */ +UNIV_INTERN +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err); /*!< in: errors to ignore + when loading the table */ +/********************************************************************//** +This function is called when the database is booted. +Loads system table index definitions except for the clustered index which +is added to the dictionary cache at booting before calling this function. */ +UNIV_INTERN +void +dict_load_sys_table( +/*================*/ + dict_table_t* table); /*!< in: system table */ +/***********************************************************************//** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. Note that we know that the dictionary +cache already contains all constraints where the other relevant table is +already in the dictionary cache. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_load_foreigns( +/*===============*/ + const char* table_name, /*!< in: table name */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + bool check_recursive,/*!< in: Whether to check + recursive load of tables + chained by FK */ + bool check_charsets, /*!< in: whether to check + charset compatibility */ + dict_err_ignore_t ignore_err) /*!< in: error to be ignored */ + __attribute__((nonnull(1), warn_unused_result)); +/********************************************************************//** +Prints to the standard output information on all tables found in the data +dictionary system table. */ +UNIV_INTERN +void +dict_print(void); +/*============*/ + +/********************************************************************//** +This function opens a system table, and return the first record. +@return first record of the system table */ +UNIV_INTERN +const rec_t* +dict_startscan_system( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor to + the record */ + mtr_t* mtr, /*!< in: the mini-transaction */ + dict_system_id_t system_id); /*!< in: which system table to open */ +/********************************************************************//** +This function get the next system table record as we scan the table. +@return the record if found, NULL if end of scan. 
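+
+Added sketch of the intended scan pattern (simplified and hedged; real
+callers, e.g. in i_s.cc, commit and restart the mini-transaction around
+the processing of each record; pcur, mtr and rec are declared as in the
+prototypes above):
+
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+	for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+	     rec != NULL;
+	     rec = dict_getnext_system(&pcur, &mtr)) {
+		... decode rec with one of the dict_process_ functions ...
+	}
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);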
 */
+UNIV_INTERN
+const rec_t*
+dict_getnext_system(
+/*================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor
+ to the record */
+ mtr_t* mtr); /*!< in: the mini-transaction */
+/********************************************************************//**
+This function processes one SYS_TABLES record and populates the
+dict_table_t struct for the table. Extracted out of dict_print() to be
+used by both monitor table output and information schema
+innodb_sys_tables output.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
+ mem_heap_t* heap, /*!< in: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_TABLES record */
+ dict_table_t** table, /*!< out: dict_table_t to fill */
+ dict_table_info_t status, /*!< in: status bit controls
+ options such as whether we shall
+ look for dict_table_t from cache
+ first */
+ mtr_t* mtr); /*!< in/out: mini-transaction,
+ will be committed */
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_INDEXES rec */
+ dict_index_t* index, /*!< out: dict_index_t to be
+ filled */
+ table_id_t* table_id); /*!< out: table id */
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_COLUMNS rec */
+ dict_col_t* column, /*!< out: dict_col_t to be filled */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name); /*!< out: column name */
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FIELDS rec */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ index_id_t* index_id, /*!< out: current index id */
+ index_id_t last_id); /*!< in: previous index id */
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN rec */
+ dict_foreign_t* foreign); /*!< out: dict_foreign_t to be
+ filled */
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */
+ const char** name, /*!< out: foreign key constraint name */
+ const char** for_col_name, /*!< out: referencing column name */
+ const char** ref_col_name, /*!< out: referenced column name
+ in referenced table */
+ ulint* pos); /*!< out: column position */
+/********************************************************************//**
+This function parses a SYS_TABLESPACES record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_tablespaces(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
+ ulint* space, /*!< out: space id */
+ const char** name, /*!< out: tablespace name */
+ ulint* flags); /*!< out: tablespace flags */
+/********************************************************************//**
+This function parses a SYS_DATAFILES record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_datafiles(
+/*=======================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
+ ulint* space, /*!< out: space id */
+ const char** path); /*!< out: datafile path */
+/********************************************************************//**
+Get the filepath for a space id from SYS_DATAFILES. This function provides
+a temporary heap which is used for the table lookup, but not for the path.
+The caller must free the memory for the path returned. This function can
+return NULL if the space ID is not found in SYS_DATAFILES, in which case
+the caller will assume that the ibd file is in the normal datadir.
+@return own: A copy of the first datafile found in SYS_DATAFILES.PATH for
+the given space ID. NULL if space ID is zero or not found. */
+UNIV_INTERN
+char*
+dict_get_first_path(
+/*================*/
+ ulint space, /*!< in: space id */
+ const char* name); /*!< in: tablespace name */
+/********************************************************************//**
+Update the record for space_id in SYS_TABLESPACES to this filepath.
+@return DB_SUCCESS if OK, dberr_t if the update failed */
+UNIV_INTERN
+dberr_t
+dict_update_filepath(
+/*=================*/
+ ulint space_id, /*!< in: space id */
+ const char* filepath); /*!< in: filepath */
+/********************************************************************//**
+Insert records into SYS_TABLESPACES and SYS_DATAFILES.
+@return DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_insert_tablespace_and_filepath(
+/*================================*/
+ ulint space, /*!< in: space id */
+ const char* name, /*!< in: tablespace name */
+ const char* filepath, /*!< in: filepath */
+ ulint fsp_flags); /*!< in: tablespace flags */
+
+#ifndef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0load.ic b/storage/xtradb/include/dict0load.ic
new file mode 100644
index 00000000000..2c0f1ff38a5
--- /dev/null
+++ b/storage/xtradb/include/dict0load.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.ic
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
new file mode 100644
index 00000000000..500bd3dfc18
--- /dev/null
+++ b/storage/xtradb/include/dict0mem.h
@@ -0,0 +1,1234 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0mem.h +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0mem_h +#define dict0mem_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "dict0types.h" +#include "data0type.h" +#include "mem0mem.h" +#include "row0types.h" +#include "rem0types.h" +#include "btr0types.h" +#ifndef UNIV_HOTBACKUP +# include "lock0types.h" +# include "que0types.h" +# include "sync0rw.h" +#endif /* !UNIV_HOTBACKUP */ +#include "ut0mem.h" +#include "ut0lst.h" +#include "ut0rnd.h" +#include "ut0byte.h" +#include "hash0hash.h" +#include "trx0types.h" +#include "fts0fts.h" +#include "os0once.h" +#include <set> +#include <algorithm> +#include <iterator> + +/* Forward declaration. */ +struct ib_rbt_t; + +/** Type flags of an index: OR'ing of the flags is allowed to define a +combination of types */ +/* @{ */ +#define DICT_CLUSTERED 1 /*!< clustered index */ +#define DICT_UNIQUE 2 /*!< unique index */ +#define DICT_UNIVERSAL 4 /*!< index which can contain records from any + other index */ +#define DICT_IBUF 8 /*!< insert buffer tree */ +#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag + in SYS_INDEXES.TYPE */ +#define DICT_FTS 32 /* FTS index; can't be combined with the + other flags */ + +#define DICT_IT_BITS 6 /*!< number of bits used for + SYS_INDEXES.TYPE */ +/* @} */ + +#if 0 /* not implemented, retained for history */ +/** Types for a table object */ +#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */ +#define DICT_TABLE_CLUSTER_MEMBER 2 +#define DICT_TABLE_CLUSTER 3 /* this means that the table is + really a cluster definition */ +#endif + +/* Table and tablespace flags are generally not used for the Antelope file +format except for the low order bit, which is used differently depending on +where the flags are stored. + +==================== Low order flags bit ========================= + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +SYS_TABLES.TYPE | 1 | 1 | 1 +dict_table_t::flags | 0 | 1 | 1 +FSP_SPACE_FLAGS | 0 | 0 | 1 +fil_space_t::flags | 0 | 0 | 1 + +Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1) +and the tablespace flags field was always 0. In the 5.1 plugin, these fields +were repurposed to identify compressed and dynamic row formats. + +The following types and constants describe the flags found in dict_table_t +and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS +are described in fsp0fsp.h. */ + +/* @{ */ +/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */ +#define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */ +/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */ +#define DICT_TF_COMPACT 1 /*!< Compact row format. */ + +/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether +the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */ +#define DICT_N_COLS_COMPACT 0x80000000UL + +#endif /* !UNIV_INNOCHECKSUM */ + +/** Width of the COMPACT flag */ +#define DICT_TF_WIDTH_COMPACT 1 +/** Width of the ZIP_SSIZE flag */ +#define DICT_TF_WIDTH_ZIP_SSIZE 4 +/** Width of the ATOMIC_BLOBS flag. 
The Antelope file formats broke up
+BLOB and TEXT fields, storing the first 768 bytes in the clustered index.
+Barracuda row formats store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS 1
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR 1
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
+ + DICT_TF_WIDTH_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_DATA_DIR)
+
+/** A mask of all the known/used bits in table flags */
+#define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS))
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT 0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE (DICT_TF_POS_COMPACT \
+ + DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \
+ + DICT_TF_WIDTH_DATA_DIR)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT \
+ ((~(~0 << DICT_TF_WIDTH_COMPACT)) \
+ << DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE \
+ ((~(~0 << DICT_TF_WIDTH_ZIP_SSIZE)) \
+ << DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS \
+ ((~(~0 << DICT_TF_WIDTH_ATOMIC_BLOBS)) \
+ << DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR \
+ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \
+ << DICT_TF_POS_DATA_DIR)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags) \
+ ((flags & DICT_TF_MASK_COMPACT) \
+ >> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags) \
+ ((flags & DICT_TF_MASK_ZIP_SSIZE) \
+ >> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+ ((flags & DICT_TF_MASK_ATOMIC_BLOBS) \
+ >> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags) \
+ ((flags & DICT_TF_MASK_DATA_DIR) \
+ >> DICT_TF_POS_DATA_DIR)
+/** Return the contents of the UNUSED bits */
+#define DICT_TF_GET_UNUSED(flags) \
+ (flags >> DICT_TF_POS_UNUSED)
+/* @} */
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** @brief Table Flags set number 2.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags
+for unknown bits in order to protect backward compatibility. */
+/* @{ */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS 7
+#define DICT_TF2_BIT_MASK ~(~0 << DICT_TF2_BITS)
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE.
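+
+Added note (not in the original): code normally reads and writes these
+bits through the helper macros defined just below, e.g.
+
+	if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)) {
+		...
+	}
+
+rather than by touching table->flags2 directly.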
 */
+#define DICT_TF2_TEMPORARY 1
+/** The table has an internally defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID 2
+/** The table has an FTS index */
+#define DICT_TF2_FTS 4
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID 8
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_TABLESPACE 16
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED 32
+
+/** This bit is set if all aux table names (both common tables and
+index tables) of an FTS table are in HEX format. */
+#define DICT_TF2_FTS_AUX_HEX_NAME 64
+/* @} */
+
+#define DICT_TF2_FLAG_SET(table, flag) \
+ (table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag) \
+ (table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag) \
+ (table->flags2 &= ~(flag))
+
+/** Tables can be chained together with foreign key constraints. When we
+first load the parent table, we would load all of its descendants.
+This could result in recursive calls and eventually in an out-of-stack
+error. DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive
+loads; when it is exceeded, the child table will not be loaded. It will be
+loaded when the foreign constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD 20
+
+/** Similarly, when tables are chained together with foreign key
+constraints with an ON DELETE/ON UPDATE CASCADE clause, a delete from the
+parent table could result in recursive cascading calls. This defines the
+maximum number of such cascading deletes/updates allowed. When it is
+exceeded, the delete from the parent table will fail, and the user has to
+drop the excessive foreign key constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL 255
+
+/**********************************************************************//**
+Creates a table memory object.
+@return own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+ const char* name, /*!< in: table name */
+ ulint space, /*!< in: space where the clustered index
+ of the table is placed */
+ ulint n_cols, /*!< in: number of columns */
+ ulint flags, /*!< in: table flags */
+ ulint flags2, /*!< in: table flags2 */
+ bool nonshared);/*!< in: whether the table object is a dummy
+ one that does not need the initialization of
+ locking-related fields. */
+/****************************************************************//**
+Free a table memory object. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision */
+ __attribute__((nonnull(1)));
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
*/ +UNIV_INTERN +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to) /*!< in: new column name */ + __attribute__((nonnull)); +/**********************************************************************//** +This function populates a dict_col_t memory structure with +supplied information. */ +UNIV_INTERN +void +dict_mem_fill_column_struct( +/*========================*/ + dict_col_t* column, /*!< out: column struct to be + filled */ + ulint col_pos, /*!< in: column position */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint col_len); /*!< in: column length */ +/**********************************************************************//** +This function poplulates a dict_index_t index memory structure with +supplied information. */ +UNIV_INLINE +void +dict_mem_fill_index_struct( +/*=======================*/ + dict_index_t* index, /*!< out: index to be filled */ + mem_heap_t* heap, /*!< in: memory heap */ + const char* table_name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + ulint space, /*!< in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /*!< in: number of fields */ +/**********************************************************************//** +Creates an index memory object. +@return own: index object */ +UNIV_INTERN +dict_index_t* +dict_mem_index_create( +/*==================*/ + const char* table_name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + ulint space, /*!< in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /*!< in: number of fields */ +/**********************************************************************//** +Adds a field definition to an index. NOTE: does not take a copy +of the column name if the field is a column. The memory occupied +by the column name may be released only after publishing the index. */ +UNIV_INTERN +void +dict_mem_index_add_field( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + const char* name, /*!< in: column name */ + ulint prefix_len); /*!< in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ +/**********************************************************************//** +Frees an index memory object. */ +UNIV_INTERN +void +dict_mem_index_free( +/*================*/ + dict_index_t* index); /*!< in: index */ +/**********************************************************************//** +Creates and initializes a foreign constraint memory object. +@return own: foreign constraint struct */ +UNIV_INTERN +dict_foreign_t* +dict_mem_foreign_create(void); +/*=========================*/ + +/**********************************************************************//** +Sets the foreign_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup +will point to foreign_table_name. If 2, then another string is +allocated from the heap and set to lower case. 
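+
+Added example (hedged, not in the original): with lower_case_table_names=2
+and a constraint on table "Test/Child", foreign_table_name keeps the
+original spelling while foreign_table_name_lookup becomes "test/child",
+so dictionary lookups stay case-insensitive on case-preserving
+filesystems.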
*/ +UNIV_INTERN +void +dict_mem_foreign_table_name_lookup_set( +/*===================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc); /*!< in: is an alloc needed */ + +/**********************************************************************//** +Sets the referenced_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup +will point to referenced_table_name. If 2, then another string is +allocated from the heap and set to lower case. */ +UNIV_INTERN +void +dict_mem_referenced_table_name_lookup_set( +/*======================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc); /*!< in: is an alloc needed */ + +/** Create a temporary tablename like "#sql-ibtid-inc where + tid = the Table ID + inc = a randomly initialized number that is incremented for each file +The table ID is a 64 bit integer, can use up to 20 digits, and is +initialized at bootstrap. The second number is 32 bits, can use up to 10 +digits, and is initialized at startup to a randomly distributed number. +It is hoped that the combination of these two numbers will provide a +reasonably unique temporary file name. +@param[in] heap A memory heap +@param[in] dbtab Table name in the form database/table name +@param[in] id Table id +@return A unique temporary tablename suitable for InnoDB use */ +UNIV_INTERN +char* +dict_mem_create_temporary_tablename( + mem_heap_t* heap, + const char* dbtab, + table_id_t id); + +/** Initialize dict memory variables */ + +void +dict_mem_init(void); + +/** Data structure for a column in a table */ +struct dict_col_t{ + /*----------------------*/ + /** The following are copied from dtype_t, + so that all bit-fields can be packed tightly. */ + /* @{ */ + unsigned prtype:32; /*!< precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + unsigned mtype:8; /*!< main data type */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /*!< length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + + unsigned mbminmaxlen:5; /*!< minimum and maximum length of a + character, in bytes; + DATA_MBMINMAXLEN(mbminlen,mbmaxlen); + mbminlen=DATA_MBMINLEN(mbminmaxlen); + mbmaxlen=DATA_MBMINLEN(mbminmaxlen) */ + /*----------------------*/ + /* End of definitions copied from dtype_t */ + /* @} */ + + unsigned ind:10; /*!< table column position + (starting from 0) */ + unsigned ord_part:1; /*!< nonzero if this column + appears in the ordering fields + of an index */ + unsigned max_prefix:12; /*!< maximum index prefix length on + this column. Our current max limit is + 3072 for Barracuda table */ +}; + +/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and +is the maximum indexed column length (or indexed prefix length) in +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format, +any fixed-length field that is longer than this will be encoded as +a variable-length field. + +It is set to 3*256, so that one can create a column prefix index on +256 characters of a TEXT or VARCHAR column also in the UTF-8 +charset. In that charset, a character may take at most 3 bytes. 
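+(Worked arithmetic, added for clarity: 3 bytes per character * 256
+characters = 768 bytes, the value of this constant; the largest prefix
+actually usable in an Antelope index is one byte less, 767 bytes,
+see DICT_MAX_FIELD_LEN_BY_FORMAT below.)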
This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_ANTELOPE_MAX_INDEX_COL_LEN REC_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Find out the maximum indexed column length by its table format.
+For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
+Barracuda row formats COMPRESSED and DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
+#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \
+ ((dict_table_get_format(table) < UNIV_FORMAT_B) \
+ ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \
+ : REC_VERSION_56_MAX_INDEX_COL_LEN)
+
+#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \
+ ((DICT_TF_HAS_ATOMIC_BLOBS(flags) < UNIV_FORMAT_B) \
+ ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \
+ : REC_VERSION_56_MAX_INDEX_COL_LEN)
+
+/** Defines the maximum fixed length column size */
+#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Data structure for a field in an index */
+struct dict_field_t{
+ dict_col_t* col; /*!< pointer to the table column */
+ const char* name; /*!< name of the column */
+ unsigned prefix_len:12; /*!< 0 or the length of the column
+ prefix in bytes in a MySQL index of
+ type, e.g., INDEX (textcol(25));
+ must be smaller than
+ DICT_MAX_FIELD_LEN_BY_FORMAT;
+ NOTE that in the UTF-8 charset, MySQL
+ sets this to (mbmaxlen * the prefix len)
+ in UTF-8 chars */
+ unsigned fixed_len:10; /*!< 0 or the fixed length of the
+ column if smaller than
+ DICT_ANTELOPE_MAX_INDEX_COL_LEN */
+};
+
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If the data is not very compressible, then leave
+some extra space ('padding') in the uncompressed page, making it more
+likely that compression of a less than fully packed uncompressed page
+will succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it is left intact.
+If the compression failure rate stays below the desired rate for a fixed
+number of consecutive rounds, then the padding is decreased by a fixed
+value. This is done to prevent overshooting the padding value,
+and to accommodate a possible change in data compressibility. */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN (128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR (128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong zip_pad_max;
+
+/** Data structure to hold information about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic.
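+
+Added sketch of one round of this heuristic (illustrative only; the real
+bookkeeping lives in dict0dict.cc and differs in detail):
+
+	if (failures * 100 > ZIP_PAD_ROUND_LEN * zip_failure_threshold_pct) {
+		pad += ZIP_PAD_INCR;	-- never beyond zip_pad_max% of page
+		n_rounds = 0;
+	} else if (++n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT) {
+		pad -= ZIP_PAD_INCR;	-- the data became more compressible
+		n_rounds = 0;
+	}
+
+Here "failures" counts the failed compressions in the round just ended.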
*/ +struct zip_pad_info_t { + os_fast_mutex_t mutex; /*!< mutex protecting the info */ + ulint pad; /*!< number of bytes used as pad */ + ulint success;/*!< successful compression ops during + current round */ + ulint failure;/*!< failed compression ops during + current round */ + ulint n_rounds;/*!< number of currently successful + rounds */ +}; + +/** Data structure for an index. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_index_create(). */ +struct dict_index_t{ + index_id_t id; /*!< id of the index */ + prio_rw_lock_t* search_latch; /*!< latch protecting the AHI partition + corresponding to this index */ + hash_table_t* search_table; /*!< hash table protected by + search_latch */ + mem_heap_t* heap; /*!< memory heap */ + const char* name; /*!< index name */ + const char* table_name;/*!< table name */ + dict_table_t* table; /*!< back pointer to table */ +#ifndef UNIV_HOTBACKUP + unsigned space:32; + /*!< space where the index tree is placed */ + unsigned page:32;/*!< index tree root page number */ +#endif /* !UNIV_HOTBACKUP */ + unsigned type:DICT_IT_BITS; + /*!< index type (DICT_CLUSTERED, DICT_UNIQUE, + DICT_UNIVERSAL, DICT_IBUF, DICT_CORRUPT) */ +#define MAX_KEY_LENGTH_BITS 12 + unsigned trx_id_offset:MAX_KEY_LENGTH_BITS; + /*!< position of the trx id column + in a clustered index record, if the fields + before it are known to be of a fixed size, + 0 otherwise */ +#if (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH +# error (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH +#endif + unsigned n_user_defined_cols:10; + /*!< number of columns the user defined to + be in the index: in the internal + representation we add more columns */ + unsigned n_uniq:10;/*!< number of fields from the beginning + which are enough to determine an index + entry uniquely */ + unsigned n_def:10;/*!< number of fields defined so far */ + unsigned n_fields:10;/*!< number of fields in the index */ + unsigned n_nullable:10;/*!< number of nullable fields */ + unsigned cached:1;/*!< TRUE if the index object is in the + dictionary cache */ + unsigned to_be_dropped:1; + /*!< TRUE if the index is to be dropped; + protected by dict_operation_lock */ + unsigned online_status:2; + /*!< enum online_index_status. + Transitions from ONLINE_INDEX_COMPLETE (to + ONLINE_INDEX_CREATION) are protected + by dict_operation_lock and + dict_sys->mutex. Other changes are + protected by index->lock. */ + dict_field_t* fields; /*!< array of field descriptions */ +#ifndef UNIV_HOTBACKUP + UT_LIST_NODE_T(dict_index_t) + indexes;/*!< list of indexes of the table */ + btr_search_t* search_info; + /*!< info used in optimistic searches */ + row_log_t* online_log; + /*!< the log of modifications + during online index creation; + valid when online_status is + ONLINE_INDEX_CREATION */ + /*----------------------*/ + /** Statistics for query optimization */ + /* @{ */ + ib_uint64_t* stat_n_diff_key_vals; + /*!< approximate number of different + key values for this index, for each + n-column prefix where 1 <= n <= + dict_get_n_unique(index) (the array is + indexed from 0 to n_uniq-1); we + periodically calculate new + estimates */ + ib_uint64_t* stat_n_sample_sizes; + /*!< number of pages that were sampled + to calculate each of stat_n_diff_key_vals[], + e.g. stat_n_sample_sizes[3] pages were sampled + to get the number stat_n_diff_key_vals[3]. 
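+Added illustration: for an index on columns (c1, c2, c3),
+stat_n_diff_key_vals[0] estimates the number of distinct values of (c1),
+[1] of (c1, c2) and [2] of (c1, c2, c3); the matching element of this
+array records how many leaf pages were sampled for each estimate.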
*/ + ib_uint64_t* stat_n_non_null_key_vals; + /* approximate number of non-null key values + for this index, for each column where + 1 <= n <= dict_get_n_unique(index) (the array + is indexed from 0 to n_uniq-1); This + is used when innodb_stats_method is + "nulls_ignored". */ + ulint stat_index_size; + /*!< approximate index size in + database pages */ + ulint stat_n_leaf_pages; + /*!< approximate number of leaf pages in the + index tree */ + /* @} */ + prio_rw_lock_t lock; /*!< read-write lock protecting the + upper levels of the index tree */ + trx_id_t trx_id; /*!< id of the transaction that created this + index, or 0 if the index existed + when InnoDB was started up */ + zip_pad_info_t zip_pad;/*!< Information about state of + compression failures and successes */ +#endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG + ib_mutex_t blobs_mutex; + /*!< mutex protecting blobs */ + ib_rbt_t* blobs; /*!< map of (page_no,heap_no,field_no) + to first_blob_page_no; protected by + blobs_mutex; @see btr_blob_dbg_t */ +#endif /* UNIV_BLOB_DEBUG */ +#ifdef UNIV_DEBUG + ulint magic_n;/*!< magic number */ +/** Value of dict_index_t::magic_n */ +# define DICT_INDEX_MAGIC_N 76789786 +#endif +}; + +/** The status of online index creation */ +enum online_index_status { + /** the index is complete and ready for access */ + ONLINE_INDEX_COMPLETE = 0, + /** the index is being created, online + (allowing concurrent modifications) */ + ONLINE_INDEX_CREATION, + /** secondary index creation was aborted and the index + should be dropped as soon as index->table->n_ref_count reaches 0, + or online table rebuild was aborted and the clustered index + of the original table should soon be restored to + ONLINE_INDEX_COMPLETE */ + ONLINE_INDEX_ABORTED, + /** the online index creation was aborted, the index was + dropped from the data dictionary and the tablespace, and it + should be dropped from the data dictionary cache as soon as + index->table->n_ref_count reaches 0. */ + ONLINE_INDEX_ABORTED_DROPPED +}; + +/** Data structure for a foreign key constraint; an example: +FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be +initialized to 0, NULL or FALSE in dict_mem_foreign_create(). 
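+
+Added illustration for the example above (not in the original comment):
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D) would be represented with
+
+	n_fields == 2
+	foreign_col_names == { "A", "B" }
+	referenced_table_name == "TABLE2"
+	referenced_col_names == { "C", "D" }
+
+and with foreign_index and referenced_index pointing at the explicitly
+created indexes that start with (A, B) and (C, D).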
*/ +struct dict_foreign_t{ + mem_heap_t* heap; /*!< this object is allocated from + this memory heap */ + char* id; /*!< id of the constraint as a + null-terminated string */ + unsigned n_fields:10; /*!< number of indexes' first fields + for which the foreign key + constraint is defined: we allow the + indexes to contain more fields than + mentioned in the constraint, as long + as the first fields are as mentioned */ + unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE + or DICT_FOREIGN_ON_DELETE_SET_NULL */ + char* foreign_table_name;/*!< foreign table name */ + char* foreign_table_name_lookup; + /*!< foreign table name used for dict lookup */ + dict_table_t* foreign_table; /*!< table where the foreign key is */ + const char** foreign_col_names;/*!< names of the columns in the + foreign key */ + char* referenced_table_name;/*!< referenced table name */ + char* referenced_table_name_lookup; + /*!< referenced table name for dict lookup*/ + dict_table_t* referenced_table;/*!< table where the referenced key + is */ + const char** referenced_col_names;/*!< names of the referenced + columns in the referenced table */ + dict_index_t* foreign_index; /*!< foreign index; we require that + both tables contain explicitly defined + indexes for the constraint: InnoDB + does not generate new indexes + implicitly */ + dict_index_t* referenced_index;/*!< referenced index */ +}; + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_t& foreign); + +struct dict_foreign_print { + + dict_foreign_print(std::ostream& out) + : m_out(out) + {} + + void operator()(const dict_foreign_t* foreign) { + m_out << *foreign; + } +private: + std::ostream& m_out; +}; + +/** Compare two dict_foreign_t objects using their ids. Used in the ordering +of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns +true if the first argument is considered to go before the second in the +strict weak ordering it defines, and false otherwise. */ +struct dict_foreign_compare { + + bool operator()( + const dict_foreign_t* lhs, + const dict_foreign_t* rhs) const + { + return(ut_strcmp(lhs->id, rhs->id) < 0); + } +}; + +/** A function object to find a foreign key with the given index as the +referenced index. Return the foreign key with matching criteria or NULL */ +struct dict_foreign_with_index { + + dict_foreign_with_index(const dict_index_t* index) + : m_index(index) + {} + + bool operator()(const dict_foreign_t* foreign) const + { + return(foreign->referenced_index == m_index); + } + + const dict_index_t* m_index; +}; + +/* A function object to check if the foreign constraint is between different +tables. Returns true if foreign key constraint is between different tables, +false otherwise. */ +struct dict_foreign_different_tables { + + bool operator()(const dict_foreign_t* foreign) const + { + return(foreign->foreign_table != foreign->referenced_table); + } +}; + +/** A function object to check if the foreign key constraint has the same +name as given. If the full name of the foreign key constraint doesn't match, +then, check if removing the database name from the foreign key constraint +matches. Return true if it matches, false otherwise. 
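+Added example: with m_id == "fk_name", a constraint whose id is "fk_name"
+matches directly, and one whose id is "db1/fk_name" matches through the
+fallback comparison against the part after the '/'.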
*/ +struct dict_foreign_matches_id { + + dict_foreign_matches_id(const char* id) + : m_id(id) + {} + + bool operator()(const dict_foreign_t* foreign) const + { + if (0 == innobase_strcasecmp(foreign->id, m_id)) { + return(true); + } + if (const char* pos = strchr(foreign->id, '/')) { + if (0 == innobase_strcasecmp(m_id, pos + 1)) { + return(true); + } + } + return(false); + } + + const char* m_id; +}; + +typedef std::set<dict_foreign_t*, dict_foreign_compare> dict_foreign_set; + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_set& fk_set); + +/** Function object to check if a foreign key object is there +in the given foreign key set or not. It returns true if the +foreign key is not found, false otherwise */ +struct dict_foreign_not_exists { + dict_foreign_not_exists(const dict_foreign_set& obj_) + : m_foreigns(obj_) + {} + + /* Return true if the given foreign key is not found */ + bool operator()(dict_foreign_t* const & foreign) const { + return(m_foreigns.find(foreign) == m_foreigns.end()); + } +private: + const dict_foreign_set& m_foreigns; +}; + +/** Validate the search order in the foreign key set. +@param[in] fk_set the foreign key set to be validated +@return true if search order is fine in the set, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_foreign_set& fk_set); + +/** Validate the search order in the foreign key sets of the table +(foreign_set and referenced_set). +@param[in] table table whose foreign key sets are to be validated +@return true if foreign key sets are fine, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_table_t& table); + +/*********************************************************************//** +Frees a foreign key struct. */ +inline +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign) /*!< in, own: foreign key struct */ +{ + mem_heap_free(foreign->heap); +} + +/** The destructor will free all the foreign key constraints in the set +by calling dict_foreign_free() on each of the foreign key constraints. +This is used to free the allocated memory when a local set goes out +of scope. */ +struct dict_foreign_set_free { + + dict_foreign_set_free(const dict_foreign_set& foreign_set) + : m_foreign_set(foreign_set) + {} + + ~dict_foreign_set_free() + { + std::for_each(m_foreign_set.begin(), + m_foreign_set.end(), + dict_foreign_free); + } + + const dict_foreign_set& m_foreign_set; +}; + +/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that +a foreign key constraint is enforced, therefore RESTRICT just means no flag */ +/* @{ */ +#define DICT_FOREIGN_ON_DELETE_CASCADE 1 /*!< ON DELETE CASCADE */ +#define DICT_FOREIGN_ON_DELETE_SET_NULL 2 /*!< ON DELETE SET NULL */ +#define DICT_FOREIGN_ON_UPDATE_CASCADE 4 /*!< ON UPDATE CASCADE */ +#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8 /*!< ON UPDATE SET NULL */ +#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16 /*!< ON DELETE NO ACTION */ +#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32 /*!< ON UPDATE NO ACTION */ +/* @} */ + +/* This flag is for synchronizing SQL DDL and memcached DML. +If table->memcached_sync_count == DICT_TABLE_IN_DDL, DDL is running on +the table and DML from memcached will be blocked. */ +#define DICT_TABLE_IN_DDL -1 + +/** Data structure for a database table. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_table_create(). 
*/ +struct dict_table_t{ + + + table_id_t id; /*!< id of the table */ + mem_heap_t* heap; /*!< memory heap */ + char* name; /*!< table name */ + const char* dir_path_of_temp_table;/*!< NULL or the directory path + where a TEMPORARY table that was explicitly + created by a user should be placed if + innodb_file_per_table is defined in my.cnf; + in Unix this is usually /tmp/..., in Windows + temp\... */ + char* data_dir_path; /*!< NULL or the directory path + specified by DATA DIRECTORY */ + unsigned space:32; + /*!< space where the clustered index of the + table is placed */ + unsigned flags:DICT_TF_BITS; /*!< DICT_TF_... */ + unsigned flags2:DICT_TF2_BITS; /*!< DICT_TF2_... */ + unsigned ibd_file_missing:1; + /*!< TRUE if this is in a single-table + tablespace and the .ibd file is missing; then + we must return in ha_innodb.cc an error if the + user tries to query such an orphaned table */ + unsigned cached:1;/*!< TRUE if the table object has been added + to the dictionary cache */ + unsigned to_be_dropped:1; + /*!< TRUE if the table is to be dropped, but + not yet actually dropped (could be in the + background drop list); It is turned on at the + beginning of row_drop_table_for_mysql() and + turned off just before we start to update + system tables for the drop. It is protected by + dict_operation_lock */ + unsigned n_def:10;/*!< number of columns defined so far */ + unsigned n_cols:10;/*!< number of columns */ + unsigned can_be_evicted:1; + /*!< TRUE if it's not an InnoDB system table + or a table that has no FK relationships */ + unsigned corrupted:1; + /*!< TRUE if table is corrupted */ + unsigned drop_aborted:1; + /*!< TRUE if some indexes should be dropped + after ONLINE_INDEX_ABORTED + or ONLINE_INDEX_ABORTED_DROPPED */ + dict_col_t* cols; /*!< array of column descriptions */ + const char* col_names; + /*!< Column names packed in a character string + "name1\0name2\0...nameN\0". Until + the string contains n_cols, it will be + allocated from a temporary heap. The final + string will be allocated from table->heap. */ +#ifndef UNIV_HOTBACKUP + hash_node_t name_hash; /*!< hash chain node */ + hash_node_t id_hash; /*!< hash chain node */ + UT_LIST_BASE_NODE_T(dict_index_t) + indexes; /*!< list of indexes of the table */ + + dict_foreign_set foreign_set; + /*!< set of foreign key constraints + in the table; these refer to columns + in other tables */ + + dict_foreign_set referenced_set; + /*!< list of foreign key constraints + which refer to this table */ + + UT_LIST_NODE_T(dict_table_t) + table_LRU; /*!< node of the LRU list of tables */ + unsigned fk_max_recusive_level:8; + /*!< maximum recursive level we support when + loading tables chained together with FK + constraints. If this level is exceeded, we + will stop loading the child table into memory + along with its parent table */ + ulint n_foreign_key_checks_running; + /*!< count of how many foreign key check + operations are currently being performed + on the table: we cannot drop the table while + there are foreign key checks running on + it! 
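+A waiting loop of roughly this shape (a sketch with a hypothetical
+back-off interval; the actual wait is inside row_drop_table_for_mysql(),
+not shown in this header):
+
+	while (table->n_foreign_key_checks_running > 0) {
+		os_thread_sleep(10000);
+	}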
*/ + trx_id_t def_trx_id; + /*!< transaction id that last touched + the table definition, either when + loading the definition or CREATE + TABLE, or ALTER TABLE (prepare, + commit, and rollback phases) */ + trx_id_t query_cache_inv_trx_id; + /*!< transactions whose trx id is + smaller than this number are not + allowed to store to the MySQL query + cache or retrieve from it; when a trx + with undo logs commits, it sets this + to the value of the trx id counter for + the tables it had an IX lock on */ +#ifdef UNIV_DEBUG + /*----------------------*/ + ibool does_not_fit_in_memory; + /*!< this field is used to specify in + simulations tables which are so big + that disk should be accessed: disk + access is simulated by putting the + thread to sleep for a while; NOTE that + this flag is not stored to the data + dictionary on disk, and the database + will forget about value TRUE if it has + to reload the table definition from + disk */ +#endif /* UNIV_DEBUG */ + /*----------------------*/ + unsigned big_rows:1; + /*!< flag: TRUE if the maximum length of + a single row exceeds BIG_ROW_SIZE; + initialized in dict_table_add_to_cache() */ + /** Statistics for query optimization */ + /* @{ */ + + volatile os_once::state_t stats_latch_created; + /*!< Creation state of 'stats_latch'. */ + + rw_lock_t* stats_latch; /*!< this latch protects: + dict_table_t::stat_initialized + dict_table_t::stat_n_rows (*) + dict_table_t::stat_clustered_index_size + dict_table_t::stat_sum_of_other_index_sizes + dict_table_t::stat_modified_counter (*) + dict_table_t::indexes*::stat_n_diff_key_vals[] + dict_table_t::indexes*::stat_index_size + dict_table_t::indexes*::stat_n_leaf_pages + (*) those are not always protected for + performance reasons. NULL for dummy table + objects. */ + unsigned stat_initialized:1; /*!< TRUE if statistics have + been calculated the first time + after database startup or table creation */ +#define DICT_TABLE_IN_USED -1 + lint memcached_sync_count; + /*!< count of how many handles are opened + to this table from memcached; DDL on the + table is NOT allowed until this count + goes to zero. If it is -1, DDL is running + on the table and DML from memcached will + be blocked. */ + ib_time_t stats_last_recalc; + /*!< Timestamp of last recalc of the stats */ + ib_uint32_t stat_persistent; + /*!< The two bits below are set in the + ::stat_persistent member and have the following + meaning: + 1. _ON=0, _OFF=0, no explicit persistent stats + setting for this table, the value of the global + srv_stats_persistent is used to determine + whether the table has persistent stats enabled + or not + 2. _ON=0, _OFF=1, persistent stats are + explicitly disabled for this table, regardless + of the value of the global srv_stats_persistent + 3. _ON=1, _OFF=0, persistent stats are + explicitly enabled for this table, regardless + of the value of the global srv_stats_persistent + 4. _ON=1, _OFF=1, not allowed, we assert if + this ever happens. */ +#define DICT_STATS_PERSISTENT_ON (1 << 1) +#define DICT_STATS_PERSISTENT_OFF (1 << 2) + ib_uint32_t stats_auto_recalc; + /*!< The two bits below are set in the + ::stats_auto_recalc member and have + the following meaning: + 1. _ON=0, _OFF=0, no explicit auto recalc + setting for this table, the value of the global + srv_stats_persistent_auto_recalc is used to + determine whether the table has auto recalc + enabled or not + 2. _ON=0, _OFF=1, auto recalc is explicitly + disabled for this table, regardless of the + value of the global + srv_stats_persistent_auto_recalc + 3. 
_ON=1, _OFF=0, auto recalc is explicitly + enabled for this table, regardless of the + value of the global + srv_stats_persistent_auto_recalc + 4. _ON=1, _OFF=1, not allowed, we assert if + this ever happens. */ +#define DICT_STATS_AUTO_RECALC_ON (1 << 1) +#define DICT_STATS_AUTO_RECALC_OFF (1 << 2) + ulint stats_sample_pages; + /*!< the number of pages to sample for this + table during persistent stats estimation; + if this is 0, then the value of the global + srv_stats_persistent_sample_pages will be + used instead. */ + ib_uint64_t stat_n_rows; + /*!< approximate number of rows in the table; + we periodically calculate new estimates */ + ulint stat_clustered_index_size; + /*!< approximate clustered index size in + database pages */ + ulint stat_sum_of_other_index_sizes; + /*!< other indexes in database pages */ + ib_uint64_t stat_modified_counter; + /*!< when a row is inserted, updated, + or deleted, + we add 1 to this number; we calculate new + estimates for the stat_... values for the + table and the indexes when about 1 / 16 of + table has been modified; + also when the estimate operation is + called for MySQL SHOW TABLE STATUS; the + counter is reset to zero at statistics + calculation; this counter is not protected by + any latch, because this is only used for + heuristics */ +#define BG_STAT_NONE 0 +#define BG_STAT_IN_PROGRESS (1 << 0) + /*!< BG_STAT_IN_PROGRESS is set in + stats_bg_flag when the background + stats code is working on this table. The DROP + TABLE code waits for this to be cleared + before proceeding. */ +#define BG_STAT_SHOULD_QUIT (1 << 1) + /*!< BG_STAT_SHOULD_QUIT is set in + stats_bg_flag when DROP TABLE starts + waiting on BG_STAT_IN_PROGRESS to be cleared, + the background stats thread will detect this + and will eventually quit sooner */ + byte stats_bg_flag; + /*!< see BG_STAT_* above. + Writes are covered by dict_sys->mutex. + Dirty reads are possible. */ + /* @} */ + /*----------------------*/ + /**!< The following fields are used by the + AUTOINC code. The actual collection of + tables locked during AUTOINC read/write is + kept in trx_t. In order to quickly determine + whether a transaction has locked the AUTOINC + lock we keep a pointer to the transaction + here in the autoinc_trx variable. This is to + avoid acquiring the lock_sys_t::mutex and + scanning the vector in trx_t. + + When an AUTOINC lock has to wait, the + corresponding lock instance is created on + the trx lock heap rather than use the + pre-allocated instance in autoinc_lock below.*/ + /* @{ */ + lock_t* autoinc_lock; + /*!< a buffer for an AUTOINC lock + for this table: we allocate the memory here + so that individual transactions can get it + and release it without a need to allocate + space from the lock heap of the trx: + otherwise the lock heap would grow rapidly + if we do a large insert from a select. NULL + for dummy table objects. */ + ib_mutex_t autoinc_mutex; + /*!< mutex protecting the autoincrement + counter. Not initialized for dummy table + objects */ + ib_uint64_t autoinc;/*!< autoinc counter value to give to the + next inserted row */ + ulong n_waiting_or_granted_auto_inc_locks; + /*!< This counter is used to track the number + of granted and pending autoinc locks on this + table. This value is set after acquiring the + lock_sys_t::mutex but we peek the contents to + determine whether other transactions have + acquired the AUTOINC lock or not. Of course + only one transaction can be granted the + lock but there can be multiple waiters. 
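+The fast path described above amounts to a check of the form (a
+sketch, assuming the caller tolerates the dirty read explained above):
+
+	if (table->autoinc_trx == trx) {
+		... trx already holds the table's AUTOINC lock ...
+	}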
*/ + const trx_t* autoinc_trx; + /*!< The transaction that currently holds + the AUTOINC lock on this table. + Protected by lock_sys->mutex. */ + fts_t* fts; /* FTS specific state variables */ + /* @} */ + /*----------------------*/ + + ib_quiesce_t quiesce;/*!< Quiescing states, protected by the + dict_index_t::lock. i.e. we can only change + the state if we acquire all the latches + (dict_index_t::lock) in X mode of this table's + indexes. */ + + /*----------------------*/ + ulint n_rec_locks; + /*!< Count of the number of record locks on + this table. We use this to determine whether + we can evict the table from the dictionary + cache. It is protected by lock_sys->mutex. */ + ulint n_ref_count; + /*!< count of how many handles are opened + to this table; dropping of the table is + NOT allowed until this count gets to zero; + MySQL does NOT itself check the number of + open handles at drop */ + UT_LIST_BASE_NODE_T(lock_t) + locks; /*!< list of locks on the table; protected + by lock_sys->mutex */ + ibool is_corrupt; +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG + ulint magic_n;/*!< magic number */ +/** Value of dict_table_t::magic_n */ +# define DICT_TABLE_MAGIC_N 76333786 +#endif /* UNIV_DEBUG */ +}; + +/** A function object to add the foreign key constraint to the referenced set +of the referenced table, if it exists in the dictionary cache. */ +struct dict_foreign_add_to_referenced_table { + void operator()(dict_foreign_t* foreign) const + { + if (dict_table_t* table = foreign->referenced_table) { + std::pair<dict_foreign_set::iterator, bool> ret + = table->referenced_set.insert(foreign); + ut_a(ret.second); + } + } +}; + +#ifndef UNIV_NONINL +#include "dict0mem.ic" +#endif + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/xtradb/include/dict0mem.ic b/storage/xtradb/include/dict0mem.ic new file mode 100644 index 00000000000..38d51f61789 --- /dev/null +++ b/storage/xtradb/include/dict0mem.ic @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0mem.ic +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "data0type.h" +#include "dict0mem.h" +#include "fil0fil.h" + +/**********************************************************************//** +This function populates a dict_index_t index memory structure with +supplied information. 
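+A hypothetical call (illustrative names and flags only) might be:
+
+	dict_mem_fill_index_struct(index, heap, "test/t1", "PRIMARY",
+				   0, DICT_CLUSTERED | DICT_UNIQUE, 2);
+
+Passing heap == NULL makes the function borrow index_name and leave
+index->fields unallocated, as the body below shows.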
*/ +UNIV_INLINE +void +dict_mem_fill_index_struct( +/*=======================*/ + dict_index_t* index, /*!< out: index to be filled */ + mem_heap_t* heap, /*!< in: memory heap */ + const char* table_name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + ulint space, /*!< in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields) /*!< in: number of fields */ +{ + + if (heap) { + index->heap = heap; + index->name = mem_heap_strdup(heap, index_name); + index->fields = (dict_field_t*) mem_heap_alloc( + heap, 1 + n_fields * sizeof(dict_field_t)); + } else { + index->name = index_name; + index->heap = NULL; + index->fields = NULL; + } + + /* Assign a ulint to a 4-bit-mapped field. + Only the low-order 4 bits are assigned. */ + index->type = type; +#ifndef UNIV_HOTBACKUP + index->space = (unsigned int) space; + index->page = FIL_NULL; +#endif /* !UNIV_HOTBACKUP */ + index->table_name = table_name; + index->n_fields = (unsigned int) n_fields; + /* The '1 +' above prevents allocation + of an empty mem block */ +#ifdef UNIV_DEBUG + index->magic_n = DICT_INDEX_MAGIC_N; +#endif /* UNIV_DEBUG */ +} diff --git a/storage/xtradb/include/dict0priv.h b/storage/xtradb/include/dict0priv.h new file mode 100644 index 00000000000..9a3c8e22992 --- /dev/null +++ b/storage/xtradb/include/dict0priv.h @@ -0,0 +1,63 @@ +/***************************************************************************** + +Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0priv.h +Data dictionary private functions + +Created Fri 2 Jul 2010 13:30:38 EST - Sunny Bains +*******************************************************/ + +#ifndef dict0priv_h +#define dict0priv_h + +/**********************************************************************//** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. Note: Not to be called from outside dict0*c functions. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_get_low( +/*===============*/ + const char* table_name); /*!< in: table name */ + +/**********************************************************************//** +Checks if a table is in the dictionary cache. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_check_if_in_cache_low( +/*=============================*/ + const char* table_name); /*!< in: table name */ + +/**********************************************************************//** +Returns a table object based on table id. 
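+The caller is expected to hold dict_sys->mutex, e.g. (a sketch; the
+definition in dict0priv.ic asserts mutex ownership):
+
+	mutex_enter(&dict_sys->mutex);
+	dict_table_t*	table = dict_table_open_on_id_low(
+		table_id, DICT_ERR_IGNORE_NONE);
+	mutex_exit(&dict_sys->mutex);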
+@return table, NULL if does not exist */ +UNIV_INLINE +dict_table_t* +dict_table_open_on_id_low( +/*=====================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err); /*!< in: errors to ignore + when loading the table */ + +#ifndef UNIV_NONINL +#include "dict0priv.ic" +#endif + +#endif /* dict0priv.h */ diff --git a/storage/xtradb/include/dict0priv.ic b/storage/xtradb/include/dict0priv.ic new file mode 100644 index 00000000000..30ba8fb60aa --- /dev/null +++ b/storage/xtradb/include/dict0priv.ic @@ -0,0 +1,125 @@ +/***************************************************************************** + +Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0priv.ic +Data dictionary system private include file + +Created Wed 13 Oct 2010 16:10:14 EST Sunny Bains +***********************************************************************/ + +#include "dict0dict.h" +#include "dict0load.h" +#include "dict0priv.h" +#ifndef UNIV_HOTBACKUP + +/**********************************************************************//** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_get_low( +/*===============*/ + const char* table_name) /*!< in: table name */ +{ + dict_table_t* table; + + ut_ad(table_name); + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = dict_table_check_if_in_cache_low(table_name); + + if (table && table->corrupted) { + fprintf(stderr, "InnoDB: table"); + ut_print_name(stderr, NULL, TRUE, table->name); + if (srv_load_corrupted) { + fputs(" is corrupted, but" + " innodb_force_load_corrupted is set\n", stderr); + } else { + fputs(" is corrupted\n", stderr); + return(NULL); + } + } + + if (table == NULL) { + table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE); + } + + ut_ad(!table || table->cached); + + return(table); +} + +/**********************************************************************//** +Returns a table object based on table id. 
+@return table, NULL if does not exist */ +UNIV_INLINE +dict_table_t* +dict_table_open_on_id_low( +/*======================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + dict_table_t* table; + ulint fold; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Look for the table name in the hash table */ + fold = ut_fold_ull(table_id); + + HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, + dict_table_t*, table, ut_ad(table->cached), + table->id == table_id); + if (table == NULL) { + table = dict_load_table_on_id(table_id, ignore_err); + } + + ut_ad(!table || table->cached); + + /* TODO: should get the type information from MySQL */ + + return(table); +} + +/**********************************************************************//** +Checks if a table is in the dictionary cache. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_check_if_in_cache_low( +/*=============================*/ + const char* table_name) /*!< in: table name */ +{ + dict_table_t* table; + ulint table_fold; + + ut_ad(table_name); + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Look for the table name in the hash table */ + table_fold = ut_fold_string(table_name); + + HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, + dict_table_t*, table, ut_ad(table->cached), + !strcmp(table->name, table_name)); + return(table); +} +#endif /*! UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/dict0stats.h b/storage/xtradb/include/dict0stats.h new file mode 100644 index 00000000000..186f90e3694 --- /dev/null +++ b/storage/xtradb/include/dict0stats.h @@ -0,0 +1,202 @@ +/***************************************************************************** + +Copyright (c) 2009, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.h +Code used for calculating and manipulating table statistics. + +Created Jan 06, 2010 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_h +#define dict0stats_h + +#include "univ.i" + +#include "db0err.h" +#include "dict0types.h" +#include "trx0types.h" + +enum dict_stats_upd_option_t { + DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the + statistics using a precise and slow + algo and save them to the persistent + storage, if the persistent storage is + not present then emit a warning and + fall back to transient stats */ + DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics + using an imprecise quick algo + without saving the results + persistently */ + DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense) + into a table and its indexes' statistics + members. The resulting stats correspond to an + empty table. 
If the table is using persistent + statistics, then they are saved on disk. */ + DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats + from the persistent storage if the in-memory + structures have not been initialized yet, + otherwise do nothing */ +}; + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively quick and is used to calculate transient statistics that +are not saved on disk. +This was the only way to calculate statistics before the +Persistent Statistics feature was introduced. */ +UNIV_INTERN +void +dict_stats_update_transient( +/*========================*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ + __attribute__((nonnull)); + +/*********************************************************************//** +Check whether persistent statistics is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_is_persistent_enabled( +/*=============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off); /*!< in: explicitly disabled */ + +/*********************************************************************//** +Check whether auto recalc is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_auto_recalc_is_enabled( +/*==============================*/ + const dict_table_t* table); /*!< in: table */ + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. 
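+A typical call, sketched with one of the options declared above:
+
+	dberr_t	err = dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT);
+	if (err != DB_SUCCESS) {
+		... report the failure to the caller ...
+	}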
+@return DB_* error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_stats_update( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + dict_stats_upd_option_t stats_upd_option); + /*!< in: whether to (re) calc + the stats or to fetch them from + the persistent storage */ + +/*********************************************************************//** +Removes the information for a particular index's stats from the persistent +storage if it exists and if there is data stored for this index. +This function creates its own trx and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_drop_index( +/*==================*/ + const char* tname, /*!< in: table name */ + const char* iname, /*!< in: index name */ + char* errstr, /*!< out: error message if != DB_SUCCESS + is returned */ + ulint errstr_sz);/*!< in: size of the errstr buffer */ + +/*********************************************************************//** +Removes the statistics for a table and all of its indexes from the +persistent storage if it exists and if there is data stored for the table. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_drop_table( +/*==================*/ + const char* table_name, /*!< in: table name */ + char* errstr, /*!< out: error message + if != DB_SUCCESS is returned */ + ulint errstr_sz); /*!< in: size of errstr buffer */ + +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. */ +UNIV_INTERN +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ + __attribute__((nonnull)); + +/*********************************************************************//** +Renames a table in InnoDB persistent stats storage. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_rename_table( +/*====================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + char* errstr, /*!< out: error string if != DB_SUCCESS + is returned */ + size_t errstr_sz); /*!< in: errstr size */ + +#ifndef UNIV_NONINL +#include "dict0stats.ic" +#endif + +#endif /* dict0stats_h */ diff --git a/storage/xtradb/include/dict0stats.ic b/storage/xtradb/include/dict0stats.ic new file mode 100644 index 00000000000..ec9a9065470 --- /dev/null +++ b/storage/xtradb/include/dict0stats.ic @@ -0,0 +1,236 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.ic +Code used for calculating and manipulating table statistics. + +Created Jan 23, 2012 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "dict0dict.h" /* dict_table_stats_lock() */ +#include "dict0types.h" /* dict_table_t */ +#include "srv0srv.h" /* srv_stats_persistent, srv_stats_auto_recalc */ + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ +{ + /* Not allowed to have both flags set, but a CREATE or ALTER + statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would + end up having both set. In this case we clear the OFF flag. */ + if (ps_on && ps_off) { + ps_off = FALSE; + } + + ib_uint32_t stat_persistent = 0; + + if (ps_on) { + stat_persistent |= DICT_STATS_PERSISTENT_ON; + } + + if (ps_off) { + stat_persistent |= DICT_STATS_PERSISTENT_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stat_persistent = stat_persistent; +} + +/*********************************************************************//** +Check whether persistent statistics is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_is_persistent_enabled( +/*=============================*/ + const dict_table_t* table) /*!< in: table */ +{ + /* Because of the nature of this check (non-locking) it is possible + that a table becomes: + * PS-disabled immediately after this function has returned TRUE or + * PS-enabled immediately after this function has returned FALSE. + This means that it is possible that we do: + + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has + just been PS-disabled or + + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has + just been PS-enabled. + This is acceptable. Avoiding this would mean that we would have to + protect the ::stat_persistent with dict_table_stats_lock() like the + other ::stat_ members which would be too big a performance penalty, + especially when this function is called from + row_update_statistics_if_needed(). */ + + /* we rely on this read to be atomic */ + ib_uint32_t stat_persistent = table->stat_persistent; + + if (stat_persistent & DICT_STATS_PERSISTENT_ON) { + ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); + return(TRUE); + } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { + return(FALSE); + } else { + return(srv_stats_persistent); + } +} + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. 
It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off) /*!< in: explicitly disabled */ +{ + ut_ad(!auto_recalc_on || !auto_recalc_off); + + ib_uint32_t stats_auto_recalc = 0; + + if (auto_recalc_on) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; + } + + if (auto_recalc_off) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stats_auto_recalc = stats_auto_recalc; +} + +/*********************************************************************//** +Check whether auto recalc is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_auto_recalc_is_enabled( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + /* we rely on this read to be atomic */ + ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; + + if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { + ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); + return(TRUE); + } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { + return(FALSE); + } else { + return(srv_stats_auto_recalc); + } +} + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!mutex_own(&dict_sys->mutex)); + + if (table->stat_initialized) { + return; + } + + dict_stats_upd_option_t opt; + + if (dict_stats_is_persistent_enabled(table)) { + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + dict_stats_update(table, opt); +} + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. 
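+A sketch of the expected calling context (the body below asserts that
+dict_sys->mutex is held and that the last handle has been closed):
+
+	mutex_enter(&dict_sys->mutex);
+	ut_a(table->n_ref_count == 0);
+	dict_stats_deinit(table);
+	mutex_exit(&dict_sys->mutex);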
*/ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); + + ut_a(table->n_ref_count == 0); + + dict_table_stats_lock(table, RW_X_LATCH); + + if (!table->stat_initialized) { + dict_table_stats_unlock(table, RW_X_LATCH); + return; + } + + table->stat_initialized = FALSE; + +#ifdef UNIV_DEBUG_VALGRIND + UNIV_MEM_INVALID(&table->stat_n_rows, + sizeof(table->stat_n_rows)); + UNIV_MEM_INVALID(&table->stat_clustered_index_size, + sizeof(table->stat_clustered_index_size)); + UNIV_MEM_INVALID(&table->stat_sum_of_other_index_sizes, + sizeof(table->stat_sum_of_other_index_sizes)); + UNIV_MEM_INVALID(&table->stat_modified_counter, + sizeof(table->stat_modified_counter)); + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + ulint n_uniq = dict_index_get_n_unique(index); + + UNIV_MEM_INVALID( + index->stat_n_diff_key_vals, + n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + UNIV_MEM_INVALID( + index->stat_n_sample_sizes, + n_uniq * sizeof(index->stat_n_sample_sizes[0])); + UNIV_MEM_INVALID( + index->stat_n_non_null_key_vals, + n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + UNIV_MEM_INVALID( + &index->stat_index_size, + sizeof(index->stat_index_size)); + UNIV_MEM_INVALID( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + dict_table_stats_unlock(table, RW_X_LATCH); +} diff --git a/storage/xtradb/include/dict0stats_bg.h b/storage/xtradb/include/dict0stats_bg.h new file mode 100644 index 00000000000..e866ab419fe --- /dev/null +++ b/storage/xtradb/include/dict0stats_bg.h @@ -0,0 +1,127 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.h +Code used for background table and index stats gathering. + +Created Apr 26, 2012 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_bg_h +#define dict0stats_bg_h + +#include "univ.i" + +#include "dict0types.h" /* dict_table_t, table_id_t */ +#include "os0sync.h" /* os_event_t */ +#include "os0thread.h" /* DECLARE_THREAD */ + +/** Event to wake up the stats thread */ +extern os_event_t dict_stats_event; + +/*****************************************************************//** +Add a table to the recalc pool, which is processed by the +background stats gathering thread. Only the table id is added to the +list, so the table can be closed after being enqueued and it will be +opened when needed. 
If the table does not exist later (has been DROPped), +then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_recalc_pool_add( +/*=======================*/ + const dict_table_t* table); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given table from the auto recalc pool. */ +UNIV_INTERN +void +dict_stats_recalc_pool_del( +/*=======================*/ + const dict_table_t* table); /*!< in: table to remove */ + +/** Yield the data dictionary latch when waiting +for the background thread to stop accessing a table. +@param trx transaction holding the data dictionary locks */ +#define DICT_STATS_BG_YIELD(trx) do { \ + row_mysql_unlock_data_dictionary(trx); \ + os_thread_sleep(250000); \ + row_mysql_lock_data_dictionary(trx); \ +} while (0) + +/*****************************************************************//** +Request the background collection of statistics to stop for a table. +@retval true when no background process is active +@retval false when it is not safe to modify the table definition */ +UNIV_INLINE +bool +dict_stats_stop_bg( +/*===============*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((warn_unused_result)); + +/*****************************************************************//** +Wait until background stats thread has stopped using the specified table. +The caller must have locked the data dictionary using +row_mysql_lock_data_dictionary() and this function may unlock it temporarily +and restore the lock before it exits. +The background stats thread is guaranteed not to start using the specified +table after this function returns and before the caller unlocks the data +dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag +under dict_sys->mutex. */ +UNIV_INTERN +void +dict_stats_wait_bg_to_stop_using_table( +/*===================================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx); /*!< in/out: transaction to use for + unlocking/locking the data dict */ +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread(). +Must be called before dict_stats_thread() is started. */ +UNIV_INTERN +void +dict_stats_thread_init(); +/*====================*/ + +/*****************************************************************//** +Free resources allocated by dict_stats_thread_init(), must be called +after dict_stats_thread() has exited. */ +UNIV_INTERN +void +dict_stats_thread_deinit(); +/*======================*/ + +/*****************************************************************//** +This is the thread for background stats gathering. It pops tables from +the auto recalc list and processes them, eventually recalculating their +statistics. 
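+Startup, sketched under the assumption of the usual srv0start sequence
+(the actual call site is not part of this file):
+
+	dict_stats_thread_init();
+	os_thread_create(dict_stats_thread, NULL, NULL);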
+@return this function does not return, it calls os_thread_exit() */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(dict_stats_thread)( +/*==============================*/ + void* arg); /*!< in: a dummy parameter + required by os_thread_create */ + +# ifndef UNIV_NONINL +# include "dict0stats_bg.ic" +# endif + +#endif /* dict0stats_bg_h */ diff --git a/storage/xtradb/include/dict0stats_bg.ic b/storage/xtradb/include/dict0stats_bg.ic new file mode 100644 index 00000000000..87e3225de58 --- /dev/null +++ b/storage/xtradb/include/dict0stats_bg.ic @@ -0,0 +1,45 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.ic +Code used for background table and index stats gathering. + +Created Feb 8, 2013 Marko Makela +*******************************************************/ + +/*****************************************************************//** +Request the background collection of statistics to stop for a table. +@retval true when no background process is active +@retval false when it is not safe to modify the table definition */ +UNIV_INLINE +bool +dict_stats_stop_bg( +/*===============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) { + return(true); + } + + table->stats_bg_flag |= BG_STAT_SHOULD_QUIT; + return(false); +} diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h new file mode 100644 index 00000000000..d34b6f7eab3 --- /dev/null +++ b/storage/xtradb/include/dict0types.h @@ -0,0 +1,91 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0types.h +Data dictionary global types + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0types_h +#define dict0types_h + +struct dict_sys_t; +struct dict_col_t; +struct dict_field_t; +struct dict_index_t; +struct dict_table_t; +struct dict_foreign_t; + +struct ind_node_t; +struct tab_node_t; + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The IDs of the ibuf table and indexes are assigned as the number +DICT_IBUF_ID_MIN plus the space id */ +#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL + +typedef ib_id_t table_id_t; +typedef ib_id_t index_id_t; + +/** Error to ignore when we load table dictionary into memory. However, +the table and index will be marked as "corrupted", and the caller will +be responsible for dealing with the corrupted table or index. +Note: please define the IGNORE_ERR_* as bits, so their value can +be or-ed together */ +enum dict_err_ignore_t { + DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */ + DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root + page is FIL_NULL or incorrect value */ + DICT_ERR_IGNORE_CORRUPT = 2, /*!< skip corrupted indexes */ + DICT_ERR_IGNORE_FK_NOKEY = 4, /*!< ignore error if any foreign + key is missing */ + DICT_ERR_IGNORE_RECOVER_LOCK = 8, + /*!< Used when recovering table locks + for resurrected transactions. + Silently load a missing + tablespace, and do not load + incomplete index definitions. */ + DICT_ERR_IGNORE_ALL = 0xFFFF /*!< ignore all errors */ +}; + +/** Quiescing states for flushing tables to disk. */ +enum ib_quiesce_t { + QUIESCE_NONE, + QUIESCE_START, /*!< Initialise, prepare to start */ + QUIESCE_COMPLETE /*!< All done */ +}; + +/** Prefix for tmp tables, adopted from sql/table.h */ +#define tmp_file_prefix "#sql" +#define tmp_file_prefix_length 4 +#define TEMP_FILE_PREFIX_INNODB "#sql-ib" + +#define TEMP_TABLE_PREFIX "#sql" +#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Flag to control insert buffer debugging. */ +extern uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +#endif diff --git a/storage/xtradb/include/dyn0dyn.h b/storage/xtradb/include/dyn0dyn.h new file mode 100644 index 00000000000..7f23302d1ff --- /dev/null +++ b/storage/xtradb/include/dyn0dyn.h @@ -0,0 +1,199 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0dyn.h +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dyn0dyn_h +#define dyn0dyn_h + +#include "univ.i" +#include "ut0lst.h" +#include "mem0mem.h" + +/** A block in a dynamically allocated array */ +struct dyn_block_t; +/** Dynamically allocated array */ +typedef dyn_block_t dyn_array_t; + +/** This is the initial 'payload' size of a dynamic array; +this must be > MLOG_BUF_MARGIN + 30! */ +#define DYN_ARRAY_DATA_SIZE 512 + +/*********************************************************************//** +Initializes a dynamic array. +@return initialized dyn array */ +UNIV_INLINE +dyn_array_t* +dyn_array_create( +/*=============*/ + dyn_array_t* arr) /*!< in/out memory buffer of + size sizeof(dyn_array_t) */ + __attribute__((nonnull)); +/************************************************************//** +Frees a dynamic array. */ +UNIV_INLINE +void +dyn_array_free( +/*===========*/ + dyn_array_t* arr) /*!< in,own: dyn array */ + __attribute__((nonnull)); +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to a buffer in it. +After copying the elements, the caller must close the buffer using +dyn_array_close. +@return pointer to the buffer */ +UNIV_INLINE +byte* +dyn_array_open( +/*===========*/ + dyn_array_t* arr, /*!< in: dynamic array */ + ulint size) /*!< in: size in bytes of the buffer; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Closes the buffer returned by dyn_array_open. */ +UNIV_INLINE +void +dyn_array_close( +/*============*/ + dyn_array_t* arr, /*!< in: dynamic array */ + const byte* ptr) /*!< in: end of used space */ + __attribute__((nonnull)); +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to +the added element. The caller must copy the element to +the pointer returned. +@return pointer to the element */ +UNIV_INLINE +void* +dyn_array_push( +/*===========*/ + dyn_array_t* arr, /*!< in/out: dynamic array */ + ulint size) /*!< in: size in bytes of the element */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns pointer to an element in dyn array. +@return pointer to element */ +UNIV_INLINE +void* +dyn_array_get_element( +/*==================*/ + const dyn_array_t* arr, /*!< in: dyn array */ + ulint pos) /*!< in: position of element + in bytes from array start */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns the size of stored data in a dyn array. +@return data size in bytes */ +UNIV_INLINE +ulint +dyn_array_get_data_size( +/*====================*/ + const dyn_array_t* arr) /*!< in: dyn array */ + __attribute__((nonnull, warn_unused_result, pure)); +/************************************************************//** +Gets the first block in a dyn array. 
+@param arr dyn array +@return first block */ +#define dyn_array_get_first_block(arr) (arr) +/************************************************************//** +Gets the last block in a dyn array. +@param arr dyn array +@return last block */ +#define dyn_array_get_last_block(arr) \ + ((arr)->heap ? UT_LIST_GET_LAST((arr)->base) : (arr)) +/********************************************************************//** +Gets the next block in a dyn array. +@param arr dyn array +@param block dyn array block +@return pointer to next, NULL if end of list */ +#define dyn_array_get_next_block(arr, block) \ + ((arr)->heap ? UT_LIST_GET_NEXT(list, block) : NULL) +/********************************************************************//** +Gets the previous block in a dyn array. +@param arr dyn array +@param block dyn array block +@return pointer to previous, NULL if end of list */ +#define dyn_array_get_prev_block(arr, block) \ + ((arr)->heap ? UT_LIST_GET_PREV(list, block) : NULL) +/********************************************************************//** +Gets the number of used bytes in a dyn array block. +@return number of bytes used */ +UNIV_INLINE +ulint +dyn_block_get_used( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ + __attribute__((nonnull, warn_unused_result, pure)); +/********************************************************************//** +Gets pointer to the start of data in a dyn array block. +@return pointer to data */ +UNIV_INLINE +byte* +dyn_block_get_data( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ + __attribute__((nonnull, warn_unused_result, pure)); +/********************************************************//** +Pushes n bytes to a dyn array. */ +UNIV_INLINE +void +dyn_push_string( +/*============*/ + dyn_array_t* arr, /*!< in/out: dyn array */ + const byte* str, /*!< in: string to write */ + ulint len) /*!< in: string length */ + __attribute__((nonnull)); + +/*#################################################################*/ + +/** @brief A block in a dynamically allocated array. +NOTE! Do not access the fields of the struct directly: the definition +appears here only for the compiler to know its size! */ +struct dyn_block_t{ + mem_heap_t* heap; /*!< in the first block this is != NULL + if dynamic allocation has been needed */ + ulint used; /*!< number of data bytes used in this block; + DYN_BLOCK_FULL_FLAG is set when the block + becomes full */ + byte data[DYN_ARRAY_DATA_SIZE]; + /*!< storage for array elements */ + UT_LIST_BASE_NODE_T(dyn_block_t) base; + /*!< linear list of dyn blocks: this node is + used only in the first block */ + UT_LIST_NODE_T(dyn_block_t) list; + /*!< linear list node: used in all blocks */ +#ifdef UNIV_DEBUG + ulint buf_end;/*!< only in the debug version: if dyn + array is opened, this is the buffer + end offset, else this is 0 */ + ulint magic_n;/*!< magic number (DYN_BLOCK_MAGIC_N) */ +#endif +}; + + +#ifndef UNIV_NONINL +#include "dyn0dyn.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dyn0dyn.ic b/storage/xtradb/include/dyn0dyn.ic new file mode 100644 index 00000000000..0296554e2ee --- /dev/null +++ b/storage/xtradb/include/dyn0dyn.ic @@ -0,0 +1,306 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0dyn.ic +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +/** Value of dyn_block_t::magic_n */ +#define DYN_BLOCK_MAGIC_N 375767 +/** Flag for dyn_block_t::used that indicates a full block */ +#define DYN_BLOCK_FULL_FLAG 0x1000000UL + +/************************************************************//** +Adds a new block to a dyn array. +@return created block */ +UNIV_INTERN +dyn_block_t* +dyn_array_add_block( +/*================*/ + dyn_array_t* arr) /*!< in/out: dyn array */ + __attribute__((nonnull, warn_unused_result)); + +/********************************************************************//** +Gets the number of used bytes in a dyn array block. +@return number of bytes used */ +UNIV_INLINE +ulint +dyn_block_get_used( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ +{ + ut_ad(block); + + return((block->used) & ~DYN_BLOCK_FULL_FLAG); +} + +/********************************************************************//** +Gets pointer to the start of data in a dyn array block. +@return pointer to data */ +UNIV_INLINE +byte* +dyn_block_get_data( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ +{ + ut_ad(block); + + return(const_cast<byte*>(block->data)); +} + +/*********************************************************************//** +Initializes a dynamic array. +@return initialized dyn array */ +UNIV_INLINE +dyn_array_t* +dyn_array_create( +/*=============*/ + dyn_array_t* arr) /*!< in/out: memory buffer of + size sizeof(dyn_array_t) */ +{ + ut_ad(arr); +#if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG +# error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG" +#endif + + arr->heap = NULL; + arr->used = 0; + + ut_d(arr->buf_end = 0); + ut_d(arr->magic_n = DYN_BLOCK_MAGIC_N); + + return(arr); +} + +/************************************************************//** +Frees a dynamic array. */ +UNIV_INLINE +void +dyn_array_free( +/*===========*/ + dyn_array_t* arr) /*!< in: dyn array */ +{ + if (arr->heap != NULL) { + mem_heap_free(arr->heap); + } + + ut_d(arr->magic_n = 0); +} + +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to the added element. +The caller must copy the element to the pointer returned. 
+@return pointer to the element */ +UNIV_INLINE +void* +dyn_array_push( +/*===========*/ + dyn_array_t* arr, /*!< in/out: dynamic array */ + ulint size) /*!< in: size in bytes of the element */ +{ + dyn_block_t* block; + ulint used; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + ut_ad(size <= DYN_ARRAY_DATA_SIZE); + ut_ad(size); + + block = arr; + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + /* Get the last array block */ + + block = dyn_array_get_last_block(arr); + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + block = dyn_array_add_block(arr); + } + } + + used = block->used; + + block->used = used + size; + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + + return(block->data + used); +} + +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to a buffer in it. +After copying the elements, the caller must close the buffer using +dyn_array_close. +@return pointer to the buffer */ +UNIV_INLINE +byte* +dyn_array_open( +/*===========*/ + dyn_array_t* arr, /*!< in: dynamic array */ + ulint size) /*!< in: size in bytes of the buffer; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +{ + dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + ut_ad(size <= DYN_ARRAY_DATA_SIZE); + ut_ad(size); + + block = arr; + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + /* Get the last array block */ + + block = dyn_array_get_last_block(arr); + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + block = dyn_array_add_block(arr); + ut_a(size <= DYN_ARRAY_DATA_SIZE); + } + } + + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + ut_ad(arr->buf_end == 0); + ut_d(arr->buf_end = block->used + size); + + return(block->data + block->used); +} + +/*********************************************************************//** +Closes the buffer returned by dyn_array_open. */ +UNIV_INLINE +void +dyn_array_close( +/*============*/ + dyn_array_t* arr, /*!< in/out: dynamic array */ + const byte* ptr) /*!< in: end of used space */ +{ + dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + block = dyn_array_get_last_block(arr); + + ut_ad(arr->buf_end + block->data >= ptr); + + block->used = ptr - block->data; + + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + + ut_d(arr->buf_end = 0); +} + +/************************************************************//** +Returns pointer to an element in dyn array. +@return pointer to element */ +UNIV_INLINE +void* +dyn_array_get_element( +/*==================*/ + const dyn_array_t* arr, /*!< in: dyn array */ + ulint pos) /*!< in: position of element + in bytes from array start */ +{ + const dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + /* Get the first array block */ + block = dyn_array_get_first_block(arr); + + if (arr->heap != NULL) { + for (;;) { + ulint used = dyn_block_get_used(block); + + if (pos < used) { + break; + } + + pos -= used; + block = UT_LIST_GET_NEXT(list, block); + ut_ad(block); + } + } + + ut_ad(block); + ut_ad(dyn_block_get_used(block) >= pos); + + return(const_cast<byte*>(block->data) + pos); +} + +/************************************************************//** +Returns the size of stored data in a dyn array. 
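The open/close pair above differs from dyn_array_push() in that the caller may consume less space than it reserved: only the bytes up to the pointer passed to dyn_array_close() are counted as used. A sketch of the pattern under that assumption (arr, value, extra and have_extra are hypothetical; mlog_open()/mlog_close() in mtr0log appear to use this same shape):

	byte*	buf;
	byte*	ptr;

	buf = dyn_array_open(arr, 64);	/* reserve generously; 64 < DYN_ARRAY_DATA_SIZE */
	ptr = buf;

	mach_write_to_4(ptr, value);	/* big-endian 4-byte write (mach0data.h) */
	ptr += 4;

	if (have_extra) {
		mach_write_to_4(ptr, extra);
		ptr += 4;
	}

	dyn_array_close(arr, ptr);	/* only ptr - buf bytes remain used */
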
+@return data size in bytes */ +UNIV_INLINE +ulint +dyn_array_get_data_size( +/*====================*/ + const dyn_array_t* arr) /*!< in: dyn array */ +{ + const dyn_block_t* block; + ulint sum = 0; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + if (arr->heap == NULL) { + + return(arr->used); + } + + /* Get the first array block */ + block = dyn_array_get_first_block(arr); + + while (block != NULL) { + sum += dyn_block_get_used(block); + block = dyn_array_get_next_block(arr, block); + } + + return(sum); +} + +/********************************************************//** +Pushes n bytes to a dyn array. */ +UNIV_INLINE +void +dyn_push_string( +/*============*/ + dyn_array_t* arr, /*!< in/out: dyn array */ + const byte* str, /*!< in: string to write */ + ulint len) /*!< in: string length */ +{ + ulint n_copied; + + while (len > 0) { + if (len > DYN_ARRAY_DATA_SIZE) { + n_copied = DYN_ARRAY_DATA_SIZE; + } else { + n_copied = len; + } + + memcpy(dyn_array_push(arr, n_copied), str, n_copied); + + str += n_copied; + len -= n_copied; + } +} diff --git a/storage/xtradb/include/eval0eval.h b/storage/xtradb/include/eval0eval.h new file mode 100644 index 00000000000..e3b1e6c16b6 --- /dev/null +++ b/storage/xtradb/include/eval0eval.h @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.h +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef eval0eval_h +#define eval0eval_h + +#include "univ.i" +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/*****************************************************************//** +Free the buffer from global dynamic memory for a value of a que_node, +if it has been allocated in the above function. The freeing for pushed +column values is done in sel_col_prefetch_buf_free. */ +UNIV_INTERN +void +eval_node_free_val_buf( +/*===================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node); /*!< in: symbol table node */ +/*****************************************************************//** +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node); /*!< in: expression */ +/*****************************************************************//** +Sets an integer value as the value of an expression node. 
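Looking back at dyn_push_string() in dyn0dyn.ic above: a string longer than DYN_ARRAY_DATA_SIZE is copied in chunks, so a single call may span several blocks. A worked example, assuming a hypothetical, freshly created array arr:

	byte	big[1200];		/* 1200 > DYN_ARRAY_DATA_SIZE (512) */

	memset(big, 0xAB, sizeof(big));
	dyn_push_string(arr, big, sizeof(big));

	/* internally three pushes: 512 + 512 + 176 bytes */
	ut_a(dyn_array_get_data_size(arr) == 1200);
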
*/ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val); /*!< in: value to set */ +/*****************************************************************//** +Gets an integer value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node); /*!< in: expression node */ +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len); /*!< in: string length or UNIV_SQL_NULL */ +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2); /*!< in: node to copy from */ +/*****************************************************************//** +Gets a iboolean value from a query node. +@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a comparison node. +@return the result of the comparison */ +UNIV_INTERN +ibool +eval_cmp( +/*=====*/ + func_node_t* cmp_node); /*!< in: comparison node */ + + +#ifndef UNIV_NONINL +#include "eval0eval.ic" +#endif + +#endif diff --git a/storage/xtradb/include/eval0eval.ic b/storage/xtradb/include/eval0eval.ic new file mode 100644 index 00000000000..e4b1dd08017 --- /dev/null +++ b/storage/xtradb/include/eval0eval.ic @@ -0,0 +1,255 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.ic +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" +#include "rem0cmp.h" +#include "pars0grm.h" + +/*****************************************************************//** +Evaluates a function node. */ +UNIV_INTERN +void +eval_func( +/*======*/ + func_node_t* func_node); /*!< in: function node */ +/*****************************************************************//** +Allocate a buffer from global dynamic memory for a value of a que_node. +NOTE that this memory must be explicitly freed when the query graph is +freed. 
If the node already has allocated buffer, that buffer is freed +here. NOTE that this is the only function where dynamic memory should be +allocated for a query node val field. +@return pointer to allocated buffer */ +UNIV_INTERN +byte* +eval_node_alloc_val_buf( +/*====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size); /*!< in: buffer size */ + + +/*****************************************************************//** +Allocates a new buffer if needed. +@return pointer to buffer */ +UNIV_INLINE +byte* +eval_node_ensure_val_buf( +/*=====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size) /*!< in: buffer size */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + dfield_set_len(dfield, size); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (!data || que_node_get_val_buf_size(node) < size) { + + data = eval_node_alloc_val_buf(node, size); + } + + return(data); +} + +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node) /*!< in: symbol table node */ +{ + + ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + if (sym_node->indirection) { + /* The symbol table node is an alias for a variable or a + column */ + + dfield_copy_data(que_node_get_val(sym_node), + que_node_get_val(sym_node->indirection)); + } +} + +/*****************************************************************//** +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node) /*!< in: expression */ +{ + if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) { + + eval_sym((sym_node_t*) exp_node); + + return; + } + + eval_func(static_cast<func_node_t*>(exp_node)); +} + +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (data == NULL) { + data = eval_node_alloc_val_buf(node, 4); + } + + ut_ad(dfield_get_len(dfield) == 4); + + mach_write_to_4(data, (ulint) val); +} + +/*****************************************************************//** +Gets an integer non-SQL null value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node) /*!< in: expression node */ +{ + const byte* ptr; + dfield_t* dfield; + + dfield = que_node_get_val(node); + ptr = static_cast<byte*>(dfield_get_data(dfield)); + + ut_ad(dfield_get_len(dfield) == 4); + + return((int) mach_read_from_4(ptr)); +} + +/*****************************************************************//** +Gets a iboolean value from a query node. 
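Taken together, eval_node_set_int_val() and eval_node_get_int_val() above round-trip a value through the node's 4-byte big-endian buffer. A sketch, where node is a hypothetical expression node whose value buffer is unset or already 4 bytes long:

	eval_node_set_int_val(node, 42);	/* allocates the 4-byte buffer
						on first use */
	ut_ad(eval_node_get_int_val(node) == 42);

Negative lint values survive the round trip as well, since the (ulint) cast on write and the (int) cast on read are two's-complement inverses for 32-bit values.
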
+@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node) /*!< in: query graph node */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + ut_ad(data != NULL); + + return(mach_read_from_1(data)); +} + +/*****************************************************************//** +Sets a iboolean value as the value of a function node. */ +UNIV_INLINE +void +eval_node_set_ibool_val( +/*====================*/ + func_node_t* func_node, /*!< in: function node */ + ibool val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(func_node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (data == NULL) { + /* Allocate 1 byte to hold the value */ + + data = eval_node_alloc_val_buf(func_node, 1); + } + + ut_ad(dfield_get_len(dfield) == 1); + + mach_write_to_1(data, val); +} + +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len) /*!< in: string length or UNIV_SQL_NULL */ +{ + byte* data; + + if (len == UNIV_SQL_NULL) { + dfield_set_len(que_node_get_val(node), len); + + return; + } + + data = eval_node_ensure_val_buf(node, len); + + ut_memcpy(data, str, len); +} + +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2) /*!< in: node to copy from */ +{ + dfield_t* dfield2; + + dfield2 = que_node_get_val(node2); + + eval_node_copy_and_alloc_val( + node1, + static_cast<byte*>(dfield_get_data(dfield2)), + dfield_get_len(dfield2)); +} diff --git a/storage/xtradb/include/eval0proc.h b/storage/xtradb/include/eval0proc.h new file mode 100644 index 00000000000..7755fb10343 --- /dev/null +++ b/storage/xtradb/include/eval0proc.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.h +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#ifndef eval0proc_h +#define eval0proc_h + +#include "univ.i" +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an if-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +if_step( +/*====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a while-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +while_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a for-loop node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +for_step( +/*=====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an assignment statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +assign_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an exit statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +exit_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a return-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +return_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ + + +#ifndef UNIV_NONINL +#include "eval0proc.ic" +#endif + +#endif diff --git a/storage/xtradb/include/eval0proc.ic b/storage/xtradb/include/eval0proc.ic new file mode 100644 index 00000000000..81418bae2c9 --- /dev/null +++ b/storage/xtradb/include/eval0proc.ic @@ -0,0 +1,88 @@ +/***************************************************************************** + +Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.ic +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + proc_node_t* node; + + ut_ad(thr); + + node = static_cast<proc_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_PROC); + + if (thr->prev_node == que_node_get_parent(node)) { + /* Start execution from the first statement in the statement + list */ + + thr->run_node = node->stat_list; + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + func_node_t* node; + + ut_ad(thr); + + node = static_cast<func_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_FUNC); + + /* Evaluate the procedure */ + + eval_exp(node); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h new file mode 100644 index 00000000000..c5ac9d7de83 --- /dev/null +++ b/storage/xtradb/include/fil0fil.h @@ -0,0 +1,1048 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0fil.h +The low-level file system + +Created 10/25/1995 Heikki Tuuri +*******************************************************/ + +#ifndef fil0fil_h +#define fil0fil_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "dict0types.h" +#include "ut0byte.h" +#include "os0file.h" +#ifndef UNIV_HOTBACKUP +#include "sync0rw.h" +#include "ibuf0types.h" +#include "log0log.h" +#endif /* !UNIV_HOTBACKUP */ +#include "trx0types.h" + +#include <list> + +extern my_bool lower_case_file_system; +// Forward declaration +struct trx_t; +struct fil_space_t; + +typedef std::list<const char*> space_name_list_t; + +/** When mysqld is run, the default directory "." is the mysqld datadir, +but in the MySQL Embedded Server Library and mysqlbackup it is not the default +directory, and we must set the base file path explicitly */ +extern const char* fil_path_to_mysql_datadir; + +/** Initial size of a single-table tablespace in pages */ +#define FIL_IBD_FILE_INITIAL_SIZE 4 + +/** 'null' (undefined) page offset in the context of file spaces */ +#define FIL_NULL ULINT32_UNDEFINED + +/* Space address data type; this is intended to be used when +addresses accurate to a byte are stored in file pages. If the page part +of the address is FIL_NULL, the address is considered undefined. */ + +typedef byte fil_faddr_t; /*!< 'type' definition in C: an address + stored in a file page is a string of bytes */ +#define FIL_ADDR_PAGE 0 /* first in address is the page offset */ +#define FIL_ADDR_BYTE 4 /* then comes 2-byte byte offset within page*/ + +#define FIL_ADDR_SIZE 6 /* address size is 6 bytes */ + +/** File space address */ +struct fil_addr_t{ + ulint page; /*!< page number within a space */ + ulint boffset; /*!< byte offset within the page */ +}; + +/** The null file address */ +extern fil_addr_t fil_addr_null; + +#endif /* !UNIV_INNOCHECKSUM */ + +/** The byte offsets on a file page for various variables @{ */ +#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the + page belongs to (== 0) but in later + versions the 'new' checksum of the + page */ +#define FIL_PAGE_OFFSET 4 /*!< page offset inside space */ +#define FIL_PAGE_PREV 8 /*!< if there is a 'natural' + predecessor of the page, its + offset. Otherwise FIL_NULL. + This field is not set on BLOB + pages, which are stored as a + singly-linked list. See also + FIL_PAGE_NEXT. */ +#define FIL_PAGE_NEXT 12 /*!< if there is a 'natural' successor + of the page, its offset. + Otherwise FIL_NULL. + B-tree index pages + (FIL_PAGE_TYPE contains FIL_PAGE_INDEX) + on the same PAGE_LEVEL are maintained + as a doubly linked list via + FIL_PAGE_PREV and FIL_PAGE_NEXT + in the collation order of the + smallest user record on each page. */ +#define FIL_PAGE_LSN 16 /*!< lsn of the end of the newest + modification log record to the page */ +#define FIL_PAGE_TYPE 24 /*!< file page type: FIL_PAGE_INDEX,..., + 2 bytes. + + The contents of this field can only + be trusted in the following case: + if the page is an uncompressed + B-tree index page, then it is + guaranteed that the value is + FIL_PAGE_INDEX. + The opposite does not hold. 
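These offsets are all that is needed to decode a page header from a raw buffer. A sketch using the mach_read_from_* big-endian accessors from mach0data.h; page is a hypothetical pointer to a page-sized buffer:

	ulint	page_no	= mach_read_from_4(page + FIL_PAGE_OFFSET);
	ulint	prev	= mach_read_from_4(page + FIL_PAGE_PREV);	/* FIL_NULL if none */
	ulint	next	= mach_read_from_4(page + FIL_PAGE_NEXT);
	lsn_t	lsn	= mach_read_from_8(page + FIL_PAGE_LSN);
	ulint	type	= mach_read_from_2(page + FIL_PAGE_TYPE);	/* e.g. FIL_PAGE_INDEX */
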
+ + In tablespaces created by + MySQL/InnoDB 5.1.7 or later, the + contents of this field is valid + for all uncompressed pages. */ +#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the + first page in a system tablespace + data file (ibdata*, not *.ibd): + the file has been flushed to disk + at least up to this lsn */ +#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this + contains the space id of the page */ +#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID + +#define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* @} */ +/** File page trailer @{ */ +#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used + to store the page checksum, the + last 4 bytes should be identical + to the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */ +/* @} */ + +/** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_INDEX 17855 /*!< B-tree node */ +#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ +#define FIL_PAGE_INODE 3 /*!< Index node */ +#define FIL_PAGE_IBUF_FREE_LIST 4 /*!< Insert buffer free list */ +/* File page types introduced in MySQL/InnoDB 5.1.7 */ +#define FIL_PAGE_TYPE_ALLOCATED 0 /*!< Freshly allocated page */ +#define FIL_PAGE_IBUF_BITMAP 5 /*!< Insert buffer bitmap */ +#define FIL_PAGE_TYPE_SYS 6 /*!< System page */ +#define FIL_PAGE_TYPE_TRX_SYS 7 /*!< Transaction system data */ +#define FIL_PAGE_TYPE_FSP_HDR 8 /*!< File space header */ +#define FIL_PAGE_TYPE_XDES 9 /*!< Extent descriptor page */ +#define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */ +#define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */ +#define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */ +#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_ZBLOB2 + /*!< Last page type */ +/* @} */ + +#ifndef UNIV_INNOCHECKSUM + +/** Space types @{ */ +#define FIL_TABLESPACE 501 /*!< tablespace */ +#define FIL_LOG 502 /*!< redo log */ +/* @} */ + +/** The number of fsyncs done to the log */ +extern ulint fil_n_log_flushes; + +/** Number of pending redo log flushes */ +extern ulint fil_n_pending_log_flushes; +/** Number of pending tablespace flushes */ +extern ulint fil_n_pending_tablespace_flushes; + +/** Number of files currently open */ +extern ulint fil_n_file_opened; + +struct fsp_open_info { + ibool success; /*!< Has the tablespace been opened? */ + const char* check_msg; /*!< fil_check_first_page() message */ + ibool valid; /*!< Is the tablespace valid? */ + os_file_t file; /*!< File handle */ + char* filepath; /*!< File path to open */ + lsn_t lsn; /*!< Flushed LSN from header page */ + ulint id; /*!< Space ID */ + ulint flags; /*!< Tablespace flags */ +}; + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Returns the version number of a tablespace, -1 if not found. +@return version number, -1 if the tablespace does not exist in the +memory cache */ +UNIV_INTERN +ib_int64_t +fil_space_get_version( +/*==================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the latch of a file space. +@return latch protecting storage allocation */ +UNIV_INTERN +prio_rw_lock_t* +fil_space_get_latch( +/*================*/ + ulint id, /*!< in: space id */ + ulint* zip_size);/*!< out: compressed page size, or + 0 for uncompressed tablespaces */ +/*******************************************************************//** +Returns the type of a file space. 
+@return FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed.
+@return pointer to the file name, or NULL on error */
+UNIV_INTERN
+char*
+fil_node_create(
+/*============*/
+ const char* name, /*!< in: file name (file must be closed) */
+ ulint size, /*!< in: file size in database blocks, rounded
+ downwards to an integer */
+ ulint id, /*!< in: space id where to append */
+ ibool is_raw) /*!< in: TRUE if a raw device or
+ a raw disk partition */
+ __attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ ulint trunc_len); /*!< in: truncate by this much; it is an error
+ if this does not equal the combined size of
+ some initial files in the space */
+/****************************************************************//**
+Checks whether there is a node with the given name in the file space. */
+UNIV_INTERN
+ibool
+fil_space_contains_node(
+/*====================*/
+ ulint id, /*!< in: space id */
+ char* node_name); /*!< in: node name */
+#endif /* UNIV_LOG_ARCHIVE */
+/*******************************************************************//**
+Creates a space memory object and puts it into the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+ const char* name, /*!< in: space name */
+ ulint id, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or
+ 0 for uncompressed tablespaces */
+ ulint purpose);/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may need
+to recycle ids.
+@return TRUE if assigned, FALSE if not */
+UNIV_INTERN
+ibool
+fil_assign_new_space_id(
+/*====================*/
+ ulint* space_id); /*!< in/out: space id */
+/*******************************************************************//**
+Returns the path from the first fil_node_t found for the space ID sent.
+The caller is responsible for freeing the memory allocated here for the
+value returned.
+@return a copy of fil_node_t::path, NULL if space is zero or not found. */
+UNIV_INTERN
+char*
+fil_space_get_first_path(
+/*=====================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache.
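The create functions above are used in pairs: a space memory object first, then its file node(s). A hedged sketch (the space name, file path, id and size are hypothetical; error handling is elided):

	ulint		space_id = 1000;
	const char*	node_name;

	if (fil_space_create("test/t1", space_id, 0 /* uncompressed */,
			     FIL_TABLESPACE)) {
		/* size is given in database pages; the file must be closed */
		node_name = fil_node_create("./test/t1.ibd",
					    FIL_IBD_FILE_INITIAL_SIZE,
					    space_id, FALSE /* not raw */);
		ut_a(node_name != NULL);
	}
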
+@return flags, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_flags( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the compressed page size of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return compressed page size, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_zip_size( +/*===================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. +@return TRUE if the address is meaningful */ +UNIV_INTERN +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint page_no);/*!< in: page number */ +/****************************************************************//** +Initializes the tablespace memory cache. */ +UNIV_INTERN +void +fil_init( +/*=====*/ + ulint hash_size, /*!< in: hash table size */ + ulint max_n_open); /*!< in: max number of open files */ +/*******************************************************************//** +Initializes the tablespace memory cache. */ +UNIV_INTERN +void +fil_close(void); +/*===========*/ +/*******************************************************************//** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. */ +UNIV_INTERN +void +fil_open_log_and_system_tablespace_files(void); +/*==========================================*/ +/*******************************************************************//** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. */ +UNIV_INTERN +void +fil_close_all_files(void); +/*=====================*/ +/*******************************************************************//** +Closes the redo log files. There must not be any pending i/o's or not +flushed modifications in the files. */ +UNIV_INTERN +void +fil_close_log_files( +/*================*/ + bool free); /*!< in: whether to free the memory object */ +/*******************************************************************//** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ +UNIV_INTERN +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id);/*!< in: maximum known id */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Writes the flushed lsn and the latest archived log number to the page +header of the first page of each data file in the system tablespace. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fil_write_flushed_lsn_to_data_files( +/*================================*/ + lsn_t lsn, /*!< in: lsn to write */ + ulint arch_log_no); /*!< in: latest archived log file number */ +/*******************************************************************//** +Reads the flushed lsn, arch no, and tablespace flag fields from a data +file at database startup. 
+@retval NULL on success, or if innodb_force_recovery is set
+@return pointer to an error message string */
+UNIV_INTERN
+const char*
+fil_read_first_page(
+/*================*/
+ os_file_t data_file, /*!< in: open data file */
+ ibool one_read_already, /*!< in: TRUE if min and max
+ parameters below already
+ contain sensible data */
+ ulint* flags, /*!< out: tablespace flags */
+ ulint* space_id, /*!< out: tablespace ID */
+ lsn_t* min_flushed_lsn, /*!< out: min of flushed
+ lsn values in data files */
+ lsn_t* max_flushed_lsn) /*!< out: max of flushed
+ lsn values in data files */
+ __attribute__((warn_unused_result));
+/*******************************************************************//**
+Increments the count of pending operations, if the space is not being deleted.
+@return TRUE if being deleted, and operation should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ops(
+/*================*/
+ ulint id, /*!< in: space id */
+ ibool print_err); /*!< in: need to print error or not */
+/*******************************************************************//**
+Decrements the count of pending operations. */
+UNIV_INTERN
+void
+fil_decr_pending_ops(
+/*=================*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that mysqlbackup --apply-log sets fil_path_to_mysql_datadir to point to
+the datadir that we should use in replaying the file operations.
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ byte* ptr, /*!< in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+ not fit completely between ptr and end_ptr */
+ byte* end_ptr, /*!< in: buffer end */
+ ulint type, /*!< in: the type of this log record */
+ ulint space_id, /*!< in: the space id of the tablespace in
+ question, or 0 if the log record should
+ only be parsed but not replayed */
+ ulint log_flags); /*!< in: redo log flags
+ (stored in the page number parameter) */
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_delete_tablespace(
+/*==================*/
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove); /*!< in: specify the action to take
+ on the table's pages in the buffer
+ pool */
+/*******************************************************************//**
+Closes a single-table tablespace. The tablespace must be cached in the
+memory cache. Free all pages used by the tablespace.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_close_tablespace(
+/*=================*/
+ trx_t* trx, /*!< in/out: Transaction covering the close */
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace.
The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but + + 1. We do not drop the table from the data dictionary; + + 2. We remove all insert buffer entries for the tablespace immediately; + in DROP TABLE they are only removed gradually in the background; + + 3. When the user does IMPORT TABLESPACE, the tablespace will have the + same id as it originally had. + + 4. Free all the pages in use by the tablespace if rename=TRUE. +@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_discard_tablespace( +/*===================*/ + ulint id) /*!< in: space id */ + __attribute__((warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. +@return TRUE if success */ +UNIV_INTERN +ibool +fil_rename_tablespace( +/*==================*/ + const char* old_name_in, /*!< in: old table name in the + standard databasename/tablename + format of InnoDB, or NULL if we + do the rename based on the space + id only */ + ulint id, /*!< in: space id */ + const char* new_name, /*!< in: new table name in the + standard databasename/tablename + format of InnoDB */ + const char* new_path); /*!< in: new full datafile path + if the tablespace is remotely + located, or NULL if it is located + in the normal data directory. */ + +/*******************************************************************//** +Allocates a file name for a single-table tablespace. The string must be freed +by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_ibd_name( +/*==============*/ + const char* name, /*!< in: table name or a dir path */ + bool is_full_path); /*!< in: TRUE if it is a dir path */ +/*******************************************************************//** +Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link). +The string must be freed by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_isl_name( +/*==============*/ + const char* name); /*!< in: table name */ +/*******************************************************************//** +Creates a new InnoDB Symbolic Link (ISL) file. It is always created +under the 'datadir' of MySQL. The datadir is the directory of a +running mysqld program. We can refer to it by simply using the path '.'. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_link_file( +/*=================*/ + const char* tablename, /*!< in: tablename */ + const char* filepath); /*!< in: pathname of tablespace */ +/*******************************************************************//** +Deletes an InnoDB Symbolic Link (ISL) file. */ +UNIV_INTERN +void +fil_delete_link_file( +/*==================*/ + const char* tablename); /*!< in: name of table */ +/*******************************************************************//** +Reads an InnoDB Symbolic Link (ISL) file. +It is always created under the 'datadir' of MySQL. The name is of the +form {databasename}/{tablename}. and the isl file is expected to be in a +'{databasename}' directory called '{tablename}.isl'. The caller must free +the memory of the null-terminated path returned if it is not null. +@return own: filepath found in link file, NULL if not found. 
*/
+UNIV_INTERN
+char*
+fil_read_link_file(
+/*===============*/
+ const char* name); /*!< in: tablespace name */
+/*******************************************************************//**
+Creates a new single-table tablespace in a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it simply by the
+path '.'. Tables created with CREATE TEMPORARY TABLE are placed in the temp
+dir of the mysqld server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ ulint space_id, /*!< in: space id */
+ const char* tablename, /*!< in: the table name in the usual
+ databasename/tablename format
+ of InnoDB */
+ const char* dir_path, /*!< in: NULL or a dir path */
+ ulint flags, /*!< in: tablespace flags */
+ ulint flags2, /*!< in: table flags2 */
+ ulint size) /*!< in: the initial size of the
+ tablespace file in pages,
+ must be >= FIL_IBD_FILE_INITIAL_SIZE */
+ __attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If it does not succeed, it prints an error message to the .err
+log. This function is used to open a tablespace when we start up mysqld, and
+also in IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file. This boolean may be initially FALSE, but if
+a remote tablespace is found it will be changed to true.
+
+If the fix_dict boolean is set, then it is safe to use an internal SQL
+statement to update the dictionary tables if they are incorrect.
+
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_open_single_table_tablespace(
+/*=============================*/
+ bool validate, /*!< in: Do we validate tablespace? */
+ bool fix_dict, /*!< in: Can we fix the dictionary? */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: tablespace flags */
+ const char* tablename, /*!< in: table name in the
+ databasename/tablename format */
+ const char* filepath) /*!< in: tablespace filepath */
+ __attribute__((nonnull(5), warn_unused_result));
+
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fil_load_single_table_tablespaces(void);
+/*===================================*/
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there.
+@return TRUE if it does not exist or is being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ ulint id, /*!< in: space id */
+ ib_int64_t version);/*!< in: tablespace_version should be this; if
+ you pass -1 as the value of this, then this
+ parameter is ignored */
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace exists in the memory cache.
+@return TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+@return TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ ulint id, /*!< in: space id */
+ const char* name, /*!< in: table name in the standard
+ 'databasename/tablename' format */
+ ibool mark_space, /*!< in: in crash recovery, at database
+ startup we mark all spaces which have
+ an associated table in the InnoDB
+ data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist,
+ /*!< in: print detailed error
+ information to the .err log if a
+ matching tablespace is not found in
+ memory */
+ bool adjust_space, /*!< in: whether to adjust space id
+ when a tablespace mismatch is found */
+ mem_heap_t* heap, /*!< in: heap memory */
+ table_id_t table_id); /*!< in: table id */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+mysqlbackup --apply-log phase we extended the spaces on-demand so that log
+records could be applied, but that may have left spaces still too small
+compared to the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void);
+/*======================================*/
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ ulint* actual_size, /*!< out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /*!< in: space id */
+ ulint size_after_extend);/*!< in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+/*******************************************************************//**
+Tries to reserve free extents in a file space.
+@return TRUE if succeed */ +UNIV_INTERN +ibool +fil_space_reserve_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_free_now, /*!< in: number of free extents now */ + ulint n_to_reserve); /*!< in: how many one wants to reserve */ +/*******************************************************************//** +Releases free extents in a file space. */ +UNIV_INTERN +void +fil_space_release_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_reserved); /*!< in: how many one reserved */ +/*******************************************************************//** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ +UNIV_INTERN +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id); /*!< in: space id */ +/********************************************************************//** +Reads or writes data. This operation is asynchronous (aio). +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message) \ + _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL) + +UNIV_INTERN +dberr_t +_fil_io( +/*===*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE, + ORed to OS_FILE_LOG, if a log i/o + and ORed to OS_AIO_SIMULATED_WAKE_LATER + if simulated aio and we want to post a + batch of i/os; NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! */ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in + aio this must be divisible by the OS block + size */ + ulint len, /*!< in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ + void* buf, /*!< in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + trx_t* trx) + __attribute__((nonnull(8))); +/**********************************************************************//** +Waits for an aio operation to complete. This function is used to write the +handler for completed requests. The aio array of pending requests is divided +into segments (see os0file.cc for more info). The thread specifies which +segment it wants to wait for. */ +UNIV_INTERN +void +fil_aio_wait( +/*=========*/ + ulint segment); /*!< in: the number of the segment in the aio + array to wait for */ +/**********************************************************************//** +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ +UNIV_INTERN +void +fil_flush( +/*======*/ + ulint space_id); /*!< in: file space id (this can be a group of + log files or a tablespace of the database) */ +/**********************************************************************//** +Flushes to disk writes in file spaces of the given type possibly cached by +the OS. 
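A sketch of a synchronous one-page read through the fil_io() macro above. Here space_id, page_no and the appropriately aligned buffer buf are hypothetical; for synchronous i/o the message argument is unused:

	dberr_t	err;

	err = fil_io(OS_FILE_READ, true /* sync */,
		     space_id, 0 /* zip_size: uncompressed */,
		     page_no /* block_offset */, 0 /* byte_offset */,
		     UNIV_PAGE_SIZE, buf, NULL /* message */);

	ut_a(err == DB_SUCCESS);
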
*/
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+	ulint	purpose);	/*!< in: FIL_TABLESPACE, FIL_LOG */
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void);
+/*==============*/
+/********************************************************************//**
+Returns TRUE if file address is undefined.
+@return TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+	fil_addr_t	addr);	/*!< in: address */
+/********************************************************************//**
+Get the predecessor of a file page.
+@return FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+	const byte*	page);	/*!< in: file page */
+/********************************************************************//**
+Get the successor of a file page.
+@return FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+	const byte*	page);	/*!< in: file page */
+/*********************************************************************//**
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+	byte*	page,	/*!< in/out: file page */
+	ulint	type);	/*!< in: type */
+/*********************************************************************//**
+Gets the file page type.
+@return type; NOTE that if the type has not been written to the page, the
+return value is not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+	const byte*	page);	/*!< in: file page */
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace is being deleted.
+@return TRUE if being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_is_being_deleted(
+/*============================*/
+	ulint	id);	/*!< in: space id */
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+UNIV_INTERN
+void
+fil_delete_file(
+/*============*/
+	const char*	path);	/*!< in: filepath of the ibd tablespace */
+
+/** Callback functor. */
+struct PageCallback {
+
+	/**
+	Default constructor */
+	PageCallback()
+		:
+		m_zip_size(),
+		m_page_size(),
+		m_filepath() UNIV_NOTHROW {}
+
+	virtual ~PageCallback() UNIV_NOTHROW {}
+
+	/**
+	Called for page 0 in the tablespace file at the start.
+	@param file_size - size of the file in bytes
+	@param block - contents of the first page in the tablespace file
+	@retval DB_SUCCESS or error code. */
+	virtual dberr_t init(
+		os_offset_t		file_size,
+		const buf_block_t*	block) UNIV_NOTHROW = 0;
+
+	/**
+	Called for every page in the tablespace. If the page was not
+	updated then its state must be set to BUF_PAGE_NOT_USED. For
+	compressed tables the page descriptor memory will be at offset:
+	block->frame + UNIV_PAGE_SIZE;
+	@param offset - physical offset within the file
+	@param block - block read from file, note it is not from the buffer pool
+	@retval DB_SUCCESS or error code. */
+	virtual dberr_t operator()(
+		os_offset_t	offset,
+		buf_block_t*	block) UNIV_NOTHROW = 0;
+
+	/**
+	Set the name of the physical file and the file handle that is used
+	to open it for the file that is being iterated over.
+	@param filename - the physical name of the tablespace file.
+	@param file - OS file handle */
+	void set_file(const char* filename, os_file_t file) UNIV_NOTHROW
+	{
+		m_file = file;
+		m_filepath = filename;
+	}
+
+	/**
+	@return the space id of the tablespace */
+	virtual ulint get_space_id() const UNIV_NOTHROW = 0;
+
+	/** The compressed page size
+	@return the compressed page size */
+	ulint get_zip_size() const
+	{
+		return(m_zip_size);
+	}
+
+	/**
+	Set the tablespace compressed table size.
+	@return DB_SUCCESS if it is valid or DB_CORRUPTION if not */
+	dberr_t set_zip_size(const buf_frame_t* page) UNIV_NOTHROW;
+
+	/** The tablespace page size
+	@return the page size */
+	ulint get_page_size() const
+	{
+		return(m_page_size);
+	}
+
+	/** Compressed table page size */
+	ulint		m_zip_size;
+
+	/** The tablespace page size. */
+	ulint		m_page_size;
+
+	/** File handle to the tablespace */
+	os_file_t	m_file;
+
+	/** Physical file path. */
+	const char*	m_filepath;
+
+protected:
+	// Disable copying
+	PageCallback(const PageCallback&);
+	PageCallback& operator=(const PageCallback&);
+};
+
+/********************************************************************//**
+Iterate over all the pages in the tablespace.
+@param table - the table definition in the server
+@param n_io_buffers - number of blocks to read and write together
+@param callback - functor that will do the page updates
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_tablespace_iterate(
+/*===================*/
+	dict_table_t*	table,
+	ulint		n_io_buffers,
+	PageCallback&	callback)
+	__attribute__((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache.
+@return space id, ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+	const char*	name);	/*!< in: table name in the standard
+				'databasename/tablename' format */
+
+/**
+Iterate over all the spaces in the space list and fetch the
+tablespace names. It will return a copy of the name that must be
+freed by the caller using: delete[].
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fil_get_space_names(
+/*================*/
+	space_name_list_t&	space_name_list)
+				/*!< in/out: Vector for collecting the names. */
+	__attribute__((warn_unused_result));
+
+/****************************************************************//**
+Generate redo logs for swapping two .ibd files */
+UNIV_INTERN
+void
+fil_mtr_rename_log(
+/*===============*/
+	ulint		old_space_id,	/*!< in: tablespace id of the old
+					table. */
+	const char*	old_name,	/*!< in: old table name */
+	ulint		new_space_id,	/*!< in: tablespace id of the new
+					table */
+	const char*	new_name,	/*!< in: new table name */
+	const char*	tmp_name,	/*!< in: temp table name used while
+					swapping */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
+
+/*******************************************************************//**
+Finds the given page_no of the given space id from the double write buffer,
+and copies it to the corresponding .ibd file.
+@return true if copy was successful, or false.
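+
+As an aside, a minimal functor implementing the PageCallback interface
+declared above could look as follows (an illustrative sketch only; the
+class name and its body are hypothetical):
+
+	struct PageCounter : public PageCallback {
+		PageCounter() : m_n_pages(), m_space_id() {}
+
+		virtual dberr_t init(os_offset_t, const buf_block_t* block)
+			UNIV_NOTHROW
+		{
+			// remember the space id read from page 0
+			m_space_id = mach_read_from_4(
+				block->frame + FIL_PAGE_SPACE_ID);
+			return(DB_SUCCESS);
+		}
+
+		virtual dberr_t operator()(os_offset_t, buf_block_t*)
+			UNIV_NOTHROW
+		{
+			++m_n_pages;	// count every page visited
+			return(DB_SUCCESS);
+		}
+
+		virtual ulint get_space_id() const UNIV_NOTHROW
+		{
+			return(m_space_id);
+		}
+
+		ulint	m_n_pages;
+		ulint	m_space_id;
+	};
+
+It would be driven by fil_tablespace_iterate(table, n_io_buffers, counter),
+declared above.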
*/
+bool
+fil_user_tablespace_restore_page(
+/*==============================*/
+	fsp_open_info*	fsp,		/* in: contains space id and .ibd
+					file information */
+	ulint		page_no);	/* in: page_no to obtain from double
+					write buffer */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************************
+Return local hash table information. */
+
+ulint
+fil_system_hash_cells(void);
+/*========================*/
+
+ulint
+fil_system_hash_nodes(void);
+/*========================*/
+
+/*************************************************************************
+Functions to access the is_corrupt flag of fil_space_t. */
+
+ibool
+fil_space_is_corrupt(
+/*=================*/
+	ulint	space_id);
+
+void
+fil_space_set_corrupt(
+/*==================*/
+	ulint	space_id);
+
+#endif /* fil0fil_h */
diff --git a/storage/xtradb/include/fsp0fsp.h b/storage/xtradb/include/fsp0fsp.h
new file mode 100644
index 00000000000..a587ccc9f20
--- /dev/null
+++ b/storage/xtradb/include/fsp0fsp.h
@@ -0,0 +1,747 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "mtr0mtr.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "fsp0types.h"
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */
+
+/** Width of the POST_ANTELOPE flag */
+#define FSP_FLAGS_WIDTH_POST_ANTELOPE	1
+/** Number of flag bits used to indicate the tablespace zip page size */
+#define FSP_FLAGS_WIDTH_ZIP_SSIZE	4
+/** Width of the ATOMIC_BLOBS flag. The ability to break up a long
+column into an in-record prefix and an externally stored part is available
+to the two Barracuda row formats COMPRESSED and DYNAMIC. */
+#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS	1
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_WIDTH_PAGE_SSIZE	4
+/** Width of the DATA_DIR flag. This flag indicates that the tablespace
+is found in a remote location, not the default data directory.
*/
+#define FSP_FLAGS_WIDTH_DATA_DIR	1
+/** Width of all the currently known tablespace flags */
+#define FSP_FLAGS_WIDTH		(FSP_FLAGS_WIDTH_POST_ANTELOPE	\
+				+ FSP_FLAGS_WIDTH_ZIP_SSIZE	\
+				+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS	\
+				+ FSP_FLAGS_WIDTH_PAGE_SSIZE	\
+				+ FSP_FLAGS_WIDTH_DATA_DIR)
+
+/** A mask of all the known/used bits in tablespace flags */
+#define FSP_FLAGS_MASK		(~(~0 << FSP_FLAGS_WIDTH))
+
+/** Zero relative shift position of the POST_ANTELOPE field */
+#define FSP_FLAGS_POS_POST_ANTELOPE	0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define FSP_FLAGS_POS_ZIP_SSIZE		(FSP_FLAGS_POS_POST_ANTELOPE	\
+					+ FSP_FLAGS_WIDTH_POST_ANTELOPE)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_POS_ATOMIC_BLOBS	(FSP_FLAGS_POS_ZIP_SSIZE	\
+					+ FSP_FLAGS_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the PAGE_SSIZE field */
+#define FSP_FLAGS_POS_PAGE_SSIZE	(FSP_FLAGS_POS_ATOMIC_BLOBS	\
+					+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the DATA_DIR field */
+#define FSP_FLAGS_POS_DATA_DIR		(FSP_FLAGS_POS_PAGE_SSIZE	\
+					+ FSP_FLAGS_WIDTH_PAGE_SSIZE)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define FSP_FLAGS_POS_UNUSED		(FSP_FLAGS_POS_DATA_DIR	\
+					+ FSP_FLAGS_WIDTH_DATA_DIR)
+
+/** Bit mask of the POST_ANTELOPE field */
+#define FSP_FLAGS_MASK_POST_ANTELOPE				\
+		((~(~0 << FSP_FLAGS_WIDTH_POST_ANTELOPE))	\
+		<< FSP_FLAGS_POS_POST_ANTELOPE)
+/** Bit mask of the ZIP_SSIZE field */
+#define FSP_FLAGS_MASK_ZIP_SSIZE				\
+		((~(~0 << FSP_FLAGS_WIDTH_ZIP_SSIZE))		\
+		<< FSP_FLAGS_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_MASK_ATOMIC_BLOBS				\
+		((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_BLOBS))	\
+		<< FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE				\
+		((~(~0 << FSP_FLAGS_WIDTH_PAGE_SSIZE))		\
+		<< FSP_FLAGS_POS_PAGE_SSIZE)
+/** Bit mask of the DATA_DIR field */
+#define FSP_FLAGS_MASK_DATA_DIR					\
+		((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR))		\
+		<< FSP_FLAGS_POS_DATA_DIR)
+
+/** Return the value of the POST_ANTELOPE field */
+#define FSP_FLAGS_GET_POST_ANTELOPE(flags)			\
+		((flags & FSP_FLAGS_MASK_POST_ANTELOPE)		\
+		>> FSP_FLAGS_POS_POST_ANTELOPE)
+/** Return the value of the ZIP_SSIZE field */
+#define FSP_FLAGS_GET_ZIP_SSIZE(flags)				\
+		((flags & FSP_FLAGS_MASK_ZIP_SSIZE)		\
+		>> FSP_FLAGS_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags)			\
+		((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS)		\
+		>> FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE(flags)				\
+		((flags & FSP_FLAGS_MASK_PAGE_SSIZE)		\
+		>> FSP_FLAGS_POS_PAGE_SSIZE)
+/** Return the value of the DATA_DIR field */
+#define FSP_FLAGS_HAS_DATA_DIR(flags)				\
+		((flags & FSP_FLAGS_MASK_DATA_DIR)		\
+		>> FSP_FLAGS_POS_DATA_DIR)
+/** Return the contents of the UNUSED bits */
+#define FSP_FLAGS_GET_UNUSED(flags)				\
+		(flags >> FSP_FLAGS_POS_UNUSED)
+
+/** Set a PAGE_SSIZE into the correct bits in a given
+tablespace flags.
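+With the field layout defined above (POST_ANTELOPE at bit 0, ZIP_SSIZE in
+bits 1-4, ATOMIC_BLOBS at bit 5, PAGE_SSIZE in bits 6-9, DATA_DIR at bit 10),
+a worked example: FSP_FLAGS_SET_PAGE_SSIZE(0, 5) below yields 5 << 6 = 0x140,
+and FSP_FLAGS_GET_PAGE_SSIZE(0x140) recovers 5.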
*/
+#define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize)		\
+	(flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE))
+
+/* @} */
+
+/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */
+
+/** Offset of the space header within a file page */
+#define FSP_HEADER_OFFSET	FIL_PAGE_DATA
+
+/* The data structures in files are defined just as byte strings in C */
+typedef	byte	fsp_header_t;
+typedef	byte	xdes_t;
+
+/*			SPACE HEADER
+			============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID		0	/* space id */
+#define FSP_NOT_USED		4	/* this field contained a value up to
+					which we know that the modifications
+					in the database have been flushed to
+					the file space; not used now */
+#define	FSP_SIZE		8	/* Current size of the space in
+					pages */
+#define	FSP_FREE_LIMIT		12	/* Minimum page number for which the
+					free list has not been initialized:
+					the pages >= this limit are, by
+					definition, free; note that in a
+					single-table tablespace where size
+					< 64 pages, this number is 64, i.e.,
+					we have initialized the space
+					about the first extent, but have not
+					physically allocated those pages to the
+					file */
+#define	FSP_SPACE_FLAGS		16	/* fsp_space_t.flags, similar to
+					dict_table_t::flags */
+#define	FSP_FRAG_N_USED		20	/* number of used pages in the
+					FSP_FREE_FRAG list */
+#define	FSP_FREE		24	/* list of free extents */
+#define	FSP_FREE_FRAG		(24 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents not
+					belonging to any segment */
+#define	FSP_FULL_FRAG		(24 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents not belonging
+					to any segment */
+#define FSP_SEG_ID		(24 + 3 * FLST_BASE_NODE_SIZE)
+					/* 8 bytes which give the first unused
+					segment id */
+#define FSP_SEG_INODES_FULL	(32 + 3 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where all the segment inode
+					slots are reserved */
+#define FSP_SEG_INODES_FREE	(32 + 4 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where not all the segment
+					header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define	FSP_HEADER_SIZE		(32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define	FSP_FREE_ADD		4	/* this many free extents are added
+					to the free list from above
+					FSP_FREE_LIMIT at a time */
+/* @} */
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
+
+/*			FILE SEGMENT INODE
+			==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages.
*/ + +typedef byte fseg_inode_t; + +#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA + /* the list node for linking + segment inode pages */ + +#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE) +/*-------------------------------------*/ +#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0, + it means that the header is unused */ +#define FSEG_NOT_FULL_N_USED 8 + /* number of used segment pages in + the FSEG_NOT_FULL list */ +#define FSEG_FREE 12 + /* list of free extents of this + segment */ +#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE) + /* list of partially free extents */ +#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents */ +#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE) + /* magic number used in debugging */ +#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE) + /* array of individual pages + belonging to this segment in fsp + fragment extent lists */ +#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2) + /* number of slots in the array for + the fragment pages */ +#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its + page number within space, FIL_NULL + means that the slot is not in use */ +/*-------------------------------------*/ +#define FSEG_INODE_SIZE \ + (16 + 3 * FLST_BASE_NODE_SIZE \ + + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) + +#define FSP_SEG_INODES_PER_PAGE(zip_size) \ + (((zip_size ? zip_size : UNIV_PAGE_SIZE) \ + - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE) + /* Number of segment inodes which fit on a + single page */ + +#define FSEG_MAGIC_N_VALUE 97937874 + +#define FSEG_FILLFACTOR 8 /* If this value is x, then if + the number of unused but reserved + pages in a segment is less than + reserved pages * 1/x, and there are + at least FSEG_FRAG_LIMIT used pages, + then we allow a new empty extent to + be added to the segment in + fseg_alloc_free_page. Otherwise, we + use unused pages of the segment. */ + +#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS + /* If the segment has >= this many + used pages, it may be expanded by + allocating extents to the segment; + until that only individual fragment + pages are allocated from the space */ + +#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment + is at least this many extents, we + allow extents to be put to the free + list of the extent: at most + FSEG_FREE_LIST_MAX_LEN many */ +#define FSEG_FREE_LIST_MAX_LEN 4 +/* @} */ + +/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */ + +/* EXTENT DESCRIPTOR + ================= + +File extent descriptor data structure: contains bits to tell which pages in +the extent are free and which contain old tuple version to clean. */ + +/*-------------------------------------*/ +#define XDES_ID 0 /* The identifier of the segment + to which this extent belongs */ +#define XDES_FLST_NODE 8 /* The list node data structure + for the descriptors */ +#define XDES_STATE (FLST_NODE_SIZE + 8) + /* contains state information + of the extent */ +#define XDES_BITMAP (FLST_NODE_SIZE + 12) + /* Descriptor bitmap of the pages + in the extent */ +/*-------------------------------------*/ + +#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */ +#define XDES_FREE_BIT 0 /* Index of the bit which tells if + the page is free */ +#define XDES_CLEAN_BIT 1 /* NOTE: currently not used! 
+					Index of the bit which tells if
+					there are old versions of tuples
+					on the page */
+/* States of a descriptor */
+#define	XDES_FREE		1	/* extent is in free list of space */
+#define	XDES_FREE_FRAG		2	/* extent is in free fragment list of
+					space */
+#define	XDES_FULL_FRAG		3	/* extent is in full fragment list of
+					space */
+#define	XDES_FSEG		4	/* extent belongs to a segment */
+
+/** File extent data structure size in bytes. */
+#define	XDES_SIZE							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MAX page size. */
+#define	XDES_SIZE_MAX							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MIN page size. */
+#define	XDES_SIZE_MIN							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE))
+
+/** Offset of the descriptor array on a descriptor page */
+#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+/* @} */
+
+/**********************************************************************//**
+Initializes the file space system. */
+UNIV_INTERN
+void
+fsp_init(void);
+/*==========*/
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller.
+@return size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void);
+/*================================*/
+/**********************************************************************//**
+Reads the file space size stored in the header page.
+@return tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+	page_t*	page);	/*!< in: header page (page 0 in the tablespace) */
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return space id, ULINT_UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+	const page_t*	page);	/*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+	const page_t*	page);	/*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+	const page_t*	page);	/*!< in: first page of a tablespace */
+/**********************************************************************//**
+Writes the space id and flags to a tablespace header. The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace.
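+A round trip through the accessors declared above would then hold
+(an illustrative sketch only):
+
+	fsp_header_init_fields(page, space_id, flags);
+	ut_ad(fsp_header_get_space_id(page) == space_id);
+	ut_ad(fsp_header_get_flags(page) == flags);
+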
*/
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+	page_t*	page,		/*!< in/out: first page in the space */
+	ulint	space_id,	/*!< in: space id */
+	ulint	flags);		/*!< in: tablespace flags (FSP_SPACE_FLAGS):
+				0, or table->flags if newer than COMPACT */
+/**********************************************************************//**
+Initializes the space header of a newly created space and also creates the
+insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+	ulint	space,		/*!< in: space id */
+	ulint	size,		/*!< in: current size in blocks */
+	mtr_t*	mtr);		/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+	ulint	space,		/*!< in: space id */
+	ulint	size_inc,	/*!< in: size increment in pages */
+	mtr_t*	mtr);		/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+	ulint	space,	/*!< in: space id */
+	ulint	page,	/*!< in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /*!< in: byte offset of the created segment header
+			on the page */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	page,	/*!< in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /*!< in: byte offset of the created segment header
+			on the page */
+	ibool	has_done_reservation, /*!< in: TRUE if the caller has already
+			done the reservation for the pages with
+			fsp_reserve_free_extents (at least 2 extents: one for
+			the inode and the other for the segment) then there is
+			no need to do the check for this individual
+			operation */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+	fseg_header_t*	header,	/*!< in: segment header */
+	ulint*		used,	/*!< out: number of pages used (<= reserved) */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation.
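+A typical create-then-allocate sequence, under a started mini-transaction,
+might look like this (an illustrative sketch; the local names are
+hypothetical):
+
+	buf_block_t*	block = fseg_create(space_id, 0, byte_offset, &mtr);
+
+	if (block != NULL) {
+		fseg_header_t*	seg_header
+			= buf_block_get_frame(block) + byte_offset;
+
+		buf_block_t*	new_block = fseg_alloc_free_page(
+			seg_header, hint_page_no, FSP_UP, &mtr);
+	}
+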
+@param[in/out]	seg_header	segment header
+@param[in]	hint		hint of which page would be desirable
+@param[in]	direction	if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR
+@param[in/out]	mtr		mini-transaction
+@return X-latched block, or NULL if no page could be allocated */
+#define fseg_alloc_free_page(seg_header, hint, direction, mtr)	\
+	fseg_alloc_free_page_general(seg_header, hint, direction,	\
+				     FALSE, mtr, mtr)
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+UNIV_INTERN
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+	fseg_header_t*	seg_header,/*!< in/out: segment header */
+	ulint		hint,	/*!< in: hint of which page would be
+				desirable */
+	byte		direction,/*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	ibool		has_done_reservation, /*!< in: TRUE if the caller has
+				already done the reservation for the page
+				with fsp_reserve_free_extents, then there
+				is no need to do the check for this individual
+				page */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
+				in which the page should be initialized.
+				If init_mtr!=mtr, but the page is already
+				latched in mtr, do not initialize the page. */
+	__attribute__((warn_unused_result, nonnull));
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages.
That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available.
+@return TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+	ulint*	n_reserved,/*!< out: number of extents actually reserved; if we
+			return TRUE and the tablespace size is < 64 pages,
+			then this can be 0, otherwise it is n_ext */
+	ulint	space,	/*!< in: space id */
+	ulint	n_ext,	/*!< in: number of extents to reserve */
+	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+	mtr_t*	mtr);	/*!< in: mini-transaction */
+/**********************************************************************//**
+This function should be used to get information on how much we still
+will be able to insert new data to the database without running out of
+the tablespace. Only free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents.
+@return available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+	ulint	space);	/*!< in: space id */
+/**********************************************************************//**
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+	fseg_header_t*	seg_header, /*!< in: segment header */
+	ulint		space,	/*!< in: space id */
+	ulint		page,	/*!< in: page offset */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Checks if a single page of a segment is free.
+@return true if free */
+UNIV_INTERN
+bool
+fseg_page_is_free(
+/*==============*/
+	fseg_header_t*	seg_header, /*!< in: segment header */
+	ulint		space,	/*!< in: space id */
+	ulint		page)	/*!< in: page offset */
+	__attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment
+by repeatedly calling this function in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction.
+@return TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+	fseg_header_t*	header,	/*!< in, own: segment header; NOTE: if the header
+			resides on the first page of the frag list
+			of the segment, this pointer becomes obsolete
+			after the last freeing step */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+	fseg_header_t*	header,	/*!< in: segment header which must reside on
+			the first fragment page of the segment */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	page_no);/*!< in: page number */
+/***********************************************************//**
+Parses a redo log record of a file page init.
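+(As an aside: the reservation protocol documented above for
+fsp_reserve_free_extents() pairs with fil_space_release_free_extents();
+a hedged sketch of the calling convention, with hypothetical locals:
+
+	ulint	n_reserved;
+
+	if (fsp_reserve_free_extents(&n_reserved, space_id, 2,
+				     FSP_NORMAL, &mtr)) {
+
+		do_btree_page_split();	// hypothetical stand-in
+
+		fil_space_release_free_extents(space_id, n_reserved);
+	}
+
+Note that n_reserved, not the requested count, is what gets released.)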
+@return end of log record or NULL */ +UNIV_INTERN +byte* +fsp_parse_init_file_page( +/*=====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr, /*!< in: buffer end */ + buf_block_t* block); /*!< in: block or NULL */ +/*******************************************************************//** +Validates the file space system and its segments. +@return TRUE if ok */ +UNIV_INTERN +ibool +fsp_validate( +/*=========*/ + ulint space); /*!< in: space id */ +/*******************************************************************//** +Prints info of a file space. */ +UNIV_INTERN +void +fsp_print( +/*======*/ + ulint space); /*!< in: space id */ +#ifdef UNIV_DEBUG +/*******************************************************************//** +Validates a segment. +@return TRUE if ok */ +UNIV_INTERN +ibool +fseg_validate( +/*==========*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +#endif /* UNIV_DEBUG */ +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. */ +UNIV_INTERN +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +#endif /* UNIV_BTR_PRINT */ + +/********************************************************************//** +Validate and return the tablespace flags, which are stored in the +tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for +ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats, +COMPRESSED and DYNAMIC, use a file format > Antelope so they should +have a file format number plus the DICT_TF_COMPACT bit set. +@return true if check ok */ +UNIV_INLINE +bool +fsp_flags_is_valid( +/*===============*/ + ulint flags) /*!< in: tablespace flags */ + __attribute__((warn_unused_result, const)); +/********************************************************************//** +Determine if the tablespace is compressed from dict_table_t::flags. +@return TRUE if compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_compressed( +/*====================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Calculates the descriptor index within a descriptor page. +@return descriptor index */ +UNIV_INLINE +ulint +xdes_calc_descriptor_index( +/*=======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset); /*!< in: page offset */ + +/**********************************************************************//** +Gets a descriptor bit of a page. +@return TRUE if free */ +UNIV_INLINE +ibool +xdes_get_bit( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset);/*!< in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ + +/********************************************************************//** +Calculates the page where the descriptor of a page resides. +@return descriptor page offset */ +UNIV_INLINE +ulint +xdes_calc_descriptor_page( +/*======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset); /*!< in: page offset */ + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************************//** +Extract the zip size from tablespace flags. A tablespace has only one +physical page size whether that page is compressed or not. 
+@return compressed page size of the file-per-table tablespace in bytes, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_zip_size( +/*====================*/ + ulint flags); /*!< in: tablespace flags */ +/********************************************************************//** +Extract the page size from tablespace flags. +@return page size of the tablespace in bytes */ +UNIV_INLINE +ulint +fsp_flags_get_page_size( +/*====================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0fsp.ic" +#endif + +#endif diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic new file mode 100644 index 00000000000..0d81e817cc9 --- /dev/null +++ b/storage/xtradb/include/fsp0fsp.ic @@ -0,0 +1,314 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0fsp.ic +File space management + +Created 12/18/1995 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +/***********************************************************************//** +Checks if a page address is an extent descriptor page address. +@return TRUE if a descriptor page */ +UNIV_INLINE +ibool +fsp_descr_page( +/*===========*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/*!< in: page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return((page_no & (UNIV_PAGE_SIZE - 1)) == FSP_XDES_OFFSET); + } + + return((page_no & (zip_size - 1)) == FSP_XDES_OFFSET); +} + +/********************************************************************//** +Validate and return the tablespace flags, which are stored in the +tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for +ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats, +COMPRESSED and DYNAMIC, use a file format > Antelope so they should +have a file format number plus the DICT_TF_COMPACT bit set. +@return true if check ok */ +UNIV_INLINE +bool +fsp_flags_is_valid( +/*===============*/ + ulint flags) /*!< in: tablespace flags */ +{ + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + ulint unused = FSP_FLAGS_GET_UNUSED(flags); + + DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); + + /* fsp_flags is zero unless atomic_blobs is set. */ + /* Make sure there are no bits that we do not know about. 
*/ + if (unused != 0 || flags == 1) { + return(false); + } else if (post_antelope) { + /* The Antelope row formats REDUNDANT and COMPACT did + not use tablespace flags, so this flag and the entire + 4-byte field is zero for Antelope row formats. */ + + if (!atomic_blobs) { + return(false); + } + } + + if (!atomic_blobs) { + /* Barracuda row formats COMPRESSED and DYNAMIC build on + the page structure introduced for the COMPACT row format + by allowing long fields to be broken into prefix and + externally stored parts. */ + + if (post_antelope || zip_ssize != 0) { + return(false); + } + + } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + return(false); + } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { + + /* The page size field can be used for any row type, or it may + be zero for an original 16k page size. + Validate the page shift size is within allowed range. */ + + return(false); + + } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + return(false); + } + +#if UNIV_FORMAT_MAX != UNIV_FORMAT_B +# error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." +#endif + + /* The DATA_DIR field can be used for any row type so there is + nothing here to validate. */ + + return(true); +} + +/********************************************************************//** +Determine if the tablespace is compressed from dict_table_t::flags. +@return TRUE if compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_compressed( +/*====================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_ZIP_SSIZE(flags) != 0); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************************//** +Extract the zip size from tablespace flags. +@return compressed page size of the file-per-table tablespace in bytes, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_zip_size( +/*===================*/ + ulint flags) /*!< in: tablespace flags */ +{ + ulint zip_size = 0; + ulint ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + + /* Convert from a 'log2 minus 9' to a page size in bytes. */ + if (ssize) { + zip_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); + + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + } + + return(zip_size); +} + +/********************************************************************//** +Extract the page size from tablespace flags. +@return page size of the tablespace in bytes */ +UNIV_INLINE +ulint +fsp_flags_get_page_size( +/*====================*/ + ulint flags) /*!< in: tablespace flags */ +{ + ulint page_size = 0; + ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + + /* Convert from a 'log2 minus 9' to a page size in bytes. */ + if (UNIV_UNLIKELY(ssize)) { + page_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); + + ut_ad(page_size <= UNIV_PAGE_SIZE_MAX); + } else { + /* If the page size was not stored, then it is the + original 16k. */ + page_size = UNIV_PAGE_SIZE_ORIG; + } + + return(page_size); +} + +#ifndef UNIV_INNOCHECKSUM + +/********************************************************************//** +Add the page size to the tablespace flags. 
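+The ssize stored in the flags is 'log2(page size) - 9', mirroring the
+decoding in fsp_flags_get_page_size() above. A worked example, assuming the
+usual UNIV_ZIP_SIZE_MIN of 1024 (so (UNIV_ZIP_SIZE_MIN >> 1) == 512):
+ssize 1 decodes to 512 << 1 = 1024 bytes, ssize 5 to 512 << 5 = 16384 bytes,
+while a stored ssize of 0 is reserved for the original 16k page size.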
+@return tablespace flags after page size is added */ +UNIV_INLINE +ulint +fsp_flags_set_page_size( +/*====================*/ + ulint flags, /*!< in: tablespace flags */ + ulint page_size) /*!< in: page size in bytes */ +{ + ulint ssize = 0; + ulint shift; + + /* Page size should be > UNIV_PAGE_SIZE_MIN */ + ut_ad(page_size >= UNIV_PAGE_SIZE_MIN); + ut_ad(page_size <= UNIV_PAGE_SIZE_MAX); + + if (page_size == UNIV_PAGE_SIZE_ORIG) { + ut_ad(0 == FSP_FLAGS_GET_PAGE_SSIZE(flags)); + return(flags); + } + + for (shift = UNIV_PAGE_SIZE_SHIFT_MAX; + shift >= UNIV_PAGE_SIZE_SHIFT_MIN; + shift--) { + ulint mask = (1 << shift); + if (page_size & mask) { + ut_ad(!(page_size & ~mask)); + ssize = shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1; + break; + } + } + + ut_ad(ssize); + ut_ad(ssize <= UNIV_PAGE_SSIZE_MAX); + + flags = FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize); + + ut_ad(fsp_flags_is_valid(flags)); + + return(flags); +} + +/********************************************************************//** +Calculates the descriptor index within a descriptor page. +@return descriptor index */ +UNIV_INLINE +ulint +xdes_calc_descriptor_index( +/*=======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset) /*!< in: page offset */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (zip_size == 0) { + return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE) + / FSP_EXTENT_SIZE); + } else { + return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE); + } +} + +/**********************************************************************//** +Gets a descriptor bit of a page. +@return TRUE if free */ +UNIV_INLINE +ibool +xdes_get_bit( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset) /*!< in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ +{ + ut_ad(offset < FSP_EXTENT_SIZE); + ut_ad(bit == XDES_FREE_BIT || bit == XDES_CLEAN_BIT); + + ulint index = bit + XDES_BITS_PER_PAGE * offset; + + ulint bit_index = index % 8; + ulint byte_index = index / 8; + + return(ut_bit_get_nth( + mach_read_ulint(descr + XDES_BITMAP + byte_index, + MLOG_1BYTE), + bit_index)); +} + +/********************************************************************//** +Calculates the page where the descriptor of a page resides. 
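+As a worked example for uncompressed 16k pages (UNIV_PAGE_SIZE = 16384,
+FSP_EXTENT_SIZE = 64): descriptor pages sit at page offsets 0, 16384,
+32768, ..., so page offset 40000 maps to descriptor page
+ut_2pow_round(40000, 16384) = 32768, and xdes_calc_descriptor_index()
+above picks slot (40000 - 32768) / 64 = 113 in that page's descriptor array.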
+@return descriptor page offset */ +UNIV_INLINE +ulint +xdes_calc_descriptor_page( +/*======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset) /*!< in: page offset */ +{ +#ifndef DOXYGEN /* Doxygen gets confused by these */ +# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET \ + + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) \ + * XDES_SIZE_MAX +# error +# endif +# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET \ + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN) \ + * XDES_SIZE_MIN +# error +# endif +#endif /* !DOXYGEN */ + + ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) + * XDES_SIZE); + + ut_ad(ut_is_2pow(zip_size)); + + if (zip_size == 0) { + return(ut_2pow_round(offset, UNIV_PAGE_SIZE)); + } else { + ut_ad(zip_size > XDES_ARR_OFFSET + + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); + return(ut_2pow_round(offset, zip_size)); + } +} + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/xtradb/include/fsp0types.h b/storage/xtradb/include/fsp0types.h new file mode 100644 index 00000000000..94fd908ab0c --- /dev/null +++ b/storage/xtradb/include/fsp0types.h @@ -0,0 +1,116 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/****************************************************** +@file include/fsp0types.h +File space management types + +Created May 26, 2009 Vasil Dimov +*******************************************************/ + +#ifndef fsp0types_h +#define fsp0types_h + +#include "univ.i" + +#include "fil0fil.h" /* for FIL_PAGE_DATA */ + +/** @name Flags for inserting records in order +If records are inserted in order, there are the following +flags to tell this (their type is made byte for the compiler +to warn if direction and hint parameters are switched in +fseg_alloc_free_page) */ +/* @{ */ +#define FSP_UP ((byte)111) /*!< alphabetically upwards */ +#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */ +#define FSP_NO_DIR ((byte)113) /*!< no order */ +/* @} */ + +/** File space extent size (one megabyte) in pages */ +#define FSP_EXTENT_SIZE (1048576U / UNIV_PAGE_SIZE) + +/** File space extent size (one megabyte) in pages for MAX page size */ +#define FSP_EXTENT_SIZE_MAX (1048576 / UNIV_PAGE_SIZE_MAX) + +/** File space extent size (one megabyte) in pages for MIN page size */ +#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN) + +/** On a page of any file segment, data may be put starting from this +offset */ +#define FSEG_PAGE_DATA FIL_PAGE_DATA + +/** @name File segment header +The file segment header points to the inode describing the file segment. 
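+It is a 10-byte field (FSEG_HEADER_SIZE below): a 4-byte space id, a 4-byte
+page number and a 2-byte byte offset locate the inode within its tablespace.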
*/
+/* @{ */
+/** Data type for file segment header */
+typedef	byte	fseg_header_t;
+
+#define FSEG_HDR_SPACE		0	/*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO	4	/*!< page number of the inode */
+#define FSEG_HDR_OFFSET		8	/*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE	10	/*!< Length of the file segment
+					header, in bytes */
+/* @} */
+
+/** Flags for fsp_reserve_free_extents @{ */
+#define FSP_NORMAL	1000000
+#define	FSP_UNDO	2000000
+#define FSP_CLEANING	3000000
+/* @} */
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE		UNIV_PAGE_SIZE */
+/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET			0	/* !< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET		1	/* !< insert buffer bitmap */
+				/* The ibuf bitmap pages are the ones whose
+				page number is the number above plus a
+				multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO		2	/*!< in every tablespace */
+				/* The following pages exist
+				in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO		3	/*!< insert buffer
+						header page, in
+						tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO	4	/*!< insert buffer
+						B-tree root page in
+						tablespace 0 */
+				/* The ibuf tree root page number in
+				tablespace 0; its fseg inode is on the page
+				number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO		5	/*!< transaction
+						system header, in
+						tablespace 0 */
+#define	FSP_FIRST_RSEG_PAGE_NO		6	/*!< first rollback segment
+						page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO		7	/*!< data dictionary header
+						page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+#endif /* fsp0types_h */
diff --git a/storage/xtradb/include/fts0ast.h b/storage/xtradb/include/fts0ast.h
new file mode 100644
index 00000000000..50ee587e282
--- /dev/null
+++ b/storage/xtradb/include/fts0ast.h
@@ -0,0 +1,332 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0ast.h
+The FTS query parser (AST) abstract syntax tree routines
+
+Created 2007/03/16/03 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FST0AST_H
+#define INNOBASE_FST0AST_H
+
+#include "mem0mem.h"
+#include "ha_prototypes.h"
+
+/* The type of AST Node */
+enum fts_ast_type_t {
+	FTS_AST_OPER,			/*!< Operator */
+	FTS_AST_NUMB,			/*!< Number */
+	FTS_AST_TERM,			/*!< Term (or word) */
+	FTS_AST_TEXT,			/*!< Text string */
+	FTS_AST_LIST,			/*!< Expression list */
+	FTS_AST_SUBEXP_LIST		/*!< Sub-Expression list */
+};
+
+/* The FTS query operators that we support */
+enum fts_ast_oper_t {
+	FTS_NONE,			/*!< No operator */
+
+	FTS_IGNORE,			/*!< Ignore rows that contain
+					this word */
+
+	FTS_EXIST,			/*!< Include rows that contain
+					this word */
+
+	FTS_NEGATE,			/*!< Include rows that contain
+					this word but rank them
+					lower */
+
+	FTS_INCR_RATING,		/*!< Increase the rank for this
+					word */
+
+	FTS_DECR_RATING,		/*!< Decrease the rank for this
+					word */
+
+	FTS_DISTANCE,			/*!< Proximity distance */
+	FTS_IGNORE_SKIP,		/*!< Transient node operator
+					signifies that this is a
+					FTS_IGNORE node, and ignored in
+					the first pass of
+					fts_ast_visit() */
+	FTS_EXIST_SKIP			/*!< Transient node operator
+					signifies that this is a
+					FTS_EXIST node, and ignored in
+					the first pass of
+					fts_ast_visit() */
+};
+
+/* Data types used by the FTS parser */
+struct fts_lexer_t;
+struct fts_ast_node_t;
+struct fts_ast_state_t;
+struct fts_ast_string_t;
+
+typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*);
+
+/********************************************************************
+Parse the string using the lexer setup within state.*/
+int
+fts_parse(
+/*======*/
+						/* out: 0 on OK, 1 on error */
+	fts_ast_state_t* state);		/*!< in: ast state instance.*/
+
+/********************************************************************
+Create an AST operator node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_oper_t	oper);			/*!< in: ast operator */
+/********************************************************************
+Create an AST term node, makes a copy of ptr */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*			arg,		/*!< in: ast state */
+	const fts_ast_string_t*	ptr);		/*!< in: term string */
+/********************************************************************
+Create an AST text node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+	void*			arg,		/*!< in: ast state */
+	const fts_ast_string_t*	ptr);		/*!< in: text string */
+/********************************************************************
+Create an AST expr list node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_node_t*	expr);			/*!< in: ast expr */
+/********************************************************************
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it.
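+As a usage illustration, a term of a boolean query such as '+apple' would be
+assembled roughly like this during parsing (a hedged sketch; error handling
+and the setup of the state and word arguments are omitted):
+
+	fts_ast_node_t*	oper = fts_ast_create_node_oper(state, FTS_EXIST);
+	fts_ast_node_t*	term = fts_ast_create_node_term(state, word);
+	fts_ast_node_t*	list = fts_ast_create_node_list(state, oper);
+
+	fts_ast_add_node(list, term);
+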
*/
+extern
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+						/* out: new node */
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr);			/*!< in: ast expr instance */
+/********************************************************************
+Set the wildcard attribute of a term.*/
+extern
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+	fts_ast_node_t*	node);			/*!< in: term to change */
+/********************************************************************
+Set the proximity attribute of a text node. */
+
+void
+fts_ast_term_set_distance(
+/*======================*/
+	fts_ast_node_t*	node,			/*!< in/out: text node */
+	ulint		distance);		/*!< in: the text proximity
+						distance */
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+	fts_ast_node_t*	node);			/*!< in: node to free */
+/********************************************************************
+Add a sub-expression to an AST */
+extern
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+	fts_ast_node_t*	list,			/*!< in: list node instance */
+	fts_ast_node_t*	node);			/*!< in: (sub) expr to add */
+/********************************************************************
+Print the AST node recursively.*/
+extern
+void
+fts_ast_node_print(
+/*===============*/
+	fts_ast_node_t*	node);			/*!< in: ast node to print */
+/********************************************************************
+For tracking node allocations, in case there is an error during parsing.*/
+extern
+void
+fts_ast_state_add_node(
+/*===================*/
+	fts_ast_state_t*state,			/*!< in: ast state instance */
+	fts_ast_node_t*	node);			/*!< in: node to add to state */
+/********************************************************************
+Free node and expr allocations.*/
+extern
+void
+fts_ast_state_free(
+/*===============*/
+	fts_ast_state_t*state);			/*!< in: state instance
+						to free */
+/******************************************************************//**
+Traverse the AST - in-order traversal.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit(
+/*==========*/
+	fts_ast_oper_t		oper,		/*!< in: FTS operator */
+	fts_ast_node_t*		node,		/*!< in: instance to traverse*/
+	fts_ast_callback	visitor,	/*!< in: callback */
+	void*			arg,		/*!< in: callback arg */
+	bool*			has_ignore)	/*!< out: whether we encountered
+						and ignored an operator during
+						processing; currently we only
+						ignore the FTS_IGNORE
+						operator */
+	__attribute__((nonnull, warn_unused_result));
+/*****************************************************************//**
+Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of the parent expression list.
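+A visitor passed to fts_ast_visit() above must match the fts_ast_callback
+typedef; a minimal sketch (the function name and body are hypothetical):
+
+	static dberr_t
+	my_visitor(fts_ast_oper_t oper, fts_ast_node_t* node, void* arg)
+	{
+		if (node->type == FTS_AST_TERM) {
+			fts_ast_string_print(node->term.ptr);
+		}
+
+		return(DB_SUCCESS);
+	}
+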
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit_sub_exp(
+/*==================*/
+	fts_ast_node_t*	node,		/*!< in: instance to traverse*/
+	fts_ast_callback visitor,	/*!< in: callback */
+	void*		arg)		/*!< in: callback arg */
+	__attribute__((nonnull, warn_unused_result));
+/********************************************************************
+Create a lex instance.*/
+UNIV_INTERN
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,	/*!< in: query type */
+	const byte*	query,		/*!< in: query string */
+	ulint		query_len)	/*!< in: query string len */
+	__attribute__((nonnull, malloc, warn_unused_result));
+/********************************************************************
+Free an fts_lexer_t instance.*/
+UNIV_INTERN
+void
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)	/*!< in: lexer instance to
+					free */
+	__attribute__((nonnull));
+
+/**
+Create an ast string object, with NUL-terminator, so the string
+has one more byte than len
+@param[in] str		pointer to string
+@param[in] len		length of the string
+@return ast string with NUL-terminator */
+UNIV_INTERN
+fts_ast_string_t*
+fts_ast_string_create(
+	const byte*	str,
+	ulint		len);
+
+/**
+Free an ast string instance
+@param[in,out] ast_str	string to free */
+UNIV_INTERN
+void
+fts_ast_string_free(
+	fts_ast_string_t*	ast_str);
+
+/**
+Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul
+@param[in] ast_str	string to translate
+@param[in] base		the base
+@return translated number */
+UNIV_INTERN
+ulint
+fts_ast_string_to_ul(
+	const fts_ast_string_t*	ast_str,
+	int			base);
+
+/**
+Print the ast string
+@param[in] ast_str	string to print */
+UNIV_INTERN
+void
+fts_ast_string_print(
+	const fts_ast_string_t*	ast_str);
+
+/* String of length len.
+We always store the string of length len with a terminating '\0',
+regardless of whether there is any 0x00 in the string itself */
+struct fts_ast_string_t {
+	/*!< Pointer to string. */
+	byte*		str;
+
+	/*!< Length of the string.
*/ + ulint len; +}; + +/* Query term type */ +struct fts_ast_term_t { + fts_ast_string_t* ptr; /*!< Pointer to term string.*/ + ibool wildcard; /*!< TRUE if wild card set.*/ +}; + +/* Query text type */ +struct fts_ast_text_t { + fts_ast_string_t* ptr; /*!< Pointer to text string.*/ + ulint distance; /*!< > 0 if proximity distance + set */ +}; + +/* The list of nodes in an expr list */ +struct fts_ast_list_t { + fts_ast_node_t* head; /*!< Children list head */ + fts_ast_node_t* tail; /*!< Children list tail */ +}; + +/* FTS AST node to store the term, text, operator and sub-expressions.*/ +struct fts_ast_node_t { + fts_ast_type_t type; /*!< The type of node */ + fts_ast_text_t text; /*!< Text node */ + fts_ast_term_t term; /*!< Term node */ + fts_ast_oper_t oper; /*!< Operator value */ + fts_ast_list_t list; /*!< Expression list */ + fts_ast_node_t* next; /*!< Link for expr list */ + fts_ast_node_t* next_alloc; /*!< For tracking allocations */ + bool visited; /*!< whether this node is + already processed */ +}; + +/* To track state during parsing */ +struct fts_ast_state_t { + mem_heap_t* heap; /*!< Heap to use for alloc */ + fts_ast_node_t* root; /*!< If all goes OK, then this + will point to the root.*/ + + fts_ast_list_t list; /*!< List of nodes allocated */ + + fts_lexer_t* lexer; /*!< Lexer callback + arg */ + CHARSET_INFO* charset; /*!< charset used for + tokenization */ +}; + +#endif /* INNOBASE_FSTS0AST_H */ diff --git a/storage/xtradb/include/fts0blex.h b/storage/xtradb/include/fts0blex.h new file mode 100644 index 00000000000..d0e4cae0678 --- /dev/null +++ b/storage/xtradb/include/fts0blex.h @@ -0,0 +1,349 @@ +#ifndef fts0bHEADER_H +#define fts0bHEADER_H 1 +#define fts0bIN_HEADER 1 + +#line 6 "../include/fts0blex.h" + +#line 8 "../include/fts0blex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. 
*/ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. 
+ */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void fts0brestart (FILE *input_file ,yyscan_t yyscanner ); +void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void fts0bpop_buffer_state (yyscan_t yyscanner ); + +YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); + +void *fts0balloc (yy_size_t ,yyscan_t yyscanner ); +void *fts0brealloc (void *,yy_size_t ,yyscan_t yyscanner ); +void fts0bfree (void * ,yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0bwrap(n) 1 +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int fts0blex_init (yyscan_t* scanner); + +int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int fts0blex_destroy (yyscan_t yyscanner ); + +int fts0bget_debug (yyscan_t yyscanner ); + +void fts0bset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner ); + +void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *fts0bget_in (yyscan_t yyscanner ); + +void fts0bset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *fts0bget_out (yyscan_t yyscanner ); + +void fts0bset_out (FILE * out_str ,yyscan_t yyscanner ); + +int fts0bget_leng (yyscan_t yyscanner ); + +char *fts0bget_text (yyscan_t yyscanner ); + +int fts0bget_lineno (yyscan_t yyscanner ); + +void fts0bset_lineno (int line_number ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int fts0bwrap (yyscan_t yyscanner ); +#else +extern int fts0bwrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. 
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0blex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0blex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 73 "fts0blex.l"
+
+
+#line 348 "../include/fts0blex.h"
+#undef fts0bIN_HEADER
+#endif /* fts0bHEADER_H */
diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h
new file mode 100644
index 00000000000..a2996ecacc8
--- /dev/null
+++ b/storage/xtradb/include/fts0fts.h
@@ -0,0 +1,1039 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0fts.h
+Full text search header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef fts0fts_h
+#define fts0fts_h
+
+#include "univ.i"
+
+#include "data0type.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "ut0rbt.h"
+#include "ut0wqueue.h"
+#include "que0types.h"
+#include "ft_global.h"
+
+/** "NULL" value of a document id. */
+#define FTS_NULL_DOC_ID 0
+
+/** FTS hidden column that is used to map to and from the row */
+#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID"
+
+/** The name of the index created by FTS */
+#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX"
+
+#define FTS_DOC_ID_INDEX_NAME_LEN 16
+
+/** Doc ID is an 8 byte value */
+#define FTS_DOC_ID_LEN 8
+
+/** The number of fields to sort when we build an FT index with
+FIC. Three fields are sorted: (word, doc_id, position) */
+#define FTS_NUM_FIELDS_SORT 3
+
+/** Maximum number of rows in a table; for tables smaller than this, we
+optimize by using a 4 byte Doc ID in the FIC merge sort to reduce the
+sort size */
+#define MAX_DOC_ID_OPT_VAL 1073741824
+
+/** Document id type. */
+typedef ib_uint64_t doc_id_t;
+
+/** doc_id_t printf format */
+#define FTS_DOC_ID_FORMAT IB_ID_FMT
+
+/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */
+#define fts_write_doc_id(d, s) mach_write_to_8(d, s)
+
+/** Read a document id to internal format.
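+
+A round-trip sketch of the conversion macros (fts_write_doc_id above and
+fts_read_doc_id just below); buf and id are hypothetical names:
+
+	byte		buf[FTS_DOC_ID_LEN];
+	doc_id_t	id = 42;
+
+	fts_write_doc_id(buf, id);
+	ut_ad(fts_read_doc_id(buf) == id);
+
+Because the stored image is big endian, stored doc ids also compare
+correctly with plain memcmp().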
 */
+#define fts_read_doc_id(s) mach_read_from_8(s)
+
+/** Bind the doc id to a variable */
+#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v)
+
+/** Defines for FTS query modes; they have the same values as
+those defined in the MySQL file ft_global.h */
+#define FTS_NL		0
+#define FTS_BOOL	1
+#define FTS_SORTED	2
+#define FTS_EXPAND	4
+#define FTS_PROXIMITY	8
+#define FTS_PHRASE	16
+#define FTS_OPT_RANKING	32
+
+#define FTS_INDEX_TABLE_IND_NAME	"FTS_INDEX_TABLE_IND"
+
+/** Threshold where our optimize thread automatically kicks in */
+#define FTS_OPTIMIZE_THRESHOLD		10000000
+
+#define FTS_DOC_ID_MAX_STEP		10000
+/** Variable specifying the FTS parallel sort degree */
+extern ulong	fts_sort_pll_degree;
+
+/** Variable specifying the number of words to optimize for each optimize
+table call */
+extern ulong	fts_num_word_optimize;
+
+/** Variable specifying whether we do additional FTS diagnostic printout
+in the log */
+extern char	fts_enable_diag_print;
+
+/** FTS rank type, which will be between 0 .. 1 inclusive */
+typedef float 	fts_rank_t;
+
+/** Type of a row during a transaction. FTS_NOTHING means the row can be
+forgotten from the FTS system's POV, FTS_INVALID is an internal value used
+to mark invalid states.
+
+NOTE: Do not change the order or value of these, fts_trx_row_get_new_state
+depends on them being exactly as they are. */
+enum fts_row_state {
+	FTS_INSERT = 0,
+	FTS_MODIFY,
+	FTS_DELETE,
+	FTS_NOTHING,
+	FTS_INVALID
+};
+
+/** The FTS table types. */
+enum fts_table_type_t {
+	FTS_INDEX_TABLE,	/*!< FTS auxiliary table that is
+				specific to a particular FTS index
+				on a table */
+
+	FTS_COMMON_TABLE	/*!< FTS auxiliary table that is common
+				for all FTS indexes on a table */
+};
+
+struct fts_doc_t;
+struct fts_cache_t;
+struct fts_token_t;
+struct fts_doc_ids_t;
+struct fts_index_cache_t;
+
+
+/** Initialize the "fts_table" for internal query into FTS auxiliary
+tables */
+#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\
+do {								\
+	(fts_table)->suffix = m_suffix;				\
+	(fts_table)->type = m_type;				\
+	(fts_table)->table_id = m_table->id;			\
+	(fts_table)->parent = m_table->name;			\
+	(fts_table)->table = m_table;				\
+} while (0);
+
+#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\
+do {								\
+	(fts_table)->suffix = m_suffix;				\
+	(fts_table)->type = m_type;				\
+	(fts_table)->table_id = m_index->table->id;		\
+	(fts_table)->parent = m_index->table->name;		\
+	(fts_table)->table = m_index->table;			\
+	(fts_table)->index_id = m_index->id;			\
+} while (0);
+
+/** Information about changes in a single transaction affecting
+the FTS system. */
+struct fts_trx_t {
+	trx_t*		trx;		/*!< InnoDB transaction */
+
+	ib_vector_t*	savepoints;	/*!< Active savepoints, must have at
+					least one element, the implied
+					savepoint */
+	ib_vector_t*	last_stmt;	/*!< last_stmt */
+
+	mem_heap_t*	heap;		/*!< heap */
+};
+
+/** Information required for transaction savepoint handling. */
+struct fts_savepoint_t {
+	char*		name;		/*!< First entry is always NULL, the
+					default instance. Otherwise the name
+					of the savepoint */
+
+	ib_rbt_t*	tables;		/*!< Modified FTS tables */
+};
+
+/** Information about changed rows in a transaction for a single table.
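+
+As an orientation sketch for the structure that follows: its rows tree
+can be walked with the ut0rbt helpers, e.g. (ftt is a hypothetical
+fts_trx_table_t*; see fts0fts.cc for the exact cell layout):
+
+	const ib_rbt_node_t*	node;
+
+	for (node = rbt_first(ftt->rows);
+	     node != NULL;
+	     node = rbt_next(ftt->rows, node)) {
+
+		fts_trx_row_t*	row = rbt_value(fts_trx_row_t, node);
+	}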
 */
+struct fts_trx_table_t {
+	dict_table_t*	table;		/*!< table */
+
+	fts_trx_t*	fts_trx;	/*!< link to parent */
+
+	ib_rbt_t*	rows;		/*!< rows changed; indexed by doc-id,
+					cells are fts_trx_row_t* */
+
+	fts_doc_ids_t*	added_doc_ids;	/*!< list of added doc ids (NULL until
+					the first addition) */
+
+					/*!< for adding doc ids */
+	que_t*		docs_added_graph;
+};
+
+/** Information about one changed row in a transaction. */
+struct fts_trx_row_t {
+	doc_id_t	doc_id;		/*!< Id of the ins/upd/del document */
+
+	fts_row_state	state;		/*!< state of the row */
+
+	ib_vector_t*	fts_indexes;	/*!< The indexes that are affected */
+};
+
+/** List of document ids that were added during a transaction. This
+list is passed on to a background 'Add' thread and OPTIMIZE, so it
+needs its own memory heap. */
+struct fts_doc_ids_t {
+	ib_vector_t*	doc_ids;	/*!< document ids (each element is
+					of type doc_id_t). */
+
+	ib_alloc_t*	self_heap;	/*!< Allocator used to create an
+					instance of this type and the
+					doc_ids vector */
+};
+
+// FIXME: Get rid of this if possible.
+/** Since MySQL's character set support for Unicode is woefully inadequate
+(it supports basic operations like isalpha etc. only for 8-bit characters),
+we have to implement our own. We use UTF-16 without surrogate processing
+as our in-memory format. This typedef is a single such character. */
+typedef unsigned short ib_uc_t;
+
+/** A UTF-16 or UTF-8 string. */
+struct fts_string_t {
+	byte*		f_str;		/*!< string, not necessarily
+					terminated in any way */
+	ulint		f_len;		/*!< Length of the string in bytes */
+	ulint		f_n_char;	/*!< Number of characters */
+};
+
+/** Query ranked doc ids. */
+struct fts_ranking_t {
+	doc_id_t	doc_id;		/*!< Document id */
+
+	fts_rank_t	rank;		/*!< Rank is between 0 .. 1 */
+
+	byte*		words;		/*!< this contains the words
+					that were queried
+					and found in this document */
+	ulint		words_len;	/*!< words len */
+};
+
+/** Query result. */
+struct fts_result_t {
+	ib_rbt_node_t*	current;	/*!< Current element */
+
+	ib_rbt_t*	rankings_by_id;	/*!< RB tree of type fts_ranking_t
+					indexed by doc id */
+	ib_rbt_t*	rankings_by_rank;/*!< RB tree of type fts_ranking_t
+					indexed by rank */
+};
+
+/** This is used to generate the FTS auxiliary table name; we need the
+table id and the index id to generate the column specific FTS auxiliary
+table name. */
+struct fts_table_t {
+	const char*	parent;		/*!< Parent table name, this is
+					required only for the database
+					name */
+
+	fts_table_type_t
+			type;		/*!< The auxiliary table type */
+
+	table_id_t	table_id;	/*!< The table id */
+
+	index_id_t	index_id;	/*!< The index id */
+
+	const char*	suffix;		/*!< The suffix of the fts auxiliary
+					table name, can be NULL, not used
+					everywhere (yet) */
+	const dict_table_t*
+			table;		/*!< Parent table */
+	CHARSET_INFO*	charset;	/*!< charset info if it is for FTS
+					index auxiliary table */
+};
+
+enum fts_status {
+	BG_THREAD_STOP = 1,		/*!< TRUE if the FTS background thread
+					has finished reading the ADDED table,
+					meaning more items can be added to
+					the table. */
+
+	BG_THREAD_READY = 2,		/*!< TRUE if the FTS background thread
+					is ready */
+
+	ADD_THREAD_STARTED = 4,		/*!< TRUE if the FTS add thread
+					has started */
+
+	ADDED_TABLE_SYNCED = 8,		/*!< TRUE if the ADDED table record is
+					sync-ed after crash recovery */
+
+	TABLE_DICT_LOCKED = 16		/*!< Set if the table has
+					dict_sys->mutex */
+};
+
+typedef enum fts_status fts_status_t;
+
+/** The state of the FTS sub system. */
+struct fts_t {
+					/*!< mutex protecting bg_threads* and
+					fts_add_wq.
 */
+	ib_mutex_t	bg_threads_mutex;
+
+	ulint		bg_threads;	/*!< number of background threads
+					accessing this table */
+
+	ulint		fts_status;	/*!< Status bits regarding the fts
+					running state, including whether
+					running background threads should
+					stop themselves */
+
+	ib_wqueue_t*	add_wq;		/*!< Work queue for scheduling jobs
+					for the FTS 'Add' thread, or NULL
+					if the thread has not yet been
+					created. Each work item is a
+					fts_trx_doc_ids_t*. */
+
+	fts_cache_t*	cache;		/*!< FTS memory buffer for this table,
+					or NULL if the table has no FTS
+					index. */
+
+	ulint		doc_col;	/*!< FTS doc id hidden column number
+					in the CLUSTERED index. */
+
+	ib_vector_t*	indexes;	/*!< Vector of FTS indexes, this is
+					mainly for caching purposes. */
+	mem_heap_t*	fts_heap;	/*!< heap for fts_t allocation */
+};
+
+struct fts_stopword_t;
+
+/** status bits for fts_stopword_t status field. */
+#define STOPWORD_NOT_INIT	0x1
+#define STOPWORD_OFF		0x2
+#define STOPWORD_FROM_DEFAULT	0x4
+#define STOPWORD_USER_TABLE	0x8
+
+extern const char* fts_default_stopword[];
+
+/** Variable specifying the maximum FTS cache size for each table */
+extern ulong	fts_max_cache_size;
+
+/** Variable specifying the total memory allocated for FTS cache */
+extern ulong	fts_max_total_cache_size;
+
+/** Variable specifying the FTS result cache limit for each query */
+extern ulong	fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS token size */
+extern ulong	fts_max_token_size;
+
+/** Variable specifying the minimum FTS token size */
+extern ulong	fts_min_token_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+extern bool	fts_need_sync;
+
+/** Maximum possible Fulltext word length */
+#define FTS_MAX_WORD_LEN		HA_FT_MAXBYTELEN
+
+/** Maximum possible Fulltext word length (in characters) */
+#define FTS_MAX_WORD_LEN_IN_CHAR	HA_FT_MAXCHARLEN
+
+/** Variable specifying the table that has Fulltext index to display its
+content through information schema table */
+extern char*	fts_internal_tbl_name;
+
+#define	fts_que_graph_free(graph)	\
+do {					\
+	mutex_enter(&dict_sys->mutex);	\
+	que_graph_free(graph);		\
+	mutex_exit(&dict_sys->mutex);	\
+} while (0)
+
+/******************************************************************//**
+Create an FTS cache. */
+UNIV_INTERN
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+	dict_table_t*	table);	/*!< table owns the FTS cache */
+
+/******************************************************************//**
+Create an FTS index cache.
+@return Index Cache */
+UNIV_INTERN
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*	table,	/*!< in: table with FTS index */
+	dict_index_t*	index);	/*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+	const dict_table_t*	table,	/*!< in: table */
+	doc_id_t*		doc_id)	/*!< out: new document id */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1).
We would do so after each FTS index build or +table truncate */ +UNIV_INTERN +void +fts_update_next_doc_id( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + const dict_table_t* table, /*!< in: table */ + const char* table_name, /*!< in: table name, or NULL */ + doc_id_t doc_id) /*!< in: DOC ID to set */ + __attribute__((nonnull(2))); + +/******************************************************************//** +Create a new document id . +@return DB_SUCCESS if all went well else error */ +UNIV_INTERN +dberr_t +fts_create_doc_id( +/*==============*/ + dict_table_t* table, /*!< in: row is of this + table. */ + dtuple_t* row, /*!< in/out: add doc id + value to this row. This is the + current row that is being + inserted. */ + mem_heap_t* heap) /*!< in: heap */ + __attribute__((nonnull)); +/******************************************************************//** +Create a new fts_doc_ids_t. +@return new fts_doc_ids_t. */ +UNIV_INTERN +fts_doc_ids_t* +fts_doc_ids_create(void); +/*=====================*/ + +/******************************************************************//** +Free a fts_doc_ids_t. */ +UNIV_INTERN +void +fts_doc_ids_free( +/*=============*/ + fts_doc_ids_t* doc_ids); /*!< in: doc_ids to free */ + +/******************************************************************//** +Notify the FTS system about an operation on an FTS-indexed table. */ +UNIV_INTERN +void +fts_trx_add_op( +/*===========*/ + trx_t* trx, /*!< in: InnoDB transaction */ + dict_table_t* table, /*!< in: table */ + doc_id_t doc_id, /*!< in: doc id */ + fts_row_state state, /*!< in: state of the row */ + ib_vector_t* fts_indexes) /*!< in: FTS indexes affected + (NULL=all) */ + __attribute__((nonnull(1,2))); + +/******************************************************************//** +Free an FTS trx. */ +UNIV_INTERN +void +fts_trx_free( +/*=========*/ + fts_trx_t* fts_trx); /*!< in, own: FTS trx */ + +/******************************************************************//** +Creates the common ancillary tables needed for supporting an FTS index +on the given table. row_mysql_lock_data_dictionary must have been +called before this. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_create_common_tables( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + const dict_table_t* + table, /*!< in: table with one FTS + index */ + const char* name, /*!< in: table name */ + bool skip_doc_id_index) /*!< in: Skip index on doc id */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Wrapper function of fts_create_index_tables_low(), create auxiliary +tables for an FTS index +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_create_index_tables( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const dict_index_t* index) /*!< in: the FTS index + instance */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Creates the column specific ancillary tables needed for supporting an +FTS index on the given table. row_mysql_lock_data_dictionary must have +been called before this. 
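+
+A sketch of the overall creation order a caller would follow, assuming
+trx, table and index handles are already set up (hypothetical names;
+error handling elided):
+
+	dberr_t	err = fts_create_common_tables(trx, table, table->name, false);
+
+	if (err == DB_SUCCESS) {
+		err = fts_create_index_tables(trx, index);
+	}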
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_create_index_tables_low( +/*========================*/ + trx_t* trx, /*!< in: transaction handle */ + const dict_index_t* + index, /*!< in: the FTS index + instance */ + const char* table_name, /*!< in: the table name */ + table_id_t table_id) /*!< in: the table id */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Add the FTS document id hidden column. */ +UNIV_INTERN +void +fts_add_doc_id_column( +/*==================*/ + dict_table_t* table, /*!< in/out: Table with FTS index */ + mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */ + __attribute__((nonnull(1))); + +/*********************************************************************//** +Drops the ancillary tables needed for supporting an FTS index on the +given table. row_mysql_lock_data_dictionary must have been called before +this. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_drop_tables( +/*============*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table has the FTS + index */ + __attribute__((nonnull)); +/******************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_commit( +/*=======*/ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); + +/*******************************************************************//** +FTS Query entry point. +@return DB_SUCCESS if successful otherwise error code */ +UNIV_INTERN +dberr_t +fts_query( +/*======*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index to search */ + uint flags, /*!< in: FTS search mode */ + const byte* query, /*!< in: FTS query */ + ulint query_len, /*!< in: FTS query string len + in bytes */ + fts_result_t** result) /*!< out: query result, to be + freed by the caller.*/ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +@return the relevance ranking value. */ +UNIV_INTERN +float +fts_retrieve_ranking( +/*=================*/ + fts_result_t* result, /*!< in: FTS result structure */ + doc_id_t doc_id); /*!< in: the interested document + doc_id */ + +/******************************************************************//** +FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */ +UNIV_INTERN +void +fts_query_sort_result_on_rank( +/*==========================*/ + fts_result_t* result); /*!< out: result instance + to sort.*/ + +/******************************************************************//** +FTS Query free result, returned by fts_query(). */ +UNIV_INTERN +void +fts_query_free_result( +/*==================*/ + fts_result_t* result); /*!< in: result instance + to free.*/ + +/******************************************************************//** +Extract the doc id from the FTS hidden column. */ +UNIV_INTERN +doc_id_t +fts_get_doc_id_from_row( +/*====================*/ + dict_table_t* table, /*!< in: table */ + dtuple_t* row); /*!< in: row whose FTS doc id we + want to extract.*/ + +/******************************************************************//** +Extract the doc id from the FTS hidden column. 
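+
+A usage sketch for the row variant declared above (table and row are
+hypothetical; the row is assumed to already carry the hidden column):
+
+	doc_id_t	doc_id = fts_get_doc_id_from_row(table, row);
+
+	ut_a(doc_id != FTS_NULL_DOC_ID);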
 */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_rec(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	const rec_t*	rec,	/*!< in: rec */
+	mem_heap_t*	heap);	/*!< in: heap */
+
+/******************************************************************//**
+Update the query graph with a new document id.
+@return Doc ID used */
+UNIV_INTERN
+doc_id_t
+fts_update_doc_id(
+/*==============*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	ufield,		/*!< out: update node */
+	doc_id_t*	next_doc_id);	/*!< out: buffer for writing */
+
+/******************************************************************//**
+FTS initialize. */
+UNIV_INTERN
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Signal FTS threads to initiate shutdown. */
+UNIV_INTERN
+void
+fts_start_shutdown(
+/*===============*/
+	dict_table_t*	table,	/*!< in: table with FTS
+				indexes */
+	fts_t*		fts);	/*!< in: fts instance to
+				shutdown */
+
+/******************************************************************//**
+Wait for FTS threads to shutdown. */
+UNIV_INTERN
+void
+fts_shutdown(
+/*=========*/
+	dict_table_t*	table,	/*!< in: table with FTS
+				indexes */
+	fts_t*		fts);	/*!< in: fts instance to
+				shutdown */
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+UNIV_INTERN
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table);	/*!< out: table with FTS
+				indexes */
+
+/**********************************************************************//**
+Free the FTS resources. */
+UNIV_INTERN
+void
+fts_free(
+/*=====*/
+	dict_table_t*	table);	/*!< in/out: table with
+				FTS indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table)	/*!< in: table to optimize */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+UNIV_INTERN
+void
+fts_optimize_init(void);
+/*====================*/
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if the optimize queue is initialized. */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void);
+/*======================*/
+
+/****************************************************************//**
+Drops index ancillary tables for an FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_tables(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	dict_index_t*	index)	/*!< in: Index to drop */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+UNIV_INTERN
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table);	/*!< in: table to remove */
+
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. */
+UNIV_INTERN
+void
+fts_optimize_start_shutdown(void);
+/*==============================*/
+
+/**********************************************************************//**
+Inform optimize to clean up.
 */
+UNIV_INTERN
+void
+fts_optimize_end(void);
+/*===================*/
+
+/**********************************************************************//**
+Take an FTS savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_take(
+/*===============*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_trx_t*	fts_trx,	/*!< in: fts transaction */
+	const char*	name)		/*!< in: savepoint name */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Refresh last statement savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx)		/*!< in: transaction */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+UNIV_INTERN
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name);		/*!< in: savepoint name */
+
+/**********************************************************************//**
+Free the FTS cache. */
+UNIV_INTERN
+void
+fts_cache_destroy(
+/*==============*/
+	fts_cache_t*	cache);		/*!< in: cache*/
+
+/*********************************************************************//**
+Clear cache. */
+UNIV_INTERN
+void
+fts_cache_clear(
+/*============*/
+	fts_cache_t*	cache);		/*!< in: cache */
+
+/*********************************************************************//**
+Initialize things in cache. */
+UNIV_INTERN
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache);		/*!< in: cache */
+
+/*********************************************************************//**
+Rollback to and including savepoint identified by name. */
+UNIV_INTERN
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name);		/*!< in: savepoint name */
+
+/*********************************************************************//**
+Rollback the FTS changes of the last statement. */
+UNIV_INTERN
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx);		/*!< in: transaction */
+
+/***********************************************************************//**
+Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. */
+UNIV_INTERN
+void
+fts_drop_orphaned_tables(void);
+/*==========================*/
+
+/******************************************************************//**
+Since we do a horizontal split on the index table, we need to drop
+all the split tables.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_split_tables(
+/*========================*/
+	trx_t*		trx,	/*!< in: transaction */
+	dict_index_t*	index)	/*!< in: fts instance */
+	__attribute__((nonnull, warn_unused_result));
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
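+
+The caller side of a SYNC is a single call; a sketch, assuming a
+hypothetical dict_table_t* named table with at least one FTS index:
+
+	dberr_t	err = fts_sync_table(table);
+
+	ut_a(err == DB_SUCCESS);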
*/ +UNIV_INTERN +dberr_t +fts_sync_table( +/*===========*/ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); + +/****************************************************************//** +Free the query graph but check whether dict_sys->mutex is already +held */ +UNIV_INTERN +void +fts_que_graph_free_check_lock( +/*==========================*/ + fts_table_t* fts_table, /*!< in: FTS table */ + const fts_index_cache_t*index_cache, /*!< in: FTS index cache */ + que_t* graph); /*!< in: query graph */ + +/****************************************************************//** +Create an FTS index cache. */ +UNIV_INTERN +CHARSET_INFO* +fts_index_get_charset( +/*==================*/ + dict_index_t* index); /*!< in: FTS index */ + +/*********************************************************************//** +Get the initial Doc ID by consulting the CONFIG table +@return initial Doc ID */ +UNIV_INTERN +doc_id_t +fts_init_doc_id( +/*============*/ + const dict_table_t* table); /*!< in: table */ + +/******************************************************************//** +compare two character string according to their charset. */ +extern +int +innobase_fts_text_cmp( +/*==================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Makes all characters in a string lower case. */ +extern +size_t +innobase_fts_casedn_str( +/*====================*/ + CHARSET_INFO* cs, /*!< in: Character set */ + char* src, /*!< in: string to put in + lower case */ + size_t src_len, /*!< in: input string length */ + char* dst, /*!< in: buffer for result + string */ + size_t dst_len); /*!< in: buffer size */ + + +/******************************************************************//** +compare two character string according to their charset. */ +extern +int +innobase_fts_text_cmp_prefix( +/*=========================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/*************************************************************//** +Get the next token from the given string and store it in *token. */ +extern +ulint +innobase_mysql_fts_get_token( +/*=========================*/ + CHARSET_INFO* charset, /*!< in: Character set */ + const byte* start, /*!< in: start of text */ + const byte* end, /*!< in: one character past + end of text */ + fts_string_t* token, /*!< out: token's text */ + ulint* offset); /*!< out: offset to token, + measured as characters from + 'start' */ + +/*********************************************************************//** +Fetch COUNT(*) from specified table. +@return the number of rows in the table */ +UNIV_INTERN +ulint +fts_get_rows_count( +/*===============*/ + fts_table_t* fts_table); /*!< in: fts table to read */ + +/*************************************************************//** +Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists +@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */ +UNIV_INTERN +doc_id_t +fts_get_max_doc_id( +/*===============*/ + dict_table_t* table); /*!< in: user table */ + +/******************************************************************//** +Check whether user supplied stopword table exists and is of +the right format. 
+@return the stopword column charset if qualifies */ +UNIV_INTERN +CHARSET_INFO* +fts_valid_stopword_table( +/*=====================*/ + const char* stopword_table_name); /*!< in: Stopword table + name */ +/****************************************************************//** +This function loads specified stopword into FTS cache +@return TRUE if success */ +UNIV_INTERN +ibool +fts_load_stopword( +/*==============*/ + const dict_table_t* + table, /*!< in: Table with FTS */ + trx_t* trx, /*!< in: Transaction */ + const char* global_stopword_table, /*!< in: Global stopword table + name */ + const char* session_stopword_table, /*!< in: Session stopword table + name */ + ibool stopword_is_on, /*!< in: Whether stopword + option is turned on/off */ + ibool reload); /*!< in: Whether it is during + reload of FTS table */ + +/****************************************************************//** +Create the vector of fts_get_doc_t instances. +@return vector of fts_get_doc_t instances */ +UNIV_INTERN +ib_vector_t* +fts_get_docs_create( +/*================*/ + fts_cache_t* cache); /*!< in: fts cache */ + +/****************************************************************//** +Read the rows from the FTS index +@return DB_SUCCESS if OK */ +UNIV_INTERN +dberr_t +fts_table_fetch_doc_ids( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: aux table */ + fts_doc_ids_t* doc_ids); /*!< in: For collecting + doc ids */ +/****************************************************************//** +This function brings FTS index in sync when FTS index is first +used. There are documents that have not yet sync-ed to auxiliary +tables from last server abnormally shutdown, we will need to bring +such document into FTS cache before any further operations +@return TRUE if all OK */ +UNIV_INTERN +ibool +fts_init_index( +/*===========*/ + dict_table_t* table, /*!< in: Table with FTS */ + ibool has_cache_lock); /*!< in: Whether we already + have cache lock */ +/*******************************************************************//** +Add a newly create index in FTS cache */ +UNIV_INTERN +void +fts_add_index( +/*==========*/ + dict_index_t* index, /*!< FTS index to be added */ + dict_table_t* table); /*!< table */ + +/*******************************************************************//** +Drop auxiliary tables related to an FTS index +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fts_drop_index( +/*===========*/ + dict_table_t* table, /*!< in: Table where indexes are dropped */ + dict_index_t* index, /*!< in: Index to be dropped */ + trx_t* trx) /*!< in: Transaction for the drop */ + __attribute__((nonnull)); + +/****************************************************************//** +Rename auxiliary tables for all fts index for a table +@return DB_SUCCESS or error code */ + +dberr_t +fts_rename_aux_tables( +/*==================*/ + dict_table_t* table, /*!< in: user Table */ + const char* new_name, /*!< in: new table name */ + trx_t* trx); /*!< in: transaction */ + +/*******************************************************************//** +Check indexes in the fts->indexes is also present in index cache and +table->indexes list +@return TRUE if all indexes match */ +UNIV_INTERN +ibool +fts_check_cached_index( +/*===================*/ + dict_table_t* table); /*!< in: Table where indexes are dropped */ +#endif /*!< fts0fts.h */ + diff --git a/storage/xtradb/include/fts0opt.h b/storage/xtradb/include/fts0opt.h new file mode 100644 index 00000000000..92eaf8270d2 --- /dev/null 
+++ b/storage/xtradb/include/fts0opt.h @@ -0,0 +1,37 @@ +/***************************************************************************** + +Copyright (c) 2001, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0opt.h +Full Text Search optimize thread + +Created 2011-02-15 Jimmy Yang +***********************************************************************/ +#ifndef INNODB_FTS0OPT_H +#define INNODB_FTS0OPT_H + +/******************************************************************** +Callback function to fetch the rows in an FTS INDEX record. */ +UNIV_INTERN +ibool +fts_optimize_index_fetch_node( +/*==========================*/ + /* out: always returns non-NULL */ + void* row, /* in: sel_node_t* */ + void* user_arg); /* in: pointer to ib_vector_t */ +#endif diff --git a/storage/xtradb/include/fts0pars.h b/storage/xtradb/include/fts0pars.h new file mode 100644 index 00000000000..8108e811599 --- /dev/null +++ b/storage/xtradb/include/fts0pars.h @@ -0,0 +1,72 @@ +/* A Bison parser, made by GNU Bison 2.5. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. 
 */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c */
+#line 61 "fts0pars.y"
+
+	int	oper;
+	fts_ast_string_t* token;
+	fts_ast_node_t*	node;
+
+
+
+/* Line 2068 of yacc.c */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/xtradb/include/fts0priv.h b/storage/xtradb/include/fts0priv.h
new file mode 100644
index 00000000000..b4d9e1d41ec
--- /dev/null
+++ b/storage/xtradb/include/fts0priv.h
@@ -0,0 +1,653 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+					/*!< This must be 0 since we insert
+					a hard coded '0' at create time
+					to the config table */
+
+	FTS_TABLE_STATE_RUNNING = 0,	/*!< Auxiliary tables created OK */
+
+	FTS_TABLE_STATE_OPTIMIZING,	/*!< This is a substate of RUNNING */
+
+	FTS_TABLE_STATE_DELETED		/*!< All aux tables to be dropped when
+					it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The default time to wait for the background thread (in microseconds). */
+#define FTS_MAX_BACKGROUND_THREAD_WAIT		10000
+
+/** Maximum number of iterations to wait before we complain */
+#define FTS_BACKGROUND_THREAD_WAIT_COUNT	1000
+
+/** The maximum length of the config table's name column in bytes */
+#define FTS_MAX_CONFIG_NAME_LEN			64
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_VALUE_LEN		1024
+
+/** Approx. upper limit of ilist length in bytes. */
+#define FTS_ILIST_MAX_SIZE			(64 * 1024)
+
+/** FTS config table name parameters */
+
+/** The number of seconds after which an OPTIMIZE run will stop */
+#define FTS_OPTIMIZE_LIMIT_IN_SECS	"optimize_checkpoint_limit"
+
+/** The next doc id */
+#define FTS_SYNCED_DOC_ID		"synced_doc_id"
+
+/** The last word that was OPTIMIZED */
+#define FTS_LAST_OPTIMIZED_WORD		"last_optimized_word"
+
+/** Total number of documents that have been deleted.
The next_doc_id
+minus this count gives us the total number of documents. */
+#define FTS_TOTAL_DELETED_COUNT		"deleted_doc_count"
+
+/** Total number of words parsed from all documents */
+#define FTS_TOTAL_WORD_COUNT		"total_word_count"
+
+/** Start of optimize of an FTS index */
+#define FTS_OPTIMIZE_START_TIME		"optimize_start_time"
+
+/** End of optimize for an FTS index */
+#define FTS_OPTIMIZE_END_TIME		"optimize_end_time"
+
+/** User specified stopword table name */
+#define FTS_STOPWORD_TABLE_NAME		"stopword_table_name"
+
+/** Whether to use (turn on/off) stopword */
+#define FTS_USE_STOPWORD		"use_stopword"
+
+/** State of the FTS system for this table. It can be one of
+ RUNNING, OPTIMIZING, DELETED. */
+#define FTS_TABLE_STATE			"table_state"
+
+/** The minimum length of an FTS auxiliary table name's id component
+e.g., For an auxiliary table name
+
+	FTS_<TABLE_ID>_SUFFIX
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH	48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN			32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+	__attribute__((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,	/*!< in: transaction */
+	que_t*		graph)	/*!< in: Parsed statement */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Construct the name of an ancillary FTS table for the given table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name(
+/*===============*/
+	const fts_table_t*
+			fts_table)	/*!< in: FTS aux table info */
+	__attribute__((nonnull, malloc, warn_unused_result));
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated SELECT string */
+UNIV_INTERN
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*	index,	/*!< in: FTS index */
+	pars_info_t*	info,	/*!< in/out: parser info */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+	__attribute__((nonnull, warn_unused_result));
+
+/** Values for the fts_doc_fetch_by_doc_id() "option" parameter: whether to
+fetch the Doc whose ID is equal to, greater than, or smaller than the
+supplied ID */
+#define	FTS_FETCH_DOC_BY_ID_EQUAL	1
+#define	FTS_FETCH_DOC_BY_ID_LARGE	2
+#define	FTS_FETCH_DOC_BY_ID_SMALL	3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
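+
+A call sketch, assuming a prepared fts_get_doc_t named get_doc and a
+callback my_fetch_cb with the fts_sql_callback signature (all
+hypothetical):
+
+	err = fts_doc_fetch_by_doc_id(
+		get_doc, doc_id, NULL,
+		FTS_FETCH_DOC_BY_ID_EQUAL, my_fetch_cb, arg);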
+@return: DB_SUCCESS if fetch is successful, else error */ +UNIV_INTERN +dberr_t +fts_doc_fetch_by_doc_id( +/*====================*/ + fts_get_doc_t* get_doc, /*!< in: state */ + doc_id_t doc_id, /*!< in: id of document to fetch */ + dict_index_t* index_to_use, /*!< in: caller supplied FTS index, + or NULL */ + ulint option, /*!< in: search option, if it is + greater than doc_id or equal */ + fts_sql_callback + callback, /*!< in: callback to read + records */ + void* arg) /*!< in: callback arg */ + __attribute__((nonnull(6))); + +/*******************************************************************//** +Callback function for fetch that stores the text of an FTS document, +converting each column to UTF-16. +@return always FALSE */ +UNIV_INTERN +ibool +fts_query_expansion_fetch_doc( +/*==========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ + __attribute__((nonnull)); +/******************************************************************** +Write out a single word's data as new entry/entries in the INDEX table. +@return DB_SUCCESS if all OK. */ +UNIV_INTERN +dberr_t +fts_write_node( +/*===========*/ + trx_t* trx, /*!< in: transaction */ + que_t** graph, /*!< in: query graph */ + fts_table_t* fts_table, /*!< in: the FTS aux index */ + fts_string_t* word, /*!< in: word in UTF-8 */ + fts_node_t* node) /*!< in: node columns */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Tokenize a document. */ +UNIV_INTERN +void +fts_tokenize_document( +/*==================*/ + fts_doc_t* doc, /*!< in/out: document to + tokenize */ + fts_doc_t* result) /*!< out: if provided, save + result tokens here */ + __attribute__((nonnull(1))); + +/*******************************************************************//** +Continue to tokenize a document. */ +UNIV_INTERN +void +fts_tokenize_document_next( +/*=======================*/ + fts_doc_t* doc, /*!< in/out: document to + tokenize */ + ulint add_pos, /*!< in: add this position to all + tokens from this tokenization */ + fts_doc_t* result) /*!< out: if provided, save + result tokens here */ + __attribute__((nonnull(1))); +/******************************************************************//** +Initialize a document. */ +UNIV_INTERN +void +fts_doc_init( +/*=========*/ + fts_doc_t* doc) /*!< in: doc to initialize */ + __attribute__((nonnull)); + +/******************************************************************//** +Do a binary search for a doc id in the array +@return +ve index if found -ve index where it should be + inserted if not found */ +UNIV_INTERN +int +fts_bsearch( +/*========*/ + fts_update_t* array, /*!< in: array to sort */ + int lower, /*!< in: lower bound of array*/ + int upper, /*!< in: upper bound of array*/ + doc_id_t doc_id) /*!< in: doc id to lookup */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Free document. 
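+
+A lookup sketch for the fts_bsearch() declared above (array and upper
+are hypothetical; see fts0fts.cc for the exact bound semantics):
+
+	int	pos = fts_bsearch(array, 0, upper, doc_id);
+
+	if (pos >= 0) {
+		doc_id_t	found = array[pos].doc_id;
+	}
+
+A negative return encodes the position where the doc id would have to
+be inserted.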
+/******************************************************************//**
+Free document. */
+UNIV_INTERN
+void
+fts_doc_free(
+/*=========*/
+ fts_doc_t* doc) /*!< in: document */
+ __attribute__((nonnull));
+/******************************************************************//**
+Free an fts_word_t instance. */
+UNIV_INTERN
+void
+fts_word_free(
+/*==========*/
+ fts_word_t* word) /*!< in: instance to free.*/
+ __attribute__((nonnull));
+/******************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: prepared statement */
+ fts_table_t* fts_table, /*!< in: FTS aux table */
+ const fts_string_t*
+ word, /*!< in: the word to fetch */
+ fts_fetch_t* fetch) /*!< in: fetch callback.*/
+ __attribute__((nonnull));
+/******************************************************************//**
+Create an fts_word_t instance.
+@return new instance */
+UNIV_INTERN
+fts_word_t*
+fts_word_init(
+/*==========*/
+ fts_word_t* word, /*!< in: word to initialize */
+ byte* utf8, /*!< in: UTF-8 string */
+ ulint len) /*!< in: length of string in bytes */
+ __attribute__((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances; we actually compare the
+table ids here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+ const void* v1, /*!< in: id1 */
+ const void* v2) /*!< in: id2 */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with an fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Commit a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sql_commit(
+/*===========*/
+ trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull));
+/******************************************************************//**
+Rollback a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sql_rollback(
+/*=============*/
+ trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull));
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id. Don't acquire
+the dict mutex.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+ fts_table_t* fts_table, /*!< in: table with FTS index */
+ pars_info_t* info, /*!< in: parser info */
+ const char* sql) /*!< in: SQL string to evaluate */
+ __attribute__((nonnull(3), malloc, warn_unused_result));
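
Comparators such as fts_trx_table_cmp() receive addresses of container elements, and the elements here are themselves pointers, so each void* argument is really a pointer to a pointer. A minimal sketch of that shape, assuming the container hands in element addresses (field names follow this header):

/* Sketch only: the double indirection such a comparator uses when
the container stores fts_trx_table_t* elements. Comparing ids
explicitly avoids the int overflow a plain subtraction could cause. */
static int
fts_sketch_trx_table_cmp(const void* v1, const void* v2)
{
	const fts_trx_table_t*	t1 = *(const fts_trx_table_t**) v1;
	const fts_trx_table_t*	t2 = *(const fts_trx_table_t**) v2;

	if (t1->table->id > t2->table->id) {
		return(1);
	} else if (t1->table->id < t2->table->id) {
		return(-1);
	}

	return(0);
}
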
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_value(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ __attribute__((nonnull));
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_value(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: set config value for
+ this parameter name */
+ const fts_string_t*
+ value) /*!< in: value to update */
+ __attribute__((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: set config value for
+ this parameter name */
+ fts_string_t* value) /*!< in: value to write to
+ the config table */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Increment the value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_value(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: increment config value
+ for this parameter name */
+ ulint delta) /*!< in: increment by this much */
+ __attribute__((nonnull, warn_unused_result));
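
Since the getters require the caller to supply the output buffer, a round-trip sketch helps make the contract concrete. The parameter name below is hypothetical; FTS_MAX_INT_LEN, defined earlier in this header, is assumed to bound the printed form of an integer value.

/* Sketch only: round-trip an integer through the CONFIG table.
The caller owns the fts_string_t buffer, per the comments above. */
static dberr_t
fts_sketch_config_roundtrip(trx_t* trx, fts_table_t* fts_table)
{
	byte		buf[FTS_MAX_INT_LEN + 1];
	fts_string_t	value;
	dberr_t		error;

	error = fts_config_set_ulint(trx, fts_table, "my_param", 42);

	if (error != DB_SUCCESS) {
		return(error);
	}

	value.f_str = buf;
	value.f_len = sizeof(buf) - 1;

	return(fts_config_get_value(trx, fts_table, "my_param", &value));
}
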
+/******************************************************************//**
+Increment the per index value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_index_value(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: increment config value
+ for this parameter name */
+ ulint delta) /*!< in: increment by this much */
+ __attribute__((nonnull));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ __attribute__((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+UNIV_INTERN
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+ const fts_index_cache_t*
+ index_cache, /*!< in: cache to search */
+ const fts_string_t*
+ text) /*!< in: word to search for */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Check cache for deleted doc id.
+@return TRUE if deleted */
+UNIV_INTERN
+ibool
+fts_cache_is_deleted_doc_id(
+/*========================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to search */
+ doc_id_t doc_id) /*!< in: doc id to search for */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Append deleted doc ids to vector and sort the vector. */
+UNIV_INTERN
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to use */
+ ib_vector_t* vector); /*!< in: append to this vector */
+/******************************************************************//**
+Wait for the background thread to start. We poll to detect change
+of state, which is acceptable, since the wait should happen only
+once during startup.
+@return TRUE if the thread started, else FALSE (i.e. timed out) */
+UNIV_INTERN
+ibool
+fts_wait_for_background_thread_to_start(
+/*====================================*/
+ dict_table_t* table, /*!< in: table to which the thread
+ is attached */
+ ulint max_wait); /*!< in: time in microseconds, if set
+ to 0 then it disables timeout
+ checking */
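
The poll-with-timeout idiom described for fts_wait_for_background_thread_to_start() looks roughly like the sketch below. The predicate, the 100-microsecond interval, and the accounting are illustrative assumptions; os_thread_sleep() is InnoDB's microsecond sleep.

/* Sketch only: polling for a state change with an optional timeout,
where max_wait == 0 disables the timeout, as the comment above says. */
static ibool
fts_sketch_poll(ibool (*started)(void), ulint max_wait)
{
	ulint	waited = 0;

	while (!started()) {
		if (max_wait > 0 && waited >= max_wait) {
			return(FALSE);	/* timed out */
		}

		os_thread_sleep(100);	/* microseconds */
		waited += 100;
	}

	return(TRUE);
}
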
+#ifdef FTS_DOC_STATS_DEBUG
+/******************************************************************//**
+Get the total number of words in the FTS for a particular FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_get_total_word_count(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: for this index */
+ ulint* total) /*!< out: total words */
+ __attribute__((nonnull, warn_unused_result));
+#endif
+/******************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+UNIV_INTERN
+fts_index_cache_t*
+fts_find_index_cache(
+/*================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to search */
+ const dict_index_t*
+ index) /*!< in: index to search for */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /*!< in: a table/index id */
+ char* str, /*!< in: buffer to write the id to */
+ bool hex_format __attribute__((unused)))
+ /*!< in: true for fixed hex format,
+ false for old ambiguous format */
+ __attribute__((nonnull));
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+ ib_id_t* id, /*!< out: a table id */
+ const char* str) /*!< in: buffer to read from */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+UNIV_INTERN
+int
+fts_get_table_id(
+/*=============*/
+ const fts_table_t*
+ fts_table, /*!< in: FTS Auxiliary table */
+ char* table_id) /*!< out: table id, must be at least
+ FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+ long */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the table to the OPTIMIZER's list. */
+UNIV_INTERN
+void
+fts_optimize_add_table(
+/*===================*/
+ dict_table_t* table) /*!< in: table to add */
+ __attribute__((nonnull));
+/******************************************************************//**
+Optimize a table. */
+UNIV_INTERN
+void
+fts_optimize_do_table(
+/*==================*/
+ dict_table_t* table) /*!< in: table to optimize */
+ __attribute__((nonnull));
+/******************************************************************//**
+Construct the prefix name of an FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name_prefix(
+/*======================*/
+ const fts_table_t*
+ fts_table) /*!< in: Auxiliary table type */
+ __attribute__((nonnull, malloc, warn_unused_result));
+/******************************************************************//**
+Add node positions. */
+UNIV_INTERN
+void
+fts_cache_node_add_positions(
+/*=========================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_node_t* node, /*!< in: word node */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* positions) /*!< in: fts_token_t::positions */
+ __attribute__((nonnull(2,4)));
+
+/******************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+UNIV_INTERN
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+ const char* param, /*!< in: base name of param */
+ const dict_index_t* index) /*!< in: index for config */
+ __attribute__((nonnull, malloc, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "fts0priv.ic"
+#endif
+
+#endif /* INNOBASE_FTS0PRIV_H */
diff --git a/storage/xtradb/include/fts0priv.ic b/storage/xtradb/include/fts0priv.ic
new file mode 100644
index 00000000000..2d07c60f980
--- /dev/null
+++ b/storage/xtradb/include/fts0priv.ic
@@ -0,0 +1,130 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.ic
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /* in: a table/index id */
+ char* str, /* in: buffer to write the id to */
+ bool hex_format __attribute__((unused)))
+ /* in: true for fixed hex format,
+ false for old ambiguous format */
+{
+
+#ifdef _WIN32
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
+ return(sprintf(str, UINT64PFx, id)););
+
+ /* Use this to construct old (5.6.14 and 5.7.3) ambiguous
+ Windows aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, "%016llu", id)););
+
+#else /* _WIN32 */
+
+ /* Use this to construct old (5.6.14 and 5.7.3) ambiguous
+ Windows aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
+ return(sprintf(str, "%016"PRIu64, id)););
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, UINT64PFx, id)););
+
+#endif /* _WIN32 */
+
+ /* As above, but this is only for those tables failing to rename. */
+ if (!hex_format) {
+#ifdef _WIN32
+ // FIXME: Use ut_snprintf() here and in the following one.
+ return(sprintf(str, "%016llu", id));
+#else /* _WIN32 */
+ return(sprintf(str, "%016"PRIu64, id));
+#endif /* _WIN32 */
+ }
+
+ return(sprintf(str, UINT64PFx, id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
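
A small worked example of the two forms fts_write_object_id() produces for the same id, assuming UINT64PFx prints 16 fixed-width hex digits (its usual InnoDB definition); the id value is illustrative.

/* Sketch only: for id = 43981 (0xabcd), both formats fill exactly
16 characters plus the final NUL, which is why callers size buffers
with FTS_AUX_MIN_TABLE_ID_LENGTH. */
static void
fts_sketch_show_formats(void)
{
	char	buf[FTS_AUX_MIN_TABLE_ID_LENGTH];
	ib_id_t	id = 43981;	/* 0xabcd */

	fts_write_object_id(id, buf, true);
	/* buf == "000000000000abcd"  (new fixed hex format) */

	fts_write_object_id(id, buf, false);
	/* buf == "0000000000043981"  (old decimal format) */
}
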
+@return TRUE if parse successful */ +UNIV_INLINE +ibool +fts_read_object_id( +/*===============*/ + ib_id_t* id, /* out: an id */ + const char* str) /* in: buffer to read from */ +{ + /* NOTE: this func doesn't care about whether current table + is set with HEX_NAME, the user of the id read here will check + if the id is HEX or DEC and do the right thing with it. */ + return(sscanf(str, UINT64PFx, id) == 1); +} + +/******************************************************************//** +Compare two fts_trx_table_t instances. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_cmp( +/*==============*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((table1->id > table2->id) + ? 1 + : (table1->id == table2->id) + ? 0 + : -1); +} + +/******************************************************************//** +Compare a table id with a fts_trx_table_t table id. +@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_id_cmp( +/*=================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const ullint* table_id = (const ullint*) p1; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((*table_id > table2->id) + ? 1 + : (*table_id == table2->id) + ? 0 + : -1); +} diff --git a/storage/xtradb/include/fts0tlex.h b/storage/xtradb/include/fts0tlex.h new file mode 100644 index 00000000000..f91533803e8 --- /dev/null +++ b/storage/xtradb/include/fts0tlex.h @@ -0,0 +1,349 @@ +#ifndef fts0tHEADER_H +#define fts0tHEADER_H 1 +#define fts0tIN_HEADER 1 + +#line 6 "../include/fts0tlex.h" + +#line 8 "../include/fts0tlex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. 
*/ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. 
+ */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void fts0trestart (FILE *input_file ,yyscan_t yyscanner ); +void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void fts0tpop_buffer_state (yyscan_t yyscanner ); + +YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); + +void *fts0talloc (yy_size_t ,yyscan_t yyscanner ); +void *fts0trealloc (void *,yy_size_t ,yyscan_t yyscanner ); +void fts0tfree (void * ,yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0twrap(n) 1 +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int fts0tlex_init (yyscan_t* scanner); + +int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int fts0tlex_destroy (yyscan_t yyscanner ); + +int fts0tget_debug (yyscan_t yyscanner ); + +void fts0tset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner ); + +void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *fts0tget_in (yyscan_t yyscanner ); + +void fts0tset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *fts0tget_out (yyscan_t yyscanner ); + +void fts0tset_out (FILE * out_str ,yyscan_t yyscanner ); + +int fts0tget_leng (yyscan_t yyscanner ); + +char *fts0tget_text (yyscan_t yyscanner ); + +int fts0tget_lineno (yyscan_t yyscanner ); + +void fts0tset_lineno (int line_number ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int fts0twrap (yyscan_t yyscanner ); +#else +extern int fts0twrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. 
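
The fts0t* entry points declared above follow the standard reentrant flex calling sequence. A minimal sketch, assuming the input string and the consume-until-EOF loop are illustrative (a token value of 0 conventionally signals end of input):

/* Sketch only: init, feed a string, pull tokens, destroy. */
static void
fts_sketch_tokenize(const char* query)
{
	yyscan_t	scanner;

	fts0tlex_init(&scanner);
	fts0t_scan_string(query, scanner);

	while (fts0tlex(scanner) != 0) {
		/* fts0tget_text(scanner) is the current token. */
	}

	fts0tlex_destroy(scanner);
}
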
*/
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0tlex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0tlex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 68 "fts0tlex.l"
+
+
+#line 348 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/xtradb/include/fts0types.h b/storage/xtradb/include/fts0types.h
new file mode 100644
index 00000000000..64677428331
--- /dev/null
+++ b/storage/xtradb/include/fts0types.h
@@ -0,0 +1,474 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "que0types.h"
+#include "ut0byte.h"
+#include "fut0fut.h"
+#include "ut0rbt.h"
+#include "fts0fts.h"
+
+/** Types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+struct fts_utf8_str_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_t {
+ doc_id_t doc_id; /*!< Document id */
+ ulint word_count; /*!< Total words in the document */
+};
+
+/** Its main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+ fts_index_cache_t*
+ index_cache; /*!< The index cache instance */
+
+ /*!< Parsed sql statement */
+ que_t* get_document_graph;
+ fts_cache_t* cache; /*!< The parent cache */
+};
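
The cached get_document_graph implies a build-once, reuse-many pattern. A hedged sketch of that pattern follows; how the SQL text is constructed is elided, and rebinding of parameters between executions is assumed to be handled elsewhere (fts_parse_sql() and fts_eval_sql() are declared in fts0priv.h).

/* Sketch only: lazily parse the fetch statement, then reuse the
cached graph for every subsequent document fetch. */
static dberr_t
fts_sketch_get_document(fts_get_doc_t* get_doc, trx_t* trx,
			fts_table_t* fts_table, const char* sql)
{
	if (get_doc->get_document_graph == NULL) {
		/* Parse once; NULL info is allowed per fts0priv.h. */
		get_doc->get_document_graph =
			fts_parse_sql(fts_table, NULL, sql);
	}

	return(fts_eval_sql(trx, get_doc->get_document_graph));
}
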
+/** Since we can have multiple FTS indexes on a table, we keep a
+per index cache of words etc. */
+struct fts_index_cache_t {
+ dict_index_t* index; /*!< The FTS index instance */
+
+ ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*,
+ cells are fts_tokenizer_word_t*.*/
+
+ ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t
+ contained in the memory buffer.
+ Must be in sorted order (ascending).
+ The ideal choice is an rb tree but
+ the rb tree imposes a space overhead
+ that we can do without */
+
+ que_t** ins_graph; /*!< Insert query graphs */
+
+ que_t** sel_graph; /*!< Select query graphs */
+ CHARSET_INFO* charset; /*!< charset */
+};
+
+/** For supporting the tracking of updates on multiple FTS indexes we need
+to track which FTS indexes need to be updated. For INSERT and DELETE we
+update all fts indexes. */
+struct fts_update_t {
+ doc_id_t doc_id; /*!< The doc id affected */
+
+ ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be
+ updated. A NULL value means all
+ indexes need to be updated. This
+ vector is not allocated on the heap
+ and so must be freed explicitly,
+ when we are done with it */
+};
+
+/** Stopword control information. */
+struct fts_stopword_t {
+ ulint status; /*!< Status of the stopword tree */
+ ib_alloc_t* heap; /*!< The memory allocator to use */
+ ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */
+ CHARSET_INFO* charset; /*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+ trx_t* trx; /*!< The transaction used for SYNCing
+ the cache to disk */
+ dict_table_t* table; /*!< Table with FTS index(es) */
+ ulint max_cache_size; /*!< Max size in bytes of the cache */
+ ibool cache_full; /*!< flag, when true it indicates that
+ we need to sync the cache to disk */
+ ulint lower_index; /*!< the start index of the doc id
+ vector from where to start adding
+ documents to the FTS cache */
+ ulint upper_index; /*!< max index of the doc id vector to
+ add to the FTS cache */
+ ibool interrupted; /*!< TRUE if SYNC was interrupted */
+ doc_id_t min_doc_id; /*!< The smallest doc id added to the
+ cache. It should be equal to
+ doc_ids[lower_index] */
+ doc_id_t max_doc_id; /*!< The doc id at which the cache was
+ noted as being full, we use this to
+ set the upper_limit field */
+ ib_time_t start_time; /*!< SYNC start time */
+};
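
The relationship between fts_sync_t and the cache size accounting in fts_cache_t (defined just below) reduces to a simple threshold check; the sketch is an assumption about where that comparison happens, not a quote of the actual trigger code.

/* Sketch only: SYNC is triggered when the in-memory ilist bytes
(fts_cache_t::total_size below) exceed the configured maximum. */
static ibool
fts_sketch_cache_needs_sync(const fts_sync_t* sync, ulint total_size)
{
	return(total_size > sync->max_cache_size);
}
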
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_t {
+ rw_lock_t lock; /*!< lock protecting all access to the
+ memory buffer. FIXME: this needs to
+ be our new upgrade-capable rw-lock */
+
+ rw_lock_t init_lock; /*!< lock used for the cache
+ initialization; it has a different
+ SYNC level from the cache lock above */
+
+ ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
+
+ ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
+
+ ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
+
+ ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
+ element is of type fts_update_t */
+
+ ib_vector_t* indexes; /*!< We store the stats and inverted
+ index for the individual FTS indexes
+ in this vector. Each element is
+ an instance of fts_index_cache_t */
+
+ ib_vector_t* get_docs; /*!< information required to read
+ the document from the table. Each
+ element is of type fts_doc_t */
+
+ ulint total_size; /*!< total size consumed by the ilist
+ field of all nodes. SYNC is run
+ whenever this gets too big */
+ fts_sync_t* sync; /*!< sync structure to sync data to
+ disk */
+ ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes
+ and deleted_doc_ids, i.e. transient
+ objects, they are recreated after
+ a SYNC is completed */
+
+
+ ib_alloc_t* self_heap; /*!< This heap is the heap out of
+ which an instance of the cache itself
+ was created. Objects created using
+ this heap will last for the lifetime
+ of the cache */
+
+ doc_id_t next_doc_id; /*!< Next doc id */
+
+ doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
+
+ doc_id_t first_doc_id; /*!< first doc id since this table
+ was opened */
+
+ ulint deleted; /*!< Number of doc ids deleted since
+ last optimized. This variable is
+ covered by deleted_lock */
+
+ ulint added; /*!< Number of doc ids added since last
+ optimized. This variable is covered by
+ the deleted lock */
+
+ fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
+ mem_heap_t* cache_heap; /*!< Cache Heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table */
+struct fts_node_t {
+ doc_id_t first_doc_id; /*!< First document id in ilist. */
+
+ doc_id_t last_doc_id; /*!< Last document id in ilist. */
+
+ byte* ilist; /*!< Binary list of documents & word
+ positions the token appears in.
+ TODO: For now, these are simply
+ ut_malloc'd, but if testing shows
+ that they waste memory unacceptably, a
+ special memory allocator will have
+ to be written */
+
+ ulint doc_count; /*!< Number of doc ids in ilist */
+
+ ulint ilist_size; /*!< Used size of ilist in bytes. */
+
+ ulint ilist_size_alloc;
+ /*!< Allocated size of ilist in
+ bytes */
+};
+
+/** A tokenizer word. Contains information about one word. */
+struct fts_tokenizer_word_t {
+ fts_string_t text; /*!< Token text. */
+
+ ib_vector_t* nodes; /*!< Word node ilists, each element is
+ of type fts_node_t */
+};
+
+/** Word text plus its array of nodes as on disk in FTS index */
+struct fts_word_t {
+ fts_string_t text; /*!< Word value in UTF-8 */
+ ib_vector_t* nodes; /*!< Nodes read from disk */
+
+ ib_alloc_t* heap_alloc; /*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+ void* read_arg; /*!< Arg for the sql_callback */
+
+ fts_sql_callback
+ read_record; /*!< Callback for reading index
+ record */
+ ulint total_memory; /*!< Total memory used */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+ ulint value; /*!< Character value at which
+ to split */
+
+ const char* suffix; /*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+ fts_string_t text; /*!< document text */
+
+ ibool found; /*!< TRUE if the document was found
+ successfully in the database */
+
+ ib_rbt_t* tokens; /*!< This is filled when the document
+ is tokenized. Tokens; indexed by
+ fts_string_t*, cells are of type
+ fts_token_t* */
+
+ ib_alloc_t* self_heap; /*!< An instance of this type is
+ allocated from this heap along
+ with any objects that have the
+ same lifespan, most notably
+ the vector of token positions */
+ CHARSET_INFO* charset; /*!< Document's charset info */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+ fts_string_t text; /*!< token text */
+
+ ib_vector_t* positions; /*!< an array of the positions the
+ token is found in; each item is
+ actually an ulint. */
+};
+
+/** It's defined in fts/fts0fts.c */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two UTF-8 strings.
*/ +UNIV_INLINE +int +fts_utf8_string_cmp( +/*================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Compare two UTF-8 strings, and return match (0) if +passed in "key" value equals or is the prefix of the "node" value. */ +UNIV_INLINE +int +fts_utf8_string_cmp_prefix( +/*=======================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Compare two fts_trx_row_t instances doc_ids. */ +UNIV_INLINE +int +fts_trx_row_doc_id_cmp( +/*===================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Compare two fts_ranking_t instances doc_ids. */ +UNIV_INLINE +int +fts_ranking_doc_id_cmp( +/*===================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Compare two fts_update_t instances doc_ids. */ +UNIV_INLINE +int +fts_update_doc_id_cmp( +/*==================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Decode and return the integer that was encoded using our VLC scheme.*/ +UNIV_INLINE +ulint +fts_decode_vlc( +/*===========*/ + /*!< out: value decoded */ + byte** ptr); /*!< in: ptr to decode from, this ptr is + incremented by the number of bytes decoded */ + +/******************************************************************//** +Duplicate an UTF-8 string. */ +UNIV_INLINE +void +fts_utf8_string_dup( +/*================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + fts_string_t* dst, /*!< in: dup to here */ + const fts_string_t* src, /*!< in: src string */ + mem_heap_t* heap); /*!< in: heap to use */ + +/******************************************************************//** +Return length of val if it were encoded using our VLC scheme. */ +UNIV_INLINE +ulint +fts_get_encoded_len( +/*================*/ + /*!< out: length of value + encoded, in bytes */ + ulint val); /*!< in: value to encode */ + +/******************************************************************//** +Encode an integer using our VLC scheme and return the length in bytes. */ +UNIV_INLINE +ulint +fts_encode_int( +/*===========*/ + /*!< out: length of value + encoded, in bytes */ + ulint val, /*!< in: value to encode */ + byte* buf); /*!< in: buffer, must have + enough space */ + +/******************************************************************//** +Decode a UTF-8 character. + +http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: + + Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte +00000000 0xxxxxxx 0xxxxxxx +00000yyy yyxxxxxx 110yyyyy 10xxxxxx +zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx +000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx + +This function decodes UTF-8 sequences up to 6 bytes (31 bits). + +On error *ptr will point to the first byte that was not correctly +decoded. This will hopefully help in resyncing the input. 
*/ +UNIV_INLINE +ulint +fts_utf8_decode( +/*============*/ + /*!< out: UTF8_ERROR if *ptr + did not point to a valid + UTF-8 sequence, or the + Unicode code point. */ + const byte** ptr); /*!< in/out: pointer to + UTF-8 string. The + pointer is advanced to + the start of the next + character. */ + +/******************************************************************//** +Lowercase an UTF-8 string. */ +UNIV_INLINE +void +fts_utf8_tolower( +/*=============*/ + fts_string_t* str); /*!< in: string */ + +/******************************************************************//** +Get the selected FTS aux INDEX suffix. */ +UNIV_INLINE +const char* +fts_get_suffix( +/*===========*/ + ulint selected); /*!< in: selected index */ + +/******************************************************************** +Get the number of index selectors. */ +UNIV_INLINE +ulint +fts_get_n_selectors(void); +/*=====================*/ + +/******************************************************************//** +Select the FTS auxiliary index for the given string. +@return the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( +/*=============*/ + const CHARSET_INFO* cs, /*!< Charset */ + const byte* str, /*!< in: word string */ + ulint len); /*!< in: string length */ + +/******************************************************************** +Select the next FTS auxiliary index for the given character. +@return the next index to use for character */ +UNIV_INLINE +ulint +fts_select_next_index( +/*==================*/ + const CHARSET_INFO* cs, /*!< Charset */ + const byte* str, /*!< in: string */ + ulint len); /*!< in: string length */ + +#ifndef UNIV_NONINL +#include "fts0types.ic" +#include "fts0vlc.ic" +#endif + +#endif /* INNOBASE_FTS0TYPES_H */ diff --git a/storage/xtradb/include/fts0types.ic b/storage/xtradb/include/fts0types.ic new file mode 100644 index 00000000000..f0dfd023a70 --- /dev/null +++ b/storage/xtradb/include/fts0types.ic @@ -0,0 +1,388 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0types.ic +Full text search types. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0TYPES_IC +#define INNOBASE_FTS0TYPES_IC + +#include <ctype.h> + +#include "rem0cmp.h" +#include "ha_prototypes.h" + +extern const ulint UTF8_ERROR; + +/* Determine if a UTF-8 continuation byte is valid. */ +#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80) + +/******************************************************************//** +Duplicate an UTF-8 string. 
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +void +fts_utf8_string_dup( +/*================*/ + fts_string_t* dst, /*!< in: dup to here */ + const fts_string_t* src, /*!< in: src string */ + mem_heap_t* heap) /*!< in: heap to use */ +{ + dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1); + memcpy(dst->f_str, src->f_str, src->f_len); + + dst->f_len = src->f_len; + dst->f_str[src->f_len] = 0; + dst->f_n_char = src->f_n_char; +} + +/******************************************************************//** +Compare two fts_trx_row_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_row_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1; + const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2; + + return((int)(tr1->doc_id - tr2->doc_id)); +} + +/******************************************************************//** +Compare two fts_ranking_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_ranking_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_ranking_t* rk1 = (const fts_ranking_t*) p1; + const fts_ranking_t* rk2 = (const fts_ranking_t*) p2; + + return((int)(rk1->doc_id - rk2->doc_id)); +} + +/******************************************************************//** +Compare two fts_update_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_update_doc_id_cmp( +/*==================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_update_t* up1 = (const fts_update_t*) p1; + const fts_update_t* up2 = (const fts_update_t*) p2; + + return((int)(up1->doc_id - up2->doc_id)); +} + + +/******************************************************************//** +Lowercase an UTF-8 string. */ +UNIV_INLINE +void +fts_utf8_tolower( +/*=============*/ + fts_string_t* str) /*!< in: string */ +{ + innobase_casedn_str((char*) str->f_str); +} + +/******************************************************************//** +Compare two UTF-8 strings. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_utf8_string_cmp( +/*================*/ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + + return(cmp_data_data_slow_varchar( + s1->f_str, s1->f_len, s2->f_str, s2->f_len)); +} + +/******************************************************************//** +Compare two UTF-8 strings, and return match (0) if +passed in "key" value equals or is the prefix of the "node" value. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_utf8_string_cmp_prefix( +/*=======================*/ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + int result; + ulint len; + + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + + len = ut_min(s1->f_len, s2->f_len); + + result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len); + + if (result) { + return(result); + } + + if (s1->f_len > s2->f_len) { + return(1); + } + + return(0); +} + +/******************************************************************//** +Decode a UTF-8 character. 
+ +http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: + + Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte +00000000 0xxxxxxx 0xxxxxxx +00000yyy yyxxxxxx 110yyyyy 10xxxxxx +zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx +000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx + +This function decodes UTF-8 sequences up to 6 bytes (31 bits). + +On error *ptr will point to the first byte that was not correctly +decoded. This will hopefully help in resyncing the input. +@return UTF8_ERROR if *ptr did not point to a valid +UTF-8 sequence, or the Unicode code point. */ +UNIV_INLINE +ulint +fts_utf8_decode( +/*============*/ + const byte** ptr) /*!< in/out: pointer to + UTF-8 string. The + pointer is advanced to + the start of the next + character. */ +{ + const byte* p = *ptr; + ulint ch = *p++; +#ifdef UNIV_DEBUG + ulint min_ch; +#endif /* UNIV_DEBUG */ + + if (UNIV_LIKELY(ch < 0x80)) { + /* 0xxxxxxx */ + } else if (UNIV_UNLIKELY(ch < 0xC0)) { + /* A continuation byte cannot start a code. */ + goto err_exit; + } else if (ch < 0xE0) { + /* 110yyyyy 10xxxxxx */ + ch &= 0x1F; + ut_d(min_ch = 0x80); + goto get1; + } else if (ch < 0xF0) { + /* 1110zzzz 10yyyyyy 10xxxxxx */ + ch &= 0x0F; + ut_d(min_ch = 0x800); + goto get2; + } else if (ch < 0xF8) { + /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ch &= 0x07; + ut_d(min_ch = 0x10000); + goto get3; + } else if (ch < 0xFC) { + /* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ch &= 0x03; + ut_d(min_ch = 0x200000); + goto get4; + } else if (ch < 0xFE) { + /* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ut_d(min_ch = 0x4000000); + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get4: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get3: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get2: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get1: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; + + /* The following is needed in the 6-byte case + when ulint is wider than 32 bits. */ + ch &= 0xFFFFFFFF; + + /* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs) + and U+FFFE and U+FFFF cannot occur in valid UTF-8. */ + + if ( (ch >= 0xD800 && ch <= 0xDFFF) +#ifdef UNIV_DEBUG + || ch < min_ch +#endif /* UNIV_DEBUG */ + || ch == 0xFFFE || ch == 0xFFFF) { + + ch = UTF8_ERROR; + } + } else { +err_exit: + ch = UTF8_ERROR; + } + + *ptr = p; + + return(ch); +} + +/******************************************************************//** +Get the first character's code position for FTS index partition */ +extern +ulint +innobase_strnxfrm( +/*==============*/ + const CHARSET_INFO* cs, /*!< in: Character set */ + const uchar* p2, /*!< in: string */ + const ulint len2); /*!< in: string length */ + +/******************************************************************//** +Select the FTS auxiliary index for the given character. +@return the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( +/*=============*/ + const CHARSET_INFO* cs, /*!< in: Charset */ + const byte* str, /*!< in: string */ + ulint len) /*!< in: string length */ +{ + ulint selected = 0; + ulint value = innobase_strnxfrm(cs, str, len); + + while (fts_index_selector[selected].value != 0) { + + if (fts_index_selector[selected].value == value) { + + return(selected); + + } else if (fts_index_selector[selected].value > value) { + + return(selected > 0 ? 
selected - 1 : 0); + } + + ++selected; + } + + ut_ad(selected > 1); + + return(selected - 1); +} + +/******************************************************************//** +Select the next FTS auxiliary index for the given character. +@return the next index to use for character */ +UNIV_INLINE +ulint +fts_select_next_index( +/*==================*/ + const CHARSET_INFO* cs, /*!< in: Charset */ + const byte* str, /*!< in: string */ + ulint len) /*!< in: string length */ +{ + ulint selected = 0; + ulint value = innobase_strnxfrm(cs, str, len); + + while (fts_index_selector[selected].value != 0) { + + if (fts_index_selector[selected].value == value) { + + return(selected + 1); + + } else if (fts_index_selector[selected].value > value) { + + return(selected); + } + + ++selected; + } + + ut_ad(selected > 0); + + return((ulint) selected); +} + +/******************************************************************//** +Return the selected FTS aux index suffix. */ +UNIV_INLINE +const char* +fts_get_suffix( +/*===========*/ + ulint selected) /*!< in: selected index */ +{ + return(fts_index_selector[selected].suffix); +} + +/******************************************************************//** +Get the number of index selectors. +@return The number of selectors */ +UNIV_INLINE +ulint +fts_get_n_selectors(void) +/*=====================*/ +{ + ulint i = 0; + + // FIXME: This is a hack + while (fts_index_selector[i].value != 0) { + ++i; + } + + return(i); +} + +#endif /* INNOBASE_FTS0TYPES_IC */ diff --git a/storage/xtradb/include/fts0vlc.ic b/storage/xtradb/include/fts0vlc.ic new file mode 100644 index 00000000000..e79bcf59347 --- /dev/null +++ b/storage/xtradb/include/fts0vlc.ic @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0vlc.ic +Full text variable length integer encoding/decoding. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0VLC_IC +#define INNOBASE_FTS0VLC_IC + +#include "fts0types.h" + +/******************************************************************//** +Return length of val if it were encoded using our VLC scheme. 
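
A worked round trip through the 7-bit VLC scheme implemented below helps pin down the byte layout: high-order groups come first, and the high bit marks the LAST byte of a value (unlike LEB128, where it marks continuation). The value 300 is illustrative.

/* Sketch only: encoding 300 (binary 1 0010 1100) emits
	byte 0: 0x02  (bits 8..7)
	byte 1: 0xAC  (bits 6..0, high bit set = last byte)
so fts_get_encoded_len(300) == 2 and decoding restores 300. */
static void
fts_sketch_vlc_roundtrip(void)
{
	byte	buf[5];
	byte*	p = buf;
	ulint	len = fts_encode_int(300, buf);

	ut_a(len == fts_get_encoded_len(300));
	ut_a(fts_decode_vlc(&p) == 300);
	ut_a(p == buf + len);	/* decode advanced past both bytes */
}
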
+FIXME: We will need to be able encode 8 bytes value +@return length of value encoded, in bytes */ +UNIV_INLINE +ulint +fts_get_encoded_len( +/*================*/ + ulint val) /* in: value to encode */ +{ + if (val <= 127) { + return(1); + } else if (val <= 16383) { + return(2); + } else if (val <= 2097151) { + return(3); + } else if (val <= 268435455) { + return(4); + } else { + /* Possibly we should care that on 64-bit machines ulint can + contain values that we can't encode in 5 bytes, but + fts_encode_int doesn't handle them either so it doesn't much + matter. */ + + return(5); + } +} + +/******************************************************************//** +Encode an integer using our VLC scheme and return the length in bytes. +@return length of value encoded, in bytes */ +UNIV_INLINE +ulint +fts_encode_int( +/*===========*/ + ulint val, /* in: value to encode */ + byte* buf) /* in: buffer, must have enough space */ +{ + ulint len; + + if (val <= 127) { + *buf = (byte) val; + + len = 1; + } else if (val <= 16383) { + *buf++ = (byte)(val >> 7); + *buf = (byte)(val & 0x7F); + + len = 2; + } else if (val <= 2097151) { + *buf++ = (byte)(val >> 14); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 3; + } else if (val <= 268435455) { + *buf++ = (byte)(val >> 21); + *buf++ = (byte)((val >> 14) & 0x7F); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 4; + } else { + /* Best to keep the limitations of the 32/64 bit versions + identical, at least for the time being. */ + ut_ad(val <= 4294967295u); + + *buf++ = (byte)(val >> 28); + *buf++ = (byte)((val >> 21) & 0x7F); + *buf++ = (byte)((val >> 14) & 0x7F); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 5; + } + + /* High-bit on means "last byte in the encoded integer". */ + *buf |= 0x80; + + return(len); +} + +/******************************************************************//** +Decode and return the integer that was encoded using our VLC scheme. +@return value decoded */ +UNIV_INLINE +ulint +fts_decode_vlc( +/*===========*/ + byte** ptr) /* in: ptr to decode from, this ptr is + incremented by the number of bytes decoded */ +{ + ulint val = 0; + + for (;;) { + byte b = **ptr; + + ++*ptr; + val |= (b & 0x7F); + + /* High-bit on means "last byte in the encoded integer". */ + if (b & 0x80) { + break; + } else { + val <<= 7; + } + } + + return(val); +} + +#endif diff --git a/storage/xtradb/include/fut0fut.h b/storage/xtradb/include/fut0fut.h new file mode 100644 index 00000000000..851cdb44cdf --- /dev/null +++ b/storage/xtradb/include/fut0fut.h @@ -0,0 +1,55 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0fut.h +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + + +#ifndef fut0fut_h +#define fut0fut_h + +#include "univ.i" + +#include "fil0fil.h" +#include "mtr0mtr.h" + +/********************************************************************//** +Gets a pointer to a file address and latches the page. +@return pointer to a byte in a frame; the file page in the frame is +bufferfixed and latched */ +UNIV_INLINE +byte* +fut_get_ptr( +/*========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + fil_addr_t addr, /*!< in: file address */ + ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */ + mtr_t* mtr); /*!< in: mtr handle */ + +#ifndef UNIV_NONINL +#include "fut0fut.ic" +#endif + +#endif + diff --git a/storage/xtradb/include/fut0fut.ic b/storage/xtradb/include/fut0fut.ic new file mode 100644 index 00000000000..15c964df6c7 --- /dev/null +++ b/storage/xtradb/include/fut0fut.ic @@ -0,0 +1,60 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0fut.ic +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + +#include "srv0srv.h" +#include "sync0rw.h" +#include "buf0buf.h" + +/********************************************************************//** +Gets a pointer to a file address and latches the page. 
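+A minimal usage sketch (editorial illustration; 'base', 'space' and
+'zip_size' are placeholders, and a mini-transaction is assumed to have
+been started):
+
+	fil_addr_t	addr = flst_get_first(base, mtr);
+	byte*		ptr = fut_get_ptr(space, zip_size, addr,
+					  RW_X_LATCH, mtr);
+
+The page stays bufferfixed and latched until the mtr is committed.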
+@return pointer to a byte in a frame; the file page in the frame is +bufferfixed and latched */ +UNIV_INLINE +byte* +fut_get_ptr( +/*========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + fil_addr_t addr, /*!< in: file address */ + ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr handle */ +{ + buf_block_t* block; + byte* ptr; + + ut_ad(addr.boffset < UNIV_PAGE_SIZE); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + block = buf_page_get(space, zip_size, addr.page, rw_latch, mtr); + + SRV_CORRUPT_TABLE_CHECK(block, return(0);); + + ptr = buf_block_get_frame(block) + addr.boffset; + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + return(ptr); +} diff --git a/storage/xtradb/include/fut0lst.h b/storage/xtradb/include/fut0lst.h new file mode 100644 index 00000000000..90f9a65d4fa --- /dev/null +++ b/storage/xtradb/include/fut0lst.h @@ -0,0 +1,217 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0lst.h +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef fut0lst_h +#define fut0lst_h + +#include "univ.i" + +#include "fil0fil.h" +#include "mtr0mtr.h" + + +/* The C 'types' of base node and list node: these should be used to +write self-documenting code. Of course, the sizeof macro cannot be +applied to these types! */ + +typedef byte flst_base_node_t; +typedef byte flst_node_t; + +/* The physical size of a list base node in bytes */ +#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) + +/* The physical size of a list node in bytes */ +#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE) + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Initializes a list base node. */ +UNIV_INLINE +void +flst_init( +/*======*/ + flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Adds a node as the last node in a list. */ +UNIV_INTERN +void +flst_add_last( +/*==========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node, /*!< in: node to add */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Adds a node as the first node in a list. 
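+For an empty list this sets both the first and the last pointer of the
+base node to the new node (editorial note), so after
+
+	flst_init(base, mtr);
+	flst_add_first(base, node, mtr);
+
+flst_get_first(base, mtr) and flst_get_last(base, mtr) return the same
+address.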
*/ +UNIV_INTERN +void +flst_add_first( +/*===========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node, /*!< in: node to add */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Inserts a node after another in a list. */ +UNIV_INTERN +void +flst_insert_after( +/*==============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node1, /*!< in: node to insert after */ + flst_node_t* node2, /*!< in: node to add */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Inserts a node before another in a list. */ +UNIV_INTERN +void +flst_insert_before( +/*===============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: node to insert */ + flst_node_t* node3, /*!< in: node to insert before */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Removes a node. */ +UNIV_INTERN +void +flst_remove( +/*========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: node to remove */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Cuts off the tail of the list, including the node given. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_cut_end( +/*=========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: first node to remove */ + ulint n_nodes,/*!< in: number of nodes to remove, + must be >= 1 */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Cuts off the tail of the list, not including the given node. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_truncate_end( +/*==============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: first node not to remove */ + ulint n_nodes,/*!< in: number of nodes to remove */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list length. +@return length */ +UNIV_INLINE +ulint +flst_get_len( +/*=========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list first node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_first( +/*===========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list last node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_last( +/*==========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list next node address. 
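+A complete forward traversal then looks like this (editorial sketch;
+fil_addr_is_null() is from fil0fil.h, and space/zip_size identify the
+tablespace the list pages live in):
+
+	fil_addr_t	addr = flst_get_first(base, mtr);
+
+	while (!fil_addr_is_null(addr)) {
+		flst_node_t*	node = fut_get_ptr(
+			space, zip_size, addr, RW_X_LATCH, mtr);
+
+		addr = flst_get_next_addr(node, mtr);
+	}
+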
+@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_next_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list prev node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_prev_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Writes a file address. */ +UNIV_INLINE +void +flst_write_addr( +/*============*/ + fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + fil_addr_t addr, /*!< in: file address */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Reads a file address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_read_addr( +/*===========*/ + const fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Validates a file-based list. +@return TRUE if ok */ +UNIV_INTERN +ibool +flst_validate( +/*==========*/ + const flst_base_node_t* base, /*!< in: pointer to base node of list */ + mtr_t* mtr1); /*!< in: mtr */ +/********************************************************************//** +Prints info of a file-based list. */ +UNIV_INTERN +void +flst_print( +/*=======*/ + const flst_base_node_t* base, /*!< in: pointer to base node of list */ + mtr_t* mtr); /*!< in: mtr */ + + +#ifndef UNIV_NONINL +#include "fut0lst.ic" +#endif + +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/xtradb/include/fut0lst.ic b/storage/xtradb/include/fut0lst.ic new file mode 100644 index 00000000000..d18cf21378f --- /dev/null +++ b/storage/xtradb/include/fut0lst.ic @@ -0,0 +1,167 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0lst.ic +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0fut.h" +#include "mtr0log.h" +#include "buf0buf.h" + +/* We define the field offsets of a node for the list */ +#define FLST_PREV 0 /* 6-byte address of the previous list element; + the page part of address is FIL_NULL, if no + previous element */ +#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next + list element; the page part of address + is FIL_NULL, if no next element */ + +/* We define the field offsets of a base node for the list */ +#define FLST_LEN 0 /* 32-bit list length field */ +#define FLST_FIRST 4 /* 6-byte address of the first element + of the list; undefined if empty list */ +#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the + last element of the list; undefined + if empty list */ + +/********************************************************************//** +Writes a file address. */ +UNIV_INLINE +void +flst_write_addr( +/*============*/ + fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + fil_addr_t addr, /*!< in: file address */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(faddr && mtr); + ut_ad(mtr_memo_contains_page(mtr, faddr, MTR_MEMO_PAGE_X_FIX)); + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + + mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr); + mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset, + MLOG_2BYTES, mtr); +} + +/********************************************************************//** +Reads a file address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_read_addr( +/*===========*/ + const fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + fil_addr_t addr; + + ut_ad(faddr && mtr); + + addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr); + addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES, + mtr); + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + return(addr); +} + +/********************************************************************//** +Initializes a list base node. */ +UNIV_INLINE +void +flst_init( +/*======*/ + flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + + mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr); + flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); + flst_write_addr(base + FLST_LAST, fil_addr_null, mtr); +} + +/********************************************************************//** +Gets list length. 
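+(Editorial illustration of the layout defined above, assuming the usual
+FIL_ADDR_SIZE of 6 bytes: a base node spans 16 bytes, with the 32-bit
+FLST_LEN at offsets 0..3, FLST_FIRST at offsets 4..9 and FLST_LAST at
+offsets 10..15, matching FLST_BASE_NODE_SIZE == 4 + 2 * FIL_ADDR_SIZE.
+flst_get_len() below simply reads the first of these fields.)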
+@return length */ +UNIV_INLINE +ulint +flst_get_len( +/*=========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(mtr_read_ulint(base + FLST_LEN, MLOG_4BYTES, mtr)); +} + +/********************************************************************//** +Gets list first node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_first( +/*===========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(base + FLST_FIRST, mtr)); +} + +/********************************************************************//** +Gets list last node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_last( +/*==========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(base + FLST_LAST, mtr)); +} + +/********************************************************************//** +Gets list next node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_next_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(node + FLST_NEXT, mtr)); +} + +/********************************************************************//** +Gets list prev node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_prev_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(node + FLST_PREV, mtr)); +} diff --git a/storage/xtradb/include/ha0ha.h b/storage/xtradb/include/ha0ha.h new file mode 100644 index 00000000000..7351b407e8c --- /dev/null +++ b/storage/xtradb/include/ha0ha.h @@ -0,0 +1,265 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0ha.h +The hash table with external chains + +Created 8/18/1994 Heikki Tuuri +*******************************************************/ + +#ifndef ha0ha_h +#define ha0ha_h + +#include "univ.i" + +#include "hash0hash.h" +#include "page0types.h" +#include "buf0types.h" +#include "rem0types.h" + +/*************************************************************//** +Looks for an element in a hash table. 
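+A minimal lookup sketch (editorial illustration; 'fold' must have been
+computed with the same fold function that was used when the entry was
+inserted, and the appropriate hash latch must be held, see
+hash_assert_can_search()):
+
+	const rec_t*	rec = ha_search_and_get_data(table, fold);
+
+	if (rec == NULL) {
+		/* no node with this fold value in the chain */
+	}
+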
+@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: folded value of the searched data */ +/*********************************************************//** +Looks for an element when we know the pointer to the data and updates +the pointer to data if found. +@return TRUE if found */ +UNIV_INTERN +ibool +ha_search_and_update_if_found_func( +/*===============================*/ + hash_table_t* table, /*!< in/out: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data, /*!< in: pointer to the data */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* new_block,/*!< in: block containing new_data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* new_data);/*!< in: new pointer to the data */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Looks for an element when we know the pointer to the data and +updates the pointer to data if found. +@param table in/out: hash table +@param fold in: folded value of the searched data +@param data in: pointer to the data +@param new_block in: block containing new_data +@param new_data in: new pointer to the data */ +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found_func(table,fold,data,new_block,new_data) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** Looks for an element when we know the pointer to the data and +updates the pointer to data if found. +@param table in/out: hash table +@param fold in: folded value of the searched data +@param data in: pointer to the data +@param new_block ignored: block containing new_data +@param new_data in: new pointer to the data */ +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found_func(table,fold,data,new_data) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/*************************************************************//** +Creates a hash table with at least n array cells. The actual number +of cells is chosen to be a prime number slightly bigger than n. +@return own: created table */ +UNIV_INTERN +hash_table_t* +ha_create_func( +/*===========*/ + ulint n, /*!< in: number of array cells */ +#ifdef UNIV_SYNC_DEBUG + ulint mutex_level, /*!< in: level of the mutexes in the latching + order: this is used in the debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_mutexes, /*!< in: number of mutexes to protect the + hash table: must be a power of 2, or 0 */ + ulint type); /*!< in: type of datastructure for which + the memory heap is going to be used e.g.: + MEM_HEAP_FOR_BTR_SEARCH or + MEM_HEAP_FOR_PAGE_HASH */ +#ifdef UNIV_SYNC_DEBUG +/** Creates a hash table. +@return own: created table +@param n_c in: number of array cells. The actual number of cells is +chosen to be a slightly bigger prime number. +@param level in: level of the mutexes in the latching order +@param n_m in: number of mutexes to protect the hash table; + must be a power of 2, or 0 */ +# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,level,n_m,type) +#else /* UNIV_SYNC_DEBUG */ +/** Creates a hash table. +@return own: created table +@param n_c in: number of array cells. The actual number of cells is +chosen to be a slightly bigger prime number. 
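+A hypothetical call (editorial illustration; the cell and mutex counts
+are placeholders) is ha_create(4096, 64, MEM_HEAP_FOR_BTR_SEARCH, level),
+which allocates a prime number of cells slightly above 4096 and protects
+them with 64 mutexes.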
+@param level in: level of the mutexes in the latching order +@param n_m in: number of mutexes to protect the hash table; + must be a power of 2, or 0 */ +# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,n_m,type) +#endif /* UNIV_SYNC_DEBUG */ + +/*************************************************************//** +Empties a hash table and frees the memory heaps. */ +UNIV_INTERN +void +ha_clear( +/*=====*/ + hash_table_t* table); /*!< in, own: hash table */ + +/*************************************************************//** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. +@return TRUE if succeed, FALSE if no more memory could be allocated */ +UNIV_INTERN +ibool +ha_insert_for_fold_func( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of data; if a node with + the same fold value already exists, it is + updated to point to the same data, and no new + node is created! */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data); /*!< in: data, must not be NULL */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. +@return TRUE if succeed, FALSE if no more memory could be allocated +@param t in: hash table +@param f in: folded value of data +@param b in: buffer block containing the data +@param d in: data, must not be NULL */ +# define ha_insert_for_fold(t,f,b,d) do { \ + ha_insert_for_fold_func(t,f,b,d); \ + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); \ +} while(0) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. +@return TRUE if succeed, FALSE if no more memory could be allocated +@param t in: hash table +@param f in: folded value of data +@param b ignored: buffer block containing the data +@param d in: data, must not be NULL */ +# define ha_insert_for_fold(t,f,b,d) do { \ + ha_insert_for_fold_func(t,f,d); \ + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); \ +} while (0) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/*********************************************************//** +Looks for an element when we know the pointer to the data and deletes +it from the hash table if found. +@return TRUE if found */ +UNIV_INLINE +ibool +ha_search_and_delete_if_found( +/*==========================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data); /*!< in: pointer to the data */ +#ifndef UNIV_HOTBACKUP +/*****************************************************************//** +Removes from the chain determined by fold all nodes whose data pointer +points to the page given. */ +UNIV_INTERN +void +ha_remove_all_nodes_to_page( +/*========================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: fold value */ + const page_t* page); /*!< in: buffer page */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/*************************************************************//** +Validates a given range of the cells in hash table. 
+@return TRUE if ok */ +UNIV_INTERN +ibool +ha_validate( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint start_index, /*!< in: start index */ + ulint end_index); /*!< in: end index */ +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ +/*************************************************************//** +Prints info of a hash table. */ +UNIV_INTERN +void +ha_print_info( +/*==========*/ + FILE* file, /*!< in: file where to print */ + hash_table_t* table); /*!< in: hash table */ +#endif /* !UNIV_HOTBACKUP */ + +/** The hash table external chain node */ +struct ha_node_t { + ha_node_t* next; /*!< next chain node or NULL if none */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block; /*!< buffer block containing the data, or NULL */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data; /*!< pointer to the data */ + ulint fold; /*!< fold value for the data */ +}; + +#ifdef UNIV_DEBUG +/********************************************************************//** +Assert that the synchronization object in a hash operation involving +possible change in the hash table is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held in exclusive mode. */ +UNIV_INLINE +void +hash_assert_can_modify( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold value */ +/********************************************************************//** +Assert that the synchronization object in a hash search operation is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held either in x-mode or s-mode. */ +UNIV_INLINE +void +hash_assert_can_search( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold value */ +#else /* UNIV_DEBUG */ +#define hash_assert_can_modify(t, f) +#define hash_assert_can_search(t, f) +#endif /* UNIV_DEBUG */ + + +#ifndef UNIV_NONINL +#include "ha0ha.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ha0ha.ic b/storage/xtradb/include/ha0ha.ic new file mode 100644 index 00000000000..9d0e396e200 --- /dev/null +++ b/storage/xtradb/include/ha0ha.ic @@ -0,0 +1,246 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ha0ha.ic +The hash table with external chains + +Created 8/18/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0rnd.h" +#include "mem0mem.h" +#include "btr0types.h" + +/***********************************************************//** +Deletes a hash node. 
*/ +UNIV_INTERN +void +ha_delete_hash_node( +/*================*/ + hash_table_t* table, /*!< in: hash table */ + ha_node_t* del_node); /*!< in: node to be deleted */ + +/******************************************************************//** +Gets a hash node data. +@return pointer to the data */ +UNIV_INLINE +const rec_t* +ha_node_get_data( +/*=============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->data); +} + +/******************************************************************//** +Sets hash node data. */ +UNIV_INLINE +void +ha_node_set_data_func( +/*==================*/ + ha_node_t* node, /*!< in: hash chain node */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data) /*!< in: pointer to the data */ +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data = data; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Sets hash node data. +@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** Sets hash node data. +@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/******************************************************************//** +Gets the next node in a hash chain. +@return next node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_next( +/*==============*/ + ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->next); +} + +/******************************************************************//** +Gets the first node in a hash chain. +@return first node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_first( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value determining the chain */ +{ + return((ha_node_t*) + hash_get_nth_cell(table, hash_calc_hash(fold, table))->node); +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Assert that the synchronization object in a hash operation involving +possible change in the hash table is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held in exclusive mode. */ +UNIV_INLINE +void +hash_assert_can_modify( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value */ +{ + if (table->type == HASH_TABLE_SYNC_MUTEX) { + ut_ad(mutex_own(hash_get_mutex(table, fold))); + } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) { +# ifdef UNIV_SYNC_DEBUG + prio_rw_lock_t* lock = hash_get_lock(table, fold); + ut_ad(rw_lock_own(lock, RW_LOCK_EX)); +# endif + } else { + ut_ad(table->type == HASH_TABLE_SYNC_NONE); + } +} + +/********************************************************************//** +Assert that the synchronization object in a hash search operation is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held either in x-mode or s-mode. 
*/ +UNIV_INLINE +void +hash_assert_can_search( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value */ +{ + if (table->type == HASH_TABLE_SYNC_MUTEX) { + ut_ad(mutex_own(hash_get_mutex(table, fold))); + } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) { +# ifdef UNIV_SYNC_DEBUG + prio_rw_lock_t* lock = hash_get_lock(table, fold); + ut_ad(rw_lock_own(lock, RW_LOCK_EX) + || rw_lock_own(lock, RW_LOCK_SHARED)); +# endif + } else { + ut_ad(table->type == HASH_TABLE_SYNC_NONE); + } +} +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Looks for an element in a hash table. +@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: folded value of the searched data */ +{ + ha_node_t* node; + + hash_assert_can_search(table, fold); + ut_ad(btr_search_enabled); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->fold == fold) { + + return(node->data); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +/*********************************************************//** +Looks for an element when we know the pointer to the data. +@return pointer to the hash table node, NULL if not found in the table */ +UNIV_INLINE +ha_node_t* +ha_search_with_data( +/*================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data) /*!< in: pointer to the data */ +{ + ha_node_t* node; + + hash_assert_can_search(table, fold); + + ut_ad(btr_search_enabled); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->data == data) { + + return(node); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +/*********************************************************//** +Looks for an element when we know the pointer to the data, and deletes +it from the hash table, if found. +@return TRUE if found */ +UNIV_INLINE +ibool +ha_search_and_delete_if_found( +/*==========================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data) /*!< in: pointer to the data */ +{ + ha_node_t* node; + + hash_assert_can_modify(table, fold); + ut_ad(btr_search_enabled); + + node = ha_search_with_data(table, fold, data); + + if (node) { + ha_delete_hash_node(table, node); + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/xtradb/include/ha0storage.h b/storage/xtradb/include/ha0storage.h new file mode 100644 index 00000000000..0073930b502 --- /dev/null +++ b/storage/xtradb/include/ha0storage.h @@ -0,0 +1,140 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.h +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#ifndef ha0storage_h +#define ha0storage_h + +#include "univ.i" + +/** This value is used by default by ha_storage_create(). More memory +is allocated later when/if it is needed. */ +#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024 + +/** This value is used by default by ha_storage_create(). It is a +constant per ha_storage's lifetime. */ +#define HA_STORAGE_DEFAULT_HASH_CELLS 4096 + +/** Hash storage */ +struct ha_storage_t; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. +@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells); /*!< in: initial number of cells + in the hash table */ + +/*******************************************************************//** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". +@return pointer to the copy */ +UNIV_INTERN +const void* +ha_storage_put_memlim( +/*==================*/ + ha_storage_t* storage, /*!< in/out: hash storage */ + const void* data, /*!< in: data to store */ + ulint data_len, /*!< in: data length */ + ulint memlim); /*!< in: memory limit to obey */ + +/*******************************************************************//** +Same as ha_storage_put_memlim() but without memory limit. +@param storage in/out: hash storage +@param data in: data to store +@param data_len in: data length +@return pointer to the copy of the string */ +#define ha_storage_put(storage, data, data_len) \ + ha_storage_put_memlim((storage), (data), (data_len), 0) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy. If the +same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@return pointer to the copy of the string */ +#define ha_storage_put_str(storage, str) \ + ((const char*) ha_storage_put((storage), (str), strlen(str) + 1)) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy obeying +a memory limit. +If the same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. 
+@param storage in/out: hash storage +@param str in: string to put +@param memlim in: memory limit to obey +@return pointer to the copy of the string */ +#define ha_storage_put_str_memlim(storage, str, memlim) \ + ((const char*) ha_storage_put_memlim((storage), (str), \ + strlen(str) + 1, (memlim))) + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage); /*!< in/out: hash storage */ + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage); /*!< in, own: hash storage */ + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage); /*!< in: hash storage */ + +#ifndef UNIV_NONINL +#include "ha0storage.ic" +#endif + +#endif /* ha0storage_h */ diff --git a/storage/xtradb/include/ha0storage.ic b/storage/xtradb/include/ha0storage.ic new file mode 100644 index 00000000000..7150ca045ec --- /dev/null +++ b/storage/xtradb/include/ha0storage.ic @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.ic +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 24, 2007 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "ha0storage.h" +#include "hash0hash.h" +#include "mem0mem.h" + +/** Hash storage for strings */ +struct ha_storage_t { + mem_heap_t* heap; /*!< memory heap from which memory is + allocated */ + hash_table_t* hash; /*!< hash table used to avoid + duplicates */ +}; + +/** Objects of this type are stored in ha_storage_t */ +struct ha_storage_node_t { + ulint data_len;/*!< length of the data */ + const void* data; /*!< pointer to data */ + ha_storage_node_t* next; /*!< next node in hash chain */ +}; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. 
+@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells) /*!< in: initial number of cells + in the hash table */ +{ + ha_storage_t* storage; + mem_heap_t* heap; + + if (initial_heap_bytes == 0) { + + initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES; + } + + if (initial_hash_cells == 0) { + + initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS; + } + + /* we put "storage" within "storage->heap" */ + + heap = mem_heap_create(sizeof(ha_storage_t) + + initial_heap_bytes); + + storage = (ha_storage_t*) mem_heap_alloc(heap, + sizeof(ha_storage_t)); + + storage->heap = heap; + storage->hash = hash_create(initial_hash_cells); + + return(storage); +} + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage) /*!< in/out: hash storage */ +{ + ha_storage_t temp_storage; + + temp_storage.heap = (*storage)->heap; + temp_storage.hash = (*storage)->hash; + + hash_table_clear(temp_storage.hash); + mem_heap_empty(temp_storage.heap); + + *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap, + sizeof(ha_storage_t)); + + (*storage)->heap = temp_storage.heap; + (*storage)->hash = temp_storage.hash; +} + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage) /*!< in, own: hash storage */ +{ + /* order is important because the pointer storage->hash is + within the heap */ + hash_table_free(storage->hash); + mem_heap_free(storage->heap); +} + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage) /*!< in: hash storage */ +{ + ulint ret; + + ret = mem_heap_get_size(storage->heap); + + /* this assumes hash->heap and hash->heaps are NULL */ + ret += sizeof(hash_table_t); + ret += sizeof(hash_cell_t) * hash_get_n_cells(storage->hash); + + return(ret); +} diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h new file mode 100644 index 00000000000..2f73a5437a1 --- /dev/null +++ b/storage/xtradb/include/ha_prototypes.h @@ -0,0 +1,613 @@ +/***************************************************************************** + +Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ha_prototypes.h +Prototypes for global functions in ha_innodb.cc that are called by +InnoDB C code + +Created 5/11/2006 Osku Salerma +************************************************************************/ + +#ifndef HA_INNODB_PROTOTYPES_H +#define HA_INNODB_PROTOTYPES_H + +#include "my_dbug.h" +#include "mysqld_error.h" +#include "my_compare.h" +#include "my_sys.h" +#include "m_string.h" +#include "debug_sync.h" + +#include "trx0types.h" +#include "m_ctype.h" /* CHARSET_INFO */ + +// Forward declarations +class Field; +struct fts_string_t; + +/*********************************************************************//** +Wrapper around MySQL's copy_and_convert function. +@return number of bytes copied to 'to' */ +UNIV_INTERN +ulint +innobase_convert_string( +/*====================*/ + void* to, /*!< out: converted string */ + ulint to_length, /*!< in: number of bytes reserved + for the converted string */ + CHARSET_INFO* to_cs, /*!< in: character set to convert to */ + const void* from, /*!< in: string to convert */ + ulint from_length, /*!< in: number of bytes to convert */ + CHARSET_INFO* from_cs, /*!< in: character set to convert + from */ + uint* errors); /*!< out: number of errors encountered + during the conversion */ + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +UNIV_INTERN +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size); /*!< in: output buffer size + in bytes */ + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +UNIV_INTERN +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name, /*!< in: concatenation of + database name, null char NUL, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + ulint full_name_len); /*!< in: full name length where + also the null chars count */ + +/*****************************************************************//** +Convert a table or index name to the MySQL system_charset_info (UTF-8) +and quote it if needed. 
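+For example (editorial note), with table_id == TRUE the internal name
+"test/t1" is rendered as a quoted database.table pair such as
+`test`.`t1`; the quote character depends on the session's SQL mode.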
+@return pointer to the end of buf */ +UNIV_INTERN +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool table_id);/*!< in: TRUE=id is a table or database name; + FALSE=id is an index name */ + +/******************************************************************//** +Returns true if the thread is the replication thread on the slave +server. Used in srv_conc_enter_innodb() to determine if the thread +should be allowed to enter InnoDB - the replication thread is treated +differently than other threads. Also used in +srv_conc_force_exit_innodb(). +@return true if thd is the replication thread */ +UNIV_INTERN +ibool +thd_is_replication_slave_thread( +/*============================*/ + THD* thd); /*!< in: thread handle */ + +/******************************************************************//** +Gets information on the durability property requested by thread. +Used when writing either a prepare or commit record to the log +buffer. +@return the durability property. */ +UNIV_INTERN +enum durability_properties +thd_requested_durability( +/*=====================*/ + const THD* thd) /*!< in: thread handle */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +UNIV_INTERN +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd); /*!< in: thread handle */ + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +UNIV_INTERN +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: pointer to a MySQL THD object */ + uint max_query_len); /*!< in: max query length to print, or 0 to + use the default max length */ + +/*************************************************************//** +InnoDB uses this function to compare two data fields for which the data type +is such that we must use MySQL code to compare them. +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +UNIV_INTERN +int +innobase_mysql_cmp( +/*===============*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number, /*!< in: number of the charset */ + const unsigned char* a, /*!< in: data field */ + unsigned int a_length, /*!< in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /*!< in: data field */ + unsigned int b_length) /*!< in: data field length, + not UNIV_SQL_NULL */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@return DATA_BINARY, DATA_VARCHAR, ... 
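+For instance (editorial example): an INT UNSIGNED column maps to mtype
+DATA_INT with DATA_UNSIGNED set in *unsigned_flag, while a CHAR column
+in latin1 maps to DATA_CHAR.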
 */
+UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+	ulint*		unsigned_flag,	/*!< out: DATA_UNSIGNED if an
+					'unsigned type';
+					at least ENUM and SET,
+					and unsigned integer
+					types are 'unsigned types' */
+	const void*	field)		/*!< in: MySQL Field */
+	__attribute__((nonnull));
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+	ulint	cset,		/*!< in: MySQL charset-collation code */
+	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
+	ulint*	mbmaxlen);	/*!< out: maximum length of a char (in bytes) */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+	const char*	a,	/*!< in: first string to compare */
+	const char*	b);	/*!< in: second string to compare */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+	const char*	a,	/*!< in: string to compare */
+	const char*	b);	/*!< in: wildcard string to compare */
+
+/******************************************************************//**
+Strip dir name from a full path name and return only its file name.
+@return file name or "null" if no file name */
+UNIV_INTERN
+const char*
+innobase_basename(
+/*==============*/
+	const char*	path_name);	/*!< in: full path name */
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return true if thd is executing SELECT */
+UNIV_INTERN
+ibool
+thd_is_select(
+/*==========*/
+	const THD*	thd);	/*!< in: thread handle */
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
+	char*			to,	/*!< out: converted identifier */
+	const char*		from,	/*!< in: identifier to convert */
+	ulint			len);	/*!< in: length of 'to', in bytes; should
+					be at least 5 * strlen(to) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
+	char*			to,	/*!< out: converted identifier */
+	const char*		from,	/*!< in: identifier to convert */
+	ulint			len);	/*!< in: length of 'to', in bytes;
+					should be at least 3 * strlen(to) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+	char*	a);	/*!< in/out: string to put in lower case */
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */
+UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+	THD*	thd);	/*!< in: MySQL thread handle */
+/**********************************************************************//**
+Determines the current SQL statement.
+@return SQL statement string */
+UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+	THD*	thd,		/*!< in: MySQL thread handle */
+	size_t*	length)		/*!< out: length of the SQL statement */
+	__attribute__((nonnull));
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+	ulint		charset_id,	/*!< in: character set id */
+	ulint		prefix_len,	/*!< in: prefix length in bytes of the index
+					(this has to be divided by mbmaxlen to get the
+					number of CHARACTERS n in the prefix) */
+	ulint		data_len,	/*!< in: length of the string in bytes */
+	const char*	str);		/*!< in: character string */
+
+/*************************************************************//**
+InnoDB index push-down condition check
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+UNIV_INTERN
+enum icp_result
+innobase_index_cond(
+/*================*/
+	void*	file)	/*!< in/out: pointer to ha_innobase */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Returns true if the thread supports XA,
+global value of innodb_supports_xa if thd is NULL.
+@return true if thd supports XA */
+UNIV_INTERN
+ibool
+thd_supports_xa(
+/*============*/
+	THD*	thd);	/*!< in: thread handle, or NULL to query
+			the global innodb_supports_xa */
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+UNIV_INTERN
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+	THD*	thd);	/*!< in: thread handle, or NULL to query
+			the global innodb_lock_wait_timeout */
+/******************************************************************//**
+Add up the time waited for the lock for the current query. */
+UNIV_INTERN
+void
+thd_set_lock_wait_time(
+/*===================*/
+	THD*	thd,	/*!< in/out: thread handle */
+	ulint	value);	/*!< in: time waited for the lock */
+
+/**********************************************************************//**
+Get the current setting of the table_cache_size global parameter. We do
+a dirty read because for one there is no synchronization object and
+secondly there is little harm in doing so even if we get a torn read.
+@return value of table_cache_size */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void);
+/*===============================*/
+
+/******************************************************************//**
+Get the value of the innodb_flush_log_at_trx_commit setting for the
+current connection. */
+ulong
+thd_flush_log_at_trx_commit(
+/*================================*/
+	void*	thd);
+
+/**********************************************************************//**
+Get the current setting of the lower_case_table_names global parameter from
+mysqld.cc. We do a dirty read because for one there is no synchronization
+object and secondly there is little harm in doing so even if we get a torn
+read.
+@return value of lower_case_table_names */
+UNIV_INTERN
+ulint
+innobase_get_lower_case_table_names(void);
+/*=====================================*/
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+UNIV_INTERN
+int
+innobase_close_thd(
+/*===============*/
+	THD*	thd);	/*!< in: MySQL thread handle for
+			which to close the connection */
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+UNIV_INTERN
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,	/*!< in: Character set */
+	const byte*	start,		/*!< in: start of text */
+	const byte*	end,		/*!< in: one character past end of
+					text */
+	fts_string_t*	token,		/*!< out: token's text */
+	ulint*		offset);	/*!< out: offset to token,
+					measured as characters from
+					'start' */
+
+/******************************************************************//**
+Compares two character strings case insensitively according to their
+charset. */
+UNIV_INTERN
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2);	/*!< in: node */
+
+/****************************************************************//**
+Get FTS field charset info from the field's prtype
+@return charset info */
+UNIV_INTERN
+CHARSET_INFO*
+innobase_get_fts_charset(
+/*=====================*/
+	int	mysql_type,	/*!< in: MySQL type */
+	uint	charset_number);/*!< in: number of the charset */
+/******************************************************************//**
+Returns true if the transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+	THD*	thd);	/*!< in/out: thread handle */
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+	THD*	thd);	/*!< in: thread handle, or NULL */
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(): converts a table or
+index name to the MySQL system_charset_info (UTF-8) and quotes it if
+needed; the result is written to buf. */
+UNIV_INTERN
+void
+innobase_format_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted
+				identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	name,	/*!< in: index or table name
+				to format */
+	ibool		is_index_name)	/*!< in: TRUE if an index name */
+	__attribute__((nonnull));
+
+/** Corresponds to Sql_condition::enum_warning_level. */
+enum ib_log_level_t {
+	IB_LOG_LEVEL_INFO,
+	IB_LOG_LEVEL_WARN,
+	IB_LOG_LEVEL_ERROR,
+	IB_LOG_LEVEL_FATAL
+};
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_errf(
+/*====*/
+	THD*		thd,	/*!< in/out: session */
+	ib_log_level_t	level,	/*!< in: warning level */
+	ib_uint32_t	code,	/*!< in: MySQL error code */
+	const char*	format,	/*!< in: printf format */
+	...)			/*!< Args */
+	__attribute__((format(printf, 4, 5)));
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+	THD *thd, Sql_condition::enum_warning_level level,
+	uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_senderrf(
+/*========*/
+	THD*		thd,	/*!< in/out: session */
+	ib_log_level_t	level,	/*!< in: warning level */
+	ib_uint32_t	code,	/*!< in: MySQL error code */
+	...);			/*!< Args */
+
+/******************************************************************//**
+Write a message to the MySQL log, prefixed with "InnoDB: ".
+Wrapper around sql_print_information() */
+UNIV_INTERN
+void
+ib_logf(
+/*====*/
+	ib_log_level_t	level,	/*!< in: warning level */
+	const char*	format,	/*!< in: printf format */
+	...)			/*!< Args */
+	__attribute__((format(printf, 2, 3)));
+
+/******************************************************************//**
+Returns the NUL-terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+UNIV_INTERN
+const char*
+server_get_hostname();
+/*=================*/
+
+/******************************************************************//**
+Get the error message format string.
+@return the format string or 0 if not found. */
+UNIV_INTERN
+const char*
+innobase_get_err_msg(
+/*=================*/
+	int	error_code);	/*!< in: MySQL error code */
+
+/******************************************************************//**
+Returns true if innodb_expand_fast_index_creation is enabled for the current
+session.
+@return the value of the session's innodb_expand_fast_index_creation variable */
+
+ibool
+thd_expand_fast_index_creation(
+/*===========================*/
+	void*	thd);	/*!< in: thread handle (THD*) */
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+	INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
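A minimal sketch of the allocation arithmetic described above (hypothetical code, not the actual innobase_next_autoinc() body; overflow handling is omitted):

	/* Allocate "need" values from the progression
	offset, offset + step, offset + 2 * step, ...
	strictly above "current", clamping at max_value. */
	static ulonglong
	next_autoinc_sketch(
		ulonglong	current,
		ulonglong	need,
		ulonglong	step,
		ulonglong	offset,
		ulonglong	max_value)
	{
		if (current >= max_value || step == 0) {
			return(max_value);
		}

		/* number of steps already taken past the offset */
		ulonglong	taken = (current >= offset)
			? (current - offset) / step + 1 : 0;

		/* reserve "need" consecutive slots for this node */
		ulonglong	next = offset + (taken + need - 1) * step;

		return(next > max_value ? max_value : next);
	}

For example, with three nodes (step = 3) the node with offset = 2 at current = 10 would reserve need = 3 values 11, 14 and 17 and return 17; every value it hands out stays congruent to 2 modulo 3, so the nodes never collide.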
+@return the next value */ +UNIV_INTERN +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ + __attribute__((pure, warn_unused_result)); + +/********************************************************************//** +Get the upper limit of the MySQL integral and floating-point type. +@return maximum allowed value for the field */ +UNIV_INTERN +ulonglong +innobase_get_int_col_max_value( +/*===========================*/ + const Field* field) /*!< in: MySQL field */ + __attribute__((nonnull, pure, warn_unused_result)); + +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +The input to this function is an identifier in charset my_charset_filename. +return true when length of identifier is too long. */ +UNIV_INTERN +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id); /* in: identifier to check. it must belong + to charset my_charset_filename */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_filename_charset( +/*=================================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len); /* in: length of 'to', in bytes */ + + +#endif /* HA_INNODB_PROTOTYPES_H */ diff --git a/storage/xtradb/include/handler0alter.h b/storage/xtradb/include/handler0alter.h new file mode 100644 index 00000000000..66b963ae39a --- /dev/null +++ b/storage/xtradb/include/handler0alter.h @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/handler0alter.h +Smart ALTER TABLE +*******************************************************/ + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. 
*/
+UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: index */
+	const ulint*		offsets)/*!< in: rec_get_offsets(
+					rec, index, ...) */
+	__attribute__((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. */
+UNIV_INTERN
+void
+innobase_fields_to_mysql(
+/*=====================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const dfield_t*		fields)	/*!< in: InnoDB index fields */
+	__attribute__((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. */
+UNIV_INTERN
+void
+innobase_row_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_table_t*	itab,	/*!< in: InnoDB table */
+	const dtuple_t*		row)	/*!< in: InnoDB row */
+	__attribute__((nonnull));
+
+/*************************************************************//**
+Resets table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+	struct TABLE*		table)	/*!< in/out: MySQL table */
+	__attribute__((nonnull));
+
+/** Generate the next autoinc based on a snapshot of the session
+auto_increment_increment and auto_increment_offset variables. */
+struct ib_sequence_t {
+
+	/**
+	@param thd - the session
+	@param start_value - the lower bound
+	@param max_value - the upper bound (inclusive) */
+	ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value);
+
+	/**
+	Postfix increment
+	@return the value to insert */
+	ulonglong operator++(int) UNIV_NOTHROW;
+
+	/** Check if the autoinc "sequence" is exhausted.
+	@return true if the sequence is exhausted */
+	bool eof() const UNIV_NOTHROW
+	{
+		return(m_eof);
+	}
+
+	/**
+	@return the next value in the sequence */
+	ulonglong last() const UNIV_NOTHROW
+	{
+		ut_ad(m_next_value > 0);
+
+		return(m_next_value);
+	}
+
+	/** Maximum column value if adding an AUTOINC column else 0. Once
+	we reach the end of the sequence it will be set to ~0. */
+	const ulonglong	m_max_value;
+
+	/** Value of auto_increment_increment */
+	ulong		m_increment;
+
+	/** Value of auto_increment_offset */
+	ulong		m_offset;
+
+	/** Next value in the sequence */
+	ulonglong	m_next_value;
+
+	/** true if no more values left in the sequence */
+	bool		m_eof;
+};
diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h
new file mode 100644
index 00000000000..a6fe4e680a1
--- /dev/null
+++ b/storage/xtradb/include/hash0hash.h
@@ -0,0 +1,576 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef hash0hash_h
+#define hash0hash_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+
+struct hash_table_t;
+struct hash_cell_t;
+
+typedef void*	hash_node_t;
+
+/* Fix Bug #13859: symbol collision between imap/mysql */
+#define hash_create hash0_create
+
+/* Different types of hash_table based on the synchronization
+method used for it. */
+enum hash_table_sync_t {
+	HASH_TABLE_SYNC_NONE = 0,	/*!< Don't use any internal
+					synchronization objects for
+					this hash_table. */
+	HASH_TABLE_SYNC_MUTEX,		/*!< Use mutexes to control
+					access to this hash_table. */
+	HASH_TABLE_SYNC_RW_LOCK		/*!< Use rw_locks to control
+					access to this hash_table. */
+};
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+	ulint	n);	/*!< in: number of array cells */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a sync object array to protect a hash table.
+::sync_obj can be mutexes or rw_locks depending on the type of
+hash table. */
+UNIV_INTERN
+void
+hash_create_sync_obj_func(
+/*======================*/
+	hash_table_t*		table,	/*!< in: hash table */
+	enum hash_table_sync_t	type,	/*!< in: HASH_TABLE_SYNC_MUTEX
+					or HASH_TABLE_SYNC_RW_LOCK */
+#ifdef UNIV_SYNC_DEBUG
+	ulint			sync_level,/*!< in: latching order level
+					of the mutexes: used in the
+					debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint			n_sync_obj);/*!< in: number of sync objects,
+					must be a power of 2 */
+#ifdef UNIV_SYNC_DEBUG
+# define hash_create_sync_obj(t, s, n, level)	\
+	hash_create_sync_obj_func(t, s, level, n)
+#else /* UNIV_SYNC_DEBUG */
+# define hash_create_sync_obj(t, s, n, level)	\
+	hash_create_sync_obj_func(t, s, n)
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+	hash_table_t*	table);	/*!< in, own: hash table */
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+	ulint		fold,	/*!< in: folded value */
+	hash_table_t*	table);	/*!< in: hash table */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Assert that the mutex for the table is held */
+# define HASH_ASSERT_OWN(TABLE, FOLD)			\
+	ut_ad((TABLE)->type != HASH_TABLE_SYNC_MUTEX	\
+	      || (mutex_own(hash_get_mutex((TABLE), FOLD))));
+#else /* !UNIV_HOTBACKUP */
+# define HASH_ASSERT_OWN(TABLE, FOLD)
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Inserts a struct to a hash table.
*/ + +#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + HASH_ASSERT_OWN(TABLE, FOLD)\ +\ + (DATA)->NAME = NULL;\ +\ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ +\ + if (cell3333->node == NULL) {\ + cell3333->node = DATA;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != NULL) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + }\ +\ + struct3333->NAME = DATA;\ + }\ +} while (0) + +#ifdef UNIV_HASH_DEBUG +# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) +# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1 +#else +# define HASH_ASSERT_VALID(DATA) do {} while (0) +# define HASH_INVALIDATE(DATA, NAME) do {} while (0) +#endif + +/*******************************************************************//** +Deletes a struct from a hash table. */ + +#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + HASH_ASSERT_OWN(TABLE, FOLD)\ +\ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ +\ + if (cell3333->node == DATA) {\ + HASH_ASSERT_VALID(DATA->NAME);\ + cell3333->node = DATA->NAME;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != DATA) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + ut_a(struct3333);\ + }\ +\ + struct3333->NAME = DATA->NAME;\ + }\ + HASH_INVALIDATE(DATA, NAME);\ +} while (0) + +/*******************************************************************//** +Gets the first struct in a hash chain, NULL if none. */ + +#define HASH_GET_FIRST(TABLE, HASH_VAL)\ + (hash_get_nth_cell(TABLE, HASH_VAL)->node) + +/*******************************************************************//** +Gets the next struct in a hash chain, NULL if none. */ + +#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME) + +/********************************************************************//** +Looks for a struct in a hash table. */ +#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\ +{\ +\ + HASH_ASSERT_OWN(TABLE, FOLD)\ +\ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\ + HASH_ASSERT_VALID(DATA);\ +\ + while ((DATA) != NULL) {\ + ASSERTION;\ + if (TEST) {\ + break;\ + } else {\ + HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\ + }\ + }\ +} + +/********************************************************************//** +Looks for an item in all hash buckets. */ +#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \ +do { \ + ulint i3333; \ + \ + for (i3333 = (TABLE)->n_cells; i3333--; ) { \ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \ + \ + while ((DATA) != NULL) { \ + HASH_ASSERT_VALID(DATA); \ + ASSERTION; \ + \ + if (TEST) { \ + break; \ + } \ + \ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \ + } \ + \ + if ((DATA) != NULL) { \ + break; \ + } \ + } \ +} while (0) + +/************************************************************//** +Gets the nth cell in a hash table. +@return pointer to cell */ +UNIV_INLINE +hash_cell_t* +hash_get_nth_cell( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint n); /*!< in: cell index */ + +/*************************************************************//** +Clears a hash table so that all the cells become empty. 
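A hypothetical usage sketch of the intrusive chaining macros above; the node type supplies its own link field, whose identifier is passed as the NAME argument (item_t and item_table_demo() are invented for illustration):

	struct item_t {
		ulint	id;	/* the key that is folded */
		item_t*	hash;	/* intrusive chain link, the NAME below */
	};

	void
	item_table_demo(hash_table_t* table, item_t* item)
	{
		ulint	fold = item->id;	/* any fold of the key */
		item_t*	found;

		/* for a HASH_TABLE_SYNC_MUTEX table the caller must
		already hold hash_get_mutex(table, fold), or
		HASH_ASSERT_OWN() fires */
		HASH_INSERT(item_t, hash, table, fold, item);

		/* walk the chain of the cell for "fold" until TEST
		matches; "found" is NULL if nothing matched */
		HASH_SEARCH(hash, table, fold, item_t*, found,
			    ut_ad(found != NULL),
			    found->id == item->id);

		if (found != NULL) {
			HASH_DELETE(item_t, hash, table, fold, found);
		}
	}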
*/ +UNIV_INLINE +void +hash_table_clear( +/*=============*/ + hash_table_t* table); /*!< in/out: hash table */ + +/*************************************************************//** +Returns the number of cells in a hash table. +@return number of cells */ +UNIV_INLINE +ulint +hash_get_n_cells( +/*=============*/ + hash_table_t* table); /*!< in: table */ +/*******************************************************************//** +Deletes a struct which is stored in the heap of the hash table, and compacts +the heap. The fold value must be stored in the struct NODE in a field named +'fold'. */ + +#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\ +do {\ + TYPE* node111;\ + TYPE* top_node111;\ + hash_cell_t* cell111;\ + ulint fold111;\ +\ + fold111 = (NODE)->fold;\ +\ + HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\ +\ + top_node111 = (TYPE*) mem_heap_get_top(\ + hash_get_heap(TABLE, fold111),\ + sizeof(TYPE));\ +\ + /* If the node to remove is not the top node in the heap, compact the\ + heap of nodes by moving the top node in the place of NODE. */\ +\ + if (NODE != top_node111) {\ +\ + /* Copy the top node in place of NODE */\ +\ + *(NODE) = *top_node111;\ +\ + cell111 = hash_get_nth_cell(TABLE,\ + hash_calc_hash(top_node111->fold, TABLE));\ +\ + /* Look for the pointer to the top node, to update it */\ +\ + if (cell111->node == top_node111) {\ + /* The top node is the first in the chain */\ +\ + cell111->node = NODE;\ + } else {\ + /* We have to look for the predecessor of the top\ + node */\ + node111 = static_cast<TYPE*>(cell111->node);\ +\ + while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\ +\ + node111 = static_cast<TYPE*>(\ + HASH_GET_NEXT(NAME, node111));\ + }\ +\ + /* Now we have the predecessor node */\ +\ + node111->NAME = NODE;\ + }\ + }\ +\ + /* Free the space occupied by the top node */\ +\ + mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\ +} while (0) + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Move all hash table entries from OLD_TABLE to NEW_TABLE. */ + +#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \ +do {\ + ulint i2222;\ + ulint cell_count2222;\ +\ + cell_count2222 = hash_get_n_cells(OLD_TABLE);\ +\ + for (i2222 = 0; i2222 < cell_count2222; i2222++) {\ + NODE_TYPE* node2222 = HASH_GET_FIRST((OLD_TABLE), i2222);\ +\ + while (node2222) {\ + NODE_TYPE* next2222 = node2222->PTR_NAME;\ + ulint fold2222 = FOLD_FUNC(node2222);\ +\ + HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\ + fold2222, node2222);\ +\ + node2222 = next2222;\ + }\ + }\ +} while (0) + +/************************************************************//** +Gets the sync object index for a fold value in a hash table. +@return index */ +UNIV_INLINE +ulint +hash_get_sync_obj_index( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Gets the nth heap in a hash table. +@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_nth_heap( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i); /*!< in: index of the heap */ +/************************************************************//** +Gets the heap for a fold value in a hash table. 
+@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_heap( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Gets the nth mutex in a hash table. +@return mutex */ +UNIV_INLINE +ib_prio_mutex_t* +hash_get_nth_mutex( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i); /*!< in: index of the mutex */ +/************************************************************//** +Gets the nth rw_lock in a hash table. +@return rw_lock */ +UNIV_INLINE +prio_rw_lock_t* +hash_get_nth_lock( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i); /*!< in: index of the rw_lock */ +/************************************************************//** +Gets the mutex for a fold value in a hash table. +@return mutex */ +UNIV_INLINE +ib_prio_mutex_t* +hash_get_mutex( +/*===========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Gets the rw_lock for a fold value in a hash table. +@return rw_lock */ +UNIV_INLINE +prio_rw_lock_t* +hash_get_lock( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Reserves the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_enter( +/*=============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Releases the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_exit( +/*============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Reserves all the mutexes of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_mutex_enter_all( +/*=================*/ + hash_table_t* table); /*!< in: hash table */ +/************************************************************//** +Releases all the mutexes of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all( +/*================*/ + hash_table_t* table); /*!< in: hash table */ +/************************************************************//** +Releases all but the passed in mutex of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all_but( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ib_prio_mutex_t* keep_mutex); /*!< in: mutex to keep */ +/************************************************************//** +s-lock a lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_lock_s( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +x-lock a lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_lock_x( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +unlock an s-lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_unlock_s( +/*==========*/ + + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +unlock x-lock for a fold value in a hash table. 
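A plausible caller pattern for the per-fold latching functions above (sketch only; which family applies depends on the table's hash_table_sync_t type):

	/* HASH_TABLE_SYNC_MUTEX table */
	hash_mutex_enter(table, fold);
	/* ... HASH_INSERT()/HASH_DELETE() on this fold ... */
	hash_mutex_exit(table, fold);

	/* HASH_TABLE_SYNC_RW_LOCK table, read-only access */
	hash_lock_s(table, fold);
	/* ... HASH_SEARCH() on this fold ... */
	hash_unlock_s(table, fold);

All folds that map to the same sync object share one latch, so each latch serializes only a 1/n_sync_obj slice of the table.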
*/
+UNIV_INTERN
+void
+hash_unlock_x(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Reserves all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_lock_x_all(
+/*============*/
+	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_unlock_x_all(
+/*==============*/
+	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all but the passed in lock of a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x_all_but(
+/*==================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	prio_rw_lock_t*	keep_lock);	/*!< in: lock to keep */
+
+#else /* !UNIV_HOTBACKUP */
+# define hash_get_heap(table, fold)	((table)->heap)
+# define hash_mutex_enter(table, fold)	((void) 0)
+# define hash_mutex_exit(table, fold)	((void) 0)
+# define hash_mutex_enter_all(table)	((void) 0)
+# define hash_mutex_exit_all(table)	((void) 0)
+# define hash_mutex_exit_all_but(t, m)	((void) 0)
+# define hash_lock_s(t, f)		((void) 0)
+# define hash_lock_x(t, f)		((void) 0)
+# define hash_unlock_s(t, f)		((void) 0)
+# define hash_unlock_x(t, f)		((void) 0)
+# define hash_lock_x_all(t)		((void) 0)
+# define hash_unlock_x_all(t)		((void) 0)
+# define hash_unlock_x_all_but(t, l)	((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+struct hash_cell_t{
+	void*	node;	/*!< hash chain node, NULL if none */
+};
+
+/* The hash table structure */
+struct hash_table_t {
+	enum hash_table_sync_t	type;	/*!< type of hash_table. */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+	ibool			adaptive;/* TRUE if this is the hash
+					table of the adaptive hash
+					index */
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	ulint			n_cells;/* number of cells in the hash table */
+	hash_cell_t*		array;	/*!< pointer to cell array */
+#ifndef UNIV_HOTBACKUP
+	ulint			n_sync_obj;/* if sync_obj is non-NULL, the
+					number of mutexes or rw_locks in it,
+					depending on the type; must be a
+					power of 2 */
+	union {
+		ib_prio_mutex_t*	mutexes;
+					/* NULL, or an array of mutexes
+					used to protect segments of the
+					hash table */
+		prio_rw_lock_t*		rw_locks;/* NULL, or an array of rw_locks
+					used to protect segments of the
+					hash table */
+	} sync_obj;
+
+	mem_heap_t**		heaps;	/*!< if this is non-NULL, hash
+					chain nodes for external chaining
+					can be allocated from these memory
+					heaps; there are then n_sync_obj
+					many of these heaps */
+#endif /* !UNIV_HOTBACKUP */
+	mem_heap_t*		heap;
+#ifdef UNIV_DEBUG
+	ulint			magic_n;
+# define HASH_TABLE_MAGIC_N	76561114
+#endif /* UNIV_DEBUG */
+};
+
+#ifndef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/hash0hash.ic b/storage/xtradb/include/hash0hash.ic
new file mode 100644
index 00000000000..e4822538e19
--- /dev/null
+++ b/storage/xtradb/include/hash0hash.ic
@@ -0,0 +1,225 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/hash0hash.ic +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#include "ut0rnd.h" + +/************************************************************//** +Gets the nth cell in a hash table. +@return pointer to cell */ +UNIV_INLINE +hash_cell_t* +hash_get_nth_cell( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint n) /*!< in: cell index */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(n < table->n_cells); + + return(table->array + n); +} + +/*************************************************************//** +Clears a hash table so that all the cells become empty. */ +UNIV_INLINE +void +hash_table_clear( +/*=============*/ + hash_table_t* table) /*!< in/out: hash table */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + memset(table->array, 0x0, + table->n_cells * sizeof(*table->array)); +} + +/*************************************************************//** +Returns the number of cells in a hash table. +@return number of cells */ +UNIV_INLINE +ulint +hash_get_n_cells( +/*=============*/ + hash_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + return(table->n_cells); +} + +/**************************************************************//** +Calculates the hash value from a folded value. +@return hashed value */ +UNIV_INLINE +ulint +hash_calc_hash( +/*===========*/ + ulint fold, /*!< in: folded value */ + hash_table_t* table) /*!< in: hash table */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + return(ut_hash_ulint(fold, table->n_cells)); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Gets the sync object index for a fold value in a hash table. +@return index */ +UNIV_INLINE +ulint +hash_get_sync_obj_index( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type != HASH_TABLE_SYNC_NONE); + ut_ad(ut_is_2pow(table->n_sync_obj)); + return(ut_2pow_remainder(hash_calc_hash(fold, table), + table->n_sync_obj)); +} + +/************************************************************//** +Gets the nth heap in a hash table. +@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_nth_heap( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i) /*!< in: index of the heap */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type != HASH_TABLE_SYNC_NONE); + ut_ad(i < table->n_sync_obj); + + return(table->heaps[i]); +} + +/************************************************************//** +Gets the heap for a fold value in a hash table. 
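Because n_sync_obj is asserted to be a power of two, the ut_2pow_remainder() call above amounts to a bit mask; a hedged equivalent of the computation:

	/* equivalent of hash_get_sync_obj_index(), assuming
	n_sync_obj == 2^k as asserted above */
	ulint	i = hash_calc_hash(fold, table) & (table->n_sync_obj - 1);

Consecutive cells are thus spread across the n_sync_obj mutexes or rw-locks, which keeps unrelated folds from contending on a single latch.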
+@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_heap( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + + if (table->heap) { + return(table->heap); + } + + i = hash_get_sync_obj_index(table, fold); + + return(hash_get_nth_heap(table, i)); +} + +/************************************************************//** +Gets the nth mutex in a hash table. +@return mutex */ +UNIV_INLINE +ib_prio_mutex_t* +hash_get_nth_mutex( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i) /*!< in: index of the mutex */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); + ut_ad(i < table->n_sync_obj); + + return(table->sync_obj.mutexes + i); +} + +/************************************************************//** +Gets the mutex for a fold value in a hash table. +@return mutex */ +UNIV_INLINE +ib_prio_mutex_t* +hash_get_mutex( +/*===========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + + i = hash_get_sync_obj_index(table, fold); + + return(hash_get_nth_mutex(table, i)); +} + +/************************************************************//** +Gets the nth rw_lock in a hash table. +@return rw_lock */ +UNIV_INLINE +prio_rw_lock_t* +hash_get_nth_lock( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i) /*!< in: index of the rw_lock */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(i < table->n_sync_obj); + + return(table->sync_obj.rw_locks + i); +} + +/************************************************************//** +Gets the rw_lock for a fold value in a hash table. +@return rw_lock */ +UNIV_INLINE +prio_rw_lock_t* +hash_get_lock( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ulint i; + + ut_ad(table); + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + + i = hash_get_sync_obj_index(table, fold); + + return(hash_get_nth_lock(table, i)); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/ibuf0ibuf.h b/storage/xtradb/include/ibuf0ibuf.h new file mode 100644 index 00000000000..ac16b10e097 --- /dev/null +++ b/storage/xtradb/include/ibuf0ibuf.h @@ -0,0 +1,483 @@ +/***************************************************************************** + +Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0ibuf.h +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0ibuf_h +#define ibuf0ibuf_h + +#include "univ.i" + +#include "mtr0mtr.h" +#include "dict0mem.h" +#include "fsp0fsp.h" + +#ifndef UNIV_HOTBACKUP +# include "ibuf0types.h" + +/** Default value for maximum on-disk size of change buffer in terms +of percentage of the buffer pool. */ +#define CHANGE_BUFFER_DEFAULT_SIZE (25) + +/* Possible operations buffered in the insert/whatever buffer. See +ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */ +typedef enum { + IBUF_OP_INSERT = 0, + IBUF_OP_DELETE_MARK = 1, + IBUF_OP_DELETE = 2, + + /* Number of different operation types. */ + IBUF_OP_COUNT = 3 +} ibuf_op_t; + +/** Combinations of operations that can be buffered. Because the enum +values are used for indexing innobase_change_buffering_values[], they +should start at 0 and there should not be any gaps. */ +typedef enum { + IBUF_USE_NONE = 0, + IBUF_USE_INSERT, /* insert */ + IBUF_USE_DELETE_MARK, /* delete */ + IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */ + IBUF_USE_DELETE, /* delete+purge */ + IBUF_USE_ALL, /* insert+delete+purge */ + + IBUF_USE_COUNT /* number of entries in ibuf_use_t */ +} ibuf_use_t; + +/** Operations that can currently be buffered. */ +extern ibuf_use_t ibuf_use; + +/** The insert buffer control structure */ +extern ibuf_t* ibuf; + +/* The purpose of the insert buffer is to reduce random disk access. +When we wish to insert a record into a non-unique secondary index and +the B-tree leaf page where the record belongs to is not in the buffer +pool, we insert the record into the insert buffer B-tree, indexed by +(space_id, page_no). When the page is eventually read into the buffer +pool, we look up the insert buffer B-tree for any modifications to the +page, and apply these upon the completion of the read operation. This +is called the insert buffer merge. */ + +/* The insert buffer merge must always succeed. To guarantee this, +the insert buffer subsystem keeps track of the free space in pages for +which it can buffer operations. Two bits per page in the insert +buffer bitmap indicate the available space in coarse increments. The +free bits in the insert buffer bitmap must never exceed the free space +on a page. It is safe to decrement or reset the bits in the bitmap in +a mini-transaction that is committed before the mini-transaction that +affects the free space. It is unsafe to increment the bits in a +separately committed mini-transaction, because in crash recovery, the +free bits could momentarily be set too high. */ + +/******************************************************************//** +Creates the insert buffer data structure at a database startup. */ +UNIV_INTERN +void +ibuf_init_at_db_start(void); +/*=======================*/ +/*********************************************************************//** +Updates the max_size value for ibuf. 
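A condensed restatement of the two paragraphs above, with a hedged sketch of the write path (page_is_cached() and apply_to_page() are invented names):

	/* change arriving at a non-unique secondary index leaf page */
	if (page_is_cached(space_id, page_no)) {
		apply_to_page();	/* ordinary in-memory change */
	} else {
		/* ibuf_insert(): remember the operation in the ibuf
		B-tree under the key (space_id, page_no); it is
		replayed when the page is finally read into the
		buffer pool -- the "insert buffer merge" */
	}

The bitmap invariant stated above can be written as: after any crash, free_bits(page) <= real_free_space(page) must hold, because recovery may replay the bitmap mini-transaction without the data-page one. Decrementing the bits in an earlier-committed mini-transaction can only make the bitmap pessimistic, which is safe; incrementing them separately could make it optimistic, which is not.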
*/ +UNIV_INTERN +void +ibuf_max_size_update( +/*=================*/ + ulint new_val); /*!< in: new value in terms of + percentage of the buffer pool size */ +/*********************************************************************//** +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ +UNIV_INTERN +void +ibuf_update_max_tablespace_id(void); +/*===============================*/ +/***************************************************************//** +Starts an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_start( +/*===========*/ + mtr_t* mtr) /*!< out: mini-transaction */ + __attribute__((nonnull)); +/***************************************************************//** +Commits an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_commit( +/*============*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*********************************************************************//** +Initializes an ibuf bitmap page. */ +UNIV_INTERN +void +ibuf_bitmap_page_init( +/*==================*/ + buf_block_t* block, /*!< in: bitmap page */ + mtr_t* mtr); /*!< in: mtr */ +/************************************************************************//** +Resets the free bits of the page in the ibuf bitmap. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to decrement or reset the bits in the bitmap in a mini-transaction +that is committed before the mini-transaction that affects the free +space. */ +UNIV_INTERN +void +ibuf_reset_free_bits( +/*=================*/ + buf_block_t* block); /*!< in: index page; free bits are set to 0 + if the index is a non-clustered + non-unique, and page level is 0 */ +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. */ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase);/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +/**********************************************************************//** +Updates the free bits for an uncompressed page to reflect the present +state. 
Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_low( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + ulint max_ins_size, /*!< in: value of + maximum insert size + with reorganize before + the latest operation + performed to the page */ + mtr_t* mtr); /*!< in/out: mtr */ +/**********************************************************************//** +Updates the free bits for a compressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_zip( +/*======================*/ + buf_block_t* block, /*!< in/out: index page */ + mtr_t* mtr); /*!< in/out: mtr */ +/**********************************************************************//** +Updates the free bits for the two pages to reflect the present state. +Does this in the mtr given, which means that the latching order rules +virtually prevent any further operations until mtr is committed. +NOTE: The free bits in the insert buffer bitmap must never exceed the +free space on a page. It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +UNIV_INTERN +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique); /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull, pure)); +/***********************************************************************//** +Checks if a page address is an ibuf bitmap page (level 3 page) address. +@return TRUE if a bitmap page */ +UNIV_INLINE +ibool +ibuf_bitmap_page( +/*=============*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no);/*!< in: page number */ +/***********************************************************************//** +Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==TRUE. 
+@return TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page_low(
+/*==========*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint		page_no,/*!< in: page number */
+#ifdef UNIV_DEBUG
+	ibool		x_latch,/*!< in: FALSE if relaxed check
+				(avoid latching the bitmap page) */
+#endif /* UNIV_DEBUG */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr which will contain an
+				x-latch to the bitmap page if the page
+				is not one of the fixed address ibuf
+				pages, or NULL, in which case a new
+				transaction is created. */
+	__attribute__((warn_unused_result));
+#ifdef UNIV_DEBUG
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of
+pages. Must not be called when recv_no_ibuf_operations==TRUE.
+@param space	tablespace identifier
+@param zip_size	compressed page size in bytes, or 0
+@param page_no	page number
+@param mtr	mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(space, zip_size, page_no, mtr)	\
+	ibuf_page_low(space, zip_size, page_no, TRUE, __FILE__, __LINE__, mtr)
+#else /* UNIV_DEBUG */
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of
+pages. Must not be called when recv_no_ibuf_operations==TRUE.
+@param space	tablespace identifier
+@param zip_size	compressed page size in bytes, or 0
+@param page_no	page number
+@param mtr	mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(space, zip_size, page_no, mtr)	\
+	ibuf_page_low(space, zip_size, page_no, __FILE__, __LINE__, mtr)
+#endif /* UNIV_DEBUG */
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+/*********************************************************************//**
+Buffer an operation in the insert/delete buffer, instead of doing it
+directly to the disk page, if this is possible. Does not do it if the index
+is clustered or unique.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+	ibuf_op_t	op,	/*!< in: operation type */
+	const dtuple_t*	entry,	/*!< in: index entry to insert */
+	dict_index_t*	index,	/*!< in: index where to insert */
+	ulint		space,	/*!< in: space id where to insert */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint		page_no,/*!< in: page number where to insert */
+	que_thr_t*	thr);	/*!< in: query thread */
+/*********************************************************************//**
+When an index page is read from a disk to the buffer pool, this function
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped.
*/ +UNIV_INTERN +void +ibuf_merge_or_delete_for_page( +/*==========================*/ + buf_block_t* block, /*!< in: if page has been read from + disk, pointer to the page x-latched, + else NULL */ + ulint space, /*!< in: space id of the index page */ + ulint page_no,/*!< in: page number of the index page */ + ulint zip_size,/*!< in: compressed page size in bytes, + or 0 */ + ibool update_ibuf_bitmap);/*!< in: normally this is set + to TRUE, but if we have deleted or are + deleting the tablespace, then we + naturally do not want to update a + non-existent bitmap page */ +/*********************************************************************//** +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! */ +UNIV_INTERN +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space); /*!< in: space id */ +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +UNIV_INTERN +ulint +ibuf_contract_in_background( +/*========================*/ + table_id_t table_id, /*!< in: if merge should be done only + for a specific table, for all tables + this should be 0 */ + ibool full); /*!< in: TRUE if the caller wants to + do a full contract based on PCT_IO(100). + If FALSE then the size of contract + batch is determined based on the + current size of the ibuf tree. */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Parses a redo log record of an ibuf bitmap page init. +@return end of log record or NULL */ +UNIV_INTERN +byte* +ibuf_parse_bitmap_init( +/*===================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: block or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_IBUF_COUNT_DEBUG +/******************************************************************//** +Gets the ibuf count for a given page. +@return number of entries in the insert buffer currently buffered for +this page */ +UNIV_INTERN +ulint +ibuf_count_get( +/*===========*/ + ulint space, /*!< in: space id */ + ulint page_no);/*!< in: page number */ +#endif +/******************************************************************//** +Looks if the insert buffer is empty. +@return true if empty */ +UNIV_INTERN +bool +ibuf_is_empty(void); +/*===============*/ +/******************************************************************//** +Prints info of ibuf. */ +UNIV_INTERN +void +ibuf_print( +/*=======*/ + FILE* file); /*!< in: file where to print */ +/******************************************************************** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */ +UNIV_INTERN +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec); /*!< in: ibuf record */ +/******************************************************************//** +Closes insert buffer and frees the data structures. 
*/ +UNIV_INTERN +void +ibuf_close(void); +/*============*/ +/******************************************************************//** +Function to pass ibuf status variables */ +UNIV_INTERN +void +ibuf_export_ibuf_status( +/*====================*/ + ulint* size, + ulint* free_list, + ulint* segment_size, + ulint* merges, + ulint* merged_inserts, + ulint* merged_delete_marks, + ulint* merged_deletes, + ulint* discarded_inserts, + ulint* discarded_delete_marks, + ulint* discarded_deletes); + +/******************************************************************//** +Checks the insert buffer bitmaps on IMPORT TABLESPACE. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +ibuf_check_bitmap_on_import( +/*========================*/ + const trx_t* trx, /*!< in: transaction */ + ulint space_id) /*!< in: tablespace identifier */ + __attribute__((nonnull, warn_unused_result)); + +#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO +#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO + +#endif /* !UNIV_HOTBACKUP */ + +/* The ibuf header page currently contains only the file segment header +for the file segment from which the pages for the ibuf tree are allocated */ +#define IBUF_HEADER PAGE_DATA +#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */ + +/* The insert buffer tree itself is always located in space 0. */ +#define IBUF_SPACE_ID 0 + +#ifndef UNIV_NONINL +#include "ibuf0ibuf.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ibuf0ibuf.ic b/storage/xtradb/include/ibuf0ibuf.ic new file mode 100644 index 00000000000..21747fdceac --- /dev/null +++ b/storage/xtradb/include/ibuf0ibuf.ic @@ -0,0 +1,367 @@ +/***************************************************************************** + +Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0ibuf.ic +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#include "page0page.h" +#include "page0zip.h" +#ifndef UNIV_HOTBACKUP +#include "buf0lru.h" + +/** An index page must contain at least UNIV_PAGE_SIZE / +IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to +buffer inserts to this page. If there is this much of free space, the +corresponding bits are set in the ibuf bitmap. */ +#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32 + +/***************************************************************//** +Starts an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_start( +/*===========*/ + mtr_t* mtr) /*!< out: mini-transaction */ +{ + mtr_start(mtr); + mtr->inside_ibuf = TRUE; +} +/***************************************************************//** +Commits an insert buffer mini-transaction. 
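The usual pairing of the two helpers above, sketched; the inside_ibuf flag they toggle is exactly what ibuf_inside() reports, letting the rest of the code detect that the current mini-transaction belongs to an insert buffer routine:

	mtr_t	mtr;

	ibuf_mtr_start(&mtr);	/* mtr_start() + inside_ibuf = TRUE */
	/* ... ibuf tree or ibuf bitmap operations ... */
	ibuf_mtr_commit(&mtr);	/* asserts inside_ibuf, then commits */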
*/ +UNIV_INLINE +void +ibuf_mtr_commit( +/*============*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->inside_ibuf); + ut_d(mtr->inside_ibuf = FALSE); + mtr_commit(mtr); +} + +/** Insert buffer struct */ +struct ibuf_t{ + ulint size; /*!< current size of the ibuf index + tree, in pages */ + ulint max_size; /*!< recommended maximum size of the + ibuf index tree, in pages */ + ulint seg_size; /*!< allocated pages of the file + segment containing ibuf header and + tree */ + bool empty; /*!< Protected by the page + latch of the root page of the + insert buffer tree + (FSP_IBUF_TREE_ROOT_PAGE_NO). true + if and only if the insert + buffer tree is empty. */ + ulint free_list_len; /*!< length of the free list */ + ulint height; /*!< tree height */ + dict_index_t* index; /*!< insert buffer index */ + + ulint n_merges; /*!< number of pages merged */ + ulint n_merged_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + merged to index pages */ + ulint n_discarded_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + discarded without merging due to the + tablespace being deleted or the + index being dropped */ +}; + +/************************************************************************//** +Sets the free bit of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. */ +UNIV_INTERN +void +ibuf_set_free_bits_func( +/*====================*/ + buf_block_t* block, /*!< in: index page of a non-clustered index; + free bit is reset if page level is 0 */ +#ifdef UNIV_IBUF_DEBUG + ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum + value which the bits must have before + setting; this is for debugging */ +#endif /* UNIV_IBUF_DEBUG */ + ulint val); /*!< in: value to set: < 4 */ +#ifdef UNIV_IBUF_DEBUG +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v) +#else /* UNIV_IBUF_DEBUG */ +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v) +#endif /* UNIV_IBUF_DEBUG */ + +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique) /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +{ + return(ibuf_use != IBUF_USE_NONE + && ibuf->max_size != 0 + && !dict_index_is_clust(index) + && index->table->quiesce == QUIESCE_NONE + && (ignore_sec_unique || !dict_index_is_unique(index))); +} + +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ +{ + return(mtr->inside_ibuf); +} + +/***********************************************************************//** +Checks if a page address is an ibuf bitmap page address. 
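Equivalently, one bitmap page recurs at a fixed offset within every run of
page_size pages. A worked sketch, assuming 16 KiB uncompressed pages and the
usual FSP_IBUF_BITMAP_OFFSET value of 1:

    ibuf_bitmap_page(0, 1);         // TRUE:  1     & 16383 == 1
    ibuf_bitmap_page(0, 16385);     // TRUE:  16385 & 16383 == 1
    ibuf_bitmap_page(0, 16386);     // FALSE: 16386 & 16383 == 2
    ibuf_bitmap_page(8192, 8193);   // TRUE:  8 KiB zip pages repeat every 8192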
+@return TRUE if a bitmap page */ +UNIV_INLINE +ibool +ibuf_bitmap_page( +/*=============*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/*!< in: page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return((page_no & (UNIV_PAGE_SIZE - 1)) + == FSP_IBUF_BITMAP_OFFSET); + } + + return((page_no & (zip_size - 1)) == FSP_IBUF_BITMAP_OFFSET); +} + +/*********************************************************************//** +Translates the free space on a page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_bits( +/*===========================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint max_ins_size) /*!< in: maximum insert size after reorganize + for the page */ +{ + ulint n; + ut_ad(ut_is_2pow(zip_size)); + ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + + if (zip_size) { + n = max_ins_size + / (zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } else { + n = max_ins_size + / (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + if (n == 3) { + n = 2; + } + + if (n > 3) { + n = 3; + } + + return(n); +} + +/*********************************************************************//** +Translates the ibuf free bits to the free space on a page in bytes. +@return maximum insert size after reorganize for the page */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_from_bits( +/*================================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint bits) /*!< in: value for ibuf bitmap bits */ +{ + ut_ad(bits < 4); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + + if (zip_size) { + if (bits == 3) { + return(4 * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + return(bits * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + if (bits == 3) { + return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + return(bits * (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE)); +} + +/*********************************************************************//** +Translates the free space on a compressed page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_zip( +/*==========================*/ + ulint zip_size, + /*!< in: compressed page size in bytes */ + const buf_block_t* block) /*!< in: buffer block */ +{ + ulint max_ins_size; + const page_zip_des_t* page_zip; + lint zip_max_ins; + + ut_ad(zip_size == buf_block_get_zip_size(block)); + ut_ad(zip_size); + + /* Consider the maximum insert size on the uncompressed page + without reorganizing the page. We must not assume anything + about the compression ratio. If zip_max_ins > max_ins_size and + there is 1/4 garbage on the page, recompression after the + reorganize could fail, in theory. So, let us guarantee that + merging a buffered insert to a compressed page will always + succeed without reorganizing or recompressing the page, just + by using the page modification log. 
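For the two translation functions above, the bookkeeping unit on a 16 KiB
uncompressed page is 16384 / 32 = 512 bytes, and the encoder deliberately
maps exactly three units down to 2 so that decoding bits == 3 as four units
still understates the true free space. A worked sketch, assuming
UNIV_PAGE_SIZE == 16384:

    ibuf_index_page_calc_free_bits(0, 400);     // 0: below one 512-byte unit
    ibuf_index_page_calc_free_bits(0, 1200);    // 2: 1200 / 512 == 2
    ibuf_index_page_calc_free_bits(0, 1600);    // 2: exactly 3 units rounds down
    ibuf_index_page_calc_free_bits(0, 2100);    // 3: four or more units saturate
    ibuf_index_page_calc_free_from_bits(0, 2);  // 1024: a safe lower bound
    ibuf_index_page_calc_free_from_bits(0, 3);  // 2048: bits == 3 means 4 units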
*/ + max_ins_size = page_get_max_insert_size( + buf_block_get_frame(block), 1); + + page_zip = buf_block_get_page_zip(block); + zip_max_ins = page_zip_max_ins_size(page_zip, + FALSE/* not clustered */); + + if (zip_max_ins < 0) { + return(0); + } else if (max_ins_size > (ulint) zip_max_ins) { + max_ins_size = (ulint) zip_max_ins; + } + + return(ibuf_index_page_calc_free_bits(zip_size, max_ins_size)); +} + +/*********************************************************************//** +Translates the free space on a page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free( +/*======================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + const buf_block_t* block) /*!< in: buffer block */ +{ + ut_ad(zip_size == buf_block_get_zip_size(block)); + + if (!zip_size) { + ulint max_ins_size; + + max_ins_size = page_get_max_insert_size_after_reorganize( + buf_block_get_frame(block), 1); + + return(ibuf_index_page_calc_free_bits(0, max_ins_size)); + } else { + return(ibuf_index_page_calc_free_zip(zip_size, block)); + } +} + +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. 
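Concretely, using the 512-byte unit of a 16 KiB page: if this function is
entered with max_ins_size = 1200 after an operation that consumed at most
increase = 800 bytes, then (a sketch of the arithmetic below):

    before = ibuf_index_page_calc_free_bits(0, 1200);        // 2
    after  = ibuf_index_page_calc_free_bits(0, 1200 - 800);  // 0
    // before > after, so the bitmap bits are lowered from 2 to 0.
    // Had the operation consumed only 100 bytes, before == after == 2
    // and the bitmap page would not be touched at all.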
*/ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase)/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +{ + ulint before; + ulint after; + + ut_ad(!buf_block_get_page_zip(block)); + + before = ibuf_index_page_calc_free_bits(0, max_ins_size); + + if (max_ins_size >= increase) { +#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX +# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX" +#endif + after = ibuf_index_page_calc_free_bits(0, max_ins_size + - increase); +#ifdef UNIV_IBUF_DEBUG + ut_a(after <= ibuf_index_page_calc_free(0, block)); +#endif + } else { + after = ibuf_index_page_calc_free(0, block); + } + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + if (before > after) { + ibuf_set_free_bits(block, after, before); + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/ibuf0types.h b/storage/xtradb/include/ibuf0types.h new file mode 100644 index 00000000000..3fdbf078b0b --- /dev/null +++ b/storage/xtradb/include/ibuf0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0types.h +Insert buffer global types + +Created 7/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0types_h +#define ibuf0types_h + +struct ibuf_t; + +#endif diff --git a/storage/xtradb/include/lock0iter.h b/storage/xtradb/include/lock0iter.h new file mode 100644 index 00000000000..0054850b526 --- /dev/null +++ b/storage/xtradb/include/lock0iter.h @@ -0,0 +1,69 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0iter.h +Lock queue iterator type and function prototypes. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0iter_h +#define lock0iter_h + +#include "univ.i" +#include "lock0types.h" + +struct lock_queue_iterator_t { + const lock_t* current_lock; + /* In case this is a record lock queue (not table lock queue) + then bit_no is the record number within the heap in which the + record is stored. */ + ulint bit_no; +}; + +/*******************************************************************//** +Initialize lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). There is exactly one bit set in the bitmap + of a wait lock. */ +UNIV_INTERN +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /*!< out: iterator */ + const lock_t* lock, /*!< in: lock to start from */ + ulint bit_no);/*!< in: record number in the + heap */ + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). +@return previous lock or NULL */ + +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter); /*!< in/out: iterator */ + +#endif /* lock0iter_h */ diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h new file mode 100644 index 00000000000..cb95c58fe3c --- /dev/null +++ b/storage/xtradb/include/lock0lock.h @@ -0,0 +1,992 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.h +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0lock_h +#define lock0lock_h + +#include "univ.i" +#include "buf0types.h" +#include "trx0types.h" +#include "mtr0types.h" +#include "rem0types.h" +#include "dict0types.h" +#include "que0types.h" +#include "lock0types.h" +#include "read0types.h" +#include "hash0hash.h" +#include "srv0srv.h" +#include "ut0vec.h" + +#ifdef UNIV_DEBUG +extern ibool lock_print_waits; +#endif /* UNIV_DEBUG */ + +extern ulint srv_n_lock_deadlock_count; + +/*********************************************************************//** +Gets the size of a lock struct. +@return size in bytes */ +UNIV_INTERN +ulint +lock_get_size(void); +/*===============*/ +/*********************************************************************//** +Creates the lock system at database start. */ +UNIV_INTERN +void +lock_sys_create( +/*============*/ + ulint n_cells); /*!< in: number of slots in lock hash table */ +/*********************************************************************//** +Closes the lock system at database shutdown. */ +UNIV_INTERN +void +lock_sys_close(void); +/*================*/ +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. +@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block); /*!< in: buffer block */ +/*************************************************************//** +Updates the lock table when we have reorganized a page. NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +UNIV_INTERN +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock);/*!< in: copy of the old, not + reorganized page */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +UNIV_INTERN +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec); /*!< in: record on page: this + is the first record moved */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. 
*/ +UNIV_INTERN +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end); /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +UNIV_INTERN +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the right. */ +UNIV_INTERN +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block); /*!< in: merged index + page which will be + discarded */ +/*************************************************************//** +Updates the lock table when the root page is copied to another in +btr_root_raise_and_insert. Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +UNIV_INTERN +void +lock_update_root_raise( +/*===================*/ + const buf_block_t* block, /*!< in: index page to which copied */ + const buf_block_t* root); /*!< in: root page */ +/*************************************************************//** +Updates the lock table when a page is copied to another and the original page +is removed from the chain of leaf pages, except if page is the root! */ +UNIV_INTERN +void +lock_update_copy_and_discard( +/*=========================*/ + const buf_block_t* new_block, /*!< in: index page to + which copied */ + const buf_block_t* block); /*!< in: index page; + NOT the root! */ +/*************************************************************//** +Updates the lock table when a page is split to the left. */ +UNIV_INTERN +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the left. */ +UNIV_INTERN +void +lock_update_merge_left( +/*===================*/ + const buf_block_t* left_block, /*!< in: left page to + which merged */ + const rec_t* orig_pred, /*!< in: original predecessor + of supremum on the left page + before merge */ + const buf_block_t* right_block); /*!< in: merged index page + which will be discarded */ +/*************************************************************//** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec. 
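A concrete, hypothetical scenario for the function below: with adjacent keys
5, 7 and 10 on one page, purging the record with key 7 must not allow
inserts to slip into the previously protected interval (5, 10), so the locks
held on 7 are re-created on the heir 10 as gap-only locks:

    // heir_heap_no: heap number of the record with key 10 (the heir);
    // heap_no: heap number of the removed record with key 7 (the donor);
    // both records assumed to be on the same page in this sketch
    lock_rec_reset_and_inherit_gap_locks(
            block, block, heir_heap_no, heap_no);
    // the heir's own locks are reset first; it then carries LOCK_GAP
    // copies of the donor's locks, so an insert of key 8 still waits.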
*/ +UNIV_INTERN +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t* heir_block, /*!< in: block containing the + record which inherits */ + const buf_block_t* block, /*!< in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /*!< in: heap_no of the + inheriting record */ + ulint heap_no); /*!< in: heap_no of the + donating record */ +/*************************************************************//** +Updates the lock table when a page is discarded. */ +UNIV_INTERN +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /*!< in: index page + which will inherit the locks */ + ulint heir_heap_no, /*!< in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block); /*!< in: index page + which will be discarded */ +/*************************************************************//** +Updates the lock table when a new user record is inserted. */ +UNIV_INTERN +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the inserted record */ +/*************************************************************//** +Updates the lock table when a record is removed. */ +UNIV_INTERN +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the record to be removed */ +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is in such an update moved, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +UNIV_INTERN +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +/*********************************************************************//** +Restores the state of explicit lock requests on a single record, where the +state was stored on the infimum of the page. */ +UNIV_INTERN +void +lock_rec_restore_from_page_infimum( +/*===============================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record whose lock state + is restored */ + const buf_block_t* donator);/*!< in: page (rec is not + necessarily on this page) + whose infimum stored the lock + state; lock bits are reset on + the infimum */ +/*********************************************************************//** +Determines if there are explicit record locks on a page. +@return an explicit record lock on the page, or NULL if there are none */ +UNIV_INTERN +lock_t* +lock_rec_expl_exist_on_page( +/*========================*/ + ulint space, /*!< in: space id */ + ulint page_no)/*!< in: page number */ + __attribute__((warn_unused_result)); +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a record. 
If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ibool* inherit)/*!< out: set to TRUE if the newly
+ inserted record should possibly inherit
+ LOCK_GAP type locks from the successor
+ record */
+ __attribute__((nonnull(2,3,4,6,7), warn_unused_result));
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((warn_unused_result, nonnull));
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((warn_unused_result, nonnull(2,3,4,6)));
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.

+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_sec_rec_read_check_and_lock( +/*=============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: secondary index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_read_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". 
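Here "offsets" is the ordinary rec_get_offsets() output; the _alt variant
below simply computes it internally. A hedged sketch of what a caller of the
non-_alt version prepares first, with block, rec, index and thr assumed to
be in scope:

    mem_heap_t*	heap	= NULL;
    ulint	offsets_[REC_OFFS_NORMAL_SIZE];
    ulint*	offsets	= offsets_;
    dberr_t	err;
    rec_offs_init(offsets_);

    offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

    err = lock_clust_rec_read_check_and_lock(
            0, block, rec, index, offsets,
            LOCK_S, LOCK_REC_NOT_GAP, thr);

    if (UNIV_LIKELY_NULL(heap)) {
            mem_heap_free(heap);    // rec_get_offsets() may allocate
    }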
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Checks that a record is seen in a consistent read. +@return true if sees, or false if an earlier version of the record +should be retrieved */ +UNIV_INTERN +bool +lock_clust_rec_cons_read_sees( +/*==========================*/ + const rec_t* rec, /*!< in: user record which should be read or + passed over by a read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + read_view_t* view); /*!< in: consistent read view */ +/*********************************************************************//** +Checks that a non-clustered index record is seen in a consistent read. + +NOTE that a non-clustered index page contains so little information on +its modifications that also in the case false, the present version of +rec may be the right, but we must check this from the clustered index +record. + +@return true if certainly sees, or false if an earlier version of the +clustered index record might be needed */ +UNIV_INTERN +bool +lock_sec_rec_cons_read_sees( +/*========================*/ + const rec_t* rec, /*!< in: user record which + should be read or passed over + by a read cursor */ + const read_view_t* view) /*!< in: consistent read view */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Locks the specified database table in the mode given. If the lock cannot +be granted immediately, the query thread is put to wait. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_table( +/*=======*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + dict_table_t* table, /*!< in/out: database table + in dictionary cache */ + enum lock_mode mode, /*!< in: lock mode */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Creates a table IX lock object for a resurrected transaction. */ +UNIV_INTERN +void +lock_table_ix_resurrect( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx); /*!< in/out: transaction */ +/*************************************************************//** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. 
*/ +UNIV_INTERN +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record */ + enum lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */ +/*********************************************************************//** +Releases a transaction's locks, and releases possible other transactions +waiting because of these locks. Change the state of the transaction to +TRX_STATE_COMMITTED_IN_MEMORY. */ +UNIV_INTERN +void +lock_trx_release_locks( +/*===================*/ + trx_t* trx); /*!< in/out: transaction */ + +/*********************************************************************//** +Cancels a waiting lock request and releases possible other transactions +waiting behind it. */ +UNIV_INTERN +void +lock_cancel_waiting_and_release( +/*============================*/ + lock_t* lock); /*!< in/out: waiting lock request */ + +/*********************************************************************//** +Removes locks on a table to be dropped or truncated. +If remove_also_table_sx_locks is TRUE then table-level S and X locks are +also removed in addition to other table-level and record-level locks. +No lock, that is going to be removed, is allowed to be a wait lock. */ +UNIV_INTERN +void +lock_remove_all_on_table( +/*=====================*/ + dict_table_t* table, /*!< in: table to be dropped + or truncated */ + ibool remove_also_table_sx_locks);/*!< in: also removes + table S and X locks */ + +/*********************************************************************//** +Calculates the fold value of a page file address: used in inserting or +searching for a lock in the hash table. +@return folded value */ +UNIV_INLINE +ulint +lock_rec_fold( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ + __attribute__((const)); +/*********************************************************************//** +Calculates the hash value of a page file address: used in inserting or +searching for a lock in the hash table. +@return hashed value */ +UNIV_INLINE +ulint +lock_rec_hash( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no);/*!< in: page number */ + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +UNIV_INTERN +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock); /*!< in: record lock with at least one + bit set */ + +/*********************************************************************//** +Gets the source table of an ALTER TABLE transaction. The table must be +covered by an IX or IS table lock. +@return the source table of transaction, if it is covered by an IX or +IS table lock; dest if there is no source table, and NULL if the +transaction is locking more than two tables or an inconsistency is +found */ +UNIV_INTERN +dict_table_t* +lock_get_src_table( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* dest, /*!< in: destination of ALTER TABLE */ + enum lock_mode* mode); /*!< out: lock mode of the source table */ +/*********************************************************************//** +Determine if the given table is exclusively "owned" by the given +transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC +on the table. 
+@return TRUE if table is only locked by trx, with LOCK_IX, and
+possibly LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2); /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+/*********************************************************************//**
+Reports that a transaction id is insensible, i.e., in the future. */
+UNIV_INTERN
+void
+lock_report_trx_id_insanity(
+/*========================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ trx_id_t max_trx_id) /*!< in: trx_sys_get_max_trx_id() */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Prints info of a table lock. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: table type lock */
+/*********************************************************************//**
+Prints info of a record lock. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: record type lock */
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if not able to obtain lock mutex and exits without
+printing info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+ ibool nowait) /*!< in: whether to wait for the lock mutex */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and more importantly it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Return approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+ __attribute__((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for using outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return transaction id */ +UNIV_INTERN +trx_id_t +lock_get_trx_id( +/*============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the mode of a lock in a human readable string. +The string should not be free()'d or modified. +@return lock mode */ +UNIV_INTERN +const char* +lock_get_mode_str( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the type of a lock in a human readable string. +The string should not be free()'d or modified. +@return lock type */ +UNIV_INTERN +const char* +lock_get_type_str( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the id of the table on which the lock is. +@return id of the table */ +UNIV_INTERN +table_id_t +lock_get_table_id( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the name of the table on which the lock is. +The string should not be free()'d or modified. +@return name of the table */ +UNIV_INTERN +const char* +lock_get_table_name( +/*================*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the index on which the lock is. +@return index */ +UNIV_INTERN +const dict_index_t* +lock_rec_get_index( +/*===============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the name of the index on which the lock is. +The string should not be free()'d or modified. +@return name of the index */ +UNIV_INTERN +const char* +lock_rec_get_index_name( +/*====================*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the tablespace number on which the lock is. +@return tablespace number */ +UNIV_INTERN +ulint +lock_rec_get_space_id( +/*==================*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the page number on which the lock is. +@return page number */ +UNIV_INTERN +ulint +lock_rec_get_page_no( +/*=================*/ + const lock_t* lock); /*!< in: lock */ +/*******************************************************************//** +Check if there are any locks (table or rec) against table. +@return TRUE if locks exist */ +UNIV_INTERN +ibool +lock_table_has_locks( +/*=================*/ + const dict_table_t* table); /*!< in: check if there are any locks + held on records in this table or on the + table itself */ + +/*********************************************************************//** +A thread which wakes up threads whose lock wait may have lasted too long. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(lock_wait_timeout_thread)( +/*=====================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + +/********************************************************************//** +Releases a user OS thread waiting for a lock to be released, if the +thread is already suspended. 
*/
+UNIV_INTERN
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ user OS thread */
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+lock_wait_suspend_thread(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ user OS thread */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or, if it has to wait, then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+lock_trx_handle_wait(
+/*=================*/
+ trx_t* trx) /*!< in/out: trx lock state */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+UNIV_INTERN
+ulint
+lock_table_get_n_locks(
+/*===================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+UNIV_INTERN
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+UNIV_INTERN
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction to check */
+ __attribute__((warn_unused_result));
+
+/*******************************************************************//**
+Check if the transaction holds an exclusive lock on a record.
+@return whether the locks are held */ +UNIV_INTERN +bool +lock_trx_has_rec_x_lock( +/*====================*/ + const trx_t* trx, /*!< in: transaction to check */ + const dict_table_t* table, /*!< in: table to check */ + const buf_block_t* block, /*!< in: buffer block of the record */ + ulint heap_no)/*!< in: record heap number */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/** Lock modes and types */ +/* @{ */ +#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the + type_mode field in a lock */ +/** Lock types */ +/* @{ */ +#define LOCK_TABLE 16 /*!< table lock */ +#define LOCK_REC 32 /*!< record lock */ +#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the + type_mode field in a lock */ +#if LOCK_MODE_MASK & LOCK_TYPE_MASK +# error "LOCK_MODE_MASK & LOCK_TYPE_MASK" +#endif + +#define LOCK_WAIT 256 /*!< Waiting lock flag; when set, it + means that the lock has not yet been + granted, it is just waiting for its + turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary + next-key lock in contrast to LOCK_GAP + or LOCK_REC_NOT_GAP */ +#define LOCK_GAP 512 /*!< when this bit is set, it means that the + lock holds only on the gap before the record; + for instance, an x-lock on the gap does not + give permission to modify the record on which + the bit is set; locks of this type are created + when records are removed from the index chain + of records */ +#define LOCK_REC_NOT_GAP 1024 /*!< this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048 /*!< this bit is set when we place a waiting + gap type record lock request in order to let + an insert of an index record to wait until + there are no conflicting locks by other + transactions on the gap; note that this flag + remains set when the waiting lock is granted, + or if the lock is inherited to a neighboring + record */ + +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_MODE_MASK +# error +#endif +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_TYPE_MASK +# error +#endif +/* @} */ + +/** Lock operation struct */ +struct lock_op_t{ + dict_table_t* table; /*!< table to be locked */ + enum lock_mode mode; /*!< lock mode */ +}; + +/** The lock system struct */ +struct lock_sys_t{ + ib_mutex_t mutex; /*!< Mutex protecting the + locks */ + hash_table_t* rec_hash; /*!< hash table of the record + locks */ + ulint rec_num; + ib_mutex_t wait_mutex; /*!< Mutex protecting the + next two fields */ + srv_slot_t* waiting_threads; /*!< Array of user threads + suspended while waiting for + locks within InnoDB, protected + by the lock_sys->wait_mutex */ + srv_slot_t* last_slot; /*!< highest slot ever used + in the waiting_threads array, + protected by + lock_sys->wait_mutex */ + ibool rollback_complete; + /*!< TRUE if rollback of all + recovered transactions is + complete. Protected by + lock_sys->mutex */ + + ulint n_lock_max_wait_time; /*!< Max wait time */ + + os_event_t timeout_event; /*!< Set to the event that is + created in the lock wait monitor + thread. 
A value of 0 means the + thread is not active */ + + bool timeout_thread_active; /*!< True if the timeout thread + is running */ +}; + +/** The lock system */ +extern lock_sys_t* lock_sys; + +/** Test if lock_sys->mutex can be acquired without waiting. */ +#define lock_mutex_enter_nowait() mutex_enter_nowait(&lock_sys->mutex) + +/** Test if lock_sys->mutex is owned. */ +#define lock_mutex_own() mutex_own(&lock_sys->mutex) + +/** Acquire the lock_sys->mutex. */ +#define lock_mutex_enter() do { \ + mutex_enter(&lock_sys->mutex); \ +} while (0) + +/** Release the lock_sys->mutex. */ +#define lock_mutex_exit() do { \ + mutex_exit(&lock_sys->mutex); \ +} while (0) + +/** Test if lock_sys->wait_mutex is owned. */ +#define lock_wait_mutex_own() mutex_own(&lock_sys->wait_mutex) + +/** Acquire the lock_sys->wait_mutex. */ +#define lock_wait_mutex_enter() do { \ + mutex_enter(&lock_sys->wait_mutex); \ +} while (0) + +/** Release the lock_sys->wait_mutex. */ +#define lock_wait_mutex_exit() do { \ + mutex_exit(&lock_sys->wait_mutex); \ +} while (0) + +#ifndef UNIV_NONINL +#include "lock0lock.ic" +#endif + +#endif diff --git a/storage/xtradb/include/lock0lock.ic b/storage/xtradb/include/lock0lock.ic new file mode 100644 index 00000000000..736936954cb --- /dev/null +++ b/storage/xtradb/include/lock0lock.ic @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.ic +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#include "sync0sync.h" +#include "srv0srv.h" +#include "dict0dict.h" +#include "row0row.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "row0vers.h" +#include "que0que.h" +#include "btr0cur.h" +#include "read0read.h" +#include "log0recv.h" + +/*********************************************************************//** +Calculates the fold value of a page file address: used in inserting or +searching for a lock in the hash table. +@return folded value */ +UNIV_INLINE +ulint +lock_rec_fold( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + return(ut_fold_ulint_pair(space, page_no)); +} + +/*********************************************************************//** +Calculates the hash value of a page file address: used in inserting or +searching for a lock in the hash table. 
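The fold is a pure function of the page address, so inserting and searching
use the same cell of lock_sys->rec_hash. A sketch of the relationship, with
space and page_no assumed in scope:

    ulint	fold = lock_rec_fold(space, page_no);   // ut_fold_ulint_pair()
    ulint	cell = hash_calc_hash(fold, lock_sys->rec_hash);
    // lock_rec_hash() below is exactly this composition; every record
    // lock on (space, page_no) is chained from that one cell.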
+@return hashed value */ +UNIV_INLINE +ulint +lock_rec_hash( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + return(hash_calc_hash(lock_rec_fold(space, page_no), + lock_sys->rec_hash)); +} + +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. +@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + const page_t* page = block->frame; + + if (page_is_comp(page)) { + return(rec_get_heap_no_new( + page + + rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE))); + } else { + return(rec_get_heap_no_old( + page + + rec_get_next_offs(page + PAGE_OLD_INFIMUM, + FALSE))); + } +} diff --git a/storage/xtradb/include/lock0priv.h b/storage/xtradb/include/lock0priv.h new file mode 100644 index 00000000000..e564387ec53 --- /dev/null +++ b/storage/xtradb/include/lock0priv.h @@ -0,0 +1,117 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0priv.h +Lock module internal structures and methods. 
+ +Created July 12, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0priv_h +#define lock0priv_h + +#ifndef LOCK_MODULE_IMPLEMENTATION +/* If you need to access members of the structures defined in this +file, please write appropriate functions that retrieve them and put +those functions in lock/ */ +#error Do not include lock0priv.h outside of the lock/ module +#endif + +#include "univ.i" +#include "dict0types.h" +#include "hash0hash.h" +#include "trx0types.h" +#include "ut0lst.h" + +/** A table lock */ +struct lock_table_t { + dict_table_t* table; /*!< database table in dictionary + cache */ + UT_LIST_NODE_T(lock_t) + locks; /*!< list of locks on the same + table */ +}; + +/** Record lock for a page */ +struct lock_rec_t { + ulint space; /*!< space id */ + ulint page_no; /*!< page number */ + ulint n_bits; /*!< number of bits in the lock + bitmap; NOTE: the lock bitmap is + placed immediately after the + lock struct */ +}; + +/** Lock struct; protected by lock_sys->mutex */ +struct lock_t { + trx_t* trx; /*!< transaction owning the + lock */ + UT_LIST_NODE_T(lock_t) + trx_locks; /*!< list of the locks of the + transaction */ + ulint type_mode; /*!< lock type, mode, LOCK_GAP or + LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION, + wait flag, ORed */ + hash_node_t hash; /*!< hash chain node for a record + lock */ + dict_index_t* index; /*!< index for a record lock */ + union { + lock_table_t tab_lock;/*!< table lock */ + lock_rec_t rec_lock;/*!< record lock */ + } un_member; /*!< lock details */ +}; + +/*********************************************************************//** +Gets the type of a lock. +@return LOCK_TABLE or LOCK_REC */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*********************************************************************//** +Gets the previous record lock set on a record. +@return previous lock on the same record, NULL if none exists */ +UNIV_INTERN +const lock_t* +lock_rec_get_prev( +/*==============*/ + const lock_t* in_lock,/*!< in: record lock */ + ulint heap_no);/*!< in: heap number of the record */ + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a clustered +index. +@return transaction id of the transaction which has the x-lock, or 0 */ +UNIV_INLINE +trx_id_t +lock_clust_rec_some_has_impl( +/*=========================*/ + const rec_t* rec, /*!< in: user record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "lock0priv.ic" +#endif + +#endif /* lock0priv_h */ diff --git a/storage/xtradb/include/lock0priv.ic b/storage/xtradb/include/lock0priv.ic new file mode 100644 index 00000000000..6b70dc33d3c --- /dev/null +++ b/storage/xtradb/include/lock0priv.ic @@ -0,0 +1,67 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0priv.ic +Lock module internal inline methods. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +/* This file contains only methods which are used in +lock/lock0* files, other than lock/lock0lock.cc. +I.e. lock/lock0lock.cc contains more internal inline +methods but they are used only in that file. */ + +#ifndef LOCK_MODULE_IMPLEMENTATION +#error Do not include lock0priv.ic outside of the lock/ module +#endif + +/*********************************************************************//** +Gets the type of a lock. +@return LOCK_TABLE or LOCK_REC */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + const lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_TYPE_MASK); +} + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a clustered +index. +@return transaction id of the transaction which has the x-lock, or 0 */ +UNIV_INLINE +trx_id_t +lock_clust_rec_some_has_impl( +/*=========================*/ + const rec_t* rec, /*!< in: user record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(page_rec_is_user_rec(rec)); + + return(row_get_rec_trx_id(rec, index, offsets)); +} + +/* vim: set filetype=c: */ diff --git a/storage/xtradb/include/lock0types.h b/storage/xtradb/include/lock0types.h new file mode 100644 index 00000000000..cf32e72f864 --- /dev/null +++ b/storage/xtradb/include/lock0types.h @@ -0,0 +1,47 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0types.h +The transaction lock system global types + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0types_h +#define lock0types_h + +#define lock_t ib_lock_t +struct lock_t; +struct lock_sys_t; + +/* Basic lock modes */ +enum lock_mode { + LOCK_IS = 0, /* intention shared */ + LOCK_IX, /* intention exclusive */ + LOCK_S, /* shared */ + LOCK_X, /* exclusive */ + LOCK_AUTO_INC, /* locks the auto-inc counter of a table + in an exclusive mode */ + LOCK_NONE, /* this is used elsewhere to note consistent read */ + LOCK_NUM = LOCK_NONE, /* number of lock modes */ + LOCK_NONE_UNSET = 255 +}; + + +#endif diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h new file mode 100644 index 00000000000..fbaf0a1e633 --- /dev/null +++ b/storage/xtradb/include/log0log.h @@ -0,0 +1,1052 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2009, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0log.h +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#ifndef log0log_h +#define log0log_h + +#include "univ.i" +#include "ut0byte.h" +#include "ut0lst.h" +#ifndef UNIV_HOTBACKUP +#include "sync0sync.h" +#include "sync0rw.h" +#endif /* !UNIV_HOTBACKUP */ + +/* Type used for all log sequence number storage and arithmetics */ +typedef ib_uint64_t lsn_t; +#define LSN_MAX IB_UINT64_MAX + +#define LSN_PF UINT64PF + +/** Redo log buffer */ +struct log_t; +/** Redo log group */ +struct log_group_t; + +#ifdef UNIV_DEBUG +/** Flag: write to log file? */ +extern ibool log_do_write; +/** Flag: enable debug output when writing to the log? 
*/ +extern ibool log_debug_writes; +#else /* UNIV_DEBUG */ +/** Write to log */ +# define log_do_write TRUE +#endif /* UNIV_DEBUG */ + +/** Magic value to use instead of log checksums when they are disabled */ +#define LOG_NO_CHECKSUM_MAGIC 0xDEADBEEFUL + +typedef ulint (*log_checksum_func_t)(const byte* log_block); + +/** Pointer to the log checksum calculation function. Protected with +log_sys->mutex. */ +extern log_checksum_func_t log_checksum_algorithm_ptr; + +/** Wait modes for log_write_up_to @{ */ +#define LOG_NO_WAIT 91 +#define LOG_WAIT_ONE_GROUP 92 +#define LOG_WAIT_ALL_GROUPS 93 +/* @} */ +/** Maximum number of log groups in log_group_t::checkpoint_buf */ +#define LOG_MAX_N_GROUPS 32 + +#define IB_ARCHIVED_LOGS_PREFIX "ib_log_archive_" +#define IB_ARCHIVED_LOGS_PREFIX_LEN (sizeof(IB_ARCHIVED_LOGS_PREFIX) - 1) +#define IB_ARCHIVED_LOGS_SERIAL_LEN 20 + +/*******************************************************************//** +Calculates where in log files we find a specified lsn. +@return log file number */ +UNIV_INTERN +ulint +log_calc_where_lsn_is( +/*==================*/ + ib_int64_t* log_file_offset, /*!< out: offset in that file + (including the header) */ + ib_uint64_t first_header_lsn, /*!< in: first log file start + lsn */ + ib_uint64_t lsn, /*!< in: lsn whose position to + determine */ + ulint n_log_files, /*!< in: total number of log + files */ + ib_int64_t log_file_size); /*!< in: log file size + (including the header) */ +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Writes to the log the string given. The log must be released with +log_release. +@return end lsn of the log record, zero if did not succeed */ +UNIV_INLINE +lsn_t +log_reserve_and_write_fast( +/*=======================*/ + const void* str, /*!< in: string */ + ulint len, /*!< in: string length */ + lsn_t* start_lsn);/*!< out: start lsn of the log record */ +/***********************************************************************//** +Releases the log mutex. */ +UNIV_INLINE +void +log_release(void); +/*=============*/ +/***********************************************************************//** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void); +/*================*/ +/**************************************************************************//** +Locks the log mutex and opens the log for log_write_low. The log must be closed +with log_close and released with log_release. +@return start lsn of the log record */ +UNIV_INLINE +lsn_t +log_reserve_and_open( +/*=================*/ + ulint len); /*!< in: length of data to be catenated */ +/************************************************************//** +Opens the log for log_write_low. The log must be closed with log_close. +@return start lsn of the log record */ +UNIV_INTERN +lsn_t +log_open( +/*=====*/ + ulint len); /*!< in: length of data to be catenated */ +/************************************************************//** +Writes to the log the string given. It is assumed that the caller holds the +log mutex. */ +UNIV_INTERN +void +log_write_low( +/*==========*/ + byte* str, /*!< in: string */ + ulint str_len); /*!< in: string length */ +/************************************************************//** +Closes the log. 
+@return lsn */
+UNIV_INTERN
+lsn_t
+log_close(void);
+/*===========*/
+/************************************************************//**
+Gets the current lsn.
+@return current lsn */
+UNIV_INLINE
+lsn_t
+log_get_lsn(void);
+/*=============*/
+/****************************************************************
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return log group capacity */
+UNIV_INLINE
+lsn_t
+log_get_capacity(void);
+/*==================*/
+/****************************************************************
+Get log_sys::max_modified_age_async. It is OK to read the value without
+holding log_sys::mutex because it is constant.
+@return max_modified_age_async */
+UNIV_INLINE
+lsn_t
+log_get_max_modified_age_async(void);
+/*================================*/
+/******************************************************//**
+Initializes the log. */
+UNIV_INTERN
+void
+log_init(void);
+/*==========*/
+/******************************************************************//**
+Inits a log group to the log system. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+	ulint	id,			/*!< in: group id */
+	ulint	n_files,		/*!< in: number of log files */
+	lsn_t	file_size,		/*!< in: log file size in bytes */
+	ulint	space_id,		/*!< in: space id of the file space
+					which contains the log files of this
+					group */
+	ulint	archive_space_id);	/*!< in: space id of the file space
+					which contains some archived log
+					files for this group; currently, only
+					for the first log group this is
+					used */
+/******************************************************//**
+Completes an i/o to a log file. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+	log_group_t*	group);	/*!< in: log group */
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, it starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+	lsn_t	lsn,	/*!< in: log sequence number up to which
+			the log should be written, LSN_MAX if not specified */
+	ulint	wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+			or LOG_WAIT_ALL_GROUPS */
+	ibool	flush_to_disk);
+			/*!< in: TRUE if we want the written log
+			also to be flushed to disk */
+/****************************************************************//**
+Does a synchronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void);
+/*==========================*/
+/****************************************************************//**
+This function writes the log buffer to the log file and, if 'flush'
+is set, it forces a flush of the log file as well. This is meant to be
+called from the background master thread only, as it does not wait for
+the write (+ possible flush) to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+	ibool	flush);	/*!< in: flush the logs to disk */
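A minimal sketch of how the declarations above compose at transaction
commit (editorial, not part of the patch; the wrapper name and
commit_lsn are hypothetical, with commit_lsn standing for the end LSN
returned when the commit record was appended to the log buffer):

#include "log0log.h"

/* Write the log buffer out at least up to commit_lsn, wait until one
log group has completed the write, and fsync() it as well, so the
commit survives a crash. Completion of a single group is enough for
durability, hence LOG_WAIT_ONE_GROUP rather than LOG_WAIT_ALL_GROUPS. */
static void
trx_flush_log_up_to_sketch(lsn_t commit_lsn)
{
	log_write_up_to(commit_lsn, LOG_WAIT_ONE_GROUP, TRUE);
}

log_buffer_flush_to_disk() above is the LSN-less variant of the same
write-and-flush, and log_buffer_sync_in_background() is the
non-waiting flavor reserved for the master thread.
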
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what the lsn of the oldest
+modification in the pool is, and writes information about that lsn to
+the log files. Use log_make_checkpoint_at to flush also the pool.
+@return TRUE if success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+	ibool	sync,		/*!< in: TRUE if synchronous operation is
+				desired */
+	ibool	write_always);	/*!< in: the function normally checks if
+				the new checkpoint would have a greater
+				lsn than the previous one: if not, then no
+				physical write is done; by setting this
+				parameter TRUE, a physical write will always be
+				made to log files */
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+	lsn_t	lsn,		/*!< in: make a checkpoint at this or a
+				later lsn, if LSN_MAX, makes
+				a checkpoint at the latest lsn */
+	ibool	write_always);	/*!< in: the function normally checks if
+				the new checkpoint would have a
+				greater lsn than the previous one: if
+				not, then no physical write is done;
+				by setting this parameter TRUE, a
+				physical write will always be made to
+				log files */
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to the first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in the log files to the log
+archive. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void);
+/*=======================================*/
+/******************************************************//**
+Reads checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+	log_group_t*	group,	/*!< in: log group */
+	ulint		field);	/*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+/*******************************************************************//**
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+	const byte*	buf,	/*!< in: buffer containing checkpoint info */
+	ulint		n,	/*!< in: nth slot */
+	lsn_t*		file_no);/*!< out: archived file number */
+/******************************************************//**
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void);
+/*==================================*/
+/********************************************************************//**
+Starts an archiving operation.
+@return TRUE if successful, FALSE if an archiving operation was already running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+	ibool	sync,	/*!< in: TRUE if synchronous operation is desired */
+	ulint*	n_bytes);/*!< out: archive log buffer size, 0 if nothing to
+			archive */
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from a number one higher, so that the archiving will
+not write again to the archived log files which exist when this function
+returns.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_stop(void);
+/*==================*/
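The two checkpoint entry points above differ in scope: log_checkpoint()
only records the oldest modification lsn, while log_make_checkpoint_at()
also flushes the buffer pool up to the target lsn. A sketch of the
unconditional, shutdown-style call (editorial; the wrapper name is
hypothetical):

#include "log0log.h"

/* Force a checkpoint covering everything logged so far: LSN_MAX
selects the latest lsn, and write_always == TRUE makes the physical
write happen even if the checkpoint lsn has not advanced. */
static void
force_latest_checkpoint_sketch(void)
{
	log_make_checkpoint_at(LSN_MAX, TRUE);
}
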
+/****************************************************************//**
+Restarts archiving after it has been stopped.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_start(void);
+/*===================*/
+/****************************************************************//**
+Stops archiving the log, so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void);
+/*==========================*/
+/****************************************************************//**
+Starts archiving the log again; a gap may have occurred in the archived
+log files while archiving was switched off.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void);
+/*========================*/
+/******************************************************//**
+Generates an archived log file name. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+	char*	buf,	/*!< out: buffer where to write */
+	ulint	buf_len,/*!< in: buffer length */
+	ulint	id,	/*!< in: group id */
+	lsn_t	file_no);/*!< in: file number */
+
+/******************************************************//**
+Calculates the offset within an archived log file that corresponds to
+the given archived LSN. */
+UNIV_INTERN
+void
+log_archived_get_offset(
+/*====================*/
+	log_group_t*	group,		/*!< in: log group */
+	lsn_t		file_no,	/*!< in: archive log file number */
+	lsn_t		archived_lsn,	/*!< in: last archived LSN */
+	lsn_t*		offset);	/*!< out: offset within archived file */
+#else /* !UNIV_HOTBACKUP */
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+	byte*		hdr_buf,/*!< in: buffer which will be written to the
+				start of the first log file */
+	ib_uint64_t	start);	/*!< in: lsn of the start of the first log file;
+				we pretend that there is a checkpoint at
+				start + LOG_BLOCK_HDR_SIZE */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+UNIV_INTERN
+void
+log_check_margins(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads a specified log segment to a buffer. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+	ulint		type,		/*!< in: LOG_ARCHIVE or LOG_RECOVER */
+	byte*		buf,		/*!< out: buffer to read to */
+	log_group_t*	group,		/*!< in: log group */
+	lsn_t		start_lsn,	/*!< in: read area start */
+	lsn_t		end_lsn,	/*!< in: read area end */
+	ibool		release_mutex);	/*!< in: whether the log_sys->mutex
+					should be released before the read */
+/******************************************************//**
+Writes a buffer to a log file group. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+	log_group_t*	group,		/*!< in: log group */
+	byte*		buf,		/*!< in: buffer */
+	ulint		len,		/*!< in: buffer len; must be divisible
+					by OS_FILE_LOG_BLOCK_SIZE */
+	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
+					be divisible by
+					OS_FILE_LOG_BLOCK_SIZE */
+	ulint		new_data_offset);/*!< in: start offset of new data in
+					buf: this parameter is used to decide
+					if we have to write a new log file
+					header */
+/********************************************************//**
+Sets the field values in group to correspond to a given lsn.
For this function +to work, the values must already be correctly initialized to correspond to +some lsn, for instance, a checkpoint lsn. */ +UNIV_INTERN +void +log_group_set_fields( +/*=================*/ + log_group_t* group, /*!< in/out: group */ + lsn_t lsn); /*!< in: lsn for which the values should be + set */ +/******************************************************//** +Calculates the data capacity of a log group, when the log file headers are not +included. +@return capacity in bytes */ +UNIV_INTERN +lsn_t +log_group_get_capacity( +/*===================*/ + const log_group_t* group); /*!< in: log group */ +#endif /* !UNIV_HOTBACKUP */ +/************************************************************//** +Gets a log block flush bit. +@return TRUE if this block was the first to be written in a log flush */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Gets a log block number stored in the header. +@return log block number stored in the block header */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Gets a log block data length. +@return log block data length measured as a byte offset from the block start */ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint len); /*!< in: data length */ +/************************************************************//** +Calculates the checksum for a log block. +@return checksum */ +UNIV_INLINE +ulint +log_block_calc_checksum( +/*====================*/ + const byte* block); /*!< in: log block */ +/************************************************************//** +Gets a log block checksum field value. +@return checksum */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets a log block checksum field value. */ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint checksum); /*!< in: checksum */ +/************************************************************//** +Gets a log block first mtr log record group offset. +@return first mtr log record group byte offset from the block start, 0 +if none */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /*!< in/out: log block */ + ulint offset); /*!< in: offset, 0 if none */ +/************************************************************//** +Gets a log block checkpoint number field (4 lowest bytes). 
+@return checkpoint no (4 lowest bytes) */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Initializes a log block in the log buffer. */ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn); /*!< in: lsn within the log block */ +/************************************************************//** +Initializes a log block in the log buffer in the old, < 3.23.52 format, where +there was no checksum yet. */ +UNIV_INLINE +void +log_block_init_in_old_format( +/*=========================*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn); /*!< in: lsn within the log block */ +/************************************************************//** +Converts a lsn to a log block number. +@return log block number, it is > 0 and <= 1G */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + lsn_t lsn); /*!< in: lsn of a byte within the block */ +/******************************************************//** +Prints info of the log. */ +UNIV_INTERN +void +log_print( +/*======*/ + FILE* file); /*!< in: file where to print */ +/******************************************************//** +Peeks the current lsn. +@return TRUE if success, FALSE if could not get the log system mutex */ +UNIV_INTERN +ibool +log_peek_lsn( +/*=========*/ + lsn_t* lsn); /*!< out: if returns TRUE, current lsn is here */ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +log_refresh_stats(void); +/*===================*/ +/********************************************************//** +Closes all log groups. */ +UNIV_INTERN +void +log_group_close_all(void); +/*=====================*/ +/********************************************************//** +Shutdown the log system but do not release all the memory. */ +UNIV_INTERN +void +log_shutdown(void); +/*==============*/ +/********************************************************//** +Free the log system data structures. */ +UNIV_INTERN +void +log_mem_free(void); +/*==============*/ + +/****************************************************************//** +Safely reads the log_sys->tracked_lsn value. Uses atomic operations +if available, otherwise this field is protected with the log system +mutex. The writer counterpart function is log_set_tracked_lsn() in +log0online.c. + +@return log_sys->tracked_lsn value. 
*/ +UNIV_INLINE +lsn_t +log_get_tracked_lsn(void); +/*=====================*/ + +extern log_t* log_sys; + +/* Values used as flags */ +#define LOG_FLUSH 7652559 +#define LOG_CHECKPOINT 78656949 +#ifdef UNIV_LOG_ARCHIVE +# define LOG_ARCHIVE 11122331 +#endif /* UNIV_LOG_ARCHIVE */ +#define LOG_RECOVER 98887331 + +/* The counting of lsn's starts from this value: this must be non-zero */ +#define LOG_START_LSN ((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE)) + +#define LOG_BUFFER_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE) +#define LOG_ARCHIVE_BUF_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE / 4) + +/* Offsets of a log block header */ +#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and + is allowed to wrap around at 2G; the + highest bit is set to 1 if this is the + first log block in a log flush write + segment */ +#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL + /* mask used to get the highest bit in + the preceding field */ +#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to + this block */ +#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an + mtr log record group in this log block, + 0 if none; if the value is the same + as LOG_BLOCK_HDR_DATA_LEN, it means + that the first rec group has not yet + been catenated to this log block, but + if it will, it will start at this + offset; an archive recovery can + start parsing the log records starting + from this offset in this log block, + if value not 0 */ +#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of + log_sys->next_checkpoint_no when the + log block was last written to: if the + block has not yet been written full, + this value is only updated before a + log buffer flush */ +#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in + bytes */ + +/* Offsets of a log block trailer from the end of the block */ +#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block + contents; in InnoDB versions + < 3.23.52 this did not contain the + checksum but the same value as + .._HDR_NO */ +#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */ + +/* Offsets for a checkpoint field */ +#define LOG_CHECKPOINT_NO 0 +#define LOG_CHECKPOINT_LSN 8 +#define LOG_CHECKPOINT_OFFSET_LOW32 16 +#define LOG_CHECKPOINT_LOG_BUF_SIZE 20 +#define LOG_CHECKPOINT_ARCHIVED_LSN 24 +#define LOG_CHECKPOINT_GROUP_ARRAY 32 + +/* For each value smaller than LOG_MAX_N_GROUPS the following 8 bytes: */ + +#define LOG_CHECKPOINT_ARCHIVED_FILE_NO 0 +#define LOG_CHECKPOINT_ARCHIVED_OFFSET 4 + +#define LOG_CHECKPOINT_ARRAY_END (LOG_CHECKPOINT_GROUP_ARRAY\ + + LOG_MAX_N_GROUPS * 8) +#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END +#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END) +#if 0 +#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END) + /*!< Not used (0); + This used to contain the + current fsp free limit in + tablespace 0, in units of one + megabyte. + + This information might have been used + since mysqlbackup version 0.35 but + before 1.41 to decide if unused ends of + non-auto-extending data files + in space 0 can be truncated. + + This information was made obsolete + by mysqlbackup --compress. 
*/ +#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END) + /*!< Not used (0); + This magic number tells if the + checkpoint contains the above field: + the field was added to + InnoDB-3.23.50 and + removed from MySQL 5.6 */ +#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243 + /*!< if LOG_CHECKPOINT_FSP_MAGIC_N + contains this value, then + LOG_CHECKPOINT_FSP_FREE_LIMIT + is valid */ +#endif +#define LOG_CHECKPOINT_OFFSET_HIGH32 (16 + LOG_CHECKPOINT_ARRAY_END) +#define LOG_CHECKPOINT_SIZE (20 + LOG_CHECKPOINT_ARRAY_END) + + +/* Offsets of a log file header */ +#define LOG_GROUP_ID 0 /* log group number */ +#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this + log file */ +#define LOG_FILE_NO 12 /* 4-byte archived log file number; + this field is only defined in an + archived log file */ +#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16 + /* a 32-byte field which contains + the string 'ibbackup' and the + creation time if the log file was + created by mysqlbackup --restore; + when mysqld is first time started + on the restored database, it can + print helpful info for the user */ +#define LOG_FILE_OS_FILE_LOG_BLOCK_SIZE 64 + /* extend to record log_block_size + of XtraDB. 0 means default 512 */ +#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE + /* this 4-byte field is TRUE when + the writing of an archived log file + has been completed; this field is + only defined in an archived log file */ +#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4) + /* lsn where the archived log file + at least extends: actually the + archived log file may extend to a + later lsn, as long as it is within the + same log block as this lsn; this field + is defined only when an archived log + file has been completely written */ +#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE + /* first checkpoint field in the log + header; we write alternately to the + checkpoint fields when we make new + checkpoints; this field is only defined + in the first log file of a log group */ +#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE) + /* second checkpoint field in the log + header */ +#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE) + +#define LOG_GROUP_OK 301 +#define LOG_GROUP_CORRUPTED 302 + +/** Log group consists of a number of log files, each of the same size; a log +group is implemented as a space in the sense of the module fil0fil. 
*/ +struct log_group_t{ + /* The following fields are protected by log_sys->mutex */ + ulint id; /*!< log group id */ + ulint n_files; /*!< number of files in the group */ + lsn_t file_size; /*!< individual log file size in bytes, + including the log file header */ + ulint space_id; /*!< file space which implements the log + group */ + ulint state; /*!< LOG_GROUP_OK or + LOG_GROUP_CORRUPTED */ + lsn_t lsn; /*!< lsn used to fix coordinates within + the log group */ + lsn_t lsn_offset; /*!< the offset of the above lsn */ + ulint n_pending_writes;/*!< number of currently pending flush + writes for this log group */ + byte** file_header_bufs_ptr;/*!< unaligned buffers */ + byte** file_header_bufs;/*!< buffers for each file + header in the group */ +#ifdef UNIV_LOG_ARCHIVE + /*-----------------------------*/ + byte** archive_file_header_bufs_ptr;/*!< unaligned buffers */ + byte** archive_file_header_bufs;/*!< buffers for each file + header in the group */ + ulint archive_space_id;/*!< file space which + implements the log group + archive */ + lsn_t archived_file_no;/*!< file number corresponding to + log_sys->archived_lsn */ + lsn_t archived_offset;/*!< file offset corresponding to + log_sys->archived_lsn, 0 if we have + not yet written to the archive file + number archived_file_no */ + lsn_t next_archived_file_no;/*!< during an archive write, + until the write is completed, we + store the next value for + archived_file_no here: the write + completion function then sets the new + value to ..._file_no */ + lsn_t next_archived_offset; /*!< like the preceding field */ +#endif /* UNIV_LOG_ARCHIVE */ + /*-----------------------------*/ + lsn_t scanned_lsn; /*!< used only in recovery: recovery scan + succeeded up to this lsn in this log + group */ + byte* checkpoint_buf_ptr;/*!< unaligned checkpoint header */ + byte* checkpoint_buf; /*!< checkpoint header is written from + this buffer to the group */ + UT_LIST_NODE_T(log_group_t) + log_groups; /*!< list of log groups */ +}; + +/** Redo log buffer */ +struct log_t{ + byte pad[64]; /*!< padding to prevent other memory + update hotspots from residing on the + same memory cache line */ + lsn_t lsn; /*!< log sequence number */ + ulint buf_free; /*!< first free offset within the log + buffer */ +#ifndef UNIV_HOTBACKUP + ib_prio_mutex_t mutex; /*!< mutex protecting the log */ + + ib_mutex_t log_flush_order_mutex;/*!< mutex to serialize access to + the flush list when we are putting + dirty blocks in the list. The idea + behind this mutex is to be able + to release log_sys->mutex during + mtr_commit and still ensure that + insertions in the flush_list happen + in the LSN order. 
*/
+#endif /* !UNIV_HOTBACKUP */
+	byte*		buf_ptr;	/*!< unaligned log buffer */
+	byte*		buf;		/*!< log buffer */
+	ulint		buf_size;	/*!< log buffer size in bytes */
+	ulint		max_buf_free;	/*!< recommended maximum value of
+					buf_free, after which the buffer is
+					flushed */
+#ifdef UNIV_LOG_DEBUG
+	ulint		old_buf_free;	/*!< value of buf_free when the log
+					was last opened; only in the debug
+					version */
+	ib_uint64_t	old_lsn;	/*!< value of lsn when the log was
+					last opened; only in the
+					debug version */
+#endif /* UNIV_LOG_DEBUG */
+	ibool		check_flush_or_checkpoint;
+					/*!< this is set to TRUE when there may
+					be a need to flush the log buffer, or
+					preflush buffer pool pages, or make
+					a checkpoint; this MUST be TRUE when
+					lsn - last_checkpoint_lsn >
+					max_checkpoint_age; this flag is
+					peeked at by log_free_check(), which
+					does not reserve the log mutex */
+	UT_LIST_BASE_NODE_T(log_group_t)
+			log_groups;	/*!< log groups */
+
+#ifndef UNIV_HOTBACKUP
+	/** The fields involved in the log buffer flush @{ */
+
+	ulint		buf_next_to_write;/*!< first offset in the log buffer
+					whose byte content may not yet have
+					been written to file, e.g., the start
+					offset of a log record catenated
+					later; this is advanced when a flush
+					operation is completed to all the log
+					groups */
+	volatile bool	is_extending;	/*!< this is set to true while the
+					log buffer is being extended */
+	lsn_t		written_to_some_lsn;
+					/*!< first log sequence number not yet
+					written to any log group; for this to
+					be advanced, it is enough that the
+					write i/o has been completed for any
+					one log group */
+	lsn_t		written_to_all_lsn;
+					/*!< first log sequence number not yet
+					written to some log group; for this to
+					be advanced, it is enough that the
+					write i/o has been completed for all
+					log groups.
+					Note that since InnoDB currently
+					has only one log group, this
+					value is redundant. Also it
+					is possible that this value
+					falls behind the
+					flushed_to_disk_lsn transiently.
+					It is appropriate to use either
+					flushed_to_disk_lsn or
+					write_lsn, which are always
+					up-to-date and accurate. */
+	lsn_t		write_lsn;	/*!< end lsn for the current running
+					write */
+	ulint		write_end_offset;/*!< the data in buffer has
+					been written up to this offset
+					when the current write ends:
+					this field will then be copied
+					to buf_next_to_write */
+	lsn_t		current_flush_lsn;/*!< end lsn for the current running
+					write + flush operation */
+	lsn_t		flushed_to_disk_lsn;
+					/*!< how far we have written the log
+					AND flushed to disk */
+	ulint		n_pending_writes;/*!< number of currently
+					pending flushes or writes */
+	/* NOTE on the 'flush' in names of the fields below: starting from
+	4.0.14, we separate the write of the log file and the actual fsync()
+	or other method to flush it to disk. The names below should really
+	be 'flush_or_write'! */
+	os_event_t	no_flush_event;	/*!< this event is in the reset state
+					when a flush or a write is running;
+					a thread should wait for this without
+					owning the log mutex, but NOTE that
+					to set or reset this event, the
+					thread MUST own the log mutex!
*/ + ibool one_flushed; /*!< during a flush, this is + first FALSE and becomes TRUE + when one log group has been + written or flushed */ + os_event_t one_flushed_event;/*!< this event is reset when the + flush or write has not yet completed + for any log group; e.g., this means + that a transaction has been committed + when this is set; a thread should wait + for this without owning the log mutex, + but NOTE that to set or reset this + event, the thread MUST own the log + mutex! */ + ulint n_log_ios; /*!< number of log i/os initiated thus + far */ + ulint n_log_ios_old; /*!< number of log i/o's at the + previous printout */ + time_t last_printout_time;/*!< when log_print was last time + called */ + /* @} */ + + /** Fields involved in checkpoints @{ */ + lsn_t log_group_capacity; /*!< capacity of the log group; if + the checkpoint age exceeds this, it is + a serious error because it is possible + we will then overwrite log and spoil + crash recovery */ + lsn_t max_modified_age_async; + /*!< when this recommended + value for lsn - + buf_pool_get_oldest_modification() + is exceeded, we start an + asynchronous preflush of pool pages */ + lsn_t max_modified_age_sync; + /*!< when this recommended + value for lsn - + buf_pool_get_oldest_modification() + is exceeded, we start a + synchronous preflush of pool pages */ + lsn_t max_checkpoint_age_async; + /*!< when this checkpoint age + is exceeded we start an + asynchronous writing of a new + checkpoint */ + lsn_t max_checkpoint_age; + /*!< this is the maximum allowed value + for lsn - last_checkpoint_lsn when a + new query step is started */ + ib_uint64_t next_checkpoint_no; + /*!< next checkpoint number */ + lsn_t last_checkpoint_lsn; + /*!< latest checkpoint lsn */ + lsn_t next_checkpoint_lsn; + /*!< next checkpoint lsn */ + ulint n_pending_checkpoint_writes; + /*!< number of currently pending + checkpoint writes */ + rw_lock_t checkpoint_lock;/*!< this latch is x-locked when a + checkpoint write is running; a thread + should wait for this without owning + the log mutex */ +#endif /* !UNIV_HOTBACKUP */ + byte* checkpoint_buf_ptr;/* unaligned checkpoint header */ + byte* checkpoint_buf; /*!< checkpoint header is read to this + buffer */ + /* @} */ +#ifdef UNIV_LOG_ARCHIVE + /** Fields involved in archiving @{ */ + ulint archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING + LOG_ARCH_STOPPED, LOG_ARCH_OFF */ + lsn_t archived_lsn; /*!< archiving has advanced to this + lsn */ + lsn_t max_archived_lsn_age_async; + /*!< recommended maximum age of + archived_lsn, before we start + asynchronous copying to the archive */ + lsn_t max_archived_lsn_age; + /*!< maximum allowed age for + archived_lsn */ + lsn_t next_archived_lsn;/*!< during an archive write, + until the write is completed, we + store the next value for + archived_lsn here: the write + completion function then sets the new + value to archived_lsn */ + ulint archiving_phase;/*!< LOG_ARCHIVE_READ or + LOG_ARCHIVE_WRITE */ + ulint n_pending_archive_ios; + /*!< number of currently pending reads + or writes in archiving */ + rw_lock_t archive_lock; /*!< this latch is x-locked when an + archive write is running; a thread + should wait for this without owning + the log mutex */ + ulint archive_buf_size;/*!< size of archive_buf */ + byte* archive_buf_ptr;/*!< unaligned archived_buf */ + byte* archive_buf; /*!< log segment is written to the + archive from this buffer */ + os_event_t archiving_on; /*!< if archiving has been stopped, + a thread can wait for this event to + become signaled */ + /* @} */ 
+#endif /* UNIV_LOG_ARCHIVE */ + lsn_t tracked_lsn; /*!< log tracking has advanced to this + lsn. Field accessed atomically where + 64-bit atomic ops are supported, + protected by the log sys mutex + otherwise. */ +}; + +/** Test if flush order mutex is owned. */ +#define log_flush_order_mutex_own() \ + mutex_own(&log_sys->log_flush_order_mutex) + +/** Acquire the flush order mutex. */ +#define log_flush_order_mutex_enter() do { \ + mutex_enter(&log_sys->log_flush_order_mutex); \ +} while (0) +/** Release the flush order mutex. */ +# define log_flush_order_mutex_exit() do { \ + mutex_exit(&log_sys->log_flush_order_mutex); \ +} while (0) + +#ifdef UNIV_LOG_ARCHIVE +/** Archiving state @{ */ +#define LOG_ARCH_ON 71 +#define LOG_ARCH_STOPPING 72 +#define LOG_ARCH_STOPPING2 73 +#define LOG_ARCH_STOPPED 74 +#define LOG_ARCH_OFF 75 +/* @} */ +#endif /* UNIV_LOG_ARCHIVE */ + +#ifndef UNIV_NONINL +#include "log0log.ic" +#endif + +#endif diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic new file mode 100644 index 00000000000..6402c7df1e7 --- /dev/null +++ b/storage/xtradb/include/log0log.ic @@ -0,0 +1,535 @@ +/***************************************************************************** + +Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0log.ic +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" +#include "mach0data.h" +#include "mtr0mtr.h" +#include "srv0mon.h" +#include "srv0srv.h" +#include "ut0crc32.h" + +#ifdef UNIV_LOG_DEBUG +/******************************************************//** +Checks by parsing that the catenated log segment for a single mtr is +consistent. */ +UNIV_INTERN +ibool +log_check_log_recs( +/*===============*/ + const byte* buf, /*!< in: pointer to the start of + the log segment in the + log_sys->buf log buffer */ + ulint len, /*!< in: segment length in bytes */ + ib_uint64_t buf_start_lsn); /*!< in: buffer start lsn */ +#endif /* UNIV_LOG_DEBUG */ + +/************************************************************//** +Gets a log block flush bit. +@return TRUE if this block was the first to be written in a log flush */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + const byte* log_block) /*!< in: log block */ +{ + if (LOG_BLOCK_FLUSH_BIT_MASK + & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************//** +Sets the log block flush bit. 
*/ +UNIV_INLINE +void +log_block_set_flush_bit( +/*====================*/ + byte* log_block, /*!< in/out: log block */ + ibool val) /*!< in: value to set */ +{ + ulint field; + + field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO); + + if (val) { + field = field | LOG_BLOCK_FLUSH_BIT_MASK; + } else { + field = field & ~LOG_BLOCK_FLUSH_BIT_MASK; + } + + mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field); +} + +/************************************************************//** +Gets a log block number stored in the header. +@return log block number stored in the block header */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + const byte* log_block) /*!< in: log block */ +{ + return(~LOG_BLOCK_FLUSH_BIT_MASK + & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)); +} + +/************************************************************//** +Sets the log block number stored in the header; NOTE that this must be set +before the flush bit! */ +UNIV_INLINE +void +log_block_set_hdr_no( +/*=================*/ + byte* log_block, /*!< in/out: log block */ + ulint n) /*!< in: log block number: must be > 0 and + < LOG_BLOCK_FLUSH_BIT_MASK */ +{ + ut_ad(n > 0); + ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK); + + mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n); +} + +/************************************************************//** +Gets a log block data length. +@return log block data length measured as a byte offset from the block start */ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN)); +} + +/************************************************************//** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint len) /*!< in: data length */ +{ + mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len); +} + +/************************************************************//** +Gets a log block first mtr log record group offset. +@return first mtr log record group byte offset from the block start, 0 +if none */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP)); +} + +/************************************************************//** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /*!< in/out: log block */ + ulint offset) /*!< in: offset, 0 if none */ +{ + mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset); +} + +/************************************************************//** +Gets a log block checkpoint number field (4 lowest bytes). +@return checkpoint no (4 lowest bytes) */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO)); +} + +/************************************************************//** +Sets a log block checkpoint number field (4 lowest bytes). 
*/ +UNIV_INLINE +void +log_block_set_checkpoint_no( +/*========================*/ + byte* log_block, /*!< in/out: log block */ + ib_uint64_t no) /*!< in: checkpoint no */ +{ + mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO, (ulint) no); +} + +/************************************************************//** +Converts a lsn to a log block number. +@return log block number, it is > 0 and <= 1G */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + lsn_t lsn) /*!< in: lsn of a byte within the block */ +{ + return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1); +} + +/************************************************************//** +Calculates the checksum for a log block using the current algorithm. +@return checksum */ +UNIV_INLINE +ulint +log_block_calc_checksum( +/*====================*/ + const byte* block) /*!< in: log block */ +{ + return(log_checksum_algorithm_ptr(block)); +} +/************************************************************//** +Calculates the checksum for a log block using the default InnoDB algorithm. +@return checksum */ +UNIV_INLINE +ulint +log_block_calc_checksum_innodb( +/*===========================*/ + const byte* block) /*!< in: log block */ +{ + ulint sum; + ulint sh; + ulint i; + + sum = 1; + sh = 0; + + for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) { + ulint b = (ulint) block[i]; + sum &= 0x7FFFFFFFUL; + sum += b; + sum += b << sh; + sh++; + if (sh > 24) { + sh = 0; + } + } + + return(sum); +} + +/************************************************************//** +Calculates the checksum for a log block using the CRC32 algorithm. +@return checksum */ +UNIV_INLINE +ulint +log_block_calc_checksum_crc32( +/*==========================*/ + const byte* block) /*!< in: log block */ +{ + return(ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE)); +} + +/************************************************************//** +Calculates the checksum for a log block using the "no-op" algorithm. +@return checksum */ +UNIV_INLINE +ulint +log_block_calc_checksum_none( +/*=========================*/ + const byte* block) /*!< in: log block */ +{ + return(LOG_NO_CHECKSUM_MAGIC); +} + +/************************************************************//** +Gets a log block checksum field value. +@return checksum */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM)); +} + +/************************************************************//** +Sets a log block checksum field value. */ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint checksum) /*!< in: checksum */ +{ + mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM, + checksum); +} + +/************************************************************//** +Initializes a log block in the log buffer. 
*/ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn) /*!< in: lsn within the log block */ +{ + ulint no; + + ut_ad(mutex_own(&(log_sys->mutex))); + + no = log_block_convert_lsn_to_no(lsn); + + log_block_set_hdr_no(log_block, no); + + log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); + log_block_set_first_rec_group(log_block, 0); +} + +/************************************************************//** +Initializes a log block in the log buffer in the old format, where there +was no checksum yet. */ +UNIV_INLINE +void +log_block_init_in_old_format( +/*=========================*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn) /*!< in: lsn within the log block */ +{ + ulint no; + + ut_ad(mutex_own(&(log_sys->mutex))); + + no = log_block_convert_lsn_to_no(lsn); + + log_block_set_hdr_no(log_block, no); + mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM, no); + log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); + log_block_set_first_rec_group(log_block, 0); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Writes to the log the string given. The log must be released with +log_release. +@return end lsn of the log record, zero if did not succeed */ +UNIV_INLINE +lsn_t +log_reserve_and_write_fast( +/*=======================*/ + const void* str, /*!< in: string */ + ulint len, /*!< in: string length */ + lsn_t* start_lsn)/*!< out: start lsn of the log record */ +{ + ulint data_len; +#ifdef UNIV_LOG_LSN_DEBUG + /* length of the LSN pseudo-record */ + ulint lsn_len; +#endif /* UNIV_LOG_LSN_DEBUG */ + + mutex_enter(&log_sys->mutex); +#ifdef UNIV_LOG_LSN_DEBUG + lsn_len = 1 + + mach_get_compressed_size(log_sys->lsn >> 32) + + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL); +#endif /* UNIV_LOG_LSN_DEBUG */ + + data_len = len +#ifdef UNIV_LOG_LSN_DEBUG + + lsn_len +#endif /* UNIV_LOG_LSN_DEBUG */ + + log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE; + + if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + + /* The string does not fit within the current log block or the + log block would become full. Do not release the log mutex, + because it has to be reacquired immediately for the "slow" write + procedure via log_write_low(). */ + + return(0); + } + + *start_lsn = log_sys->lsn; + +#ifdef UNIV_LOG_LSN_DEBUG + { + /* Write the LSN pseudo-record. */ + byte* b = &log_sys->buf[log_sys->buf_free]; + *b++ = MLOG_LSN | (MLOG_SINGLE_REC_FLAG & *(const byte*) str); + /* Write the LSN in two parts, + as a pseudo page number and space id. 
*/ + b += mach_write_compressed(b, log_sys->lsn >> 32); + b += mach_write_compressed(b, log_sys->lsn & 0xFFFFFFFFUL); + ut_a(b - lsn_len == &log_sys->buf[log_sys->buf_free]); + + memcpy(b, str, len); + len += lsn_len; + } +#else /* UNIV_LOG_LSN_DEBUG */ + memcpy(log_sys->buf + log_sys->buf_free, str, len); +#endif /* UNIV_LOG_LSN_DEBUG */ + + log_block_set_data_len((byte*) ut_align_down(log_sys->buf + + log_sys->buf_free, + OS_FILE_LOG_BLOCK_SIZE), + data_len); +#ifdef UNIV_LOG_DEBUG + log_sys->old_buf_free = log_sys->buf_free; + log_sys->old_lsn = log_sys->lsn; +#endif + log_sys->buf_free += len; + + ut_ad(log_sys->buf_free <= log_sys->buf_size); + + log_sys->lsn += len; + + MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, + log_sys->lsn - log_sys->last_checkpoint_lsn); + +#ifdef UNIV_LOG_DEBUG + log_check_log_recs(log_sys->buf + log_sys->old_buf_free, + log_sys->buf_free - log_sys->old_buf_free, + log_sys->old_lsn); +#endif + return(log_sys->lsn); +} + +/**************************************************************************//** +Locks the log mutex and opens the log for log_write_low. The log must be closed +with log_close and released with log_release. +@return start lsn of the log record */ +UNIV_INLINE +ib_uint64_t +log_reserve_and_open( +/*=================*/ + ulint len) /*!< in: length of data to be catenated */ +{ + mutex_enter(&(log_sys->mutex)); + + return log_open(len); +} + +/***********************************************************************//** +Releases the log mutex. */ +UNIV_INLINE +void +log_release(void) +/*=============*/ +{ + mutex_exit(&(log_sys->mutex)); +} + +/************************************************************//** +Gets the current lsn. +@return current lsn */ +UNIV_INLINE +lsn_t +log_get_lsn(void) +/*=============*/ +{ + lsn_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + return(lsn); +} + +/**************************************************************** +Gets the log group capacity. It is OK to read the value without +holding log_sys->mutex because it is constant. +@return log group capacity */ +UNIV_INLINE +lsn_t +log_get_capacity(void) +/*==================*/ +{ + return(log_sys->log_group_capacity); +} + +/**************************************************************** +Get log_sys::max_modified_age_async. It is OK to read the value without +holding log_sys::mutex because it is constant. +@return max_modified_age_async */ +UNIV_INLINE +lsn_t +log_get_max_modified_age_async(void) +/*================================*/ +{ + return(log_sys->max_modified_age_async); +} + +/***********************************************************************//** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void) +/*================*/ +{ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_except_dict()); +#endif /* UNIV_SYNC_DEBUG */ + + if (log_sys->check_flush_or_checkpoint) { + + log_check_margins(); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/****************************************************************//** +Safely reads the log_sys->tracked_lsn value. Uses atomic operations +if available, otherwise this field is protected with the log system +mutex. 
The writer counterpart function is log_set_tracked_lsn() in
+log0online.c.
+
+@return log_sys->tracked_lsn value. */
+UNIV_INLINE
+lsn_t
+log_get_tracked_lsn(void)
+/*=====================*/
+{
+#ifdef HAVE_ATOMIC_BUILTINS_64
+	return os_atomic_increment_uint64(&log_sys->tracked_lsn, 0);
+#else
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	return log_sys->tracked_lsn;
+#endif
+}
+
diff --git a/storage/xtradb/include/log0online.h b/storage/xtradb/include/log0online.h
new file mode 100644
index 00000000000..1ef4df7d6da
--- /dev/null
+++ b/storage/xtradb/include/log0online.h
@@ -0,0 +1,192 @@
+/*****************************************************************************
+
+Copyright (c) 2011-2012, Percona Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0online.h
+Online database log parsing for changed page tracking
+*******************************************************/
+
+#ifndef log0online_h
+#define log0online_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "log0log.h"
+
+/** Single bitmap file information */
+typedef struct log_online_bitmap_file_struct log_online_bitmap_file_t;
+
+/** A set of bitmap files containing some LSN range */
+typedef struct log_online_bitmap_file_range_struct
+log_online_bitmap_file_range_t;
+
+/** An iterator over changed page info */
+typedef struct log_bitmap_iterator_struct log_bitmap_iterator_t;
+
+/*********************************************************************//**
+Initializes the online log following subsystem. */
+UNIV_INTERN
+void
+log_online_read_init(void);
+/*=======================*/
+
+/*********************************************************************//**
+Shuts down the online log following subsystem. */
+UNIV_INTERN
+void
+log_online_read_shutdown(void);
+/*===========================*/
+
+/*********************************************************************//**
+Reads and parses the redo log up to the last checkpoint LSN to build the
+changed page bitmap, which is then written to disk.
+
+@return TRUE if log tracking succeeded, FALSE if bitmap write I/O error */
+UNIV_INTERN
+ibool
+log_online_follow_redo_log(void);
+/*=============================*/
+
+/************************************************************//**
+Deletes all the bitmap files for data less than the specified LSN.
+If called with lsn == 0 (i.e. set by RESET request) or
+IB_ULONGLONG_MAX, restarts the bitmap file sequence, otherwise
+continues it.
+
+@return FALSE to indicate success, TRUE for failure. */
+UNIV_INTERN
+ibool
+log_online_purge_changed_page_bitmaps(
+/*==================================*/
+	ib_uint64_t lsn);	/*!<in: LSN to purge files up to */
+
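The iterator macros and functions declared just below form the read side
of the changed-page tracking API (inside XtraDB they back, e.g., the
INFORMATION_SCHEMA.INNODB_CHANGED_PAGES table). A sketch of a consumer
(editorial, not part of the patch; the function name and LSN bounds are
hypothetical):

#include "log0online.h"

/* Visit every page recorded as changed in roughly [min_lsn, max_lsn);
records below min_lsn may also be returned, per the init contract. */
static ibool
scan_changed_pages_sketch(lsn_t min_lsn, lsn_t max_lsn)
{
	log_bitmap_iterator_t	i;

	if (!log_online_bitmap_iterator_init(&i, min_lsn, max_lsn)) {
		return(FALSE);
	}

	while (log_online_bitmap_iterator_next(&i)) {
		if (LOG_BITMAP_ITERATOR_PAGE_CHANGED(i)) {
			/* Page LOG_BITMAP_ITERATOR_PAGE_NUM(i) of space
			LOG_BITMAP_ITERATOR_SPACE_ID(i) changed somewhere
			in [LOG_BITMAP_ITERATOR_START_LSN(i),
			LOG_BITMAP_ITERATOR_END_LSN(i)). */
		}
	}

	log_online_bitmap_iterator_release(&i);

	/* The failed flag distinguishes an I/O error from normal
	end-of-data (see log_bitmap_iterator_struct below). */
	return(!i.failed);
}
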
+#define LOG_BITMAP_ITERATOR_START_LSN(i) \ + ((i).start_lsn) +#define LOG_BITMAP_ITERATOR_END_LSN(i) \ + ((i).end_lsn) +#define LOG_BITMAP_ITERATOR_SPACE_ID(i) \ + ((i).space_id) +#define LOG_BITMAP_ITERATOR_PAGE_NUM(i) \ + ((i).first_page_id + (i).bit_offset) +#define LOG_BITMAP_ITERATOR_PAGE_CHANGED(i) \ + ((i).changed) + +/*********************************************************************//** +Initializes log bitmap iterator. The minimum LSN is used for finding the +correct starting file with records, and there may be records returned by +the iterator that have LSN less than start_lsn. + +@return TRUE if the iterator is initialized OK, FALSE otherwise. */ +UNIV_INTERN +ibool +log_online_bitmap_iterator_init( +/*============================*/ + log_bitmap_iterator_t *i, /*!<in/out: iterator */ + lsn_t min_lsn, /*!<in: start LSN for the + iterator */ + lsn_t max_lsn); /*!<in: end LSN for the + iterator */ + +/*********************************************************************//** +Releases log bitmap iterator. */ +UNIV_INTERN +void +log_online_bitmap_iterator_release( +/*===============================*/ + log_bitmap_iterator_t *i); /*!<in/out: iterator */ + +/*********************************************************************//** +Iterates through bits of saved bitmap blocks. +Sequentially reads blocks from bitmap file(s) and iterates through +their bits. Ignores blocks with wrong checksum. +@return TRUE if iteration is successful, FALSE if all bits are iterated. */ +UNIV_INTERN +ibool +log_online_bitmap_iterator_next( +/*============================*/ + log_bitmap_iterator_t *i); /*!<in/out: iterator */
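The iterator is meant to be driven as init / next-loop / release, with the LOG_BITMAP_ITERATOR_* macros reading out each changed-page record. A minimal usage sketch under that assumption; process_page() and the LSN bounds are placeholders, not names from this patch:

/* Illustrative scan over the changed-page bitmap between two LSNs. */
void
scan_changed_pages(lsn_t min_lsn, lsn_t max_lsn)
{
	log_bitmap_iterator_t	iter;

	if (!log_online_bitmap_iterator_init(&iter, min_lsn, max_lsn)) {
		return;	/* no usable bitmap data for this range */
	}

	/* FALSE from _next means all bits have been iterated; blocks
	with a wrong checksum are skipped silently. */
	while (log_online_bitmap_iterator_next(&iter)) {
		if (LOG_BITMAP_ITERATOR_PAGE_CHANGED(iter)) {
			/* process_page() is a placeholder consumer. */
			process_page(LOG_BITMAP_ITERATOR_SPACE_ID(iter),
				     LOG_BITMAP_ITERATOR_PAGE_NUM(iter));
		}
	}

	log_online_bitmap_iterator_release(&iter);
}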
+ +/** Struct for single bitmap file information */ +struct log_online_bitmap_file_struct { + char name[FN_REFLEN]; /*!< Name with full path */ + os_file_t file; /*!< Handle to opened file */ + ib_uint64_t size; /*!< Size of the file */ + os_offset_t offset; /*!< Offset of the next read, + or count of already-read bytes + */ +}; + +/** Struct for a set of bitmap files containing some LSN range */ +struct log_online_bitmap_file_range_struct { + size_t count; /*!< Number of files */ + /*!< Dynamically-allocated array of info about individual files */ + struct files_t { + char name[FN_REFLEN]; /*!< Name of a file */ + lsn_t start_lsn; /*!< Starting LSN of data in + this file */ + ulong seq_num; /*!< Sequence number of this + file */ + } *files; +}; + +/** Struct for an iterator through all bits of changed pages bitmap blocks */ +struct log_bitmap_iterator_struct +{ + ibool failed; /*!< Has the iteration + stopped prematurely */ + log_online_bitmap_file_range_t in_files; /*!< The bitmap files + for this iterator */ + size_t in_i; /*!< Currently read + file index in in_files + */ + log_online_bitmap_file_t in; /*!< Currently read + file */ + ib_uint32_t bit_offset; /*!< bit offset inside + the current bitmap + block */ + lsn_t start_lsn; /*!< Start LSN of the + current bitmap block */ + lsn_t end_lsn; /*!< End LSN of the + current bitmap block */ + ib_uint32_t space_id; /*!< Current block + space id */ + ib_uint32_t first_page_id; /*!< Id of the first + page in the current + block */ + ibool last_page_in_run;/*!< "Last page in + run" flag value for the + current block */ + ibool changed; /*!< true if current + page was changed */ + byte* page; /*!< Bitmap block */ +}; + +#endif diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h new file mode 100644 index 00000000000..805b6c66768 --- /dev/null +++ b/storage/xtradb/include/log0recv.h @@ -0,0 +1,542 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0recv.h +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#ifndef log0recv_h +#define log0recv_h + +#include "univ.i" +#include "ut0byte.h" +#include "buf0types.h" +#include "hash0hash.h" +#include "log0log.h" +#include <list> + +/******************************************************//** +Checks the 4-byte checksum to the trailer checksum field of a log +block.
We also accept a log block in the old format before +InnoDB-3.23.52 where the checksum field contains the log block number. +@return TRUE if ok, or if the log block may be in the format of InnoDB +version predating 3.23.52 */ +UNIV_INTERN +ibool +log_block_checksum_is_ok_or_old_format( +/*===================================*/ + const byte* block); /*!< in: pointer to a log block */ + +/*******************************************************//** +Calculates the new value for lsn when more data is added to the log. */ +UNIV_INTERN +ib_uint64_t +recv_calc_lsn_on_data_add( +/*======================*/ + lsn_t lsn, /*!< in: old lsn */ + ib_uint64_t len); /*!< in: this many bytes of data is + added, log block headers not included */ + +#ifdef UNIV_HOTBACKUP +extern ibool recv_replay_file_ops; + +/*******************************************************************//** +Reads the checkpoint info needed in hot backup. +@return TRUE if success */ +UNIV_INTERN +ibool +recv_read_checkpoint_info_for_backup( +/*=================================*/ + const byte* hdr, /*!< in: buffer containing the log group + header */ + lsn_t* lsn, /*!< out: checkpoint lsn */ + lsn_t* offset, /*!< out: checkpoint offset in the log group */ + lsn_t* cp_no, /*!< out: checkpoint number */ + lsn_t* first_header_lsn) + /*!< out: lsn of the start of the + first log file */ + __attribute__((nonnull)); +/*******************************************************************//** +Scans the log segment and n_bytes_scanned is set to the length of valid +log scanned. */ +UNIV_INTERN +void +recv_scan_log_seg_for_backup( +/*=========================*/ + byte* buf, /*!< in: buffer containing log data */ + ulint buf_len, /*!< in: data length in that buffer */ + lsn_t* scanned_lsn, /*!< in/out: lsn of buffer start, + we return scanned lsn */ + ulint* scanned_checkpoint_no, + /*!< in/out: 4 lowest bytes of the + highest scanned checkpoint number so + far */ + ulint* n_bytes_scanned);/*!< out: how much we were able to + scan, smaller than buf_len if log + data ended here */ +#endif /* UNIV_HOTBACKUP */ +/*******************************************************************//** +Returns TRUE if recovery is currently running. +@return recv_recovery_on */ +UNIV_INLINE +ibool +recv_recovery_is_on(void); +/*=====================*/ +#ifdef UNIV_LOG_ARCHIVE +/*******************************************************************//** +Returns TRUE if recovery from backup is currently running. +@return recv_recovery_from_backup_on */ +UNIV_INLINE +ibool +recv_recovery_from_backup_is_on(void); +/*=================================*/ +#endif /* UNIV_LOG_ARCHIVE */ +/************************************************************************//** +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. */ +UNIV_INTERN +void +recv_recover_page_func( +/*===================*/ +#ifndef UNIV_HOTBACKUP + ibool just_read_in, + /*!< in: TRUE if the i/o handler calls + this for a freshly read page */ +#endif /* !UNIV_HOTBACKUP */ + buf_block_t* block); /*!< in/out: buffer block */ +#ifndef UNIV_HOTBACKUP +/** Wrapper for recv_recover_page_func(). +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. +@param jri in: TRUE if just read in (the i/o handler calls this for +a freshly read page) +@param block in/out: the buffer block +*/ +# define recv_recover_page(jri, block) recv_recover_page_func(jri, block) +#else /* !UNIV_HOTBACKUP */ +/** Wrapper for recv_recover_page_func(). +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. +@param jri in: TRUE if just read in (the i/o handler calls this for +a freshly read page) +@param block in/out: the buffer block +*/ +# define recv_recover_page(jri, block) recv_recover_page_func(block) +#endif /* !UNIV_HOTBACKUP */
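recv_recover_page() keeps one call-site signature while the underlying function loses its just_read_in parameter in UNIV_HOTBACKUP builds: the wrapper macro simply drops the argument at compile time. A toy sketch of the same pattern outside InnoDB; every name here is invented for illustration:

#include <stdio.h>

/* #define HOT_BACKUP */	/* toggle to exercise the other branch */

#ifndef HOT_BACKUP
static void do_work_func(int verbose, const char* what)
{
	if (verbose) { printf("working on %s\n", what); }
}
/* full build: the macro passes both arguments through */
# define do_work(verbose, what) do_work_func(verbose, what)
#else
static void do_work_func(const char* what)
{
	printf("working on %s\n", what);
}
/* reduced build: the macro swallows the first argument */
# define do_work(verbose, what) do_work_func(what)
#endif

int
main(void)
{
	do_work(1, "page 42");	/* identical call site in both builds */
	return 0;
}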
+/********************************************************//** +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +recv_recovery_from_checkpoint_start_func( +/*=====================================*/ +#ifdef UNIV_LOG_ARCHIVE + ulint type, /*!< in: LOG_CHECKPOINT or + LOG_ARCHIVE */ + lsn_t limit_lsn, /*!< in: recover up to this lsn + if possible */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t min_flushed_lsn,/*!< in: min flushed lsn from + data files */ + lsn_t max_flushed_lsn);/*!< in: max flushed lsn from + data files */ +#ifdef UNIV_LOG_ARCHIVE +/** Wrapper for recv_recovery_from_checkpoint_start_func(). +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. +@param type in: LOG_CHECKPOINT or LOG_ARCHIVE +@param lim in: recover up to this log sequence number if possible +@param min in: minimum flushed log sequence number from data files +@param max in: maximum flushed log sequence number from data files +@return error code or DB_SUCCESS */ +# define recv_recovery_from_checkpoint_start(type,lim,min,max) \ + recv_recovery_from_checkpoint_start_func(type,lim,min,max) +#else /* UNIV_LOG_ARCHIVE */ +/** Wrapper for recv_recovery_from_checkpoint_start_func(). +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. +@param type ignored: LOG_CHECKPOINT or LOG_ARCHIVE +@param lim ignored: recover up to this log sequence number if possible +@param min in: minimum flushed log sequence number from data files +@param max in: maximum flushed log sequence number from data files +@return error code or DB_SUCCESS */ +# define recv_recovery_from_checkpoint_start(type,lim,min,max) \ + recv_recovery_from_checkpoint_start_func(min,max) +#endif /* UNIV_LOG_ARCHIVE */ +/********************************************************//** +Completes recovery from a checkpoint. */ +UNIV_INTERN +void +recv_recovery_from_checkpoint_finish(void); +/*======================================*/ +/********************************************************//** +Initiates the rollback of active transactions.
*/ +UNIV_INTERN +void +recv_recovery_rollback_active(void); +/*===============================*/ + +/*******************************************************************//** +Tries to parse a single log record and returns its length. +@return length of the record, or 0 if the record was not complete */ +UNIV_INTERN +ulint +recv_parse_log_rec( +/*===============*/ + byte* ptr, /*!< in: pointer to a buffer */ + byte* end_ptr,/*!< in: pointer to the buffer end */ + byte* type, /*!< out: type */ + ulint* space, /*!< out: space id */ + ulint* page_no,/*!< out: page number */ + byte** body); /*!< out: log record body start */ + +/*******************************************************//** +Scans log from a buffer and stores new log data to the parsing buffer. +Parses and hashes the log records if new data found. Unless +UNIV_HOTBACKUP is defined, this function will apply log records +automatically when the hash table becomes full. +@return TRUE if limit_lsn has been reached, or not able to scan any +more in this log group */ +UNIV_INTERN +ibool +recv_scan_log_recs( +/*===============*/ + ulint available_memory,/*!< in: we let the hash table of recs + to grow to this size, at the maximum */ + ibool store_to_hash, /*!< in: TRUE if the records should be + stored to the hash table; this is set + to FALSE if just debug checking is + needed */ + const byte* buf, /*!< in: buffer containing a log + segment or garbage */ + ulint len, /*!< in: buffer length */ + lsn_t start_lsn, /*!< in: buffer start lsn */ + lsn_t* contiguous_lsn, /*!< in/out: it is known that all log + groups contain contiguous log data up + to this lsn */ + lsn_t* group_scanned_lsn);/*!< out: scanning succeeded up to + this lsn */ +/******************************************************//** +Resets the logs. The contents of log files will be lost! */ +UNIV_INTERN +void +recv_reset_logs( +/*============*/ +#ifdef UNIV_LOG_ARCHIVE + ulint arch_log_no, /*!< in: next archived log file number */ + ibool new_logs_created,/*!< in: TRUE if resetting logs + is done at the log creation; + FALSE if it is done after + archive recovery */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t lsn); /*!< in: reset to this lsn + rounded up to be divisible by + OS_FILE_LOG_BLOCK_SIZE, after + which we add + LOG_BLOCK_HDR_SIZE */ +#ifdef UNIV_HOTBACKUP +/******************************************************//** +Creates new log files after a backup has been restored. */ +UNIV_INTERN +void +recv_reset_log_files_for_backup( +/*============================*/ + const char* log_dir, /*!< in: log file directory path */ + ulint n_log_files, /*!< in: number of log files */ + lsn_t log_file_size, /*!< in: log file size */ + lsn_t lsn); /*!< in: new start lsn, must be + divisible by OS_FILE_LOG_BLOCK_SIZE */ +#endif /* UNIV_HOTBACKUP */ +/********************************************************//** +Creates the recovery system. */ +UNIV_INTERN +void +recv_sys_create(void); +/*=================*/ +/**********************************************************//** +Release recovery system mutexes. */ +UNIV_INTERN +void +recv_sys_close(void); +/*================*/ +/********************************************************//** +Frees the recovery system memory. */ +UNIV_INTERN +void +recv_sys_mem_free(void); +/*===================*/ +/********************************************************//** +Inits the recovery system for a recovery operation. 
*/ +UNIV_INTERN +void +recv_sys_init( +/*==========*/ + ulint available_memory); /*!< in: available memory in bytes */ +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Reset the state of the recovery system variables. */ +UNIV_INTERN +void +recv_sys_var_init(void); +/*===================*/ +#endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Empties the hash table of stored log records, applying them to appropriate +pages. */ +UNIV_INTERN +void +recv_apply_hashed_log_recs( +/*=======================*/ + ibool allow_ibuf); /*!< in: if TRUE, also ibuf operations are + allowed during the application; if FALSE, + no ibuf operations are allowed, and after + the application all file pages are flushed to + disk and invalidated in buffer pool: this + alternative means that no new log records + can be generated during the application */ +#ifdef UNIV_HOTBACKUP +/*******************************************************************//** +Applies log records in the hash table to a backup. */ +UNIV_INTERN +void +recv_apply_log_recs_for_backup(void); +/*================================*/ +#endif +#ifdef UNIV_LOG_ARCHIVE +/********************************************************//** +Recovers from archived log files, and also from log files, if they exist. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +recv_recovery_from_archive_start( +/*=============================*/ + lsn_t min_flushed_lsn,/*!< in: min flushed lsn field from the + data files */ + lsn_t limit_lsn, /*!< in: recover up to this lsn if + possible */ + lsn_t first_log_no); /*!< in: number of the first archived + log file to use in the recovery; the + file will be searched from + INNOBASE_LOG_ARCH_DIR specified in + server config file */ +/********************************************************//** +Completes recovery from archive. */ +UNIV_INTERN +void +recv_recovery_from_archive_finish(void); +/*===================================*/ +#endif /* UNIV_LOG_ARCHIVE */ +
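Taken together with recv_recovery_from_checkpoint_start/_finish and recv_recovery_rollback_active above, these declarations define the crash-recovery startup sequence. A hedged sketch of the order of calls for the common non-archive case; the function framing is illustrative, and LSN_MAX as the unused limit argument is an assumption:

/* Illustrative startup path: recover, open for business, then finish. */
dberr_t
start_with_crash_recovery(lsn_t min_flushed_lsn, lsn_t max_flushed_lsn)
{
	dberr_t	err;

	/* In non-UNIV_LOG_ARCHIVE builds the wrapper macro drops
	the first two arguments at compile time. */
	err = recv_recovery_from_checkpoint_start(
		LOG_CHECKPOINT, LSN_MAX,
		min_flushed_lsn, max_flushed_lsn);

	if (err != DB_SUCCESS) {
		return(err);
	}

	/* New user transactions may start from this point on. */

	recv_recovery_from_checkpoint_finish();
	recv_recovery_rollback_active();

	return(DB_SUCCESS);
}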
+/** Block of log record data */ +struct recv_data_t{ + recv_data_t* next; /*!< pointer to the next block or NULL */ + /*!< the log record data is stored physically + immediately after this struct, max amount + RECV_DATA_BLOCK_SIZE bytes of it */ +}; + +/** Stored log record struct */ +struct recv_t{ + byte type; /*!< log record type */ + ulint len; /*!< log record body length in bytes */ + recv_data_t* data; /*!< chain of blocks containing the log record + body */ + lsn_t start_lsn;/*!< start lsn of the log segment written by + the mtr which generated this log record: NOTE + that this is not necessarily the start lsn of + this log record */ + lsn_t end_lsn;/*!< end lsn of the log segment written by + the mtr which generated this log record: NOTE + that this is not necessarily the end lsn of + this log record */ + UT_LIST_NODE_T(recv_t) + rec_list;/*!< list of log records for this page */ +}; + +/** States of recv_addr_t */ +enum recv_addr_state { + /** not yet processed */ + RECV_NOT_PROCESSED, + /** page is being read */ + RECV_BEING_READ, + /** log records are being applied on the page */ + RECV_BEING_PROCESSED, + /** log records have been applied on the page, or they have + been discarded because the tablespace does not exist */ + RECV_PROCESSED +}; + +/** Hashed page file address struct */ +struct recv_addr_t{ + enum recv_addr_state state; + /*!< recovery state of the page */ + unsigned space:32;/*!< space id */ + unsigned page_no:32;/*!< page number */ + UT_LIST_BASE_NODE_T(recv_t) + rec_list;/*!< list of log records for this page */ + hash_node_t addr_hash;/*!< hash node in the hash bucket chain */ +}; + +struct recv_dblwr_t { + void add(byte* page); + + byte* find_page(ulint space_id, ulint page_no); + + std::list<byte *> pages; /* Pages from double write buffer */ + + void operator() () { + pages.clear(); + } +}; + +/** Recovery system data structure */ +struct recv_sys_t{ +#ifndef UNIV_HOTBACKUP + ib_mutex_t mutex; /*!< mutex protecting the fields apply_log_recs, + n_addrs, and the state field in each recv_addr + struct */ + ib_mutex_t writer_mutex;/*!< mutex coordinating + flushing between recv_writer_thread and + the recovery thread.
*/ +#endif /* !UNIV_HOTBACKUP */ + ibool apply_log_recs; + /*!< this is TRUE when log rec application to + pages is allowed; this flag tells the + i/o-handler if it should do log record + application */ + ibool apply_batch_on; + /*!< this is TRUE when a log rec application + batch is running */ + lsn_t lsn; /*!< log sequence number */ + ulint last_log_buf_size; + /*!< size of the log buffer when the database + last time wrote to the log */ + byte* last_block; + /*!< possible incomplete last recovered log + block */ + byte* last_block_buf_start; + /*!< the nonaligned start address of the + preceding buffer */ + byte* buf; /*!< buffer for parsing log records */ + ulint len; /*!< amount of data in buf */ + lsn_t parse_start_lsn; + /*!< this is the lsn from which we were able to + start parsing log records and adding them to + the hash table; zero if a suitable + start point not found yet */ + lsn_t scanned_lsn; + /*!< the log data has been scanned up to this + lsn */ + ulint scanned_checkpoint_no; + /*!< the log data has been scanned up to this + checkpoint number (lowest 4 bytes) */ + ulint recovered_offset; + /*!< start offset of non-parsed log records in + buf */ + lsn_t recovered_lsn; + /*!< the log records have been parsed up to + this lsn */ + lsn_t limit_lsn;/*!< recovery should be made at most + up to this lsn */ + ibool found_corrupt_log; + /*!< this is set to TRUE if we during log + scan find a corrupt log block, or a corrupt + log record, or there is a log parsing + buffer overflow */ +#ifdef UNIV_LOG_ARCHIVE + log_group_t* archive_group; + /*!< in archive recovery: the log group whose + archive is read */ +#endif /* UNIV_LOG_ARCHIVE */ + mem_heap_t* heap; /*!< memory heap of log records and file + addresses*/ + hash_table_t* addr_hash;/*!< hash table of file addresses of pages */ + ulint n_addrs;/*!< number of not processed hashed file + addresses in the hash table */ + + recv_dblwr_t dblwr; +}; + +/** The recovery system */ +extern recv_sys_t* recv_sys; + +/** TRUE when applying redo log records during crash recovery; FALSE +otherwise. Note that this is FALSE while a background thread is +rolling back incomplete transactions. */ +extern ibool recv_recovery_on; +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this becomes TRUE if +the log record hash table becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +TRUE means that recovery is running and no operations on the log files +are allowed yet: the variable name is misleading. */ +extern ibool recv_no_ibuf_operations; +/** TRUE when recv_init_crash_recovery() has been called. */ +extern ibool recv_needed_recovery; +#ifdef UNIV_DEBUG +/** TRUE if writing to the redo log (mtr_commit) is forbidden. +Protected by log_sys->mutex. */ +extern ibool recv_no_log_write; +#endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start_func().
*/ +extern ibool recv_lsn_checks_on; +#ifdef UNIV_HOTBACKUP +/** TRUE when the redo log is being backed up */ +extern ibool recv_is_making_a_backup; +#endif /* UNIV_HOTBACKUP */ +/** Maximum page number encountered in the redo log */ +extern ulint recv_max_parsed_page_no; + +/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many +times! */ +#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024) + +/** Size of block reads when the log groups are scanned forward to do a +roll-forward */ +#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE) + +/** This many frames must be left free in the buffer pool when we scan +the log and store the scanned log records in the buffer pool: we will +use these free frames to read in pages when we start applying the +log records to the database. */ +extern ulint recv_n_pool_free_frames; + +#ifndef UNIV_NONINL +#include "log0recv.ic" +#endif + +#endif diff --git a/storage/xtradb/include/log0recv.ic b/storage/xtradb/include/log0recv.ic new file mode 100644 index 00000000000..32c28dd03e6 --- /dev/null +++ b/storage/xtradb/include/log0recv.ic @@ -0,0 +1,53 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0recv.ic +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#include "univ.i" + +/*******************************************************************//** +Returns TRUE if recovery is currently running. +@return recv_recovery_on */ +UNIV_INLINE +ibool +recv_recovery_is_on(void) +/*=====================*/ +{ + return(recv_recovery_on); +} + +#ifdef UNIV_LOG_ARCHIVE +/** TRUE when applying redo log records from an archived log file */ +extern ibool recv_recovery_from_backup_on; + +/*******************************************************************//** +Returns TRUE if recovery from backup is currently running. +@return recv_recovery_from_backup_on */ +UNIV_INLINE +ibool +recv_recovery_from_backup_is_on(void) +/*=================================*/ +{ + return(recv_recovery_from_backup_on); +} +#endif /* UNIV_LOG_ARCHIVE */ diff --git a/storage/xtradb/include/mach0data.h b/storage/xtradb/include/mach0data.h new file mode 100644 index 00000000000..d0087f56aaa --- /dev/null +++ b/storage/xtradb/include/mach0data.h @@ -0,0 +1,418 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.h +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef mach0data_h +#define mach0data_h + +#ifndef UNIV_INNOCHECKSUM + +#include "univ.i" +#include "ut0byte.h" + +/* The data and all fields are always stored in a database file +in the same format: ascii, big-endian, ... . +All data in the files MUST be accessed using the functions in this +module. */ + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */ +/********************************************************//** +The following function is used to fetch data from one byte. +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +ulint +mach_read_from_1( +/*=============*/ + const byte* b) /*!< in: pointer to byte */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lower address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */ +/********************************************************//** +The following function is used to fetch data from two consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer, >= 0, < 64k */ +UNIV_INLINE +ulint +mach_read_from_2( +/*=============*/ + const byte* b) /*!< in: pointer to two bytes */ + __attribute__((nonnull, pure)); + +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ + __attribute__((const)); +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ + __attribute__((const)); +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. 
*/ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_3( +/*=============*/ + const byte* b) /*!< in: pointer to 3 bytes */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_4( +/*=============*/ + const byte* b) /*!< in: pointer to four bytes */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a ulint in a compressed form (1..5 bytes). +@return stored size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/*********************************************************//** +Returns the size of an ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer to be stored */ + __attribute__((const)); +/*********************************************************//** +Reads a ulint in a compressed form. +@return read integer */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t id); /*!< in: 48-bit integer */ +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. +@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n); /*!< in: 56-bit integer */ +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. 
+@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_compressed_size( +/*=========================*/ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_compressed( +/*=====================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_much_compressed_size( +/*==============================*/ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ + __attribute__((const)); +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Reads a ulint in a compressed form if the log record fully contains it. +@return pointer to end of the stored field, NULL if not complete */ +UNIV_INTERN +byte* +mach_parse_compressed( +/*==================*/ + byte* ptr, /*!< in: pointer to buffer from where to read */ + byte* end_ptr,/*!< in: pointer to end of the buffer */ + ulint* val); /*!< out: read value */ +/*********************************************************//** +Reads a 64-bit integer in a compressed form +if the log record fully contains it. 
+@return pointer to end of the stored field, NULL if not complete */ +UNIV_INLINE +byte* +mach_ull_parse_compressed( +/*======================*/ + byte* ptr, /*!< in: pointer to buffer from where to read */ + byte* end_ptr,/*!< in: pointer to end of the buffer */ + ib_uint64_t* val); /*!< out: read value */ +#ifndef UNIV_HOTBACKUP +/*********************************************************//** +Reads a double. It is stored in a little-endian format. +@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d); /*!< in: double */ +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d); /*!< in: float */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. +@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type); /*!< in: signed or unsigned flag */ +/***********************************************************//** +Convert integral type from host byte order to (big-endian) storage +byte order. 
*/ +UNIV_INLINE +void +mach_write_int_type( +/*================*/ + byte* dest, /*!< in: where to write*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + bool usign); /*!< in: signed or unsigned flag */ + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. */ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign); /*!< in: signed or unsigned flag */ + +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. +@return value read */ +UNIV_INLINE +ulint +mach_read_ulint( +/*============*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + +#endif /* !UNIV_HOTBACKUP */ +#endif /* !UNIV_INNOCHECKSUM */ + +#ifndef UNIV_NONINL +#include "mach0data.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mach0data.ic b/storage/xtradb/include/mach0data.ic new file mode 100644 index 00000000000..27b9f62b552 --- /dev/null +++ b/storage/xtradb/include/mach0data.ic @@ -0,0 +1,881 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.ic +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +#include "ut0mem.h" + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */ +{ + ut_ad(b); + ut_ad((n | 0xFFUL) <= 0xFFUL); + + b[0] = (byte) n; +} + +/********************************************************//** +The following function is used to fetch data from one byte. +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +ulint +mach_read_from_1( +/*=============*/ + const byte* b) /*!< in: pointer to byte */ +{ + ut_ad(b); + return((ulint)(b[0])); +} + +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lowest address. 
*/ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad(b); + ut_ad((n | 0xFFFFUL) <= 0xFFFFUL); + + b[0] = (byte)(n >> 8); + b[1] = (byte)(n); +} + +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ +{ + uint16 ret; + ut_ad(2 == sizeof ret); + mach_write_to_2((byte*) &ret, n); + return(ret); +} +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ +{ + ut_ad(2 == sizeof n); + return(mach_read_from_2((const byte*) &n)); +} + +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad(b); + ut_ad((n | 0xFFFFFFUL) <= 0xFFFFFFUL); + + b[0] = (byte)(n >> 16); + b[1] = (byte)(n >> 8); + b[2] = (byte)(n); +} + +/********************************************************//** +The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_3( +/*=============*/ + const byte* b) /*!< in: pointer to 3 bytes */ +{ + ut_ad(b); + return( ((ulint)(b[0]) << 16) + | ((ulint)(b[1]) << 8) + | (ulint)(b[2]) + ); +} + +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad(b); + + b[0] = (byte)(n >> 24); + b[1] = (byte)(n >> 16); + b[2] = (byte)(n >> 8); + b[3] = (byte) n; +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************//** +The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_2( +/*=============*/ + const byte* b) /*!< in: pointer to 2 bytes */ +{ + return(((ulint)(b[0]) << 8) | (ulint)(b[1])); +} + +/********************************************************//** +The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. 
+@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_4( +/*=============*/ + const byte* b) /*!< in: pointer to four bytes */ +{ + ut_ad(b); + return( ((ulint)(b[0]) << 24) + | ((ulint)(b[1]) << 16) + | ((ulint)(b[2]) << 8) + | (ulint)(b[3]) + ); +} + +#ifndef UNIV_INNOCHECKSUM + +/*********************************************************//** +Writes a ulint in a compressed form where the first byte codes the +length of the stored ulint. We look at the most significant bits of +the byte. If the most significant bit is zero, it means 1-byte storage, +else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0, +it means 3-byte storage, else if 4th is 0, it means 4-byte storage, +else the storage is 5-byte. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + ut_ad(b); + + if (n < 0x80UL) { + mach_write_to_1(b, n); + return(1); + } else if (n < 0x4000UL) { + mach_write_to_2(b, n | 0x8000UL); + return(2); + } else if (n < 0x200000UL) { + mach_write_to_3(b, n | 0xC00000UL); + return(3); + } else if (n < 0x10000000UL) { + mach_write_to_4(b, n | 0xE0000000UL); + return(4); + } else { + mach_write_to_1(b, 0xF0UL); + mach_write_to_4(b + 1, n); + return(5); + } +} + +/*********************************************************//** +Returns the size of a ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80UL) { + return(1); + } else if (n < 0x4000UL) { + return(2); + } else if (n < 0x200000UL) { + return(3); + } else if (n < 0x10000000UL) { + return(4); + } else { + return(5); + } +} + +/*********************************************************//** +Reads a ulint in a compressed form. +@return read integer (< 2^32) */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ulint flag; + + ut_ad(b); + + flag = mach_read_from_1(b); + + if (flag < 0x80UL) { + return(flag); + } else if (flag < 0xC0UL) { + return(mach_read_from_2(b) & 0x7FFFUL); + } else if (flag < 0xE0UL) { + return(mach_read_from_3(b) & 0x3FFFFFUL); + } else if (flag < 0xF0UL) { + return(mach_read_from_4(b) & 0x1FFFFFFFUL); + } else { + ut_ad(flag == 0xF0UL); + return(mach_read_from_4(b + 1)); + } +} +
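To make the tag bits concrete: a value below 0x80 is stored as itself in one byte; 0x1234 falls in the 2-byte band, so 0x1234 | 0x8000 = 0x9234 is stored as the bytes 92 34, and the reader masks the tag bits off again. A standalone plain-C re-implementation sketch for checking the round trip, independent of the InnoDB typedefs and for illustration only:

#include <assert.h>
#include <stdint.h>

/* Mirror of the mach_write_compressed() bands above; enc() returns
the stored size in bytes. */
static unsigned
enc(uint8_t* b, uint32_t n)
{
	if (n < 0x80) {			/* 0xxxxxxx */
		b[0] = (uint8_t) n;
		return 1;
	} else if (n < 0x4000) {	/* 10xxxxxx xxxxxxxx */
		b[0] = (uint8_t) ((n >> 8) | 0x80);
		b[1] = (uint8_t) n;
		return 2;
	} else if (n < 0x200000) {	/* 110xxxxx + 2 more bytes */
		b[0] = (uint8_t) ((n >> 16) | 0xC0);
		b[1] = (uint8_t) (n >> 8);
		b[2] = (uint8_t) n;
		return 3;
	} else if (n < 0x10000000) {	/* 1110xxxx + 3 more bytes */
		b[0] = (uint8_t) ((n >> 24) | 0xE0);
		b[1] = (uint8_t) (n >> 16);
		b[2] = (uint8_t) (n >> 8);
		b[3] = (uint8_t) n;
		return 4;
	} else {			/* 11110000 + full 4 bytes */
		b[0] = 0xF0;
		b[1] = (uint8_t) (n >> 24);
		b[2] = (uint8_t) (n >> 16);
		b[3] = (uint8_t) (n >> 8);
		b[4] = (uint8_t) n;
		return 5;
	}
}

int
main(void)
{
	uint8_t buf[5];

	assert(enc(buf, 0x7F) == 1 && buf[0] == 0x7F);
	assert(enc(buf, 0x1234) == 2 && buf[0] == 0x92 && buf[1] == 0x34);
	assert(enc(buf, 0x12345678) == 5 && buf[0] == 0xF0);
	return 0;
}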
+/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ut_ad(b); + + mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32)); + mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ +{ + ib_uint64_t ull; + + ull = ((ib_uint64_t) mach_read_from_4(b)) << 32; + ull |= (ib_uint64_t) mach_read_from_4(b + 4); + + return(ull); +} + +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n) /*!< in: 56-bit integer */ +{ + ut_ad(b); + + mach_write_to_3(b, (ulint) (n >> 32)); + mach_write_to_4(b + 3, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ +{ + ut_ad(b); + + return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3))); +} + +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t n) /*!< in: 48-bit integer */ +{ + ut_ad(b); + + mach_write_to_2(b, (ulint) (n >> 32)); + mach_write_to_4(b + 2, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. +@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ +{ + ut_ad(b); + + return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2))); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size; + + ut_ad(b); + + size = mach_write_compressed(b, (ulint) (n >> 32)); + mach_write_to_4(b + size, (ulint) n); + + return(size + 4); +} + +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_compressed_size( +/*=========================*/ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + return(4 + mach_get_compressed_size((ulint) (n >> 32))); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_compressed( +/*=====================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ib_uint64_t n; + ulint size; + + ut_ad(b); + + n = (ib_uint64_t) mach_read_compressed(b); + + size = mach_get_compressed_size((ulint) n); + + n <<= 32; + n |= (ib_uint64_t) mach_read_from_4(b + size); + + return(n); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size; + + ut_ad(b); + + if (!(n >> 32)) { + return(mach_write_compressed(b, (ulint) n)); + } + + *b = (byte)0xFF; + size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32)); + + size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF); + + return(size); +} + +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_much_compressed_size( +/*==============================*/ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + if (!(n >> 32)) { + return(mach_get_compressed_size((ulint) n)); + } + + return(1 + mach_get_compressed_size((ulint) (n >> 32)) + + mach_get_compressed_size((ulint) n & ULINT32_MASK)); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ib_uint64_t n; + ulint size; + + ut_ad(b); + + if (*b != (byte)0xFF) { + n = 0; + size = 0; + } else { + n = (ib_uint64_t) mach_read_compressed(b + 1); + + size = 1 + mach_get_compressed_size((ulint) n); + n <<= 32; + } + + n |= mach_read_compressed(b + size); + + return(n); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form +if the log record fully contains it. +@return pointer to end of the stored field, NULL if not complete */ +UNIV_INLINE +byte* +mach_ull_parse_compressed( +/*======================*/ + byte* ptr, /* in: pointer to buffer from where to read */ + byte* end_ptr,/* in: pointer to end of the buffer */ + ib_uint64_t* val) /* out: read value */ +{ + ulint size; + + ut_ad(ptr); + ut_ad(end_ptr); + ut_ad(val); + + if (end_ptr < ptr + 5) { + + return(NULL); + } + + *val = mach_read_compressed(ptr); + + size = mach_get_compressed_size((ulint) *val); + + ptr += size; + + if (end_ptr < ptr + 4) { + + return(NULL); + } + + *val <<= 32; + *val |= mach_read_from_4(ptr); + + return(ptr + 4); +} +#ifndef UNIV_HOTBACKUP +/*********************************************************//** +Reads a double. It is stored in a little-endian format. +@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + double d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(double) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d) /*!< in: double */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(double) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a float. It is stored in a little-endian format. 
+@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + float d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(float) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d) /*!< in: float */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(float) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ +{ + ulint n = 0; + const byte* ptr; + + ut_ad(buf_size > 0); + + ptr = buf + buf_size; + + for (;;) { + ptr--; + + n = n << 8; + + n += (ulint)(*ptr); + + if (ptr == buf) { + break; + } + } + + return(n); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + byte* end; + + ut_ad(dest_size <= sizeof(ulint)); + ut_ad(dest_size > 0); + + end = dest + dest_size; + + for (;;) { + *dest = (byte)(n & 0xFF); + + n = n >> 8; + + dest++; + + if (dest == end) { + break; + } + } + + ut_ad(n == 0); +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ +{ + return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8)); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + ut_ad(n < 256 * 256); + + *dest = (byte)(n & 0xFFUL); + + n = n >> 8; + dest++; + + *dest = (byte)(n & 0xFFUL); +} + +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. 
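
The two little-endian helpers above are exact mirrors: the writer emits the low byte first and asserts that the value fit in the given width, while the reader accumulates starting from the most significant byte. The same round trip as standalone C with <stdint.h> types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void
write_n_little_endian(unsigned char* dest, size_t n_bytes, uintmax_t n)
{
    while (n_bytes-- > 0) {
        *dest++ = (unsigned char)(n & 0xFF);    /* low byte first */
        n >>= 8;
    }
    assert(n == 0);    /* the value must fit, as ut_ad(n == 0) checks above */
}

static uintmax_t
read_n_little_endian(const unsigned char* buf, size_t n_bytes)
{
    uintmax_t n = 0;

    while (n_bytes-- > 0) {
        n = (n << 8) | buf[n_bytes];    /* start from the high end */
    }
    return n;
}
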
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ /* XXX this can be optimized on big-endian machines */
+
+ ullint ret;
+ uint i;
+
+ if (unsigned_type || (src[0] & 0x80)) {
+
+ ret = 0x0000000000000000ULL;
+ } else {
+
+ ret = 0xFFFFFFFFFFFFFF00ULL;
+ }
+
+ if (unsigned_type) {
+
+ ret |= src[0];
+ } else {
+
+ ret |= src[0] ^ 0x80;
+ }
+
+ for (i = 1; i < len; i++) {
+ ret <<= 8;
+ ret |= src[i];
+ }
+
+ return(ret);
+}
+/*********************************************************//**
+Swap byte ordering. */
+UNIV_INLINE
+void
+mach_swap_byte_order(
+/*=================*/
+ byte* dest, /*!< out: where to write */
+ const byte* from, /*!< in: where to read from */
+ ulint len) /*!< in: length of src */
+{
+ ut_ad(len > 0);
+ ut_ad(len <= 8);
+
+ dest += len;
+
+ switch (len & 0x7) {
+ case 0: *--dest = *from++;
+ case 7: *--dest = *from++;
+ case 6: *--dest = *from++;
+ case 5: *--dest = *from++;
+ case 4: *--dest = *from++;
+ case 3: *--dest = *from++;
+ case 2: *--dest = *from++;
+ case 1: *--dest = *from;
+ }
+}
+
+/*************************************************************
+Convert integral type from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_int_type(
+/*================*/
+ byte* dest, /*!< in: where to write */
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ bool usign) /*!< in: signed or unsigned flag */
+{
+#ifdef WORDS_BIGENDIAN
+ memcpy(dest, src, len);
+#else
+ mach_swap_byte_order(dest, src, len);
+#endif /* WORDS_BIGENDIAN */
+
+ if (!usign) {
+ *dest ^= 0x80;
+ }
+}
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+ byte* dest, /*!< in: where to write */
+ ulonglong src, /*!< in: where to read from */
+ ulint len, /*!< in: length of dest */
+ bool usign) /*!< in: signed or unsigned flag */
+{
+ byte* ptr = reinterpret_cast<byte*>(&src);
+
+ ut_ad(len <= sizeof(ulonglong));
+
+#ifdef WORDS_BIGENDIAN
+ memcpy(dest, ptr + (sizeof(src) - len), len);
+#else
+ mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len);
+#endif /* WORDS_BIGENDIAN */
+
+ if (!usign) {
+ *dest ^= 0x80;
+ }
+}
+
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INLINE
+ulint
+mach_read_ulint(
+/*============*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ ulint type) /*!< in: 1,2 or 4 bytes */
+{
+ switch (type) {
+ case 1:
+ return(mach_read_from_1(ptr));
+ case 2:
+ return(mach_read_from_2(ptr));
+ case 4:
+ return(mach_read_from_4(ptr));
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+#endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/xtradb/include/mem0dbg.h b/storage/xtradb/include/mem0dbg.h
new file mode 100644
index 00000000000..cc339b82910
--- /dev/null
+++ b/storage/xtradb/include/mem0dbg.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
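
The `^ 0x80` in mach_read_int_type() and mach_write_int_type() above flips the sign bit so that signed values, once stored big-endian, sort correctly under plain byte-wise comparison. A standalone sketch of the trick for a 32-bit value (names are illustrative):

#include <stdint.h>

static void
store_int32_sortable(unsigned char dest[4], int32_t v)
{
    uint32_t u = (uint32_t) v ^ 0x80000000u;    /* flip the sign bit */

    dest[0] = (unsigned char)(u >> 24);         /* big-endian byte order */
    dest[1] = (unsigned char)(u >> 16);
    dest[2] = (unsigned char)(u >> 8);
    dest[3] = (unsigned char) u;
}

static int32_t
load_int32_sortable(const unsigned char src[4])
{
    uint32_t u = ((uint32_t) src[0] << 24) | ((uint32_t) src[1] << 16)
               | ((uint32_t) src[2] << 8)  |  (uint32_t) src[3];

    return (int32_t)(u ^ 0x80000000u);          /* undo the flip */
}

After the flip, -1 is stored as 0x7FFFFFFF and 0 as 0x80000000, so an unsigned byte comparison of the stored forms matches signed numeric order.
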
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0dbg.h +The memory management: the debug code. This is not a compilation module, +but is included in mem0mem.* ! + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +/* In the debug version each allocated field is surrounded with +check fields whose sizes are given below */ + +#ifdef UNIV_MEM_DEBUG +# ifndef UNIV_HOTBACKUP +/* The mutex which protects in the debug version the hash table +containing the list of live memory heaps, and also the global +variables in mem0dbg.cc. */ +extern ib_mutex_t mem_hash_mutex; +# endif /* !UNIV_HOTBACKUP */ + +#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\ + UNIV_MEM_ALIGNMENT) +#define MEM_FIELD_TRAILER_SIZE sizeof(ulint) +#else +#define MEM_FIELD_HEADER_SIZE 0 +#endif + + +/* Space needed when allocating for a user a field of +length N. The space is allocated only in multiples of +UNIV_MEM_ALIGNMENT. In the debug version there are also +check fields at the both ends of the field. */ +#ifdef UNIV_MEM_DEBUG +#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\ + + MEM_FIELD_TRAILER_SIZE, UNIV_MEM_ALIGNMENT) +#else +#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT) +#endif + +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG +/***************************************************************//** +Checks a memory heap for consistency and prints the contents if requested. +Outputs the sum of sizes of buffers given to the user (only in +the debug version), the physical size of the heap and the number of +blocks in the heap. In case of error returns 0 as sizes and number +of blocks. */ +UNIV_INTERN +void +mem_heap_validate_or_print( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap */ + byte* top, /*!< in: calculate and validate only until + this top pointer in the heap is reached, + if this pointer is NULL, ignored */ + ibool print, /*!< in: if TRUE, prints the contents + of the heap; works only in + the debug version */ + ibool* error, /*!< out: TRUE if error */ + ulint* us_size,/*!< out: allocated memory + (for the user) in the heap, + if a NULL pointer is passed as this + argument, it is ignored; in the + non-debug version this is always -1 */ + ulint* ph_size,/*!< out: physical size of the heap, + if a NULL pointer is passed as this + argument, it is ignored */ + ulint* n_blocks); /*!< out: number of blocks in the heap, + if a NULL pointer is passed as this + argument, it is ignored */ +/**************************************************************//** +Validates the contents of a memory heap. 
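
MEM_SPACE_NEEDED() above rounds every request up to a multiple of UNIV_MEM_ALIGNMENT through ut_calc_align(), which is defined elsewhere. A sketch of the round-up idiom it is assumed to implement (align must be a power of two):

#include <stddef.h>

/* Round n up to the next multiple of align; assumed semantics of
   ut_calc_align(), not copied from its definition. */
static size_t
calc_align(size_t n, size_t align)
{
    return (n + align - 1) & ~(align - 1);
}

/* e.g. with an 8-byte alignment, a 5-byte request becomes
   calc_align(5, 8) == 8; in a UNIV_MEM_DEBUG build the header and
   trailer check-field sizes are added before rounding. */
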
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it)
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+TRUE if no memory is currently allocated.
+@return TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void);
+/*===============*/
+/*****************************************************************//**
+Validates the dynamic memory
+@return TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void);
+/*=========================*/
+/************************************************************//**
+Validates the dynamic memory
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void);
+/*===============*/
+#endif /* UNIV_MEM_DEBUG */
+/************************************************************//**
+Tries to find neighboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+ void* ptr); /*!< in: pointer to place of possible corruption */
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers. Can only be used in the debug version. */
+UNIV_INTERN
+void
+mem_print_info(void);
+/*================*/
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or ..._print_new_info. */
+UNIV_INTERN
+void
+mem_print_new_info(void);
+/*====================*/
diff --git a/storage/xtradb/include/mem0dbg.ic b/storage/xtradb/include/mem0dbg.ic
new file mode 100644
index 00000000000..ec60ed35337
--- /dev/null
+++ b/storage/xtradb/include/mem0dbg.ic
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0dbg.ic
+The memory management: the debug code. This is not an independent
+compilation module but is included in mem0mem.*.
+ +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef UNIV_MEM_DEBUG +extern ulint mem_current_allocated_memory; + +/******************************************************************//** +Initializes an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_init( +/*===========*/ + byte* buf, /*!< in: memory field */ + ulint n); /*!< in: how many bytes the user requested */ +/******************************************************************//** +Erases an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_erase( +/*============*/ + byte* buf, /*!< in: memory field */ + ulint n); /*!< in: how many bytes the user requested */ +/***************************************************************//** +Initializes a buffer to a random combination of hex BA and BE. +Used to initialize allocated memory. */ +UNIV_INTERN +void +mem_init_buf( +/*=========*/ + byte* buf, /*!< in: pointer to buffer */ + ulint n); /*!< in: length of buffer */ +/***************************************************************//** +Initializes a buffer to a random combination of hex DE and AD. +Used to erase freed memory. */ +UNIV_INTERN +void +mem_erase_buf( +/*==========*/ + byte* buf, /*!< in: pointer to buffer */ + ulint n); /*!< in: length of buffer */ +/***************************************************************//** +Inserts a created memory heap to the hash table of +current allocated memory heaps. +Initializes the hash table when first called. */ +UNIV_INTERN +void +mem_hash_insert( +/*============*/ + mem_heap_t* heap, /*!< in: the created heap */ + const char* file_name, /*!< in: file name of creation */ + ulint line); /*!< in: line where created */ +/***************************************************************//** +Removes a memory heap (which is going to be freed by the caller) +from the list of live memory heaps. Returns the size of the heap +in terms of how much memory in bytes was allocated for the user of +the heap (not the total space occupied by the heap). +Also validates the heap. +NOTE: This function does not free the storage occupied by the +heap itself, only the node in the list of heaps. */ +UNIV_INTERN +void +mem_hash_remove( +/*============*/ + mem_heap_t* heap, /*!< in: the heap to be freed */ + const char* file_name, /*!< in: file name of freeing */ + ulint line); /*!< in: line where freed */ + + +void +mem_field_header_set_len(byte* field, ulint len); + +ulint +mem_field_header_get_len(byte* field); + +void +mem_field_header_set_check(byte* field, ulint check); + +ulint +mem_field_header_get_check(byte* field); + +void +mem_field_trailer_set_check(byte* field, ulint check); + +ulint +mem_field_trailer_get_check(byte* field); +#endif /* UNIV_MEM_DEBUG */ diff --git a/storage/xtradb/include/mem0mem.h b/storage/xtradb/include/mem0mem.h new file mode 100644 index 00000000000..f30034f3074 --- /dev/null +++ b/storage/xtradb/include/mem0mem.h @@ -0,0 +1,425 @@ +/***************************************************************************** + +Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0mem.h +The memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0mem_h +#define mem0mem_h + +#include "univ.i" +#include "ut0mem.h" +#include "ut0byte.h" +#include "ut0rnd.h" +#ifndef UNIV_HOTBACKUP +# include "sync0sync.h" +#endif /* UNIV_HOTBACKUP */ +#include "ut0lst.h" +#include "mach0data.h" + +/* -------------------- MEMORY HEAPS ----------------------------- */ + +/* A block of a memory heap consists of the info structure +followed by an area of memory */ +typedef struct mem_block_info_t mem_block_t; + +/* A memory heap is a nonempty linear list of memory blocks */ +typedef mem_block_t mem_heap_t; + +/* Types of allocation for memory heaps: DYNAMIC means allocation from the +dynamic memory pool of the C compiler, BUFFER means allocation from the +buffer pool; the latter method is used for very big heaps */ + +#define MEM_HEAP_DYNAMIC 0 /* the most common type */ +#define MEM_HEAP_BUFFER 1 +#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be + ORed to MEM_HEAP_BUFFER, in which + case heap->free_block is used in + some cases for memory allocations, + and if it's NULL, the memory + allocation functions can return + NULL. */ + +/* Different type of heaps in terms of which datastructure is using them */ +#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER) +#define MEM_HEAP_FOR_PAGE_HASH (MEM_HEAP_DYNAMIC) +#define MEM_HEAP_FOR_RECV_SYS (MEM_HEAP_BUFFER) +#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER) + +/* The following start size is used for the first block in the memory heap if +the size is not specified, i.e., 0 is given as the parameter in the call of +create. The standard size is the maximum (payload) size of the blocks used for +allocations of small buffers. */ + +#define MEM_BLOCK_START_SIZE 64 +#define MEM_BLOCK_STANDARD_SIZE \ + (UNIV_PAGE_SIZE >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF) + +/* If a memory heap is allowed to grow into the buffer pool, the following +is the maximum size for a single allocated buffer: */ +#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200) + +/******************************************************************//** +Initializes the memory system. */ +UNIV_INTERN +void +mem_init( +/*=====*/ + ulint size); /*!< in: common pool size in bytes */ +/******************************************************************//** +Closes the memory system. */ +UNIV_INTERN +void +mem_close(void); +/*===========*/ + +#ifdef UNIV_DEBUG +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +# define mem_heap_create(N) mem_heap_create_func( \ + (N), __FILE__, __LINE__, MEM_HEAP_DYNAMIC) +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. 
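
The macro/function pair above captures the caller's source location only in debug builds: the macro injects __FILE__ and __LINE__, and the release signature drops the parameters entirely, so production builds pay nothing. A reduced standalone sketch of the pattern (MY_DEBUG and the my_heap_* names are illustrative):

#include <stdio.h>
#include <stdlib.h>

static void*
my_heap_create_func(size_t n
#ifdef MY_DEBUG
                    , const char* file_name, unsigned line
#endif
                    )
{
#ifdef MY_DEBUG
    fprintf(stderr, "heap of %zu bytes created at %s:%u\n",
            n, file_name, line);    /* bookkeeping only in debug builds */
#endif
    return malloc(n);               /* stand-in for real block creation */
}

#ifdef MY_DEBUG
# define my_heap_create(N) my_heap_create_func((N), __FILE__, __LINE__)
#else
# define my_heap_create(N) my_heap_create_func(N)
#endif
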
*/ + +# define mem_heap_create_typed(N, T) mem_heap_create_func( \ + (N), __FILE__, __LINE__, (T)) + +#else /* UNIV_DEBUG */ +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +# define mem_heap_create(N) mem_heap_create_func( \ + (N), MEM_HEAP_DYNAMIC) +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +# define mem_heap_create_typed(N, T) mem_heap_create_func( \ + (N), (T)) + +#endif /* UNIV_DEBUG */ +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap freeing. */ + +#define mem_heap_free(heap) mem_heap_free_func(\ + (heap), __FILE__, __LINE__) +/*****************************************************************//** +NOTE: Use the corresponding macros instead of this function. Creates a +memory heap. For debugging purposes, takes also the file name and line as +arguments. +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( +/*=================*/ + ulint n, /*!< in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type); /*!< in: heap type */ +/*****************************************************************//** +NOTE: Use the corresponding macro instead of this function. Frees the space +occupied by a memory heap. In the debug version erases the heap memory +blocks. */ +UNIV_INLINE +void +mem_heap_free_func( +/*===============*/ + mem_heap_t* heap, /*!< in, own: heap to be freed */ + const char* file_name, /*!< in: file name where freed */ + ulint line); /*!< in: line where freed */ +/***************************************************************//** +Allocates and zero-fills n bytes of memory from a memory heap. +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +/***************************************************************//** +Allocates n bytes of memory from a memory heap. +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( +/*===========*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +/*****************************************************************//** +Returns a pointer to the heap top. +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( +/*==================*/ + mem_heap_t* heap); /*!< in: memory heap */ +/*****************************************************************//** +Frees the space in a memory heap exceeding the pointer given. The +pointer must have been acquired from mem_heap_get_heap_top. The first +memory block of the heap is not freed. 
*/ +UNIV_INLINE +void +mem_heap_free_heap_top( +/*===================*/ + mem_heap_t* heap, /*!< in: heap from which to free */ + byte* old_top);/*!< in: pointer to old top of heap */ +/*****************************************************************//** +Empties a memory heap. The first memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_empty( +/*===========*/ + mem_heap_t* heap); /*!< in: heap to empty */ +/*****************************************************************//** +Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: size of the topmost element */ +/*****************************************************************//** +Frees the topmost element in a memory heap. +The size of the element must be given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: size of the topmost element */ +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap); /*!< in: heap */ +/**************************************************************//** +Use this macro instead of the corresponding function! +Macro for memory buffer allocation */ + +#define mem_zalloc(N) memset(mem_alloc(N), 0, (N)) + +#ifdef UNIV_DEBUG +#define mem_alloc(N) mem_alloc_func((N), __FILE__, __LINE__, NULL) +#define mem_alloc2(N,S) mem_alloc_func((N), __FILE__, __LINE__, (S)) +#else /* UNIV_DEBUG */ +#define mem_alloc(N) mem_alloc_func((N), NULL) +#define mem_alloc2(N,S) mem_alloc_func((N), (S)) +#endif /* UNIV_DEBUG */ + +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. +Allocates a single buffer of memory from the dynamic memory of +the C compiler. Is like malloc of C. The buffer must be freed +with mem_free. +@return own: free storage */ +UNIV_INLINE +void* +mem_alloc_func( +/*===========*/ + ulint n, /*!< in: requested size in bytes */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint* size); /*!< out: allocated size in bytes, + or NULL */ + +/**************************************************************//** +Use this macro instead of the corresponding function! +Macro for memory buffer freeing */ + +#define mem_free(PTR) mem_free_func((PTR), __FILE__, __LINE__) +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. +Frees a single buffer of storage from +the dynamic memory of C compiler. Similar to free of C. */ +UNIV_INLINE +void +mem_free_func( +/*==========*/ + void* ptr, /*!< in, own: buffer to be freed */ + const char* file_name, /*!< in: file name where created */ + ulint line); /*!< in: line where created */ + +/**********************************************************************//** +Duplicates a NUL-terminated string. 
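
mem_heap_get_heap_top() and mem_heap_free_heap_top(), declared above, give a heap stack-like semantics: remember a watermark, allocate freely, then release everything past the watermark in one call. A miniature standalone region allocator showing the same mark/release discipline (single fixed block, no growth; a sketch, not the InnoDB implementation):

#include <stddef.h>

typedef struct {
    unsigned char buf[4096];
    size_t        free;             /* offset of the first free byte */
} tiny_heap_t;

static void*
tiny_alloc(tiny_heap_t* h, size_t n)
{
    void* p;

    n = (n + 7) & ~(size_t) 7;      /* keep 8-byte alignment */
    if (h->free + n > sizeof(h->buf)) {
        return NULL;                /* this sketch has no growth path */
    }
    p = h->buf + h->free;
    h->free += n;
    return p;
}

/* The watermark pair, mirroring mem_heap_get_heap_top() and
   mem_heap_free_heap_top(): */
static size_t tiny_get_top(const tiny_heap_t* h)         { return h->free; }
static void   tiny_free_to_top(tiny_heap_t* h, size_t t) { h->free = t; }
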
+@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str); /*!< in: string to be copied */ +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len); /*!< in: length of str, in bytes */ + +/**********************************************************************//** +Duplicates a NUL-terminated string, allocated from a memory heap. +@return own: a copy of the string */ +UNIV_INTERN +char* +mem_heap_strdup( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* str); /*!< in: string to be copied */ +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string, +allocated from a memory heap. +@return own: a copy of the string */ +UNIV_INLINE +char* +mem_heap_strdupl( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* str, /*!< in: string to be copied */ + ulint len); /*!< in: length of str, in bytes */ + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +UNIV_INTERN +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2); /*!< in: string 2 */ + +/**********************************************************************//** +Duplicate a block of data, allocated from a memory heap. +@return own: a copy of the data */ +UNIV_INTERN +void* +mem_heap_dup( +/*=========*/ + mem_heap_t* heap, /*!< in: memory heap where copy is allocated */ + const void* data, /*!< in: data to be copied */ + ulint len); /*!< in: length of data, in bytes */ + +/****************************************************************//** +A simple sprintf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). +@return heap-allocated formatted string */ +UNIV_INTERN +char* +mem_heap_printf( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* format, /*!< in: format string */ + ...) __attribute__ ((format (printf, 2, 3))); + +#ifdef MEM_PERIODIC_CHECK +/******************************************************************//** +Goes through the list of all allocated mem blocks, checks their magic +numbers, and reports possible corruption. 
*/ +UNIV_INTERN +void +mem_validate_all_blocks(void); +/*=========================*/ +#endif + +/*#######################################################################*/ + +/** The info structure stored at the beginning of a heap block */ +struct mem_block_info_t { + ulint magic_n;/* magic number for debugging */ +#ifdef UNIV_DEBUG + char file_name[8];/* file name where the mem heap was created */ + ulint line; /*!< line number where the mem heap was created */ +#endif /* UNIV_DEBUG */ + UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the + the list this is the base node of the list of blocks; + in subsequent blocks this is undefined */ + UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next + and prev in the list. The first block allocated + to the heap is also the first block in this list, + though it also contains the base node of the list. */ + ulint len; /*!< physical length of this block in bytes */ + ulint total_size; /*!< physical length in bytes of all blocks + in the heap. This is defined only in the base + node and is set to ULINT_UNDEFINED in others. */ + ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or + MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */ + ulint free; /*!< offset in bytes of the first free position for + user data in the block */ + ulint start; /*!< the value of the struct field 'free' at the + creation of the block */ +#ifndef UNIV_HOTBACKUP + void* free_block; + /* if the MEM_HEAP_BTR_SEARCH bit is set in type, + and this is the heap root, this can contain an + allocated buffer frame, which can be appended as a + free block to the heap, if we need more space; + otherwise, this is NULL */ + void* buf_block; + /* if this block has been allocated from the buffer + pool, this contains the buf_block_t handle; + otherwise, this is NULL */ +#endif /* !UNIV_HOTBACKUP */ +#ifdef MEM_PERIODIC_CHECK + UT_LIST_NODE_T(mem_block_t) mem_block_list; + /* List of all mem blocks allocated; protected + by the mem_comm_pool mutex */ +#endif +}; + +#define MEM_BLOCK_MAGIC_N 764741555 +#define MEM_FREED_BLOCK_MAGIC_N 547711122 + +/* Header size for a memory heap block */ +#define MEM_BLOCK_HEADER_SIZE ut_calc_align(sizeof(mem_block_info_t),\ + UNIV_MEM_ALIGNMENT) +#include "mem0dbg.h" + +#ifndef UNIV_NONINL +#include "mem0mem.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mem0mem.ic b/storage/xtradb/include/mem0mem.ic new file mode 100644 index 00000000000..0d983d69e1a --- /dev/null +++ b/storage/xtradb/include/mem0mem.ic @@ -0,0 +1,649 @@ +/***************************************************************************** + +Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/mem0mem.ic +The memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0dbg.ic" +#ifndef UNIV_HOTBACKUP +# include "mem0pool.h" +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, file_name, line, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC) +#else /* UNIV_DEBUG */ +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, MEM_HEAP_DYNAMIC) +#endif /* UNIV_DEBUG */ +/***************************************************************//** +Creates a memory heap block where data can be allocated. +@return own: memory heap block, NULL if did not succeed (only possible +for MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INTERN +mem_block_t* +mem_heap_create_block_func( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap or NULL if first block + should be created */ + ulint n, /*!< in: number of bytes needed for user data */ +#ifdef UNIV_DEBUG + const char* file_name,/*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type); /*!< in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ +/******************************************************************//** +Frees a block from a memory heap. */ +UNIV_INTERN +void +mem_heap_block_free( +/*================*/ + mem_heap_t* heap, /*!< in: heap */ + mem_block_t* block); /*!< in: block to free */ +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Frees the free_block field from a memory heap. */ +UNIV_INTERN +void +mem_heap_free_block_free( +/*=====================*/ + mem_heap_t* heap); /*!< in: heap */ +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Adds a new block to a memory heap. 
+@return created block, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INTERN +mem_block_t* +mem_heap_add_block( +/*===============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: number of bytes user needs */ + +UNIV_INLINE +void +mem_block_set_len(mem_block_t* block, ulint len) +{ + ut_ad(len > 0); + + block->len = len; +} + +UNIV_INLINE +ulint +mem_block_get_len(mem_block_t* block) +{ + return(block->len); +} + +UNIV_INLINE +void +mem_block_set_type(mem_block_t* block, ulint type) +{ + ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER) + || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH)); + + block->type = type; +} + +UNIV_INLINE +ulint +mem_block_get_type(mem_block_t* block) +{ + return(block->type); +} + +UNIV_INLINE +void +mem_block_set_free(mem_block_t* block, ulint free) +{ + ut_ad(free > 0); + ut_ad(free <= mem_block_get_len(block)); + + block->free = free; +} + +UNIV_INLINE +ulint +mem_block_get_free(mem_block_t* block) +{ + return(block->free); +} + +UNIV_INLINE +void +mem_block_set_start(mem_block_t* block, ulint start) +{ + ut_ad(start > 0); + + block->start = start; +} + +UNIV_INLINE +ulint +mem_block_get_start(mem_block_t* block) +{ + return(block->start); +} + +/***************************************************************//** +Allocates and zero-fills n bytes of memory from a memory heap. +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +{ + ut_ad(heap); + ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH)); + return(memset(mem_heap_alloc(heap, n), 0, n)); +} + +/***************************************************************//** +Allocates n bytes of memory from a memory heap. +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( +/*===========*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +{ + mem_block_t* block; + void* buf; + ulint free; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF)); + + /* Check if there is enough space in block. If not, create a new + block to the heap */ + + if (mem_block_get_len(block) + < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) { + + block = mem_heap_add_block(heap, n); + + if (block == NULL) { + + return(NULL); + } + } + + free = mem_block_get_free(block); + + buf = (byte*) block + free; + + mem_block_set_free(block, free + MEM_SPACE_NEEDED(n)); + +#ifdef UNIV_MEM_DEBUG + UNIV_MEM_ALLOC(buf, + n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE); + + /* In the debug version write debugging info to the field */ + mem_field_init((byte*) buf, n); + + /* Advance buf to point at the storage which will be given to the + caller */ + buf = (byte*) buf + MEM_FIELD_HEADER_SIZE; + +#endif + UNIV_MEM_ALLOC(buf, n); + return(buf); +} + +/*****************************************************************//** +Returns a pointer to the heap top. 
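
mem_heap_alloc() above is a bump allocator over a linked list of blocks: it takes the last block and, when free + MEM_SPACE_NEEDED(n) would overrun the block length, links in a new block first. A standalone approximation of that growth path, with malloc() standing in for mem_heap_create_block() and an illustrative 8-byte alignment and 4096-byte default block:

#include <stddef.h>
#include <stdlib.h>

typedef struct blk {
    struct blk* prev;               /* previously filled block, if any */
    size_t      len;                /* total size of this block */
    size_t      free;               /* offset of the first free byte */
} blk_t;

static blk_t*
blk_new(blk_t* prev, size_t payload)
{
    blk_t* b = (blk_t*) malloc(sizeof(blk_t) + payload);

    if (b != NULL) {
        b->prev = prev;
        b->len  = sizeof(blk_t) + payload;
        b->free = sizeof(blk_t);    /* user data starts after the header */
    }
    return b;
}

static void*
heap_alloc(blk_t** last, size_t n)
{
    void* p;

    n = (n + 7) & ~(size_t) 7;
    if ((*last)->free + n > (*last)->len) {
        /* Not enough room in the last block: link in a new one,
           as mem_heap_add_block() does above. */
        blk_t* b = blk_new(*last, n > 4096 ? n : 4096);
        if (b == NULL) {
            return NULL;
        }
        *last = b;
    }
    p = (char*) *last + (*last)->free;
    (*last)->free += n;
    return p;
}
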
+@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( +/*==================*/ + mem_heap_t* heap) /*!< in: memory heap */ +{ + mem_block_t* block; + byte* buf; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block); + + return(buf); +} + +/*****************************************************************//** +Frees the space in a memory heap exceeding the pointer given. The +pointer must have been acquired from mem_heap_get_heap_top. The first +memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_free_heap_top( +/*===================*/ + mem_heap_t* heap, /*!< in: heap from which to free */ + byte* old_top)/*!< in: pointer to old top of heap */ +{ + mem_block_t* block; + mem_block_t* prev_block; +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG + ibool error; + ulint total_size; + ulint size; + + ut_ad(mem_heap_check(heap)); + + /* Validate the heap and get its total allocated size */ + mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size, + NULL, NULL); + ut_a(!error); + + /* Get the size below top pointer */ + mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL, + NULL); + ut_a(!error); + +#endif + + block = UT_LIST_GET_LAST(heap->base); + + while (block != NULL) { + if (((byte*) block + mem_block_get_free(block) >= old_top) + && ((byte*) block <= old_top)) { + /* Found the right block */ + + break; + } + + /* Store prev_block value before freeing the current block + (the current block will be erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } + + ut_ad(block); + + /* Set the free field of block */ + mem_block_set_free(block, old_top - (byte*) block); + + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top); +#if defined UNIV_MEM_DEBUG + /* In the debug version erase block from top up */ + mem_erase_buf(old_top, (byte*) block + block->len - old_top); + + /* Update allocated memory count */ + mutex_enter(&mem_hash_mutex); + mem_current_allocated_memory -= (total_size - size); + mutex_exit(&mem_hash_mutex); +#endif /* UNIV_MEM_DEBUG */ + UNIV_MEM_ALLOC(old_top, (byte*) block + block->len - old_top); + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } +} + +/*****************************************************************//** +Empties a memory heap. The first memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_empty( +/*===========*/ + mem_heap_t* heap) /*!< in: heap to empty */ +{ + mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap)); +#ifndef UNIV_HOTBACKUP + if (heap->free_block) { + mem_heap_free_block_free(heap); + } +#endif /* !UNIV_HOTBACKUP */ +} + +/*****************************************************************//** +Returns a pointer to the topmost element in a memory heap. The size of the +element must be given. 
+@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: size of the topmost element */ +{ + mem_block_t* block; + byte* buf; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n); + +#ifdef UNIV_MEM_DEBUG + ut_ad(mem_block_get_start(block) <= (ulint) (buf - (byte*) block)); + + /* In the debug version, advance buf to point at the storage which + was given to the caller in the allocation*/ + + buf += MEM_FIELD_HEADER_SIZE; + + /* Check that the field lengths agree */ + ut_ad(n == mem_field_header_get_len(buf)); +#endif + + return((void*) buf); +} + +/*****************************************************************//** +Frees the topmost element in a memory heap. The size of the element must be +given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: size of the topmost element */ +{ + mem_block_t* block; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + /* Subtract the free field of block */ + mem_block_set_free(block, mem_block_get_free(block) + - MEM_SPACE_NEEDED(n)); + UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n); +#ifdef UNIV_MEM_DEBUG + + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + + /* In the debug version check the consistency, and erase field */ + mem_field_erase((byte*) block + mem_block_get_free(block), n); +#endif + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } else { + /* Avoid a bogus UNIV_MEM_ASSERT_W() warning in a + subsequent invocation of mem_heap_free_top(). + Originally, this was UNIV_MEM_FREE(), to catch writes + to freed memory. */ + UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n); + } +} + +/*****************************************************************//** +NOTE: Use the corresponding macros instead of this function. Creates a +memory heap. For debugging purposes, takes also the file name and line as +argument. +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( +/*=================*/ + ulint n, /*!< in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type) /*!< in: heap type */ +{ + mem_block_t* block; + + if (!n) { + n = MEM_BLOCK_START_SIZE; + } + + block = mem_heap_create_block(NULL, n, type, file_name, line); + + if (block == NULL) { + + return(NULL); + } + + UT_LIST_INIT(block->base); + + /* Add the created block itself as the first block in the list */ + UT_LIST_ADD_FIRST(list, block->base, block); + +#ifdef UNIV_MEM_DEBUG + + mem_hash_insert(block, file_name, line); + +#endif + + return(block); +} + +/*****************************************************************//** +NOTE: Use the corresponding macro instead of this function. Frees the space +occupied by a memory heap. In the debug version erases the heap memory +blocks. 
*/ +UNIV_INLINE +void +mem_heap_free_func( +/*===============*/ + mem_heap_t* heap, /*!< in, own: heap to be freed */ + const char* file_name __attribute__((unused)), + /*!< in: file name where freed */ + ulint line __attribute__((unused))) +{ + mem_block_t* block; + mem_block_t* prev_block; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + +#ifdef UNIV_MEM_DEBUG + + /* In the debug version remove the heap from the hash table of heaps + and check its consistency */ + + mem_hash_remove(heap, file_name, line); + +#endif +#ifndef UNIV_HOTBACKUP + if (heap->free_block) { + mem_heap_free_block_free(heap); + } +#endif /* !UNIV_HOTBACKUP */ + + while (block != NULL) { + /* Store the contents of info before freeing current block + (it is erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } +} + +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. +Allocates a single buffer of memory from the dynamic memory of +the C compiler. Is like malloc of C. The buffer must be freed +with mem_free. +@return own: free storage */ +UNIV_INLINE +void* +mem_alloc_func( +/*===========*/ + ulint n, /*!< in: desired number of bytes */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint* size) /*!< out: allocated size in bytes, + or NULL */ +{ + mem_heap_t* heap; + void* buf; + + heap = mem_heap_create_at(n, file_name, line); + + /* Note that as we created the first block in the heap big enough + for the buffer requested by the caller, the buffer will be in the + first block and thus we can calculate the pointer to the heap from + the pointer to the buffer when we free the memory buffer. */ + + if (size) { + /* Adjust the allocation to the actual size of the + memory block. */ + ulint m = mem_block_get_len(heap) + - mem_block_get_free(heap); +#ifdef UNIV_MEM_DEBUG + m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE; +#endif /* UNIV_MEM_DEBUG */ + ut_ad(m >= n); + n = m; + *size = m; + } + + buf = mem_heap_alloc(heap, n); + + ut_a((byte*) heap == (byte*) buf - MEM_BLOCK_HEADER_SIZE + - MEM_FIELD_HEADER_SIZE); + return(buf); +} + +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. Frees a single +buffer of storage from the dynamic memory of the C compiler. Similar to the +free of C. */ +UNIV_INLINE +void +mem_free_func( +/*==========*/ + void* ptr, /*!< in, own: buffer to be freed */ + const char* file_name, /*!< in: file name where created */ + ulint line) /*!< in: line where created */ +{ + mem_heap_t* heap; + + heap = (mem_heap_t*)((byte*) ptr - MEM_BLOCK_HEADER_SIZE + - MEM_FIELD_HEADER_SIZE); + mem_heap_free_func(heap, file_name, line); +} + +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint size = 0; + + ut_ad(mem_heap_check(heap)); + + size = heap->total_size; + +#ifndef UNIV_HOTBACKUP + if (heap->free_block) { + size += UNIV_PAGE_SIZE; + } +#endif /* !UNIV_HOTBACKUP */ + + return(size); +} + +/**********************************************************************//** +Duplicates a NUL-terminated string. 
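
mem_free_func() above recovers the owning heap by pointer arithmetic alone: a mem_alloc()'ed buffer is always the first allocation in its own single-block heap, so the header lies at a fixed negative offset from the buffer, as the ut_a() in mem_alloc_func() asserts. The same header-behind-the-pointer idiom in standalone form:

#include <stdlib.h>

typedef struct {
    size_t len;                     /* stand-in for the block header */
} hdr_t;

static void*
toy_alloc(size_t n)
{
    hdr_t* h = (hdr_t*) malloc(sizeof(hdr_t) + n);

    if (h == NULL) {
        return NULL;
    }
    h->len = n;
    return h + 1;                   /* the caller sees only the payload */
}

static void
toy_free(void* ptr)
{
    free((hdr_t*) ptr - 1);         /* step back over the header */
}
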
+@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str) /*!< in: string to be copied */ +{ + ulint len = strlen(str) + 1; + return((char*) memcpy(mem_alloc(len), str, len)); +} + +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len) /*!< in: length of str, in bytes */ +{ + char* s = (char*) mem_alloc(len + 1); + s[len] = 0; + return((char*) memcpy(s, str, len)); +} + +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string, +allocated from a memory heap. +@return own: a copy of the string */ +UNIV_INLINE +char* +mem_heap_strdupl( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* str, /*!< in: string to be copied */ + ulint len) /*!< in: length of str, in bytes */ +{ + char* s = (char*) mem_heap_alloc(heap, len + 1); + s[len] = 0; + return((char*) memcpy(s, str, len)); +} diff --git a/storage/xtradb/include/mem0pool.h b/storage/xtradb/include/mem0pool.h new file mode 100644 index 00000000000..a65ba50fdf9 --- /dev/null +++ b/storage/xtradb/include/mem0pool.h @@ -0,0 +1,121 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0pool.h +The lowest-level memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0pool_h +#define mem0pool_h + +#include "univ.i" +#include "os0file.h" +#include "ut0lst.h" + +/** Memory pool */ +struct mem_pool_t; + +/** The common memory pool */ +extern mem_pool_t* mem_comm_pool; + +/** Memory area header */ +struct mem_area_t{ + ulint size_and_free; /*!< memory area size is obtained by + anding with ~MEM_AREA_FREE; area in + a free list if ANDing with + MEM_AREA_FREE results in nonzero */ + UT_LIST_NODE_T(mem_area_t) + free_list; /*!< free list node */ +}; + +/** Each memory area takes this many extra bytes for control information */ +#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_t),\ + UNIV_MEM_ALIGNMENT)) + +/********************************************************************//** +Creates a memory pool. 
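
mem_strdupl() and mem_heap_strdupl() above copy a length-delimited fragment and append the terminating NUL themselves; writing s[len] before the memcpy() is harmless because the copy never touches that byte. A plain-C usage sketch of the idiom (my_strdupl is an illustrative stand-in):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char*
my_strdupl(const char* str, size_t len)
{
    char* s = (char*) malloc(len + 1);

    if (s != NULL) {
        memcpy(s, str, len);
        s[len] = '\0';              /* terminate the copied fragment */
    }
    return s;
}

int main(void)
{
    const char buf[4] = {'a', 'b', 'c', 'd'};   /* not NUL-terminated */
    char* s = my_strdupl(buf, 3);

    if (s != NULL) {
        puts(s);                    /* prints "abc" */
        free(s);
    }
    return 0;
}
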
+@return memory pool */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+ ulint size); /*!< in: pool size in bytes */
+/********************************************************************//**
+Frees a memory pool. */
+UNIV_INTERN
+void
+mem_pool_free(
+/*==========*/
+ mem_pool_t* pool); /*!< in, own: memory pool */
+/********************************************************************//**
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*!
+@return own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+ ulint* psize, /*!< in: requested size in bytes; for optimum
+ space usage, the size should be a power of 2
+ minus MEM_AREA_EXTRA_SIZE;
+ out: allocated size in bytes (greater than
+ or equal to the requested size) */
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Frees memory to a pool. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+ void* ptr, /*!< in, own: pointer to allocated memory
+ buffer */
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Returns the amount of reserved memory.
+@return reserved memory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Validates a memory pool.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Prints info of a memory pool. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+ FILE* outfile,/*!< in: output file to write to */
+ mem_pool_t* pool); /*!< in: memory pool */
+
+
+#ifndef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mem0pool.ic b/storage/xtradb/include/mem0pool.ic
new file mode 100644
index 00000000000..f4bafb8ba63
--- /dev/null
+++ b/storage/xtradb/include/mem0pool.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
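
The sizing hint in mem_area_alloc() above (request a power of two minus MEM_AREA_EXTRA_SIZE) suggests that areas are handed out in power-of-two sizes, each carrying MEM_AREA_EXTRA_SIZE bytes of control data, so such a request fills an area exactly. Worked arithmetic under that assumption, with 16 as a stand-in for the real MEM_AREA_EXTRA_SIZE (whose value depends on UNIV_MEM_ALIGNMENT):

#include <stdio.h>

int main(void)
{
    const unsigned extra = 16;      /* stand-in for MEM_AREA_EXTRA_SIZE */
    unsigned area;

    for (area = 64; area <= 512; area <<= 1) {
        printf("requesting %u bytes fills one %u-byte area exactly\n",
               area - extra, area);
    }
    return 0;
}
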
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/mem0pool.ic +The lowest-level memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ diff --git a/storage/xtradb/include/mtr0log.h b/storage/xtradb/include/mtr0log.h new file mode 100644 index 00000000000..18a345d050f --- /dev/null +++ b/storage/xtradb/include/mtr0log.h @@ -0,0 +1,251 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0log.h +Mini-transaction logging routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0log_h +#define mtr0log_h + +#include "univ.i" +#include "mtr0mtr.h" +#include "dict0types.h" + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log +record to the mini-transaction log if mtr is not NULL. */ +UNIV_INTERN +void +mlog_write_ulint( +/*=============*/ + byte* ptr, /*!< in: pointer where to write */ + ulint val, /*!< in: value to write */ + byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes 8 bytes to a file page. Writes the corresponding log +record to the mini-transaction log, only if mtr is not NULL */ +UNIV_INTERN +void +mlog_write_ull( +/*===========*/ + byte* ptr, /*!< in: pointer where to write */ + ib_uint64_t val, /*!< in: value to write */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes a string to a file page buffered in the buffer pool. Writes the +corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_write_string( +/*==============*/ + byte* ptr, /*!< in: pointer where to write */ + const byte* str, /*!< in: string to write */ + ulint len, /*!< in: string length */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Logs a write of a string to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. 
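
Conceptually, each mlog_write_* routine above does two things: it applies the change to the page image in the buffer pool and appends a redo record describing the change to the mini-transaction log, so recovery can replay it. A toy illustration of that pairing; the record struct here is invented for the sketch and is not the real MLOG record layout:

#include <stdint.h>

typedef struct {
    uint8_t  type;      /* illustrative stand-in for MLOG_4BYTES etc. */
    uint16_t offset;    /* where on the page the write happened */
    uint32_t val;       /* the new value */
} toy_rec_t;

static void
toy_write_ulint(unsigned char* page, uint16_t offset, uint32_t val,
                toy_rec_t* log_out)
{
    page[offset]     = (unsigned char)(val >> 24);  /* big-endian, as on disk */
    page[offset + 1] = (unsigned char)(val >> 16);
    page[offset + 2] = (unsigned char)(val >> 8);
    page[offset + 3] = (unsigned char) val;

    log_out->type   = 4;            /* queue a record describing the change */
    log_out->offset = offset;
    log_out->val    = val;
}
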
*/ +UNIV_INTERN +void +mlog_log_string( +/*============*/ + byte* ptr, /*!< in: pointer written to */ + ulint len, /*!< in: string length */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes initial part of a log record consisting of one-byte item +type and four-byte space and page numbers. */ +UNIV_INTERN +void +mlog_write_initial_log_record( +/*==========================*/ + const byte* ptr, /*!< in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /*!< in: log item type: MLOG_1BYTE, ... */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes a log record about an .ibd file create/delete/rename. +@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/*!< in: space id, if applicable */ + ulint page_no,/*!< in: page number (not relevant currently) */ + byte* log_ptr,/*!< in: pointer to mtr log which has been opened */ + mtr_t* mtr); /*!< in: mtr */ +/********************************************************//** +Catenates 1 - 4 bytes to the mtr log. */ +UNIV_INLINE +void +mlog_catenate_ulint( +/*================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val, /*!< in: value to write */ + ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ +/********************************************************//** +Catenates n bytes to the mtr log. */ +UNIV_INTERN +void +mlog_catenate_string( +/*=================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* str, /*!< in: string to write */ + ulint len); /*!< in: string length */ +/********************************************************//** +Catenates a compressed ulint to mlog. */ +UNIV_INLINE +void +mlog_catenate_ulint_compressed( +/*===========================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val); /*!< in: value to write */ +/********************************************************//** +Catenates a compressed 64-bit integer to mlog. */ +UNIV_INLINE +void +mlog_catenate_ull_compressed( +/*=========================*/ + mtr_t* mtr, /*!< in: mtr */ + ib_uint64_t val); /*!< in: value to write */ +/********************************************************//** +Opens a buffer to mlog. It must be closed with mlog_close. +@return buffer, NULL if log mode MTR_LOG_NONE */ +UNIV_INLINE +byte* +mlog_open( +/*======*/ + mtr_t* mtr, /*!< in: mtr */ + ulint size); /*!< in: buffer size in bytes; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +/********************************************************//** +Closes a buffer opened to mlog. */ +UNIV_INLINE +void +mlog_close( +/*=======*/ + mtr_t* mtr, /*!< in: mtr */ + byte* ptr); /*!< in: buffer space from ptr up was not used */ +/********************************************************//** +Writes the initial part of a log record (3..11 bytes). +If the implementation of this function is changed, all +size parameters to mlog_open() should be adjusted accordingly! +@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_fast( +/*===============================*/ + const byte* ptr, /*!< in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /*!< in: log item type: MLOG_1BYTE, ... 
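mlog_open() and mlog_close() bracket raw writes into the mtr log buffer, and mlog_write_initial_log_record_fast() stamps the type/space/page prefix; this is the discipline the mlog_write_* helpers follow internally. A hedged sketch of composing a 2-byte record by hand (11 bytes is the documented worst case for the initial part; the payload value is arbitrary):

#include "mtr0log.h"
#include "mach0data.h"

/* Sketch: hand-rolled record; field_ptr must lie in an x-latched page. */
void
compose_record_example(const byte* field_ptr, mtr_t* mtr)
{
	byte*	log_ptr = mlog_open(mtr, 11 + 2);

	if (log_ptr == NULL) {
		return;	/* MTR_LOG_NONE: nothing to log */
	}

	log_ptr = mlog_write_initial_log_record_fast(
		field_ptr, MLOG_2BYTES, log_ptr, mtr);

	mach_write_to_2(log_ptr, 0x1234);	/* the payload itself */
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}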
*/ + byte* log_ptr,/*!< in: pointer to mtr log which has + been opened */ + mtr_t* mtr); /*!< in: mtr */ +#else /* !UNIV_HOTBACKUP */ +# define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0) +# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte*) 0) +#endif /* !UNIV_HOTBACKUP */ +/********************************************************//** +Parses an initial log record written by mlog_write_initial_log_record. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_initial_log_record( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* type, /*!< out: log record type: MLOG_1BYTE, ... */ + ulint* space, /*!< out: space id */ + ulint* page_no);/*!< out: page number */ +/********************************************************//** +Parses a log record written by mlog_write_ulint or mlog_write_ull. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_nbytes( +/*==============*/ + ulint type, /*!< in: log record type: MLOG_1BYTE, ... */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* page, /*!< in: page where to apply the log record, or NULL */ + void* page_zip);/*!< in/out: compressed page, or NULL */ +/********************************************************//** +Parses a log record written by mlog_write_string. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_string( +/*==============*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* page, /*!< in: page where to apply the log record, or NULL */ + void* page_zip);/*!< in/out: compressed page, or NULL */ + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. Reserves space +for further log entries. The log entry must be closed with +mtr_close(). +@return buffer, NULL if log mode MTR_LOG_NONE */ +UNIV_INTERN +byte* +mlog_open_and_write_index( +/*======================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* rec, /*!< in: index record or page */ + const dict_index_t* index, /*!< in: record descriptor */ + byte type, /*!< in: log item type */ + ulint size); /*!< in: requested buffer size in bytes + (if 0, calls mlog_close() and + returns NULL) */ +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Parses a log record written by mlog_open_and_write_index. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_index( +/*=============*/ + byte* ptr, /*!< in: buffer */ + const byte* end_ptr,/*!< in: buffer end */ + ibool comp, /*!< in: TRUE=compact record format */ + dict_index_t** index); /*!< out, own: dummy index */ + +#ifndef UNIV_HOTBACKUP +/* Insert, update, and maybe other functions may use this value to define an +extra mlog buffer size for variable size data */ +#define MLOG_BUF_MARGIN 256 +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "mtr0log.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mtr0log.ic b/storage/xtradb/include/mtr0log.ic new file mode 100644 index 00000000000..bc49f655294 --- /dev/null +++ b/storage/xtradb/include/mtr0log.ic @@ -0,0 +1,277 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. 
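On the recovery side the mlog_parse_* functions mirror the writers: each consumes one record and returns its end, or NULL when the buffer stops mid-record. A simplified sketch of the dispatch shape (real recovery in log0recv handles every MLOG type plus the multi-record framing; this covers just two cases):

#include "mtr0log.h"

/* Sketch: parse one record header, then apply it if we know the type. */
byte*
parse_one_record_example(byte* ptr, byte* end_ptr, byte* page)
{
	byte	type;
	ulint	space;
	ulint	page_no;

	ptr = mlog_parse_initial_log_record(ptr, end_ptr,
					    &type, &space, &page_no);
	if (ptr == NULL) {
		return(NULL);	/* incomplete: wait for more log */
	}

	switch (type) {
	case MLOG_1BYTE: case MLOG_2BYTES:
	case MLOG_4BYTES: case MLOG_8BYTES:
		return(mlog_parse_nbytes(type, ptr, end_ptr, page, NULL));
	case MLOG_WRITE_STRING:
		return(mlog_parse_string(ptr, end_ptr, page, NULL));
	default:
		return(NULL);	/* other types: handled elsewhere */
	}
}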
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0log.ic +Mini-transaction logging routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "ut0lst.h" +#include "buf0buf.h" +#include "buf0dblwr.h" +#include "fsp0types.h" +#include "btr0types.h" +#include "trx0sys.h" + +/********************************************************//** +Opens a buffer to mlog. It must be closed with mlog_close. +@return buffer, NULL if log mode MTR_LOG_NONE */ +UNIV_INLINE +byte* +mlog_open( +/*======*/ + mtr_t* mtr, /*!< in: mtr */ + ulint size) /*!< in: buffer size in bytes; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +{ + dyn_array_t* mlog; + + mtr->modifications = TRUE; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return(NULL); + } + + mlog = &(mtr->log); + + return(dyn_array_open(mlog, size)); +} + +/********************************************************//** +Closes a buffer opened to mlog. */ +UNIV_INLINE +void +mlog_close( +/*=======*/ + mtr_t* mtr, /*!< in: mtr */ + byte* ptr) /*!< in: buffer space from ptr up was not used */ +{ + dyn_array_t* mlog; + + ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE); + + mlog = &(mtr->log); + + dyn_array_close(mlog, ptr); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */ +UNIV_INLINE +void +mlog_catenate_ulint( +/*================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val, /*!< in: value to write */ + ulint type) /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ +{ + dyn_array_t* mlog; + byte* ptr; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return; + } + + mlog = &(mtr->log); + +#if MLOG_1BYTE != 1 +# error "MLOG_1BYTE != 1" +#endif +#if MLOG_2BYTES != 2 +# error "MLOG_2BYTES != 2" +#endif +#if MLOG_4BYTES != 4 +# error "MLOG_4BYTES != 4" +#endif +#if MLOG_8BYTES != 8 +# error "MLOG_8BYTES != 8" +#endif + ptr = (byte*) dyn_array_push(mlog, type); + + if (type == MLOG_4BYTES) { + mach_write_to_4(ptr, val); + } else if (type == MLOG_2BYTES) { + mach_write_to_2(ptr, val); + } else { + ut_ad(type == MLOG_1BYTE); + mach_write_to_1(ptr, val); + } +} + +/********************************************************//** +Catenates a compressed ulint to mlog. 
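The buffer sizes in the two compressed variants that follow (10 and 15 bytes) look arbitrary but are just conservative bounds: by my reading of mach0data, mach_write_compressed() emits at most 5 bytes for a 32-bit value and mach_ull_write_compressed() at most 9, and mlog_close() hands back whatever was not used. A sketch of the same pattern for a caller that wants the encoded length back (function name hypothetical):

#include "mtr0log.h"
#include "mach0data.h"

/* Sketch: catenate a compressed ulint and report its encoded size. */
ulint
catenate_compressed_example(mtr_t* mtr, ulint val)
{
	byte*	log_ptr = mlog_open(mtr, 10);
	ulint	len;

	if (log_ptr == NULL) {
		return(0);	/* logging disabled for this mtr */
	}

	len = mach_write_compressed(log_ptr, val);
	mlog_close(mtr, log_ptr + len);

	return(len);	/* 1..5 bytes for a 32-bit value */
}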
*/ +UNIV_INLINE +void +mlog_catenate_ulint_compressed( +/*===========================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val) /*!< in: value to write */ +{ + byte* log_ptr; + + log_ptr = mlog_open(mtr, 10); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr += mach_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); +} + +/********************************************************//** +Catenates a compressed 64-bit integer to mlog. */ +UNIV_INLINE +void +mlog_catenate_ull_compressed( +/*=========================*/ + mtr_t* mtr, /*!< in: mtr */ + ib_uint64_t val) /*!< in: value to write */ +{ + byte* log_ptr; + + log_ptr = mlog_open(mtr, 15); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr += mach_ull_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); +} + +/********************************************************//** +Writes the initial part of a log record (3..11 bytes). +If the implementation of this function is changed, all +size parameters to mlog_open() should be adjusted accordingly! +@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_fast( +/*===============================*/ + const byte* ptr, /*!< in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /*!< in: log item type: MLOG_1BYTE, ... */ + byte* log_ptr,/*!< in: pointer to mtr log which has + been opened */ + mtr_t* mtr) /*!< in: mtr */ +{ +#ifdef UNIV_DEBUG + buf_block_t* block; +#endif + const byte* page; + ulint space; + ulint offset; + + ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX)); + ut_ad(type <= MLOG_BIGGEST_TYPE); + ut_ad(ptr && log_ptr); + + page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE); + space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + offset = mach_read_from_4(page + FIL_PAGE_OFFSET); + + /* check whether the page is in the doublewrite buffer; + the doublewrite buffer is located in pages + FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the + system tablespace */ + if (space == TRX_SYS_SPACE + && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) { + if (buf_dblwr_being_created) { + /* Do nothing: we only come to this branch in an + InnoDB database creation. We do not redo log + anything for the doublewrite buffer pages. */ + return(log_ptr); + } else { + fprintf(stderr, + "Error: trying to redo log a record of type " + "%d on page %lu of space %lu in the " + "doublewrite buffer, continuing anyway.\n" + "Please post a bug report to " + "bugs.mysql.com.\n", + type, offset, space); + ut_ad(0); + } + } + + mach_write_to_1(log_ptr, type); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, space); + log_ptr += mach_write_compressed(log_ptr, offset); + + mtr->n_log_recs++; + +#ifdef UNIV_LOG_DEBUG + fprintf(stderr, + "Adding to mtr log record type %lu space %lu page no %lu\n", + (ulong) type, space, offset); +#endif + +#ifdef UNIV_DEBUG + /* We now assume that all x-latched pages have been modified! */ + block = (buf_block_t*) buf_block_align(ptr); + + if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) { + + mtr_memo_push(mtr, block, MTR_MEMO_MODIFY); + } +#endif + return(log_ptr); +} + +/********************************************************//** +Writes a log record about an .ibd file create/delete/rename. 
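mlog_write_initial_log_record_fast() above recovers the page identity straight from the frame: align the pointer down to the page start, then read the space id and page number out of the FIL header. The same pointer arithmetic in isolation (a sketch; the helper name is mine):

#include "univ.i"
#include "ut0byte.h"
#include "mach0data.h"
#include "fil0fil.h"

/* Sketch: derive (space, page_no) from any pointer into a page frame. */
void
page_identity_example(const byte* ptr, ulint* space, ulint* page_no)
{
	const byte*	page = (const byte*) ut_align_down(
		ptr, UNIV_PAGE_SIZE);

	*space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
	*page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
}

The doublewrite check above is then a plain range test on that page number within the system tablespace, pages FSP_EXTENT_SIZE .. 3 * FSP_EXTENT_SIZE - 1.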
+@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/*!< in: space id, if applicable */ + ulint page_no,/*!< in: page number (not relevant currently) */ + byte* log_ptr,/*!< in: pointer to mtr log which has been opened */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(log_ptr); + + mach_write_to_1(log_ptr, type); + log_ptr++; + + /* We write dummy space id and page number */ + log_ptr += mach_write_compressed(log_ptr, space_id); + log_ptr += mach_write_compressed(log_ptr, page_no); + + mtr->n_log_recs++; + + return(log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h new file mode 100644 index 00000000000..0730e870b3f --- /dev/null +++ b/storage/xtradb/include/mtr0mtr.h @@ -0,0 +1,420 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0mtr.h +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0mtr_h +#define mtr0mtr_h + +#include "univ.i" +#include "mem0mem.h" +#include "dyn0dyn.h" +#include "buf0types.h" +#include "sync0rw.h" +#include "ut0byte.h" +#include "mtr0types.h" +#include "page0types.h" + +/* Logging modes for a mini-transaction */ +#define MTR_LOG_ALL 21 /* default mode: log all operations + modifying disk-based data */ +#define MTR_LOG_NONE 22 /* log no operations */ +#define MTR_LOG_NO_REDO 23 /* Don't generate REDO */ +/*#define MTR_LOG_SPACE 23 */ /* log only operations modifying + file space page allocation data + (operations in fsp0fsp.* ) */ +#define MTR_LOG_SHORT_INSERTS 24 /* inserts are logged in a shorter + form */ + +/* Types for the mlock objects to store in the mtr memo; NOTE that the +first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ +#define MTR_MEMO_PAGE_S_FIX RW_S_LATCH +#define MTR_MEMO_PAGE_X_FIX RW_X_LATCH +#define MTR_MEMO_BUF_FIX RW_NO_LATCH +#ifdef UNIV_DEBUG +# define MTR_MEMO_MODIFY 54 +#endif /* UNIV_DEBUG */ +#define MTR_MEMO_S_LOCK 55 +#define MTR_MEMO_X_LOCK 56 + +/** @name Log item types +The log items are declared 'byte' so that the compiler can warn if val +and type parameters are switched in a call to mlog_write_ulint. NOTE! +For 1 - 8 bytes, the flag value must give the length also! 
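Because mtr_set_log_mode() hands back the previous mode, callers can scope a no-redo region and restore exactly what was in effect before, whatever that was. A minimal sketch (when MTR_LOG_NO_REDO is appropriate is up to the caller; typically pages that recovery will reinitialise anyway):

#include "mtr0mtr.h"

/* Sketch: suppress redo generation for part of a mini-transaction. */
void
no_redo_scope_example(mtr_t* mtr)
{
	ulint	old_mode = mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);

	/* ... page changes that must not generate redo ... */

	mtr_set_log_mode(mtr, old_mode);
}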
@{ */ +#define MLOG_SINGLE_REC_FLAG 128 /*!< if the mtr contains only + one log record for one page, + i.e., write_initial_log_record + has been called only once, + this flag is ORed to the type + of that first log record */ +#define MLOG_1BYTE (1) /*!< one byte is written */ +#define MLOG_2BYTES (2) /*!< 2 bytes ... */ +#define MLOG_4BYTES (4) /*!< 4 bytes ... */ +#define MLOG_8BYTES (8) /*!< 8 bytes ... */ +#define MLOG_REC_INSERT ((byte)9) /*!< record insert */ +#define MLOG_REC_CLUST_DELETE_MARK ((byte)10) /*!< mark clustered index record + deleted */ +#define MLOG_REC_SEC_DELETE_MARK ((byte)11) /*!< mark secondary index record + deleted */ +#define MLOG_REC_UPDATE_IN_PLACE ((byte)13) /*!< update of a record, + preserves record field sizes */ +#define MLOG_REC_DELETE ((byte)14) /*!< delete a record from a + page */ +#define MLOG_LIST_END_DELETE ((byte)15) /*!< delete record list end on + index page */ +#define MLOG_LIST_START_DELETE ((byte)16) /*!< delete record list start on + index page */ +#define MLOG_LIST_END_COPY_CREATED ((byte)17) /*!< copy record list end to a + new created index page */ +#define MLOG_PAGE_REORGANIZE ((byte)18) /*!< reorganize an + index page in + ROW_FORMAT=REDUNDANT */ +#define MLOG_PAGE_CREATE ((byte)19) /*!< create an index page */ +#define MLOG_UNDO_INSERT ((byte)20) /*!< insert entry in an undo + log */ +#define MLOG_UNDO_ERASE_END ((byte)21) /*!< erase an undo log + page end */ +#define MLOG_UNDO_INIT ((byte)22) /*!< initialize a page in an + undo log */ +#define MLOG_UNDO_HDR_DISCARD ((byte)23) /*!< discard an update undo log + header */ +#define MLOG_UNDO_HDR_REUSE ((byte)24) /*!< reuse an insert undo log + header */ +#define MLOG_UNDO_HDR_CREATE ((byte)25) /*!< create an undo + log header */ +#define MLOG_REC_MIN_MARK ((byte)26) /*!< mark an index + record as the + predefined minimum + record */ +#define MLOG_IBUF_BITMAP_INIT ((byte)27) /*!< initialize an + ibuf bitmap page */ +/*#define MLOG_FULL_PAGE ((byte)28) full contents of a page */ +#ifdef UNIV_LOG_LSN_DEBUG +# define MLOG_LSN ((byte)28) /* current LSN */ +#endif +#define MLOG_INIT_FILE_PAGE ((byte)29) /*!< this means that a + file page is taken + into use and the prior + contents of the page + should be ignored: in + recovery we must not + trust the lsn values + stored to the file + page */ +#define MLOG_WRITE_STRING ((byte)30) /*!< write a string to + a page */ +#define MLOG_MULTI_REC_END ((byte)31) /*!< if a single mtr writes + several log records, + this log record ends the + sequence of these records */ +#define MLOG_DUMMY_RECORD ((byte)32) /*!< dummy log record used to + pad a log block full */ +#define MLOG_FILE_CREATE ((byte)33) /*!< log record about an .ibd + file creation */ +#define MLOG_FILE_RENAME ((byte)34) /*!< log record about an .ibd + file rename */ +#define MLOG_FILE_DELETE ((byte)35) /*!< log record about an .ibd + file deletion */ +#define MLOG_COMP_REC_MIN_MARK ((byte)36) /*!< mark a compact + index record as the + predefined minimum + record */ +#define MLOG_COMP_PAGE_CREATE ((byte)37) /*!< create a compact + index page */ +#define MLOG_COMP_REC_INSERT ((byte)38) /*!< compact record insert */ +#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39) + /*!< mark compact + clustered index record + deleted */ +#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/*!< mark compact + secondary index record + deleted; this log + record type is + redundant, as + MLOG_REC_SEC_DELETE_MARK + is independent of the + record format. 
*/ +#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/*!< update of a + compact record, + preserves record field + sizes */ +#define MLOG_COMP_REC_DELETE ((byte)42) /*!< delete a compact record + from a page */ +#define MLOG_COMP_LIST_END_DELETE ((byte)43) /*!< delete compact record list + end on index page */ +#define MLOG_COMP_LIST_START_DELETE ((byte)44) /*!< delete compact record list + start on index page */ +#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45) + /*!< copy compact + record list end to a + new created index + page */ +#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /*!< reorganize an index page */ +#define MLOG_FILE_CREATE2 ((byte)47) /*!< log record about creating + an .ibd file, with format */ +#define MLOG_ZIP_WRITE_NODE_PTR ((byte)48) /*!< write the node pointer of + a record on a compressed + non-leaf B-tree page */ +#define MLOG_ZIP_WRITE_BLOB_PTR ((byte)49) /*!< write the BLOB pointer + of an externally stored column + on a compressed page */ +#define MLOG_ZIP_WRITE_HEADER ((byte)50) /*!< write to compressed page + header */ +#define MLOG_ZIP_PAGE_COMPRESS ((byte)51) /*!< compress an index page */ +#define MLOG_ZIP_PAGE_COMPRESS_NO_DATA ((byte)52)/*!< compress an index page + without logging it's image */ +#define MLOG_ZIP_PAGE_REORGANIZE ((byte)53) /*!< reorganize a compressed + page */ +#define MLOG_BIGGEST_TYPE ((byte)53) /*!< biggest value (used in + assertions) */ +/* @} */ + +/** @name Flags for MLOG_FILE operations +(stored in the page number parameter, called log_flags in the +functions). The page number parameter was originally written as 0. @{ */ +#define MLOG_FILE_FLAG_TEMP 1 /*!< identifies TEMPORARY TABLE in + MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */ +/* @} */ + +/* included here because it needs MLOG_LSN defined */ +#include "log0log.h" + +/***************************************************************//** +Starts a mini-transaction. */ +UNIV_INLINE +void +mtr_start( +/*======*/ + mtr_t* mtr) /*!< out: mini-transaction */ + __attribute__((nonnull)); +/***************************************************************//** +Commits a mini-transaction. */ +UNIV_INTERN +void +mtr_commit( +/*=======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/**********************************************************//** +Sets and returns a savepoint in mtr. +@return savepoint */ +UNIV_INLINE +ulint +mtr_set_savepoint( +/*==============*/ + mtr_t* mtr); /*!< in: mtr */ +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Releases the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +UNIV_INLINE +void +mtr_release_s_latch_at_savepoint( +/*=============================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint savepoint, /*!< in: savepoint */ + prio_rw_lock_t* lock); /*!< in: latch to release */ +#else /* !UNIV_HOTBACKUP */ +# define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Gets the logging mode of a mini-transaction. +@return logging mode: MTR_LOG_NONE, ... */ +UNIV_INLINE +ulint +mtr_get_log_mode( +/*=============*/ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Changes the logging mode of a mini-transaction. +@return old mode */ +UNIV_INLINE +ulint +mtr_set_log_mode( +/*=============*/ + mtr_t* mtr, /*!< in: mtr */ + ulint mode); /*!< in: logging mode: MTR_LOG_NONE, ... 
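Savepoints exist so that one latch can be released out of the middle of the memo without committing the whole mini-transaction; the B-tree code pairs mtr_set_savepoint() with mtr_release_s_latch_at_savepoint() in exactly this shape. A sketch (mtr_s_lock() is the macro declared just below; the tree-descent body is elided):

#include "mtr0mtr.h"

/* Sketch: remember where a latch lands in the memo, release it early. */
void
savepoint_example(mtr_t* mtr, prio_rw_lock_t* index_lock)
{
	ulint	savepoint = mtr_set_savepoint(mtr);

	mtr_s_lock(index_lock, mtr);	/* pushed at 'savepoint' */

	/* ... descend the tree; once the child pages are latched,
	the index-level s-latch can be dropped ... */

	mtr_release_s_latch_at_savepoint(mtr, savepoint, index_lock);
}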
*/ +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. +@return value read */ +UNIV_INTERN +ulint +mtr_read_ulint( +/*===========*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +This macro locks an rw-lock in s-mode. */ +#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\ + (MTR)) +/*********************************************************************//** +This macro locks an rw-lock in x-mode. */ +#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\ + (MTR)) +/*********************************************************************//** +NOTE! Use the macro above! +Locks a lock in s-mode. */ +UNIV_INLINE +void +mtr_s_lock_func( +/*============*/ + prio_rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************************//** +NOTE! Use the macro above! +Locks a lock in x-mode. */ +UNIV_INLINE +void +mtr_x_lock_func( +/*============*/ + prio_rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr); /*!< in: mtr */ +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************//** +Releases an object in the memo stack. +@return true if released */ +UNIV_INTERN +bool +mtr_memo_release( +/*=============*/ + mtr_t* mtr, /*!< in/out: mini-transaction */ + void* object, /*!< in: object */ + ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# ifndef UNIV_HOTBACKUP +/**********************************************************//** +Checks if memo contains the given item. +@return TRUE if contains */ +UNIV_INLINE +bool +mtr_memo_contains( +/*==============*/ + mtr_t* mtr, /*!< in: mtr */ + const void* object, /*!< in: object to search */ + ulint type) /*!< in: type of object */ + __attribute__((warn_unused_result, nonnull)); + +/**********************************************************//** +Checks if memo contains the given page. +@return TRUE if contains */ +UNIV_INTERN +ibool +mtr_memo_contains_page( +/*===================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* ptr, /*!< in: pointer to buffer frame */ + ulint type); /*!< in: type of object */ +/*********************************************************//** +Prints info of an mtr handle. */ +UNIV_INTERN +void +mtr_print( +/*======*/ + mtr_t* mtr); /*!< in: mtr */ +# else /* !UNIV_HOTBACKUP */ +# define mtr_memo_contains(mtr, object, type) TRUE +# define mtr_memo_contains_page(mtr, ptr, type) TRUE +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_DEBUG */ +/*######################################################################*/ + +#define MTR_BUF_MEMO_SIZE 200 /* number of slots in memo */ + +/***************************************************************//** +Returns the log object of a mini-transaction buffer. +@return log */ +UNIV_INLINE +dyn_array_t* +mtr_get_log( +/*========*/ + mtr_t* mtr); /*!< in: mini-transaction */ +/***************************************************//** +Pushes an object to an mtr memo stack. 
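mtr_memo_release() is the general-purpose counterpart to the savepoint release: it searches the memo for the object and lets go of it immediately rather than at commit. A sketch combining it with the mtr_x_lock() macro above:

#include "mtr0mtr.h"

/* Sketch: take an x-latch through the mtr, drop it before commit. */
void
lock_and_release_example(prio_rw_lock_t* lock, mtr_t* mtr)
{
	mtr_x_lock(lock, mtr);	/* latches and pushes to the memo */

	/* ... work that needs the latch ... */

	mtr_memo_release(mtr, lock, MTR_MEMO_X_LOCK);
}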
*/ +UNIV_INLINE +void +mtr_memo_push( +/*==========*/ + mtr_t* mtr, /*!< in: mtr */ + void* object, /*!< in: object */ + ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */ + +/** Mini-transaction memo stack slot. */ +struct mtr_memo_slot_t{ + ulint type; /*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */ + void* object; /*!< pointer to the object */ +}; + +/* Mini-transaction handle and buffer */ +struct mtr_t{ +#ifdef UNIV_DEBUG + ulint state; /*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */ +#endif + dyn_array_t memo; /*!< memo stack for locks etc. */ + dyn_array_t log; /*!< mini-transaction log */ + unsigned inside_ibuf:1; + /*!< TRUE if inside ibuf changes */ + unsigned modifications:1; + /*!< TRUE if the mini-transaction + modified buffer pool pages */ + unsigned made_dirty:1; + /*!< TRUE if mtr has made at least + one buffer pool page dirty */ + ulint n_log_recs; + /* count of how many page initial log records + have been written to the mtr log */ + ulint n_freed_pages; + /* number of pages that have been freed in + this mini-transaction */ + ulint log_mode; /* specifies which operations should be + logged; default value MTR_LOG_ALL */ + lsn_t start_lsn;/* start lsn of the possible log entry for + this mtr */ + lsn_t end_lsn;/* end lsn of the possible log entry for + this mtr */ +#ifdef UNIV_DEBUG + ulint magic_n; +#endif /* UNIV_DEBUG */ +}; + +#ifdef UNIV_DEBUG +# define MTR_MAGIC_N 54551 +#endif /* UNIV_DEBUG */ + +#define MTR_ACTIVE 12231 +#define MTR_COMMITTING 56456 +#define MTR_COMMITTED 34676 + +#ifndef UNIV_NONINL +#include "mtr0mtr.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mtr0mtr.ic b/storage/xtradb/include/mtr0mtr.ic new file mode 100644 index 00000000000..cc021038001 --- /dev/null +++ b/storage/xtradb/include/mtr0mtr.ic @@ -0,0 +1,296 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0mtr.ic +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +# include "sync0sync.h" +# include "sync0rw.h" +#endif /* !UNIV_HOTBACKUP */ +#include "mach0data.h" + +/***************************************************//** +Checks if a mini-transaction is dirtying a clean page. +@return TRUE if the mtr is dirtying a clean page. */ +UNIV_INTERN +ibool +mtr_block_dirtied( +/*==============*/ + const buf_block_t* block) /*!< in: block being x-fixed */ + __attribute__((nonnull,warn_unused_result)); + +/***************************************************************//** +Starts a mini-transaction. 
*/ +UNIV_INLINE +void +mtr_start( +/*======*/ + mtr_t* mtr) /*!< out: mini-transaction */ +{ + UNIV_MEM_INVALID(mtr, sizeof *mtr); + + dyn_array_create(&(mtr->memo)); + dyn_array_create(&(mtr->log)); + + mtr->log_mode = MTR_LOG_ALL; + mtr->inside_ibuf = FALSE; + mtr->modifications = FALSE; + mtr->made_dirty = FALSE; + mtr->n_log_recs = 0; + mtr->n_freed_pages = 0; + + ut_d(mtr->state = MTR_ACTIVE); + ut_d(mtr->magic_n = MTR_MAGIC_N); +} + +/***************************************************//** +Pushes an object to an mtr memo stack. */ +UNIV_INLINE +void +mtr_memo_push( +/*==========*/ + mtr_t* mtr, /*!< in: mtr */ + void* object, /*!< in: object */ + ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */ +{ + dyn_array_t* memo; + mtr_memo_slot_t* slot; + + ut_ad(object); + ut_ad(type >= MTR_MEMO_PAGE_S_FIX); + ut_ad(type <= MTR_MEMO_X_LOCK); + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + /* If this mtr has x-fixed a clean page then we set + the made_dirty flag. This tells us if we need to + grab log_flush_order_mutex at mtr_commit so that we + can insert the dirtied page to the flush list. */ + if (type == MTR_MEMO_PAGE_X_FIX && !mtr->made_dirty) { + mtr->made_dirty = + mtr_block_dirtied((const buf_block_t*) object); + } + + memo = &(mtr->memo); + + slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot); + + slot->object = object; + slot->type = type; +} + +/**********************************************************//** +Sets and returns a savepoint in mtr. +@return savepoint */ +UNIV_INLINE +ulint +mtr_set_savepoint( +/*==============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + dyn_array_t* memo; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + memo = &(mtr->memo); + + return(dyn_array_get_data_size(memo)); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Releases the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +UNIV_INLINE +void +mtr_release_s_latch_at_savepoint( +/*=============================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint savepoint, /*!< in: savepoint */ + prio_rw_lock_t* lock) /*!< in: latch to release */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + memo = &(mtr->memo); + + ut_ad(dyn_array_get_data_size(memo) > savepoint); + + slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, savepoint); + + ut_ad(slot->object == lock); + ut_ad(slot->type == MTR_MEMO_S_LOCK); + + rw_lock_s_unlock(lock); + + slot->object = NULL; +} + +# ifdef UNIV_DEBUG +/**********************************************************//** +Checks if memo contains the given item. 
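mtr_memo_contains(), whose body follows, is what makes the memo useful for debug-build invariants: before touching a frame, assert that this mtr really holds the expected fix on its block. A sketch of that common idiom (the function name is illustrative):

#include "mtr0mtr.h"
#include "buf0buf.h"

/* Sketch: debug-time precondition before modifying a page frame. */
void
assert_holds_example(mtr_t* mtr, buf_block_t* block)
{
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	/* ... now safe to modify the frame and log the change ... */
}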
+@return TRUE if contains */ +UNIV_INLINE +bool +mtr_memo_contains( +/*==============*/ + mtr_t* mtr, /*!< in: mtr */ + const void* object, /*!< in: object to search */ + ulint type) /*!< in: type of object */ +{ + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE || mtr->state == MTR_COMMITTING); + + for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo); + block; + block = dyn_array_get_prev_block(&mtr->memo, block)) { + const mtr_memo_slot_t* start + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block)); + mtr_memo_slot_t* slot + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block) + + dyn_block_get_used(block)); + + ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t))); + + while (slot-- != start) { + if (object == slot->object && type == slot->type) { + return(true); + } + } + } + + return(false); +} +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +Returns the log object of a mini-transaction buffer. +@return log */ +UNIV_INLINE +dyn_array_t* +mtr_get_log( +/*========*/ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + + return(&(mtr->log)); +} + +/***************************************************************//** +Gets the logging mode of a mini-transaction. +@return logging mode: MTR_LOG_NONE, ... */ +UNIV_INLINE +ulint +mtr_get_log_mode( +/*=============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr); + ut_ad(mtr->log_mode >= MTR_LOG_ALL); + ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS); + + return(mtr->log_mode); +} + +/***************************************************************//** +Changes the logging mode of a mini-transaction. +@return old mode */ +UNIV_INLINE +ulint +mtr_set_log_mode( +/*=============*/ + mtr_t* mtr, /*!< in: mtr */ + ulint mode) /*!< in: logging mode: MTR_LOG_NONE, ... */ +{ + ulint old_mode; + + ut_ad(mtr); + ut_ad(mode >= MTR_LOG_ALL); + ut_ad(mode <= MTR_LOG_SHORT_INSERTS); + + old_mode = mtr->log_mode; + + if ((mode == MTR_LOG_SHORT_INSERTS) && (old_mode == MTR_LOG_NONE)) { + /* Do nothing */ + } else { + mtr->log_mode = mode; + } + + ut_ad(old_mode >= MTR_LOG_ALL); + ut_ad(old_mode <= MTR_LOG_SHORT_INSERTS); + + return(old_mode); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Locks a lock in s-mode. */ +UNIV_INLINE +void +mtr_s_lock_func( +/*============*/ + prio_rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr); + ut_ad(lock); + + rw_lock_s_lock_inline(lock, 0, file, line); + + mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK); +} + +/*********************************************************************//** +Locks a lock in x-mode. 
*/ +UNIV_INLINE +void +mtr_x_lock_func( +/*============*/ + prio_rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr); + ut_ad(lock); + + rw_lock_x_lock_inline(lock, 0, file, line); + + mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/mtr0types.h b/storage/xtradb/include/mtr0types.h new file mode 100644 index 00000000000..43368c0b726 --- /dev/null +++ b/storage/xtradb/include/mtr0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0types.h +Mini-transaction buffer global types + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0types_h +#define mtr0types_h + +struct mtr_t; + +#endif diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h new file mode 100644 index 00000000000..0e4b1c60ecd --- /dev/null +++ b/storage/xtradb/include/os0file.h @@ -0,0 +1,1324 @@ +/*********************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
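mtr0types.h above is deliberately almost empty: it forward-declares struct mtr_t so that headers which only pass mtr_t* around can include it instead of all of mtr0mtr.h, cutting include cycles. The same pattern in miniature, with hypothetical names:

/* foo0types.h -- hypothetical: forward declarations only */
#ifndef foo0types_h
#define foo0types_h

struct foo_t;	/* callers that only handle foo_t* need nothing more */

#endif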
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file include/os0file.h +The interface to the operating system file io + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0file_h +#define os0file_h + +#include "univ.i" +#include "trx0types.h" + +#ifndef __WIN__ +#include <dirent.h> +#include <sys/stat.h> +#include <time.h> +#endif + +/** File node of a tablespace or the log data space */ +struct fil_node_t; + +extern ibool os_has_said_disk_full; +/** Flag: enable debug printout for asynchronous i/o */ +extern ibool os_aio_print_debug; + +/** Number of pending os_file_pread() operations */ +extern ulint os_file_n_pending_preads; +/** Number of pending os_file_pwrite() operations */ +extern ulint os_file_n_pending_pwrites; + +/** Number of pending read operations */ +extern ulint os_n_pending_reads; +/** Number of pending write operations */ +extern ulint os_n_pending_writes; + +#ifdef __WIN__ + +/** We define always WIN_ASYNC_IO, and check at run-time whether + the OS actually supports it: Win 95 does not, NT does. */ +#define WIN_ASYNC_IO + +/** Use unbuffered I/O */ +#define UNIV_NON_BUFFERED_IO + +#endif + +/** File offset in bytes */ +typedef ib_uint64_t os_offset_t; +#ifdef __WIN__ +#define SRV_PATH_SEPARATOR '\\' +/** File handle */ +# define os_file_t HANDLE +# define os_file_invalid INVALID_HANDLE_VALUE +/** Convert a C file descriptor to a native file handle +@param fd file descriptor +@return native file handle */ +# define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd) +#else +#define SRV_PATH_SEPARATOR '/' +/** File handle */ +typedef int os_file_t; +# define os_file_invalid (-1) +/** Convert a C file descriptor to a native file handle +@param fd file descriptor +@return native file handle */ +# define OS_FILE_FROM_FD(fd) fd +#endif + +/** Umask for creating files */ +extern ulint os_innodb_umask; + +/** The next value should be smaller or equal to the smallest sector size used +on any disk. A log block is required to be a portion of disk which is written +so that if the start and the end of a block get written to disk, then the +whole block gets written. This should be true even in most cases of a crash: +if this fails for a log block, then it is equivalent to a media failure in the +log. */ + +#define OS_FILE_LOG_BLOCK_SIZE srv_log_block_size + +/** Options for os_file_create_func @{ */ +enum os_file_create_t { + OS_FILE_OPEN = 51, /*!< to open an existing file (if + doesn't exist, error) */ + OS_FILE_CREATE, /*!< to create new file (if + exists, error) */ + OS_FILE_OVERWRITE, /*!< to create a new file, if exists + the overwrite old file */ + OS_FILE_OPEN_RAW, /*!< to open a raw device or disk + partition */ + OS_FILE_CREATE_PATH, /*!< to create the directories */ + OS_FILE_OPEN_RETRY, /*!< open with retry */ + + /** Flags that can be combined with the above values. Please ensure + that the above values stay below 128. 
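The "stay below 128" remark is what makes the flag OR-ing safe: OS_FILE_ON_ERROR_NO_EXIT (128) and OS_FILE_ON_ERROR_SILENT (256), defined just below, occupy bits above every base mode, so both halves can be recovered by masking. A sketch (the mask macro is mine, not from this header):

#include "os0file.h"

/* Sketch: pack a create mode with an error flag, then unpack it. */
#define EXAMPLE_BASE_MASK \
	(~(ulint) (OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT))

void
create_mode_example(void)
{
	ulint	create_mode = OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT;

	ulint	base = create_mode & EXAMPLE_BASE_MASK;	/* OS_FILE_OPEN */
	ibool	quiet = !!(create_mode & OS_FILE_ON_ERROR_SILENT); /* FALSE */

	(void) base;
	(void) quiet;
}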
*/ + + OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */ + OS_FILE_ON_ERROR_SILENT = 256 /*!< don't print diagnostic messages to + the log unless it is a fatal error, + this flag is only used if + ON_ERROR_NO_EXIT is set */ +}; + +#define OS_FILE_READ_ONLY 333 +#define OS_FILE_READ_WRITE 444 +#define OS_FILE_READ_ALLOW_DELETE 555 /* for mysqlbackup */ + +/* Options for file_create */ +#define OS_FILE_AIO 61 +#define OS_FILE_NORMAL 62 +/* @} */ + +/** Types for file create @{ */ +#define OS_DATA_FILE 100 +#define OS_LOG_FILE 101 +/* @} */ + +/** Error codes from os_file_get_last_error @{ */ +#define OS_FILE_NOT_FOUND 71 +#define OS_FILE_DISK_FULL 72 +#define OS_FILE_ALREADY_EXISTS 73 +#define OS_FILE_PATH_ERROR 74 +#define OS_FILE_AIO_RESOURCES_RESERVED 75 /* wait for OS aio resources + to become available again */ +#define OS_FILE_SHARING_VIOLATION 76 +#define OS_FILE_ERROR_NOT_SPECIFIED 77 +#define OS_FILE_INSUFFICIENT_RESOURCE 78 +#define OS_FILE_AIO_INTERRUPTED 79 +#define OS_FILE_OPERATION_ABORTED 80 + +#define OS_FILE_ACCESS_VIOLATION 81 + +#define OS_FILE_ERROR_MAX 100 +/* @} */ + +/** Types for aio operations @{ */ +#define OS_FILE_READ 10 +#define OS_FILE_WRITE 11 + +#define OS_FILE_LOG 256 /* This can be ORed to type */ +/* @} */ + +#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /*!< Win NT does not allow more + than 64 */ + +/** Modes for aio operations @{ */ +#define OS_AIO_NORMAL 21 /*!< Normal asynchronous i/o not for ibuf + pages or ibuf bitmap pages */ +#define OS_AIO_IBUF 22 /*!< Asynchronous i/o for ibuf pages or ibuf + bitmap pages */ +#define OS_AIO_LOG 23 /*!< Asynchronous i/o for the log */ +#define OS_AIO_SYNC 24 /*!< Asynchronous i/o where the calling thread + will itself wait for the i/o to complete, + doing also the job of the i/o-handler thread; + can be used for any pages, ibuf or non-ibuf. + This is used to save CPU time, as we can do + with fewer thread switches. Plain synchronous + i/o is not as good, because it must serialize + the file seek and read or write, causing a + bottleneck for parallelism. */ + +#define OS_AIO_SIMULATED_WAKE_LATER 512 /*!< This can be ORed to mode + in the call of os_aio(...), + if the caller wants to post several i/o + requests in a batch, and only after that + wake the i/o-handler thread; this has + effect only in simulated aio */ +/* @} */ + +#define OS_WIN31 1 /*!< Microsoft Windows 3.x */ +#define OS_WIN95 2 /*!< Microsoft Windows 95 */ +#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */ +#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */ +#define OS_WINXP 5 /*!< Microsoft Windows XP + or Windows Server 2003 */ +#define OS_WINVISTA 6 /*!< Microsoft Windows Vista + or Windows Server 2008 */ +#define OS_WIN7 7 /*!< Microsoft Windows 7 + or Windows Server 2008 R2 */ + + +extern ulint os_n_file_reads; +extern ulint os_n_file_writes; +extern ulint os_n_fsyncs; + +#define OS_MIN_LOG_BLOCK_SIZE 512 + +extern ulint srv_log_block_size; + +#ifdef UNIV_PFS_IO +/* Keys to register InnoDB I/O with performance schema */ +extern mysql_pfs_key_t innodb_file_data_key; +extern mysql_pfs_key_t innodb_file_log_key; +extern mysql_pfs_key_t innodb_file_temp_key; +extern mysql_pfs_key_t innodb_file_bmp_key; + +/* Following four macros are instumentations to register +various file I/O operations with performance schema. +1) register_pfs_file_open_begin() and register_pfs_file_open_end() are +used to register file creation, opening, closing and renaming. 
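OS_AIO_SIMULATED_WAKE_LATER rides on the mode argument in the same way: OR it in while queueing a batch of simulated-aio requests, wake the handler threads once at the end, and recover the base mode by masking the bit back out. A sketch of just the mode arithmetic (the batching itself needs the full os_aio() argument list):

#include "os0file.h"

/* Sketch: compose a batched aio mode and recover the base mode. */
void
aio_mode_example(void)
{
	ulint	mode = OS_AIO_NORMAL | OS_AIO_SIMULATED_WAKE_LATER;

	ulint	base = mode & ~(ulint) OS_AIO_SIMULATED_WAKE_LATER;

	(void) base;	/* base == OS_AIO_NORMAL again */
}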
+2) register_pfs_file_io_begin() and register_pfs_file_io_end() are +used to register actual file read, write and flush +3) register_pfs_file_close_begin() and register_pfs_file_close_end() +are used to register file deletion operations*/ +# define register_pfs_file_open_begin(state, locker, key, op, name, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_name_locker)( \ + state, key, op, name, &locker); \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(start_file_open_wait)( \ + locker, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_open_end(locker, file) \ +do { \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(end_file_open_wait_and_bind_to_descriptor)(\ + locker, file); \ + } \ +} while (0) + +# define register_pfs_file_close_begin(state, locker, key, op, name, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_name_locker)( \ + state, key, op, name, &locker); \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(start_file_close_wait)( \ + locker, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_close_end(locker, result) \ +do { \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(end_file_close_wait)( \ + locker, result); \ + } \ +} while (0) + +# define register_pfs_file_io_begin(state, locker, file, count, op, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_descriptor_locker)( \ + state, file, op); \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(start_file_wait)( \ + locker, count, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_io_end(locker, count) \ +do { \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(end_file_wait)(locker, count); \ + } \ +} while (0) +#endif /* UNIV_PFS_IO */ + +/* Following macros/functions are file I/O APIs that would be performance +schema instrumented if "UNIV_PFS_IO" is defined. They would point to +wrapper functions with performance schema instrumentation in such case. + +os_file_create +os_file_create_simple +os_file_create_simple_no_error_handling +os_file_close +os_file_rename +os_aio +os_file_read +os_file_read_no_error_handling +os_file_write + +The wrapper functions have the prefix of "innodb_". 
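Every pfs_os_file_*_func wrapper declared later in this header is built from these begin/end pairs around the plain call. A hedged sketch of that shape for a synchronous read; PSI_file_locker_state, PSI_file_locker and PSI_FILE_READ come from the MySQL performance-schema interface, and the wrapper name here is hypothetical:

#include "os0file.h"

#ifdef UNIV_PFS_IO
/* Sketch: how an instrumented read wrapper is assembled. */
ibool
example_pfs_read(os_file_t file, void* buf, os_offset_t offset, ulint n,
		 const char* src_file, ulint src_line)
{
	ibool			result;
	PSI_file_locker_state	state;
	PSI_file_locker*	locker = NULL;

	register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ,
				   src_file, src_line);

	result = os_file_read_func(file, buf, offset, n, NULL);

	register_pfs_file_io_end(locker, n);

	return(result);
}
#endif /* UNIV_PFS_IO */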
*/ + +#ifdef UNIV_PFS_IO +# define os_file_create(key, name, create, purpose, type, success) \ + pfs_os_file_create_func(key, name, create, purpose, type, \ + success, __FILE__, __LINE__) + +# define os_file_create_simple(key, name, create, access, success) \ + pfs_os_file_create_simple_func(key, name, create, access, \ + success, __FILE__, __LINE__) + +# define os_file_create_simple_no_error_handling( \ + key, name, create_mode, access, success) \ + pfs_os_file_create_simple_no_error_handling_func( \ + key, name, create_mode, access, success, __FILE__, __LINE__) + +# define os_file_close(file) \ + pfs_os_file_close_func(file, __FILE__, __LINE__) + +# define os_aio(type, mode, name, file, buf, offset, \ + n, message1, message2, space_id, trx) \ + pfs_os_aio_func(type, mode, name, file, buf, offset, \ + n, message1, message2, space_id, trx, \ + __FILE__, __LINE__) + +# define os_file_read(file, buf, offset, n) \ + pfs_os_file_read_func(file, buf, offset, n, NULL, \ + __FILE__, __LINE__) + +# define os_file_read_trx(file, buf, offset, n, trx) \ + pfs_os_file_read_func(file, buf, offset, n, trx, \ + __FILE__, __LINE__) + +# define os_file_read_no_error_handling(file, buf, offset, n) \ + pfs_os_file_read_no_error_handling_func(file, buf, offset, n, \ + __FILE__, __LINE__) + +# define os_file_write(name, file, buf, offset, n) \ + pfs_os_file_write_func(name, file, buf, offset, \ + n, __FILE__, __LINE__) + +# define os_file_flush(file) \ + pfs_os_file_flush_func(file, __FILE__, __LINE__) + +# define os_file_rename(key, oldpath, newpath) \ + pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__) + +# define os_file_delete(key, name) \ + pfs_os_file_delete_func(key, name, __FILE__, __LINE__) + +# define os_file_delete_if_exists(key, name) \ + pfs_os_file_delete_if_exists_func(key, name, __FILE__, __LINE__) +#else /* UNIV_PFS_IO */ + +/* If UNIV_PFS_IO is not defined, these I/O APIs point +to original un-instrumented file I/O APIs */ +# define os_file_create(key, name, create, purpose, type, success) \ + os_file_create_func(name, create, purpose, type, success) + +# define os_file_create_simple(key, name, create_mode, access, success) \ + os_file_create_simple_func(name, create_mode, access, success) + +# define os_file_create_simple_no_error_handling( \ + key, name, create_mode, access, success) \ + os_file_create_simple_no_error_handling_func( \ + name, create_mode, access, success) + +# define os_file_close(file) os_file_close_func(file) + +# define os_aio(type, mode, name, file, buf, offset, n, message1, \ + message2, space_id, trx) \ + os_aio_func(type, mode, name, file, buf, offset, n, \ + message1, message2, space_id, trx) + +# define os_file_read(file, buf, offset, n) \ + os_file_read_func(file, buf, offset, n, NULL) + +# define os_file_read_trx(file, buf, offset, n, trx) \ + os_file_read_func(file, buf, offset, n, trx) + +# define os_file_read_no_error_handling(file, buf, offset, n) \ + os_file_read_no_error_handling_func(file, buf, offset, n) + +# define os_file_write(name, file, buf, offset, n) \ + os_file_write_func(name, file, buf, offset, n) + +# define os_file_flush(file) os_file_flush_func(file) + +# define os_file_rename(key, oldpath, newpath) \ + os_file_rename_func(oldpath, newpath) + +# define os_file_delete(key, name) os_file_delete_func(name) + +# define os_file_delete_if_exists(key, name) \ + os_file_delete_if_exists_func(name) + +#endif /* UNIV_PFS_IO */ + +/* File types for directory entry data type */ + +enum os_file_type_t { + OS_FILE_TYPE_UNKNOWN = 0, + 
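The payoff of this macro layer is that call sites look identical whether or not UNIV_PFS_IO is defined; only the expansion changes, and the key argument simply disappears in the uninstrumented build. A sketch of one such call site (reading one page-sized chunk; the function name is illustrative):

#include "os0file.h"

/* Sketch: PFS-agnostic read of one page. */
ibool
read_page_example(os_file_t file, byte* buf, os_offset_t offset)
{
	return(os_file_read(file, buf, offset, UNIV_PAGE_SIZE));
}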
OS_FILE_TYPE_FILE, /* regular file */ + OS_FILE_TYPE_DIR, /* directory */ + OS_FILE_TYPE_LINK, /* symbolic link */ + OS_FILE_TYPE_BLOCK /* block device */ +}; + +/* Maximum path string length in bytes when referring to tables with in the +'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers +of this size from the thread stack; that is why this should not be made much +bigger than 4000 bytes */ +#define OS_FILE_MAX_PATH 4000 + +/** Struct used in fetching information of a file in a directory */ +struct os_file_stat_t { + char name[OS_FILE_MAX_PATH]; /*!< path to a file */ + os_file_type_t type; /*!< file type */ + ib_int64_t size; /*!< file size */ + time_t ctime; /*!< creation time */ + time_t mtime; /*!< modification time */ + time_t atime; /*!< access time */ + bool rw_perm; /*!< true if can be opened + in read-write mode. Only valid + if type == OS_FILE_TYPE_FILE */ +}; + +#ifdef __WIN__ +typedef HANDLE os_file_dir_t; /*!< directory stream */ +#else +typedef DIR* os_file_dir_t; /*!< directory stream */ +#endif + +#ifdef __WIN__ +/***********************************************************************//** +Gets the operating system version. Currently works only on Windows. +@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, +OS_WIN7. */ +UNIV_INTERN +ulint +os_get_os_version(void); +/*===================*/ +#endif /* __WIN__ */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Creates the seek mutexes used in positioned reads and writes. */ +UNIV_INTERN +void +os_io_init_simple(void); +/*===================*/ +/***********************************************************************//** +Creates a temporary file. This function is like tmpfile(3), but +the temporary file is created in the MySQL temporary directory. +@return temporary file handle, or NULL on error */ + +FILE* +os_file_create_tmpfile(void); +/*========================*/ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************************//** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. +@return directory stream, NULL if error */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + const char* dirname, /*!< in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal);/*!< in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +/***********************************************************************//** +Closes a directory stream. +@return 0 if success, -1 if failure */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + os_file_dir_t dir); /*!< in: directory stream */ +/***********************************************************************//** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
+@return 0 if ok, -1 if error, 1 if at the end of the directory */ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info); /*!< in/out: buffer where the info is returned */ +/*****************************************************************//** +This function attempts to create a directory named pathname. The new directory +gets default permissions. On Unix, the permissions are (0770 & ~umask). If the +directory exists already, nothing is done and the call succeeds, unless the +fail_if_exists arguments is true. +@return TRUE if call succeeds, FALSE on error */ +UNIV_INTERN +ibool +os_file_create_directory( +/*=====================*/ + const char* pathname, /*!< in: directory name as + null-terminated string */ + ibool fail_if_exists);/*!< in: if TRUE, pre-existing directory + is treated as an error. */ +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_func( +/*=======================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success);/*!< out: TRUE if succeed, FALSE if error */ +/****************************************************************//** +NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A simple function to open or create a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_no_error_handling_func( +/*=========================================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Tries to disable OS caching on an opened file descriptor. */ +UNIV_INTERN +void +os_file_set_nocache( +/*================*/ + int fd, /*!< in: file descriptor to alter */ + const char* file_name, /*!< in: file name, used in the + diagnostic message */ + const char* operation_name);/*!< in: "open" or "create"; used in the + diagnostic message */ +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. 
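The opendir/readdir/closedir trio maps onto a scan loop keyed on the three documented return values of os_file_readdir_next_file(): 0 for an entry, 1 for a clean end, -1 for an error. A sketch:

#include "os0file.h"

/* Sketch: visit every entry in a directory. */
ibool
scan_dir_example(const char* dirname)
{
	os_file_dir_t	dir;
	os_file_stat_t	info;
	int		ret;

	dir = os_file_opendir(dirname, FALSE);
	if (dir == NULL) {
		return(FALSE);	/* error_is_fatal=FALSE: NULL on failure */
	}

	while ((ret = os_file_readdir_next_file(dirname, dir, &info)) == 0) {
		/* info.name, info.type, info.size are valid here */
	}

	os_file_closedir(dir);

	return(ret == 1);	/* 1 = end of directory reached cleanly */
}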
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_func( +/*================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Deletes a file. The file has to be closed before calling this. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_func( +/*================*/ + const char* name); /*!< in: file path as a null-terminated + string */ + +/***********************************************************************//** +Deletes a file if it exists. The file has to be closed before calling this. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_if_exists_func( +/*==========================*/ + const char* name); /*!< in: file path as a null-terminated + string */ +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_rename(), not directly +this function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_rename_func( +/*================*/ + const char* oldpath, /*!< in: old file path as a + null-terminated string */ + const char* newpath); /*!< in: new file path */ +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_close(), not directly this +function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_func( +/*===============*/ + os_file_t file); /*!< in, own: handle to a file */ + +#ifdef UNIV_PFS_IO +/****************************************************************//** +NOTE! Please use the corresponding macro os_file_create_simple(), +not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple() which opens or creates a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_simple_func( +/*===========================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ + __attribute__((nonnull, warn_unused_result)); + +/****************************************************************//** +NOTE! Please use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! 
+A performance schema instrumented wrapper function for +os_file_create_simple_no_error_handling(). Add instrumentation to +monitor file creation/open. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_simple_no_error_handling_func( +/*=============================================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode, /*!< in: file create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ + __attribute__((nonnull, warn_unused_result)); + +/****************************************************************//** +NOTE! Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: file create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ + __attribute__((nonnull, warn_unused_result)); + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_close_func( +/*===================*/ + os_file_t file, /*!< in, own: handle to a file */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. 
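+
+A minimal read sketch (editorial illustration, not part of the original
+commit; it assumes the os_file_read() macro forwards (file, buf, offset, n)
+and supplies the remaining instrumentation and trx arguments itself):
+
+	byte	buf[4096];
+
+	if (!os_file_read(file, buf, 0, sizeof(buf))) {
+		ut_error;
+	}
+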
+@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_func( +/*==================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + trx_t* trx, + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +This is the performance schema instrumented wrapper function for +os_file_read_no_error_handling_func() which requests a synchronous +read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_no_error_handling_func( +/*====================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_aio(), not directly this +function! +Performance schema wrapper function of os_aio() which requests +an asynchronous i/o operation. +@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_aio_func( +/*============*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + ulint space_id, + trx_t* trx, + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_write_func( +/*===================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n, /*!< in: number of bytes to write */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ +/***********************************************************************//** +NOTE! 
Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_flush_func(
+/*===================*/
+	os_file_t	file,	/*!< in, own: handle to a file */
+	const char*	src_file,/*!< in: file name where func invoked */
+	ulint		src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_rename_func(
+/*====================*/
+	mysql_pfs_key_t	key,	/*!< in: Performance Schema Key */
+	const char*	oldpath,/*!< in: old file path as a null-terminated
+				string */
+	const char*	newpath,/*!< in: new file path */
+	const char*	src_file,/*!< in: file name where func invoked */
+	ulint		src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+/*====================*/
+	mysql_pfs_key_t	key,	/*!< in: Performance Schema Key */
+	const char*	name,	/*!< in: file path as a null-terminated
+				string */
+	const char*	src_file,/*!< in: file name where func invoked */
+	ulint		src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+/*==============================*/
+	mysql_pfs_key_t	key,	/*!< in: Performance Schema Key */
+	const char*	name,	/*!< in: file path as a null-terminated
+				string */
+	const char*	src_file,/*!< in: file name where func invoked */
+	ulint		src_line);/*!< in: line where the func invoked */
+#endif /* UNIV_PFS_IO */
+
+/***********************************************************************//**
+Closes a file handle.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+	os_file_t	file);	/*!< in, own: handle to a file */
+/***********************************************************************//**
+Gets a file size.
+@return file size, or (os_offset_t) -1 on failure */
+UNIV_INTERN
+os_offset_t
+os_file_get_size(
+/*=============*/
+	os_file_t	file)	/*!< in: handle to a file */
+	__attribute__((warn_unused_result));
+/***********************************************************************//**
+Write the specified number of zeros to a newly created file.
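+
+A sizing sketch (editorial illustration, not part of the original commit;
+the name "ibdata2" and the 10 MB size are arbitrary):
+
+	if (!os_file_set_size("ibdata2", file,
+			      (os_offset_t) 10 * 1024 * 1024)) {
+		ut_error;
+	}
+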
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + os_offset_t size) /*!< in: file size */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Truncates a file at its current position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + FILE* file); /*!< in: file to be truncated */ +/***********************************************************************//** +Truncates a file at the specified position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof_at( + os_file_t file, /*!< in: handle to a file */ + ib_uint64_t new_len);/*!< in: new file length */ +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_flush(), not directly this function! +Flushes the write buffers of a given file to the disk. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_flush_func( +/*===============*/ + os_file_t file); /*!< in, own: handle to a file */ +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@return error number, or OS error number + 100 */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + bool report_all_errors); /*!< in: TRUE if we want an error message + printed of all errors */ +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read(), not directly this function! +Requests a synchronous read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_func( +/*==============*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + trx_t* trx); +/*******************************************************************//** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. */ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /*!< in: file to read from */ + char* str, /*!< in: buffer where to read */ + ulint size); /*!< in: size of buffer */ +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_no_error_handling_func( +/*================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n); /*!< in: number of bytes to read */ + +/*******************************************************************//** +NOTE! 
Use the corresponding macro os_file_write(), not directly this
+function!
+Requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write_func(
+/*===============*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from which to write */
+	os_offset_t	offset,	/*!< in: file offset where to write */
+	ulint		n);	/*!< in: number of bytes to write */
+/*******************************************************************//**
+Check the existence and type of the given file.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+	const char*	path,	/*!< in: pathname of the file */
+	ibool*		exists,	/*!< out: TRUE if file exists */
+	os_file_type_t*	type);	/*!< out: type of the file (if it exists) */
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from the heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+       path	      dirname	     basename
+       "/usr/lib"     "/usr"	     "lib"
+       "/usr/"	      "/"	     "usr"
+       "usr"	      "."	     "usr"
+       "/"	      "/"	     "/"
+       "."	      "."	     "."
+       ".."	      "."	     ".."
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+	const char*	path);	/*!< in: pathname */
+/****************************************************************//**
+This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@return own: new full pathname */
+UNIV_INTERN
+char*
+os_file_make_new_pathname(
+/*======================*/
+	const char*	old_path,	/*!< in: pathname */
+	const char*	new_name);	/*!< in: new file name */
+/****************************************************************//**
+This function returns a remote path name by combining a data directory
+path provided in a DATA DIRECTORY clause with the tablename which is
+in the form 'database/tablename'. It strips the file basename (which
+is the tablename) found after the last directory in the path provided.
+The full filepath created will include the database name as a directory
+under the path provided. The filename is the tablename with the '.ibd'
+extension. All input and output strings are null-terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
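+
+For example (editorial illustration, not part of the original commit), with
+data_dir_path "/alt/dir", tablename "db1/t1" and extension "ibd" the result
+would be "/alt/dir/db1/t1.ibd"; similarly, os_file_make_new_pathname() with
+old_path "/data/db1/t1.ibd" and a new name of "db1/t2" yields
+"/data/db1/t2.ibd".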
+
+@return own: A full pathname; data_dir_path/databasename/tablename.ibd */
+UNIV_INTERN
+char*
+os_file_make_remote_pathname(
+/*=========================*/
+	const char*	data_dir_path,	/*!< in: pathname */
+	const char*	tablename,	/*!< in: tablename */
+	const char*	extention);	/*!< in: file extension; ibd, cfg */
+/****************************************************************//**
+This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command. */
+UNIV_INTERN
+void
+os_file_make_data_dir_path(
+/*========================*/
+	char*	data_dir_path);	/*!< in/out: full path/data_dir_path */
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+@return TRUE if call succeeded, FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+	const char*	path);	/*!< in: path name */
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that */
+UNIV_INTERN
+ibool
+os_aio_init(
+/*========*/
+	ulint	n_per_seg,	/*!< in: maximum number of pending aio
+				operations allowed per segment */
+	ulint	n_read_segs,	/*!< in: number of reader threads */
+	ulint	n_write_segs,	/*!< in: number of writer threads */
+	ulint	n_slots_sync);	/*!< in: number of slots in the sync aio
+				array */
+/***********************************************************************
+Frees the asynchronous io system. */
+UNIV_INTERN
+void
+os_aio_free(void);
+/*=============*/
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_aio(), not directly this function!
+Requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio_func(
+/*========*/
+	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
+	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
+				to OS_AIO_SIMULATED_WAKE_LATER: the
+				last flag advises this function not to wake
+				i/o-handler threads, but the caller will
+				do the waking explicitly later, in this
+				way the caller can post several requests in
+				a batch; NOTE that the batch must not be
+				so big that it exhausts the slots in aio
+				arrays! NOTE that a simulated batch
+				may introduce hidden chances of deadlocks,
+				because i/os are not actually handled until
+				all have been posted: use with great
+				caution! */
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read or from which
+				to write */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
+	ulint		n,	/*!< in: number of bytes to read or write */
+	fil_node_t*	message1,/*!< in: message for the aio handler
+				(can be used to identify a completed
+				aio operation); ignored if mode is
+				OS_AIO_SYNC */
+	void*		message2,/*!< in: message for the aio handler
+				(can be used to identify a completed
+				aio operation); ignored if mode is
+				OS_AIO_SYNC */
+	ulint		space_id,
+	trx_t*		trx);
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void);
+/*=====================================*/
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void);
+/*=====================================*/
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void);
+/*=======================================*/
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void);
+/*============================================*/
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+	ulint	segment,	/*!< in: the number of the segment in the aio
+				arrays to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 the log i/o thread,
+				then follow the non-ibuf read threads, and as
+				the last are the non-ibuf write threads; if
+				this is ULINT_UNDEFINED, then it means that
+				sync aio is used, and this parameter is
+				ignored */
+	ulint	pos,		/*!< this parameter is used only in sync aio:
+				wait for the aio slot at this position */
+	fil_node_t**message1,	/*!< out: the messages passed with the aio
+				request; note that also in the case where
+				the aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation, for example */
+	void**	message2,
+	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
+	ulint*	space_id);
+#endif
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
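+
+A handler-thread sketch (editorial illustration, not part of the original
+commit; the segment number and the loop condition are placeholders; the call
+blocks until an i/o request in the given segment completes):
+
+	fil_node_t*	node;
+	void*		message;
+	ulint		type;
+	ulint		space_id;
+
+	for (;;) {
+		os_aio_simulated_handle(segment, &node, &message,
+					&type, &space_id);
+	}
+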
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+	ulint	segment,	/*!< in: the number of the segment in the aio
+				arrays to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 the log i/o thread,
+				then follow the non-ibuf read threads, and as
+				the last are the non-ibuf write threads */
+	fil_node_t**message1,	/*!< out: the messages passed with the aio
+				request; note that also in the case where
+				the aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation, for example */
+	void**	message2,
+	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
+	ulint*	space_id);
+/**********************************************************************//**
+Validates the consistency of the aio system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void);
+/*=================*/
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+	FILE*	file);	/*!< in: file where to print */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void);
+/*======================*/
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void);
+/*=======================*/
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+This function returns information about the specified file.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+os_file_get_status(
+/*===============*/
+	const char*	path,		/*!< in: pathname of the file */
+	os_file_stat_t*	stat_info,	/*!< information of a file in a
+					directory */
+	bool		check_rw_perm);	/*!< in: for testing whether the
+					file can be opened in RW mode */
+
+#if !defined(UNIV_HOTBACKUP)
+/*********************************************************************//**
+Creates a temporary file that will be deleted on close.
+This function is defined in ha_innodb.cc.
+@return temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+
+
+#if defined(LINUX_NATIVE_AIO)
+/**************************************************************************
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the IO was successful */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+	ulint	global_seg,	/*!< in: segment number in the aio array
+				to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 is log i/o thread,
+				then follow the non-ibuf read threads,
+				and the last are the non-ibuf write
+				threads. */
+	fil_node_t**message1,	/*!< out: the messages passed with the */
+	void**	message2,	/*!< aio request; note that in case the
+				aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation. */
+	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
+	ulint*	space_id);
+#endif /* LINUX_NATIVE_AIO */
+
+#ifndef UNIV_NONINL
+#include "os0file.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0file.ic b/storage/xtradb/include/os0file.ic
new file mode 100644
index 00000000000..25a1397147e
--- /dev/null
+++ b/storage/xtradb/include/os0file.ic
@@ -0,0 +1,452 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0file.ic
+The interface to the operating system file io
+
+Created 2/20/2010 Jimmy Yang
+*******************************************************/
+
+#include "univ.i"
+
+#ifdef UNIV_PFS_IO
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_func(
+/*===========================*/
+	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/*!< in: create mode */
+	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
+				OS_FILE_READ_WRITE */
+	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
+	const char*	src_file,/*!< in: file name where func invoked */
+	ulint		src_line)/*!< in: line where the func invoked */
+{
+	os_file_t	file;
+	struct PSI_file_locker* locker = NULL;
+	PSI_file_locker_state	state;
+
+	/* register a file open or creation depending on "create_mode" */
+	register_pfs_file_open_begin(&state, locker, key,
+				     ((create_mode == OS_FILE_CREATE)
+				      ? PSI_FILE_CREATE
+				      : PSI_FILE_OPEN),
+				     name, src_file, src_line);
+
+	file = os_file_create_simple_func(name, create_mode,
+					  access_type, success);
+
+	/* Register the returning "file" value with the system */
+	register_pfs_file_open_end(locker, file);
+
+	return(file);
+}
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_simple_no_error_handling_func( +/*=============================================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode, /*!< in: file create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + os_file_t file; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin(&state, locker, key, + ((create_mode == OS_FILE_CREATE) + ? PSI_FILE_CREATE + : PSI_FILE_OPEN), + name, src_file, src_line); + + file = os_file_create_simple_no_error_handling_func( + name, create_mode, access_type, success); + + register_pfs_file_open_end(locker, file); + + return(file); +} + +/****************************************************************//** +NOTE! Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: file create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + os_file_t file; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin(&state, locker, key, + ((create_mode == OS_FILE_CREATE) + ? PSI_FILE_CREATE + : PSI_FILE_OPEN), + name, src_file, src_line); + + file = os_file_create_func(name, create_mode, purpose, type, success); + + register_pfs_file_open_end(locker, file); + + return(file); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). 
+@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_close_func( +/*===================*/ + os_file_t file, /*!< in, own: handle to a file */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* register the file close */ + register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_CLOSE, + src_file, src_line); + + result = os_file_close_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_aio(), not directly this +function! +Performance schema instrumented wrapper function of os_aio() which +requests an asynchronous i/o operation. +@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_aio_func( +/*============*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + ulint space_id, + trx_t* trx, + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* Register the read or write I/O depending on "type" */ + register_pfs_file_io_begin(&state, locker, file, n, + (type == OS_FILE_WRITE) + ? PSI_FILE_WRITE + : PSI_FILE_READ, + src_file, src_line); + + result = os_aio_func(type, mode, name, file, buf, offset, + n, message1, message2, space_id, trx); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_func( +/*==================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + trx_t* trx, + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, + src_file, src_line); + + result = os_file_read_func(file, buf, offset, n, trx); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/*******************************************************************//** +NOTE! 
Please use the corresponding macro +os_file_read_no_error_handling(), not directly this function! +This is the performance schema instrumented wrapper function for +os_file_read_no_error_handling() which requests a synchronous +positioned read operation. This function does not do any error +handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_no_error_handling_func( +/*====================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, + src_file, src_line); + + result = os_file_read_no_error_handling_func(file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_write_func( +/*===================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n, /*!< in: number of bytes to write */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_WRITE, + src_file, src_line); + + result = os_file_write_func(name, file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_flush(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_flush() which flushes the write buffers of a given file to the disk. +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_flush_func( +/*===================*/ + os_file_t file, /*!< in, own: handle to a file */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC, + src_file, src_line); + result = os_file_flush_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_rename(), not directly +this function! 
+This is the performance schema instrumented wrapper function for +os_file_rename() +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_rename_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath,/*!< in: new file path */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_open_begin(&state, locker, key, PSI_FILE_RENAME, newpath, + src_file, src_line); + + result = os_file_rename_func(oldpath, newpath); + + register_pfs_file_open_end(locker, 0); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_delete(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_delete() +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_delete_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: file path as a null-terminated + string */ + const char* src_file, /*!< in: file name where func invoked */ + ulint src_line) /*!< in: line where the func invoked */ +{ + bool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE, + name, src_file, src_line); + + result = os_file_delete_func(name); + + register_pfs_file_close_end(locker, 0); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_delete_if_exists(), not +directly this function! +This is the performance schema instrumented wrapper function for +os_file_delete_if_exists() +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_delete_if_exists_func( +/*==============================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: file path as a null-terminated + string */ + const char* src_file, /*!< in: file name where func invoked */ + ulint src_line) /*!< in: line where the func invoked */ +{ + bool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE, + name, src_file, src_line); + + result = os_file_delete_if_exists_func(name); + + register_pfs_file_close_end(locker, 0); + + return(result); +} +#endif /* UNIV_PFS_IO */ diff --git a/storage/xtradb/include/os0once.h b/storage/xtradb/include/os0once.h new file mode 100644 index 00000000000..a8bbaf1d2d4 --- /dev/null +++ b/storage/xtradb/include/os0once.h @@ -0,0 +1,125 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0once.h
+A class that aids executing a given function exactly once in a multi-threaded
+environment.
+
+Created Feb 20, 2014 Vasil Dimov
+*******************************************************/
+
+#ifndef os0once_h
+#define os0once_h
+
+#include "univ.i"
+
+#include "os0sync.h"
+#include "ut0ut.h"
+
+/** Execute a given function exactly once in a multi-threaded environment
+or wait for the function to be executed by another thread.
+
+Example usage:
+First the user must create a control variable of type os_once::state_t and
+assign it os_once::NEVER_DONE.
+Then the user must pass this variable, together with a function to be
+executed, to os_once::do_or_wait_for_done().
+
+Multiple threads can call os_once::do_or_wait_for_done() simultaneously with
+the same (os_once::state_t) control variable. The provided function will be
+called exactly once and when os_once::do_or_wait_for_done() returns then this
+function has completed execution, by this or another thread. In other words
+os_once::do_or_wait_for_done() will either execute the provided function, or
+will wait for its execution to complete if it is already being called by
+another thread, or will do nothing if the function has already completed its
+execution earlier.
+
+This mimics pthread_once(3), but unfortunately pthread_once(3) does not
+support passing arguments to the init_routine() function. We should use
+std::call_once() when we start compiling with C++11 enabled.
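+
+A minimal usage sketch (editorial illustration, not part of the original
+commit; the names my_state and my_init are hypothetical, and the body of
+my_init would perform the one-time setup):
+
+	static os_once::state_t	my_state = os_once::NEVER_DONE;
+
+	static void my_init(void* arg) { }
+
+	os_once::do_or_wait_for_done(&my_state, my_init, NULL);
+
+When do_or_wait_for_done() returns, my_init() has run to completion exactly
+once, in this thread or in another. */
+class os_once {
+public:
+	/** Control variables' state type */
+	typedef ib_uint32_t	state_t;
+
+	/** Not yet executed. */
+	static const state_t	NEVER_DONE = 0;
+
+	/** Currently being executed by this or another thread. */
+	static const state_t	IN_PROGRESS = 1;
+
+	/** Finished execution. */
+	static const state_t	DONE = 2;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	/** Call a given function or wait for its execution to complete if it
+	is already being called by another thread.
+	@param[in,out]	state		control variable
+	@param[in]	do_func		function to call
+	@param[in,out]	do_func_arg	an argument to pass to do_func(). */
+	static
+	void
+	do_or_wait_for_done(
+		volatile state_t*	state,
+		void			(*do_func)(void*),
+		void*			do_func_arg)
+	{
+		/* Avoid calling os_compare_and_swap_uint32() in the most
+		common case. */
+		if (*state == DONE) {
+			return;
+		}
+
+		if (os_compare_and_swap_uint32(state,
+					       NEVER_DONE, IN_PROGRESS)) {
+			/* We are the first. Call the function. */
+
+			do_func(do_func_arg);
+
+			const bool	swapped = os_compare_and_swap_uint32(
+				state, IN_PROGRESS, DONE);
+
+			ut_a(swapped);
+		} else {
+			/* The state is not NEVER_DONE, so either it is
+			IN_PROGRESS (somebody is calling the function right
+			now) or DONE (it has already been called and
+			completed). Wait for it to become DONE.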
*/ + for (;;) { + const state_t s = *state; + + switch (s) { + case DONE: + return; + case IN_PROGRESS: + break; + case NEVER_DONE: + /* fall through */ + default: + ut_error; + } + + UT_RELAX_CPU(); + } + } + } +#endif /* HAVE_ATOMIC_BUILTINS */ +}; + +#endif /* os0once_h */ diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h new file mode 100644 index 00000000000..f9e88ff1a28 --- /dev/null +++ b/storage/xtradb/include/os0proc.h @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0proc.h +The interface to the operating system +process control primitives + +Created 9/30/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0proc_h +#define os0proc_h + +#include "univ.i" + +#ifdef UNIV_LINUX +#include <sys/ipc.h> +#include <sys/shm.h> +#endif + +typedef void* os_process_t; +typedef unsigned long int os_process_id_t; + +extern ibool os_use_large_pages; +/* Large page size. This may be a boot-time option on some platforms */ +extern ulint os_large_page_size; + +/****************************************************************//** +Converts the current process id to a number. It is not guaranteed that the +number is unique. In Linux returns the 'process number' of the current +thread. That number is the same as one sees in 'top', for example. In Linux +the thread id is not the same as one sees in 'top'. +@return process id as a number */ +UNIV_INTERN +ulint +os_proc_get_number(void); +/*====================*/ +/****************************************************************//** +Allocates large pages memory. +@return allocated memory */ +UNIV_INTERN +void* +os_mem_alloc_large( +/*===============*/ + ulint* n, /*!< in/out: number of bytes */ + ibool populate); /*!< in: virtual page preallocation */ +/****************************************************************//** +Frees large pages memory. */ +UNIV_INTERN +void +os_mem_free_large( +/*==============*/ + void *ptr, /*!< in: pointer returned by + os_mem_alloc_large() */ + ulint size); /*!< in: size returned by + os_mem_alloc_large() */ + +#ifndef UNIV_NONINL +#include "os0proc.ic" +#endif + +#endif diff --git a/storage/xtradb/include/os0proc.ic b/storage/xtradb/include/os0proc.ic new file mode 100644 index 00000000000..506f4f8ce0c --- /dev/null +++ b/storage/xtradb/include/os0proc.ic @@ -0,0 +1,27 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0proc.ic +The interface to the operating system +process control primitives + +Created 9/30/1995 Heikki Tuuri +*******************************************************/ + + diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h new file mode 100644 index 00000000000..57b29fff663 --- /dev/null +++ b/storage/xtradb/include/os0sync.h @@ -0,0 +1,743 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0sync.h +The interface to the operating system +synchronization primitives. + +Created 9/6/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0sync_h +#define os0sync_h + +#include "univ.i" +#include "ut0lst.h" +#include "sync0types.h" + +#ifdef __WIN__ +/** Native event (slow)*/ +typedef HANDLE os_native_event_t; +/** Native mutex */ +typedef CRITICAL_SECTION fast_mutex_t; +/** Native condition variable. 
*/ +typedef CONDITION_VARIABLE os_cond_t; +#else +/** Native mutex */ +typedef pthread_mutex_t fast_mutex_t; +/** Native condition variable */ +typedef pthread_cond_t os_cond_t; +#endif + +/** Structure that includes Performance Schema Probe pfs_psi +in the os_fast_mutex structure if UNIV_PFS_MUTEX is defined */ +struct os_fast_mutex_t { + fast_mutex_t mutex; /*!< os_fast_mutex */ +#ifdef UNIV_PFS_MUTEX + struct PSI_mutex* pfs_psi;/*!< The performance schema + instrumentation hook */ +#endif +}; + +/** Operating system event handle */ +typedef struct os_event* os_event_t; + +/** An asynchronous signal sent between threads */ +struct os_event { +#ifdef __WIN__ + HANDLE handle; /*!< kernel event object, slow, + used on older Windows */ +#endif + os_fast_mutex_t os_mutex; /*!< this mutex protects the next + fields */ + ibool is_set; /*!< this is TRUE when the event is + in the signaled state, i.e., a thread + does not stop if it tries to wait for + this event */ + ib_int64_t signal_count; /*!< this is incremented each time + the event becomes signaled */ + os_cond_t cond_var; /*!< condition variable is used in + waiting for the event */ + UT_LIST_NODE_T(os_event_t) os_event_list; + /*!< list of all created events */ +}; + +/** Denotes an infinite delay for os_event_wait_time() */ +#define OS_SYNC_INFINITE_TIME ULINT_UNDEFINED + +/** Return value of os_event_wait_time() when the time is exceeded */ +#define OS_SYNC_TIME_EXCEEDED 1 + +/** Operating system mutex handle */ +typedef struct os_mutex_t* os_ib_mutex_t; + +/** Mutex protecting counts and the event and OS 'slow' mutex lists */ +extern os_ib_mutex_t os_sync_mutex; + +/** This is incremented by 1 in os_thread_create and decremented by 1 in +os_thread_exit */ +extern ulint os_thread_count; + +extern ulint os_event_count; +extern ulint os_mutex_count; +extern ulint os_fast_mutex_count; + +/*********************************************************//** +Initializes global event and OS 'slow' mutex lists. */ +UNIV_INTERN +void +os_sync_init(void); +/*==============*/ +/*********************************************************//** +Frees created events and OS 'slow' mutexes. */ +UNIV_INTERN +void +os_sync_free(void); +/*==============*/ +/*********************************************************//** +Creates an event semaphore, i.e., a semaphore which may just have two states: +signaled and nonsignaled. The created event is manual reset: it must be reset +explicitly by calling os_event_reset(). +@return the event handle */ +UNIV_INTERN +os_event_t +os_event_create(void); +/*==================*/ +/**********************************************************//** +Sets an event semaphore to the signaled state: lets waiting threads +proceed. */ +UNIV_INTERN +void +os_event_set( +/*=========*/ + os_event_t event); /*!< in: event to set */ +/**********************************************************//** +Resets an event semaphore to the nonsignaled state: threads that +subsequently wait for the event will block. +The return value should be passed to os_event_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. See comments for os_event_wait_low(). */ +UNIV_INTERN +ib_int64_t +os_event_reset( +/*===========*/ + os_event_t event); /*!< in: event to reset */ +/**********************************************************//** +Frees an event object. */
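A usage sketch of the event API declared above (hypothetical waiter and signaller, assuming os_sync_init() has already been called). The value returned by os_event_reset() is the signal_count snapshot that makes the wait race-free:

	os_event_t	ev = os_event_create();

	/* waiter side: snapshot the signal count at reset time and
	pass it to os_event_wait_low(), so that an os_event_set()
	slipping in between the reset and the wait is not lost */
	ib_int64_t	sig_count = os_event_reset(ev);

	/* ... re-check the condition being waited for ... */

	os_event_wait_low(ev, sig_count);

	/* signaller side: releases all waiters; the event stays in the
	signaled state (manual reset) until os_event_reset() is called */
	os_event_set(ev);

	os_event_free(ev);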
+UNIV_INTERN +void +os_event_free( +/*==========*/ + os_event_t event); /*!< in: event to free */ + +/**********************************************************//** +Waits for an event object until it is in the signaled state. + +Typically, if the event has been signaled after the os_event_reset() +we'll return immediately because event->is_set == TRUE. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->is_set == TRUE] +thread C calls os_event_reset() [event->is_set == FALSE] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. */ +UNIV_INTERN +void +os_event_wait_low( +/*==============*/ + os_event_t event, /*!< in: event to wait */ + ib_int64_t reset_sig_count);/*!< in: zero or the value + returned by previous call of + os_event_reset(). */ + +#define os_event_wait(event) os_event_wait_low(event, 0) +#define os_event_wait_time(event, t) os_event_wait_time_low(event, t, 0) + +/**********************************************************//** +Waits for an event object until it is in the signaled state or +a timeout is exceeded. +@return 0 on success, OS_SYNC_TIME_EXCEEDED if the timeout was exceeded */ +UNIV_INTERN +ulint +os_event_wait_time_low( +/*===================*/ + os_event_t event, /*!< in: event to wait */ + ulint time_in_usec, /*!< in: timeout in + microseconds, or + OS_SYNC_INFINITE_TIME */ + ib_int64_t reset_sig_count); /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ +/*********************************************************//** +Creates an operating system mutex semaphore. Because these are slow, the +mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible. +@return the mutex handle */ +UNIV_INTERN +os_ib_mutex_t +os_mutex_create(void); +/*=================*/ +/**********************************************************//** +Acquires ownership of a mutex semaphore. */ +UNIV_INTERN +void +os_mutex_enter( +/*===========*/ + os_ib_mutex_t mutex); /*!< in: mutex to acquire */ +/**********************************************************//** +Releases ownership of a mutex. */ +UNIV_INTERN +void +os_mutex_exit( +/*==========*/ + os_ib_mutex_t mutex); /*!< in: mutex to release */ +/**********************************************************//** +Frees a mutex object. */ +UNIV_INTERN +void +os_mutex_free( +/*==========*/ + os_ib_mutex_t mutex); /*!< in: mutex to free */ +/**********************************************************//** +Attempts to acquire ownership of a fast mutex without waiting. +@return 0 on success, != 0 if the mutex was reserved by another thread */ +UNIV_INLINE +ulint +os_fast_mutex_trylock( +/*==================*/ + os_fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */ + +/********************************************************************** +The following os_fast_mutex APIs are performance schema instrumented: + +os_fast_mutex_init +os_fast_mutex_lock +os_fast_mutex_unlock +os_fast_mutex_free + +These APIs point to the corresponding wrapper functions that contain +the performance schema instrumentation. + +NOTE! The following macros should be used for mutex operations, not the +corresponding functions.
*/ + +#ifdef UNIV_PFS_MUTEX +# define os_fast_mutex_init(K, M) \ + pfs_os_fast_mutex_init(K, M) + +# define os_fast_mutex_lock(M) \ + pfs_os_fast_mutex_lock(M, __FILE__, __LINE__) + +# define os_fast_mutex_unlock(M) pfs_os_fast_mutex_unlock(M) + +# define os_fast_mutex_free(M) pfs_os_fast_mutex_free(M) + +/*********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly +this function! +A wrapper function for os_fast_mutex_init_func(). Initializes an operating +system fast mutex semaphore. */ +UNIV_INLINE +void +pfs_os_fast_mutex_init( +/*===================*/ + PSI_mutex_key key, /*!< in: Performance Schema + key */ + os_fast_mutex_t* fast_mutex); /*!< out: fast mutex */ +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly +this function! +A wrapper function for os_fast_mutex_free_func(). Also destroys the +performance schema probes when freeing the mutex */ +UNIV_INLINE +void +pfs_os_fast_mutex_free( +/*===================*/ + os_fast_mutex_t* fast_mutex); /*!< in/out: mutex to free */ +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_lock(), not directly +this function! +A wrapper function for os_fast_mutex_lock_func(). Acquires ownership of a +fast mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_lock( +/*===================*/ + os_fast_mutex_t* fast_mutex, /*!< in/out: mutex to acquire */ + const char* file_name, /*!< in: file name where + locked */ + ulint line); /*!< in: line where locked */ +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_unlock(), not directly +this function! +A wrapper function for os_fast_mutex_unlock_func(). Releases ownership of a +fast mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_unlock( +/*=====================*/ + os_fast_mutex_t* fast_mutex); /*!< in/out: mutex to release */ + +#else /* UNIV_PFS_MUTEX */ + +# define os_fast_mutex_init(K, M) \ + os_fast_mutex_init_func(&((os_fast_mutex_t*)(M))->mutex) + +# define os_fast_mutex_lock(M) \ + os_fast_mutex_lock_func(&((os_fast_mutex_t*)(M))->mutex) + +# define os_fast_mutex_unlock(M) \ + os_fast_mutex_unlock_func(&((os_fast_mutex_t*)(M))->mutex) + +# define os_fast_mutex_free(M) \ + os_fast_mutex_free_func(&((os_fast_mutex_t*)(M))->mutex) +#endif /* UNIV_PFS_MUTEX */ + +/**********************************************************//** +Releases ownership of a fast mutex. */ +UNIV_INTERN +void +os_fast_mutex_unlock_func( +/*======================*/ + fast_mutex_t* fast_mutex); /*!< in: mutex to release */ +/*********************************************************//** +Initializes an operating system fast mutex semaphore. */ +UNIV_INTERN +void +os_fast_mutex_init_func( +/*====================*/ + fast_mutex_t* fast_mutex); /*!< in: fast mutex */ +/**********************************************************//** +Acquires ownership of a fast mutex. */ +UNIV_INTERN +void +os_fast_mutex_lock_func( +/*====================*/ + fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */ +/**********************************************************//** +Frees a mutex object. */ +UNIV_INTERN +void +os_fast_mutex_free_func( +/*====================*/ + fast_mutex_t* fast_mutex); /*!< in: mutex to free */ + +/**********************************************************//** +Atomic compare-and-swap and increment for InnoDB.
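Returning to the os_fast_mutex macros defined above: the four macro names are the public interface in both builds. A usage sketch (my_mutex_key is a hypothetical PSI_mutex_key; it is ignored entirely in non-PFS builds):

	os_fast_mutex_t	mutex;

	os_fast_mutex_init(my_mutex_key, &mutex);
	os_fast_mutex_lock(&mutex);
	/* ... critical section ... */
	os_fast_mutex_unlock(&mutex);
	os_fast_mutex_free(&mutex);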
*/ + +#if defined(HAVE_IB_GCC_ATOMIC_BUILTINS) + +# define HAVE_ATOMIC_BUILTINS + +# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_BYTE +# define HAVE_ATOMIC_BUILTINS_BYTE +# endif + +# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_64 +# define HAVE_ATOMIC_BUILTINS_64 +# endif + +/**********************************************************//** +Returns true if swapped, ptr is pointer to target, old_val is value to +compare to, new_val is the value to swap in. */ + +# define os_compare_and_swap(ptr, old_val, new_val) \ + __sync_bool_compare_and_swap(ptr, old_val, new_val) + +# define os_compare_and_swap_ulint(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) + +# define os_compare_and_swap_lint(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) + +# define os_compare_and_swap_uint32(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) + +# ifdef HAVE_IB_ATOMIC_PTHREAD_T_GCC +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) +# define INNODB_RW_LOCKS_USE_ATOMICS +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use GCC atomic builtins" +# else /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */ +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes use GCC atomic builtins, rw_locks do not" +# endif /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */ + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount of increment. */ + +# define os_atomic_increment(ptr, amount) \ + __sync_add_and_fetch(ptr, amount) + +# define os_atomic_increment_lint(ptr, amount) \ + os_atomic_increment(ptr, amount) + +# define os_atomic_increment_uint32(ptr, amount ) \ + os_atomic_increment(ptr, amount) + +# define os_atomic_increment_ulint(ptr, amount) \ + os_atomic_increment(ptr, amount) + +# define os_atomic_increment_uint64(ptr, amount) \ + os_atomic_increment(ptr, amount) + +/* Returns the resulting value, ptr is pointer to target, amount is the +amount to decrement. */ + +# define os_atomic_decrement(ptr, amount) \ + __sync_sub_and_fetch(ptr, amount) + +# define os_atomic_decrement_uint32(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +# define os_atomic_decrement_lint(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +# define os_atomic_decrement_ulint(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +# define os_atomic_decrement_uint64(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +/**********************************************************//** +Returns the old value of *ptr, atomically sets *ptr to new_val */ + +# define os_atomic_test_and_set_byte(ptr, new_val) \ + __sync_lock_test_and_set(ptr, (byte) new_val) + +# define os_atomic_test_and_set_ulint(ptr, new_val) \ + __sync_lock_test_and_set(ptr, new_val) + +#elif defined(HAVE_IB_SOLARIS_ATOMICS) + +# define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS_BYTE +# define HAVE_ATOMIC_BUILTINS_64 + +/* If not compiling with GCC or GCC doesn't support the atomic +intrinsics and running on Solaris >= 10 use Solaris atomics */ + +# include <atomic.h> + +/**********************************************************//** +Returns true if swapped, ptr is pointer to target, old_val is value to +compare to, new_val is the value to swap in. 
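The GCC branch above maps the os_* names directly onto the __sync builtins. The same primitives in a self-contained sketch (plain C++, compiles with g++ on its own; not InnoDB code):

	#include <stdio.h>

	static long	counter = 0;

	int main()
	{
		/* boolean CAS, as in os_compare_and_swap():
		true when the swap happened */
		long	seen = counter;
		if (__sync_bool_compare_and_swap(&counter, seen, seen + 1)) {
			printf("swapped: counter = %ld\n", counter);
		}

		/* add-and-fetch, as in os_atomic_increment():
		returns the resulting value, not the old one */
		printf("resulting value = %ld\n",
		       __sync_add_and_fetch(&counter, 10));

		return(0);
	}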
*/ + +# define os_compare_and_swap_uint32(ptr, old_val, new_val) \ + (atomic_cas_32(ptr, old_val, new_val) == old_val) + +# define os_compare_and_swap_ulint(ptr, old_val, new_val) \ + (atomic_cas_ulong(ptr, old_val, new_val) == old_val) + +# define os_compare_and_swap_lint(ptr, old_val, new_val) \ + ((lint) atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val) + +# ifdef HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS +# if SIZEOF_PTHREAD_T == 4 +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + ((pthread_t) atomic_cas_32(ptr, old_val, new_val) == old_val) +# elif SIZEOF_PTHREAD_T == 8 +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + ((pthread_t) atomic_cas_64(ptr, old_val, new_val) == old_val) +# else +# error "SIZEOF_PTHREAD_T != 4 or 8" +# endif /* SIZEOF_PTHREAD_T CHECK */ +# define INNODB_RW_LOCKS_USE_ATOMICS +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use Solaris atomic functions" +# else /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */ +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes use Solaris atomic functions, rw_locks do not" +# endif /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */ + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount of increment. */ + +# define os_atomic_increment_uint32(ptr, amount) \ + atomic_add_32_nv(ptr, amount) + +# define os_atomic_increment_ulint(ptr, amount) \ + atomic_add_long_nv(ptr, amount) + +# define os_atomic_increment_lint(ptr, amount) \ + os_atomic_increment_ulint((ulong_t*) ptr, amount) + +# define os_atomic_increment_uint64(ptr, amount) \ + atomic_add_64_nv(ptr, amount) + +/* Returns the resulting value, ptr is pointer to target, amount is the +amount to decrement. */ + +# define os_atomic_decrement_uint32(ptr, amount) \ + os_atomic_increment_uint32(ptr, -(amount)) + +# define os_atomic_decrement_lint(ptr, amount) \ + os_atomic_increment_ulint((ulong_t*) ptr, -(amount)) + +# define os_atomic_decrement_ulint(ptr, amount) \ + os_atomic_increment_ulint(ptr, -(amount)) + +# define os_atomic_decrement_uint64(ptr, amount) \ + os_atomic_increment_uint64(ptr, -(amount)) + +/**********************************************************//** +Returns the old value of *ptr, atomically sets *ptr to new_val */ + +# define os_atomic_test_and_set_byte(ptr, new_val) \ + atomic_swap_uchar(ptr, new_val) + +# define os_atomic_test_and_set_ulint(ptr, new_val) \ + atomic_swap_ulong(ptr, new_val) + +#elif defined(HAVE_WINDOWS_ATOMICS) + +# define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS_BYTE + +# ifndef _WIN32 +# define HAVE_ATOMIC_BUILTINS_64 +# endif + +/**********************************************************//** +Atomic compare and exchange of signed integers (both 32 and 64 bit). +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +lint +win_cmp_and_xchg_lint( +/*==================*/ + volatile lint* ptr, /*!< in/out: source/destination */ + lint new_val, /*!< in: exchange value */ + lint old_val); /*!< in: value to compare to */ + +/**********************************************************//** +Atomic addition of signed integers. +@return Initial value of the variable pointed to by ptr */ +UNIV_INLINE +lint +win_xchg_and_add( +/*=============*/ + volatile lint* ptr, /*!< in/out: address of destination */ + lint val); /*!< in: number to be added */ + +/**********************************************************//** +Atomic compare and exchange of unsigned integers. 
+@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +ulint +win_cmp_and_xchg_ulint( +/*===================*/ + volatile ulint* ptr, /*!< in/out: source/destination */ + ulint new_val, /*!< in: exchange value */ + ulint old_val); /*!< in: value to compare to */ + +/**********************************************************//** +Atomic compare and exchange of 32-bit unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +DWORD +win_cmp_and_xchg_dword( +/*===================*/ + volatile DWORD* ptr, /*!< in/out: source/destination */ + DWORD new_val, /*!< in: exchange value */ + DWORD old_val); /*!< in: value to compare to */ + +/**********************************************************//** +Returns true if swapped, ptr is pointer to target, old_val is value to +compare to, new_val is the value to swap in. */ + +# define os_compare_and_swap_uint32(ptr, old_val, new_val) \ + (InterlockedCompareExchange(reinterpret_cast<volatile long*>(ptr), \ + new_val, old_val) == old_val) + +# define os_compare_and_swap_ulint(ptr, old_val, new_val) \ + (win_cmp_and_xchg_ulint(ptr, new_val, old_val) == old_val) + +# define os_compare_and_swap_lint(ptr, old_val, new_val) \ + (win_cmp_and_xchg_lint(ptr, new_val, old_val) == old_val) + +/* windows thread objects can always be passed to windows atomic functions */ +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + (win_cmp_and_xchg_dword(ptr, new_val, old_val) == old_val) + +# define INNODB_RW_LOCKS_USE_ATOMICS +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use Windows interlocked functions" + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount of increment. */ + +# define os_atomic_increment_lint(ptr, amount) \ + (win_xchg_and_add(ptr, amount) + amount) + +# define os_atomic_increment_uint32(ptr, amount) \ + ((ulint) (InterlockedExchangeAdd((long*) ptr, amount) + amount)) + +# define os_atomic_increment_ulint(ptr, amount) \ + ((ulint) (win_xchg_and_add((lint*) ptr, (lint) amount) + amount)) + +# define os_atomic_increment_uint64(ptr, amount) \ + ((ib_uint64_t) (InterlockedExchangeAdd64( \ + (ib_int64_t*) ptr, \ + (ib_int64_t) amount) + amount)) + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount to decrement. There is no atomic subtract function on Windows, +so addition of the negated amount is used instead. */ + +# define os_atomic_decrement_uint32(ptr, amount) \ + ((ulint) (InterlockedExchangeAdd((long*) ptr, -((long) amount)) - amount)) + +# define os_atomic_decrement_lint(ptr, amount) \ + (win_xchg_and_add(ptr, -(lint) amount) - amount) + +# define os_atomic_decrement_ulint(ptr, amount) \ + ((ulint) (win_xchg_and_add((lint*) ptr, -(lint) amount) - amount)) + +# define os_atomic_decrement_uint64(ptr, amount) \ + ((ib_uint64_t) (InterlockedExchangeAdd64( \ + (ib_int64_t*) ptr, \ + -(ib_int64_t) amount) - amount)) + +/**********************************************************//** +Returns the old value of *ptr, atomically sets *ptr to new_val.
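These macros turn InterlockedCompareExchange(), which reports the value found before the exchange, into a boolean compare-and-swap by comparing that report against old_val. The same trick in isolation (hypothetical flag variable, Windows only):

	static volatile LONG	flag = 0;

	/* claim the flag only if it still holds 0; equality of the
	returned prior value with 0 is the "swapped" result */
	bool	claimed = (InterlockedCompareExchange(&flag, 1, 0) == 0);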
+InterlockedExchange() operates on LONG, and the LONG will be +clobbered */ + +# define os_atomic_test_and_set_byte(ptr, new_val) \ + ((byte) InterlockedExchange(ptr, new_val)) + +# define os_atomic_test_and_set_ulong(ptr, new_val) \ + InterlockedExchange(ptr, new_val) + +#else +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use InnoDB's own implementation" +#endif +#ifdef HAVE_ATOMIC_BUILTINS +#define os_atomic_inc_ulint(m,v,d) os_atomic_increment_ulint(v, d) +#define os_atomic_dec_ulint(m,v,d) os_atomic_decrement_ulint(v, d) +#else +#define os_atomic_inc_ulint(m,v,d) os_atomic_inc_ulint_func(m, v, d) +#define os_atomic_dec_ulint(m,v,d) os_atomic_dec_ulint_func(m, v, d) +#endif /* HAVE_ATOMIC_BUILTINS */ + +/**********************************************************//** +Following macros are used to update specified counter atomically +if HAVE_ATOMIC_BUILTINS defined. Otherwise, use mutex passed in +for synchronization */ +#ifdef HAVE_ATOMIC_BUILTINS +#define os_increment_counter_by_amount(mutex, counter, amount) \ + (void) os_atomic_increment_ulint(&counter, amount) + +#define os_decrement_counter_by_amount(mutex, counter, amount) \ + (void) os_atomic_increment_ulint(&counter, (-((lint) amount))) +#else +#define os_increment_counter_by_amount(mutex, counter, amount) \ + do { \ + mutex_enter(&(mutex)); \ + (counter) += (amount); \ + mutex_exit(&(mutex)); \ + } while (0) + +#define os_decrement_counter_by_amount(mutex, counter, amount) \ + do { \ + ut_a(counter >= amount); \ + mutex_enter(&(mutex)); \ + (counter) -= (amount); \ + mutex_exit(&(mutex)); \ + } while (0) +#endif /* HAVE_ATOMIC_BUILTINS */ + +#define os_inc_counter(mutex, counter) \ + os_increment_counter_by_amount(mutex, counter, 1) + +#define os_dec_counter(mutex, counter) \ + do { \ + os_decrement_counter_by_amount(mutex, counter, 1);\ + } while (0); + +/** barrier definitions for memory ordering */ +#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64 || defined __WIN__ +/* Performance regression was observed at some conditions for Intel +architecture. Disable memory barrier for Intel architecture for now. 
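The counter macros above take the guarding mutex in every build, but touch it only in the fallback branch; with HAVE_ATOMIC_BUILTINS it is ignored entirely. Usage sketch (hypothetical ulint counter and its mutex):

	/* compiles to an atomic add with HAVE_ATOMIC_BUILTINS; otherwise
	n_lookups_mutex is taken around the plain update */
	os_increment_counter_by_amount(n_lookups_mutex, n_lookups, 5);
	os_inc_counter(n_lookups_mutex, n_lookups);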
*/ +# define os_rmb +# define os_wmb +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "Memory barrier is not used" +#elif defined(HAVE_IB_GCC_ATOMIC_THREAD_FENCE) +# define HAVE_MEMORY_BARRIER +# define os_rmb __atomic_thread_fence(__ATOMIC_ACQUIRE) +# define os_wmb __atomic_thread_fence(__ATOMIC_RELEASE) +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "GCC builtin __atomic_thread_fence() is used for memory barrier" + +#elif defined(HAVE_IB_GCC_SYNC_SYNCHRONISE) +# define HAVE_MEMORY_BARRIER +# define os_rmb __sync_synchronize() +# define os_wmb __sync_synchronize() +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "GCC builtin __sync_synchronize() is used for memory barrier" + +#elif defined(HAVE_IB_MACHINE_BARRIER_SOLARIS) +# define HAVE_MEMORY_BARRIER +# include <mbarrier.h> +# define os_rmb __machine_r_barrier() +# define os_wmb __machine_w_barrier() +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "Solaris memory ordering functions are used for memory barrier" + +#elif defined(HAVE_WINDOWS_MM_FENCE) && defined(_WIN64) +# define HAVE_MEMORY_BARRIER +# include <mmintrin.h> +# define os_rmb _mm_lfence() +# define os_wmb _mm_sfence() +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "_mm_lfence() and _mm_sfence() are used for memory barrier" + +#else +# define os_rmb +# define os_wmb +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "Memory barrier is not used" +#endif + +#ifndef UNIV_NONINL +#include "os0sync.ic" +#endif + +#endif diff --git a/storage/xtradb/include/os0sync.ic b/storage/xtradb/include/os0sync.ic new file mode 100644 index 00000000000..9a7e520ece6 --- /dev/null +++ b/storage/xtradb/include/os0sync.ic @@ -0,0 +1,234 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0sync.ic +The interface to the operating system synchronization primitives. + +Created 9/6/1995 Heikki Tuuri +*******************************************************/ + +#ifdef __WIN__ +#include <winbase.h> +#endif + +/**********************************************************//** +Acquires ownership of a fast mutex. +@return 0 if success, != 0 if was reserved by another thread */ +UNIV_INLINE +ulint +os_fast_mutex_trylock( +/*==================*/ + os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */ +{ + fast_mutex_t* mutex = &fast_mutex->mutex; + +#ifdef __WIN__ + return(!TryEnterCriticalSection(mutex)); +#else + /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock + so that it returns 0 on success. In the operating system + libraries, HP-UX-10.20 follows the old Posix 1003.4a Draft 4 and + returns 1 on success (but MySQL remaps that to 0), while Linux, + FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. 
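The barrier macros defined above are meant to be paired: the writer issues os_wmb between publishing the payload and raising the flag, and the reader issues os_rmb between seeing the flag and reading the payload. A sketch (hypothetical shared flag and payload; note that on IA-32/x86-64 and Windows both macros deliberately expand to nothing here):

	/* writer */
	shared_data = 42;	/* hypothetical payload */
	os_wmb;
	shared_ready = TRUE;	/* hypothetical flag */

	/* reader */
	if (shared_ready) {
		os_rmb;
		/* ... safe to read shared_data ... */
	}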
*/ + + return((ulint) pthread_mutex_trylock(mutex)); +#endif +} + +#ifdef UNIV_PFS_MUTEX +/*********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly +this function! +A wrapper function for os_fast_mutex_init_func(). Initializes an operating +system fast mutex semaphore. */ +UNIV_INLINE +void +pfs_os_fast_mutex_init( +/*===================*/ + PSI_mutex_key key, /*!< in: Performance Schema + key */ + os_fast_mutex_t* fast_mutex) /*!< out: fast mutex */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + fast_mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, &fast_mutex->mutex); +#else + fast_mutex->pfs_psi = NULL; +#endif + + os_fast_mutex_init_func(&fast_mutex->mutex); +} +/******************************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly +this function! +Wrapper function for pfs_os_fast_mutex_free(). Also destroys the performance +schema probes when freeing the mutex */ +UNIV_INLINE +void +pfs_os_fast_mutex_free( +/*===================*/ + os_fast_mutex_t* fast_mutex) /*!< in/out: mutex */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + if (fast_mutex->pfs_psi != NULL) + PSI_MUTEX_CALL(destroy_mutex)(fast_mutex->pfs_psi); +#endif + fast_mutex->pfs_psi = NULL; + + os_fast_mutex_free_func(&fast_mutex->mutex); +} +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly +this function! +Wrapper function of os_fast_mutex_lock_func. Acquires ownership of a fast +mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_lock( +/*===================*/ + os_fast_mutex_t* fast_mutex, /*!< in/out: mutex to acquire */ + const char* file_name, /*!< in: file name where + locked */ + ulint line) /*!< in: line where locked */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + if (fast_mutex->pfs_psi != NULL) + { + PSI_mutex_locker* locker; + PSI_mutex_locker_state state; + + locker = PSI_MUTEX_CALL(start_mutex_wait)( + &state, fast_mutex->pfs_psi, + PSI_MUTEX_LOCK, file_name, + static_cast<uint>(line)); + + os_fast_mutex_lock_func(&fast_mutex->mutex); + + if (locker != NULL) + PSI_MUTEX_CALL(end_mutex_wait)(locker, 0); + } + else +#endif + { + os_fast_mutex_lock_func(&fast_mutex->mutex); + } + + return; +} +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly +this function! +Wrapper function of os_fast_mutex_unlock_func. Releases ownership of a +fast mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_unlock( +/*=====================*/ + os_fast_mutex_t* fast_mutex) /*!< in/out: mutex to release */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + if (fast_mutex->pfs_psi != NULL) + PSI_MUTEX_CALL(unlock_mutex)(fast_mutex->pfs_psi); +#endif + + os_fast_mutex_unlock_func(&fast_mutex->mutex); +} +#endif /* UNIV_PFS_MUTEX */ + +#ifdef HAVE_WINDOWS_ATOMICS + +/* Use inline functions to make 64 and 32 bit versions of windows atomic +functions so that typecasts are evaluated at compile time. Take advantage +that lint is either __int64 or long int and windows atomic functions work +on __int64 and LONG */ + +/**********************************************************//** +Atomic compare and exchange of unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. 
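Thanks to that remapping, callers of os_fast_mutex_trylock() can rely on the pthread convention on every platform: zero means the lock was taken. A usage sketch (hypothetical mutex):

	if (os_fast_mutex_trylock(&mutex) == 0) {
		/* ... short critical section ... */
		os_fast_mutex_unlock(&mutex);
	} else {
		/* reserved by another thread; back off */
	}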
*/ +UNIV_INLINE +lint +win_cmp_and_xchg_lint( +/*==================*/ + volatile lint* ptr, /*!< in/out: source/destination */ + lint new_val, /*!< in: exchange value */ + lint old_val) /*!< in: value to compare to */ +{ +# ifdef _WIN64 + return(InterlockedCompareExchange64(ptr, new_val, old_val)); +# else + return(InterlockedCompareExchange(ptr, new_val, old_val)); +# endif +} + +/**********************************************************//** +Atomic addition of signed integers. +@return Initial value of the variable pointed to by ptr */ +UNIV_INLINE +lint +win_xchg_and_add( +/*=============*/ + volatile lint* ptr, /*!< in/out: address of destination */ + lint val) /*!< in: number to be added */ +{ +#ifdef _WIN64 + return(InterlockedExchangeAdd64(ptr, val)); +#else + return(InterlockedExchangeAdd(ptr, val)); +#endif +} + +/**********************************************************//** +Atomic compare and exchange of unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +ulint +win_cmp_and_xchg_ulint( +/*===================*/ + volatile ulint* ptr, /*!< in/out: source/destination */ + ulint new_val, /*!< in: exchange value */ + ulint old_val) /*!< in: value to compare to */ +{ + return((ulint) win_cmp_and_xchg_lint( + (volatile lint*) ptr, + (lint) new_val, + (lint) old_val)); +} + +/**********************************************************//** +Atomic compare and exchange of 32-bit unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +DWORD +win_cmp_and_xchg_dword( +/*===================*/ + volatile DWORD* ptr, /*!< in/out: source/destination */ + DWORD new_val, /*!< in: exchange value */ + DWORD old_val) /*!< in: value to compare to */ +{ + ut_ad(sizeof(DWORD) == sizeof(LONG)); /* We assume this. */ + return(InterlockedCompareExchange( + (volatile LONG*) ptr, + (LONG) new_val, + (LONG) old_val)); +} + +#endif /* HAVE_WINDOWS_ATOMICS */ + diff --git a/storage/xtradb/include/os0thread.h b/storage/xtradb/include/os0thread.h new file mode 100644 index 00000000000..d84eff99519 --- /dev/null +++ b/storage/xtradb/include/os0thread.h @@ -0,0 +1,189 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0thread.h +The interface to the operating system +process and thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0thread_h +#define os0thread_h + +#include "univ.i" + +#ifdef UNIV_LINUX +#include <sys/types.h> +#endif + +/* Maximum number of threads which can be created in the program; +this is also the size of the wait slot array for MySQL threads which +can wait inside InnoDB */ + +#define OS_THREAD_MAX_N srv_max_n_threads + +/* Possible fixed priorities for threads */ +#define OS_THREAD_PRIORITY_NONE 100 +#define OS_THREAD_PRIORITY_BACKGROUND 1 +#define OS_THREAD_PRIORITY_NORMAL 2 +#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3 + +#ifdef __WIN__ +typedef void* os_thread_t; +typedef DWORD os_thread_id_t; /*!< In Windows the thread id + is an unsigned long int */ +typedef os_thread_id_t os_tid_t; +extern "C" { +typedef LPTHREAD_START_ROUTINE os_thread_func_t; +} + +/** Macro for specifying a Windows thread start function. */ +#define DECLARE_THREAD(func) WINAPI func + +/** Required to get around a build error on Windows. Even though our functions +are defined/declared as WINAPI f(LPVOID a); the compiler complains that they +are defined as: os_thread_ret_t (__cdecl*)(void*). Because our functions +don't access the arguments and don't return any value, we should be safe. */ +#define os_thread_create(f,a,i) \ + os_thread_create_func(reinterpret_cast<os_thread_func_t>(f), a, i) + +#else + +typedef pthread_t os_thread_t; +typedef os_thread_t os_thread_id_t; /*!< In Unix we use the thread + handle itself as the id of + the thread */ +#ifdef UNIV_LINUX +typedef pid_t os_tid_t; /*!< An alias for pid_t on + Linux, where setpriority() + accepts thread id of this type + and not pthread_t */ +#else +typedef os_thread_id_t os_tid_t; +#endif + +extern "C" { typedef void* (*os_thread_func_t)(void*); } + +/** Macro for specifying a POSIX thread start function. */ +#define DECLARE_THREAD(func) func +#define os_thread_create(f,a,i) os_thread_create_func(f, a, i) + +#endif /* __WIN__ */ + +/* Define a function pointer type to use in a typecast */ +typedef void* (*os_posix_f_t) (void*); + +#ifdef HAVE_PSI_INTERFACE +/* Define for performance schema registration key */ +typedef unsigned int mysql_pfs_key_t; +#endif + +/***************************************************************//** +Compares two thread ids for equality. +@return TRUE if equal */ +UNIV_INTERN +ibool +os_thread_eq( +/*=========*/ + os_thread_id_t a, /*!< in: OS thread or thread id */ + os_thread_id_t b); /*!< in: OS thread or thread id */ +/****************************************************************//** +Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is +unique for the thread though! +@return thread identifier as a number */ +UNIV_INTERN +ulint +os_thread_pf( +/*=========*/ + os_thread_id_t a); /*!< in: OS thread identifier */ +/****************************************************************//** +Creates a new thread of execution. The execution starts from +the function given. The start function takes a void* parameter +and returns a ulint. 
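The DECLARE_THREAD() machinery above lets one function definition satisfy both the WINAPI and the POSIX start-routine signatures. A sketch of the conventional pattern (hypothetical worker; os_thread_ret_t and OS_THREAD_DUMMY_RETURN are assumed to be the usual InnoDB helpers accompanying these macros):

	extern "C" UNIV_INTERN
	os_thread_ret_t
	DECLARE_THREAD(my_worker_thread)(
	/*=============================*/
		void*	arg)	/*!< in: startup argument */
	{
		/* ... do the work ... */

		/* leave via os_thread_exit() so that os_thread_count
		stays balanced; a plain return would not decrement it */
		os_thread_exit(NULL);

		OS_THREAD_DUMMY_RETURN;	/* not reached */
	}

	/* creation, from a coordinating thread: */
	os_thread_id_t	id;
	os_thread_create(my_worker_thread, NULL, &id);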
+NOTE: We count the number of threads in os_thread_exit(). A created +thread should always use that to exit and not use return() to exit. +@return handle to the thread */ +UNIV_INTERN +os_thread_t +os_thread_create_func( +/*==================*/ + os_thread_func_t func, /*!< in: pointer to function + from which to start */ + void* arg, /*!< in: argument to start + function */ + os_thread_id_t* thread_id); /*!< out: id of the created + thread, or NULL */ + +/*****************************************************************//** +Exits the current thread. */ +UNIV_INTERN +void +os_thread_exit( +/*===========*/ + void* exit_value) /*!< in: exit value; in Windows this void* + is cast as a DWORD */ + UNIV_COLD __attribute__((noreturn)); +/*****************************************************************//** +Returns the thread identifier of current thread. +@return current thread identifier */ +UNIV_INTERN +os_thread_id_t +os_thread_get_curr_id(void); +/*========================*/ +/*****************************************************************//** +Returns the system-specific thread identifier of current thread. On Linux, +returns tid. On other systems currently returns os_thread_get_curr_id(). + +@return current thread identifier */ +UNIV_INTERN +os_tid_t +os_thread_get_tid(void); +/*=====================*/ +/*****************************************************************//** +Advises the os to give up remainder of the thread's time slice. */ +UNIV_INTERN +void +os_thread_yield(void); +/*=================*/ +/*****************************************************************//** +The thread sleeps at least the time given in microseconds. */ +UNIV_INTERN +void +os_thread_sleep( +/*============*/ + ulint tm); /*!< in: time in microseconds */ +/*****************************************************************//** +Set relative scheduling priority for a given thread on Linux. Currently a +no-op on other systems. + +@return An actual thread priority after the update */ +UNIV_INTERN +ulint +os_thread_set_priority( +/*===================*/ + os_tid_t thread_id, /*!< in: thread id */ + ulint relative_priority); /*!< in: system-specific + priority value */ + +#ifndef UNIV_NONINL +#include "os0thread.ic" +#endif + +#endif diff --git a/storage/xtradb/include/os0thread.ic b/storage/xtradb/include/os0thread.ic new file mode 100644 index 00000000000..0622d22f2dc --- /dev/null +++ b/storage/xtradb/include/os0thread.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0thread.ic +The interface to the operating system +process and thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/page0cur.h b/storage/xtradb/include/page0cur.h new file mode 100644 index 00000000000..b1ad49b4915 --- /dev/null +++ b/storage/xtradb/include/page0cur.h @@ -0,0 +1,387 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/page0cur.h +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef page0cur_h +#define page0cur_h + +#include "univ.i" + +#include "buf0types.h" +#include "page0page.h" +#include "rem0rec.h" +#include "data0data.h" +#include "mtr0mtr.h" + + +#define PAGE_CUR_ADAPT + +/* Page cursor search modes; the values must be in this order! */ + +#define PAGE_CUR_UNSUPP 0 +#define PAGE_CUR_G 1 +#define PAGE_CUR_GE 2 +#define PAGE_CUR_L 3 +#define PAGE_CUR_LE 4 +/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in + "column LIKE 'abc%' ORDER BY column DESC"; + we have to find strings which are <= 'abc' or + which extend it */ +#ifdef UNIV_SEARCH_DEBUG +# define PAGE_CUR_DBG 6 /* As PAGE_CUR_LE, but skips search shortcut */ +#endif /* UNIV_SEARCH_DEBUG */ + +#ifdef UNIV_DEBUG +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the buffer block where the cursor is positioned. +@return page */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets the record where the cursor is positioned. 
+@return record */ +UNIV_INLINE +rec_t* +page_cur_get_rec( +/*=============*/ + page_cur_t* cur); /*!< in: page cursor */ +#else /* UNIV_DEBUG */ +# define page_cur_get_page(cur) page_align((cur)->rec) +# define page_cur_get_block(cur) (cur)->block +# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block) +# define page_cur_get_rec(cur) (cur)->rec +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is before first user record on page. +@return TRUE if at start */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + const page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is after last user record. +@return TRUE if at end */ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + const page_cur_t* cur); /*!< in: cursor */ +/**********************************************************//** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /*!< in: record on a page */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + page_cur_t* cur); /*!< out: page cursor */ +/**********************************************************//** +Invalidates a page cursor by setting the record pointer NULL. */ +UNIV_INLINE +void +page_cur_invalidate( +/*================*/ + page_cur_t* cur); /*!< out: page cursor */ +/**********************************************************//** +Moves the cursor to the next record on page. */ +UNIV_INLINE +void +page_cur_move_to_next( +/*==================*/ + page_cur_t* cur); /*!< in/out: cursor; must not be after last */ +/**********************************************************//** +Moves the cursor to the previous record on page. */ +UNIV_INLINE +void +page_cur_move_to_prev( +/*==================*/ + page_cur_t* cur); /*!< in/out: cursor; not before first */ +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
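Together these calls give the canonical whole-page scan; a sketch (hypothetical buf_block_t* block): start before the first user record, step once to leave the infimum, and stop at the supremum:

	page_cur_t	cur;

	page_cur_set_before_first(block, &cur);
	page_cur_move_to_next(&cur);	/* step off the infimum */

	while (!page_cur_is_after_last(&cur)) {
		rec_t*	rec = page_cur_get_rec(&cur);

		/* ... process rec ... */

		page_cur_move_to_next(&cur);
	}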
+ +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dtuple_t* tuple, /*!< in: pointer to a data tuple */ + dict_index_t* index, /*!< in: record descriptor */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ + __attribute__((nonnull(1,2,3,4,5), warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_rec_insert( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const rec_t* rec, /*!< in: record to insert */ + dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */ +/***********************************************************//** +Inserts a record next to page cursor on an uncompressed page. +Returns pointer to inserted record if succeed, i.e., enough +space available, NULL otherwise. The cursor stays at the same position. +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_low( +/*====================*/ + rec_t* current_rec,/*!< in: pointer to current record after + which the new record is inserted */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ + __attribute__((nonnull(1,2,3,4), warn_unused_result)); +/***********************************************************//** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ + __attribute__((nonnull(1,2,3,4), warn_unused_result)); +/*************************************************************//** +Copies records from page to a newly created page, from a given record onward, +including that record. 
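A caller-side sketch of page_cur_tuple_insert() (hypothetical cursor, tuple, index and mtr): offsets and heap are passed by reference so the callee can allocate them lazily, and a NULL result signals that the page has to be reorganized or split before retrying:

	mem_heap_t*	heap = NULL;
	ulint*		offsets = NULL;

	rec_t*	rec = page_cur_tuple_insert(
		&cur, tuple, index, &offsets, &heap, 0, &mtr);

	if (rec == NULL) {
		/* not enough space on the page: reorganize or split */
	}

	if (heap != NULL) {
		mem_heap_free(heap);
	}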
Infimum and supremum records are not copied. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ +UNIV_INTERN +void +page_copy_rec_list_end_to_created_page( +/*===================================*/ + page_t* new_page, /*!< in/out: index page to copy to */ + rec_t* rec, /*!< in: first record to copy */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr */ +/***********************************************************//** +Deletes a record at the page cursor. The cursor is moved to the +next record after the deleted one. */ +UNIV_INTERN +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Searches the right position for a page cursor. +@return number of matched fields on the left */ +UNIV_INLINE +ulint +page_cur_search( +/*============*/ + const buf_block_t* block, /*!< in: buffer block */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor);/*!< out: page cursor */ +/****************************************************************//** +Searches the right position for a page cursor. */ +UNIV_INTERN +void +page_cur_search_with_match( +/*=======================*/ + const buf_block_t* block, /*!< in: buffer block */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /*!< in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /*!< in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor);/*!< out: page cursor */ +/***********************************************************//** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. */ +UNIV_INTERN +void +page_cur_open_on_rnd_user_rec( +/*==========================*/ + buf_block_t* block, /*!< in: page */ + page_cur_t* cursor);/*!< out: page cursor */ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Parses a log record of a record insert on a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_cur_parse_insert_rec( +/*======================*/ + ibool is_short,/*!< in: TRUE if short inserts */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/**********************************************************//** +Parses a log record of copying a record list end to a newly created page.
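page_cur_search() above is the convenience entry point around page_cur_search_with_match(). A lookup sketch (hypothetical block, index and tuple): PAGE_CUR_LE positions the cursor on the last record not greater than the tuple, and the return value says how many leading fields of the tuple match that record:

	page_cur_t	cur;

	ulint	matched = page_cur_search(
		block, index, tuple, PAGE_CUR_LE, &cur);

	if (matched == dtuple_get_n_fields(tuple)) {
		/* possible exact match at page_cur_get_rec(&cur) */
	}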
+@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_copy_rec_list_to_created_page( +/*=====================================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/***********************************************************//** +Parses log record of a record delete on a page. +@return pointer to record end or NULL */ +UNIV_INTERN +byte* +page_cur_parse_delete_rec( +/*======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/*******************************************************//** +Removes the record from a leaf page. This function does not log +any changes. It is used by the IMPORT tablespace functions. +@return true if success, i.e., the page did not become too empty */ +UNIV_INTERN +bool +page_delete_rec( +/*============*/ + const dict_index_t* index, /*!< in: The index that the record + belongs to */ + page_cur_t* pcur, /*!< in/out: page cursor on record + to delete */ + page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + const ulint* offsets);/*!< in: offsets for record */ + +/** Index page cursor */ + +struct page_cur_t{ + byte* rec; /*!< pointer to a record on page */ + buf_block_t* block; /*!< pointer to the block containing rec */ +}; + +#ifndef UNIV_NONINL +#include "page0cur.ic" +#endif + +#endif diff --git a/storage/xtradb/include/page0cur.ic b/storage/xtradb/include/page0cur.ic new file mode 100644 index 00000000000..028d33b17aa --- /dev/null +++ b/storage/xtradb/include/page0cur.ic @@ -0,0 +1,317 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/page0cur.ic +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0page.h" +#include "buf0types.h" + +#ifdef UNIV_DEBUG +# include "rem0cmp.h" + +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + + return(page_align(cur->rec)); +} + +/*********************************************************//** +Gets pointer to the buffer block where the cursor is positioned. 
+@return page */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(cur->block); +} + +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + return(buf_block_get_page_zip(page_cur_get_block(cur))); +} + +/*********************************************************//** +Gets the record where the cursor is positioned. +@return record */ +UNIV_INLINE +rec_t* +page_cur_get_rec( +/*=============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + + return(cur->rec); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur) /*!< in: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block)); +} + +/*********************************************************//** +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur) /*!< in: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block)); +} + +/*********************************************************//** +Returns TRUE if the cursor is before first user record on page. +@return TRUE if at start */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + const page_cur_t* cur) /*!< in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_infimum(cur->rec)); +} + +/*********************************************************//** +Returns TRUE if the cursor is after last user record. +@return TRUE if at end */ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + const page_cur_t* cur) /*!< in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_supremum(cur->rec)); +} + +/**********************************************************//** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /*!< in: record on a page */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + page_cur_t* cur) /*!< out: page cursor */ +{ + ut_ad(rec && block && cur); + ut_ad(page_align(rec) == block->frame); + + cur->rec = (rec_t*) rec; + cur->block = (buf_block_t*) block; +} + +/**********************************************************//** +Invalidates a page cursor by setting the record pointer NULL. */ +UNIV_INLINE +void +page_cur_invalidate( +/*================*/ + page_cur_t* cur) /*!< out: page cursor */ +{ + ut_ad(cur); + + cur->rec = NULL; + cur->block = NULL; +} + +/**********************************************************//** +Moves the cursor to the next record on page. 
*/
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: cursor; must not be after last */
+{
+ ut_ad(!page_cur_is_after_last(cur));
+
+ cur->rec = page_rec_get_next(cur->rec);
+}
+
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: page cursor, not before first */
+{
+ ut_ad(!page_cur_is_before_first(cur));
+
+ cur->rec = page_rec_get_prev(cur->rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint low_matched_fields = 0;
+ ulint low_matched_bytes = 0;
+ ulint up_matched_fields = 0;
+ ulint up_matched_bytes = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ page_cur_search_with_match(block, index, tuple, mode,
+ &up_matched_fields,
+ &up_matched_bytes,
+ &low_matched_fields,
+ &low_matched_bytes,
+ cursor);
+ return(low_matched_fields);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns a pointer to the inserted
+record on success, i.e., if enough space is available, and NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ ulint size
+ = rec_get_converted_size(index, tuple, n_ext);
+ rec_t* rec;
+
+ if (!*heap) {
+ *heap = mem_heap_create(size
+ + (4 + REC_OFFS_HEADER_SIZE
+ + dtuple_get_n_fields(tuple))
+ * sizeof **offsets);
+ }
+
+ rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size),
+ index, tuple, n_ext);
+ *offsets = rec_get_offsets(
+ rec, index, *offsets, ULINT_UNDEFINED, heap);
+
+ if (buf_block_get_page_zip(cursor->block)) {
+ rec = page_cur_insert_rec_zip(
+ cursor, index, rec, *offsets, mtr);
+ } else {
+ rec = page_cur_insert_rec_low(cursor->rec,
+ index, rec, *offsets, mtr);
+ }
+
+ ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets));
+ return(rec);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns a pointer to the inserted
+record on success, i.e., if enough space is available, and NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_t* rec, /*!< in: record to insert */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ if (buf_block_get_page_zip(cursor->block)) {
+ return(page_cur_insert_rec_zip(
+ cursor, index, rec, offsets, mtr));
+ } else {
+ return(page_cur_insert_rec_low(cursor->rec,
+ index, rec, offsets, mtr));
+ }
+}
diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h
new file mode 100644
index 00000000000..a387d9e0de1
--- /dev/null
+++ b/storage/xtradb/include/page0page.h
@@ -0,0 +1,1122 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "univ.i"
+
+#include "page0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "data0data.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "fsp0fsp.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/* PAGE HEADER
+ ===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef byte page_header_t;
+
+#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this
+ offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */
+#define PAGE_HEAP_TOP 2 /* pointer to record heap top */
+#define PAGE_N_HEAP 4 /* number of records in the heap,
+ bit 15=flag: new-style compact page format */
+#define PAGE_FREE 6 /* pointer to start of page free record list */
+#define PAGE_GARBAGE 8 /* number of bytes in deleted records */
+#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or
+ NULL if this info has been reset by a delete,
+ for example */
+#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ...
*/
+#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same
+ direction */
+#define PAGE_N_RECS 16 /* number of user records on the page */
+#define PAGE_MAX_TRX_ID 18 /* highest id of a trx which may have modified
+ a record on the page; trx_id_t; defined only
+ in secondary indexes and in the insert buffer
+ tree */
+#define PAGE_HEADER_PRIV_END 26 /* end of the private data fields of the
+ page header, which are set when a page
+ is created */
+/*----*/
+#define PAGE_LEVEL 26 /* level of the node in an index tree; the
+ leaf level is level 0. This field should
+ not be written to after page creation. */
+#define PAGE_INDEX_ID 28 /* id of the index the page belongs to.
+ This field should not be written to after
+ page creation. */
+#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in
+ a B-tree: defined only on the root page of a
+ B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+ /* in the place of PAGE_BTR_SEG_LEAF and _TOP
+ there is a free list base node if the page is
+ the root page of an ibuf tree, and at the same
+ place is the free list node if the page is in
+ a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+ /* file segment header for the non-leaf pages
+ in a B-tree: defined only on the root page of
+ a B-tree, but not in the root of an ibuf
+ tree */
+/*----*/
+#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+ /* start of data on the page */
+
+#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+ /* offset of the page infimum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+ /* offset of the page supremum record end on
+ an old-style page */
+#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+ /* offset of the page infimum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+ /* offset of the page supremum record end on
+ a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM 0 /* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM 1 /* page supremum */
+#define PAGE_HEAP_NO_USER_LOW 2 /* first user record in
+ creation (insertion) order,
+ not necessarily collation order;
+ this record may have been deleted */
+
+/* Directions of cursor movement */
+#define PAGE_LEFT 1
+#define PAGE_RIGHT 2
+#define PAGE_SAME_REC 3
+#define PAGE_SAME_PAGE 4
+#define PAGE_NO_DIRECTION 5
+
+/* PAGE DIRECTORY
+ ==============
+*/
+
+typedef byte page_dir_slot_t;
+typedef page_dir_slot_t page_dir_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address the directory start, as it points to
+the first record in the list of records. */
+#define PAGE_DIR FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+#define PAGE_DIR_SLOT_SIZE 2
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot.
The +number may drop below the minimum in the first and the last slot in the +directory. */ +#define PAGE_DIR_SLOT_MAX_N_OWNED 8 +#define PAGE_DIR_SLOT_MIN_N_OWNED 4 + +/************************************************************//** +Gets the start of a page. +@return start of the page */ +UNIV_INLINE +page_t* +page_align( +/*=======*/ + const void* ptr) /*!< in: pointer to page frame */ + __attribute__((const)); +/************************************************************//** +Gets the offset within a page. +@return offset from the start of the page */ +UNIV_INLINE +ulint +page_offset( +/*========*/ + const void* ptr) /*!< in: pointer to page frame */ + __attribute__((const)); +/*************************************************************//** +Returns the max trx id field value. */ +UNIV_INLINE +trx_id_t +page_get_max_trx_id( +/*================*/ + const page_t* page); /*!< in: page */ +/*************************************************************//** +Sets the max trx id field value. */ +UNIV_INTERN +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */ +/*************************************************************//** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/*************************************************************//** +Reads the given header field. */ +UNIV_INLINE +ulint +page_header_get_field( +/*==================*/ + const page_t* page, /*!< in: page */ + ulint field); /*!< in: PAGE_N_DIR_SLOTS, ... */ +/*************************************************************//** +Sets the given header field. */ +UNIV_INLINE +void +page_header_set_field( +/*==================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */ + ulint val); /*!< in: value */ +/*************************************************************//** +Returns the offset stored in the given header field. +@return offset from the start of the page, or 0 */ +UNIV_INLINE +ulint +page_header_get_offs( +/*=================*/ + const page_t* page, /*!< in: page */ + ulint field) /*!< in: PAGE_FREE, ... */ + __attribute__((nonnull, pure)); + +/*************************************************************//** +Returns the pointer stored in the given header field, or NULL. */ +#define page_header_get_ptr(page, field) \ + (page_header_get_offs(page, field) \ + ? page + page_header_get_offs(page, field) : NULL) +/*************************************************************//** +Sets the pointer stored in the given header field. */ +UNIV_INLINE +void +page_header_set_ptr( +/*================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /*!< in/out: PAGE_FREE, ... 
*/ + const byte* ptr); /*!< in: pointer or NULL*/ +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Resets the last insert info field in the page header. Writes to mlog +about this operation. */ +UNIV_INLINE +void +page_header_reset_last_insert( +/*==========================*/ + page_t* page, /*!< in: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + mtr_t* mtr); /*!< in: mtr */ +#endif /* !UNIV_HOTBACKUP */ +/************************************************************//** +Gets the offset of the first record on the page. +@return offset of the first record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_infimum_offset( +/*====================*/ + const page_t* page); /*!< in: page which must have record(s) */ +/************************************************************//** +Gets the offset of the last record on the page. +@return offset of the last record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_supremum_offset( +/*=====================*/ + const page_t* page); /*!< in: page which must have record(s) */ +#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page)) +#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page)) + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INTERN +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Compares a data tuple to a physical record. Differs from the function +cmp_dtuple_rec_with_match in the way that the record must reside on an +index page, and also page infimum and supremum records can be given in +the parameter rec. These are considered as the negative infinity and +the positive infinity in the alphabetical order. 
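+
+For illustration (editor's sketch): because the infimum compares below
+any tuple and the supremum above, a search loop can pass them in
+without special-casing the page ends:
+
+	ulint	matched_fields = 0;
+	ulint	matched_bytes = 0;
+	int	cmp = page_cmp_dtuple_rec_with_match(
+		tuple, rec, offsets, &matched_fields, &matched_bytes);
+	// cmp > 0 here whenever rec is the page infimum
+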
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes); /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+#endif /* !UNIV_HOTBACKUP */
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+Returns the number of records before the given record in the chain.
+The number includes infimum and supremum records.
+This is the inverse function of page_rec_get_nth().
+@return number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec); /*!< in: the physical record */
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the heap */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap);/*!< in: number of records */
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of dir slots in directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint n_slots);/*!< in: number of slots */
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot.
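+
+An editor's example of walking the directory (by the convention noted
+at page_check_dir() below, slot 0 points to the infimum record and the
+last slot to the supremum):
+
+	ulint	i;
+
+	for (i = 0; i < page_dir_get_n_slots(page); i++) {
+		const rec_t*	rec = page_dir_slot_get_rec(
+			page_dir_get_nth_slot(page, i));
+		// ... inspect rec, e.g. via page_dir_slot_get_n_owned() ...
+	}
+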
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n); /*!< in: position */
+#else /* UNIV_DEBUG */
+# define page_dir_get_nth_slot(page, n) \
+ ((page) + UNIV_PAGE_SIZE - PAGE_DIR \
+ - ((n) + 1) * PAGE_DIR_SLOT_SIZE)
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE on success */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot); /*!< in: directory slot */
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec); /*!< in: record on the page */
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot); /*!< in: page directory slot */
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t*slot, /*!< in/out: directory slot */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n); /*!< in: number of records owned by the slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is the fraction
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED,
+rounded up to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs); /*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+ const page_t* page); /*!< in: index page */
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
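+
+An editor's sketch of the typical use in a B-tree descent (the child
+page fetch is hypothetical pseudocode, not an API from this file):
+
+	while (!page_is_leaf(page)) {
+		// follow the node pointer record to the child page,
+		// e.g. page = fetch_child_page(page, tuple);
+	}
+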
+@return true if the page is a B-tree leaf (PAGE_LEVEL = 0) */ +UNIV_INLINE +bool +page_is_leaf( +/*=========*/ + const page_t* page) /*!< in: page */ + __attribute__((pure)); +/************************************************************//** +Determine whether the page is empty. +@return true if the page is empty (PAGE_N_RECS = 0) */ +UNIV_INLINE +bool +page_is_empty( +/*==========*/ + const page_t* page) /*!< in: page */ + __attribute__((nonnull, pure)); +/************************************************************//** +Determine whether the page contains garbage. +@return true if the page contains garbage (PAGE_GARBAGE is not 0) */ +UNIV_INLINE +bool +page_has_garbage( +/*=============*/ + const page_t* page) /*!< in: page */ + __attribute__((nonnull, pure)); +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + const rec_t* rec, /*!< in: pointer to record */ + ulint comp); /*!< in: nonzero=compact page layout */ +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + const rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Sets the pointer to the next record on the page. */ +UNIV_INLINE +void +page_rec_set_next( +/*==============*/ + rec_t* rec, /*!< in: pointer to record, + must not be page supremum */ + const rec_t* next); /*!< in: pointer to next record, + must not be page infimum */ +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + const rec_t* rec); /*!< in: pointer to record, must not be page + infimum */ +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + rec_t* rec); /*!< in: pointer to record, + must not be page infimum */ +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ + __attribute__((const)); +/************************************************************//** +TRUE if the record is the supremum record on a page. 
+@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ + __attribute__((const)); +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum_low( +/*====================*/ + ulint offset) /*!< in: record offset on page */ + __attribute__((const)); + +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec( +/*=================*/ + const rec_t* rec) /*!< in: record */ + __attribute__((const)); +/************************************************************//** +TRUE if the record is the supremum record on a page. +@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum( +/*=================*/ + const rec_t* rec) /*!< in: record */ + __attribute__((const)); + +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + const rec_t* rec) /*!< in: record */ + __attribute__((const)); +/***************************************************************//** +Looks for the record which owns the given record. +@return the owner record */ +UNIV_INLINE +rec_t* +page_rec_find_owner_rec( +/*====================*/ + rec_t* rec); /*!< in: the physical record */ +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Write a 32-bit field in a data dictionary record. */ +UNIV_INLINE +void +page_rec_write_field( +/*=================*/ + rec_t* rec, /*!< in/out: record to update */ + ulint i, /*!< in: index of the field to update */ + ulint val, /*!< in: value to write */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap if page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/*************************************************************//** +Calculates free space if a page is emptied. +@return free space */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((const)); +/**********************************************************//** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. 
+@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + const rec_t* rec); /*!< in: physical record */ +/************************************************************//** +Returns the sum of the sizes of the records in the record list +excluding the infimum and supremum records. +@return data in bytes */ +UNIV_INLINE +ulint +page_get_data_size( +/*===============*/ + const page_t* page); /*!< in: index page */ +/************************************************************//** +Allocates a block of memory from the head of the free list +of an index page. */ +UNIV_INLINE +void +page_mem_alloc_free( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page with enough + space available for inserting the record, + or NULL */ + rec_t* next_rec,/*!< in: pointer to the new head of the + free record list */ + ulint need); /*!< in: number of bytes allocated */ +/************************************************************//** +Allocates a block of memory from the heap of an index page. +@return pointer to start of allocated buffer, or NULL if allocation fails */ +UNIV_INTERN +byte* +page_mem_alloc_heap( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page with enough + space available for inserting the record, + or NULL */ + ulint need, /*!< in: total number of bytes needed */ + ulint* heap_no);/*!< out: this contains the heap number + of the allocated record + if allocation succeeds */ +/************************************************************//** +Puts a record to free list. */ +UNIV_INLINE +void +page_mem_free( +/*==========*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, + or NULL */ + rec_t* rec, /*!< in: pointer to the (origin of) + record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets);/*!< in: array returned by + rec_get_offsets() */ +/**********************************************************//** +Create an uncompressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create( +/*========*/ + buf_block_t* block, /*!< in: a buffer block where the + page is created */ + mtr_t* mtr, /*!< in: mini-transaction handle */ + ulint comp); /*!< in: nonzero=compact page format */ +/**********************************************************//** +Create a compressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame where the + page is created */ + dict_index_t* index, /*!< in: the index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/**********************************************************//** +Empty a previously created B-tree index page. */ +UNIV_INTERN +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull(1,2))); +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. 
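+
+An editor's sketch of a plausible calling pattern during a page split:
+the records from the split point onward are first copied to the new
+page and then removed from the old one (error handling omitted):
+
+	page_copy_rec_list_end_no_locks(
+		new_block, block, split_rec, index, mtr);
+	page_delete_rec_list_end(split_rec, block, index,
+				 ULINT_UNDEFINED, ULINT_UNDEFINED, mtr);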
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ +UNIV_INTERN +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr */ +/*************************************************************//** +Copies records from page to new_page, from the given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original successor of the infimum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Copies records from page to new_page, up to the given record, NOT +including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. */ +UNIV_INTERN +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. 
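+
+This is the counterpart of page_delete_rec_list_end(). An editor's
+sketch: moving the start of a record list to another page is a copy
+followed by this deletion, essentially what page_move_rec_list_start(),
+declared below, is documented to do (return value checks omitted):
+
+	page_copy_rec_list_start(new_block, block, split_rec, index, mtr);
+	page_delete_rec_list_start(split_rec, block, index, mtr);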
*/ +UNIV_INTERN +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure (new_block will +be decompressed) */ +UNIV_INTERN +ibool +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull(1, 2, 4, 5))); +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure */ +UNIV_INTERN +ibool +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull(1, 2, 4, 5))); +/****************************************************************//** +Splits a directory slot which owns too many records. */ +UNIV_INTERN +void +page_dir_split_slot( +/*================*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be written, or NULL */ + ulint slot_no)/*!< in: the directory slot */ + __attribute__((nonnull(1))); +/*************************************************************//** +Tries to balance the given directory slot with too few records +with the upper neighbor, so that there are at least the minimum number +of records owned by the slot; this may result in the merging of +two slots. */ +UNIV_INTERN +void +page_dir_balance_slot( +/*==================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint slot_no)/*!< in: the directory slot */ + __attribute__((nonnull(1))); +/**********************************************************//** +Parses a log record of a record list end or start deletion. 
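+
+An editor's sketch of how redo application might invoke the parser
+with one of the listed types (the surrounding recovery loop is
+hypothetical):
+
+	ptr = page_parse_delete_rec_list(
+		MLOG_LIST_END_DELETE, ptr, end_ptr, block, index, mtr);
+
+	if (ptr == NULL) {
+		// presumably an incomplete log record: wait for more log
+	}
+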
+@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_delete_rec_list( +/*=======================*/ + byte type, /*!< in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in/out: buffer block or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/***********************************************************//** +Parses a redo log record of creating a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_create( +/*==============*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint comp, /*!< in: nonzero=compact page format */ + buf_block_t* block, /*!< in: block or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +UNIV_INTERN +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: record descriptor */ +# ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +UNIV_INTERN +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. */ +UNIV_INTERN +void +page_print_list( +/*============*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +Prints the info in a page header. */ +UNIV_INTERN +void +page_header_print( +/*==============*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +This is used to print the contents of the page for +debugging purposes. */ +UNIV_INTERN +void +page_print( +/*=======*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint dn, /*!< in: print dn first and last entries + in directory */ + ulint rn); /*!< in: print rn first and last records + in directory */ +# endif /* UNIV_BTR_PRINT */ +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. 
*/ +UNIV_INTERN +void +page_check_dir( +/*===========*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +/***************************************************************//** +This function checks the consistency of an index page. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_validate( +/*==========*/ + const page_t* page, /*!< in: index page */ + dict_index_t* index); /*!< in: data dictionary index containing + the page record type definition */ +/***************************************************************//** +Looks in the page record list for a record with the given heap number. +@return record, NULL if not found */ + +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no);/*!< in: heap number */ +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ + +const rec_t* +page_find_rec_max_not_deleted( + const page_t* page); +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif + +#ifndef UNIV_NONINL +#include "page0page.ic" +#endif + +#endif diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic new file mode 100644 index 00000000000..99ef07349dc --- /dev/null +++ b/storage/xtradb/include/page0page.ic @@ -0,0 +1,1179 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.ic
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifdef UNIV_DEBUG
+# include "log0recv.h"
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+# include "rem0cmp.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mtr0log.h"
+#include "page0zip.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/************************************************************//**
+Gets the start of a page.
+@return start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return((page_t*) ut_align_down(ptr, UNIV_PAGE_SIZE));
+}
+/************************************************************//**
+Gets the offset within a page.
+@return offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return(ut_align_offset(ptr, UNIV_PAGE_SIZE));
+}
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page);
+
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID));
+}
+
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(block);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* During crash recovery, this function may be called on
+ something other than a leaf page of a secondary index or the
+ insert buffer index tree (dict_index_is_sec_or_ibuf() returns
+ TRUE for the dummy indexes constructed during redo log
+ application). In that case, PAGE_MAX_TRX_ID is unused,
+ and trx_id is usually zero. */
+ ut_ad(trx_id || recv_recovery_is_on());
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) {
+
+ page_set_max_trx_id(block, page_zip, trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_LEVEL, ... */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_INDEX_ID);
+
+ return(mach_read_from_2(page + PAGE_HEADER + field));
+}
+
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_N_DIR_SLOTS, ...
*/ + ulint val) /*!< in: value */ +{ + ut_ad(page); + ut_ad(field <= PAGE_N_RECS); + ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE); + ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE); + + mach_write_to_2(page + PAGE_HEADER + field, val); + if (page_zip) { + page_zip_write_header(page_zip, + page + PAGE_HEADER + field, 2, NULL); + } +} + +/*************************************************************//** +Returns the offset stored in the given header field. +@return offset from the start of the page, or 0 */ +UNIV_INLINE +ulint +page_header_get_offs( +/*=================*/ + const page_t* page, /*!< in: page */ + ulint field) /*!< in: PAGE_FREE, ... */ +{ + ulint offs; + + ut_ad(page); + ut_ad((field == PAGE_FREE) + || (field == PAGE_LAST_INSERT) + || (field == PAGE_HEAP_TOP)); + + offs = page_header_get_field(page, field); + + ut_ad((field != PAGE_HEAP_TOP) || offs); + + return(offs); +} + +/*************************************************************//** +Sets the pointer stored in the given header field. */ +UNIV_INLINE +void +page_header_set_ptr( +/*================*/ + page_t* page, /*!< in: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /*!< in: PAGE_FREE, ... */ + const byte* ptr) /*!< in: pointer or NULL*/ +{ + ulint offs; + + ut_ad(page); + ut_ad((field == PAGE_FREE) + || (field == PAGE_LAST_INSERT) + || (field == PAGE_HEAP_TOP)); + + if (ptr == NULL) { + offs = 0; + } else { + offs = ptr - page; + } + + ut_ad((field != PAGE_HEAP_TOP) || offs); + + page_header_set_field(page, page_zip, field, offs); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Resets the last insert info field in the page header. Writes to mlog +about this operation. */ +UNIV_INLINE +void +page_header_reset_last_insert( +/*==========================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(page && mtr); + + if (page_zip) { + mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_LAST_INSERT), + 2, mtr); + } else { + mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0, + MLOG_2BYTES, mtr); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************//** +Determine whether the page is in new-style compact format. +@return nonzero if the page is in compact format, zero if it is in +old-style format */ +UNIV_INLINE +ulint +page_is_comp( +/*=========*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x8000); +} + +/************************************************************//** +TRUE if the record is on a page in compact format. +@return nonzero if in compact format */ +UNIV_INLINE +ulint +page_rec_is_comp( +/*=============*/ + const rec_t* rec) /*!< in: record */ +{ + return(page_is_comp(page_align(rec))); +} + +/***************************************************************//** +Returns the heap number of a record. 
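+
+The fixed heap numbers from page0page.h apply here; an editor's
+illustration:
+
+	ut_ad(page_rec_get_heap_no(page_get_infimum_rec(page))
+	      == PAGE_HEAP_NO_INFIMUM);
+	ut_ad(page_rec_get_heap_no(page_get_supremum_rec(page))
+	      == PAGE_HEAP_NO_SUPREMUM);
+	// user records are numbered upwards from PAGE_HEAP_NO_USER_LOW
+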
+@return heap number */ +UNIV_INLINE +ulint +page_rec_get_heap_no( +/*=================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + if (page_rec_is_comp(rec)) { + return(rec_get_heap_no_new(rec)); + } else { + return(rec_get_heap_no_old(rec)); + } +} + +/************************************************************//** +Determine whether the page is a B-tree leaf. +@return true if the page is a B-tree leaf (PAGE_LEVEL = 0) */ +UNIV_INLINE +bool +page_is_leaf( +/*=========*/ + const page_t* page) /*!< in: page */ +{ + if (!page) { + return(FALSE); + } + return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL))); +} + +/************************************************************//** +Determine whether the page is empty. +@return true if the page is empty (PAGE_N_RECS = 0) */ +UNIV_INLINE +bool +page_is_empty( +/*==========*/ + const page_t* page) /*!< in: page */ +{ + return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_N_RECS))); +} + +/************************************************************//** +Determine whether the page contains garbage. +@return true if the page contains garbage (PAGE_GARBAGE is not 0) */ +UNIV_INLINE +bool +page_has_garbage( +/*=============*/ + const page_t* page) /*!< in: page */ +{ + return(!!*(const uint16*) (page + (PAGE_HEADER + PAGE_GARBAGE))); +} + +/************************************************************//** +Gets the offset of the first record on the page. +@return offset of the first record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_infimum_offset( +/*====================*/ + const page_t* page) /*!< in: page which must have record(s) */ +{ + ut_ad(page); + ut_ad(!page_offset(page)); + + if (page_is_comp(page)) { + return(PAGE_NEW_INFIMUM); + } else { + return(PAGE_OLD_INFIMUM); + } +} + +/************************************************************//** +Gets the offset of the last record on the page. +@return offset of the last record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_supremum_offset( +/*=====================*/ + const page_t* page) /*!< in: page which must have record(s) */ +{ + ut_ad(page); + ut_ad(!page_offset(page)); + + if (page_is_comp(page)) { + return(PAGE_NEW_SUPREMUM); + } else { + return(PAGE_OLD_SUPREMUM); + } +} + +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); +#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM +# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM" +#endif +#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM +# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM +# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM" +#endif +#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM +# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END +# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END" +#endif +#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END +# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END" +#endif + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(offset != PAGE_NEW_SUPREMUM + && offset != PAGE_NEW_INFIMUM + && offset != PAGE_OLD_INFIMUM + && offset != PAGE_OLD_SUPREMUM); +} + +/************************************************************//** +TRUE if the record is the supremum record on a page. 
+@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(offset == PAGE_NEW_SUPREMUM + || offset == PAGE_OLD_SUPREMUM); +} + +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum_low( +/*====================*/ + ulint offset) /*!< in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM); +} + +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec( +/*=================*/ + const rec_t* rec) /*!< in: record */ +{ + ut_ad(page_rec_check(rec)); + + return(page_rec_is_user_rec_low(page_offset(rec))); +} + +/************************************************************//** +TRUE if the record is the supremum record on a page. +@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum( +/*=================*/ + const rec_t* rec) /*!< in: record */ +{ + ut_ad(page_rec_check(rec)); + + return(page_rec_is_supremum_low(page_offset(rec))); +} + +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + const rec_t* rec) /*!< in: record */ +{ + ut_ad(page_rec_check(rec)); + + return(page_rec_is_infimum_low(page_offset(rec))); +} + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + return((rec_t*) page_rec_get_nth_const(page, nth)); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ +{ + ulint middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2; + + return(page_rec_get_nth(page, middle)); +} + +/*************************************************************//** +Compares a data tuple to a physical record. Differs from the function +cmp_dtuple_rec_with_match in the way that the record must reside on an +index page, and also page infimum and supremum records can be given in +the parameter rec. These are considered as the negative infinity and +the positive infinity in the alphabetical order. 
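+Consequently, any data tuple compares greater than the page infimum and
+less than the page supremum, so for those two boundary records the
+function returns 1 or -1 immediately, without comparing any fields or
+updating the matched-fields counters.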
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared */ +UNIV_INLINE +int +page_cmp_dtuple_rec_with_match( +/*===========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record on a page; may also + be page infimum or supremum, in which case + matched-parameter values below are not + affected */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint* matched_fields, /*!< in/out: number of already completely + matched fields; when function returns + contains the value for current comparison */ + ulint* matched_bytes) /*!< in/out: number of already matched + bytes within the first field not completely + matched; when function returns contains the + value for current comparison */ +{ + ulint rec_offset; + + ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec)); + + rec_offset = page_offset(rec); + + if (rec_offset == PAGE_NEW_INFIMUM + || rec_offset == PAGE_OLD_INFIMUM) { + + return(1); + + } else if (rec_offset == PAGE_NEW_SUPREMUM + || rec_offset == PAGE_OLD_SUPREMUM) { + + return(-1); + } + + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + matched_fields, + matched_bytes)); +} +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Gets the page number. +@return page number */ +UNIV_INLINE +ulint +page_get_page_no( +/*=============*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page == page_align((page_t*) page)); + return(mach_read_from_4(page + FIL_PAGE_OFFSET)); +} + +/*************************************************************//** +Gets the tablespace identifier. +@return space id */ +UNIV_INLINE +ulint +page_get_space_id( +/*==============*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page == page_align((page_t*) page)); + return(mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); +} + +/*************************************************************//** +Gets the number of user records on page (infimum and supremum records +are not user records). +@return number of user records */ +UNIV_INLINE +ulint +page_get_n_recs( +/*============*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_RECS)); +} + +/*************************************************************//** +Gets the number of dir slots in directory. +@return number of slots */ +UNIV_INLINE +ulint +page_dir_get_n_slots( +/*=================*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_DIR_SLOTS)); +} +/*************************************************************//** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint n_slots)/*!< in: number of slots */ +{ + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots); +} + +/*************************************************************//** +Gets the number of records in the heap. 
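+The most significant bit (0x8000) of PAGE_N_HEAP stores the
+compact-format flag (see page_is_comp() above), so only the low 15 bits
+count heap records; for example, an empty compact page has
+PAGE_N_HEAP == 0x8002, counting just the infimum and supremum.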
+@return number of user records */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
+
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap) /*!< in: number of records */
+{
+ ut_ad(n_heap < 0x8000);
+ ut_ad(!page_zip || n_heap
+ == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1);
+
+ page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap
+ | (0x8000
+ & page_header_get_field(page, PAGE_N_HEAP)));
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets a pointer to the nth directory slot.
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n) /*!< in: position */
+{
+ ut_ad(page_dir_get_n_slots(page) > n);
+
+ return((page_dir_slot_t*)
+ page + UNIV_PAGE_SIZE - PAGE_DIR
+ - (n + 1) * PAGE_DIR_SLOT_SIZE);
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if the check succeeds */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec) /*!< in: record */
+{
+ const page_t* page = page_align(rec);
+
+ ut_a(rec);
+
+ ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+ ut_a(page_offset(rec) >= PAGE_DATA);
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot) /*!< in: directory slot */
+{
+ return(page_align(slot) + mach_read_from_2(slot));
+}
+
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec) /*!< in: record on the page */
+{
+ ut_ad(page_rec_check(rec));
+
+ mach_write_to_2(slot, page_offset(rec));
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot) /*!< in: page directory slot */
+{
+ const rec_t* rec = page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ return(rec_get_n_owned_new(rec));
+ } else {
+ return(rec_get_n_owned_old(rec));
+ }
+}
+
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. 
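+(Each directory slot is expected to own between
+PAGE_DIR_SLOT_MIN_N_OWNED and PAGE_DIR_SLOT_MAX_N_OWNED records, and
+the count is stored in the n_owned field of the record that the slot
+points to, which is why the getter above and the setter below dispatch
+on the record format.)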
*/ +UNIV_INLINE +void +page_dir_slot_set_n_owned( +/*======================*/ + page_dir_slot_t*slot, /*!< in/out: directory slot */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint n) /*!< in: number of records owned by the slot */ +{ + rec_t* rec = (rec_t*) page_dir_slot_get_rec(slot); + if (page_rec_is_comp(slot)) { + rec_set_n_owned_new(rec, page_zip, n); + } else { + ut_ad(!page_zip); + rec_set_n_owned_old(rec, n); + } +} + +/************************************************************//** +Calculates the space reserved for directory slots of a given number of +records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE / +PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */ +UNIV_INLINE +ulint +page_dir_calc_reserved_space( +/*=========================*/ + ulint n_recs) /*!< in: number of records */ +{ + return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1) + / PAGE_DIR_SLOT_MIN_N_OWNED); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + const rec_t* rec, /*!< in: pointer to record */ + ulint comp) /*!< in: nonzero=compact page layout */ +{ + ulint offs; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + offs = rec_get_next_offs(rec, comp); + + if (offs >= UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Next record offset is nonsensical %lu" + " in record at offset %lu\n" + "InnoDB: rec address %p, space id %lu, page %lu\n", + (ulong) offs, (ulong) page_offset(rec), + (void*) rec, + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page)); + buf_page_print(page, 0, 0); + + ut_error; + } else if (offs == 0) { + + return(NULL); + } + + return(page + offs); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + rec_t* rec) /*!< in: pointer to record */ +{ + return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + return(page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + const rec_t* r; + ulint page_is_compact = page_rec_is_comp(rec); + + for (r = page_rec_get_next_const(rec); + !page_rec_is_supremum(r) + && rec_get_deleted_flag(r, page_is_compact); + r = page_rec_get_next_const(r)) { + /* noop */ + } + + return(r); +} + +/************************************************************//** +Sets the pointer to the next record on the page. 
*/ +UNIV_INLINE +void +page_rec_set_next( +/*==============*/ + rec_t* rec, /*!< in: pointer to record, + must not be page supremum */ + const rec_t* next) /*!< in: pointer to next record, + must not be page infimum */ +{ + ulint offs; + + ut_ad(page_rec_check(rec)); + ut_ad(!page_rec_is_supremum(rec)); + ut_ad(rec != next); + + ut_ad(!next || !page_rec_is_infimum(next)); + ut_ad(!next || page_align(rec) == page_align(next)); + + offs = next != NULL ? page_offset(next) : 0; + + if (page_rec_is_comp(rec)) { + rec_set_next_offs_new(rec, offs); + } else { + rec_set_next_offs_old(rec, offs); + } +} + +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + const rec_t* rec) /*!< in: pointer to record, must not be page + infimum */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + const rec_t* rec2; + const rec_t* prev_rec = NULL; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + ut_ad(!page_rec_is_infimum(rec)); + + slot_no = page_dir_find_owner_slot(rec); + + ut_a(slot_no != 0); + + slot = page_dir_get_nth_slot(page, slot_no - 1); + + rec2 = page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, TRUE); + } + } else { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, FALSE); + } + } + + ut_a(prev_rec); + + return(prev_rec); +} + +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + rec_t* rec) /*!< in: pointer to record, must not be page + infimum */ +{ + return((rec_t*) page_rec_get_prev_const(rec)); +} + +/***************************************************************//** +Looks for the record which owns the given record. +@return the owner record */ +UNIV_INLINE +rec_t* +page_rec_find_owner_rec( +/*====================*/ + rec_t* rec) /*!< in: the physical record */ +{ + ut_ad(page_rec_check(rec)); + + if (page_rec_is_comp(rec)) { + while (rec_get_n_owned_new(rec) == 0) { + rec = page_rec_get_next(rec); + } + } else { + while (rec_get_n_owned_old(rec) == 0) { + rec = page_rec_get_next(rec); + } + } + + return(rec); +} + +/**********************************************************//** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. +@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + const rec_t* rec) /*!< in: physical record */ +{ +#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES +# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES" +#endif + return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec)); +} + +/************************************************************//** +Returns the sum of the sizes of the records in the record list, excluding +the infimum and supremum records. +@return data in bytes */ +UNIV_INLINE +ulint +page_get_data_size( +/*===============*/ + const page_t* page) /*!< in: index page */ +{ + ulint ret; + + ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP) + - (page_is_comp(page) + ? 
PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE));
+
+ ut_ad(ret < UNIV_PAGE_SIZE);
+
+ return(ret);
+}
+
+
+/************************************************************//**
+Allocates a block of memory from the free list of an index page. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ rec_t* next_rec,/*!< in: pointer to the new head of the
+ free record list */
+ ulint need) /*!< in: number of bytes allocated */
+{
+ ulint garbage;
+
+#ifdef UNIV_DEBUG
+ const rec_t* old_rec = page_header_get_ptr(page, PAGE_FREE);
+ ulint next_offs;
+
+ ut_ad(old_rec);
+ next_offs = rec_get_next_offs(old_rec, page_is_comp(page));
+ ut_ad(next_rec == (next_offs ? page + next_offs : NULL));
+#endif
+
+ page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+ ut_ad(garbage >= need);
+
+ page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need);
+}
+
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ if (comp) {
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_NEW_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+ }
+
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_OLD_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Write a 32-bit field in a data dictionary record. */
+UNIV_INLINE
+void
+page_rec_write_field(
+/*=================*/
+ rec_t* rec, /*!< in/out: record to update */
+ ulint i, /*!< in: index of the field to update */
+ ulint val, /*!< in: value to write */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ byte* data;
+ ulint len;
+
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ ut_ad(len == 4);
+
+ mlog_write_ulint(data, val, MLOG_4BYTES, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************//**
+Each user record on a page, including each deleted user record in the
+heap, takes up its own size plus a fraction of a directory slot:
+PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED bytes. If the sum of
+these exceeds the value of page_get_free_space_of_empty(), the insert
+is impossible; otherwise it is allowed. This function returns the
+maximum combined size of records which can be inserted on top of the
+record heap. 
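+For example, assuming the usual constants PAGE_DIR_SLOT_SIZE == 2 and
+PAGE_DIR_SLOT_MIN_N_OWNED == 4, inserting 10 records reserves
+page_dir_calc_reserved_space(10) == (2 * 10 + 4 - 1) / 4 == 5 bytes of
+directory space in addition to the record sizes themselves.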
+@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs) /*!< in: number of records */ +{ + ulint occupied; + ulint free_space; + + if (page_is_comp(page)) { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_NEW_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(TRUE); + } else { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_OLD_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(FALSE); + } + + /* Above the 'n_recs +' part reserves directory space for the new + inserted records; the '- 2' excludes page infimum and supremum + records */ + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of the record heap if a page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs) /*!< in: number of records */ +{ + ulint occupied; + ulint free_space; + + occupied = page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page)); + + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/************************************************************//** +Puts a record to free list. */ +UNIV_INLINE +void +page_mem_free( +/*==========*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + rec_t* rec, /*!< in: pointer to the + (origin of) record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets) /*!< in: array returned by + rec_get_offsets() */ +{ + rec_t* free; + ulint garbage; + + ut_ad(rec_offs_validate(rec, index, offsets)); + free = page_header_get_ptr(page, PAGE_FREE); + + page_rec_set_next(rec, free); + page_header_set_ptr(page, page_zip, PAGE_FREE, rec); + + garbage = page_header_get_field(page, PAGE_GARBAGE); + + page_header_set_field(page, page_zip, PAGE_GARBAGE, + garbage + rec_offs_size(offsets)); + + if (page_zip) { + page_zip_dir_delete(page_zip, rec, index, offsets, free); + } else { + page_header_set_field(page, page_zip, PAGE_N_RECS, + page_get_n_recs(page) - 1); + } +} + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/xtradb/include/page0types.h b/storage/xtradb/include/page0types.h new file mode 100644 index 00000000000..95143a4bb44 --- /dev/null +++ b/storage/xtradb/include/page0types.h @@ -0,0 +1,169 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0types.h +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0types_h +#define page0types_h + +using namespace std; + +#include <map> + +#include "univ.i" +#include "dict0types.h" +#include "mtr0types.h" + +/** Eliminates a name collision on HP-UX */ +#define page_t ib_page_t +/** Type of the index page */ +typedef byte page_t; +/** Index page cursor */ +struct page_cur_t; + +/** Compressed index page */ +typedef byte page_zip_t; + +/* The following definitions would better belong to page0zip.h, +but we cannot include page0zip.h from rem0rec.ic, because +page0*.h includes rem0rec.h and may include rem0rec.ic. */ + +/** Number of bits needed for representing different compressed page sizes */ +#define PAGE_ZIP_SSIZE_BITS 3 + +/** Maximum compressed page shift size */ +#define PAGE_ZIP_SSIZE_MAX \ + (UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1) + +/* Make sure there are enough bits available to store the maximum zip +ssize, which is the number of shifts from 512. */ +#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS) +# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)" +#endif + +/** Compressed page descriptor */ +struct page_zip_des_t +{ + page_zip_t* data; /*!< compressed page data */ + +#ifdef UNIV_DEBUG + unsigned m_start:16; /*!< start offset of modification log */ + bool m_external; /*!< Allocated externally, not from the + buffer pool */ +#endif /* UNIV_DEBUG */ + unsigned m_end:16; /*!< end offset of modification log */ + unsigned m_nonempty:1; /*!< TRUE if the modification log + is not empty */ + unsigned n_blobs:12; /*!< number of externally stored + columns on the page; the maximum + is 744 on a 16 KiB page */ + unsigned ssize:PAGE_ZIP_SSIZE_BITS; + /*!< 0 or compressed page shift size; + the size in bytes is + (UNIV_ZIP_SIZE_MIN >> 1) << ssize. */ +}; + +/** Compression statistics for a given page size */ +struct page_zip_stat_t { + /** Number of page compressions */ + ulint compressed; + /** Number of successful page compressions */ + ulint compressed_ok; + /** Number of page decompressions */ + ulint decompressed; + /** Duration of page compressions in microseconds */ + ib_uint64_t compressed_usec; + /** Duration of page decompressions in microseconds */ + ib_uint64_t decompressed_usec; + page_zip_stat_t() : + /* Initialize members to 0 so that when we do + stlmap[key].compressed++ and element with "key" does not + exist it gets inserted with zeroed members. 
*/ + compressed(0), + compressed_ok(0), + decompressed(0), + compressed_usec(0), + decompressed_usec(0) + { } +}; + +/** Compression statistics types */ +typedef map<index_id_t, page_zip_stat_t> page_zip_stat_per_index_t; + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by dict_index_t::id */ +extern page_zip_stat_per_index_t page_zip_stat_per_index; +extern ib_mutex_t page_zip_stat_per_index_mutex; +#ifdef HAVE_PSI_INTERFACE +extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ + +/**********************************************************************//** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Shift the dense page directory when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of the free list */ + __attribute__((nonnull(1,2,3,4))); + +/**********************************************************************//** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint is_clustered) /*!< in: nonzero for clustered index, + zero for others */ + __attribute__((nonnull)); +#endif diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h new file mode 100644 index 00000000000..2f9efc4a40c --- /dev/null +++ b/storage/xtradb/include/page0zip.h @@ -0,0 +1,538 @@ +/***************************************************************************** + +Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0zip.h +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifndef page0zip_h +#define page0zip_h + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "mtr0types.h" +#include "page0types.h" +#include "buf0types.h" +#include "dict0types.h" +#include "srv0srv.h" +#include "trx0types.h" +#include "mem0mem.h" + +/* Compression level to be used by zlib. Settable by user. */ +extern uint page_zip_level; + +/* Default compression level. */ +#define DEFAULT_COMPRESSION_LEVEL 6 + +/* Whether or not to log compressed page images to avoid possible +compression algorithm changes in zlib. */ +extern my_bool page_zip_log_pages; + +/**********************************************************************//** +Determine the size of a compressed page in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +page_zip_get_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ + __attribute__((nonnull, pure)); +/**********************************************************************//** +Set the size of a compressed page in bytes. */ +UNIV_INLINE +void +page_zip_set_size( +/*==============*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint size); /*!< in: size in bytes */ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Determine if a record is so big that it needs to be stored externally. +@return FALSE if the entire record can be stored locally on the page */ +UNIV_INLINE +ibool +page_zip_rec_needs_ext( +/*===================*/ + ulint rec_size, /*!< in: length of the record in bytes */ + ulint comp, /*!< in: nonzero=compact format */ + ulint n_fields, /*!< in: number of fields in the record; + ignored if zip_size == 0 */ + ulint zip_size) /*!< in: compressed page size in bytes, or 0 */ + __attribute__((const)); + +/**********************************************************************//** +Determine the guaranteed free space on an empty page. +@return minimum payload size on the page */ +UNIV_INTERN +ulint +page_zip_empty_size( +/*================*/ + ulint n_fields, /*!< in: number of columns in the index */ + ulint zip_size) /*!< in: compressed page size in bytes */ + __attribute__((const)); +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip); /*!< in/out: compressed page + descriptor */ + +/**********************************************************************//** +Configure the zlib allocator to use the given memory heap. */ +UNIV_INTERN +void +page_zip_set_alloc( +/*===============*/ + void* stream, /*!< in/out: zlib stream */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/**********************************************************************//** +Compress a page. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure. 
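+If compression fails, the caller can reorganize the page and retry;
+see page_zip_reorganize() below.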
*/
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+ page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint level, /*!< in: compression level */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,3)));
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+ __attribute__((nonnull(1,2)));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip); /*!< in: compressed page
+ descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+ __attribute__((nonnull(1,2)));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+ __attribute__((nonnull(1,2)));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+ __attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log. 
+@return TRUE if page_zip_write_rec() will succeed */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust,/*!< in: TRUE if clustered index */ + ulint length, /*!< in: combined size of the record */ + ulint create) /*!< in: nonzero=add the record to + the heap */ + __attribute__((nonnull, pure)); + +/**********************************************************************//** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* str, /*!< in: address on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record being written */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint create) /*!< in: nonzero=insert, zero=update */ + __attribute__((nonnull)); + +/***********************************************************//** +Parses a log record of writing a BLOB pointer of a record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_blob_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip);/*!< in/out: compressed page */ + +/**********************************************************************//** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in/out: record whose data is being + written */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint n, /*!< in: column index */ + mtr_t* mtr) /*!< in: mini-transaction handle, + or NULL if no logging is needed */ + __attribute__((nonnull(1,2,3,4))); + +/***********************************************************//** +Parses a log record of writing the node pointer of a record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_node_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip);/*!< in/out: compressed page */ + +/**********************************************************************//** +Write the node pointer of a record on a non-leaf compressed page. 
*/ +UNIV_INTERN +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + ulint size, /*!< in: data size of rec */ + ulint ptr, /*!< in: node pointer */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */ +UNIV_INTERN +void +page_zip_write_trx_id_and_roll_ptr( +/*===============================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint trx_id_col,/*!< in: column number of TRX_ID in rec */ + trx_id_t trx_id, /*!< in: transaction identifier */ + roll_ptr_t roll_ptr)/*!< in: roll_ptr */ + __attribute__((nonnull)); + +/**********************************************************************//** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Insert a record to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_insert( +/*================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* prev_rec,/*!< in: record after which to insert */ + const byte* free_rec,/*!< in: record from which rec was + allocated, or NULL */ + byte* rec); /*!< in: record to insert */ + +/**********************************************************************//** +Shift the dense page directory and the array of BLOB pointers +when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of + the free list */ + __attribute__((nonnull(1,2,3,4))); + +/**********************************************************************//** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint is_clustered) /*!< in: nonzero for clustered index, + zero for others */ + __attribute__((nonnull)); + +/***********************************************************//** +Parses a log record of writing to the header of a page. 
+@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_header( +/*========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip);/*!< in/out: compressed page */ + +/**********************************************************************//** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_low(). */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* str, /*!< in: address on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure, but page will be overwritten. */ +UNIV_INTERN +ibool +page_zip_reorganize( +/*================*/ + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. */ +UNIV_INTERN +void +page_zip_copy_recs( +/*===============*/ + page_zip_des_t* page_zip, /*!< out: copy of src_zip + (n_blobs, m_start, m_end, + m_nonempty, data[0..size-1]) */ + page_t* page, /*!< out: copy of src */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Parses a log record of compressing an index page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_compress( +/*====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< out: uncompressed page */ + page_zip_des_t* page_zip)/*!< out: compressed page */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Calculate the compressed page checksum. 
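+The algo argument takes one of the srv_checksum_algorithm_t values
+declared in srv0srv.h (for example SRV_CHECKSUM_ALGORITHM_CRC32 or
+SRV_CHECKSUM_ALGORITHM_INNODB, mirroring the innodb_checksum_algorithm
+setting).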
+@return page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+ const void* data, /*!< in: compressed page */
+ ulint size, /*!< in: size of compressed page */
+ srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+ const void* data, /*!< in: compressed page */
+ ulint size); /*!< in: size of compressed page */
+/**********************************************************************//**
+Write a log record of compressing an index page without the data on the page. */
+UNIV_INLINE
+void
+page_zip_compress_write_log_no_data(
+/*================================*/
+ ulint level, /*!< in: compression level */
+ const page_t* page, /*!< in: page that is compressed */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Parses a log record of compressing an index page without the data.
+@return end of log record or NULL */
+UNIV_INLINE
+byte*
+page_zip_parse_compress_no_data(
+/*============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr, /*!< in: buffer end */
+ page_t* page, /*!< in: uncompressed page */
+ page_zip_des_t* page_zip, /*!< out: compressed page */
+ dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index();
+/*===========================*/
+
+#ifndef UNIV_HOTBACKUP
+/** Check if a pointer to an uncompressed page matches a compressed page.
+When we IMPORT a tablespace, the blocks and accompanying frames are
+allocated from outside the buffer pool.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (((page_zip)->m_external \
+ && (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)) \
+ || buf_frame_get_page_zip(ptr) == (page_zip))
+#else /* !UNIV_HOTBACKUP */
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+
+#endif /* page0zip_h */
diff --git a/storage/xtradb/include/page0zip.ic b/storage/xtradb/include/page0zip.ic
new file mode 100644
index 00000000000..6c7d8cd32c7
--- /dev/null
+++ b/storage/xtradb/include/page0zip.ic
@@ -0,0 +1,456 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0zip.ic +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "page0zip.h" +#include "mtr0log.h" +#include "page0page.h" + +/* The format of compressed pages is as follows. + +The header and trailer of the uncompressed pages, excluding the page +directory in the trailer, are copied as is to the header and trailer +of the compressed page. + +At the end of the compressed page, there is a dense page directory +pointing to every user record contained on the page, including deleted +records on the free list. The dense directory is indexed in the +collation order, i.e., in the order in which the record list is +linked on the uncompressed page. The infimum and supremum records are +excluded. The two most significant bits of the entries are allocated +for the delete-mark and an n_owned flag indicating the last record in +a chain of records pointed to from the sparse page directory on the +uncompressed page. + +The data between PAGE_ZIP_START and the last page directory entry will +be written in compressed format, starting at offset PAGE_DATA. +Infimum and supremum records are not stored. We exclude the +REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered +from the dense page directory stored at the end of the compressed +page. + +The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and +roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of +externally stored columns are stored separately, in ascending order of +heap_no and column index, starting backwards from the dense page +directory. + +The compressed data stream may be followed by a modification log +covering the compressed portion of the page, as follows. + +MODIFICATION LOG ENTRY FORMAT +- write record: + - (heap_no - 1) << 1 (1..2 bytes) + - extra bytes backwards + - data bytes +- clear record: + - (heap_no - 1) << 1 | 1 (1..2 bytes) + +The integer values are stored in a variable-length format: +- 0xxxxxxx: 0..127 +- 1xxxxxxx xxxxxxxx: 0..32767 + +The end of the modification log is marked by a 0 byte. 
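+For example, rewriting the record at heap_no 3 is logged as the byte
+(3 - 1) << 1 == 0x04, followed by the record's extra bytes backwards
+and then its data bytes, while clearing that record is the single byte
+(3 - 1) << 1 | 1 == 0x05. A value that does not fit in 7 bits uses the
+two-byte form with the high bit of the first byte set: for heap_no 200,
+(200 - 1) << 1 == 398 == 0x18e would be stored as 0x81 0x8e.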
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+ - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+ - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+ - indexed by heap_no
+ - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+ - REC_NODE_PTR_SIZE for non-leaf pages
+ - 0 otherwise
+(8) dense page directory, stored backwards
+ - n_dense = n_heap - 2
+ - existing records in ascending collation order
+ - deleted records (free list) in link order
+*/
+
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE 2
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK 0x3fff
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL 0x8000
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size;
+
+ if (!page_zip->ssize) {
+ return(0);
+ }
+
+ size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
+
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= UNIV_PAGE_SIZE);
+
+ return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size) /*!< in: size in bytes */
+{
+ if (size) {
+ int ssize;
+
+ ut_ad(ut_is_2pow(size));
+
+ for (ssize = 1; size > (ulint) (512 << ssize); ssize++) {
+ }
+
+ page_zip->ssize = ssize;
+ } else {
+ page_zip->ssize = 0;
+ }
+
+ ut_ad(page_zip_get_size(page_zip) == size);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+ ulint rec_size, /*!< in: length of the record in bytes */
+ ulint comp, /*!< in: nonzero=compact format */
+ ulint n_fields, /*!< in: number of fields in the record;
+ ignored if zip_size == 0 */
+ ulint zip_size) /*!< in: compressed page size in bytes, or 0 */
+{
+ ut_ad(rec_size > (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE_MAX > REC_MAX_DATA_SIZE
+ if (rec_size >= REC_MAX_DATA_SIZE) {
+ return(TRUE);
+ }
+#endif
+
+ if (zip_size) {
+ ut_ad(comp);
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. There should be enough room for
+ one record on an empty leaf page. Subtract 1 byte for
+ the encoded heap number. Check also the available space
+ on the uncompressed page. 
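+ With REC_N_NEW_EXTRA_BYTES == 5, the first test below
+ therefore reduces to rec_size - 2 >= page_zip_empty_size():
+ the 5-byte compact record header is not stored, but the
+ 2-byte directory entry and roughly 1 byte of encoded heap
+ number come back out of the saving. 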
*/
+ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
+ >= page_zip_empty_size(n_fields, zip_size)
+ || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+ }
+
+ return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */
+{
+ ut_ad(page_zip);
+ ut_ad(page_zip->data);
+ ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+ ut_ad(page_zip->m_start <= page_zip->m_end);
+ ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+ ut_ad(page_zip->n_blobs
+ < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ulint
+page_zip_get_trailer_len(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint uncompressed_size;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(page_zip->data)) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + REC_NODE_PTR_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ } else if (is_clust) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ }
+
+ return((page_dir_get_n_heap(page_zip->data) - 2)
+ * uncompressed_size
+ + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint trailer_len;
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+
+ return((lint) page_zip_get_size(page_zip)
+ - trailer_len - page_zip->m_end
+ - (REC_N_NEW_EXTRA_BYTES - 2));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log. 
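+In other words, page_zip_write_rec() will fit if
+(length - (REC_N_NEW_EXTRA_BYTES - 2)) + trailer_len + m_end
+< page_zip_get_size(page_zip), i.e. the encoded record and the page
+trailer must both fit between the current end of the modification log
+and the end of the compressed block.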
+@return TRUE if enough space is available */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust,/*!< in: TRUE if clustered index */ + ulint length, /*!< in: combined size of the record */ + ulint create) /*!< in: nonzero=add the record to + the heap */ +{ + ulint trailer_len; + + ut_ad(length > REC_N_NEW_EXTRA_BYTES); + + trailer_len = page_zip_get_trailer_len(page_zip, is_clust); + + /* Subtract the fixed extra bytes and add the maximum + space needed for identifying the record (encoded heap_no). */ + length -= REC_N_NEW_EXTRA_BYTES - 2; + + if (create > 0) { + /* When a record is created, a pointer may be added to + the dense directory. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. */ + + trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; + } + + return(length + trailer_len + page_zip->m_end + < page_zip_get_size(page_zip)); +} + +/**********************************************************************//** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip) /*!< in/out: compressed page + descriptor */ +{ + memset(page_zip, 0, sizeof *page_zip); +} + +/**********************************************************************//** +Write a log record of writing to the uncompressed header portion of a page. */ +UNIV_INTERN +void +page_zip_write_header_log( +/*======================*/ + const byte* data,/*!< in: data on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/**********************************************************************//** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_zip(). */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* str, /*!< in: address on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ +{ + ulint pos; + + ut_ad(PAGE_ZIP_MATCH(str, page_zip)); + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + pos = page_offset(str); + + ut_ad(pos < PAGE_DATA); + + memcpy(page_zip->data + pos, str, length); + + /* The following would fail in page_cur_insert_rec_zip(). */ + /* ut_ad(page_zip_validate(page_zip, str - pos)); */ + + if (mtr) { +#ifndef UNIV_HOTBACKUP + page_zip_write_header_log(str, length, mtr); +#endif /* !UNIV_HOTBACKUP */ + } +} + +/**********************************************************************//** +Write a log record of compressing an index page without the data on the page. 
*/ +UNIV_INLINE +void +page_zip_compress_write_log_no_data( +/*================================*/ + ulint level, /*!< in: compression level */ + const page_t* page, /*!< in: page that is compressed */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr = mlog_open_and_write_index( + mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1); + + if (log_ptr) { + mach_write_to_1(log_ptr, level); + mlog_close(mtr, log_ptr + 1); + } +} + +/**********************************************************************//** +Parses a log record of compressing an index page without the data. +@return end of log record or NULL */ +UNIV_INLINE +byte* +page_zip_parse_compress_no_data( +/*============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr, /*!< in: buffer end */ + page_t* page, /*!< in: uncompressed page */ + page_zip_des_t* page_zip, /*!< out: compressed page */ + dict_index_t* index) /*!< in: index */ +{ + ulint level; + if (end_ptr == ptr) { + return(NULL); + } + + level = mach_read_from_1(ptr); + + /* If page compression fails then there must be something wrong + because a compress log record is logged only if the compression + was successful. Crash in this case. */ + + if (page + && !page_zip_compress(page_zip, page, index, level, NULL)) { + ut_error; + } + + return(ptr + 1); +} + +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index() +/*===========================*/ +{ + mutex_enter(&page_zip_stat_per_index_mutex); + + page_zip_stat_per_index.erase( + page_zip_stat_per_index.begin(), + page_zip_stat_per_index.end()); + + mutex_exit(&page_zip_stat_per_index_mutex); +} + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/xtradb/include/pars0grm.h b/storage/xtradb/include/pars0grm.h new file mode 100644 index 00000000000..8e725fe9545 --- /dev/null +++ b/storage/xtradb/include/pars0grm.h @@ -0,0 +1,261 @@ +/* A Bison parser, made by GNU Bison 2.3. */ + +/* Skeleton interface for Bison's Yacc-like parsers in C + + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. 
Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + PARS_INT_LIT = 258, + PARS_FLOAT_LIT = 259, + PARS_STR_LIT = 260, + PARS_FIXBINARY_LIT = 261, + PARS_BLOB_LIT = 262, + PARS_NULL_LIT = 263, + PARS_ID_TOKEN = 264, + PARS_AND_TOKEN = 265, + PARS_OR_TOKEN = 266, + PARS_NOT_TOKEN = 267, + PARS_GE_TOKEN = 268, + PARS_LE_TOKEN = 269, + PARS_NE_TOKEN = 270, + PARS_PROCEDURE_TOKEN = 271, + PARS_IN_TOKEN = 272, + PARS_OUT_TOKEN = 273, + PARS_BINARY_TOKEN = 274, + PARS_BLOB_TOKEN = 275, + PARS_INT_TOKEN = 276, + PARS_INTEGER_TOKEN = 277, + PARS_FLOAT_TOKEN = 278, + PARS_CHAR_TOKEN = 279, + PARS_IS_TOKEN = 280, + PARS_BEGIN_TOKEN = 281, + PARS_END_TOKEN = 282, + PARS_IF_TOKEN = 283, + PARS_THEN_TOKEN = 284, + PARS_ELSE_TOKEN = 285, + PARS_ELSIF_TOKEN = 286, + PARS_LOOP_TOKEN = 287, + PARS_WHILE_TOKEN = 288, + PARS_RETURN_TOKEN = 289, + PARS_SELECT_TOKEN = 290, + PARS_SUM_TOKEN = 291, + PARS_COUNT_TOKEN = 292, + PARS_DISTINCT_TOKEN = 293, + PARS_FROM_TOKEN = 294, + PARS_WHERE_TOKEN = 295, + PARS_FOR_TOKEN = 296, + PARS_DDOT_TOKEN = 297, + PARS_READ_TOKEN = 298, + PARS_ORDER_TOKEN = 299, + PARS_BY_TOKEN = 300, + PARS_ASC_TOKEN = 301, + PARS_DESC_TOKEN = 302, + PARS_INSERT_TOKEN = 303, + PARS_INTO_TOKEN = 304, + PARS_VALUES_TOKEN = 305, + PARS_UPDATE_TOKEN = 306, + PARS_SET_TOKEN = 307, + PARS_DELETE_TOKEN = 308, + PARS_CURRENT_TOKEN = 309, + PARS_OF_TOKEN = 310, + PARS_CREATE_TOKEN = 311, + PARS_TABLE_TOKEN = 312, + PARS_INDEX_TOKEN = 313, + PARS_UNIQUE_TOKEN = 314, + PARS_CLUSTERED_TOKEN = 315, + PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316, + PARS_ON_TOKEN = 317, + PARS_ASSIGN_TOKEN = 318, + PARS_DECLARE_TOKEN = 319, + PARS_CURSOR_TOKEN = 320, + PARS_SQL_TOKEN = 321, + PARS_OPEN_TOKEN = 322, + PARS_FETCH_TOKEN = 323, + PARS_CLOSE_TOKEN = 324, + PARS_NOTFOUND_TOKEN = 325, + PARS_TO_CHAR_TOKEN = 326, + PARS_TO_NUMBER_TOKEN = 327, + PARS_TO_BINARY_TOKEN = 328, + PARS_BINARY_TO_NUMBER_TOKEN = 329, + PARS_SUBSTR_TOKEN = 330, + PARS_REPLSTR_TOKEN = 331, + PARS_CONCAT_TOKEN = 332, + PARS_INSTR_TOKEN = 333, + PARS_LENGTH_TOKEN = 334, + PARS_SYSDATE_TOKEN = 335, + PARS_PRINTF_TOKEN = 336, + PARS_ASSERT_TOKEN = 337, + PARS_RND_TOKEN = 338, + PARS_RND_STR_TOKEN = 339, + PARS_ROW_PRINTF_TOKEN = 340, + PARS_COMMIT_TOKEN = 341, + PARS_ROLLBACK_TOKEN = 342, + PARS_WORK_TOKEN = 343, + PARS_UNSIGNED_TOKEN = 344, + PARS_EXIT_TOKEN = 345, + PARS_FUNCTION_TOKEN = 346, + PARS_LOCK_TOKEN = 347, + PARS_SHARE_TOKEN = 348, + PARS_MODE_TOKEN = 349, + PARS_LIKE_TOKEN = 350, + PARS_LIKE_TOKEN_EXACT = 351, + PARS_LIKE_TOKEN_PREFIX = 352, + PARS_LIKE_TOKEN_SUFFIX = 353, + PARS_LIKE_TOKEN_SUBSTR = 354, + PARS_TABLE_NAME_TOKEN = 355, + PARS_COMPACT_TOKEN = 356, + PARS_BLOCK_SIZE_TOKEN = 357, + PARS_BIGINT_TOKEN = 358, + NEG = 359 + }; +#endif +/* Tokens. 
*/ +#define PARS_INT_LIT 258 +#define PARS_FLOAT_LIT 259 +#define PARS_STR_LIT 260 +#define PARS_FIXBINARY_LIT 261 +#define PARS_BLOB_LIT 262 +#define PARS_NULL_LIT 263 +#define PARS_ID_TOKEN 264 +#define PARS_AND_TOKEN 265 +#define PARS_OR_TOKEN 266 +#define PARS_NOT_TOKEN 267 +#define PARS_GE_TOKEN 268 +#define PARS_LE_TOKEN 269 +#define PARS_NE_TOKEN 270 +#define PARS_PROCEDURE_TOKEN 271 +#define PARS_IN_TOKEN 272 +#define PARS_OUT_TOKEN 273 +#define PARS_BINARY_TOKEN 274 +#define PARS_BLOB_TOKEN 275 +#define PARS_INT_TOKEN 276 +#define PARS_INTEGER_TOKEN 277 +#define PARS_FLOAT_TOKEN 278 +#define PARS_CHAR_TOKEN 279 +#define PARS_IS_TOKEN 280 +#define PARS_BEGIN_TOKEN 281 +#define PARS_END_TOKEN 282 +#define PARS_IF_TOKEN 283 +#define PARS_THEN_TOKEN 284 +#define PARS_ELSE_TOKEN 285 +#define PARS_ELSIF_TOKEN 286 +#define PARS_LOOP_TOKEN 287 +#define PARS_WHILE_TOKEN 288 +#define PARS_RETURN_TOKEN 289 +#define PARS_SELECT_TOKEN 290 +#define PARS_SUM_TOKEN 291 +#define PARS_COUNT_TOKEN 292 +#define PARS_DISTINCT_TOKEN 293 +#define PARS_FROM_TOKEN 294 +#define PARS_WHERE_TOKEN 295 +#define PARS_FOR_TOKEN 296 +#define PARS_DDOT_TOKEN 297 +#define PARS_READ_TOKEN 298 +#define PARS_ORDER_TOKEN 299 +#define PARS_BY_TOKEN 300 +#define PARS_ASC_TOKEN 301 +#define PARS_DESC_TOKEN 302 +#define PARS_INSERT_TOKEN 303 +#define PARS_INTO_TOKEN 304 +#define PARS_VALUES_TOKEN 305 +#define PARS_UPDATE_TOKEN 306 +#define PARS_SET_TOKEN 307 +#define PARS_DELETE_TOKEN 308 +#define PARS_CURRENT_TOKEN 309 +#define PARS_OF_TOKEN 310 +#define PARS_CREATE_TOKEN 311 +#define PARS_TABLE_TOKEN 312 +#define PARS_INDEX_TOKEN 313 +#define PARS_UNIQUE_TOKEN 314 +#define PARS_CLUSTERED_TOKEN 315 +#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316 +#define PARS_ON_TOKEN 317 +#define PARS_ASSIGN_TOKEN 318 +#define PARS_DECLARE_TOKEN 319 +#define PARS_CURSOR_TOKEN 320 +#define PARS_SQL_TOKEN 321 +#define PARS_OPEN_TOKEN 322 +#define PARS_FETCH_TOKEN 323 +#define PARS_CLOSE_TOKEN 324 +#define PARS_NOTFOUND_TOKEN 325 +#define PARS_TO_CHAR_TOKEN 326 +#define PARS_TO_NUMBER_TOKEN 327 +#define PARS_TO_BINARY_TOKEN 328 +#define PARS_BINARY_TO_NUMBER_TOKEN 329 +#define PARS_SUBSTR_TOKEN 330 +#define PARS_REPLSTR_TOKEN 331 +#define PARS_CONCAT_TOKEN 332 +#define PARS_INSTR_TOKEN 333 +#define PARS_LENGTH_TOKEN 334 +#define PARS_SYSDATE_TOKEN 335 +#define PARS_PRINTF_TOKEN 336 +#define PARS_ASSERT_TOKEN 337 +#define PARS_RND_TOKEN 338 +#define PARS_RND_STR_TOKEN 339 +#define PARS_ROW_PRINTF_TOKEN 340 +#define PARS_COMMIT_TOKEN 341 +#define PARS_ROLLBACK_TOKEN 342 +#define PARS_WORK_TOKEN 343 +#define PARS_UNSIGNED_TOKEN 344 +#define PARS_EXIT_TOKEN 345 +#define PARS_FUNCTION_TOKEN 346 +#define PARS_LOCK_TOKEN 347 +#define PARS_SHARE_TOKEN 348 +#define PARS_MODE_TOKEN 349 +#define PARS_LIKE_TOKEN 350 +#define PARS_LIKE_TOKEN_EXACT 351 +#define PARS_LIKE_TOKEN_PREFIX 352 +#define PARS_LIKE_TOKEN_SUFFIX 353 +#define PARS_LIKE_TOKEN_SUBSTR 354 +#define PARS_TABLE_NAME_TOKEN 355 +#define PARS_COMPACT_TOKEN 356 +#define PARS_BLOCK_SIZE_TOKEN 357 +#define PARS_BIGINT_TOKEN 358 +#define NEG 359 + + + + +#if ! defined YYSTYPE && ! 
defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+extern YYSTYPE yylval;
+
diff --git a/storage/xtradb/include/pars0opt.h b/storage/xtradb/include/pars0opt.h
new file mode 100644
index 00000000000..1084d644c90
--- /dev/null
+++ b/storage/xtradb/include/pars0opt.h
@@ -0,0 +1,75 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0sym.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to use for the tables. The tables
+are accessed in the order that they were written to the FROM part of the
+select statement. */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+	sel_node_t*	sel_node);	/*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+	ibool		copy_val,	/*!< in: if TRUE, new found columns are
+					added as columns to copy */
+	dict_index_t*	index,		/*!< in: index to use */
+	sym_node_list_t* col_list,	/*!< in: base node of a list where
+					to add new found columns */
+	plan_t*		plan,		/*!< in: plan or NULL */
+	que_node_t*	exp);		/*!< in: expression or condition */
+/********************************************************************//**
+Prints info of a query plan. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+	sel_node_t*	sel_node);	/*!< in: select node */
+
+#ifndef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/pars0opt.ic b/storage/xtradb/include/pars0opt.ic
new file mode 100644
index 00000000000..786d911ca3d
--- /dev/null
+++ b/storage/xtradb/include/pars0opt.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates.
All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0opt.ic +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/pars0pars.h b/storage/xtradb/include/pars0pars.h new file mode 100644 index 00000000000..65ff7533828 --- /dev/null +++ b/storage/xtradb/include/pars0pars.h @@ -0,0 +1,826 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0pars.h +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ + +#ifndef pars0pars_h +#define pars0pars_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "pars0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" + +/** Type of the user functions. The first argument is always InnoDB-supplied +and varies in type, while 'user_arg' is a user-supplied argument. The +meaning of the return type also varies. See the individual use cases, e.g. +the FETCH statement, for details on them. 
*/ +typedef ibool (*pars_user_func_cb_t)(void* arg, void* user_arg); + +/** If the following is set TRUE, the parser will emit debugging +information */ +extern int yydebug; + +#ifdef UNIV_SQL_DEBUG +/** If the following is set TRUE, the lexer will print the SQL string +as it tokenizes it */ +extern ibool pars_print_lexed; +#endif /* UNIV_SQL_DEBUG */ + +/* Global variable used while parsing a single procedure or query : the code is +NOT re-entrant */ +extern sym_tab_t* pars_sym_tab_global; + +extern pars_res_word_t pars_to_char_token; +extern pars_res_word_t pars_to_number_token; +extern pars_res_word_t pars_to_binary_token; +extern pars_res_word_t pars_binary_to_number_token; +extern pars_res_word_t pars_substr_token; +extern pars_res_word_t pars_replstr_token; +extern pars_res_word_t pars_concat_token; +extern pars_res_word_t pars_length_token; +extern pars_res_word_t pars_instr_token; +extern pars_res_word_t pars_sysdate_token; +extern pars_res_word_t pars_printf_token; +extern pars_res_word_t pars_assert_token; +extern pars_res_word_t pars_rnd_token; +extern pars_res_word_t pars_rnd_str_token; +extern pars_res_word_t pars_count_token; +extern pars_res_word_t pars_sum_token; +extern pars_res_word_t pars_distinct_token; +extern pars_res_word_t pars_binary_token; +extern pars_res_word_t pars_blob_token; +extern pars_res_word_t pars_int_token; +extern pars_res_word_t pars_bigint_token; +extern pars_res_word_t pars_char_token; +extern pars_res_word_t pars_float_token; +extern pars_res_word_t pars_update_token; +extern pars_res_word_t pars_asc_token; +extern pars_res_word_t pars_desc_token; +extern pars_res_word_t pars_open_token; +extern pars_res_word_t pars_close_token; +extern pars_res_word_t pars_share_token; +extern pars_res_word_t pars_unique_token; +extern pars_res_word_t pars_clustered_token; + +extern ulint pars_star_denoter; + +/* Procedure parameter types */ +#define PARS_INPUT 0 +#define PARS_OUTPUT 1 +#define PARS_NOT_PARAM 2 + +int +yyparse(void); + +/*************************************************************//** +Parses an SQL string returning the query graph. +@return own: the query graph */ +UNIV_INTERN +que_t* +pars_sql( +/*=====*/ + pars_info_t* info, /*!< in: extra information, or NULL */ + const char* str); /*!< in: SQL string */ +/*************************************************************//** +Retrieves characters to the lexical analyzer. +@return number of characters copied or 0 on EOF */ +UNIV_INTERN +int +pars_get_lex_chars( +/*===============*/ + char* buf, /*!< in/out: buffer where to copy */ + int max_size); /*!< in: maximum number of characters which fit + in the buffer */ +/*************************************************************//** +Called by yyparse on error. */ +UNIV_INTERN +void +yyerror( +/*====*/ + const char* s); /*!< in: error message string */ +/*********************************************************************//** +Parses a variable declaration. +@return own: symbol table node of type SYM_VAR */ +UNIV_INTERN +sym_node_t* +pars_variable_declaration( +/*======================*/ + sym_node_t* node, /*!< in: symbol table node allocated for the + id of the variable */ + pars_res_word_t* type); /*!< in: pointer to a type token */ +/*********************************************************************//** +Parses a function expression. 
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+	que_node_t*	res_word,/*!< in: function name reserved word */
+	que_node_t*	arg);	/*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return own: function node in a query tree */
+UNIV_INTERN
+int
+pars_like_rebind(
+/*=============*/
+	sym_node_t*	node,	/* in: The search string node.*/
+	const byte*	ptr,	/* in: literal to (re) bind */
+	ulint		len);	/* in: length of literal to (re) bind*/
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+	int		func,	/*!< in: operator token code */
+	que_node_t*	arg1,	/*!< in: first argument */
+	que_node_t*	arg2);	/*!< in: second argument or NULL for a unary
+				operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+	sym_node_t*	column,	/*!< in: column name */
+	pars_res_word_t* asc);	/*!< in: &pars_asc_token or &pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+	que_node_t*	select_list,	/*!< in: select list */
+	sym_node_t*	into_list);	/*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+	sym_node_t*	sym_node,	/*!< in: cursor id node in the symbol
+					table */
+	sel_node_t*	select_node);	/*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+	sym_node_t*	sym_node);	/*!< in: function id node in the symbol
+					table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+	sel_node_t*	select_node,	/*!< in: select node already containing
+					the select list */
+	sym_node_t*	table_list,	/*!< in: table list */
+	que_node_t*	search_cond,	/*!< in: search condition or NULL */
+	pars_res_word_t* for_update,	/*!< in: NULL or &pars_update_token */
+	pars_res_word_t* consistent_read,/*!< in: NULL or
+					&pars_consistent_token */
+	order_node_t*	order_by);	/*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+	sym_node_t*	column,	/*!< in: column to assign */
+	que_node_t*	exp);	/*!< in: value to assign */
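/* Editorial sketch (not part of the patch): the Bison grammar in
pars0grm.y builds the parse tree by composing the constructors above
in its reduce actions.  The hypothetical helper below shows roughly
how  SELECT <list> FROM <tables> WHERE <cond> ORDER BY <col> ASC
is assembled; every argument is a node the grammar has already
reduced, and only functions declared above are used. */

static sel_node_t*
build_select_example(
	que_node_t*	select_list,	/* reduced select list */
	sym_node_t*	table_list,	/* reduced table list */
	que_node_t*	search_cond,	/* e.g. built with pars_op() */
	sym_node_t*	order_col)	/* ORDER BY column */
{
	sel_node_t*	node = pars_select_list(select_list, NULL);

	return(pars_select_statement(
		       node, table_list, search_cond,
		       NULL,	/* no FOR UPDATE clause */
		       NULL,	/* no consistent-read token */
		       pars_order_by(order_col, &pars_asc_token)));
}

+/*********************************************************************//**
+Parses a delete or update statement start.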
+@return own: update node in a query tree */ +UNIV_INTERN +upd_node_t* +pars_update_statement_start( +/*========================*/ + ibool is_delete, /*!< in: TRUE if delete */ + sym_node_t* table_sym, /*!< in: table name node */ + col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL + if delete */ +/*********************************************************************//** +Parses an update or delete statement. +@return own: update node in a query tree */ +UNIV_INTERN +upd_node_t* +pars_update_statement( +/*==================*/ + upd_node_t* node, /*!< in: update node */ + sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in + the symbol table or NULL */ + que_node_t* search_cond); /*!< in: search condition or NULL */ +/*********************************************************************//** +Parses an insert statement. +@return own: update node in a query tree */ +UNIV_INTERN +ins_node_t* +pars_insert_statement( +/*==================*/ + sym_node_t* table_sym, /*!< in: table name node */ + que_node_t* values_list, /*!< in: value expression list or NULL */ + sel_node_t* select); /*!< in: select condition or NULL */ +/*********************************************************************//** +Parses a procedure parameter declaration. +@return own: symbol table node of type SYM_VAR */ +UNIV_INTERN +sym_node_t* +pars_parameter_declaration( +/*=======================*/ + sym_node_t* node, /*!< in: symbol table node allocated for the + id of the parameter */ + ulint param_type, + /*!< in: PARS_INPUT or PARS_OUTPUT */ + pars_res_word_t* type); /*!< in: pointer to a type token */ +/*********************************************************************//** +Parses an elsif element. +@return elsif node */ +UNIV_INTERN +elsif_node_t* +pars_elsif_element( +/*===============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an if-statement. +@return if-statement node */ +UNIV_INTERN +if_node_t* +pars_if_statement( +/*==============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list, /*!< in: statement list */ + que_node_t* else_part); /*!< in: else-part statement list */ +/*********************************************************************//** +Parses a for-loop-statement. +@return for-statement node */ +UNIV_INTERN +for_node_t* +pars_for_statement( +/*===============*/ + sym_node_t* loop_var, /*!< in: loop variable */ + que_node_t* loop_start_limit,/*!< in: loop start expression */ + que_node_t* loop_end_limit, /*!< in: loop end expression */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses a while-statement. +@return while-statement node */ +UNIV_INTERN +while_node_t* +pars_while_statement( +/*=================*/ + que_node_t* cond, /*!< in: while-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an exit statement. +@return exit statement node */ +UNIV_INTERN +exit_node_t* +pars_exit_statement(void); +/*=====================*/ +/*********************************************************************//** +Parses a return-statement. 
+@return return-statement node */ +UNIV_INTERN +return_node_t* +pars_return_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a procedure call. +@return function node */ +UNIV_INTERN +func_node_t* +pars_procedure_call( +/*================*/ + que_node_t* res_word,/*!< in: procedure name reserved word */ + que_node_t* args); /*!< in: argument list */ +/*********************************************************************//** +Parses an assignment statement. +@return assignment statement node */ +UNIV_INTERN +assign_node_t* +pars_assignment_statement( +/*======================*/ + sym_node_t* var, /*!< in: variable to assign */ + que_node_t* val); /*!< in: value to assign */ +/*********************************************************************//** +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. +@return fetch statement node */ +UNIV_INTERN +fetch_node_t* +pars_fetch_statement( +/*=================*/ + sym_node_t* cursor, /*!< in: cursor node */ + sym_node_t* into_list, /*!< in: variables to set, or NULL */ + sym_node_t* user_func); /*!< in: user function name, or NULL */ +/*********************************************************************//** +Parses an open or close cursor statement. +@return fetch statement node */ +UNIV_INTERN +open_node_t* +pars_open_statement( +/*================*/ + ulint type, /*!< in: ROW_SEL_OPEN_CURSOR + or ROW_SEL_CLOSE_CURSOR */ + sym_node_t* cursor); /*!< in: cursor node */ +/*********************************************************************//** +Parses a row_printf-statement. +@return row_printf-statement node */ +UNIV_INTERN +row_printf_node_t* +pars_row_printf_statement( +/*======================*/ + sel_node_t* sel_node); /*!< in: select node */ +/*********************************************************************//** +Parses a commit statement. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +pars_commit_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a rollback statement. +@return own: rollback node struct */ +UNIV_INTERN +roll_node_t* +pars_rollback_statement(void); +/*=========================*/ +/*********************************************************************//** +Parses a column definition at a table creation. +@return column sym table node */ +UNIV_INTERN +sym_node_t* +pars_column_def( +/*============*/ + sym_node_t* sym_node, /*!< in: column node in the + symbol table */ + pars_res_word_t* type, /*!< in: data type */ + sym_node_t* len, /*!< in: length of column, or + NULL */ + void* is_unsigned, /*!< in: if not NULL, column + is of type UNSIGNED. */ + void* is_not_null); /*!< in: if not NULL, column + is of type NOT NULL. */ +/*********************************************************************//** +Parses a table creation operation. +@return table create subgraph */ +UNIV_INTERN +tab_node_t* +pars_create_table( +/*==============*/ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_defs, /*!< in: list of column names */ + sym_node_t* compact, /* in: non-NULL if COMPACT table. 
*/ + sym_node_t* block_size, /* in: block size (can be NULL) */ + void* not_fit_in_memory); + /*!< in: a non-NULL pointer means that + this is a table which in simulations + should be simulated as not fitting + in memory; thread is put to sleep + to simulate disk accesses; NOTE that + this flag is not stored to the data + dictionary on disk, and the database + will forget about non-NULL value if + it has to reload the table definition + from disk */ +/*********************************************************************//** +Parses an index creation operation. +@return index create subgraph */ +UNIV_INTERN +ind_node_t* +pars_create_index( +/*==============*/ + pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */ + sym_node_t* index_sym, /*!< in: index name node in the symbol + table */ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_list); /*!< in: list of column names */ +/*********************************************************************//** +Parses a procedure definition. +@return query fork node */ +UNIV_INTERN +que_fork_t* +pars_procedure_definition( +/*======================*/ + sym_node_t* sym_node, /*!< in: procedure id node in the symbol + table */ + sym_node_t* param_list, /*!< in: parameter declaration list */ + que_node_t* stat_list); /*!< in: statement list */ + +/*************************************************************//** +Parses a stored procedure call, when this is not within another stored +procedure, that is, the client issues a procedure call directly. +In MySQL/InnoDB, stored InnoDB procedures are invoked via the +parsed procedure tree, not via InnoDB SQL, so this function is not used. +@return query graph */ +UNIV_INTERN +que_fork_t* +pars_stored_procedure_call( +/*=======================*/ + sym_node_t* sym_node); /*!< in: stored procedure name */ +/******************************************************************//** +Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. The fork created is of +type QUE_FORK_MYSQL_INTERFACE. +@return query thread node to run */ +UNIV_INTERN +que_thr_t* +pars_complete_graph_for_exec( +/*=========================*/ + que_node_t* node, /*!< in: root node for an incomplete + query graph, or NULL for dummy graph */ + trx_t* trx, /*!< in: transaction handle */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((nonnull(2,3), warn_unused_result)); + +/****************************************************************//** +Create parser info struct. +@return own: info struct */ +UNIV_INTERN +pars_info_t* +pars_info_create(void); +/*==================*/ + +/****************************************************************//** +Free info struct and everything it contains. */ +UNIV_INTERN +void +pars_info_free( +/*===========*/ + pars_info_t* info); /*!< in, own: info struct */ + +/****************************************************************//** +Add bound literal. */ +UNIV_INTERN +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const void* address, /*!< in: address */ + ulint length, /*!< in: length of data */ + ulint type, /*!< in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /*!< in: precise type, e.g. 
+ DATA_UNSIGNED */ + +/****************************************************************//** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). */ +UNIV_INTERN +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* str); /*!< in: string */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +UNIV_INTERN +void +pars_info_bind_literal( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /* in: precise type, e.g. */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +UNIV_INTERN +void +pars_info_bind_varchar_literal( +/*===========================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const byte* str, /*!< in: string */ + ulint str_len); /*!< in: string length */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_bind_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint32_t* val); /*!< in: value */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +UNIV_INTERN +void +pars_info_bind_int8_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val); /*!< in: value */ +/****************************************************************//** +Add user function. */ +UNIV_INTERN +void +pars_info_bind_function( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: function name */ + pars_user_func_cb_t func, /*!< in: function address */ + void* arg); /*!< in: user-supplied argument */ +/****************************************************************//** +Add bound id. */ +UNIV_INTERN +void +pars_info_bind_id( +/*=============*/ + pars_info_t* info, /*!< in: info struct */ + ibool copy_name,/* in: make a copy of name if TRUE */ + const char* name, /*!< in: name */ + const char* id); /*!< in: id */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + lint val); /*!< in: value */ + +/****************************************************************//** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. 
*/
+UNIV_INTERN
+void
+pars_info_add_ull_literal(
+/*======================*/
+	pars_info_t*	info,	/*!< in: info struct */
+	const char*	name,	/*!< in: name */
+	ib_uint64_t	val);	/*!< in: value */
+
+/****************************************************************//**
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: name */
+	const ib_uint64_t*	val)	/*!< in: value */
+	__attribute__((nonnull));
+
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_add_id(
+/*=============*/
+	pars_info_t*	info,	/*!< in: info struct */
+	const char*	name,	/*!< in: name */
+	const char*	id);	/*!< in: id */
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+	pars_info_t*	info,	/*!< in: info struct */
+	const char*	name);	/*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+UNIV_INTERN
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+	pars_info_t*	info,	/*!< in: info struct */
+	const char*	name);	/*!< in: bound id name to find */
+
+/******************************************************************//**
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void);
+/*==================*/
+
+/** Extra information supplied for pars_sql(). */
+struct pars_info_t {
+	mem_heap_t*	heap;		/*!< our own memory heap */
+
+	ib_vector_t*	funcs;		/*!< user functions, or NULL
+					(pars_user_func_t*) */
+	ib_vector_t*	bound_lits;	/*!< bound literals, or NULL
+					(pars_bound_lit_t*) */
+	ib_vector_t*	bound_ids;	/*!< bound ids, or NULL
+					(pars_bound_id_t*) */
+
+	ibool		graph_owns_us;	/*!< if TRUE (which is the default),
+					que_graph_free() will free us */
+};
+
+/** User-supplied function and argument. */
+struct pars_user_func_t {
+	const char*	name;		/*!< function name */
+	pars_user_func_cb_t func;	/*!< function address */
+	void*		arg;		/*!< user-supplied argument */
+};
+
+/** Bound literal. */
+struct pars_bound_lit_t {
+	const char*	name;		/*!< name */
+	const void*	address;	/*!< address */
+	ulint		length;		/*!< length of data */
+	ulint		type;		/*!< type, e.g. DATA_FIXBINARY */
+	ulint		prtype;		/*!< precise type, e.g. DATA_UNSIGNED */
+	sym_node_t*	node;		/*!< symbol node */
+};
+
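/* Editorial sketch (not part of the patch): typical use of the
pars_info_t interface by InnoDB-internal callers.  The literal names,
table name and SQL text below are illustrative only; the named
literals are referenced from the SQL string as :table_name and
:n_cols.  Because graph_owns_us defaults to TRUE, freeing the query
graph also frees the info struct. */

static void
run_internal_sql_example(void)
{
	pars_info_t*	info = pars_info_create();
	que_t*		graph;

	pars_info_add_str_literal(info, "table_name", "test/t1");
	pars_info_add_int4_literal(info, "n_cols", 3);

	graph = pars_sql(info,
			 "PROCEDURE P () IS\n"
			 "BEGIN\n"
			 "UPDATE SYS_TABLES SET N_COLS = :n_cols\n"
			 "WHERE NAME = :table_name;\n"
			 "END;");

	/* ... execute the graph via the que0que.h interface ... */

	que_graph_free(graph);	/* also frees 'info' (graph_owns_us) */
}

+/** Bound identifier.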
*/ +struct pars_bound_id_t { + const char* name; /*!< name */ + const char* id; /*!< identifier */ +}; + +/** Struct used to denote a reserved word in a parsing tree */ +struct pars_res_word_t{ + int code; /*!< the token code for the reserved word from + pars0grm.h */ +}; + +/** A predefined function or operator node in a parsing tree; this construct +is also used for some non-functions like the assignment ':=' */ +struct func_node_t{ + que_common_t common; /*!< type: QUE_NODE_FUNC */ + int func; /*!< token code of the function name */ + ulint fclass; /*!< class of the function */ + que_node_t* args; /*!< argument(s) of the function */ + UT_LIST_NODE_T(func_node_t) cond_list; + /*!< list of comparison conditions; defined + only for comparison operator nodes except, + presently, for OPT_SCROLL_TYPE ones */ + UT_LIST_NODE_T(func_node_t) func_node_list; + /*!< list of function nodes in a parsed + query graph */ +}; + +/** An order-by node in a select */ +struct order_node_t{ + que_common_t common; /*!< type: QUE_NODE_ORDER */ + sym_node_t* column; /*!< order-by column */ + ibool asc; /*!< TRUE if ascending, FALSE if descending */ +}; + +/** Procedure definition node */ +struct proc_node_t{ + que_common_t common; /*!< type: QUE_NODE_PROC */ + sym_node_t* proc_id; /*!< procedure name symbol in the symbol + table of this same procedure */ + sym_node_t* param_list; /*!< input and output parameters */ + que_node_t* stat_list; /*!< statement list */ + sym_tab_t* sym_tab; /*!< symbol table of this procedure */ +}; + +/** elsif-element node */ +struct elsif_node_t{ + que_common_t common; /*!< type: QUE_NODE_ELSIF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** if-statement node */ +struct if_node_t{ + que_common_t common; /*!< type: QUE_NODE_IF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ + que_node_t* else_part; /*!< else-part statement list */ + elsif_node_t* elsif_list; /*!< elsif element list */ +}; + +/** while-statement node */ +struct while_node_t{ + que_common_t common; /*!< type: QUE_NODE_WHILE */ + que_node_t* cond; /*!< while condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** for-loop-statement node */ +struct for_node_t{ + que_common_t common; /*!< type: QUE_NODE_FOR */ + sym_node_t* loop_var; /*!< loop variable: this is the + dereferenced symbol from the + variable declarations, not the + symbol occurrence in the for loop + definition */ + que_node_t* loop_start_limit;/*!< initial value of loop variable */ + que_node_t* loop_end_limit; /*!< end value of loop variable */ + lint loop_end_value; /*!< evaluated value for the end value: + it is calculated only when the loop + is entered, and will not change within + the loop */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** exit statement node */ +struct exit_node_t{ + que_common_t common; /*!< type: QUE_NODE_EXIT */ +}; + +/** return-statement node */ +struct return_node_t{ + que_common_t common; /*!< type: QUE_NODE_RETURN */ +}; + +/** Assignment statement node */ +struct assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */ + sym_node_t* var; /*!< variable to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Column assignment node */ +struct col_assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */ + sym_node_t* col; /*!< column to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Classes of functions */ +/* @{ */ +#define PARS_FUNC_ARITH 1 /*!< +, 
-, *, / */ +#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */ +#define PARS_FUNC_CMP 3 /*!< comparison operators */ +#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */ +#define PARS_FUNC_AGGREGATE 5 /*!< COUNT, DISTINCT, SUM */ +#define PARS_FUNC_OTHER 6 /*!< these are not real functions, + e.g., := */ +/* @} */ + +#ifndef UNIV_NONINL +#include "pars0pars.ic" +#endif + +#endif diff --git a/storage/xtradb/include/pars0pars.ic b/storage/xtradb/include/pars0pars.ic new file mode 100644 index 00000000000..4c88337a265 --- /dev/null +++ b/storage/xtradb/include/pars0pars.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0pars.ic +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/pars0sym.h b/storage/xtradb/include/pars0sym.h new file mode 100644 index 00000000000..bcf73639228 --- /dev/null +++ b/storage/xtradb/include/pars0sym.h @@ -0,0 +1,258 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0sym.h +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0sym_h +#define pars0sym_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "dict0types.h" +#include "pars0types.h" +#include "row0types.h" + +/******************************************************************//** +Creates a symbol table for a single stored procedure or query. 
+@return own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+	mem_heap_t*	heap);	/*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Frees also SQL explicit cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+	sym_tab_t*	sym_tab);	/*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	ulint		val);		/*!< in: integer value */
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const byte*	str,		/*!< in: string with no quotes around
+					it */
+	ulint		len);		/*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name,		/*!< in: name of bound literal */
+	ulint*		lit_type);	/*!< out: type of literal (PARS_*_LIT) */
+/**********************************************************************
+Rebind literal to a node in the symbol table. */
+
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+					/* out: symbol table node */
+	sym_node_t*	node,		/* in: node that is bound to literal*/
+	const void*	address,	/* in: pointer to data */
+	ulint		length);	/* in: length of data */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+	sym_tab_t*	sym_tab);	/*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	byte*		name,		/*!< in: identifier name */
+	ulint		len);		/*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*===========*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name);		/*!< in: name of bound id */
+
+/** Index of sym_node_t::field_nos corresponding to the clustered index */
+#define SYM_CLUST_FIELD_NO	0
+/** Index of sym_node_t::field_nos corresponding to a secondary index */
+#define SYM_SEC_FIELD_NO	1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+	SYM_UNSET,		/*!< Unset entry. */
+	SYM_VAR = 91,		/*!< declared parameter or local
+				variable of a procedure */
+	SYM_IMPLICIT_VAR,	/*!< storage for an intermediate result
+				of a calculation */
+	SYM_LIT,		/*!< literal */
+	SYM_TABLE_REF_COUNTED,	/*!< database table name, ref counted. Must
+				be closed explicitly. */
+	SYM_TABLE,		/*!< database table name */
+	SYM_COLUMN,		/*!< database table column */
+	SYM_CURSOR,		/*!< named cursor */
+	SYM_PROCEDURE_NAME,	/*!< stored procedure name */
+	SYM_INDEX,		/*!< database index name */
+	SYM_FUNCTION		/*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_t{
+	que_common_t	common;		/*!< node type: QUE_NODE_SYMBOL */
+	/* NOTE: if the data field in 'common.val' is not NULL and the symbol
+	table node is not for a temporary column, the memory for the value has
+	been allocated from dynamic memory and it should be freed when the
+	symbol table is discarded */
+
+	/* 'alias' and 'indirection' are almost the same, but not quite.
+	'alias' always points to the primary instance of the variable, while
+	'indirection' does the same only if we should use the primary
+	instance's values for the node's data. This is usually the case, but
+	when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+	t WHERE id = x;"), we copy the values from the primary instance to
+	the cursor's instance so that they are fixed for the duration of the
+	cursor, and set 'indirection' to NULL. If we did not, the value of
+	'x' could change between fetches and things would break horribly.
+
+	TODO: It would be cleaner to make 'indirection' a boolean field and
+	always use 'alias' to refer to the primary node. */
+
+	sym_node_t*	indirection;	/*!< pointer to another symbol table
+					node which contains the value for
+					this node, NULL otherwise */
+	sym_node_t*	alias;		/*!< pointer to another symbol table
+					node for which this node is an
+					alias, NULL otherwise */
+	UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table columns
+					or a list of input variables for an
+					explicit cursor */
+	ibool		copy_val;	/*!< TRUE if a column and its value
+					should be copied to dynamic memory
+					when fetched */
+	ulint		field_nos[2];	/*!< if a column, in the position
+					SYM_CLUST_FIELD_NO is the field
+					number in the clustered index; in
+					the position SYM_SEC_FIELD_NO the
+					field number in the non-clustered
+					index to use first; if not found
+					from the index, then
+					ULINT_UNDEFINED */
+	ibool		resolved;	/*!< TRUE if the meaning of a variable
+					or a column has been resolved; for
+					literals this is always TRUE */
+	enum sym_tab_entry token_type;	/*!< type of the parsed token */
+	const char*	name;		/*!< name of an id */
+	ulint		name_len;	/*!< id name length */
+	dict_table_t*	table;		/*!< table definition if a table id
+					or a column id */
+	ulint		col_no;		/*!< column number if a column */
+	sel_buf_t*	prefetch_buf;	/*!< NULL, or a buffer for cached
+					column values for prefetched rows */
+	sel_node_t*	cursor_def;	/*!< cursor definition select node if
+					a named cursor */
+	ulint		param_type;	/*!< PARS_INPUT, PARS_OUTPUT, or
+					PARS_NOT_PARAM if not a procedure
+					parameter */
+	sym_tab_t*	sym_table;	/*!< back pointer to the symbol table */
+	UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol nodes */
+	sym_node_t*	like_node;	/* LIKE operator node*/
+};
+
+/** Symbol table */
+struct sym_tab_t{
+	que_t*		query_graph;
+				/*!< query graph generated by the
+				parser */
+	const char*	sql_string;
+				/*!< SQL string to parse */
+	size_t		string_len;
+				/*!< SQL string length */
+	int		next_char_pos;
+				/*!< position of the next character in
+				sql_string to give to the lexical
+				analyzer */
+	pars_info_t*	info;	/*!< extra information, or NULL */
+	sym_node_list_t	sym_list;
+				/*!< list of symbol nodes in the symbol
+				table */
+	UT_LIST_BASE_NODE_T(func_node_t)
func_node_list; + /*!< list of function nodes in the + parsed query graph */ + mem_heap_t* heap; /*!< memory heap from which we can + allocate space */ +}; + +#ifndef UNIV_NONINL +#include "pars0sym.ic" +#endif + +#endif diff --git a/storage/xtradb/include/pars0sym.ic b/storage/xtradb/include/pars0sym.ic new file mode 100644 index 00000000000..266c1a6310d --- /dev/null +++ b/storage/xtradb/include/pars0sym.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0sym.ic +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/pars0types.h b/storage/xtradb/include/pars0types.h new file mode 100644 index 00000000000..47f4b432d20 --- /dev/null +++ b/storage/xtradb/include/pars0types.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0types.h +SQL parser global types + +Created 1/11/1998 Heikki Tuuri +*******************************************************/ + +#ifndef pars0types_h +#define pars0types_h + +struct pars_info_t; +struct pars_user_func_t; +struct pars_bound_lit_t; +struct pars_bound_id_t; +struct sym_node_t; +struct sym_tab_t; +struct pars_res_word_t; +struct func_node_t; +struct order_node_t; +struct proc_node_t; +struct elsif_node_t; +struct if_node_t; +struct while_node_t; +struct for_node_t; +struct exit_node_t; +struct return_node_t; +struct assign_node_t; +struct col_assign_node_t; + +typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t; + +#endif diff --git a/storage/xtradb/include/que0que.h b/storage/xtradb/include/que0que.h new file mode 100644 index 00000000000..e5b2a1ba3fc --- /dev/null +++ b/storage/xtradb/include/que0que.h @@ -0,0 +1,531 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.h +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0que_h +#define que0que_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "btr0sea.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "srv0srv.h" +#include "usr0types.h" +#include "que0types.h" +#include "row0types.h" +#include "pars0types.h" + +/* If the following flag is set TRUE, the module will print trace info +of SQL execution in the UNIV_SQL_DEBUG version */ +extern ibool que_trace_on; + +/** Mutex protecting the query threads. */ +extern ib_mutex_t que_thr_mutex; + +/***********************************************************************//** +Creates a query graph fork node. +@return own: fork node */ +UNIV_INTERN +que_fork_t* +que_fork_create( +/*============*/ + que_t* graph, /*!< in: graph, if NULL then this + fork node is assumed to be the + graph root */ + que_node_t* parent, /*!< in: parent node */ + ulint fork_type, /*!< in: fork type */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************************//** +Gets the first thr in a fork. 
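que_fork_create() above and que_thr_create() just below build the two-level shape of every query graph: a fork node owning a list of query thread nodes, all allocated from one memory heap. A toy standalone sketch of that shape, with plain malloc standing in for mem_heap_t and invented minimal types:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical minimal stand-ins for que_fork_t / que_thr_t. */
    struct thr  { struct thr *next; int id; };
    struct fork { struct thr *thrs; int n_thrs; };

    static struct fork *fork_create(void)
    {
        return calloc(1, sizeof(struct fork));
    }

    static struct thr *thr_create(struct fork *parent, int id)
    {
        struct thr *t = malloc(sizeof(*t));
        t->id = id;
        t->next = parent->thrs;     /* real code appends to a UT_LIST */
        parent->thrs = t;
        parent->n_thrs++;
        return t;
    }

    int main(void)
    {
        struct fork *graph = fork_create();
        thr_create(graph, 1);
        thr_create(graph, 2);
        printf("fork has %d query threads\n", graph->n_thrs);

        /* The real graph is freed wholesale by discarding its
           mem_heap_t; with malloc we walk the list instead. */
        while (graph->thrs) {
            struct thr *t = graph->thrs;
            graph->thrs = t->next;
            free(t);
        }
        free(graph);
        return 0;
    }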
*/ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent);/*!< in: parent */ +/***********************************************************************//** +Creates a query graph thread node. +@return own: query thread node */ +UNIV_INTERN +que_thr_t* +que_thr_create( +/*===========*/ + que_fork_t* parent, /*!< in: parent node, i.e., a fork node */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/**********************************************************************//** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations, they are freed in que_graph_free. */ +UNIV_INTERN +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node); /*!< in: query graph node */ +/**********************************************************************//** +Frees a query graph. */ +UNIV_INTERN +void +que_graph_free( +/*===========*/ + que_t* graph); /*!< in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ +/**********************************************************************//** +Stops a query thread if graph or trx is in a state requiring it. The +conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex +has to be reserved. +@return TRUE if stopped */ +UNIV_INTERN +ibool +que_thr_stop( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction. */ +UNIV_INTERN +void +que_thr_move_to_run_state_for_mysql( +/*================================*/ + que_thr_t* thr, /*!< in: an query thread */ + trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL +select, when there is no error or lock wait. */ +UNIV_INTERN +void +que_thr_stop_for_mysql_no_error( +/*============================*/ + que_thr_t* thr, /*!< in: query thread */ + trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The +query thread is stopped and made inactive, except in the case where +it was put to the lock wait state in lock0lock.cc, but the lock has already +been granted or the transaction chosen as a victim in deadlock resolution. */ +UNIV_INTERN +void +que_thr_stop_for_mysql( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Run a query thread. Handles lock waits. 
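que_run_threads() drives a thread until it either completes or parks in a lock wait, and que_thr_end_lock_wait() later makes it runnable again. A toy state-machine sketch of that control flow (the states mirror the QUE_THR_* constants defined later in this file; the step function is invented for illustration):

    #include <stdio.h>

    enum thr_state { THR_RUNNING, THR_LOCK_WAIT, THR_COMPLETED };

    /* One "step" of work: pretend the third step needs a lock. */
    static enum thr_state step(int *pc)
    {
        ++*pc;
        if (*pc == 3) return THR_LOCK_WAIT;
        if (*pc >= 5) return THR_COMPLETED;
        return THR_RUNNING;
    }

    /* Shape of que_run_threads: keep stepping while the thread stays
       runnable; on a lock wait, stop and let a wakeup resume it. */
    int main(void)
    {
        int pc = 0;
        enum thr_state s = THR_RUNNING;

        while (s == THR_RUNNING) s = step(&pc);
        printf("stopped at pc=%d in state %d\n", pc, s);

        if (s == THR_LOCK_WAIT) {   /* que_thr_end_lock_wait analogue */
            s = THR_RUNNING;
            while (s == THR_RUNNING) s = step(&pc);
        }
        printf("finished at pc=%d in state %d\n", pc, s);
        return 0;
    }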
*/ +UNIV_INTERN +void +que_run_threads( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Moves a suspended query thread to the QUE_THR_RUNNING state and release +a worker thread to execute it. This function should be used to end +the wait state of a query thread waiting for a lock or a stored procedure +completion. +@return query thread instance of thread to wakeup or NULL */ +UNIV_INTERN +que_thr_t* +que_thr_end_lock_wait( +/*==================*/ + trx_t* trx); /*!< in: transaction in the + QUE_THR_LOCK_WAIT state */ +/**********************************************************************//** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +UNIV_INTERN +que_thr_t* +que_fork_start_command( +/*===================*/ + que_fork_t* fork); /*!< in: a query fork */ +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Determines if this thread is rolling back an incomplete transaction +in crash recovery. +@return TRUE if thr is rolling back an incomplete transaction in crash +recovery */ +UNIV_INLINE +ibool +thr_is_recv( +/*========*/ + const que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size); /*!< in: size */ +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node); /*!< in: node in a list */ +/*********************************************************************//** +Gets the parent node of a query graph node. 
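All of the que_node_get_* accessors above work on the opaque que_node_t because every concrete node type begins with a que_common_t member; the accessor simply casts and reads the common header. A self-contained sketch of the idiom, with simplified stand-in types:

    #include <stdio.h>

    struct common      { int type; };            /* que_common_t analogue */
    struct select_node { struct common common; int n_rows; };
    struct insert_node { struct common common; int n_vals; };

    /* Analogue of que_node_get_type(): any node can be read through
       the common header because it is the first member of every
       node struct, so the casts below are well defined. */
    static int node_get_type(void *node)         /* que_node_t is void */
    {
        return ((struct common *) node)->type;
    }

    int main(void)
    {
        struct select_node s = { { 6 /* QUE_NODE_SELECT */ }, 0 };
        struct insert_node i = { { 2 /* QUE_NODE_INSERT */ }, 0 };
        printf("%d %d\n", node_get_type(&s), node_get_type(&i));
        return 0;
    }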
+@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node); /*!< in: node */ +/****************************************************************//** +Get the first containing loop node (e.g. while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. +@return containing loop node, or NULL. */ +UNIV_INTERN +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + que_node_t* node); /*!< in: node */ +/*********************************************************************//** +Catenates a query graph node to a list of them, possible empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node); /*!< in: node */ +/************************************************************************* +Get the last node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: node last node from list.*/ + que_node_t* node_list); /* in: node list, or NULL */ +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list); /*!< in: node list, or NULL */ +/**********************************************************************//** +Checks if graph, trx, or session is in a state where the query thread should +be stopped. +@return TRUE if should be stopped; NOTE that if the peek is made +without reserving the trx_t::mutex, then another peek with the mutex +reserved is necessary before deciding the actual stopping */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Returns TRUE if the query graph is for a SELECT statement. +@return TRUE if a select */ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + que_t* graph); /*!< in: graph */ +/**********************************************************************//** +Prints info of an SQL query graph node. */ +UNIV_INTERN +void +que_node_print_info( +/*================*/ + que_node_t* node); /*!< in: query graph node */ +/*********************************************************************//** +Evaluate the given SQL +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +que_eval_sql( +/*=========*/ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql, /*!< in: SQL string */ + ibool reserve_dict_mutex, + /*!< in: if TRUE, acquire/release + dict_sys->mutex around call to pars_sql. */ + trx_t* trx); /*!< in: trx */ + +/**********************************************************************//** +Round robin scheduler. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +UNIV_INTERN +que_thr_t* +que_fork_scheduler_round_robin( +/*===========================*/ + que_fork_t* fork, /*!< in: a query fork */ + que_thr_t* thr); /*!< in: current pos */ + +/*********************************************************************//** +Initialise the query sub-system. */ +UNIV_INTERN +void +que_init(void); +/*==========*/ + +/*********************************************************************//** +Close the query sub-system. 
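que_eval_sql() above is the entry point the rest of the engine uses to run internal SQL. A hedged sketch of a typical call site follows; it assumes the pars_info_create()/pars_info_add_str_literal() helpers declared in pars0pars.h, a caller-supplied trx, and a made-up SYS_FOO table used purely for illustration. This runs inside the server, not standalone:

    /* Sketch only: assumes trx_t* trx and const char* name exist. */
    pars_info_t*	info = pars_info_create();
    dberr_t		err;

    pars_info_add_str_literal(info, "name", name);

    err = que_eval_sql(info,
                       "PROCEDURE P () IS\n"
                       "BEGIN\n"
                       "DELETE FROM SYS_FOO WHERE NAME = :name;\n"
                       "END;\n",
                       TRUE,	/* take dict_sys->mutex around parsing */
                       trx);
    /* In this tree the parsed graph normally takes ownership of info
       and frees it, so the caller does not free it here. */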
*/ +UNIV_INTERN +void +que_close(void); +/*===========*/ + +/* Query graph query thread node: the fields are protected by the +trx_t::mutex with the exceptions named below */ + +struct que_thr_t{ + que_common_t common; /*!< type: QUE_NODE_THR */ + ulint magic_n; /*!< magic number to catch memory + corruption */ + que_node_t* child; /*!< graph child node */ + que_t* graph; /*!< graph where this node belongs */ + ulint state; /*!< state of the query thread */ + ibool is_active; /*!< TRUE if the thread has been set + to the run state in + que_thr_move_to_run_state, but not + deactivated in + que_thr_dec_reference_count */ + /*------------------------------*/ + /* The following fields are private to the OS thread executing the + query thread, and are not protected by any mutex: */ + + que_node_t* run_node; /*!< pointer to the node where the + subgraph down from this node is + currently executed */ + que_node_t* prev_node; /*!< pointer to the node from which + the control came */ + ulint resource; /*!< resource usage of the query thread + thus far */ + ulint lock_state; /*!< lock state of thread (table or + row) */ + struct srv_slot_t* + slot; /* The thread slot in the wait + array in srv_sys_t */ + /*------------------------------*/ + /* The following fields are links for the various lists that + this type can be on. */ + UT_LIST_NODE_T(que_thr_t) + thrs; /*!< list of thread nodes of the fork + node */ + UT_LIST_NODE_T(que_thr_t) + trx_thrs; /*!< lists of threads in wait list of + the trx */ + UT_LIST_NODE_T(que_thr_t) + queue; /*!< list of runnable thread nodes in + the server task queue */ + ulint fk_cascade_depth; /*!< maximum cascading call depth + supported for foreign key constraint + related delete/updates */ +}; + +#define QUE_THR_MAGIC_N 8476583 +#define QUE_THR_MAGIC_FREED 123461526 + +/* Query graph fork node: its fields are protected by the query thread mutex */ +struct que_fork_t{ + que_common_t common; /*!< type: QUE_NODE_FORK */ + que_t* graph; /*!< query graph of this node */ + ulint fork_type; /*!< fork type */ + ulint n_active_thrs; /*!< if this is the root of a graph, the + number query threads that have been + started in que_thr_move_to_run_state + but for which que_thr_dec_refer_count + has not yet been called */ + trx_t* trx; /*!< transaction: this is set only in + the root node */ + ulint state; /*!< state of the fork node */ + que_thr_t* caller; /*!< pointer to a possible calling query + thread */ + UT_LIST_BASE_NODE_T(que_thr_t) + thrs; /*!< list of query threads */ + /*------------------------------*/ + /* The fields in this section are defined only in the root node */ + sym_tab_t* sym_tab; /*!< symbol table of the query, + generated by the parser, or NULL + if the graph was created 'by hand' */ + pars_info_t* info; /*!< info struct, or NULL */ + /* The following cur_... 
fields are relevant only in a select graph */ + + ulint cur_end; /*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START, + QUE_CUR_END */ + ulint cur_pos; /*!< if there are n rows in the result + set, values 0 and n + 1 mean before + first row, or after last row, depending + on cur_end; values 1...n mean a row + index */ + ibool cur_on_row; /*!< TRUE if cursor is on a row, i.e., + it is not before the first row or + after the last row */ + sel_node_t* last_sel_node; /*!< last executed select node, or NULL + if none */ + UT_LIST_NODE_T(que_fork_t) + graphs; /*!< list of query graphs of a session + or a stored procedure */ + /*------------------------------*/ + mem_heap_t* heap; /*!< memory heap where the fork was + created */ + +}; + +/* Query fork (or graph) types */ +#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */ +#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */ +#define QUE_FORK_INSERT 3 +#define QUE_FORK_UPDATE 4 +#define QUE_FORK_ROLLBACK 5 + /* This is really the undo graph used in rollback, + no signal-sending roll_node in this graph */ +#define QUE_FORK_PURGE 6 +#define QUE_FORK_EXECUTE 7 +#define QUE_FORK_PROCEDURE 8 +#define QUE_FORK_PROCEDURE_CALL 9 +#define QUE_FORK_MYSQL_INTERFACE 10 +#define QUE_FORK_RECOVERY 11 + +/* Query fork (or graph) states */ +#define QUE_FORK_ACTIVE 1 +#define QUE_FORK_COMMAND_WAIT 2 +#define QUE_FORK_INVALID 3 +#define QUE_FORK_BEING_FREED 4 + +/* Flag which is ORed to control structure statement node types */ +#define QUE_NODE_CONTROL_STAT 1024 + +/* Query graph node types */ +#define QUE_NODE_LOCK 1 +#define QUE_NODE_INSERT 2 +#define QUE_NODE_UPDATE 4 +#define QUE_NODE_CURSOR 5 +#define QUE_NODE_SELECT 6 +#define QUE_NODE_AGGREGATE 7 +#define QUE_NODE_FORK 8 +#define QUE_NODE_THR 9 +#define QUE_NODE_UNDO 10 +#define QUE_NODE_COMMIT 11 +#define QUE_NODE_ROLLBACK 12 +#define QUE_NODE_PURGE 13 +#define QUE_NODE_CREATE_TABLE 14 +#define QUE_NODE_CREATE_INDEX 15 +#define QUE_NODE_SYMBOL 16 +#define QUE_NODE_RES_WORD 17 +#define QUE_NODE_FUNC 18 +#define QUE_NODE_ORDER 19 +#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_ASSIGNMENT 23 +#define QUE_NODE_FETCH 24 +#define QUE_NODE_OPEN 25 +#define QUE_NODE_COL_ASSIGNMENT 26 +#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_RETURN 28 +#define QUE_NODE_ROW_PRINTF 29 +#define QUE_NODE_ELSIF 30 +#define QUE_NODE_CALL 31 +#define QUE_NODE_EXIT 32 + +/* Query thread states */ +#define QUE_THR_RUNNING 1 +#define QUE_THR_PROCEDURE_WAIT 2 +#define QUE_THR_COMPLETED 3 /* in selects this means that the + thread is at the end of its result set + (or start, in case of a scroll cursor); + in other statements, this means the + thread has done its task */ +#define QUE_THR_COMMAND_WAIT 4 +#define QUE_THR_LOCK_WAIT 5 +#define QUE_THR_SUSPENDED 7 +#define QUE_THR_ERROR 8 + +/* Query thread lock states */ +#define QUE_THR_LOCK_NOLOCK 0 +#define QUE_THR_LOCK_ROW 1 +#define QUE_THR_LOCK_TABLE 2 + +/* From where the cursor position is counted */ +#define QUE_CUR_NOT_DEFINED 1 +#define QUE_CUR_START 2 +#define QUE_CUR_END 3 + +#ifndef UNIV_NONINL +#include "que0que.ic" +#endif + +#endif diff --git a/storage/xtradb/include/que0que.ic b/storage/xtradb/include/que0que.ic new file mode 100644 index 00000000000..eff5a86d958 --- /dev/null +++ b/storage/xtradb/include/que0que.ic @@ -0,0 +1,309 @@ +/***************************************************************************** + +Copyright 
(c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.ic +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#include "usr0sess.h" + +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(thr); + + return(thr->graph->trx); +} + +/*******************************************************************//** +Determines if this thread is rolling back an incomplete transaction +in crash recovery. +@return TRUE if thr is rolling back an incomplete transaction in crash +recovery */ +UNIV_INLINE +ibool +thr_is_recv( +/*========*/ + const que_thr_t* thr) /*!< in: query thread */ +{ + return(trx_is_recv(thr->graph->trx)); +} + +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork) /*!< in: query fork */ +{ + return(UT_LIST_GET_FIRST(fork->thrs)); +} + +/***********************************************************************//** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork) /*!< in: query fork */ +{ + que_thr_t* thr; + + thr = UT_LIST_GET_FIRST(fork->thrs); + + return(thr->child); +} + +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*) node)->type); +} + +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(&(((que_common_t*) node)->val)); +} + +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*) node)->val_buf_size); +} + +/***********************************************************************//** +Sets the value buffer size of a graph node. 
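que_fork_get_first_thr() above is a thin wrapper over UT_LIST_GET_FIRST on the fork's intrusive thread list: the prev/next links live inside que_thr_t itself, so list membership costs no extra allocation. A minimal standalone re-creation of that idea (not the real UT_LIST macros):

    #include <stdio.h>

    struct thr {
        int         id;
        struct thr *prev;   /* links embedded in the element itself */
        struct thr *next;
    };

    struct list { struct thr *first; struct thr *last; };

    static void list_add_last(struct list *l, struct thr *t)
    {
        t->prev = l->last;
        t->next = NULL;
        if (l->last) l->last->next = t; else l->first = t;
        l->last = t;
    }

    int main(void)
    {
        struct thr a = { 1, NULL, NULL }, b = { 2, NULL, NULL };
        struct list thrs = { NULL, NULL };

        list_add_last(&thrs, &a);
        list_add_last(&thrs, &b);

        /* analogue of que_fork_get_first_thr() */
        printf("first thr id = %d\n", thrs.first->id);
        return 0;
    }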
*/ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size) /*!< in: size */ +{ + ut_ad(node); + + ((que_common_t*) node)->val_buf_size = size; +} + +/***********************************************************************//** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent) /*!< in: parent */ +{ + ut_ad(node); + + ((que_common_t*) node)->parent = parent; +} + +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(dfield_get_type(&((que_common_t*) node)->val)); +} + +/*********************************************************************//** +Catenates a query graph node to a list of them, possible empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node) /*!< in: node */ +{ + que_common_t* cnode; + que_common_t* cnode2; + + cnode = (que_common_t*) node; + + cnode->brother = NULL; + + if (node_list == NULL) { + + return(node); + } + + cnode2 = (que_common_t*) node_list; + + while (cnode2->brother != NULL) { + cnode2 = (que_common_t*) cnode2->brother; + } + + cnode2->brother = node; + + return(node_list); +} + +/************************************************************************* +Removes a query graph node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node in list.*/ + que_node_t* node_list) /* in: node list */ +{ + que_common_t* node; + + ut_a(node_list != NULL); + + node = (que_common_t*) node_list; + + /* We need the last element */ + while (node->brother != NULL) { + node = (que_common_t*) node->brother; + } + + return(node); +} +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. +@return next node in a list of nodes */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node) /*!< in: node in a list */ +{ + return(((que_common_t*) node)->brother); +} + +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list) /*!< in: node list, or NULL */ +{ + const que_common_t* cnode; + ulint len; + + cnode = (const que_common_t*) node_list; + len = 0; + + while (cnode != NULL) { + len++; + cnode = (const que_common_t*) cnode->brother; + } + + return(len); +} + +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node) /*!< in: node */ +{ + return(((que_common_t*) node)->parent); +} + +/**********************************************************************//** +Checks if graph, trx, or session is in a state where the query thread should +be stopped. 
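The note on que_thr_peek_stop() describes a classic double-check: a cheap peek without the trx mutex may be stale, so a positive answer must be confirmed under the mutex before acting on it. A small standalone pthread sketch of the pattern:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static int should_stop = 0;   /* stands in for graph/trx state */

    /* Cheap unlocked peek: may be stale, so it only says "maybe". */
    static int peek_stop(void) { return should_stop; }

    /* Authoritative re-check under the mutex. */
    static int confirm_stop(void)
    {
        int stop;
        pthread_mutex_lock(&mtx);
        stop = should_stop;
        pthread_mutex_unlock(&mtx);
        return stop;
    }

    int main(void)
    {
        if (peek_stop() && confirm_stop())
            printf("stopping\n");
        else
            printf("keep running\n");
        return 0;
    }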
+@return TRUE if should be stopped; NOTE that if the peek is made +without reserving the trx mutex, then another peek with the mutex +reserved is necessary before deciding the actual stopping */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + que_t* graph; + + graph = thr->graph; + trx = graph->trx; + + if (graph->state != QUE_FORK_ACTIVE + || trx->lock.que_state == TRX_QUE_LOCK_WAIT + || (trx->lock.que_state != TRX_QUE_ROLLING_BACK + && trx->lock.que_state != TRX_QUE_RUNNING)) { + + return(TRUE); + } + + return(FALSE); +} + +/***********************************************************************//** +Returns TRUE if the query graph is for a SELECT statement. +@return TRUE if a select */ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + que_t* graph) /*!< in: graph */ +{ + if (graph->fork_type == QUE_FORK_SELECT_SCROLL + || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) { + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/xtradb/include/que0types.h b/storage/xtradb/include/que0types.h new file mode 100644 index 00000000000..0f11cad301a --- /dev/null +++ b/storage/xtradb/include/que0types.h @@ -0,0 +1,57 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0types.h +Query graph global types + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0types_h +#define que0types_h + +#include "data0data.h" +#include "dict0types.h" + +/* Pseudotype for all graph nodes */ +typedef void que_node_t; + +/* Query graph root is a fork node */ +typedef struct que_fork_t que_t; + +struct que_thr_t; + +/* Common struct at the beginning of each query graph node; the name of this +substruct must be 'common' */ + +struct que_common_t{ + ulint type; /*!< query node type */ + que_node_t* parent; /*!< back pointer to parent node, or NULL */ + que_node_t* brother;/* pointer to a possible brother node */ + dfield_t val; /*!< evaluated value for an expression */ + ulint val_buf_size; + /* buffer size for the evaluated value data, + if the buffer has been allocated dynamically: + if this field is != 0, and the node is a + symbol node or a function node, then we + have to free the data field in val + explicitly */ +}; + +#endif diff --git a/storage/xtradb/include/read0i_s.h b/storage/xtradb/include/read0i_s.h new file mode 100644 index 00000000000..11b63affe09 --- /dev/null +++ b/storage/xtradb/include/read0i_s.h @@ -0,0 +1,54 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. 
All Rights Reserved. +Copyright (c) 2010-2012, Percona Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef read0i_s_h +#define read0i_s_h + +#include <trx0types.h> + +struct i_s_xtradb_read_view_struct { + undo_no_t undo_no;/*!< 0 or if type is + VIEW_HIGH_GRANULARITY + transaction undo_no when this high-granularity + consistent read view was created */ + trx_id_t low_limit_no; + /*!< The view does not need to see the undo + logs for transactions whose transaction number + is strictly smaller (<) than this value: they + can be removed in purge if not needed by other + views */ + trx_id_t low_limit_id; + /*!< The read should not see any transaction + with trx id >= this value. In other words, + this is the "high water mark". */ + trx_id_t up_limit_id; + /*!< The read should see all trx ids which + are strictly smaller (<) than this value. + In other words, + this is the "low water mark". */ +}; + +typedef struct i_s_xtradb_read_view_struct i_s_xtradb_read_view_t; + +UNIV_INTERN +i_s_xtradb_read_view_t* +read_fill_i_s_xtradb_read_view(i_s_xtradb_read_view_t *rv); + + +#endif /* read0i_s_h */ diff --git a/storage/xtradb/include/read0read.h b/storage/xtradb/include/read0read.h new file mode 100644 index 00000000000..0352f129c30 --- /dev/null +++ b/storage/xtradb/include/read0read.h @@ -0,0 +1,232 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0read.h +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#ifndef read0read_h +#define read0read_h + +#include "univ.i" + + +#include "ut0byte.h" +#include "ut0lst.h" +#include "btr0types.h" +#include "trx0trx.h" +#include "trx0sys.h" +#include "read0types.h" + +/*********************************************************************//** +Opens a read view where exactly the transactions serialized before this +point in time are seen in the view. 
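The watermark fields documented above combine into a simple visibility rule: ids below up_limit_id are always visible, ids at or above low_limit_id never are, and ids in between are visible unless they appear in the descriptors array of transactions still active when the view was opened. A standalone toy version (invented types; a linear scan where the real code binary-searches):

    #include <stdio.h>

    typedef unsigned long long trx_id_sketch_t;

    /* Toy snapshot, assuming active ids are sorted ascending. */
    struct view {
        trx_id_sketch_t        up_limit_id;   /* below: visible      */
        trx_id_sketch_t        low_limit_id;  /* at/above: invisible */
        const trx_id_sketch_t *descr;         /* active in between   */
        int                    n_descr;
    };

    static int sees(const struct view *v, trx_id_sketch_t id)
    {
        int i;
        if (id <  v->up_limit_id)  return 1;
        if (id >= v->low_limit_id) return 0;
        for (i = 0; i < v->n_descr; i++)
            if (v->descr[i] == id) return 0;  /* still active */
        return 1;                             /* committed earlier */
    }

    int main(void)
    {
        trx_id_sketch_t active[] = { 7, 9 };  /* active at open time */
        struct view v = { 7, 11, active, 2 }; /* next id would be 11 */
        printf("%d %d %d %d\n",
               sees(&v, 5), sees(&v, 7), sees(&v, 8), sees(&v, 12));
        /* prints: 1 0 1 0 */
        return 0;
    }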
+@return own: read view struct */ +UNIV_INTERN +read_view_t* +read_view_open_now( +/*===============*/ + trx_id_t cr_trx_id, /*!< in: trx_id of creating + transaction, or 0 used in purge */ + read_view_t*& view); /*!< in,out: pre-allocated view array or + NULL if a new one needs to be created */ + +/*********************************************************************//** +Clones a read view object. This function will allocate space for two read +views contiguously, one identical in size and content as @param view (starting +at returned pointer) and another view immediately following the trx_ids array. +The second view will have space for an extra trx_id_t element. +@return read view struct */ +UNIV_INTERN +read_view_t* +read_view_clone( +/*============*/ + const read_view_t* view, /*!< in: view to clone */ + read_view_t*& prebuilt_clone);/*!< in,out: prebuilt view or + NULL */ +/*********************************************************************//** +Insert the view in the proper order into the trx_sys->view_list. The +read view list is ordered by read_view_t::low_limit_no in descending order. */ +UNIV_INTERN +void +read_view_add( +/*==========*/ + read_view_t* view); /*!< in: view to add to */ +/*********************************************************************//** +Makes a copy of the oldest existing read view, or opens a new. The view +must be closed with ..._close. +@return own: read view struct */ +UNIV_INTERN +read_view_t* +read_view_purge_open( +/*=================*/ + read_view_t*& clone_view, /*!< in,out: pre-allocated view that + will be used to clone the oldest view if + exists */ + read_view_t*& view); /*!< in,out: pre-allocated view array or + NULL if a new one needs to be created */ +/*********************************************************************//** +Remove a read view from the trx_sys->view_list. */ +UNIV_INLINE +void +read_view_remove( +/*=============*/ + read_view_t* view, /*!< in: read view, can be 0 */ + bool own_mutex); /*!< in: true if caller owns the + trx_sys_t::mutex */ +/*********************************************************************//** +Frees memory allocated by a read view. */ +UNIV_INTERN +void +read_view_free( +/*===========*/ + read_view_t*& view); /*< in,out: read view */ +/*********************************************************************//** +Closes a consistent read view for MySQL. This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ +UNIV_INTERN +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx); /*!< in: trx which has a read view */ +/*********************************************************************//** +Checks if a read view sees the specified transaction. +@return true if sees */ +UNIV_INLINE +bool +read_view_sees_trx_id( +/*==================*/ + const read_view_t* view, /*!< in: read view */ + trx_id_t trx_id) /*!< in: trx id */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Prints a read view to file. */ +UNIV_INTERN +void +read_view_print( +/*============*/ + FILE* file, /*!< in: file to print to */ + const read_view_t* view); /*!< in: read view */ +/*********************************************************************//** +Create a consistent cursor view for mysql to be used in cursors. In this +consistent read view modifications done by the creating transaction or future +transactions are not visible. 
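read_view_clone() above is documented as allocating the clone and its descriptors array contiguously, with room for one extra trx_id_t. A toy sketch of that single-allocation layout (invented simplified struct, not the real read_view_t):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef unsigned long long id_sketch_t;

    struct view {
        int          n_descr;
        id_sketch_t *descriptors;   /* points just past the struct */
    };

    /* One malloc holds the struct followed by n + 1 descriptor
       slots, mirroring the "space for an extra trx_id_t element"
       contract described above. */
    static struct view *clone_view(const struct view *src)
    {
        size_t sz = sizeof(struct view)
                    + (src->n_descr + 1) * sizeof(id_sketch_t);
        struct view *c = malloc(sz);

        c->n_descr = src->n_descr;
        c->descriptors = (id_sketch_t *)(c + 1);
        memcpy(c->descriptors, src->descriptors,
               src->n_descr * sizeof(id_sketch_t));
        return c;
    }

    int main(void)
    {
        id_sketch_t ids[] = { 3, 8 };
        struct view v = { 2, ids };
        struct view *c = clone_view(&v);
        printf("%llu %llu\n", c->descriptors[0], c->descriptors[1]);
        free(c);   /* one free releases view and array together */
        return 0;
    }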
*/ +UNIV_INTERN +cursor_view_t* +read_cursor_view_create_for_mysql( +/*==============================*/ + trx_t* cr_trx);/*!< in: trx where cursor view is created */ +/*********************************************************************//** +Close a given consistent cursor view for mysql and restore global read view +back to a transaction read view. */ +UNIV_INTERN +void +read_cursor_view_close_for_mysql( +/*=============================*/ + trx_t* trx, /*!< in: trx */ + cursor_view_t* curview); /*!< in: cursor view to be closed */ +/*********************************************************************//** +This function sets a given consistent cursor view to a transaction +read view if given consistent cursor view is not NULL. Otherwise, function +restores a global read view to a transaction read view. */ +UNIV_INTERN +void +read_cursor_set_for_mysql( +/*======================*/ + trx_t* trx, /*!< in: transaction where cursor is set */ + cursor_view_t* curview);/*!< in: consistent cursor view to be set */ + +/** Read view lists the trx ids of those transactions for which a consistent +read should not see the modifications to the database. */ + +struct read_view_t{ + ulint type; /*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */ + undo_no_t undo_no;/*!< 0 or if type is + VIEW_HIGH_GRANULARITY + transaction undo_no when this high-granularity + consistent read view was created */ + trx_id_t low_limit_no; + /*!< The view does not need to see the undo + logs for transactions whose transaction number + is strictly smaller (<) than this value: they + can be removed in purge if not needed by other + views */ + trx_id_t low_limit_id; + /*!< The read should not see any transaction + with trx id >= this value. In other words, + this is the "high water mark". */ + trx_id_t up_limit_id; + /*!< The read should see all trx ids which + are strictly smaller (<) than this value. + In other words, + this is the "low water mark". */ + ulint n_descr; + /*!< Number of cells in the trx_ids array */ + ulint max_descr; + /*!< Maximum number of cells in the trx_ids + array */ + trx_id_t* descriptors; + /*!< Additional trx ids which the read should + not see: typically, these are the read-write + active transactions at the time when the read + is serialized, except the reading transaction + itself; the trx ids in this array are in a + ascending order. These trx_ids should be + between the "low" and "high" water marks, + that is, up_limit_id and low_limit_id. */ + trx_id_t creator_trx_id; + /*!< trx id of creating transaction, or + 0 used in purge */ + UT_LIST_NODE_T(read_view_t) view_list; + /*!< List of read views in trx_sys */ +}; + +/** Read view types @{ */ +#define VIEW_NORMAL 1 /*!< Normal consistent read view + where transaction does not see changes + made by active transactions except + creating transaction. */ +#define VIEW_HIGH_GRANULARITY 2 /*!< High-granularity read view where + transaction does not see changes + made by active transactions and own + changes after a point in time when this + read view was created. */ +/* @} */ + +/** Implement InnoDB framework to support consistent read views in +cursors. This struct holds both heap where consistent read view +is allocated and pointer to a read view. 
*/ + +struct cursor_view_t{ + mem_heap_t* heap; + /*!< Memory heap for the cursor view */ + read_view_t* read_view; + /*!< Consistent read view of the cursor*/ + ulint n_mysql_tables_in_use; + /*!< number of Innobase tables used in the + processing of this cursor */ +}; + +#ifndef UNIV_NONINL +#include "read0read.ic" +#endif + +#endif diff --git a/storage/xtradb/include/read0read.ic b/storage/xtradb/include/read0read.ic new file mode 100644 index 00000000000..66bef8866c9 --- /dev/null +++ b/storage/xtradb/include/read0read.ic @@ -0,0 +1,131 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0read.ic +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Validates a read view object. */ +static +bool +read_view_validate( +/*===============*/ + const read_view_t* view) /*!< in: view to validate */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + ut_ad(view->max_descr >= view->n_descr); + ut_ad(view->descriptors == NULL || view->max_descr > 0); + + /* Check that the view->descriptors array is in ascending order. */ + for (ulint i = 1; i < view->n_descr; ++i) { + + ut_a(view->descriptors[i] > view->descriptors[i - 1]); + } + + return(true); +} + +/** Functor to validate the view list. */ +struct ViewCheck { + + ViewCheck() : m_prev_view(0) { } + + void operator()(const read_view_t* view) + { + ut_a(m_prev_view == NULL + || m_prev_view->low_limit_no >= view->low_limit_no); + + m_prev_view = view; + } + + const read_view_t* m_prev_view; +}; + +/*********************************************************************//** +Validates a read view list. */ +static +bool +read_view_list_validate(void) +/*=========================*/ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_list_map(trx_sys->view_list, &read_view_t::view_list, ViewCheck()); + + return(true); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Checks if a read view sees the specified transaction. 
+@return true if sees */ +UNIV_INLINE +bool +read_view_sees_trx_id( +/*==================*/ + const read_view_t* view, /*!< in: read view */ + trx_id_t trx_id) /*!< in: trx id */ +{ + if (trx_id < view->up_limit_id) { + + return(true); + } else if (trx_id >= view->low_limit_id) { + + return(false); + } + + /* Do a binary search over this view's descriptors array */ + + return(trx_find_descriptor(view->descriptors, view->n_descr, + trx_id) == NULL); +} + +/*********************************************************************//** +Remove a read view from the trx_sys->view_list. */ +UNIV_INLINE +void +read_view_remove( +/*=============*/ + read_view_t* view, /*!< in: read view, can be 0 */ + bool own_mutex) /*!< in: true if caller owns the + trx_sys_t::mutex */ +{ + if (view != 0) { + if (!own_mutex) { + mutex_enter(&trx_sys->mutex); + } + + ut_ad(read_view_validate(view)); + + UT_LIST_REMOVE(view_list, trx_sys->view_list, view); + + ut_ad(read_view_list_validate()); + + if (!own_mutex) { + mutex_exit(&trx_sys->mutex); + } + } +} + diff --git a/storage/xtradb/include/read0types.h b/storage/xtradb/include/read0types.h new file mode 100644 index 00000000000..969f4ebb637 --- /dev/null +++ b/storage/xtradb/include/read0types.h @@ -0,0 +1,32 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0types.h +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#ifndef read0types_h +#define read0types_h + +struct read_view_t; +struct cursor_view_t; + +#endif diff --git a/storage/xtradb/include/rem0cmp.h b/storage/xtradb/include/rem0cmp.h new file mode 100644 index 00000000000..cb3c85ac2c8 --- /dev/null +++ b/storage/xtradb/include/rem0cmp.h @@ -0,0 +1,301 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/rem0cmp.h +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +#ifndef rem0cmp_h +#define rem0cmp_h + +#include "univ.i" +#include "data0data.h" +#include "data0type.h" +#include "dict0dict.h" +#include "rem0rec.h" + +/*************************************************************//** +Returns TRUE if two columns are equal for comparison purposes. +@return TRUE if the columns are considered equal in comparisons */ +UNIV_INTERN +ibool +cmp_cols_are_equal( +/*===============*/ + const dict_col_t* col1, /*!< in: column 1 */ + const dict_col_t* col2, /*!< in: column 2 */ + ibool check_charsets); + /*!< in: whether to check charsets */ +/*************************************************************//** +This function is used to compare two data fields for which we know the +data type. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INLINE +int +cmp_data_data( +/*==========*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* data1, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len1, /*!< in: data field length or UNIV_SQL_NULL */ + const byte* data2, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len2); /*!< in: data field length or UNIV_SQL_NULL */ +/*************************************************************//** +This function is used to compare two data fields for which we know the +data type. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INTERN +int +cmp_data_data_slow( +/*===============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* data1, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len1, /*!< in: data field length or UNIV_SQL_NULL */ + const byte* data2, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len2); /*!< in: data field length or UNIV_SQL_NULL */ + +/***************************************************************** +This function is used to compare two data fields for which we know the +data type to be VARCHAR. +@return 1, 0, -1, if lhs is greater, equal, less than rhs, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_varchar( +/*=======================*/ + const byte* lhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint lhs_len,/* in: data field length or UNIV_SQL_NULL */ + const byte* rhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint rhs_len);/* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two varchar/char fields. The comparison +is for the LIKE operator. 
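Every comparator in this file follows the same contract: return 1, 0, or -1, with a special length value (UNIV_SQL_NULL) meaning SQL NULL. A standalone toy comparator showing that contract, assuming the usual InnoDB ordering where NULL equals NULL and sorts before any value:

    #include <stdio.h>

    #define SQL_NULL_LEN ((size_t) -1)   /* stand-in for UNIV_SQL_NULL */

    static int cmp_bytes(const unsigned char *a, size_t alen,
                         const unsigned char *b, size_t blen)
    {
        size_t i, n;

        if (alen == SQL_NULL_LEN || blen == SQL_NULL_LEN) {
            if (alen == blen) return 0;              /* NULL == NULL */
            return (alen == SQL_NULL_LEN) ? -1 : 1;  /* NULLs first  */
        }
        n = (alen < blen) ? alen : blen;
        for (i = 0; i < n; i++)
            if (a[i] != b[i]) return (a[i] > b[i]) ? 1 : -1;
        if (alen == blen) return 0;
        return (alen > blen) ? 1 : -1;               /* longer wins  */
    }

    int main(void)
    {
        printf("%d\n", cmp_bytes((const unsigned char *)"abc", 3,
                                 (const unsigned char *)"abd", 3)); /* -1 */
        printf("%d\n", cmp_bytes(0, SQL_NULL_LEN,
                                 (const unsigned char *)"a", 1));   /* -1 */
        return 0;
    }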
+@return 1, 0, -1, if lhs is greater, equal, less than rhs, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_like_prefix( +/*===========================*/ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two varchar/char fields. The comparison +is for the LIKE operator. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_like_suffix( +/*===========================*/ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two varchar/char fields. The comparison +is for the LIKE operator. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_like_substr( +/*===========================*/ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/*************************************************************//** +This function is used to compare two dfields where at least the first +has its data type field set. +@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2, +respectively */ +UNIV_INLINE +int +cmp_dfield_dfield( +/*==============*/ + const dfield_t* dfield1,/*!< in: data field; must have type field set */ + const dfield_t* dfield2);/*!< in: data field */ +/*************************************************************//** +This function is used to compare a data tuple to a physical record. +Only dtuple->n_fields_cmp first fields are taken into account for +the data tuple! If we denote by n = n_fields_cmp, then rec must +have either m >= n fields, or it must differ from dtuple in some of +the m fields rec has. If rec has an externally stored field we do not +compare it but return with value 0 if such a comparison should be +made. 
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared, or until +the first externally stored field in rec */ +UNIV_INTERN +int +cmp_dtuple_rec_with_match_low( +/*==========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n_cmp, /*!< in: number of fields to compare */ + ulint* matched_fields, + /*!< in/out: number of already completely + matched fields; when function returns, + contains the value for current comparison */ + ulint* matched_bytes) + /*!< in/out: number of already matched + bytes within the first field not completely + matched; when function returns, contains the + value for current comparison */ + __attribute__((nonnull)); +#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields,bytes) \ + cmp_dtuple_rec_with_match_low( \ + tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields,bytes) +/**************************************************************//** +Compares a data tuple to a physical record. +@see cmp_dtuple_rec_with_match +@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */ +UNIV_INTERN +int +cmp_dtuple_rec( +/*===========*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/**************************************************************//** +Checks if a dtuple is a prefix of a record. The last field in dtuple +is allowed to be a prefix of the corresponding field in the record. +@return TRUE if prefix */ +UNIV_INTERN +ibool +cmp_dtuple_is_prefix_of_rec( +/*========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/*************************************************************//** +Compare two physical records that contain the same number of columns, +none of which are stored externally. +@retval 1 if rec1 (including non-ordering columns) is greater than rec2 +@retval -1 if rec1 (including non-ordering columns) is less than rec2 +@retval 0 if rec1 is a duplicate of rec2 */ +UNIV_INTERN +int +cmp_rec_rec_simple( +/*===============*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */ + const dict_index_t* index, /*!< in: data dictionary index */ + struct TABLE* table) /*!< in: MySQL table, for reporting + duplicate key value if applicable, + or NULL */ + __attribute__((nonnull(1,2,3,4), warn_unused_result)); +/*************************************************************//** +This function is used to compare two physical records. Only the common +first fields are compared, and if an externally stored field is +encountered, then 0 is returned. 
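The matched_fields/matched_bytes in/out parameters above make the comparison resumable: when the same tuple is compared repeatedly, as during a binary search over a page, the caller passes back how much of the prefix is already known equal and the function skips it. A standalone toy over fixed four-byte fields (invented layout, purely illustrative):

    #include <stdio.h>

    /* 'mf'/'mb' play the role of matched_fields / matched_bytes:
       on entry they say how much of the prefix is already equal,
       on return they are updated for the next comparison. */
    static int cmp_resume(const unsigned char *a, const unsigned char *b,
                          size_t n_fields, size_t *mf, size_t *mb)
    {
        size_t f, i;
        for (f = *mf; f < n_fields; f++) {
            const unsigned char *fa = a + 4 * f, *fb = b + 4 * f;
            for (i = (f == *mf) ? *mb : 0; i < 4; i++) {
                if (fa[i] != fb[i]) {
                    *mf = f; *mb = i;
                    return fa[i] > fb[i] ? 1 : -1;
                }
            }
        }
        *mf = n_fields; *mb = 0;
        return 0;
    }

    int main(void)
    {
        unsigned char a[8] = "AAAABBBB", b[8] = "AAAABBBC";
        size_t mf = 0, mb = 0;
        printf("%d (stopped at field %zu, byte %zu)\n",
               cmp_resume(a, b, 2, &mf, &mb), mf, mb);  /* -1, 1, 3 */
        return 0;
    }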
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively */ +UNIV_INTERN +int +cmp_rec_rec_with_match( +/*===================*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ + dict_index_t* index, /*!< in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ + ulint* matched_fields, /*!< in/out: number of already completely + matched fields; when the function returns, + contains the value the for current + comparison */ + ulint* matched_bytes);/*!< in/out: number of already matched + bytes within the first field not completely + matched; when the function returns, contains + the value for the current comparison */ +/*************************************************************//** +This function is used to compare two physical records. Only the common +first fields are compared. +@return 1, 0 , -1 if rec1 is greater, equal, less, respectively, than +rec2; only the common first fields are compared */ +UNIV_INLINE +int +cmp_rec_rec( +/*========*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ + dict_index_t* index); /*!< in: data dictionary index */ + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INTERN +int +cmp_dfield_dfield_like_prefix( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2);/* in: data field */ +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_substr( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2);/* in: data field */ +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_suffix( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2);/* in: data field */ + +#ifndef UNIV_NONINL +#include "rem0cmp.ic" +#endif + +#endif diff --git a/storage/xtradb/include/rem0cmp.ic b/storage/xtradb/include/rem0cmp.ic new file mode 100644 index 00000000000..67a2dcacba1 --- /dev/null +++ b/storage/xtradb/include/rem0cmp.ic @@ -0,0 +1,186 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/rem0cmp.ic +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +/*************************************************************//** +This function is used to compare two data fields for which we know the +data type. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INLINE +int +cmp_data_data( +/*==========*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* data1, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len1, /*!< in: data field length or UNIV_SQL_NULL */ + const byte* data2, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len2) /*!< in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2)); +} + +/***************************************************************** +This function is used to compare two (CHAR) data fields for the LIKE +operator. */ +UNIV_INLINE +int +cmp_data_data_like_prefix( +/*======================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow_like_prefix(data1, len1, data2, len2)); +} +/***************************************************************** +This function is used to compare two (CHAR) data fields for the LIKE +operator. */ +UNIV_INLINE +int +cmp_data_data_like_suffix( +/*======================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow_like_suffix(data1, len1, data2, len2)); +} +/***************************************************************** +This function is used to compare two (CHAR) data fields for the LIKE +operator. 
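A sketch of calling the inline wrapper above directly on raw column bytes. DATA_VARCHAR with precise type 0 is an assumption; real callers take mtype and prtype from the column's dtype_t:

	const byte*	a = (const byte*) "apple";
	const byte*	b = (const byte*) "apricot";

	int	cmp = cmp_data_data(DATA_VARCHAR, 0, a, 5, b, 7);
	/* cmp < 0: "apple" collates before "apricot"; an SQL NULL value
	would be passed with its length set to UNIV_SQL_NULL */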
*/ +UNIV_INLINE +int +cmp_data_data_like_substr( +/*======================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow_like_substr(data1, len1, data2, len2)); +} +/*************************************************************//** +This function is used to compare two dfields where at least the first +has its data type field set. +@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2, +respectively */ +UNIV_INLINE +int +cmp_dfield_dfield( +/*==============*/ + const dfield_t* dfield1,/*!< in: data field; must have type field set */ + const dfield_t* dfield2)/*!< in: data field */ +{ + const dtype_t* type; + + ut_ad(dfield_check_typed(dfield1)); + + type = dfield_get_type(dfield1); + + return(cmp_data_data(type->mtype, type->prtype, + (const byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (const byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_suffix( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2)/* in: data field */ +{ + ut_ad(dfield_check_typed(dfield1)); + + return(cmp_data_data_like_suffix( + (byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_substr( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2)/* in: data field */ +{ + ut_ad(dfield_check_typed(dfield1)); + + return(cmp_data_data_like_substr( + (byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} +/*************************************************************//** +This function is used to compare two physical records. Only the common +first fields are compared. 
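A sketch of preparing two dfields for cmp_dfield_dfield(); the buffers and the DATA_BINARY type are hypothetical, and only dfield1 needs its type set, per the contract documented above:

	byte		b1[4] = {1, 2, 3, 4};
	byte		b2[4] = {1, 2, 3, 5};
	dfield_t	d1;
	dfield_t	d2;

	dtype_set(dfield_get_type(&d1), DATA_BINARY, 0, 4);
	dfield_set_data(&d1, b1, 4);
	dfield_set_data(&d2, b2, 4);

	int	cmp = cmp_dfield_dfield(&d1, &d2);	/* cmp < 0 */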
+@return 1, 0 , -1 if rec1 is greater, equal, less, respectively, than +rec2; only the common first fields are compared */ +UNIV_INLINE +int +cmp_rec_rec( +/*========*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ + dict_index_t* index) /*!< in: data dictionary index */ +{ + ulint match_f = 0; + ulint match_b = 0; + + return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, + FALSE, &match_f, &match_b)); +} diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h new file mode 100644 index 00000000000..8e7d5ff2d48 --- /dev/null +++ b/storage/xtradb/include/rem0rec.h @@ -0,0 +1,988 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.h +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0rec_h +#define rem0rec_h + +#include "univ.i" +#include "data0data.h" +#include "rem0types.h" +#include "mtr0types.h" +#include "page0types.h" + +/* Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/* The deleted flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the + record has been delete marked */ + +/* Number of extra bytes in an old-style record, +in addition to the data and the offsets */ +#define REC_N_OLD_EXTRA_BYTES 6 +/* Number of extra bytes in a new-style record, +in addition to the data and the offsets */ +#define REC_N_NEW_EXTRA_BYTES 5 + +/* Record status values */ +#define REC_STATUS_ORDINARY 0 +#define REC_STATUS_NODE_PTR 1 +#define REC_STATUS_INFIMUM 2 +#define REC_STATUS_SUPREMUM 3 + +/* The following four constants are needed in page0zip.cc in order to +efficiently compress and decompress pages. */ + +/* The offset of heap_no in a compact record */ +#define REC_NEW_HEAP_NO 4 +/* The shift of heap_no in a compact record. +The status is stored in the low-order bits. 
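The two info-bit flags defined above are read back through rec_get_info_bits(), declared later in this header; a sketch, with rec and the page-format flag comp assumed from the caller:

	ulint	info = rec_get_info_bits(rec, comp);

	if (info & REC_INFO_DELETED_FLAG) {
		/* rec carries a delete mark */
	}
	if (info & REC_INFO_MIN_REC_FLAG) {
		/* rec is the predefined minimum record of its level */
	}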
*/ +#define REC_HEAP_NO_SHIFT 3 + +/* Length of a B-tree node pointer, in bytes */ +#define REC_NODE_PTR_SIZE 4 + +/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */ +#define REC_1BYTE_SQL_NULL_MASK 0x80UL +/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */ +#define REC_2BYTE_SQL_NULL_MASK 0x8000UL + +/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most +significant bit denotes that the tail of a field is stored off-page. */ +#define REC_2BYTE_EXTERN_MASK 0x4000UL + +#ifdef UNIV_DEBUG +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 4 +#else /* UNIV_DEBUG */ +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 2 +#endif /* UNIV_DEBUG */ + +/* Number of elements that should be initially allocated for the +offsets[] array, first passed to rec_get_offsets() */ +#define REC_OFFS_NORMAL_SIZE 100 +#define REC_OFFS_SMALL_SIZE 10 + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +const rec_t* +rec_get_next_ptr_const( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the offset of the +next chained record on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. */ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the number of fields +in a record. 
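A sketch of walking the chained record list with the accessor declared above; rec is an assumed starting record and comp the page-format flag. The walk ends when the stored next-record value is 0, which the accessor maps to NULL:

	const rec_t*	r = rec;

	while (r != NULL) {
		/* ... visit r ... */
		r = rec_get_next_ptr_const(r, comp);
	}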
+@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + const rec_t* rec) /*!< in: old-style physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_old( +/*================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint n_owned) /*!< in: the number of owned */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + const rec_t* rec) /*!< in: new-style physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint n_owned)/*!< in: the number of owned */ + __attribute__((nonnull(1))); +/******************************************************//** +The following function is used to retrieve the info bits of +a record. +@return info bits */ +UNIV_INLINE +ulint +rec_get_info_bits( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); +/******************************************************//** +The following function retrieves the status bits of a new-style record. +@return status bits */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /*!< in/out: physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) 
+@return info bits */ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: compact physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint flag) /*!< in: nonzero if delete marked */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint flag) /*!< in: nonzero if delete marked */ + __attribute__((nonnull(1))); +/******************************************************//** +The following function tells if a new-style record is a node pointer. +@return TRUE if node pointer */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the heap number +field in an old-style record. */ +UNIV_INLINE +void +rec_set_heap_no_old( +/*================*/ + rec_t* rec, /*!< in: physical record */ + ulint heap_no)/*!< in: the heap number */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the heap number +field in a new-style record. 
*/ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /*!< in/out: physical record */ + ulint heap_no)/*!< in: the heap number */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to test whether the data offsets +in the record are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ + __attribute__((nonnull)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +ulint +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Determine how many of the first n columns in a compact +physical record are stored externally. +@return number of externally stored columns */ +UNIV_INTERN +ulint +rec_get_n_extern_new( +/*=================*/ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. 
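A sketch of decoding a 1-byte offsets entry with REC_1BYTE_SQL_NULL_MASK, defined earlier in this header; rec and n are assumed from the caller:

	ulint	end_info = rec_1_get_field_end_info(rec, n);

	if (end_info & REC_1BYTE_SQL_NULL_MASK) {
		/* field n is SQL NULL */
	} else {
		ulint	end = end_info & ~REC_1BYTE_SQL_NULL_MASK;
		/* end is the field end offset from the record origin */
	}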
+@return the new offsets */ +UNIV_INTERN +ulint* +rec_get_offsets_func( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets,/*!< in/out: array consisting of + offsets[0] allocated elements, + or an array from rec_get_offsets(), + or NULL */ + ulint n_fields,/*!< in: maximum number of + initialized fields + (ULINT_UNDEFINED if all fields) */ +#ifdef UNIV_DEBUG + const char* file, /*!< in: file name where called */ + ulint line, /*!< in: line number where called */ +#endif /* UNIV_DEBUG */ + mem_heap_t** heap) /*!< in/out: memory heap */ +#ifdef UNIV_DEBUG + __attribute__((nonnull(1,2,5,7),warn_unused_result)); +#else /* UNIV_DEBUG */ + __attribute__((nonnull(1,2,5),warn_unused_result)); +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +# define rec_get_offsets(rec,index,offsets,n,heap) \ + rec_get_offsets_func(rec,index,offsets,n,__FILE__,__LINE__,heap) +#else /* UNIV_DEBUG */ +# define rec_get_offsets(rec, index, offsets, n, heap) \ + rec_get_offsets_func(rec, index, offsets, n, heap) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ +UNIV_INTERN +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /*!< in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint node_ptr,/*!< in: nonzero=node pointer, + 0=leaf node */ + ulint* offsets)/*!< in/out: array consisting of + offsets[0] allocated elements */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/************************************************************//** +Validates offsets returned by rec_get_offsets(). +@return TRUE if valid */ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + const rec_t* rec, /*!< in: record or NULL */ + const dict_index_t* index, /*!< in: record descriptor or NULL */ + const ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ + __attribute__((nonnull(3), warn_unused_result)); +/************************************************************//** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ + __attribute__((nonnull)); +#else +# define rec_offs_make_valid(rec, index, offsets) ((void) 0) +#endif /* UNIV_DEBUG */ + +/************************************************************//** +The following function is used to get the offset to the nth +data field in an old-style record. +@return offset to the field */ +UNIV_INTERN +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + __attribute__((nonnull)); +#define rec_get_nth_field_old(rec, n, len) \ +((rec) + rec_get_nth_field_offs_old(rec, n, len)) +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. 
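The canonical calling pattern for rec_get_offsets(), shown as a usage sketch (rec, index, and the surrounding function are assumed; rec_offs_init() is defined further down in this header): a stack array of REC_OFFS_NORMAL_SIZE elements is tried first, and a heap is allocated only if the record has too many fields to fit:

	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	offsets = rec_get_offsets(rec, index, offsets,
				  ULINT_UNDEFINED, &heap);
	/* ... use offsets ... */

	if (heap != NULL) {
		mem_heap_free(heap);
	}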
+@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ + __attribute__((nonnull, pure, warn_unused_result)); +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. +@return offset from the origin of rec */ +UNIV_INLINE +ulint +rec_get_nth_field_offs( +/*===================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + __attribute__((nonnull)); +#define rec_get_nth_field(rec, offsets, n, len) \ +((rec) + rec_get_nth_field_offs(offsets, n, len)) +/******************************************************//** +Determine if the offsets are for a record in the new +compact format. +@return nonzero if compact format */ +UNIV_INLINE +ulint +rec_offs_comp( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Determine if the offsets are for a record containing +externally stored columns. +@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_any_extern( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. +@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const ulint* offsets) /*!< in: rec_get_offsets(rec) */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Returns nonzero if the SQL NULL bit is set in nth field of rec. +@return nonzero if SQL NULL */ +UNIV_INLINE +ulint +rec_offs_nth_sql_null( +/*==================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns the number of extern bits set in a record. 
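A sketch of reading field n once the offsets have been computed, combining the accessors declared above (rec, offsets, and n from the caller):

	ulint		len;
	const byte*	field = rec_get_nth_field(rec, offsets, n, &len);

	if (len == UNIV_SQL_NULL) {
		/* field n is SQL NULL */
	} else if (rec_offs_nth_extern(offsets, n)) {
		/* field n ends in a BLOB pointer; the column value
		continues off-page */
	}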
+@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/***********************************************************//** +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null. +For records in ROW_FORMAT=COMPACT (new-style records), len must not be +UNIV_SQL_NULL unless the field already is SQL null. */ +UNIV_INLINE +void +rec_set_nth_field( +/*==============*/ + rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index number of the field */ + const void* data, /*!< in: pointer to the data if not SQL null */ + ulint len) /*!< in: length of the data or UNIV_SQL_NULL. + If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ + __attribute__((nonnull(1,2))); +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +The following function returns the number of allocated elements +for an array of offsets. +@return number of elements */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + const ulint* offsets)/*!< in: array for rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ + __attribute__((nonnull)); +#define rec_offs_init(offsets) \ + rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) +/**********************************************************//** +The following function returns the number of fields in a record. +@return number of fields */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +Returns the total size of record minus data size of record. 
+The value returned by the function is the distance from record +start to record origin in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +Returns the total size of a physical record. +@return size */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +#ifdef UNIV_DEBUG +/**********************************************************//** +Returns a pointer to the start of the record. +@return pointer to start */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +Returns a pointer to the end of the record. +@return pointer to end */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets)) +# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets)) +#endif /* UNIV_DEBUG */ +/***************************************************************//** +Copies a physical record to a buffer. +@return pointer to the origin of the copy */ +UNIV_INLINE +rec_t* +rec_copy( +/*=====*/ + void* buf, /*!< in: buffer */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Determines the size of a data tuple prefix in a temporary file. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_temp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + __attribute__((warn_unused_result, nonnull)); + +/******************************************************//** +Determine the offset to each field in temporary file. +@see rec_convert_dtuple_to_temp() */ +UNIV_INTERN +void +rec_init_offsets_temp( +/*==================*/ + const rec_t* rec, /*!< in: temporary file record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ + __attribute__((nonnull)); + +/*********************************************************//** +Builds a temporary file record out of a data tuple. +@see rec_init_offsets_temp() */ +UNIV_INTERN +void +rec_convert_dtuple_to_temp( +/*=======================*/ + rec_t* rec, /*!< out: record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields) /*!< in: number of fields */ + __attribute__((nonnull)); + +/**************************************************************//** +Copies the first n fields of a physical record to a new physical record in +a buffer. 
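A sketch of copying a record into a private buffer with rec_copy(); heap is an assumed memory heap, and the comment restates the extra-size/data-size split documented above:

	ulint	size = rec_offs_size(offsets);
	byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, size));
	rec_t*	copy = rec_copy(buf, rec, offsets);

	/* copy == buf + rec_offs_extra_size(offsets): the returned pointer
	is the record origin, with the extra bytes stored before it */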
+@return own: copied record */ +UNIV_INTERN +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n_fields, /*!< in: number of fields + to copy */ + byte** buf, /*!< in/out: memory buffer + for the copied prefix, + or NULL */ + ulint* buf_size) /*!< in/out: buffer size */ + __attribute__((nonnull)); +/************************************************************//** +Folds a prefix of a physical record to a ulint. +@return the folded value */ +UNIV_INLINE +ulint +rec_fold( +/*=====*/ + const rec_t* rec, /*!< in: the physical record */ + const ulint* offsets, /*!< in: array returned by + rec_get_offsets() */ + ulint n_fields, /*!< in: number of complete + fields to fold */ + ulint n_bytes, /*!< in: number of bytes to fold + in an incomplete last field */ + index_id_t tree_id) /*!< in: index tree id */ + __attribute__((nonnull, pure, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************//** +Builds a physical record out of a data tuple and +stores it into the given buffer. +@return pointer to the origin of physical record */ +UNIV_INTERN +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + byte* buf, /*!< in: start address of the + physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of + externally stored columns */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************//** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. +@return extra size */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + ulint data_size, /*!< in: data size */ + ulint n_fields, /*!< in: number of fields */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((const)); +/**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_comp_prefix( +/*===============================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + __attribute__((warn_unused_result, nonnull(1,2))); +/**********************************************************//** +Determines the size of a data tuple in ROW_FORMAT=COMPACT. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_comp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + ulint status, /*!< in: status bits of the record */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + __attribute__((nonnull(1,3))); +/**********************************************************//** +The following function returns the size of a data tuple when converted to +a physical record. 
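A sketch of materializing a data tuple as a physical record with the converter above; index, dtuple, n_ext, and heap are assumptions from the caller, and rec_get_converted_size() is the sizing function declared just below:

	ulint	size = rec_get_converted_size(index, dtuple, n_ext);
	byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, size));
	rec_t*	rec = rec_convert_dtuple_to_rec(buf, index, dtuple, n_ext);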
+@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((warn_unused_result, nonnull)); +#ifndef UNIV_HOTBACKUP +/**************************************************************//** +Copies the first n fields of a physical record to a data tuple. +The fields are copied to the memory heap. */ +UNIV_INTERN +void +rec_copy_prefix_to_dtuple( +/*======================*/ + dtuple_t* tuple, /*!< out: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n_fields, /*!< in: number of fields + to copy */ + mem_heap_t* heap) /*!< in: memory heap */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Validates the consistency of a physical record. +@return TRUE if ok */ +UNIV_INTERN +ibool +rec_validate( +/*=========*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +/***************************************************************//** +Prints an old-style physical record. */ +UNIV_INTERN +void +rec_print_old( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Prints a physical record in ROW_FORMAT=COMPACT. Ignores the +record header. */ +UNIV_INTERN +void +rec_print_comp( +/*===========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +/***************************************************************//** +Prints a physical record. */ +UNIV_INTERN +void +rec_print_new( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +/***************************************************************//** +Prints a physical record. */ +UNIV_INTERN +void +rec_print( +/*======*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + __attribute__((nonnull)); + +# ifdef UNIV_DEBUG +/************************************************************//** +Reads the DB_TRX_ID of a clustered index record. +@return the value of DB_TRX_ID */ +UNIV_INTERN +trx_id_t +rec_get_trx_id( +/*===========*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index) /*!< in: clustered index */ + __attribute__((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ +#endif /* UNIV_HOTBACKUP */ + +/* Maximum lengths for the data in a physical record if the offsets +are given in one byte (resp. two byte) format. 
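A debugging sketch combining the validation and printing entry points above (rec, offsets, and index from the caller):

	ut_a(rec_validate(rec, offsets));
	rec_print(stderr, rec, index);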
*/ +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL + +/* The data size of record must be smaller than this because we reserve +two upmost bits in a two byte offset for special purposes */ +#define REC_MAX_DATA_SIZE (16 * 1024) + +#ifndef UNIV_NONINL +#include "rem0rec.ic" +#endif + +#endif diff --git a/storage/xtradb/include/rem0rec.ic b/storage/xtradb/include/rem0rec.ic new file mode 100644 index 00000000000..a539320dd2a --- /dev/null +++ b/storage/xtradb/include/rem0rec.ic @@ -0,0 +1,1718 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.ic +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mach0data.h" +#include "ut0byte.h" +#include "dict0dict.h" +#include "btr0types.h" + +/* Compact flag ORed to the extra size returned by rec_get_offsets() */ +#define REC_OFFS_COMPACT ((ulint) 1 << 31) +/* SQL NULL flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_SQL_NULL ((ulint) 1 << 31) +/* External flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_EXTERNAL ((ulint) 1 << 30) +/* Mask for offsets returned by rec_get_offsets() */ +#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1) + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits pointer to next record + 2 8 bits pointer to next record + 3 1 bit short flag + 7 bits number of fields + 4 3 bits number of fields + 5 bits heap number + 5 8 bits heap number + 6 4 bits n_owned + 4 bits info bits +*/ + +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. 
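Each per-field element of the offsets array packs a field end offset together with the two flag bits defined above; in the sketch below, entry stands for one such element (real callers use the rec_offs_nth_*() accessors rather than decoding by hand):

	ulint	end       = entry & REC_OFFS_MASK;
	ibool	is_null   = (entry & REC_OFFS_SQL_NULL) != 0;
	ibool	is_extern = (entry & REC_OFFS_EXTERNAL) != 0;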
+ + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits relative offset of next record + 2 8 bits relative offset of next record + the relative offset is an unsigned 16-bit + integer: + (offset_of_next_record + - offset_of_this_record) mod 64Ki, + where mod is the modulo as a non-negative + number; + we can calculate the offset of the next + record with the formula: + relative_offset + offset_of_this_record + mod UNIV_PAGE_SIZE + 3 3 bits status: + 000=conventional record + 001=node pointer record (inside B-tree) + 010=infimum record + 011=supremum record + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + +/* We list the byte offsets from the origin of the record, the mask, +and the shift needed to obtain each bit-field of the record. */ + +#define REC_NEXT 2 +#define REC_NEXT_MASK 0xFFFFUL +#define REC_NEXT_SHIFT 0 + +#define REC_OLD_SHORT 3 /* This is single byte bit-field */ +#define REC_OLD_SHORT_MASK 0x1UL +#define REC_OLD_SHORT_SHIFT 0 + +#define REC_OLD_N_FIELDS 4 +#define REC_OLD_N_FIELDS_MASK 0x7FEUL +#define REC_OLD_N_FIELDS_SHIFT 1 + +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 + +#define REC_OLD_HEAP_NO 5 +#define REC_HEAP_NO_MASK 0xFFF8UL +#if 0 /* defined in rem0rec.h for use of page0zip.cc */ +#define REC_NEW_HEAP_NO 4 +#define REC_HEAP_NO_SHIFT 3 +#endif + +#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */ +#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */ +#define REC_N_OWNED_MASK 0xFUL +#define REC_N_OWNED_SHIFT 0 + +#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */ +#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */ +#define REC_INFO_BITS_MASK 0xF0UL +#define REC_INFO_BITS_SHIFT 0 + +#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ + ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \ + ^ 0xFFFFFFFFUL +# error "sum of old-style masks != 0xFFFFFFFFUL" +#endif +#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \ + ^ 0xFFFFFFUL +# error "sum of new-style masks != 0xFFFFFFUL" +#endif + +/***********************************************************//** +Sets the value of the ith field SQL null bit of an old-style record. */ +UNIV_INTERN +void +rec_set_nth_field_null_bit( +/*=======================*/ + rec_t* rec, /*!< in: record */ + ulint i, /*!< in: ith field */ + ibool val); /*!< in: value to set */ +/***********************************************************//** +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ +UNIV_INTERN +void +rec_set_nth_field_sql_null( +/*=======================*/ + rec_t* rec, /*!< in: record */ + ulint n); /*!< in: index of the field */ + +/******************************************************//** +Gets a bit field from within 1 byte. 
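A worked instance of the relative-offset formula in the comment above, with hypothetical numbers and UNIV_PAGE_SIZE = 16384: a record at page offset 16300 whose successor is at offset 120 stores (120 - 16300) mod 65536 = 49356 in its next-record field, and the reader recovers (16300 + 49356) mod 16384 = 120.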
*/ +UNIV_INLINE +ulint +rec_get_bit_field_1( +/*================*/ + const rec_t* rec, /*!< in: pointer to record origin */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + + return((mach_read_from_1(rec - offs) & mask) >> shift); +} + +/******************************************************//** +Sets a bit field within 1 byte. */ +UNIV_INLINE +void +rec_set_bit_field_1( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask); + ut_ad(mask <= 0xFFUL); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_1(rec - offs, + (mach_read_from_1(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +Gets a bit field from within 2 bytes. */ +UNIV_INLINE +ulint +rec_get_bit_field_2( +/*================*/ + const rec_t* rec, /*!< in: pointer to record origin */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + + return((mach_read_from_2(rec - offs) & mask) >> shift); +} + +/******************************************************//** +Sets a bit field within 2 bytes. */ +UNIV_INLINE +void +rec_set_bit_field_2( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask > 0xFFUL); + ut_ad(mask <= 0xFFFFUL); + ut_ad((mask >> shift) & 1); + ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_2(rec - offs, + (mach_read_from_2(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +const rec_t* +rec_get_next_ptr_const( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; + + ut_ad(REC_NEXT_MASK == 0xFFFFUL); + ut_ad(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (field_value == 0) { + + return(NULL); + } + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + */ + ut_ad((field_value >= 32768 + ? 
field_value - 65536 + : field_value) + + ut_align_offset(rec, UNIV_PAGE_SIZE) + < UNIV_PAGE_SIZE); +#endif + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE) + + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + } else { + ut_ad(field_value < UNIV_PAGE_SIZE); + + return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE) + + field_value); + } +} + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp))); +} + +/******************************************************//** +The following function is used to get the offset of the next chained record +on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; +#if REC_NEXT_MASK != 0xFFFFUL +# error "REC_NEXT_MASK != 0xFFFFUL" +#endif +#if REC_NEXT_SHIFT +# error "REC_NEXT_SHIFT != 0" +#endif + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, UNIV_PAGE_SIZE) + < UNIV_PAGE_SIZE); +#endif + if (field_value == 0) { + + return(0); + } + + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + } else { + ut_ad(field_value < UNIV_PAGE_SIZE); + + return(field_value); + } +} + +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ut_ad(rec); + ut_ad(UNIV_PAGE_SIZE > next); +#if REC_NEXT_MASK != 0xFFFFUL +# error "REC_NEXT_MASK != 0xFFFFUL" +#endif +#if REC_NEXT_SHIFT +# error "REC_NEXT_SHIFT != 0" +#endif + + mach_write_to_2(rec - REC_NEXT, next); +} + +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. 
*/ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ulint field_value; + + ut_ad(rec); + ut_ad(UNIV_PAGE_SIZE > next); + + if (!next) { + field_value = 0; + } else { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint) + ((lint) next + - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE)); + field_value &= REC_NEXT_MASK; + } + + mach_write_to_2(rec - REC_NEXT, field_value); +} + +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, + REC_OLD_N_FIELDS_SHIFT); + ut_ad(ret <= REC_MAX_N_FIELDS); + ut_ad(ret > 0); + + return(ret); +} + +/******************************************************//** +The following function is used to set the number of fields +in an old-style record. */ +UNIV_INLINE +void +rec_set_n_fields_old( +/*=================*/ + rec_t* rec, /*!< in: physical record */ + ulint n_fields) /*!< in: the number of fields */ +{ + ut_ad(rec); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields > 0); + + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/******************************************************//** +The following function retrieves the status bits of a new-style record. +@return status bits */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + const rec_t* rec) /*!< in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_1(rec, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); + ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0); + + return(ret); +} + +/******************************************************//** +The following function is used to get the number of fields +in a record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + return(rec_get_n_fields_old(rec)); + } + + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + default: + ut_error; + return(ULINT_UNDEFINED); + } +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + const rec_t* rec) /*!< in: old-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/******************************************************//** +The following function is used to set the number of owned records. 
*/ +UNIV_INLINE +void +rec_set_n_owned_old( +/*================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint n_owned) /*!< in: the number of owned */ +{ + rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + const rec_t* rec) /*!< in: new-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/******************************************************//** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint n_owned)/*!< in: the number of owned */ +{ + rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + if (page_zip && rec_get_status(rec) != REC_STATUS_SUPREMUM) { + page_zip_rec_set_owned(page_zip, rec, n_owned); + } +} + +/******************************************************//** +The following function is used to retrieve the info bits of a record. +@return info bits */ +UNIV_INLINE +ulint +rec_get_info_bits( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + return(rec_get_bit_field_1( + rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT)); +} + +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint bits) /*!< in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint bits) /*!< in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} + +/******************************************************//** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /*!< in/out: physical record */ + ulint bits) /*!< in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); +} + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) 
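+The two bit fields do not overlap (see the #error check below), so a
+read-modify-write round trip is safe; a sketch for a compact-format
+record rec:
+@code
+	ulint	bits = rec_get_info_and_status_bits(rec, TRUE);
+	rec_set_info_and_status_bits(rec, bits);	/* leaves rec unchanged */
+@endcode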
+@return info bits */ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint bits; +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + if (comp) { + bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec); + } else { + bits = rec_get_info_bits(rec, FALSE); + ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); + } + return(bits); +} +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: physical record */ + ulint bits) /*!< in: info bits */ +{ +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + rec_set_status(rec, bits & REC_NEW_STATUS_MASK); + rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK); +} + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + if (comp) { + return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT)); + } else { + return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT)); + } +} + +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint flag) /*!< in: nonzero if delete marked */ +{ + ulint val; + + val = rec_get_info_bits(rec, FALSE); + + if (flag) { + val |= REC_INFO_DELETED_FLAG; + } else { + val &= ~REC_INFO_DELETED_FLAG; + } + + rec_set_info_bits_old(rec, val); +} + +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint flag) /*!< in: nonzero if delete marked */ +{ + ulint val; + + val = rec_get_info_bits(rec, TRUE); + + if (flag) { + val |= REC_INFO_DELETED_FLAG; + } else { + val &= ~REC_INFO_DELETED_FLAG; + } + + rec_set_info_bits_new(rec, val); + + if (page_zip) { + page_zip_rec_set_deleted(page_zip, rec, flag); + } +} + +/******************************************************//** +The following function tells if a new-style record is a node pointer. +@return TRUE if node pointer */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(REC_STATUS_NODE_PTR == rec_get_status(rec)); +} + +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. 
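+By convention, heap number 0 is the page infimum and 1 the supremum
+(PAGE_HEAP_NO_USER_LOW in page0page.h marks the first user record), so
+for a user record the following holds, as a sketch:
+@code
+	ut_ad(rec_get_heap_no_old(rec) >= PAGE_HEAP_NO_USER_LOW);
+@endcode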
+@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to set the heap number +field in an old-style record. */ +UNIV_INLINE +void +rec_set_heap_no_old( +/*================*/ + rec_t* rec, /*!< in: physical record */ + ulint heap_no)/*!< in: the heap number */ +{ + rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); +} + +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to set the heap number +field in a new-style record. */ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /*!< in/out: physical record */ + ulint heap_no)/*!< in: the heap number */ +{ + rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); +} + +/******************************************************//** +The following function is used to test whether the data offsets in the record +are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ +{ +#if TRUE != 1 +#error "TRUE != 1" +#endif + + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); +} + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ +{ +#if TRUE != 1 +#error "TRUE != 1" +#endif + ut_ad(flag <= TRUE); + + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. 
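+A decoding sketch using the masks from rem0rec.h:
+@code
+	ulint	info = rec_2_get_field_end_info(rec, n);
+	ulint	end  = info & ~(REC_2BYTE_SQL_NULL_MASK
+				| REC_2BYTE_EXTERN_MASK);
+	ibool	is_null = (info & REC_2BYTE_SQL_NULL_MASK) != 0;
+@endcode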
+@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +ulint +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); +} + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK); +} + +/* Get the base address of offsets. The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. */ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +/**********************************************************//** +The following function returns the number of allocated elements +for an array of offsets. +@return number of elements */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + const ulint* offsets)/*!< in: array for rec_get_offsets() */ +{ + ulint n_alloc; + ut_ad(offsets); + n_alloc = offsets[0]; + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + UNIV_MEM_ASSERT_W(offsets, n_alloc * sizeof *offsets); + return(n_alloc); +} + +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ +{ + ut_ad(offsets); + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof *offsets); + offsets[0] = n_alloc; +} + +/**********************************************************//** +The following function returns the number of fields in a record. +@return number of fields */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/************************************************************//** +Validates offsets returned by rec_get_offsets(). 
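+Typical debug usage together with rec_get_offsets() (a sketch; heap is
+assumed to be a mem_heap_t* initialized to NULL):
+@code
+	ulint	offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*	offsets = offsets_;
+	rec_offs_init(offsets_);
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	ut_ad(rec_offs_validate(rec, index, offsets));
+@endcode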
+@return TRUE if valid */ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + const rec_t* rec, /*!< in: record or NULL */ + const dict_index_t* index, /*!< in: record descriptor or NULL */ + const ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ +{ + ulint i = rec_offs_n_fields(offsets); + ulint last = ULINT_MAX; + ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT; + + if (rec) { + ut_ad((ulint) rec == offsets[2]); + if (!comp) { + ut_a(rec_get_n_fields_old(rec) >= i); + } + } + if (index) { + ulint max_n_fields; + ut_ad((ulint) index == offsets[3]); + max_n_fields = ut_max( + dict_index_get_n_fields(index), + dict_index_get_n_unique_in_tree(index) + 1); + if (comp && rec) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_NODE_PTR: + max_n_fields = dict_index_get_n_unique_in_tree( + index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + max_n_fields = 1; + break; + default: + ut_error; + } + } + /* index->n_def == 0 for dummy indexes if !comp */ + ut_a(!comp || index->n_def); + ut_a(!index->n_def || i <= max_n_fields); + } + while (i--) { + ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK; + ut_a(curr <= last); + last = curr; + } + return(TRUE); +} +#ifdef UNIV_DEBUG +/************************************************************//** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ +{ + ut_ad(rec); + ut_ad(index); + ut_ad(offsets); + ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets)); + offsets[2] = (ulint) rec; + offsets[3] = (ulint) index; +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. +@return offset from the origin of rec */ +UNIV_INLINE +ulint +rec_get_nth_field_offs( +/*===================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ +{ + ulint offs; + ulint length; + ut_ad(n < rec_offs_n_fields(offsets)); + ut_ad(len); + + if (n == 0) { + offs = 0; + } else { + offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK; + } + + length = rec_offs_base(offsets)[1 + n]; + + if (length & REC_OFFS_SQL_NULL) { + length = UNIV_SQL_NULL; + } else { + length &= REC_OFFS_MASK; + length -= offs; + } + + *len = length; + return(offs); +} + +/******************************************************//** +Determine if the offsets are for a record in the new +compact format. +@return nonzero if compact format */ +UNIV_INLINE +ulint +rec_offs_comp( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return(*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} + +/******************************************************//** +Determine if the offsets are for a record containing +externally stored columns. 
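+A guard of this form lets callers skip BLOB handling on the common
+path (sketch):
+@code
+	if (rec_offs_any_extern(offsets)) {
+		/* at least one column is stored off-page */
+	}
+@endcode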
+@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_any_extern( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL); +} + +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. +@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const ulint* offsets) /*!< in: rec_get_offsets(rec) */ +{ + ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(NULL); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field + = rec_get_nth_field(rec, offsets, i, &len); + + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + if (!memcmp(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + return(field); + } + } + } + + return(NULL); +} + +/******************************************************//** +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_EXTERNAL); +} + +/******************************************************//** +Returns nonzero if the SQL NULL bit is set in nth field of rec. +@return nonzero if SQL NULL */ +UNIV_INLINE +ulint +rec_offs_nth_sql_null( +/*==================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_SQL_NULL); +} + +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + if (!n) { + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_MASK); + } + return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n]) + & REC_OFFS_MASK); +} + +/******************************************************//** +Returns the number of extern bits set in a record. +@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n = 0; + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + if (rec_offs_nth_extern(offsets, i)) { + n++; + } + } + } + + return(n); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. 
This function and the 2-byte counterpart are defined here because the +C-compiler was not able to sum negative and positive constant offsets, and +warned of constant arithmetic overflow within the compiler. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_2_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +1-byte format. */ +UNIV_INLINE +void +rec_1_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +2-byte format. */ +UNIV_INLINE +void +rec_2_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 1-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_1_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_1_get_prev_field_end_info(rec, n) + & ~REC_1BYTE_SQL_NULL_MASK); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 2-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_2_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_2_get_prev_field_end_info(rec, n) + & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK)); +} + +/******************************************************//** +The following function is used to read the offset of the start of a data field +in the record. 
The start of an SQL null field is the end offset of the +previous non-null field, or 0, if none exists. If n is the number of the last +field + 1, then the end offset of the last field is returned. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_get_field_start_offs( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + if (rec_get_1byte_offs_flag(rec)) { + + return(rec_1_get_field_start_offs(rec, n)); + } + + return(rec_2_get_field_start_offs(rec, n)); +} + +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. +@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ +{ + ulint os; + ulint next_os; + + os = rec_get_field_start_offs(rec, n); + next_os = rec_get_field_start_offs(rec, n + 1); + + ut_ad(next_os - os < UNIV_PAGE_SIZE); + + return(next_os - os); +} + +/***********************************************************//** +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null. +For records in ROW_FORMAT=COMPACT (new-style records), len must not be +UNIV_SQL_NULL unless the field already is SQL null. */ +UNIV_INLINE +void +rec_set_nth_field( +/*==============*/ + rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index number of the field */ + const void* data, /*!< in: pointer to the data + if not SQL null */ + ulint len) /*!< in: length of the data or UNIV_SQL_NULL */ +{ + byte* data2; + ulint len2; + + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (len == UNIV_SQL_NULL) { + if (!rec_offs_nth_sql_null(offsets, n)) { + ut_a(!rec_offs_comp(offsets)); + rec_set_nth_field_sql_null(rec, n); + } + + return; + } + + data2 = rec_get_nth_field(rec, offsets, n, &len2); + if (len2 == UNIV_SQL_NULL) { + ut_ad(!rec_offs_comp(offsets)); + rec_set_nth_field_null_bit(rec, n, FALSE); + ut_ad(len == rec_get_nth_field_size(rec, n)); + } else { + ut_ad(len2 == len); + } + + ut_memcpy(data2, data, len); +} + +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ut_ad(rec); + + return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec))); +} + +/**********************************************************//** +The following function sets the number of fields in offsets. 
*/ +UNIV_INLINE +void +rec_offs_set_n_fields( +/*==================*/ + ulint* offsets, /*!< in/out: array returned by + rec_get_offsets() */ + ulint n_fields) /*!< in: number of fields */ +{ + ut_ad(offsets); + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + offsets[1] = n_fields; +} + +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint size; + + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)] + & REC_OFFS_MASK; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} + +/**********************************************************//** +Returns the total size of record minus data size of record. The value +returned by the function is the distance from record start to record origin +in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint size; + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = *rec_offs_base(offsets) & ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL); + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} + +/**********************************************************//** +Returns the total size of a physical record. +@return size */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets)); +} + +#ifdef UNIV_DEBUG +/**********************************************************//** +Returns a pointer to the end of the record. +@return pointer to end */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets))); +} + +/**********************************************************//** +Returns a pointer to the start of the record. +@return pointer to start */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets))); +} +#endif /* UNIV_DEBUG */ + +/***************************************************************//** +Copies a physical record to a buffer. 
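+The buffer must be large enough for both the extra bytes and the data
+bytes; an allocation sketch:
+@code
+	byte*	buf = static_cast<byte*>(
+		mem_heap_alloc(heap, rec_offs_size(offsets)));
+	rec_t*	copy = rec_copy(buf, rec, offsets);
+@endcode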
+@return pointer to the origin of the copy */ +UNIV_INLINE +rec_t* +rec_copy( +/*=====*/ + void* buf, /*!< in: buffer */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint extra_len; + ulint data_len; + + ut_ad(rec && buf); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + + extra_len = rec_offs_extra_size(offsets); + data_len = rec_offs_data_size(offsets); + + ut_memcpy(buf, rec - extra_len, extra_len + data_len); + + return((byte*) buf + extra_len); +} + +/**********************************************************//** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. +@return extra size */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + ulint data_size, /*!< in: data size */ + ulint n_fields, /*!< in: number of fields */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) { + + return(REC_N_OLD_EXTRA_BYTES + n_fields); + } + + return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields); +} + +/**********************************************************//** +The following function returns the size of a data tuple when converted to +a physical record. +@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + ulint data_size; + ulint extra_size; + + ut_ad(index); + ut_ad(dtuple); + ut_ad(dtuple_check_typed(dtuple)); + + ut_ad(dict_index_is_univ(index) + || dtuple_get_n_fields(dtuple) + == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) + ? dict_index_get_n_unique_in_tree(index) + 1 + : dict_index_get_n_fields(index))); + + if (dict_table_is_comp(index->table)) { + return(rec_get_converted_size_comp(index, + dtuple_get_info_bits(dtuple) + & REC_NEW_STATUS_MASK, + dtuple->fields, + dtuple->n_fields, NULL)); + } + + data_size = dtuple_get_data_size(dtuple, 0); + + extra_size = rec_get_converted_extra_size( + data_size, dtuple_get_n_fields(dtuple), n_ext); + +#if 0 + /* This code is inactive since it may be the wrong place to add + in the size of node pointers used in parent pages AND it is not + currently needed since ha_innobase::max_supported_key_length() + ensures that the key size limit for each page size is well below + the actual limit ((free space on page / 4) - record overhead). + But those limits will need to be raised when InnoDB can + support multiple page sizes. At that time, we will need + to consider the node pointer on these universal btrees. */ + + if (dict_index_is_univ(index)) { + /* This is for the insert buffer B-tree. + All fields in the leaf tuple ascend to the + parent node plus the child page pointer. */ + + /* ibuf cannot contain externally stored fields */ + ut_ad(n_ext == 0); + + /* Add the data pointer and recompute extra_size + based on one more field. */ + data_size += REC_NODE_PTR_SIZE; + extra_size = rec_get_converted_extra_size( + data_size, + dtuple_get_n_fields(dtuple) + 1, + 0); + + /* Be sure dtuple->n_fields has this node ptr + accounted for. This function should correspond to + what rec_convert_dtuple_to_rec() needs in storage. 
+ In optimistic insert or update-not-in-place, we will + have to ensure that if the record is converted to a + node pointer, it will not become too large.*/ + } +#endif + + return(data_size + extra_size); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Folds a prefix of a physical record to a ulint. Folds only existing fields, +that is, checks that we do not run out of the record. +@return the folded value */ +UNIV_INLINE +ulint +rec_fold( +/*=====*/ + const rec_t* rec, /*!< in: the physical record */ + const ulint* offsets, /*!< in: array returned by + rec_get_offsets() */ + ulint n_fields, /*!< in: number of complete + fields to fold */ + ulint n_bytes, /*!< in: number of bytes to fold + in an incomplete last field */ + index_id_t tree_id) /*!< in: index tree id */ +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(n_fields + n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/rem0types.h b/storage/xtradb/include/rem0types.h new file mode 100644 index 00000000000..f8133f77466 --- /dev/null +++ b/storage/xtradb/include/rem0types.h @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0types.h +Record manager global types + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0types_h +#define rem0types_h + +/* We define the physical record simply as an array of bytes */ +typedef byte rec_t; + +/* Maximum values for various fields (for non-blob tuples) */ +#define REC_MAX_N_FIELDS (1024 - 1) +#define REC_MAX_HEAP_NO (2 * 8192 - 1) +#define REC_MAX_N_OWNED (16 - 1) + +/* Maximum number of user defined fields/columns. 
The reserved columns +are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR. +We need "* 2" because mlog_parse_index() creates a dummy table object +possibly, with some of the system columns in it, and then adds the 3 +system columns (again) using dict_table_add_system_columns(). The problem +is that mlog_parse_index() cannot recognize the system columns by +just having n_fields, n_uniq and the lengths of the columns. */ +#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2) + +/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum +indexed field length (or indexed prefix length) for indexes on tables of +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format. +Before we support UTF-8 encodings with mbmaxlen = 4, a UTF-8 character +may take at most 3 bytes. So the limit was set to 3*256, so that one +can create a column prefix index on 256 characters of a TEXT or VARCHAR +column also in the UTF-8 charset. +This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ +#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768 + +/** Maximum indexed field length for table format UNIV_FORMAT_B and +beyond. +This (3072) is the maximum index row length allowed, so we cannot create index +prefix column longer than that. */ +#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072 + +/** Innodb row types are a subset of the MySQL global enum row_type. +They are made into their own enum so that switch statements can account +for each of them. */ +enum rec_format_enum { + REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */ + REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */ + REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */ + REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */ +}; +typedef enum rec_format_enum rec_format_t; + +#endif diff --git a/storage/xtradb/include/row0ext.h b/storage/xtradb/include/row0ext.h new file mode 100644 index 00000000000..a098e2f9b29 --- /dev/null +++ b/storage/xtradb/include/row0ext.h @@ -0,0 +1,102 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.h +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#ifndef row0ext_h +#define row0ext_h + +#include "univ.i" +#include "row0types.h" +#include "data0types.h" +#include "mem0mem.h" +#include "dict0types.h" + +/********************************************************************//** +Creates a cache of column prefixes of externally stored columns. 
+@return own: column prefix cache */ +UNIV_INTERN +row_ext_t* +row_ext_create( +/*===========*/ + ulint n_ext, /*!< in: number of externally stored columns */ + const ulint* ext, /*!< in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + ulint flags, /*!< in: table->flags */ + const dtuple_t* tuple, /*!< in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). */ + mem_heap_t* heap); /*!< in: heap where created */ + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ + +/** Prefixes of externally stored columns */ +struct row_ext_t{ + ulint n_ext; /*!< number of externally stored columns */ + const ulint* ext; /*!< col_no's of externally stored columns */ + byte* buf; /*!< backing store of the column prefix cache */ + ulint max_len;/*!< maximum prefix length, it could be + REC_ANTELOPE_MAX_INDEX_COL_LEN or + REC_VERSION_56_MAX_INDEX_COL_LEN depending + on row format */ + ulint len[1]; /*!< prefix lengths; 0 if not cached */ +}; + +#ifndef UNIV_NONINL +#include "row0ext.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0ext.ic b/storage/xtradb/include/row0ext.ic new file mode 100644 index 00000000000..39e150d91d5 --- /dev/null +++ b/storage/xtradb/include/row0ext.ic @@ -0,0 +1,87 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.ic +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "rem0types.h" +#include "btr0types.h" + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ut_ad(ext); + ut_ad(len); + ut_ad(i < ext->n_ext); + + *len = ext->len[i]; + + ut_ad(*len <= ext->max_len); + ut_ad(ext->max_len > 0); + + if (*len == 0) { + /* The BLOB could not be fetched to the cache. */ + return(field_ref_zero); + } else { + return(ext->buf + i * ext->max_len); + } +} + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ulint i; + + ut_ad(ext); + ut_ad(len); + + for (i = 0; i < ext->n_ext; i++) { + if (col == ext->ext[i]) { + return(row_ext_lookup_ith(ext, i, len)); + } + } + + return(NULL); +} diff --git a/storage/xtradb/include/row0ftsort.h b/storage/xtradb/include/row0ftsort.h new file mode 100644 index 00000000000..4e04a099140 --- /dev/null +++ b/storage/xtradb/include/row0ftsort.h @@ -0,0 +1,279 @@ +/***************************************************************************** + +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ftsort.h
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#ifndef row0ftsort_h
+#define row0ftsort_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "row0mysql.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+#include "row0merge.h"
+
+/** This structure defines the information that the scan thread fetches
+and puts on the linked list for the parallel tokenization/sort threads
+to process */
+typedef struct fts_doc_item fts_doc_item_t;
+
+/** A document item fed to the parallel tokenization/sort threads */
+struct fts_doc_item {
+	dfield_t*	field;		/*!< field containing the document
+					string */
+	doc_id_t	doc_id;		/*!< document ID */
+	UT_LIST_NODE_T(fts_doc_item_t)	doc_list;
+					/*!< list of doc items */
+};
+
+/** This defines the list type with which the scan thread feeds the
+parallel tokenization threads and sort threads. */
+typedef UT_LIST_BASE_NODE_T(fts_doc_item_t)	fts_doc_list_t;
+
+#define FTS_NUM_AUX_INDEX	6
+#define FTS_PLL_MERGE		1
+
+/** Sort information passed to each individual parallel sort thread */
+struct fts_psort_t;
+
+/** Common info passed to each parallel sort thread */
+struct fts_psort_common_t {
+	row_merge_dup_t*	dup;	/*!< descriptor of FTS index */
+	dict_table_t*		new_table; /*!< source table */
+	trx_t*			trx;	/*!< transaction */
+	fts_psort_t*		all_info; /*!< all parallel sort info */
+	os_event_t		sort_event; /*!< sort event */
+	os_event_t		merge_event; /*!< merge event */
+	ibool			opt_doc_id_size;/*!< whether to use a 4 byte
+					instead of an 8 byte integer to
+					store the Doc ID during sort, if
+					the Doc ID will not be big enough
+					to need an 8 byte value */
+};
+
+struct fts_psort_t {
+	ulint			psort_id;	/*!< Parallel sort ID */
+	row_merge_buf_t*	merge_buf[FTS_NUM_AUX_INDEX];
+						/*!< sort buffer */
+	merge_file_t*		merge_file[FTS_NUM_AUX_INDEX];
+						/*!< sort file */
+	row_merge_block_t*	merge_block[FTS_NUM_AUX_INDEX];
+						/*!< buffer to write to file */
+	row_merge_block_t*	block_alloc[FTS_NUM_AUX_INDEX];
+						/*!< allocated buffer */
+	ulint			child_status;	/*!< child thread status */
+	ulint			state;		/*!< parent thread state */
+	fts_doc_list_t		fts_doc_list;	/*!< doc list to process */
+	fts_psort_common_t*	psort_common;	/*!< ptr to all psort info */
+	os_thread_t		thread_hdl;	/*!< thread handler */
+	dberr_t			error;		/*!< db error during psort */
+	ulint			memory_used;	/*!< memory used by
+						fts_doc_list */
+	ib_mutex_t		mutex;		/*!< mutex for fts_doc_list */
+};
+
+/** Structure storing information from the string tokenization operation */
+struct fts_tokenize_ctx {
+	ulint			processed_len;	/*!< processed string length */
+	ulint			init_pos;	/*!< doc start position */
+	ulint			buf_used;	/*!< the sort buffer (ID) when
+						tokenization stops, which
+						could be due to the sort
+						buffer being full */
+	ulint			rows_added[FTS_NUM_AUX_INDEX];
+						/*!< number of rows added for
+						each FTS index partition */
+	ib_rbt_t*		cached_stopword;/*!< in: stopword list */
+	dfield_t		sort_field[FTS_NUM_FIELDS_SORT];
+						/*!< in: sort field */
+};
+
+typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
+
+/** Structure stores information needed for the insertion phase of FTS
+parallel sort. */
+struct fts_psort_insert {
+	trx_t*		trx;		/*!< Transaction used for insertion */
+	que_t**		ins_graph;	/*!< insert graph */
+	fts_table_t	fts_table;	/*!< auxiliary table */
+	CHARSET_INFO*	charset;	/*!< charset info */
+	mem_heap_t*	heap;		/*!< heap */
+	ibool		opt_doc_id_size;/*!< Whether to use the smaller
+					(4 byte) integer for the Doc ID */
+};
+
+typedef struct fts_psort_insert fts_psort_insert_t;
+
+
+/** status bits used for communication between parent and child threads */
+#define FTS_PARENT_COMPLETE	1
+#define FTS_PARENT_EXITING	2
+#define FTS_CHILD_COMPLETE	1
+#define FTS_CHILD_EXITING	2
+
+/** Print some debug information */
+#define FTSORT_PRINT
+
+#ifdef FTSORT_PRINT
+#define DEBUG_FTS_SORT_PRINT(str)		\
+	do {					\
+		ut_print_timestamp(stderr);	\
+		fprintf(stderr, str);		\
+	} while (0)
+#else
+#define DEBUG_FTS_SORT_PRINT(str)
+#endif /* FTSORT_PRINT */
+
+/*************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID,
+3) Word's position in the original doc.
+
+@return dict_index_t structure for the fts sort index */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+	dict_index_t*		index,	/*!< in: Original FTS index
+					based on which this sort index
+					is created */
+	const dict_table_t*	table,	/*!< in: table that the FTS index
+					is being created on */
+	ibool*			opt_doc_id_size);
+					/*!< out: whether to use a 4 byte
+					instead of an 8 byte integer to
+					store the Doc ID during sort */
+
+/********************************************************************//**
+Initialize FTS parallel sort structures.
+@return TRUE if all successful */
+UNIV_INTERN
+ibool
+row_fts_psort_info_init(
+/*====================*/
+	trx_t*			trx,	/*!< in: transaction */
+	row_merge_dup_t*	dup,	/*!< in,own: descriptor of
+					FTS index being created */
+	const dict_table_t*	new_table,/*!< in: table where indexes are
+					created */
+	ibool			opt_doc_id_size,
+					/*!< in: whether to use a 4 byte
+					instead of an 8 byte integer to
+					store the Doc ID during sort */
+	fts_psort_t**		psort,	/*!< out: parallel sort info to be
+					instantiated */
+	fts_psort_t**		merge)	/*!< out: parallel merge info
+					to be instantiated */
+	__attribute__((nonnull));
+/********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close the
+temporary merge sort files */
+UNIV_INTERN
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	fts_psort_t*	merge_info);	/*!< parallel merge info */
+/********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
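+This entry point is handed to os_thread_create(); a launch sketch in
+the style of row_fts_start_psort() (thd_id being a local
+os_thread_id_t; the exact call site may differ):
+@code
+	psort_info[i].thread_hdl = os_thread_create(
+		fts_parallel_tokenization,
+		(void*) &psort_info[i], &thd_id);
+@endcode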
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+	void*		arg);	/*!< in: psort_info for the thread */
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+	void*		arg);	/*!< in: parallel merge info */
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+	fts_psort_t*	merge_info);	/*!< in: parallel sort info */
+/********************************************************************//**
+Read sorted FTS data files and insert data tuples to the auxiliary tables.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+	fts_psort_insert_t*
+			ins_ctx,	/*!< in: insert context */
+	fts_tokenizer_word_t*	word,	/*!< in: last processed
+					tokenized word */
+	ib_vector_t*	positions,	/*!< in: word position */
+	doc_id_t*	in_doc_id,	/*!< in: last item doc id */
+	dtuple_t*	dtuple);	/*!< in: entry to insert */
+/********************************************************************//**
+Propagate a newly added record up one level in the selection tree.
+@return parent where this value propagated to */
+UNIV_INTERN
+int
+row_merge_fts_sel_propagate(
+/*========================*/
+	int		propogated,	/*!< in: tree node propagated */
+	int*		sel_tree,	/*!< in: selection tree */
+	ulint		level,		/*!< in: selection tree level */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index);		/*!< in: FTS index */
+/********************************************************************//**
+Read a sorted file containing index data tuples and insert these data
+tuples into the index.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+	dict_index_t*	index,		/*!< in: index */
+	dict_table_t*	table,		/*!< in: new table */
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	ulint		id)		/*!< in: which auxiliary table's data
+					to insert to */
+	__attribute__((nonnull));
+#endif /* row0ftsort_h */
diff --git a/storage/xtradb/include/row0import.h b/storage/xtradb/include/row0import.h
new file mode 100644
index 00000000000..aa46fdb7c27
--- /dev/null
+++ b/storage/xtradb/include/row0import.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0import.h +Header file for import tablespace functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0import_h +#define row0import_h + +#include "univ.i" +#include "db0err.h" +#include "dict0types.h" + +// Forward declarations +struct trx_t; +struct dict_table_t; +struct row_prebuilt_t; + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct + in MySQL */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Update the DICT_TF2_DISCARDED flag in SYS_TABLES. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +row_import_update_discarded_flag( +/*=============================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + table_id_t table_id, /*!< in: Table for which we want + to set the root table->flags2 */ + bool discarded, /*!< in: set MIX_LEN column bit + to discarded, if true */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Update the (space, root page) of a table's indexes from the values +in the data dictionary. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_import_update_index_root( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + const dict_table_t* table, /*!< in: Table for which we want + to set the root page_no */ + bool reset, /*!< in: if true then set to + FIL_NUL */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_NONINL +#include "row0import.ic" +#endif + +#endif /* row0import_h */ diff --git a/storage/xtradb/include/row0import.ic b/storage/xtradb/include/row0import.ic new file mode 100644 index 00000000000..c5bbab49f6f --- /dev/null +++ b/storage/xtradb/include/row0import.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0import.ic + +Import tablespace inline functions. + +Created 2012-02-08 Sunny Bains +*******************************************************/ diff --git a/storage/xtradb/include/row0ins.h b/storage/xtradb/include/row0ins.h new file mode 100644 index 00000000000..2a892d2f5df --- /dev/null +++ b/storage/xtradb/include/row0ins.h @@ -0,0 +1,240 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ins.h +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0ins_h +#define row0ins_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_foreign_key_check_lock. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or +DB_ROW_IS_REFERENCED */ +UNIV_INTERN +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE If we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Creates an insert node struct. +@return own: insert node struct */ +UNIV_INTERN +ins_node_t* +ins_node_create( +/*============*/ + ulint ins_type, /*!< in: INS_VALUES, ... */ + dict_table_t* table, /*!< in: table where to insert */ + mem_heap_t* heap); /*!< in: mem heap where created */ +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. 
This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ +UNIV_INTERN +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row); /*!< in: new row (or first row) for the node */ +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread or NULL */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Tries to insert the externally stored fields (off-page columns) +of a clustered index entry. 
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +row_ins_index_entry_big_rec_func( +/*=============================*/ + const dtuple_t* entry, /*!< in/out: index entry to insert */ + const big_rec_t* big_rec,/*!< in: externally stored fields */ + ulint* offsets,/*!< in/out: rec offsets */ + mem_heap_t** heap, /*!< in/out: memory heap */ + dict_index_t* index, /*!< in: index */ + const char* file, /*!< in: file name of caller */ +#ifndef DBUG_OFF + const void* thd, /*!< in: connection, or NULL */ +#endif /* DBUG_OFF */ + ulint line) /*!< in: line number of caller */ + __attribute__((nonnull(1,2,3,4,5,6), warn_unused_result)); +#ifdef DBUG_OFF +# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \ + row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,line) +#else /* DBUG_OFF */ +# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \ + row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,thd,line) +#endif /* DBUG_OFF */ +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Inserts a row to a table. This is a high-level function used in +SQL execution graphs. 
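The optimistic-then-pessimistic contract spelled out for the two *_low() functions above reduces to a short retry loop. A minimal editorial sketch, not part of the patch: it assumes only the declarations in this header plus BTR_MODIFY_LEAF/BTR_MODIFY_TREE from btr0types.h and dict_index_get_n_unique() from dict0dict.h, and the wrapper name is hypothetical.

/* Editorial sketch only: the retry pattern that row_ins_clust_index_entry()
is documented to implement. Not part of the patch. */
static dberr_t
clust_insert_with_retry(
	dict_index_t*	index,	/* in: clustered index */
	dtuple_t*	entry,	/* in/out: index entry to insert */
	ulint		n_ext,	/* in: number of externally stored columns */
	que_thr_t*	thr)	/* in: query thread */
{
	/* Optimistic descent: latch only the leaf page. */
	dberr_t	err = row_ins_clust_index_entry_low(
		0, BTR_MODIFY_LEAF, index, dict_index_get_n_unique(index),
		entry, n_ext, thr);

	if (err == DB_FAIL) {
		/* The leaf page could not accommodate the entry:
		retry with a pessimistic descent that may split pages. */
		err = row_ins_clust_index_entry_low(
			0, BTR_MODIFY_TREE, index,
			dict_index_get_n_unique(index), entry, n_ext, thr);
	}

	return(err);
}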
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/* Insert node structure */
+
+struct ins_node_t{
+	que_common_t	common;	/*!< node type: QUE_NODE_INSERT */
+	ulint		ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+	dtuple_t*	row;	/*!< row to insert */
+	dict_table_t*	table;	/*!< table where to insert */
+	sel_node_t*	select;	/*!< select in searched insert */
+	que_node_t*	values_list;/* list of expressions to evaluate and
+				insert in an INS_VALUES insert */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index where the index
+				entry should be inserted */
+	dtuple_t*	entry;	/*!< NULL, or entry to insert in the index;
+				after a successful insert of the entry,
+				this should be reset to NULL */
+	UT_LIST_BASE_NODE_T(dtuple_t)
+			entry_list;/* list of entries, one for each index */
+	byte*		row_id_buf;/* buffer for the row id sys field in row */
+	trx_id_t	trx_id;	/*!< trx id of the last trx which executed the
+				node */
+	byte*		trx_id_buf;/* buffer for the trx id sys field in row */
+	mem_heap_t*	entry_sys_heap;
+				/* memory heap used as auxiliary storage;
+				entry_list and sys fields are stored here;
+				if this is NULL, entry list should be created
+				and buffers for sys fields in row allocated */
+	ulint		magic_n;
+};
+
+#define	INS_NODE_MAGIC_N	15849075
+
+/* Insert node types */
+#define	INS_SEARCHED	0	/* INSERT INTO ... SELECT ... */
+#define	INS_VALUES	1	/* INSERT INTO ... VALUES ... */
+#define	INS_DIRECT	2	/* this is for internal use in dict0crea:
+				insert the row directly */
+
+/* Node execution states */
+#define	INS_NODE_SET_IX_LOCK	1	/* we should set an IX lock on table */
+#define	INS_NODE_ALLOC_ROW_ID	2	/* row id should be allocated */
+#define	INS_NODE_INSERT_ENTRIES	3	/* index entries should be built and
+					inserted */
+
+#ifndef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#endif /* row0ins_h */
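A minimal sketch of the INS_DIRECT path, not part of the patch (the helper name is hypothetical): the caller builds the row itself and attaches it with ins_node_set_new_row(), which is exactly the rare and relatively slow case noted in the comment on that function.

/* Editorial sketch only: constructing an INS_DIRECT insert node for a
row built by the caller, using the declarations above. */
static ins_node_t*
make_direct_insert_node(
	dict_table_t*	table,	/* in: table where to insert */
	dtuple_t*	row,	/* in: row constructed by the caller */
	mem_heap_t*	heap)	/* in: heap where the node is created */
{
	ins_node_t*	node = ins_node_create(INS_DIRECT, table, heap);

	/* Attach the separately constructed row to the node. */
	ins_node_set_new_row(node, row);

	return(node);
}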
diff --git a/storage/xtradb/include/row0ins.ic b/storage/xtradb/include/row0ins.ic
new file mode 100644
index 00000000000..9c191d869a2
--- /dev/null
+++ b/storage/xtradb/include/row0ins.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.ic
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/xtradb/include/row0log.h b/storage/xtradb/include/row0log.h
new file mode 100644
index 00000000000..62715fe8808
--- /dev/null
+++ b/storage/xtradb/include/row0log.h
@@ -0,0 +1,239 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.h
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#ifndef row0log_h
+#define row0log_h
+
+#include "univ.i"
+#include "mtr0types.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@retval true on success, false on failure */
+UNIV_INTERN
+bool
+row_log_allocate(
+/*=============*/
+	dict_index_t*	index,	/*!< in/out: index */
+	dict_table_t*	table,	/*!< in/out: new table being rebuilt,
+				or NULL when creating a secondary index */
+	bool		same_pk,/*!< in: whether the definition of the
+				PRIMARY KEY has remained the same */
+	const dtuple_t*	add_cols,
+				/*!< in: default values of
+				added columns, or NULL */
+	const ulint*	col_map)/*!< in: mapping of old column
+				numbers to new ones, or NULL if !table */
+	__attribute__((nonnull(1), warn_unused_result));
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+UNIV_INTERN
+void
+row_log_free(
+/*=========*/
+	row_log_t*&	log)	/*!< in,own: row log */
+	__attribute__((nonnull));
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*==============*/
+	dict_index_t*	index)	/*!< in/out: index (x-latched) */
+	__attribute__((nonnull));
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval true if the operation was logged or can be ignored
+@retval false if online index creation is not taking place */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: index, S or X latched */
+	const dtuple_t*	tuple,	/*!< in: index tuple */
+	trx_id_t	trx_id)	/*!< in: transaction ID for insert,
+				or 0 for delete */
+	__attribute__((nonnull, warn_unused_result));
+/******************************************************//**
+Logs an operation to a secondary index that is (or was) being created. */
+UNIV_INTERN
+void
+row_log_online_op(
+/*==============*/
+	dict_index_t*	index,	/*!< in/out: index, S or X latched */
+	const dtuple_t*	tuple,	/*!< in: index tuple */
+	trx_id_t	trx_id)	/*!< in: transaction ID for insert,
+				or 0 for delete */
+	UNIV_COLD __attribute__((nonnull));
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_log_table_get_error(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: clustered index of a table
+					that is being rebuilt online */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+UNIV_INTERN
+void
+row_log_table_delete(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const byte*	sys)	/*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+				be logged, or NULL to use those in rec */
+	UNIV_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+UNIV_INTERN
+void
+row_log_table_update(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const dtuple_t*	old_pk)	/*!< in: row_log_table_get_pk()
+				before the update */
+	UNIV_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+UNIV_INTERN
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec,index),
+				or NULL */
+	byte*		sys,	/*!< out: DB_TRX_ID,DB_ROLL_PTR for
+				row_log_table_delete(), or NULL */
+	mem_heap_t**	heap)	/*!< in/out: memory heap where allocated */
+	UNIV_COLD __attribute__((nonnull(1,2,5), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+UNIV_INTERN
+void
+row_log_table_insert(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec,index) */
+	UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Notes that a BLOB is being freed during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_free(
+/*====================*/
+	dict_index_t*	index,	/*!< in/out: clustered index, X-latched */
+	ulint		page_no)/*!< in: starting page number of the BLOB */
+	UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Notes that a BLOB is being allocated during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_alloc(
+/*=====================*/
+	dict_index_t*	index,	/*!< in/out: clustered index, X-latched */
+	ulint		page_no)/*!< in: starting page number of the BLOB */
+	UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Apply the row_log_table log to a table upon completing rebuild.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_table_apply(
+/*================*/
+	que_thr_t*	thr,	/*!< in: query graph */
+	dict_table_t*	old_table,
+				/*!< in: old table */
+	struct TABLE*	table)	/*!< in/out: MySQL table
+				(for reporting duplicates) */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+UNIV_INTERN
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+	dict_index_t*	index)	/*!< in: index, must be locked */
+	__attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Merge the row log to the index upon completing index creation.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_apply(
+/*==========*/
+	trx_t*		trx,	/*!< in: transaction (for checking if
+				the operation was interrupted) */
+	dict_index_t*	index,	/*!< in/out: secondary index */
+	struct TABLE*	table)	/*!< in/out: MySQL table
+				(for reporting duplicates) */
+	__attribute__((nonnull, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "row0log.ic"
+#endif
+
+#endif /* row0log_h */
diff --git a/storage/xtradb/include/row0log.ic b/storage/xtradb/include/row0log.ic
new file mode 100644
index 00000000000..b0f37dbd8e7
--- /dev/null
+++ b/storage/xtradb/include/row0log.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.ic
+Modification log for online index creation and online table rebuild
+
+Created 2012-10-18 Marko Makela
+*******************************************************/
+
+#include "dict0dict.h"
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*==============*/
+	dict_index_t*	index)	/*!< in/out: index (x-latched) */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(!dict_index_is_clust(index));
+	dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+	row_log_free(index->online_log);
+}
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval true if the operation was logged or can be ignored
+@retval false if online index creation is not taking place */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: index, S or X latched */
+	const dtuple_t*	tuple,	/*!< in: index tuple */
+	trx_id_t	trx_id)	/*!< in: transaction ID for insert,
+				or 0 for delete */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
+	      || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	switch (dict_index_get_online_status(index)) {
+	case ONLINE_INDEX_COMPLETE:
+		/* This is a normal index. Do not log anything.
+		The caller must perform the operation on the
+		index tree directly. */
+		return(false);
+	case ONLINE_INDEX_CREATION:
+		/* The index is being created online. Log the
+		operation. */
+		row_log_online_op(index, tuple, trx_id);
+		break;
+	case ONLINE_INDEX_ABORTED:
+	case ONLINE_INDEX_ABORTED_DROPPED:
+		/* The index was created online, but the operation was
+		aborted. Do not log the operation and tell the caller
+		to skip the operation. */
+		break;
+	}
+
+	return(true);
+}
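The caller side of the row_log_online_op_try() contract reads naturally off the switch above. A minimal sketch, not part of the patch; apply_to_index_tree() is a hypothetical stand-in for the real B-tree insert/delete path:

/* Editorial sketch only: how a caller combines row_log_online_op_try()
with the direct index-tree path. */
static void
apply_to_index_tree(dict_index_t* index, const dtuple_t* tuple,
		    trx_id_t trx_id);	/* hypothetical stand-in */

static void
apply_sec_index_op(
	dict_index_t*	index,	/* in/out: secondary index, S or X latched */
	const dtuple_t*	tuple,	/* in: index entry */
	trx_id_t	trx_id)	/* in: nonzero for insert, 0 for delete */
{
	if (!row_log_online_op_try(index, tuple, trx_id)) {
		/* ONLINE_INDEX_COMPLETE: a normal index; the operation
		must be performed on the index tree directly. */
		apply_to_index_tree(index, tuple, trx_id);
	}
	/* Otherwise the operation was logged (ONLINE_INDEX_CREATION)
	or may be skipped because creation was aborted
	(ONLINE_INDEX_ABORTED, ONLINE_INDEX_ABORTED_DROPPED). */
}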
diff --git a/storage/xtradb/include/row0merge.h b/storage/xtradb/include/row0merge.h
new file mode 100644
index 00000000000..390c0ce038b
--- /dev/null
+++ b/storage/xtradb/include/row0merge.h
@@ -0,0 +1,430 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#ifndef row0merge_h
+#define row0merge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "read0types.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+#include "srv0srv.h"
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as UNIV_PAGE_SIZE / 2. */
+typedef byte	row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte	mrec_t;
+
+/** Merge record in row_merge_buf_t */
+struct mtuple_t {
+	dfield_t*	fields;		/*!< data fields */
+};
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_t {
+	mem_heap_t*	heap;		/*!< memory heap where allocated */
+	dict_index_t*	index;		/*!< the index the tuples belong to */
+	ulint		total_size;	/*!< total amount of data bytes */
+	ulint		n_tuples;	/*!< number of data tuples */
+	ulint		max_tuples;	/*!< maximum number of data tuples */
+	mtuple_t*	tuples;		/*!< array of data tuples */
+	mtuple_t*	tmp_tuples;	/*!< temporary copy of tuples,
+					for sorting */
+};
+
+/** Information about temporary files used in merge sort */
+struct merge_file_t {
+	int		fd;		/*!< file descriptor */
+	ulint		offset;		/*!< file offset (end of file) */
+	ib_uint64_t	n_rec;		/*!< number of records in the file */
+};
+
+/** Index field definition */
+struct index_field_t {
+	ulint		col_no;		/*!< column offset */
+	ulint		prefix_len;	/*!< column prefix length, or 0
+					if indexing the whole column */
+};
+
+/** Definition of an index being created */
+struct index_def_t {
+	const char*	name;		/*!< index name */
+	ulint		ind_type;	/*!< 0, DICT_UNIQUE,
+					or DICT_CLUSTERED */
+	ulint		key_number;	/*!< MySQL key number,
+					or ULINT_UNDEFINED if none */
+	ulint		n_fields;	/*!< number of fields in index */
+	index_field_t*	fields;		/*!< field definitions */
+};
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_t {
+	dict_index_t*		index;	/*!< index being sorted */
+	struct TABLE*		table;	/*!< MySQL table object */
+	const ulint*		col_map;/*!< mapping of column numbers
+					in table to the rebuilt table
+					(index->table), or NULL if not
+					rebuilding table */
+	ulint			n_dup;	/*!< number of duplicates */
+};
+
+/*************************************************************//**
+Report a duplicate key. */
+UNIV_INTERN
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_lock_table(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	dict_table_t*	table,	/*!< in: table to lock */
+	enum lock_mode	mode)	/*!< in: LOCK_X or LOCK_S */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes_dict(
+/*========================*/
+	trx_t*		trx,	/*!< in/out: dictionary transaction */
+	table_id_t	table_id)/*!< in: table identifier */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Drop those indexes which were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	dict_table_t*	table,	/*!< in/out: table containing the indexes */
+	ibool		locked)	/*!< in: TRUE=table locked,
+				FALSE=may need to do a lazy drop */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void);
+/*=============================*/
+
+/*********************************************************************//**
+Creates a temporary merge file and, if UNIV_PFS_IO is defined, registers
+the file descriptor with Performance Schema.
+@return file descriptor */
+UNIV_INTERN
+int
+row_merge_file_create_low(void)
+/*===========================*/
+	__attribute__((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file and, if UNIV_PFS_IO is defined, de-register the file
+from Performance Schema. */
+UNIV_INTERN
+void
+row_merge_file_destroy_low(
+/*=======================*/
+	int		fd);	/*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Provide a new pathname for a table that is being renamed if it belongs to
+a file-per-table tablespace. The caller is responsible for freeing the
+memory allocated for the return value.
+@return new pathname of tablespace file, or NULL if space = 0 */
+UNIV_INTERN
+char*
+row_make_new_pathname(
+/*==================*/
+	dict_table_t*	table,		/*!< in: table to be renamed */
+	const char*	new_name);	/*!< in: new name */
+/*********************************************************************//**
+Rename the tables in the data dictionary. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_rename_tables_dict(
+/*=========================*/
+	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
+					tmp_name */
+	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
+					old_table->name */
+	const char*	tmp_name,	/*!< in: new name for old_table */
+	trx_t*		trx)		/*!< in/out: dictionary transaction */
+	__attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Rename an index in the dictionary that is to be dropped. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_drop(
+/*===========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Create the index and load it into the dictionary.
+@return index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+	trx_t*		trx,	/*!< in/out: trx (sets error_state) */
+	dict_table_t*	table,	/*!< in: the index is on this table */
+	const index_def_t*	index_def);
+				/*!< in: the index definition */
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return TRUE if index can be used by the transaction else FALSE */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index);	/*!< in: index to check */
+/*********************************************************************//**
+Drop a table. The caller must have ensured that the background stats
+thread is not processing the table. This can be done by calling
+dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
+before calling this function.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_drop_table(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table)		/*!< in: table instance to drop */
+	__attribute__((nonnull));
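One easily missed detail of the I/O helpers declared further down in this header: row_merge_read() and row_merge_write() take offsets counted in row_merge_block_t units, not in bytes. A minimal sketch, not part of the patch (the function name is hypothetical):

/* Editorial sketch only: copying one block between merge files;
offsets are block numbers, the helpers do the byte arithmetic. */
static ibool
copy_merge_block(
	int			src_fd,	/* in: source file descriptor */
	ulint			src_off,/* in: source offset, in blocks */
	int			dst_fd,	/* in: destination file descriptor */
	ulint			dst_off,/* in: destination offset, in blocks */
	row_merge_block_t*	block)	/* in/out: I/O buffer */
{
	return(row_merge_read(src_fd, src_off, block)
	       && row_merge_write(dst_fd, dst_off, block));
}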
+/*********************************************************************//**
+Build indexes on a table by reading a clustered index,
+creating a temporary file containing index entries, merge sorting
+these index entries and inserting sorted index entries to indexes.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_build_indexes(
+/*====================*/
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	old_table,	/*!< in: table where rows are
+					read from */
+	dict_table_t*	new_table,	/*!< in: table where indexes are
+					created; identical to old_table
+					unless creating a PRIMARY KEY */
+	bool		online,		/*!< in: true if creating indexes
+					online */
+	dict_index_t**	indexes,	/*!< in: indexes to be created */
+	const ulint*	key_numbers,	/*!< in: MySQL key numbers */
+	ulint		n_indexes,	/*!< in: size of indexes[] */
+	struct TABLE*	table,		/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+	const dtuple_t*	add_cols,	/*!< in: default values of
+					added columns, or NULL */
+	const ulint*	col_map,	/*!< in: mapping of old column
+					numbers to new ones, or NULL
+					if old_table == new_table */
+	ulint		add_autoinc,	/*!< in: number of added
+					AUTO_INCREMENT column, or
+					ULINT_UNDEFINED if none is added */
+	ib_sequence_t&	sequence)	/*!< in/out: autoinc sequence */
+	__attribute__((nonnull(1,2,3,5,6,8), warn_unused_result));
+/********************************************************************//**
+Write a buffer to a block. */
+UNIV_INTERN
+void
+row_merge_buf_write(
+/*================*/
+	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
+	const merge_file_t*	of,	/*!< in: output file */
+	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
+	__attribute__((nonnull));
+/********************************************************************//**
+Sort a buffer. */
+UNIV_INTERN
+void
+row_merge_buf_sort(
+/*===============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	row_merge_dup_t*	dup)	/*!< in/out: reporter of duplicates
+					(NULL if non-unique index) */
+	__attribute__((nonnull(1)));
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_write(
+/*============*/
+	int		fd,	/*!< in: file descriptor */
+	ulint		offset,	/*!< in: offset where to write,
+				in number of row_merge_block_t elements */
+	const void*	buf);	/*!< in: data */
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
+	__attribute__((warn_unused_result, nonnull));
+/*********************************************************************//**
+Create a merge file.
+@return file descriptor, or -1 on failure */
+UNIV_INTERN
+int
+row_merge_file_create(
+/*==================*/
+	merge_file_t*	merge_file)	/*!< out: merge file structure */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Merge disk files.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,	/*!< in: transaction */
+	const row_merge_dup_t*	dup,	/*!< in: descriptor of
+					index being created */
+	merge_file_t*		file,	/*!< in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	int*			tmpfd)	/*!< in/out: temporary file handle */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+	dict_index_t*	index)	/*!< in: secondary index */
+	__attribute__((warn_unused_result, nonnull, malloc));
+/*********************************************************************//**
+Deallocate a sort buffer. */
+UNIV_INTERN
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer to be freed */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Destroy a merge file. */
+UNIV_INTERN
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/*!< in/out: merge file structure */
+	__attribute__((nonnull));
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_read(
+/*===========*/
+	int			fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read,
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf);	/*!< out: data */
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+UNIV_INTERN
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	ulint*			offsets)/*!< out: offsets of mrec */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* row0merge_h */
diff --git a/storage/xtradb/include/row0mysql.h b/storage/xtradb/include/row0mysql.h
new file mode 100644
index 00000000000..06c07002c2b
--- /dev/null
+++ b/storage/xtradb/include/row0mysql.h
@@ -0,0 +1,915 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Also contains create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0pcur.h"
+
+// Forward declaration
+struct SysIndexCallback;
+
+extern ibool row_rollback_on_timeout;
+
+struct row_prebuilt_t;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct of a
+					ha_innobase table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+	byte*	dest,	/*!< in: where to store */
+	ulint	len,	/*!< in: length, must fit in two bytes */
+	ulint	lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+	ulint*		len,	/*!< out: variable-length field length */
+	const byte*	field,	/*!< in: field in the MySQL format */
+	ulint		lenlen);/*!< in: storage length of len: either 1
+				or 2 bytes */
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+	byte*		dest,	/*!< in: where to store */
+	ulint		col_len,/*!< in: dest buffer size: determines into
+				how many bytes the BLOB length is stored,
+				the space for the length may vary from 1
+				to 4 bytes */
+	const void*	data,	/*!< in: BLOB data; if the value to store
+				is SQL NULL this should be NULL pointer */
+	ulint		len);	/*!< in: BLOB length; if the value to store
+				is SQL NULL this should be 0; remember
+				also to set the NULL bit in the MySQL record
+				header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+	ulint*		len,		/*!< out: BLOB length */
+	const byte*	ref,		/*!< in: BLOB reference in the
+					MySQL format */
+	ulint		col_len);	/*!< in: BLOB reference length
+					(not BLOB length) */
+/**************************************************************//**
+Pad a column with spaces. */
+UNIV_INTERN
+void
+row_mysql_pad_col(
+/*==============*/
+	ulint	mbminlen,	/*!< in: minimum size of a character,
+				in bytes */
+	byte*	pad,		/*!< out: padded buffer */
+	ulint	len);		/*!< in: number of bytes to pad */
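The length prefix handled by row_mysql_store_true_var_len() and row_mysql_read_true_varchar() above is one or two bytes, stored little-endian, per the >= 5.0.3 true VARCHAR row format. A hedged restatement of that encoding, not part of the patch (the function name is hypothetical, and the little-endian layout is an assumption based on the MySQL row format):

/* Editorial sketch only: the 1- or 2-byte little-endian length prefix
of a >= 5.0.3 true VARCHAR column. */
static byte*
store_true_var_len_sketch(
	byte*	dest,	/* in: where to store the length prefix */
	ulint	len,	/* in: data length; must fit in two bytes */
	ulint	lenlen)	/* in: 1 or 2, size of the length prefix */
{
	if (lenlen == 2) {
		/* Two-byte little-endian length, used when the maximum
		byte length of the column exceeds 255. */
		dest[0] = (byte) (len & 0xFF);
		dest[1] = (byte) (len >> 8);
	} else {
		/* One-byte length; lenlen must be 1 here. */
		dest[0] = (byte) len;
	}

	/* The column data itself follows the length prefix. */
	return(dest + lenlen);
}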
+@return up to which byte we used buf in the conversion */ +UNIV_INTERN +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp); /*!< in: nonzero=compact format */ +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread */ +UNIV_INTERN +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + rollback, or the old error which was + during the function entry */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ + __attribute__((nonnull(1,2))); +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. +@return own: a prebuilt struct */ +UNIV_INTERN +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len); /*!< in: length in bytes of a row in + the MySQL format */ +/********************************************************************//** +Free a prebuilt struct for a MySQL table handle. */ +UNIV_INTERN +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */ + ibool dict_locked); /*!< in: TRUE=data dictionary locked */ +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +UNIV_INTERN +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx); /*!< in: transaction handle */ +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. 
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets a table lock on the table mentioned in prebuilt. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_lock_table_for_mysql( +/*=====================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL + table handle */ + dict_table_t* table, /*!< in: table to lock, or NULL + if prebuilt->table should be + locked as + prebuilt->select_lock_type */ + ulint mode) /*!< in: lock mode of table + (ignored if table==NULL) */ + __attribute__((nonnull(1))); +/*********************************************************************//** +Does an insert for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_insert_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: row in the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +UNIV_INTERN +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +UNIV_INTERN +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/*********************************************************************//** +Checks if a table is such that we automatically created a clustered +index on it (on row id). +@return TRUE if the clustered index was generated automatically */ +UNIV_INTERN +ibool +row_table_got_default_clust_index( +/*==============================*/ + const dict_table_t* table); /*!< in: table */ +/*********************************************************************//** +Does an update or delete of a row for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_update_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +This can only be used when srv_locks_unsafe_for_binlog is TRUE or this +session is using a READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_for_mysql() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. 
*/ +UNIV_INTERN +void +row_unlock_for_mysql( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs)/*!< in: TRUE if called + so that we have the latches on + the records under pcur and + clust_pcur, and we do not need + to reposition the cursors. */ + __attribute__((nonnull)); +/*********************************************************************//** +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. +@return true if temporary table */ +UNIV_INTERN +bool +row_is_mysql_tmp_table_name( +/*========================*/ + const char* name) __attribute__((warn_unused_result)); + /*!< in: table name in the form + 'database/tablename' */ + +/*********************************************************************//** +Creates an query graph node of 'update' type to be used in the MySQL +interface. +@return own: update node */ +UNIV_INTERN +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap); /*!< in: mem heap from which allocated */ +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Locks the data dictionary exclusively for performing a table create or other +data dictionary modification operation. */ +UNIV_INTERN +void +row_mysql_lock_data_dictionary_func( +/*================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + ulint line); /*!< in: line number */ +#define row_mysql_lock_data_dictionary(trx) \ + row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__) +/*********************************************************************//** +Unlocks the data dictionary exclusive lock. */ +UNIV_INTERN +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Locks the data dictionary in shared mode from modifications, for performing +foreign key check, rollback, or other operation invisible to MySQL. */ +UNIV_INTERN +void +row_mysql_freeze_data_dictionary_func( +/*==================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + ulint line); /*!< in: line number */ +#define row_mysql_freeze_data_dictionary(trx) \ + row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__) +/*********************************************************************//** +Unlocks the data dictionary shared lock. */ +UNIV_INTERN +void +row_mysql_unfreeze_data_dictionary( +/*===============================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a table for MySQL. 
If the name of the table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also start the printing of monitor +output by the master thread. If the table name ends in "innodb_mem_validate", +InnoDB will try to invoke mem_validate(). On failure the transaction will +be rolled back. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true, commit the transaction */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. +@return error number or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths) /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ + __attribute__((nonnull(1,2), warn_unused_result)); +/*********************************************************************//** +Scans a table create SQL string and adds to the data dictionary +the foreign key constraints declared in the string. This function +should be called after the indexes for a table have been created. +Each foreign key constraint must be accompanied with indexes in +bot participating tables. The indexes are allowed to contain more +fields than mentioned in the constraint. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_table_add_foreign_constraints( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + const char* sql_string, /*!< in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES table2(c, d), + table2 can be written also with the + database name before it: test.table2 */ + size_t sql_length, /*!< in: length of sql_string */ + const char* name, /*!< in: table full name in the + normalized form + database_name/table_name */ + ibool reject_fks) /*!< in: if TRUE, fail with error + code DB_CANNOT_ADD_CONSTRAINT if + any foreign keys are found. */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +The master thread in srv0srv.cc calls this regularly to drop tables which +we must drop in background after queries to them have ended. Such lazy +dropping of tables is needed in ALTER TABLE on Unix. +@return how many tables dropped + remaining tables in list */ +UNIV_INTERN +ulint +row_drop_tables_for_mysql_in_background(void); +/*=========================================*/ +/*********************************************************************//** +Get the background drop list length. NOTE: the caller must own the kernel +mutex! 
+@return how many tables in list */ +UNIV_INTERN +ulint +row_get_background_drop_list_len_low(void); +/*======================================*/ +/*********************************************************************//** +Sets an exclusive lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */ + const char* op_info) /*!< in: string for trx->op_info */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Truncates a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_truncate_table_for_mysql( +/*=========================*/ + dict_table_t* table, /*!< in: table handle */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Drops a table for MySQL. If the name of the dropped table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also stop the printing of monitor +output by the master thread. If the data dictionary was not already locked +by the transaction, the transaction will be committed. Otherwise, the +data dictionary will remain locked. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_drop_table_for_mysql( +/*=====================*/ + const char* name, /*!< in: table name */ + trx_t* trx, /*!< in: dictionary transaction handle */ + bool drop_db,/*!< in: true=dropping whole database */ + bool nonatomic = true) + /*!< in: whether it is permitted + to release and reacquire dict_operation_lock */ + __attribute__((nonnull)); +/*********************************************************************//** +Drop all temporary tables during crash recovery. */ +UNIV_INTERN +void +row_mysql_drop_temp_tables(void); +/*============================*/ + +/*********************************************************************//** +Discards the tablespace of a table which is stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set TRUE. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_discard_tablespace_for_mysql( +/*=============================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_tablespace_for_mysql( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Drops a database for MySQL.
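
[Editor's aside] row_mysql_lock_table() and row_truncate_table_for_mysql() above are naturally used together: a truncate first takes an exclusive table lock, then rebuilds the table. A hedged sketch, with trx and table assumed to come from the caller:

        dberr_t err = row_mysql_lock_table(
                trx, table, LOCK_X, "setting table lock for TRUNCATE");

        if (err == DB_SUCCESS) {
                err = row_truncate_table_for_mysql(table, trx);
        }
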
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_drop_database_for_mysql( +/*========================*/ + const char* name, /*!< in: database name which ends in '/' */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull)); +/*********************************************************************//** +Renames a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: whether to commit trx */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Checks that the index contains entries in ascending order, that the unique +constraint is not broken, and calculates the number of index entries +in the read view of the current transaction. +@return true if ok */ +UNIV_INTERN +bool +row_check_index_for_mysql( +/*======================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct + in MySQL handle */ + const dict_index_t* index, /*!< in: index */ + ulint* n_rows) /*!< out: number of entries + seen in the consistent read */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Determines if a table is a magic monitor table. +@return true if monitor table */ +UNIV_INTERN +bool +row_is_magic_monitor_table( +/*=======================*/ + const char* table_name) /*!< in: name of the table, in the + form database/table_name */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Initialize this module */ +UNIV_INTERN +void +row_mysql_init(void); +/*================*/ + +/*********************************************************************//** +Close this module */ +UNIV_INTERN +void +row_mysql_close(void); +/*=================*/ + +/*********************************************************************//** +Reassigns the table identifier of a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_table_id_reassign( +/*========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx, /*!< in/out: transaction */ + table_id_t* new_id) /*!< out: new table id */ + __attribute__((nonnull, warn_unused_result)); + +/* A struct describing a place for an individual column in the MySQL +row format which is presented to the table handler in ha_innobase. +This template struct is used to speed up row transformations between +Innobase and MySQL.
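
[Editor's aside] row_check_index_for_mysql() above is the worker behind CHECK TABLE: each index is scanned in a consistent read and the per-index row counts are compared. A hedged sketch of such a loop (dict_table_get_first_index()/dict_table_get_next_index() are the usual dictionary iterators; error reporting elided):

        ulint n_rows;
        ulint n_rows_in_table = ULINT_UNDEFINED;

        for (const dict_index_t* index
                     = dict_table_get_first_index(prebuilt->table);
             index != NULL;
             index = dict_table_get_next_index(index)) {

                if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) {
                        /* entries out of order, or unique constraint
                        violation, in this index */
                }

                if (n_rows_in_table == ULINT_UNDEFINED) {
                        n_rows_in_table = n_rows;   /* clustered index */
                } else if (n_rows != n_rows_in_table) {
                        /* secondary index row count differs from the
                        clustered index */
                }
        }
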
*/ + +struct mysql_row_templ_t { + ulint col_no; /*!< column number of the column */ + ulint rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint clust_rec_field_no; /*!< field number of the column in an + Innobase record in the clustered index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint icp_rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined unless + index condition pushdown is used */ + ulint mysql_col_offset; /*!< offset of the column in the MySQL + row format */ + ulint mysql_col_len; /*!< length of the column in the MySQL + row format */ + ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a + MySQL record */ + ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit, + zero if column cannot be NULL */ + ulint type; /*!< column type in Innobase mtype + numbers DATA_CHAR... */ + ulint mysql_type; /*!< MySQL type code; this is always + < 256 */ + ulint mysql_length_bytes; /*!< if mysql_type + == DATA_MYSQL_TRUE_VARCHAR, this tells + whether we should use 1 or 2 bytes to + store the MySQL true VARCHAR data + length at the start of row in the MySQL + format (NOTE that the MySQL key value + format always uses 2 bytes for the data + len) */ + ulint charset; /*!< MySQL charset-collation code + of the column, or zero */ + ulint mbminlen; /*!< minimum length of a char, in bytes, + or zero if not a char type */ + ulint mbmaxlen; /*!< maximum length of a char, in bytes, + or zero if not a char type */ + ulint is_unsigned; /*!< if a column type is an integer + type and this field is != 0, then + it is an unsigned integer type */ +}; + +#define MYSQL_FETCH_CACHE_SIZE 8 +/* After fetching this many rows, we start caching them in fetch_cache */ +#define MYSQL_FETCH_CACHE_THRESHOLD 4 + +#define ROW_PREBUILT_ALLOCATED 78540783 +#define ROW_PREBUILT_FREED 26423527 + +/** A struct for (sometimes lazily) prebuilt structures in an Innobase table +handle used within MySQL; these are used to save CPU time. */ + +struct row_prebuilt_t { + ulint magic_n; /*!< this magic number is set to + ROW_PREBUILT_ALLOCATED when created, + or ROW_PREBUILT_FREED when the + struct has been freed */ + dict_table_t* table; /*!< Innobase table handle */ + dict_index_t* index; /*!< current index for a search, if + any */ + trx_t* trx; /*!< current transaction handle */ + unsigned sql_stat_start:1;/*!< TRUE when we start processing of + an SQL statement: we may have to set + an intention lock on the table, + create a consistent read view etc. 
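
[Editor's aside] The mysql_null_byte_offset/mysql_null_bit_mask pair above encodes where a column's SQL NULL flag lives in the MySQL row format; a zero mask means the column is declared NOT NULL. A hedged helper showing how the two fields are meant to be combined:

        static inline bool
        templ_col_is_null(
                const mysql_row_templ_t*        templ,
                const byte*                     mysql_rec)
        {
                return(templ->mysql_null_bit_mask != 0
                       && (mysql_rec[templ->mysql_null_byte_offset]
                           & templ->mysql_null_bit_mask));
        }
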
*/ + unsigned mysql_has_locked:1;/*!< this is set TRUE when MySQL + calls external_lock on this handle + with a lock flag, and set FALSE when + with the F_UNLOCK flag */ + unsigned clust_index_was_generated:1; + /*!< if the user did not define a + primary key in MySQL, then Innobase + automatically generated a clustered + index where the ordering column is + the row id: in this case this flag + is set to TRUE */ + unsigned index_usable:1; /*!< caches the value of + row_merge_is_index_usable(trx,index) */ + unsigned read_just_key:1;/*!< set to 1 when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this + handle in a MySQL HANDLER low level + index cursor command: then we must + store the pcur position even in a + unique search from a clustered index, + because HANDLER allows NEXT and PREV + in such a situation */ + unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW, + ROW_MYSQL_REC_FIELDS, + ROW_MYSQL_DUMMY_TEMPLATE, or + ROW_MYSQL_NO_TEMPLATE */ + unsigned n_template:10; /*!< number of elements in the + template */ + unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL + bitmap at the start of a row in the + MySQL format */ + unsigned need_to_access_clustered:1; /*!< if we are fetching + columns through a secondary index + and at least one column is not in + the secondary index, then this is + set to TRUE */ + unsigned templ_contains_blob:1;/*!< TRUE if the template contains + a column with DATA_BLOB == + get_innobase_type_from_mysql_type(); + not to be confused with InnoDB + externally stored columns + (VARCHAR can be off-page too) */ + mysql_row_templ_t* mysql_template;/*!< template used to transform + rows fast between MySQL and Innobase + formats; memory for this template + is not allocated from 'heap' */ + mem_heap_t* heap; /*!< memory heap from which + these auxiliary structures are + allocated when needed */ + ins_node_t* ins_node; /*!< Innobase SQL insert node + used to perform inserts + to the table */ + byte* ins_upd_rec_buff;/*!< buffer for storing data converted + to the Innobase format from the MySQL + format */ + const byte* default_rec; /*!< the default values of all columns + (a "default row") in MySQL format */ + ulint hint_need_to_fetch_extra_cols; + /*!< normally this is set to 0; if this + is set to ROW_RETRIEVE_PRIMARY_KEY, + then we should at least retrieve all + columns in the primary key; if this + is set to ROW_RETRIEVE_ALL_COLS, then + we must retrieve all columns in the + key (if read_just_key == 1), or all + columns in the table */ + upd_node_t* upd_node; /*!< Innobase SQL update node used + to perform updates and deletes */ + trx_id_t trx_id; /*!< The table->def_trx_id when + ins_graph was built */ + que_fork_t* ins_graph; /*!< Innobase SQL query graph used + in inserts. Will be rebuilt on + trx_id or n_indexes mismatch. 
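
[Editor's aside] magic_n above is mirrored by a magic_n2 member at the far end of the struct (declared further down), so overruns of a prebuilt struct are caught cheaply. A hedged sketch of the kind of check this enables (ut_a() is InnoDB's assertion macro):

        static inline void
        row_prebuilt_validate(const row_prebuilt_t* prebuilt)
        {
                ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
                ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
        }
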
*/ + que_fork_t* upd_graph; /*!< Innobase SQL query graph used + in updates or deletes */ + btr_pcur_t pcur; /*!< persistent cursor used in selects + and updates */ + btr_pcur_t clust_pcur; /*!< persistent cursor used in + some selects and updates */ + que_fork_t* sel_graph; /*!< dummy query graph used in + selects */ + dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */ + byte row_id[DATA_ROW_ID_LEN]; + /*!< if the clustered index was + generated, the row id of the + last row fetched is stored + here */ + doc_id_t fts_doc_id; /* if the table has an FTS index on + it then we fetch the doc_id. + FTS-FIXME: Currently we fetch it always + but in the future we must only fetch + it when FTS columns are being + updated */ + dtuple_t* clust_ref; /*!< prebuilt dtuple used in + sel/upd/del */ + ulint select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */ + ulint stored_select_lock_type;/*!< this field is used to + remember the original select_lock_type + that was decided in ha_innodb.cc, + ::store_lock(), ::external_lock(), + etc. */ + ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks + should be obtained for records + under an UPDATE or DELETE cursor. + If innodb_locks_unsafe_for_binlog + is TRUE, this can be set to + ROW_READ_TRY_SEMI_CONSISTENT, so that + if the row under an UPDATE or DELETE + cursor was locked by another + transaction, InnoDB will resort + to reading the last committed value + ('semi-consistent read'). Then, + this field will be set to + ROW_READ_DID_SEMI_CONSISTENT to + indicate that. If the row does not + match the WHERE condition, MySQL will + invoke handler::unlock_row() to + clear the flag back to + ROW_READ_TRY_SEMI_CONSISTENT and + to simply skip the row. If + the row matches, the next call to + row_search_for_mysql() will lock + the row. + This eliminates lock waits in some + cases; note that this breaks + serializability.
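
[Editor's aside] The row_read_type comment above describes a small state machine. A hedged sketch of the transitions as they would appear in a handler::unlock_row() style path (the authoritative logic lives in ha_innodb.cc and row0sel.cc; the ROW_READ_* constants are defined at the end of this header):

        switch (prebuilt->row_read_type) {
        case ROW_READ_DID_SEMI_CONSISTENT:
                /* The last committed version was read but the row did
                not match the WHERE condition: skip it, and retry
                semi-consistent reads for the following rows. */
                prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
                break;
        case ROW_READ_WITH_LOCKS:
                /* Release the record lock set on the non-matching row
                ('mini-rollback'). */
                row_unlock_for_mysql(prebuilt, FALSE);
                break;
        case ROW_READ_TRY_SEMI_CONSISTENT:
                /* Nothing to undo: no lock and no committed-version
                read happened for this row. */
                break;
        }
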
*/ + ulint new_rec_locks; /*!< normally 0; if + srv_locks_unsafe_for_binlog is + TRUE or session is using READ + COMMITTED or READ UNCOMMITTED + isolation level, set in + row_search_for_mysql() if we set a new + record lock on the secondary + or clustered index; this is + used in row_unlock_for_mysql() + when releasing the lock under + the cursor if we determine + after retrieving the row that + it does not need to be locked + ('mini-rollback') */ + ulint mysql_prefix_len;/*!< byte offset of the end of + the last requested column */ + ulint mysql_row_len; /*!< length in bytes of a row in the + MySQL format */ + ulint n_rows_fetched; /*!< number of rows fetched after + positioning the current cursor */ + ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */ + byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE]; + /*!< a cache for fetched rows if we + fetch many rows from the same cursor: + it saves CPU time to fetch them in a + batch; we reserve mysql_row_len + bytes for each such row; these + pointers point 4 bytes past the + allocated mem buf start, because + there is a 4 byte magic number at the + start and at the end */ + ibool keep_other_fields_on_keyread; /*!< when using fetch + cache with HA_EXTRA_KEYREAD, don't + overwrite other fields in the MySQL + row buffer.*/ + ulint fetch_cache_first;/*!< position of the first not yet + fetched row in fetch_cache */ + ulint n_fetch_cached; /*!< number of not yet fetched rows + in fetch_cache */ + mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied + to this heap */ + mem_heap_t* old_vers_heap; /*!< memory heap where a previous + version is built in consistent read */ + bool in_fts_query; /*!< Whether we are in a FTS query */ + /*----------------------*/ + ulonglong autoinc_last_value; + /*!< last value of AUTO-INC interval */ + ulonglong autoinc_increment;/*!< The increment step of the auto + increment column. Value must be + greater than or equal to 1. Required to + calculate the next value */ + ulonglong autoinc_offset; /*!< The offset passed to + get_auto_increment() by MySQL. Required + to calculate the next value */ + dberr_t autoinc_error; /*!< The actual error code encountered + while trying to init or read the + autoinc value from the table. We + store it here so that we can return + it to MySQL */ + /*----------------------*/ + void* idx_cond; /*!< In ICP, pointer to a ha_innobase, + passed to innobase_index_cond(). + NULL if index condition pushdown is + not used. */ + ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols. + 0 if and only if idx_cond == NULL. */ + /*----------------------*/ + ulint magic_n2; /*!< this should be the same as + magic_n */ + /*----------------------*/ + unsigned innodb_api:1; /*!< whether this is an InnoDB API + query */ + const rec_t* innodb_api_rec; /*!< InnoDB API search result */ + byte* srch_key_val1; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + byte* srch_key_val2; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + uint srch_key_val_len; /*!< Size of search key */ + +}; + +/** Callback for row_mysql_sys_index_iterate() */ +struct SysIndexCallback { + virtual ~SysIndexCallback() { } + + /** Callback method + @param mtr - current mini transaction + @param pcur - persistent cursor.
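
[Editor's aside] The fetch_cache comment above implies a fixed buffer layout: mysql_row_len bytes of row data bracketed by two 4-byte magic words, with the cached pointer aimed just past the leading one. A hedged sketch of allocating one slot this way (mem_heap_alloc() and mach_write_to_4() are the usual InnoDB primitives; ROW_PREBUILT_FETCH_MAGIC_N is defined just below):

        /* for some slot i < MYSQL_FETCH_CACHE_SIZE */
        byte* ptr = static_cast<byte*>(
                mem_heap_alloc(prebuilt->heap,
                               prebuilt->mysql_row_len + 8));

        mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
        mach_write_to_4(ptr + 4 + prebuilt->mysql_row_len,
                        ROW_PREBUILT_FETCH_MAGIC_N);

        prebuilt->fetch_cache[i] = ptr + 4;     /* past the magic word */
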
*/ + virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0; +}; + +#define ROW_PREBUILT_FETCH_MAGIC_N 465765687 + +#define ROW_MYSQL_WHOLE_ROW 0 +#define ROW_MYSQL_REC_FIELDS 1 +#define ROW_MYSQL_NO_TEMPLATE 2 +#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in + row_scan_and_check_index */ + +/* Values for hint_need_to_fetch_extra_cols */ +#define ROW_RETRIEVE_PRIMARY_KEY 1 +#define ROW_RETRIEVE_ALL_COLS 2 + +/* Values for row_read_type */ +#define ROW_READ_WITH_LOCKS 0 +#define ROW_READ_TRY_SEMI_CONSISTENT 1 +#define ROW_READ_DID_SEMI_CONSISTENT 2 + +#ifndef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#endif /* row0mysql.h */ diff --git a/storage/xtradb/include/row0mysql.ic b/storage/xtradb/include/row0mysql.ic new file mode 100644 index 00000000000..2eb60898c46 --- /dev/null +++ b/storage/xtradb/include/row0mysql.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 2001, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0mysql.ic +MySQL interface for Innobase + +Created 1/23/2001 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0purge.h b/storage/xtradb/include/row0purge.h new file mode 100644 index 00000000000..93dcf9cf49b --- /dev/null +++ b/storage/xtradb/include/row0purge.h @@ -0,0 +1,128 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0purge.h +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0purge_h +#define row0purge_h + +#include "univ.i" +#include "data0data.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "row0purge.h" +#include "ut0vec.h" + +/********************************************************************//** +Creates a purge node to a query graph. +@return own: purge node */ +UNIV_INTERN +purge_node_t* +row_purge_node_create( +/*==================*/ + que_thr_t* parent, /*!< in: parent node, i.e., a + thr node */ + mem_heap_t* heap) /*!< in: memory heap where created */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not delete marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. + +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch. +@return true if the secondary index record can be purged */ +UNIV_INTERN +bool +row_purge_poss_sec( +/*===============*/ + purge_node_t* node, /*!< in/out: row purge node */ + dict_index_t* index, /*!< in: secondary index */ + const dtuple_t* entry) /*!< in: secondary index entry */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); + +/* Purge node structure */ + +struct purge_node_t{ + que_common_t common; /*!< node type: QUE_NODE_PURGE */ + /*----------------------*/ + /* Local storage for this graph node */ + roll_ptr_t roll_ptr;/* roll pointer to undo log record */ + ib_vector_t* undo_recs;/*!< Undo recs to purge */ + + undo_no_t undo_no;/* undo number of the record */ + + ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC, + ... 
*/ + dict_table_t* table; /*!< table where purge is done */ + + ulint cmpl_info;/* compiler analysis info of an update */ + + upd_t* update; /*!< update vector for a clustered index + record */ + dtuple_t* ref; /*!< NULL, or row reference to the next row to + handle */ + dtuple_t* row; /*!< NULL, or a copy (also fields copied to + heap) of the indexed fields of the row to + handle */ + dict_index_t* index; /*!< NULL, or the next index whose record should + be handled */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage for + row; this must be emptied after a successful + purge of a row */ + ibool found_clust;/* TRUE if the clustered index record + determined by ref was found in the clustered + index, and we were able to position pcur on + it */ + btr_pcur_t pcur; /*!< persistent cursor used in searching the + clustered index record */ + ibool done; /* Debug flag */ + +}; + +#ifndef UNIV_NONINL +#include "row0purge.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0purge.ic b/storage/xtradb/include/row0purge.ic new file mode 100644 index 00000000000..700106d1048 --- /dev/null +++ b/storage/xtradb/include/row0purge.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + + +/**************************************************//** +@file include/row0purge.ic +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0quiesce.h b/storage/xtradb/include/row0quiesce.h new file mode 100644 index 00000000000..1d6d11291b8 --- /dev/null +++ b/storage/xtradb/include/row0quiesce.h @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.h + +Header file for tablespace quiesce functions. 
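
[Editor's aside] Before moving on to the quiesce API: row_purge_poss_sec() above is the gate the purge thread consults before physically removing a secondary index entry, and, as its comment explains, the answer may legitimately flip if a user transaction re-inserts a matching record. A hedged sketch of the shape of that caller (the real one is in row0purge.cc):

        if (row_purge_poss_sec(node, index, entry)) {
                /* No not-delete-marked version newer than the purge
                view refers to this entry: position the cursor on it
                and remove it. */
        } else {
                /* A user transaction may still need the entry (or may
                have re-inserted it); leave it in place. */
        }
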
+ +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0quiesce_h +#define row0quiesce_h + +#include "univ.i" +#include "dict0types.h" + +struct trx_t; + +/** The version number of the export meta-data text file. */ +#define IB_EXPORT_CFG_VERSION_V1 0x1UL + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +UNIV_INTERN +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + __attribute__((nonnull)); + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Cleanup after table quiesce. */ +UNIV_INTERN +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + __attribute__((nonnull)); + +#ifndef UNIV_NONINL +#include "row0quiesce.ic" +#endif + +#endif /* row0quiesce_h */ diff --git a/storage/xtradb/include/row0quiesce.ic b/storage/xtradb/include/row0quiesce.ic new file mode 100644 index 00000000000..f570a6aed05 --- /dev/null +++ b/storage/xtradb/include/row0quiesce.ic @@ -0,0 +1,26 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.ic + +Quiesce a tablespace. + +Created 2012-02-08 Sunny Bains +*******************************************************/ + diff --git a/storage/xtradb/include/row0row.h b/storage/xtradb/include/row0row.h new file mode 100644 index 00000000000..a4e5e0dd2fa --- /dev/null +++ b/storage/xtradb/include/row0row.h @@ -0,0 +1,343 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.h +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0row_h +#define row0row_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "read0types.h" +#include "row0types.h" +#include "btr0types.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +UNIV_INTERN +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ + __attribute__((warn_unused_result, nonnull(1,3,4))); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. 
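
[Editor's aside] row_get_rec_trx_id() above is the usual entry point for MVCC visibility decisions: read DB_TRX_ID from the clustered index record, then ask the read view about it. A hedged sketch (read_view_sees_trx_id() is the read0read.h predicate; building the older version from undo is elided):

        trx_id_t trx_id = row_get_rec_trx_id(rec, clust_index, offsets);

        if (!read_view_sees_trx_id(view, trx_id)) {
                /* The record version is too new for this consistent
                read: an older version must be built from undo. */
        }
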
+@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ + __attribute__((warn_unused_result, nonnull(1,3,4))); +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! */ +UNIV_INTERN +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead; the user + columns in this table should be + the same columns as in index->table */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + __attribute__((nonnull(2,3,9))); +/*******************************************************************//** +Converts an index record to a typed data tuple. +@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. 
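
[Editor's aside] The ROW_COPY_POINTERS caveat in row_build() above is easy to violate: the returned dtuple borrows memory from the index page, so the s-latch taken in the enclosing mini-transaction must outlive every use of the row. A hedged sketch (ROW_COPY_POINTERS is defined further down in this header):

        mem_heap_t* heap = mem_heap_create(1024);
        row_ext_t* ext;

        dtuple_t* row = row_build(ROW_COPY_POINTERS, clust_index, rec,
                                  offsets, NULL, NULL, NULL, &ext, heap);

        /* ... use row only while the page latch acquired by the
        enclosing mtr is still held ... */

        mem_heap_free(heap);    /* only once row is no longer needed */
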
+@return own: index entry built */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in/out: rec_get_offsets(rec) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +UNIV_INTERN +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! */ + const dict_index_t* index, /*!< in: secondary index */ + ulint* offsets,/*!< in: rec_get_offsets(rec, index) + or NULL */ + trx_t* trx) /*!< in: transaction or NULL */ + __attribute__((nonnull(1,2,3))); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/***************************************************************//** +Searches the clustered index record for a row, if we have the row +reference. +@return TRUE if found */ +UNIV_INTERN +ibool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... 
*/ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +UNIV_INTERN +rec_t* +row_get_clust_rec( +/*==============*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); + +/** Result of row_search_index_entry */ +enum row_search_result { + ROW_FOUND = 0, /*!< the record was found */ + ROW_NOT_FOUND, /*!< record not found */ + ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or + BTR_DELETE_MARK was specified, the + secondary index leaf page was not in + the buffer pool, and the operation was + enqueued in the insert/delete buffer */ + ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and + row_purge_poss_sec() failed */ +}; + +/***************************************************************//** +Searches an index record. +@return whether the record was found or buffered */ +UNIV_INTERN +enum row_search_result +row_search_index_entry( +/*===================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry, /*!< in: index entry */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); + +#define ROW_COPY_DATA 1 +#define ROW_COPY_POINTERS 2 + +/* The allowed latching order of index records is the following: +(1) a secondary index record -> +(2) the clustered index record -> +(3) rollback segment data for the clustered index record. */ + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +UNIV_INTERN +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "row0row.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0row.ic b/storage/xtradb/include/row0row.ic new file mode 100644 index 00000000000..ac62422be1f --- /dev/null +++ b/storage/xtradb/include/row0row.ic @@ -0,0 +1,174 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
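
[Editor's aside] The row_search_result enum above is exhaustive, and callers of row_search_index_entry() are expected to handle all four outcomes. A hedged sketch (pcur and mtr set up by the caller; BTR_MODIFY_LEAF is one of the usual latch modes):

        switch (row_search_index_entry(index, entry, BTR_MODIFY_LEAF,
                                       &pcur, &mtr)) {
        case ROW_FOUND:
                /* pcur is positioned on the matching record */
                break;
        case ROW_NOT_FOUND:
                break;
        case ROW_BUFFERED:
                /* the operation was enqueued in the insert/delete
                buffer; the leaf page was never read in */
                break;
        case ROW_NOT_DELETED_REF:
                /* BTR_DELETE was requested, but row_purge_poss_sec()
                said the entry may still be needed */
                break;
        }
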
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.ic +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0dict.h" +#include "rem0rec.h" +#include "trx0undo.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ +{ + ulint pos; + ulint offset; + ulint len; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(NULL, index, offsets)); + + pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offset = rec_get_nth_field_offs(offsets, pos, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_trx_id(rec + offset)); +} + +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. 
+@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ +{ + dtuple_t* entry; + + ut_ad(dtuple_check_typed(row)); + entry = row_build_index_entry_low(row, ext, index, heap); + ut_ad(!entry || dtuple_check_typed(entry)); + return(entry); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint field_no; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dtuple_get_n_fields(ref); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + field_no = *(map + i); + + if (field_no != ULINT_UNDEFINED) { + + field = rec_get_nth_field(rec, offsets, + field_no, &len); + dfield_set_data(dfield, field, len); + } + } +} diff --git a/storage/xtradb/include/row0sel.h b/storage/xtradb/include/row0sel.h new file mode 100644 index 00000000000..c8be80f89d9 --- /dev/null +++ b/storage/xtradb/include/row0sel.h @@ -0,0 +1,409 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.h +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0sel_h +#define row0sel_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "que0types.h" +#include "pars0sym.h" +#include "btr0pcur.h" +#include "read0read.h" +#include "row0mysql.h" + +/*********************************************************************//** +Creates a select node struct. +@return own: select node struct */ +UNIV_INTERN +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: memory heap where created */ +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +UNIV_INTERN +void +sel_node_free_private( +/*==================*/ + sel_node_t* node); /*!< in: select node struct */ +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +UNIV_INTERN +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */ +/*********************************************************************//** +Gets the plan node for the nth table in a join. +@return plan node */ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + sel_node_t* node, /*!< in: select node */ + ulint i); /*!< in: get ith plan node */ +/**********************************************************************//** +Performs a select step. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_sel_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an open or close cursor statement node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs a fetch for a cursor. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/****************************************************************//** +Sample callback function for fetch that prints each row. +@return always returns non-NULL */ +UNIV_INTERN +void* +row_fetch_print( +/*============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg); /*!< in: not used */ +/***********************************************************//** +Prints a row in a select result. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. 
The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field length; a warning is +printed if such a key appears. */ +UNIV_INTERN +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len, /*!< in: MySQL key value length */ + trx_t* trx); /*!< in: transaction */ +/********************************************************************//** +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be attempted on the cursor! +@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */ +UNIV_INTERN +dberr_t +row_search_for_mysql( +/*=================*/ + byte* buf, /*!< in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /*!< in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /*!< in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Checks if MySQL is currently allowed to retrieve a consistent read result +for this table, or to store one in the query cache. +@return TRUE if storing or retrieving from the query cache is permitted */ +UNIV_INTERN +ibool +row_search_check_if_query_cache_permitted( +/*======================================*/ + trx_t* trx, /*!< in: transaction object */ + const char* norm_name); /*!< in: concatenation of database name, + '/' char, table name */ +/*******************************************************************//** +Read the max AUTOINC value from an index.
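
[Editor's aside] row_search_for_mysql() above implements both the open and the fetch of a cursor: direction == 0 opens it, ROW_SEL_NEXT/ROW_SEL_PREV fetch onwards (the constants are defined at the end of this header). A hedged sketch of a prefix scan; per the warning above, ROW_SEL_EXACT on a unique index stores no cursor position, so this loop uses ROW_SEL_EXACT_PREFIX:

        dberr_t err = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt,
                                           ROW_SEL_EXACT_PREFIX, 0);

        while (err == DB_SUCCESS) {
                /* ... consume the MySQL-format row now in buf ... */
                err = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt,
                                           ROW_SEL_EXACT_PREFIX,
                                           ROW_SEL_NEXT);
        }

        if (err != DB_RECORD_NOT_FOUND && err != DB_END_OF_INDEX) {
                /* genuine failure: DB_DEADLOCK, DB_LOCK_TABLE_FULL, ... */
        }
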
+@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +row_search_max_autoinc( +/*===================*/ + dict_index_t* index, /*!< in: index to search */ + const char* col_name, /*!< in: autoinc column name */ + ib_uint64_t* value) /*!< out: AUTOINC value read */ + __attribute__((nonnull, warn_unused_result)); + +/** A structure for caching column values for prefetched rows */ +struct sel_buf_t{ + byte* data; /*!< data, or NULL; if not NULL, this field + has allocated memory which must be explicitly + freed; can be != NULL even when len is + UNIV_SQL_NULL */ + ulint len; /*!< data length or UNIV_SQL_NULL */ + ulint val_buf_size; + /*!< size of memory buffer allocated for data: + this can be more than len; this is defined + when data != NULL */ +}; + +/** Query plan */ +struct plan_t{ + dict_table_t* table; /*!< table struct in the dictionary + cache */ + dict_index_t* index; /*!< table index used in the search */ + btr_pcur_t pcur; /*!< persistent cursor used to search + the index */ + ibool asc; /*!< TRUE if cursor traveling upwards */ + ibool pcur_is_open; /*!< TRUE if pcur has been positioned + and we can try to fetch new rows */ + ibool cursor_at_end; /*!< TRUE if the cursor is open but + we know that there are no more + qualifying rows left to retrieve from + the index tree; NOTE though, that + there may still be unprocessed rows in + the prefetch stack; always FALSE when + pcur_is_open is FALSE */ + ibool stored_cursor_rec_processed; + /*!< TRUE if the pcur position has been + stored and the record it is positioned + on has already been processed */ + que_node_t** tuple_exps; /*!< array of expressions + which are used to calculate + the field values in the search + tuple: there is one expression + for each field in the search + tuple */ + dtuple_t* tuple; /*!< search tuple */ + ulint mode; /*!< search mode: PAGE_CUR_G, ... 
*/ + ulint n_exact_match; /*!< number of first fields in + the search tuple which must be + exactly matched */ + ibool unique_search; /*!< TRUE if we are searching an + index record with a unique key */ + ulint n_rows_fetched; /*!< number of rows fetched using pcur + after it was opened */ + ulint n_rows_prefetched;/*!< number of prefetched rows cached + for fetch: fetching several rows in + the same mtr saves CPU time */ + ulint first_prefetched;/*!< index of the first cached row in + select buffer arrays for each column */ + ibool no_prefetch; /*!< no prefetch for this table */ + sym_node_list_t columns; /*!< symbol table nodes for the columns + to retrieve from the table */ + UT_LIST_BASE_NODE_T(func_node_t) + end_conds; /*!< conditions which determine the + fetch limit of the index segment we + have to look at: when one of these + fails, the result set has been + exhausted for the cursor in this + index; these conditions are normalized + so that in a comparison the column + for this table is the first argument */ + UT_LIST_BASE_NODE_T(func_node_t) + other_conds; /*!< the rest of search conditions we can + test at this table in a join */ + ibool must_get_clust; /*!< TRUE if index is a non-clustered + index and we must also fetch the + clustered index record; this is the + case if the non-clustered record does + not contain all the needed columns, or + if this is a single-table explicit + cursor, or a searched update or + delete */ + ulint* clust_map; /*!< map telling how clust_ref is built + from the fields of a non-clustered + record */ + dtuple_t* clust_ref; /*!< the reference to the clustered + index entry is built here if index is + a non-clustered index */ + btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use + this pcur to search the clustered + index */ + mem_heap_t* old_vers_heap; /*!< memory heap used in building an old + version of a row, or NULL */ +}; + +/** Select node states */ +enum sel_node_state { + SEL_NODE_CLOSED, /*!< it is a declared cursor which is not + currently open */ + SEL_NODE_OPEN, /*!< intention locks not yet set on tables */ + SEL_NODE_FETCH, /*!< intention locks have been set */ + SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */ +}; + +/** Select statement node */ +struct sel_node_t{ + que_common_t common; /*!< node type: QUE_NODE_SELECT */ + enum sel_node_state + state; /*!< node state */ + que_node_t* select_list; /*!< select list */ + sym_node_t* into_list; /*!< variables list or NULL */ + sym_node_t* table_list; /*!< table list */ + ibool asc; /*!< TRUE if the rows should be fetched + in an ascending order */ + ibool set_x_locks; /*!< TRUE if the cursor is for update or + delete, which means that a row x-lock + should be placed on the cursor row */ + ulint row_lock_mode; /*!< LOCK_X or LOCK_S */ + ulint n_tables; /*!< number of tables */ + ulint fetch_table; /*!< number of the next table to access + in the join */ + plan_t* plans; /*!< array of n_tables many plan nodes + containing the search plan and the + search data structures */ + que_node_t* search_cond; /*!< search condition */ + read_view_t* read_view; /*!< if the query is a non-locking + consistent read, its read view is + placed here, otherwise NULL */ + ibool consistent_read;/*!< TRUE if the select is a consistent, + non-locking read */ + order_node_t* order_by; /*!< order by column definition, or + NULL */ + ibool is_aggregate; /*!< TRUE if the select list consists of + aggregate functions */ + ibool aggregate_already_fetched; + /*!< TRUE if the aggregate row 
has
+ already been fetched for the current
+ cursor */
+ ibool can_get_updated;/*!< this is TRUE if the select
+ is in a single-table explicit
+ cursor which can get updated
+ within the stored procedure,
+ or in a searched update or
+ delete; NOTE that to determine
+ whether an explicit cursor can
+ get updated, the parser checks
+ whether the stored procedure
+ contains positioned update or
+ delete statements */
+ sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */
+ UT_LIST_BASE_NODE_T(sym_node_t)
+ copy_variables; /*!< variables whose values we have to
+ copy when an explicit cursor is opened,
+ so that they do not change between
+ fetches */
+};
+
+/** Fetch statement node */
+struct fetch_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FETCH */
+ sel_node_t* cursor_def; /*!< cursor definition */
+ sym_node_t* into_list; /*!< variables to set */
+
+ pars_user_func_t*
+ func; /*!< User callback function or NULL.
+ The first argument to the function
+ is a sel_node_t*, containing the
+ results of the SELECT operation for
+ one row. If the function returns
+ NULL, it is not interested in
+ further rows and the cursor is
+ modified so (cursor % NOTFOUND) is
+ true. If it returns not-NULL,
+ continue normally. See
+ row_fetch_print() for an example
+ (and a useful debugging tool). */
+};
+
+/** Open or close cursor operation type */
+enum open_node_op {
+ ROW_SEL_OPEN_CURSOR, /*!< open cursor */
+ ROW_SEL_CLOSE_CURSOR /*!< close cursor */
+};
+
+/** Open or close cursor statement node */
+struct open_node_t{
+ que_common_t common; /*!< type: QUE_NODE_OPEN */
+ enum open_node_op
+ op_type; /*!< operation type: open or
+ close cursor */
+ sel_node_t* cursor_def; /*!< cursor definition */
+};
+
+/** Row printf statement node */
+struct row_printf_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */
+ sel_node_t* sel_node; /*!< select */
+};
+
+/** Search direction for the MySQL interface */
+enum row_sel_direction {
+ ROW_SEL_NEXT = 1, /*!< ascending direction */
+ ROW_SEL_PREV = 2 /*!< descending direction */
+};
+
+/** Match mode for the MySQL interface */
+enum row_sel_match_mode {
+ ROW_SEL_EXACT = 1, /*!< search using a complete key value */
+ ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which
+ must match rows: the prefix may
+ contain an incomplete field (the last
+ field in prefix may be just a prefix
+ of a fixed length column) */
+};
+
+#ifndef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0sel.ic b/storage/xtradb/include/row0sel.ic
new file mode 100644
index 00000000000..d83a3448832
--- /dev/null
+++ b/storage/xtradb/include/row0sel.ic
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
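The func callback on fetch_node_t is how InnoDB's internal SQL interpreter streams rows to C code, for example in the dictionary and FTS subsystems. A sketch of the usual pattern follows; pars_info_create(), pars_info_bind_function() and que_eval_sql() are the real entry points, while the callback name and the procedure text are illustrative:

    static ibool
    my_func(void* row, void* user_arg) /* row is a sel_node_t*
                                       holding one fetched row */
    {
            sel_node_t*     node = (sel_node_t*) row;

            /* read the column values from node->select_list here */

            return(TRUE);   /* ask for further rows */
    }

    static void
    scan_example(trx_t* trx)
    {
            dberr_t         err;
            pars_info_t*    info = pars_info_create();

            pars_info_bind_function(info, "my_func", my_func, NULL);

            err = que_eval_sql(info,
                               "PROCEDURE SCAN_EXAMPLE () IS\n"
                               "DECLARE FUNCTION my_func;\n"
                               "DECLARE CURSOR c IS\n"
                               " SELECT NAME FROM SYS_TABLES;\n"
                               "BEGIN\n"
                               "OPEN c;\n"
                               "WHILE 1 = 1 LOOP\n"
                               " FETCH c INTO my_func();\n"
                               " IF c % NOTFOUND THEN\n"
                               "  EXIT;\n"
                               " END IF;\n"
                               "END LOOP;\n"
                               "CLOSE c;\n"
                               "END;\n",
                               FALSE, trx);

            ut_a(err == DB_SUCCESS);
    }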
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.ic +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" + +/*********************************************************************//** +Gets the plan node for the nth table in a join. +@return plan node */ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + sel_node_t* node, /*!< in: select node */ + ulint i) /*!< in: get ith plan node */ +{ + ut_ad(i < node->n_tables); + + return(node->plans + i); +} + +/*********************************************************************//** +Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means +that it will start fetching from the start of the result set again, regardless +of where it was before, and it will set intention locks on the tables. */ +UNIV_INLINE +void +sel_node_reset_cursor( +/*==================*/ + sel_node_t* node) /*!< in: select node */ +{ + node->state = SEL_NODE_OPEN; +} + +/**********************************************************************//** +Performs an execution step of an open or close cursor statement node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* sel_node; + open_node_t* node; + ulint err; + + ut_ad(thr); + + node = (open_node_t*) thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_OPEN); + + sel_node = node->cursor_def; + + err = DB_SUCCESS; + + if (node->op_type == ROW_SEL_OPEN_CURSOR) { + + /* if (sel_node->state == SEL_NODE_CLOSED) { */ + + sel_node_reset_cursor(sel_node); + /* } else { + err = DB_ERROR; + } */ + } else { + if (sel_node->state != SEL_NODE_CLOSED) { + + sel_node->state = SEL_NODE_CLOSED; + } else { + err = DB_ERROR; + } + } + + if (err != DB_SUCCESS) { + /* SQL error detected */ + fprintf(stderr, "SQL error %lu\n", (ulong) err); + + ut_error; + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/xtradb/include/row0types.h b/storage/xtradb/include/row0types.h new file mode 100644 index 00000000000..52c89cb01fa --- /dev/null +++ b/storage/xtradb/include/row0types.h @@ -0,0 +1,55 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
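sel_node_get_nth_plan() above is the accessor the join fetch loop uses to reach the per-table plans; schematically (a sketch, with the guard conditions taken from the plan_t comments in row0sel.h):

    /* Sketch: visit the per-table plans of a join the way the
    fetch code indexes them via node->fetch_table. */
    static void
    visit_plans(sel_node_t* node)
    {
            ulint   i;

            for (i = 0; i < node->n_tables; i++) {
                    plan_t* plan = sel_node_get_nth_plan(node, i);

                    if (plan->pcur_is_open && !plan->cursor_at_end) {
                            /* this table's cursor may still return
                            rows, some possibly waiting in the
                            prefetch stack */
                    }
            }
    }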
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0types.h +Row operation global types + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0types_h +#define row0types_h + +struct plan_t; + +struct upd_t; +struct upd_field_t; +struct upd_node_t; +struct del_node_t; +struct ins_node_t; +struct sel_node_t; +struct open_node_t; +struct fetch_node_t; + +struct row_printf_node_t; +struct sel_buf_t; + +struct undo_node_t; + +struct purge_node_t; + +struct row_ext_t; + +/** Buffer for logging modifications during online index creation */ +struct row_log_t; + +/* MySQL data types */ +struct TABLE; + +#endif diff --git a/storage/xtradb/include/row0uins.h b/storage/xtradb/include/row0uins.h new file mode 100644 index 00000000000..ebf4881208a --- /dev/null +++ b/storage/xtradb/include/row0uins.h @@ -0,0 +1,54 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0uins.h +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0uins_h +#define row0uins_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. +@return DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node) /*!< in: row undo node */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_NONINL +#include "row0uins.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0uins.ic b/storage/xtradb/include/row0uins.ic new file mode 100644 index 00000000000..54da2e49874 --- /dev/null +++ b/storage/xtradb/include/row0uins.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0uins.ic +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + diff --git a/storage/xtradb/include/row0umod.h b/storage/xtradb/include/row0umod.h new file mode 100644 index 00000000000..f89d5a334fc --- /dev/null +++ b/storage/xtradb/include/row0umod.h @@ -0,0 +1,52 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0umod.h +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0umod_h +#define row0umod_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a modify operation on a row of a table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "row0umod.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0umod.ic b/storage/xtradb/include/row0umod.ic new file mode 100644 index 00000000000..00a8cd86e01 --- /dev/null +++ b/storage/xtradb/include/row0umod.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0umod.ic +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0undo.h b/storage/xtradb/include/row0undo.h new file mode 100644 index 00000000000..5dddfb4eae1 --- /dev/null +++ b/storage/xtradb/include/row0undo.h @@ -0,0 +1,135 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0undo.h +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0undo_h +#define row0undo_h + +#include "univ.i" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a row undo node to a query graph. +@return own: undo node */ +UNIV_INTERN +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. +@return TRUE if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +UNIV_INTERN +ibool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node); /*!< in: row undo node */ +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. 
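The NOTE on row_undo_search_clust_to_pcur() puts the close obligation on the caller in both outcomes, so call sites follow this shape (a sketch; the surrounding undo code is assumed):

    ibool   found;

    found = row_undo_search_clust_to_pcur(node);

    if (found) {
            /* node->row and the stored pcur position are now
            valid and can be consumed by the undo routine */
    }

    /* the pcur must be closed in any case, per the NOTE above */
    btr_pcur_close(&node->pcur);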
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution which is rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+ If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+only when the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/** Execution state of an undo node */
+enum undo_exec {
+ UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next
+ undo log record */
+ UNDO_NODE_INSERT, /*!< undo a fresh insert of a
+ row to a table */
+ UNDO_NODE_MODIFY /*!< undo a modify operation
+ (DELETE or UPDATE) on a row
+ of a table */
+};
+
+/** Undo node structure */
+struct undo_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_UNDO */
+ enum undo_exec state; /*!< node execution state */
+ trx_t* trx; /*!< trx for which undo is done */
+ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/*!< undo log record */
+ undo_no_t undo_no;/*!< undo number of the record */
+ ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ trx_id_t new_trx_id; /*!< trx id to restore to clustered index
+ record */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ dict_table_t* table; /*!< table where undo is done */
+ ulint cmpl_info;/*!< compiler analysis of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ dtuple_t* ref; /*!< row reference to the next row to handle */
+ dtuple_t* row; /*!< a copy (also fields copied to heap) of the
+ row to handle */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns of the row */
+ dtuple_t* undo_row;/*!< NULL, or the row after undo */
+ row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally
+ stored columns of undo_row */
+ dict_index_t* index; /*!< the next index whose record should be
+ handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after undo is tried
+ on a row */
+};
+
+
+#ifndef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0undo.ic b/storage/xtradb/include/row0undo.ic
new file mode 100644
index 00000000000..b97ffca590e
--- /dev/null
+++ b/storage/xtradb/include/row0undo.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
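With undo_exec and undo_node_t in view, the per-record dispatch performed by the undo step can be pictured like this (a sketch; the literal dispatch lives in row0undo.cc and also handles the record fetching):

    dberr_t err = DB_SUCCESS;

    switch (node->state) {
    case UNDO_NODE_FETCH_NEXT:
            /* fetch the next undo log record and decide between
            the two cases below */
            break;
    case UNDO_NODE_INSERT:
            err = row_undo_ins(node);       /* row0uins.h */
            break;
    case UNDO_NODE_MODIFY:
            err = row_undo_mod(node, thr);  /* row0umod.h */
            break;
    }

    /* per the comment on undo_node_t::heap, the auxiliary heap
    must be emptied after undo is tried on a row */
    mem_heap_empty(node->heap);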
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0undo.ic +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0upd.h b/storage/xtradb/include/row0upd.h new file mode 100644 index 00000000000..27dedeb65a7 --- /dev/null +++ b/storage/xtradb/include/row0upd.h @@ -0,0 +1,540 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.h +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0upd_h +#define row0upd_h + +#include "univ.i" +#include "data0data.h" +#include "row0types.h" +#include "btr0types.h" +#include "dict0types.h" +#include "trx0types.h" + +#ifndef UNIV_HOTBACKUP +# include "btr0pcur.h" +# include "que0types.h" +# include "pars0types.h" +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap); /*!< in: heap from which memory allocated */ +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. +@return number of fields */ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + const upd_t* update); /*!< in: update vector */ +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the nth field of an update vector. +@return update vector field */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + const upd_t* update, /*!< in: update vector */ + ulint n); /*!< in: field position in update vector */ +#else +# define upd_get_nth_field(update, n) ((update)->fields + (n)) +#endif +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Sets an index field number to be updated by an update vector field. 
*/ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /*!< in: update vector field */ + ulint field_no, /*!< in: field number in a clustered + index */ + dict_index_t* index, /*!< in: index */ + trx_t* trx); /*!< in: transaction */ +/*********************************************************************//** +Returns a field of an update vector by field_no. +@return update vector field, or NULL */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + const upd_t* update, /*!< in: update vector */ + ulint no) /*!< in: field_no */ + __attribute__((nonnull, pure)); +/*********************************************************************//** +Writes into the redo log the values of trx id and roll ptr and enough info +to determine their positions within a clustered index record. +@return new pointer to mlog */ +UNIV_INTERN +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + dict_index_t* index, /*!< in: clustered index */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ + byte* log_ptr,/*!< pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record when +a row is updated or marked deleted. */ +UNIV_INLINE +void +row_upd_rec_sys_fields( +/*===================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record, + can be 0 during IMPORT */ +/*********************************************************************//** +Sets the trx id or roll ptr field of a clustered index entry. */ +UNIV_INTERN +void +row_upd_index_entry_sys_field( +/*==========================*/ + dtuple_t* entry, /*!< in/out: index entry, where the memory + buffers for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */ + ib_uint64_t val); /*!< in: value to write */ +/*********************************************************************//** +Creates an update node for a query graph. +@return own: update node */ +UNIV_INTERN +upd_node_t* +upd_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Writes to the redo log the new values of the fields occurring in the index. */ +UNIV_INTERN +void +row_upd_index_write_log( +/*====================*/ + const upd_t* update, /*!< in: update vector */ + byte* log_ptr,/*!< in: pointer to mlog buffer: must + contain at least MLOG_BUF_MARGIN bytes + of free space; the buffer is closed + within this function */ + mtr_t* mtr); /*!< in: mtr into whose log to write */ +/***********************************************************//** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. 
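Taken together, upd_create(), upd_get_nth_field() and upd_field_set_field_no() let callers assemble an update vector by hand; for example (a sketch where heap, index, field_no, trx and the new value new_data/new_len are assumed to exist in the caller):

    upd_t*          update;
    upd_field_t*    ufield;
    ulint           i;

    update = upd_create(1, heap);           /* one updated column */
    ufield = upd_get_nth_field(update, 0);

    /* record which index field changes; this also copies the
    column type into ufield->new_val */
    upd_field_set_field_no(ufield, field_no, index, trx);

    /* install the new value */
    dfield_set_data(&ufield->new_val, new_data, new_len);

    /* consumers then simply scan the vector */
    for (i = 0; i < upd_get_n_fields(update); i++) {
            ufield = upd_get_nth_field(update, i);
            /* apply ufield->field_no / ufield->new_val */
    }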
+@return TRUE if the update changes the size of some field in index or +the field is external in rec or update */ +UNIV_INTERN +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update);/*!< in: update vector */ +/***********************************************************//** +Returns true if row update contains disowned external fields. +@return true if the update contains disowned external fields. */ +UNIV_INTERN +bool +row_upd_changes_disowned_external( +/*==============================*/ + const upd_t* update) /*!< in: update vector */ + __attribute__((nonnull, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Replaces the new column values stored in the update vector to the +record given. No field size changes are allowed. This function is +usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). */ +UNIV_INTERN +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /*!< in/out: record where replaced */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + page_zip_des_t* page_zip);/*!< in: compressed page with enough space + available, or NULL */ +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! +@return own: update vector of differing fields */ +UNIV_INTERN +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const dtuple_t* entry, /*!< in: entry to insert */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((warn_unused_result, nonnull)); +/***************************************************************//** +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! +@return own: update vector of differing fields, excluding roll ptr and +trx id */ +UNIV_INTERN +const upd_t* +row_upd_build_difference_binary( +/*============================*/ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* entry, /*!< in: entry to insert */ + const rec_t* rec, /*!< in: clustered index record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */ + bool no_sys, /*!< in: skip the system columns + DB_TRX_ID and DB_ROLL_PTR */ + trx_t* trx, /*!< in: transaction (for diagnostics), + or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((nonnull(1,2,3,7), warn_unused_result)); +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. 
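The size-change test above is what separates updating a record in place from the delete-plus-insert path; schematically (a sketch of the decision, which really lives in the B-tree cursor code in btr0cur.cc):

    if (!row_upd_changes_field_size_or_external(index, offsets,
                                                update)) {
            /* no field grows or shrinks and nothing is stored
            externally: the record can be updated in place */
            row_upd_rec_in_place(rec, index, offsets, update,
                                 page_zip);
    } else {
            /* sizes change: take the pessimistic path, i.e.
            delete the old record and insert the updated one */
    }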
*/ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals_index_pos( +/*=========================================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the index so + that the field number in an upd_field is the + index position */ + ibool order_only, + /*!< in: if TRUE, limit the replacement to + ordering fields of index; note that this + does not work for non-clustered indexes. */ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ + __attribute__((nonnull)); +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the + CLUSTERED index so that the field number in + an upd_field is the clustered index position */ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ + __attribute__((nonnull)); +/***********************************************************//** +Replaces the new column values stored in the update vector. */ +UNIV_INTERN +void +row_upd_replace( +/*============*/ + dtuple_t* row, /*!< in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /*!< out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: an update vector built for the + clustered index */ + mem_heap_t* heap); /*!< in: memory heap */ +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. + +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +UNIV_INTERN +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! 
*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext) /*!< NULL, or prefixes of the externally + stored columns in the old row */ + __attribute__((nonnull(1,2), warn_unused_result)); +#ifdef UNIV_DEBUG +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,thr,row,ext) +#else /* UNIV_DEBUG */ +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,row,ext) +#endif /* UNIV_DEBUG */ +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +UNIV_INTERN +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field); /*!< in: field to check */ +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. +@return whether Doc ID column is affected */ +UNIV_INTERN +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector may change an ordering field in an index +record */ +UNIV_INTERN +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update);/*!< in: update vector for the row */ +/***********************************************************//** +Updates a row in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Parses the log data of system field values. +@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_parse_sys_vals( +/*===================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint* pos, /*!< out: TRX_ID position in record */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr);/*!< out: roll ptr */ +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record in database +recovery. 
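These ordering-field predicates feed the secondary-index strategy chosen in the update graph, i.e. the UPD_NODE_UPDATE_ALL_SEC / UPD_NODE_UPDATE_SOME_SEC states defined further below; in outline (an illustrative sketch; the real decision in row0upd.cc also consults cmpl_info, and `index' here is the clustered index):

    if (node->is_delete
        || row_upd_changes_ord_field_binary(index, update, thr,
                                            node->row, node->ext)) {
            /* a delete, or an ordering field of the clustered
            index changed: every secondary index record must be
            updated */
            node->state = UPD_NODE_UPDATE_ALL_SEC;
    } else {
            /* only indexes whose own ordering fields changed
            need touching */
            node->state = UPD_NODE_UPDATE_SOME_SEC;
    }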
*/ +UNIV_INTERN +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint pos, /*!< in: TRX_ID position in rec */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */ +/*********************************************************************//** +Parses the log data written by row_upd_index_write_log. +@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_index_parse( +/*================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + mem_heap_t* heap, /*!< in: memory heap where update vector is + built */ + upd_t** update_out);/*!< out: update vector */ + + +/* Update vector field */ +struct upd_field_t{ + unsigned field_no:16; /*!< field number in an index, usually + the clustered index, but in updating + a secondary index record in btr0cur.cc + this is the position in the secondary + index */ +#ifndef UNIV_HOTBACKUP + unsigned orig_len:16; /*!< original length of the locally + stored part of an externally stored + column, or 0 */ + que_node_t* exp; /*!< expression for calculating a new + value: it refers to column values and + constants in the symbol table of the + query graph */ +#endif /* !UNIV_HOTBACKUP */ + dfield_t new_val; /*!< new value for the column */ +}; + +/* Update vector structure */ +struct upd_t{ + ulint info_bits; /*!< new value of info bits to record; + default is 0 */ + ulint n_fields; /*!< number of update fields */ + upd_field_t* fields; /*!< array of update fields */ +}; + +#ifndef UNIV_HOTBACKUP +/* Update node structure which also implements the delete operation +of a row */ + +struct upd_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UPDATE */ + ibool is_delete;/* TRUE if delete, FALSE if update */ + ibool searched_update; + /* TRUE if searched update, FALSE if + positioned */ + ibool in_mysql_interface; + /* TRUE if the update node was created + for the MySQL interface */ + dict_foreign_t* foreign;/* NULL or pointer to a foreign key + constraint if this update node is used in + doing an ON DELETE or ON UPDATE operation */ + upd_node_t* cascade_node;/* NULL or an update node template which + is used to implement ON DELETE/UPDATE CASCADE + or ... 
SET NULL for foreign keys */ + mem_heap_t* cascade_heap;/* NULL or a mem heap where the cascade + node is created */ + sel_node_t* select; /*!< query graph subtree implementing a base + table cursor: the rows returned will be + updated */ + btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered + index record which should be updated or + deleted; the cursor is stored in the graph + of 'select' field above, except in the case + of the MySQL interface */ + dict_table_t* table; /*!< table where updated */ + upd_t* update; /*!< update vector for the row */ + ulint update_n_fields; + /* when this struct is used to implement + a cascade operation for foreign keys, we store + here the size of the buffer allocated for use + as the update vector */ + sym_node_list_t columns;/* symbol table nodes for the columns + to retrieve from the table */ + ibool has_clust_rec_x_lock; + /* TRUE if the select which retrieves the + records to update already sets an x-lock on + the clustered record; note that it must always + set at least an s-lock */ + ulint cmpl_info;/* information extracted during query + compilation; speeds up execution: + UPD_NODE_NO_ORD_CHANGE and + UPD_NODE_NO_SIZE_CHANGE, ORed */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + dict_index_t* index; /*!< NULL, or the next index whose record should + be updated */ + dtuple_t* row; /*!< NULL, or a copy (also fields copied to + heap) of the row to update; this must be reset + to NULL after a successful update */ + row_ext_t* ext; /*!< NULL, or prefixes of the externally + stored columns in the old row */ + dtuple_t* upd_row;/* NULL, or a copy of the updated row */ + row_ext_t* upd_ext;/* NULL, or prefixes of the externally + stored columns in upd_row */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage; + this must be emptied after a successful + update */ + /*----------------------*/ + sym_node_t* table_sym;/* table node in symbol table */ + que_node_t* col_assign_list; + /* column assignment list */ + ulint magic_n; +}; + +#define UPD_NODE_MAGIC_N 1579975 + +/* Node execution states */ +#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from + a node above and if the field + has_clust_rec_x_lock is FALSE, we + should set an intention x-lock on + the table */ +#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be + updated */ +#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be + inserted, old record is already delete + marked */ +#define UPD_NODE_INSERT_BLOB 4 /* clustered index record should be + inserted, old record is already + delete-marked; non-updated BLOBs + should be inherited by the new record + and disowned by the old record */ +#define UPD_NODE_UPDATE_ALL_SEC 5 /* an ordering field of the clustered + index record was changed, or this is + a delete operation: should update + all the secondary index records */ +#define UPD_NODE_UPDATE_SOME_SEC 6 /* secondary index entries should be + looked at and updated if an ordering + field changed */ + +/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */ +#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be + changed in the update and no ordering + field of the clustered index */ +#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be + changed in the update */ + +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "row0upd.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0upd.ic 
b/storage/xtradb/include/row0upd.ic new file mode 100644 index 00000000000..618a77fa4bf --- /dev/null +++ b/storage/xtradb/include/row0upd.ic @@ -0,0 +1,188 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.ic +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "mtr0log.h" +#ifndef UNIV_HOTBACKUP +# include "trx0trx.h" +# include "trx0undo.h" +# include "row0row.h" +# include "lock0lock.h" +#endif /* !UNIV_HOTBACKUP */ +#include "page0zip.h" + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap) /*!< in: heap from which memory allocated */ +{ + upd_t* update; + + update = (upd_t*) mem_heap_zalloc(heap, sizeof(upd_t)); + + update->n_fields = n; + update->fields = (upd_field_t*) + mem_heap_zalloc(heap, sizeof(upd_field_t) * n); + + return(update); +} + +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. +@return number of fields */ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + const upd_t* update) /*!< in: update vector */ +{ + ut_ad(update); + + return(update->n_fields); +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the nth field of an update vector. +@return update vector field */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + const upd_t* update, /*!< in: update vector */ + ulint n) /*!< in: field position in update vector */ +{ + ut_ad(update); + ut_ad(n < update->n_fields); + + return((upd_field_t*) update->fields + n); +} +#endif /* UNIV_DEBUG */ + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Sets an index field number to be updated by an update vector field. 
*/ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /*!< in: update vector field */ + ulint field_no, /*!< in: field number in a clustered + index */ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + upd_field->field_no = field_no; + upd_field->orig_len = 0; + + if (field_no >= dict_index_get_n_fields(index)) { + fprintf(stderr, + "InnoDB: Error: trying to access field %lu in ", + (ulong) field_no); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, "\n" + "InnoDB: but index only has %lu fields\n", + (ulong) dict_index_get_n_fields(index)); + ut_ad(0); + } + + dict_col_copy_type(dict_index_get_nth_col(index, field_no), + dfield_get_type(&upd_field->new_val)); +} + +/*********************************************************************//** +Returns a field of an update vector by field_no. +@return update vector field, or NULL */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + const upd_t* update, /*!< in: update vector */ + ulint no) /*!< in: field_no */ +{ + ulint i; + for (i = 0; i < upd_get_n_fields(update); i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + + if (uf->field_no == no) { + + return(uf); + } + } + + return(NULL); +} + +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record when +a row is updated or marked deleted. */ +UNIV_INLINE +void +row_upd_rec_sys_fields( +/*===================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record, + can be 0 during IMPORT */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (page_zip) { + ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets, + pos, trx->id, roll_ptr); + } else { + ulint offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + /* During IMPORT the trx id in the record can be in the + future, if the .ibd file is being imported from another + instance. During IMPORT roll_ptr will be 0. */ + ut_ad(roll_ptr == 0 + || lock_check_trx_id_sanity( + trx_read_trx_id(rec + offset), + rec, index, offsets)); + + trx_write_trx_id(rec + offset, trx->id); + trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/row0vers.h b/storage/xtradb/include/row0vers.h new file mode 100644 index 00000000000..1df5b4d3e98 --- /dev/null +++ b/storage/xtradb/include/row0vers.h @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0vers.h +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0vers_h +#define row0vers_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "rem0types.h" +#include "mtr0mtr.h" +#include "read0types.h" + +/*****************************************************************//** +Finds out if an active transaction has inserted or modified a secondary +index record. +@return 0 if committed, else the active transaction id; +NOTE that this function can return false positives but never false +negatives. The caller must confirm all positive results by calling +trx_is_active() while holding lock_sys->mutex. */ +UNIV_INTERN +trx_id_t +row_vers_impl_x_locked( +/*===================*/ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: the secondary index */ + const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ +/*****************************************************************//** +Finds out if we must preserve a delete marked earlier version of a clustered +index record, because it is >= the purge view. +@return TRUE if earlier version should be preserved */ +UNIV_INTERN +ibool +row_vers_must_preserve_del_marked( +/*==============================*/ + trx_id_t trx_id, /*!< in: transaction id in the version */ + mtr_t* mtr); /*!< in: mtr holding the latch on the + clustered index record; it will also + hold the latch on purge_view */ +/*****************************************************************//** +Finds out if a version of the record, where the version >= the current +purge view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE. +@return TRUE if earlier version should have */ +UNIV_INTERN +ibool +row_vers_old_has_index_entry( +/*=========================*/ + ibool also_curr,/*!< in: TRUE if also rec is included in the + versions to search; otherwise only versions + prior to it are searched */ + const rec_t* rec, /*!< in: record in the clustered index; the + caller must have a latch on the page */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /*!< in: the secondary index */ + const dtuple_t* ientry);/*!< in: the secondary index entry */ +/*****************************************************************//** +Constructs the version of a clustered index record which a consistent +read should see. We assume that the trx id stored in rec is such that +the consistent read should not see rec in its present version. 
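Because row_vers_impl_x_locked() may return false positives, its callers follow a confirm-then-trust pattern along these lines (a sketch; the confirmation step is exactly the one the NOTE above prescribes):

    trx_id_t    trx_id;

    trx_id = row_vers_impl_x_locked(rec, index, offsets);

    if (trx_id != 0) {
            /* possibly implicitly locked by an active transaction,
            but this may be a false positive: per the NOTE above,
            confirm with trx_is_active() while holding
            lock_sys->mutex before treating the record as locked */
    } else {
            /* committed: no implicit lock exists */
    }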
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ __attribute__((nonnull(1,2,3,4,5,6,7)));
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+UNIV_INTERN
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers)/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ __attribute__((nonnull(1,2,3,4,5)));
+
+
+#ifndef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0vers.ic b/storage/xtradb/include/row0vers.ic
new file mode 100644
index 00000000000..ef43a55bf70
--- /dev/null
+++ b/storage/xtradb/include/row0vers.ic
@@ -0,0 +1,30 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
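A consistent read therefore proceeds in two steps: test whether the current version is visible, and only if it is not, walk back through the undo log. Schematically (a hedged sketch of the pattern in row0sel.cc, where `rec', `index', `offsets', `view', `mtr', `offset_heap' and `heap' are assumed to be set up by the search; lock_clust_rec_cons_read_sees() is the visibility test used there):

    if (!lock_clust_rec_cons_read_sees(rec, index, offsets, view)) {
            rec_t*  old_vers;
            dberr_t err;

            /* the current version is too new for this read view:
            reconstruct an older one from the undo log */
            err = row_vers_build_for_consistent_read(
                    rec, mtr, index, &offsets, view,
                    &offset_heap, heap, &old_vers);

            if (err == DB_SUCCESS && old_vers == NULL) {
                    /* the record did not exist in the view:
                    treat it as not found */
            }
    }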
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0vers.ic +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" +#include "dict0dict.h" +#include "read0read.h" +#include "page0page.h" +#include "log0recv.h" diff --git a/storage/xtradb/include/srv0conc.h b/storage/xtradb/include/srv0conc.h new file mode 100644 index 00000000000..cf61ef5528d --- /dev/null +++ b/storage/xtradb/include/srv0conc.h @@ -0,0 +1,111 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file srv/srv0conc.h + +InnoDB concurrency manager header file + +Created 2011/04/18 Sunny Bains +*******************************************************/ + +#ifndef srv_conc_h +#define srv_conc_h + +/** We are prepared for a situation that we have this many threads waiting for +a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the +value. */ + +extern ulint srv_max_n_threads; + +/** The following controls how many threads we let inside InnoDB concurrently: +threads waiting for locks are not counted into the number because otherwise +we could get a deadlock. Value of 0 will disable the concurrency check. 
*/ + +extern ulong srv_thread_concurrency; + +/*********************************************************************//** +Initialise the concurrency management data structures */ +void +srv_conc_init(void); +/*===============*/ + +/*********************************************************************//** +Free the concurrency management data structures */ +void +srv_conc_free(void); +/*===============*/ + +/*********************************************************************//** +Puts an OS thread to wait if there are too many concurrent threads +(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */ +UNIV_INTERN +void +srv_conc_enter_innodb( +/*==================*/ + trx_t* trx); /*!< in: transaction object associated + with the thread */ + +/*********************************************************************//** +This lets a thread enter InnoDB regardless of the number of threads inside +InnoDB. This must be called when a thread ends a lock wait. */ +UNIV_INTERN +void +srv_conc_force_enter_innodb( +/*========================*/ + trx_t* trx); /*!< in: transaction object associated with + the thread */ + +/*********************************************************************//** +This must be called when a thread exits InnoDB in a lock wait or at the +end of an SQL statement. */ +UNIV_INTERN +void +srv_conc_force_exit_innodb( +/*=======================*/ + trx_t* trx); /*!< in: transaction object associated with + the thread */ + +/*********************************************************************//** +Get the count of threads waiting inside InnoDB. */ +UNIV_INTERN +ulint +srv_conc_get_waiting_threads(void); +/*==============================*/ + +/*********************************************************************//** +Get the count of threads active inside InnoDB. */ +UNIV_INTERN +ulint +srv_conc_get_active_threads(void); +/*==============================*/ + +#endif /* srv_conc_h */ diff --git a/storage/xtradb/include/srv0mon.h b/storage/xtradb/include/srv0mon.h new file mode 100644 index 00000000000..e2ab81bf53a --- /dev/null +++ b/storage/xtradb/include/srv0mon.h @@ -0,0 +1,896 @@ +/*********************************************************************** + +Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+
+
+/** Possible status values for "mon_status" in "struct monitor_value" */
+enum monitor_running_status {
+	MONITOR_STARTED = 1,	/*!< Monitor has been turned on */
+	MONITOR_STOPPED = 2	/*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status	monitor_running_t;
+
+/** Monitor counter value type */
+typedef ib_int64_t	mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t", which contains the dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is defined; it identifies
+each monitor with an internally used symbol, whose integer value
+indexes into the above two structures for the counter's dynamic
+and static information.
+Developers who intend to add new counters are required to
+fill in the counter information as described in "monitor_info_t" and
+to create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+	ib_time_t	mon_start_time;	/*!< Start time of monitoring */
+	ib_time_t	mon_stop_time;	/*!< Stop time of monitoring */
+	ib_time_t	mon_reset_time;	/*!< Time the counter was reset */
+	mon_type_t	mon_value;	/*!< Current counter value */
+	mon_type_t	mon_max_value;	/*!< Current max value */
+	mon_type_t	mon_min_value;	/*!< Current min value */
+	mon_type_t	mon_value_reset;/*!< Value at last reset */
+	mon_type_t	mon_max_value_start; /*!< Max value since start */
+	mon_type_t	mon_min_value_start; /*!< Min value since start */
+	mon_type_t	mon_start_value;/*!< Value at the start time */
+	mon_type_t	mon_last_value;	/*!< Last set of values */
+	monitor_running_t mon_status;	/* whether monitor still running */
+};
+
+/** Following are the possible values for the "monitor_type" field in
+"struct monitor_info" */
+enum monitor_type_t {
+	MONITOR_NONE = 0,	/*!< No monitoring */
+	MONITOR_MODULE = 1,	/*!< This is a monitor module type,
+				not a counter */
+	MONITOR_EXISTING = 2,	/*!< The monitor carries information from
+				an existing system status variable */
+	MONITOR_NO_AVERAGE = 4,	/*!< Set this status if we don't want to
+				calculate the average value for the counter */
+	MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+				counter, rather than incremental value
+				over the period.
Mostly for counters
+				displaying current resource usage */
+	MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+				only as a module, but not individually */
+	MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+				server start up */
+	MONITOR_SET_OWNER = 64,	/*!< Owner of "monitor set", a set of
+				monitor counters */
+	MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+	MONITOR_HIDDEN = 256	/*!< Do not display this monitor in the
+				metrics table */
+};
+
+/** Counter minimum value is initialized to be max value of
+ mon_type_t (ib_int64_t) */
+#define	MIN_RESERVED	((mon_type_t) (IB_UINT64_MAX >> 1))
+#define	MAX_RESERVED	(~MIN_RESERVED)
+
+/** This enumeration defines the internal monitor identifiers used
+to identify each particular counter. Each value indexes into two arrays:
+the "innodb_counter_value" array, which records the actual monitor
+counter values, and the "innodb_counter_info" array, which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules here:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses existing counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detailed
+information on each monitor counter */
+
+enum monitor_id_t {
+	/* This is to identify the default value set by the metrics
+	control global variables */
+	MONITOR_DEFAULT_START = 0,
+
+	/* Start of Metadata counter */
+	MONITOR_MODULE_METADATA,
+	MONITOR_TABLE_OPEN,
+	MONITOR_TABLE_CLOSE,
+	MONITOR_TABLE_REFERENCE,
+	MONITOR_OVLD_META_MEM_POOL,
+
+	/* Lock manager related counters */
+	MONITOR_MODULE_LOCK,
+	MONITOR_DEADLOCK,
+	MONITOR_TIMEOUT,
+	MONITOR_LOCKREC_WAIT,
+	MONITOR_TABLELOCK_WAIT,
+	MONITOR_NUM_RECLOCK_REQ,
+	MONITOR_RECLOCK_CREATED,
+	MONITOR_RECLOCK_REMOVED,
+	MONITOR_NUM_RECLOCK,
+	MONITOR_TABLELOCK_CREATED,
+	MONITOR_TABLELOCK_REMOVED,
+	MONITOR_NUM_TABLELOCK,
+	MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+	MONITOR_OVLD_LOCK_WAIT_TIME,
+	MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+	MONITOR_OVLD_ROW_LOCK_WAIT,
+	MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+	/* Buffer and I/O related counters.
*/ + MONITOR_MODULE_BUFFER, + MONITOR_OVLD_BUFFER_POOL_SIZE, + MONITOR_OVLD_BUF_POOL_READS, + MONITOR_OVLD_BUF_POOL_READ_REQUESTS, + MONITOR_OVLD_BUF_POOL_WRITE_REQUEST, + MONITOR_OVLD_BUF_POOL_WAIT_FREE, + MONITOR_OVLD_BUF_POOL_READ_AHEAD, + MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED, + MONITOR_OVLD_BUF_POOL_PAGE_TOTAL, + MONITOR_OVLD_BUF_POOL_PAGE_MISC, + MONITOR_OVLD_BUF_POOL_PAGES_DATA, + MONITOR_OVLD_BUF_POOL_BYTES_DATA, + MONITOR_OVLD_BUF_POOL_PAGES_DIRTY, + MONITOR_OVLD_BUF_POOL_BYTES_DIRTY, + MONITOR_OVLD_BUF_POOL_PAGES_FREE, + MONITOR_OVLD_PAGE_CREATED, + MONITOR_OVLD_PAGES_WRITTEN, + MONITOR_OVLD_PAGES_READ, + MONITOR_OVLD_BYTE_READ, + MONITOR_OVLD_BYTE_WRITTEN, + MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + MONITOR_FLUSH_HP_RESCAN, + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, + MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, + MONITOR_FLUSH_AVG_PAGE_RATE, + MONITOR_FLUSH_LSN_AVG_RATE, + MONITOR_FLUSH_PCT_FOR_DIRTY, + MONITOR_FLUSH_PCT_FOR_LSN, + MONITOR_FLUSH_SYNC_WAITS, + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + MONITOR_LRU_SINGLE_FLUSH_SCANNED, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, + MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT, + MONITOR_LRU_GET_FREE_SEARCH, + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + + /* Buffer Page I/O specific counters. 
*/ + MONITOR_MODULE_BUF_PAGE, + MONITOR_INDEX_LEAF_PAGE_READ, + MONITOR_INDEX_NON_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ, + MONITOR_UNDO_LOG_PAGE_READ, + MONITOR_INODE_PAGE_READ, + MONITOR_IBUF_FREELIST_PAGE_READ, + MONITOR_IBUF_BITMAP_PAGE_READ, + MONITOR_SYSTEM_PAGE_READ, + MONITOR_TRX_SYSTEM_PAGE_READ, + MONITOR_FSP_HDR_PAGE_READ, + MONITOR_XDES_PAGE_READ, + MONITOR_BLOB_PAGE_READ, + MONITOR_ZBLOB_PAGE_READ, + MONITOR_ZBLOB2_PAGE_READ, + MONITOR_OTHER_PAGE_READ, + MONITOR_INDEX_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN, + MONITOR_UNDO_LOG_PAGE_WRITTEN, + MONITOR_INODE_PAGE_WRITTEN, + MONITOR_IBUF_FREELIST_PAGE_WRITTEN, + MONITOR_IBUF_BITMAP_PAGE_WRITTEN, + MONITOR_SYSTEM_PAGE_WRITTEN, + MONITOR_TRX_SYSTEM_PAGE_WRITTEN, + MONITOR_FSP_HDR_PAGE_WRITTEN, + MONITOR_XDES_PAGE_WRITTEN, + MONITOR_BLOB_PAGE_WRITTEN, + MONITOR_ZBLOB_PAGE_WRITTEN, + MONITOR_ZBLOB2_PAGE_WRITTEN, + MONITOR_OTHER_PAGE_WRITTEN, + + /* OS level counters (I/O) */ + MONITOR_MODULE_OS, + MONITOR_OVLD_OS_FILE_READ, + MONITOR_OVLD_OS_FILE_WRITE, + MONITOR_OVLD_OS_FSYNC, + MONITOR_OS_PENDING_READS, + MONITOR_OS_PENDING_WRITES, + MONITOR_OVLD_OS_LOG_WRITTEN, + MONITOR_OVLD_OS_LOG_FSYNC, + MONITOR_OVLD_OS_LOG_PENDING_FSYNC, + MONITOR_OVLD_OS_LOG_PENDING_WRITES, + + /* Transaction related counters */ + MONITOR_MODULE_TRX, + MONITOR_TRX_RW_COMMIT, + MONITOR_TRX_RO_COMMIT, + MONITOR_TRX_NL_RO_COMMIT, + MONITOR_TRX_COMMIT_UNDO, + MONITOR_TRX_ROLLBACK, + MONITOR_TRX_ROLLBACK_SAVEPOINT, + MONITOR_TRX_ROLLBACK_ACTIVE, + MONITOR_TRX_ACTIVE, + MONITOR_RSEG_HISTORY_LEN, + MONITOR_NUM_UNDO_SLOT_USED, + MONITOR_NUM_UNDO_SLOT_CACHED, + MONITOR_RSEG_CUR_SIZE, + + /* Purge related counters */ + MONITOR_MODULE_PURGE, + MONITOR_N_DEL_ROW_PURGE, + MONITOR_N_UPD_EXIST_EXTERN, + MONITOR_PURGE_INVOKED, + MONITOR_PURGE_N_PAGE_HANDLED, + MONITOR_DML_PURGE_DELAY, + MONITOR_PURGE_STOP_COUNT, + MONITOR_PURGE_RESUME_COUNT, + + /* Recovery related counters */ + MONITOR_MODULE_RECOVERY, + MONITOR_NUM_CHECKPOINT, + MONITOR_OVLD_LSN_FLUSHDISK, + MONITOR_OVLD_LSN_CHECKPOINT, + MONITOR_OVLD_LSN_CURRENT, + MONITOR_LSN_CHECKPOINT_AGE, + MONITOR_OVLD_BUF_OLDEST_LSN, + MONITOR_OVLD_MAX_AGE_ASYNC, + MONITOR_OVLD_MAX_AGE_SYNC, + MONITOR_PENDING_LOG_WRITE, + MONITOR_PENDING_CHECKPOINT_WRITE, + MONITOR_LOG_IO, + MONITOR_OVLD_LOG_WAITS, + MONITOR_OVLD_LOG_WRITE_REQUEST, + MONITOR_OVLD_LOG_WRITES, + + /* Page Manager related counters */ + MONITOR_MODULE_PAGE, + MONITOR_PAGE_COMPRESS, + MONITOR_PAGE_DECOMPRESS, + MONITOR_PAD_INCREMENTS, + MONITOR_PAD_DECREMENTS, + + /* Index related counters */ + MONITOR_MODULE_INDEX, + MONITOR_INDEX_SPLIT, + MONITOR_INDEX_MERGE_ATTEMPTS, + MONITOR_INDEX_MERGE_SUCCESSFUL, + MONITOR_INDEX_REORG_ATTEMPTS, + MONITOR_INDEX_REORG_SUCCESSFUL, + MONITOR_INDEX_DISCARD, + + /* Adaptive Hash Index related counters */ + MONITOR_MODULE_ADAPTIVE_HASH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE, + MONITOR_ADAPTIVE_HASH_PAGE_ADDED, + MONITOR_ADAPTIVE_HASH_PAGE_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_ADDED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND, + MONITOR_ADAPTIVE_HASH_ROW_UPDATED, + + /* Tablespace related counters */ + MONITOR_MODULE_FIL_SYSTEM, + MONITOR_OVLD_N_FILE_OPENED, + + /* InnoDB Change Buffer related counters */ + MONITOR_MODULE_IBUF_SYSTEM, + MONITOR_OVLD_IBUF_MERGE_INSERT, + MONITOR_OVLD_IBUF_MERGE_DELETE, + 
MONITOR_OVLD_IBUF_MERGE_PURGE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+	MONITOR_OVLD_IBUF_MERGES,
+	MONITOR_OVLD_IBUF_SIZE,
+
+	/* Counters for server operations */
+	MONITOR_MODULE_SERVER,
+	MONITOR_MASTER_THREAD_SLEEP,
+	MONITOR_OVLD_SERVER_ACTIVITY,
+	MONITOR_MASTER_ACTIVE_LOOPS,
+	MONITOR_MASTER_IDLE_LOOPS,
+	MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+	MONITOR_SRV_IBUF_MERGE_MICROSECOND,
+	MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+	MONITOR_SRV_MEM_VALIDATE_MICROSECOND,
+	MONITOR_SRV_PURGE_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_MICROSECOND,
+	MONITOR_SRV_CHECKPOINT_MICROSECOND,
+	MONITOR_OVLD_SRV_DBLWR_WRITES,
+	MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+	MONITOR_OVLD_SRV_PAGE_SIZE,
+	MONITOR_OVLD_RWLOCK_S_SPIN_WAITS,
+	MONITOR_OVLD_RWLOCK_X_SPIN_WAITS,
+	MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS,
+	MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS,
+	MONITOR_OVLD_RWLOCK_S_OS_WAITS,
+	MONITOR_OVLD_RWLOCK_X_OS_WAITS,
+
+	/* Data DML related counters */
+	MONITOR_MODULE_DML_STATS,
+	MONITOR_OLVD_ROW_READ,
+	MONITOR_OLVD_ROW_INSERTED,
+	MONITOR_OLVD_ROW_DELETED,
+	MONITOR_OLVD_ROW_UPDTATED,
+
+	/* Data DDL related counters */
+	MONITOR_MODULE_DDL_STATS,
+	MONITOR_BACKGROUND_DROP_INDEX,
+	MONITOR_BACKGROUND_DROP_TABLE,
+	MONITOR_ONLINE_CREATE_INDEX,
+	MONITOR_PENDING_ALTER_TABLE,
+
+	MONITOR_MODULE_ICP,
+	MONITOR_ICP_ATTEMPTS,
+	MONITOR_ICP_NO_MATCH,
+	MONITOR_ICP_OUT_OF_RANGE,
+	MONITOR_ICP_MATCH,
+
+	/* This is used only for the control system to turn
+	on/off and reset all monitor counters */
+	MONITOR_ALL_COUNTER,
+
+	/* This must be the last member */
+	NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wild card match */
+#define	MONITOR_WILDCARD_MATCH		(NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define	MONITOR_NO_MATCH		(NUM_MONITOR + 2)
+
+/** struct monitor_info describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+	const char*	monitor_name;	/*!< Monitor name */
+	const char*	monitor_module;	/*!< Sub Module the monitor
+					belongs to */
+	const char*	monitor_desc;	/*!< Brief desc of monitor counter */
+	monitor_type_t	monitor_type;	/*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of counter that
+					is related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+	monitor_id_t	monitor_id;	/*!< Monitor ID as defined in enum
+					monitor_id_t */
+};
+
+/** Following are the "set_option" values allowed for the
+srv_mon_set_module_control() and srv_mon_process_existing_counter()
+functions, used to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+	MONITOR_TURN_ON = 1,		/*!< Turn on the counter */
+	MONITOR_TURN_OFF,		/*!< Turn off the counter */
+	MONITOR_RESET_VALUE,		/*!< Reset current values */
+	MONITOR_RESET_ALL_VALUE,	/*!< Reset all values */
+	MONITOR_GET_VALUE		/*!< Option for
+					srv_mon_process_existing_counter()
+					function */
+};
+
+/** Number of bits in a ulint datatype */
+#define	NUM_BITS_ULINT	(sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap that records whether a particular
+monitor counter has been turned on or off */
+extern ulint	monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) /
+				NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option.
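+A minimal usage sketch (the counter picked from monitor_id_t above is
+arbitrary, for illustration only):
+
+	MONITOR_ON(MONITOR_TABLE_OPEN);
+	ut_a(MONITOR_IS_ON(MONITOR_TABLE_OPEN));
+	MONITOR_OFF(MONITOR_TABLE_OPEN);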
 */
+#define MONITOR_ON(monitor)	\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] |=	\
+	 ((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+#define MONITOR_OFF(monitor)	\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] &=	\
+	 ~((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor)	\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] &	\
+	 ((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monitor counter
+value */
+extern monitor_value_t	 innodb_counter_value[NUM_MONITOR];
+
+/** Following are macro defines for basic monitor counter manipulations.
+Please note we do not provide any synchronization for these monitor
+operations due to performance considerations. Most counters can
+be placed under existing mutex protections in their respective code
+module. */
+
+/** Macros to access various fields of a monitor counter */
+#define MONITOR_FIELD(monitor, field)	\
+	(innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor)	\
+	MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor)	\
+	MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor)	\
+	MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor)	\
+	MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor)	\
+	MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor)	\
+	MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor)	\
+	MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor)	\
+	MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor)	\
+	(MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor)	\
+	MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor)	\
+	do {	\
+		MONITOR_STATUS(monitor) = MONITOR_STARTED;	\
+		MONITOR_FIELD((monitor), mon_start_time) = time(NULL);	\
+	} while (0)
+
+#define MONITOR_SET_OFF(monitor)	\
+	do {	\
+		MONITOR_STATUS(monitor) = MONITOR_STOPPED;	\
+		MONITOR_FIELD((monitor), mon_stop_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_INIT_ZERO_VALUE	0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor)	\
+	(MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE	\
+	 && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE	\
+	 && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor)	\
+	if (MONITOR_MAX_MIN_NOT_INIT(monitor)) {	\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+	}
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects that appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define	MONITOR_INC(monitor)	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		MONITOR_VALUE(monitor)++;	\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	}
+
+/** Increment a monitor counter under mutex protection.
+Use MONITOR_INC if appropriate mutex protection already exists.
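+A hypothetical call site (my_mutex stands in for whichever mutex guards the
+counter; it must not be held by the caller, since the macro acquires and
+releases it itself):
+
+	MONITOR_MUTEX_INC(&my_mutex, MONITOR_TABLE_OPEN);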
+@param mutex	mutex to acquire and release
+@param monitor	monitor to be incremented by 1 */
+# define MONITOR_MUTEX_INC(mutex, monitor)	\
+	ut_ad(!mutex_own(mutex));	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		mutex_enter(mutex);	\
+		if (++MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);  \
+		}	\
+		mutex_exit(mutex);	\
+	}
+/** Decrement a monitor counter under mutex protection.
+Use MONITOR_DEC if appropriate mutex protection already exists.
+@param mutex	mutex to acquire and release
+@param monitor	monitor to be decremented by 1 */
+# define MONITOR_MUTEX_DEC(mutex, monitor)	\
+	ut_ad(!mutex_own(mutex));	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		mutex_enter(mutex);	\
+		if (--MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);  \
+		}	\
+		mutex_exit(mutex);	\
+	}
+
+#if defined HAVE_ATOMIC_BUILTINS_64
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1 */
+# define MONITOR_ATOMIC_INC(monitor)	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		ib_uint64_t	value;	\
+		value  = os_atomic_increment_uint64(	\
+			(ib_uint64_t*) &MONITOR_VALUE(monitor), 1);	\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race; we ignore it for performance. */	\
+		if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = value;	\
+		}	\
+	}
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1 */
+# define MONITOR_ATOMIC_DEC(monitor)	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		ib_uint64_t	value;	\
+		value = os_atomic_decrement_uint64(	\
+			(ib_uint64_t*) &MONITOR_VALUE(monitor), 1);	\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race; we ignore it for performance. */	\
+		if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = value;	\
+		}	\
+	}
+# define srv_mon_create() ((void) 0)
+# define srv_mon_free() ((void) 0)
+#else /* HAVE_ATOMIC_BUILTINS_64 */
+/** Mutex protecting atomic operations on platforms that lack
+built-in operations for atomic memory access */
+extern ib_mutex_t	monitor_mutex;
+/****************************************************************//**
+Initialize the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_create(void);
+/*================*/
+/****************************************************************//**
+Close the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_free(void);
+/*==============*/
+
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1 */
+# define MONITOR_ATOMIC_INC(monitor) MONITOR_MUTEX_INC(&monitor_mutex, monitor)
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
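+On platforms without 64-bit atomic builtins this expands, like the
+increment above, to the monitor_mutex-protected variant, so each update
+costs one mutex round trip.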
+@param monitor	monitor to be decremented by 1 */
+# define MONITOR_ATOMIC_DEC(monitor) MONITOR_MUTEX_DEC(&monitor_mutex, monitor)
+#endif /* HAVE_ATOMIC_BUILTINS_64 */
+
+#define	MONITOR_DEC(monitor)	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		MONITOR_VALUE(monitor)--;	\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	}
+
+#ifdef UNIV_DEBUG_VALGRIND
+# define MONITOR_CHECK_DEFINED(value) do {	\
+	mon_type_t m = value;	\
+	UNIV_MEM_ASSERT_RW(&m, sizeof m);	\
+} while (0)
+#else /* UNIV_DEBUG_VALGRIND */
+# define MONITOR_CHECK_DEFINED(value) (void) 0
+#endif /* UNIV_DEBUG_VALGRIND */
+
+#define	MONITOR_INC_VALUE(monitor, value)	\
+	MONITOR_CHECK_DEFINED(value);	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	}
+
+#define	MONITOR_DEC_VALUE(monitor, value)	\
+	MONITOR_CHECK_DEFINED(value);	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value));	\
+		MONITOR_VALUE(monitor) -= (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	}
+
+/* Increment/decrement a counter without checking the monitor on/off bit,
+which could already have been checked as a module group */
+#define	MONITOR_INC_NOCHECK(monitor)	\
+	do {	\
+		MONITOR_VALUE(monitor)++;	\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	} while (0)
+
+#define	MONITOR_DEC_NOCHECK(monitor)	\
+	do {	\
+		MONITOR_VALUE(monitor)--;	\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	} while (0)
+
+/** Directly set a monitor counter's value */
+#define	MONITOR_SET(monitor, value)	\
+	MONITOR_CHECK_DEFINED(value);	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	}
+
+/** Add the time difference between now and the input "value" (in
+microseconds) to the monitor counter
+@param monitor	monitor to update for the time difference
+@param value	the start time value */
+#define	MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value)	\
+	MONITOR_CHECK_DEFINED(value);	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		ullint	old_time = (value);	\
+		value = ut_time_us(NULL);	\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+	}
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor', to see whether it is on or off to
+decide whether to do the update.
+@param monitor		the main monitor counter to update. It accounts for
+			the accumulative value for the counter.
+@param monitor_n_calls	counter that counts the number of times this macro is
+			called
+@param monitor_per_call	counter that records the current and max value of
+			each incremental value
+@param value		incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE(	\
+		monitor, monitor_n_calls, monitor_per_call, value)	\
+	MONITOR_CHECK_DEFINED(value);	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		MONITOR_VALUE(monitor_n_calls)++;	\
+		MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor_per_call)	\
+		    > MONITOR_MAX_VALUE(monitor_per_call)) {	\
+			MONITOR_MAX_VALUE(monitor_per_call) =	\
+				 (mon_type_t) (value);	\
+		}	\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	}
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only the max value needs to be updated */
+#define	MONITOR_SET_UPD_MAX_ONLY(monitor, value)	\
+	MONITOR_CHECK_DEFINED(value);	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}	\
+	}
+
+/** Some values, such as the log sequence number, are monotonically
+increasing numbers and do not need max/min values recorded */
+#define MONITOR_SET_SIMPLE(monitor, value)	\
+	MONITOR_CHECK_DEFINED(value);	\
+	if (MONITOR_IS_ON(monitor)) {	\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);	\
+	}
+
+/** Reset the monitor value and max/min value to zero. The reset
+operation would only be conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor)	\
+	do {	\
+		MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;	\
+		MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_start_time) =	\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_stop_time) =	\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_reset_time) =	\
+					MONITOR_INIT_ZERO_VALUE;	\
+	} while (0)
+
+/** The following macros define the operations needed to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to the mon_start_value field of the monitor
+counter */
+#define MONITOR_SAVE_START(monitor, value) do {	\
+	MONITOR_CHECK_DEFINED(value);	\
+	(MONITOR_START_VALUE(monitor) =	\
+		(mon_type_t) (value) - MONITOR_VALUE_RESET(monitor));	\
+	} while (0)
+
+/** Save the current counter value to the mon_last_value field of the
+monitor counter, and add it to mon_start_value */
+#define MONITOR_SAVE_LAST(monitor)	\
+	do {	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor);	\
+		MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor);	\
+	} while (0)
+
+/** Set monitor value to the difference of value and mon_start_value
+compensated by mon_last_value if accumulated value is required.
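+As a worked example with made-up numbers: if the status variable currently
+reads 1000, mon_start_value is 400, mon_last_value is 100 and the reset
+value is 0, the monitor is set to 1000 - 0 - 400 + 100 = 700.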
 */
+#define MONITOR_SET_DIFF(monitor, value)	\
+	MONITOR_SET_UPD_MAX_ONLY(monitor, ((value)	\
+	- MONITOR_VALUE_RESET(monitor)	\
+	- MONITOR_FIELD(monitor, mon_start_value)	\
+	+ MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get a monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array)
+@return Pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+UNIV_INTERN
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+/****************************************************************//**
+Get a monitor's name by its monitor id (index into the
+innodb_counter_info array)
+@return corresponding monitor name, or NULL if no such
+monitor */
+UNIV_INTERN
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is NUM_MONITOR, the operation is applied to all monitor counters. */
+UNIV_INTERN
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to NUM_MONITOR, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+a mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do appropriate
+mathematics to deduce the actual value. */
+UNIV_INTERN
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+Reset a monitor, creating a new baseline with the current monitor
+value.
This baseline is recorded by MONITOR_VALUE_RESET(monitor) */ +UNIV_INTERN +void +srv_mon_reset( +/*==========*/ + monitor_id_t monitor); /*!< in: monitor id*/ +/*************************************************************//** +This function resets all values of a monitor counter */ +UNIV_INLINE +void +srv_mon_reset_all( +/*==============*/ + monitor_id_t monitor); /*!< in: monitor id*/ +/*************************************************************//** +Turn on monitor counters that are marked as default ON. */ +UNIV_INTERN +void +srv_mon_default_on(void); +/*====================*/ + +#ifndef UNIV_NONINL +#include "srv0mon.ic" +#endif +#else /* !UNIV_HOTBACKUP */ +# define MONITOR_INC(x) ((void) 0) +# define MONITOR_DEC(x) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/xtradb/include/srv0mon.ic b/storage/xtradb/include/srv0mon.ic new file mode 100644 index 00000000000..225390c6b6f --- /dev/null +++ b/storage/xtradb/include/srv0mon.ic @@ -0,0 +1,113 @@ +/***************************************************************************** + +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/srv0mon.ic +Server monitoring system + +Created 1/20/2010 Jimmy Yang +************************************************************************/ + +/*************************************************************//** +This function is used to calculate the maximum counter value +since the start of monitor counter +@return max counter value since start. */ +UNIV_INLINE +mon_type_t +srv_mon_calc_max_since_start( +/*=========================*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) { + + /* MONITOR_MAX_VALUE_START has not yet been + initialized, the max value since start is the + max count in MONITOR_MAX_VALUE */ + MONITOR_MAX_VALUE_START(monitor) = + MONITOR_MAX_VALUE(monitor); + + } else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED + && (MONITOR_MAX_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor) + > MONITOR_MAX_VALUE_START(monitor))) { + + /* If the max value since reset (as specified + in MONITOR_MAX_VALUE) plus the reset value is + larger than MONITOR_MAX_VALUE_START, reset + MONITOR_MAX_VALUE_START to this new max value */ + MONITOR_MAX_VALUE_START(monitor) = + MONITOR_MAX_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor); + } + + return(MONITOR_MAX_VALUE_START(monitor)); +} + +/*************************************************************//** +This function is used to calculate the minimum counter value +since the start of monitor counter +@return min counter value since start. 
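+(This mirrors srv_mon_calc_max_since_start() above, with min in place of
+max.)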
*/ +UNIV_INLINE +mon_type_t +srv_mon_calc_min_since_start( +/*=========================*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) { + + /* MONITOR_MIN_VALUE_START has not yet been + initialized, the min value since start is the + min count in MONITOR_MIN_VALUE */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor); + + } else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED + && (MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor) + < MONITOR_MIN_VALUE_START(monitor))) { + + /* If the min value since reset (as specified + in MONITOR_MIN_VALUE) plus the reset value is + less than MONITOR_MIN_VALUE_START, reset + MONITOR_MIN_VALUE_START to this new min value */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor); + } + + return(MONITOR_MIN_VALUE_START(monitor)); +} + +/*************************************************************//** +This function resets all values of a monitor counter */ +UNIV_INLINE +void +srv_mon_reset_all( +/*==============*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + /* Do not reset all counter values if monitor is still on. */ + if (MONITOR_IS_ON(monitor)) { + fprintf(stderr, "InnoDB: Cannot reset all values for " + "monitor counter %s while it is on. Please " + "turn it off and retry. \n", + srv_mon_get_name(monitor)); + } else { + MONITOR_RESET_ALL(monitor); + } +} diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h new file mode 100644 index 00000000000..ec7b921d166 --- /dev/null +++ b/storage/xtradb/include/srv0srv.h @@ -0,0 +1,1106 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2008, 2009, Google Inc. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0srv.h +The server main program + +Created 10/10/1995 Heikki Tuuri +*******************************************************/ + +#ifndef srv0srv_h +#define srv0srv_h + +#include "univ.i" +#ifndef UNIV_HOTBACKUP +#include "log0log.h" +#include "sync0sync.h" +#include "os0sync.h" +#include "que0types.h" +#include "trx0types.h" +#include "srv0conc.h" +#include "buf0checksum.h" +#include "ut0counter.h" + +/* Global counters used inside InnoDB. */ +struct srv_stats_t { + typedef ib_counter_t<lsn_t, 1, single_indexer_t> lsn_ctr_1_t; + typedef ib_counter_t<ulint, 1, single_indexer_t> ulint_ctr_1_t; + typedef ib_counter_t<lint, 1, single_indexer_t> lint_ctr_1_t; + typedef ib_counter_t<ulint, 64> ulint_ctr_64_t; + typedef ib_counter_t<ib_int64_t, 1, single_indexer_t> ib_int64_ctr_1_t; + + /** Count the amount of data written in total (in bytes) */ + ulint_ctr_1_t data_written; + + /** Number of the log write requests done */ + ulint_ctr_1_t log_write_requests; + + /** Number of physical writes to the log performed */ + ulint_ctr_1_t log_writes; + + /** Amount of data written to the log files in bytes */ + lsn_ctr_1_t os_log_written; + + /** Number of writes being done to the log files */ + lint_ctr_1_t os_log_pending_writes; + + /** We increase this counter, when we don't have enough + space in the log buffer and have to flush it */ + ulint_ctr_1_t log_waits; + + /** Count the number of times the doublewrite buffer was flushed */ + ulint_ctr_1_t dblwr_writes; + + /** Store the number of pages that have been flushed to the + doublewrite buffer */ + ulint_ctr_1_t dblwr_pages_written; + + /** Store the number of write requests issued */ + ulint_ctr_1_t buf_pool_write_requests; + + /** Store the number of times when we had to wait for a free page + in the buffer pool. It happens when the buffer pool is full and we + need to make a flush, in order to be able to read or create a page. */ + ulint_ctr_1_t buf_pool_wait_free; + + /** Count the number of pages that were written from buffer + pool to the disk */ + ulint_ctr_1_t buf_pool_flushed; + + /** Number of buffer pool reads that led to the reading of + a disk page */ + ulint_ctr_1_t buf_pool_reads; + + /** Number of data read in total (in bytes) */ + ulint_ctr_1_t data_read; + + /** Wait time of database locks */ + ib_int64_ctr_1_t n_lock_wait_time; + + /** Number of database lock waits */ + ulint_ctr_1_t n_lock_wait_count; + + /** Number of threads currently waiting on database locks */ + lint_ctr_1_t n_lock_wait_current_count; + + /** Number of rows read. */ + ulint_ctr_64_t n_rows_read; + + /** Number of rows updated */ + ulint_ctr_64_t n_rows_updated; + + /** Number of rows deleted */ + ulint_ctr_64_t n_rows_deleted; + + /** Number of rows inserted */ + ulint_ctr_64_t n_rows_inserted; + + ulint_ctr_1_t lock_deadlock_count; + + ulint_ctr_1_t n_lock_max_wait_time; +}; + +extern const char* srv_main_thread_op_info; + +/** Prefix used by MySQL to indicate pre-5.1 table name encoding */ +extern const char srv_mysql50_table_name_prefix[10]; + +/* The monitor thread waits on this event. */ +extern os_event_t srv_monitor_event; + +/* The error monitor thread waits on this event. 
 */
+extern os_event_t	srv_error_event;
+
+/** The buffer pool dump/load thread waits on this event. */
+extern os_event_t	srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
+extern char*		srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char		srv_buffer_pool_dump_at_shutdown;
+extern char		srv_buffer_pool_load_at_startup;
+
+/* Whether to disable file system cache if it is defined */
+extern char	srv_disable_sort_file_cache;
+
+/* This event is set on checkpoint completion to wake the redo log parser
+thread */
+extern os_event_t	srv_checkpoint_completed_event;
+
+/* This event is set by the online redo log following thread after a
+successful log tracking iteration */
+extern os_event_t	srv_redo_log_tracked_event;
+
+/** srv_redo_log_follow_thread spawn flag */
+extern bool	srv_redo_log_thread_started;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT	\
+	(srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
+
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+extern ib_mutex_t	srv_monitor_file_mutex;
+
+/* prototypes for new functions added to ha_innodb.cc */
+ibool	innobase_get_slow_log();
+
+/* Temporary file for innodb monitor output */
+extern FILE*	srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile. Only created if !srv_read_only_mode.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+extern ib_mutex_t	srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+extern FILE*	srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern ib_mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE*	srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char*	srv_data_home;
+
+#ifdef UNIV_LOG_ARCHIVE
+extern char*	srv_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool	srv_read_only_mode;
+/** Store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool	srv_file_per_table;
+/** Sleep delay for threads waiting to enter InnoDB. In micro-seconds. */
+extern ulong	srv_thread_sleep_delay;
+#if defined(HAVE_ATOMIC_BUILTINS)
+/** Maximum sleep delay (in micro-seconds), value of 0 disables it.*/
+extern ulong	srv_adaptive_max_sleep_delay;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/** The file format to use on new *.ibd files. */
+extern ulint	srv_file_format;
+/** Whether to check file format during startup. A value of
+UNIV_FORMAT_MAX + 1 means no checking, i.e. FALSE. The default is to
+set it to the highest format we support. */
+extern ulint	srv_max_file_format_at_startup;
+/** Place locks on records only, i.e.
do not use next-key locking except +on duplicate key checking and foreign key checking */ +extern ibool srv_locks_unsafe_for_binlog; + +/** Sort buffer size in index creation */ +extern ulong srv_sort_buf_size; +/** Maximum modification log file size for online index creation */ +extern unsigned long long srv_online_max_size; + +/* If this flag is TRUE, then we will use the native aio of the +OS (provided we compiled Innobase with it in), otherwise we will +use simulated aio we build below with threads. +Currently we support native aio on windows and linux */ +extern my_bool srv_use_native_aio; +#ifdef __WIN__ +extern ibool srv_use_native_conditions; +#endif /* __WIN__ */ +#endif /* !UNIV_HOTBACKUP */ + +/** Server undo tablespaces directory, can be absolute path. */ +extern char* srv_undo_dir; + +/** Number of undo tablespaces to use. */ +extern ulong srv_undo_tablespaces; + +/** The number of UNDO tablespaces that are open and ready to use. */ +extern ulint srv_undo_tablespaces_open; + +/* The number of undo segments to use */ +extern ulong srv_undo_logs; + +extern ulint srv_n_data_files; +extern char** srv_data_file_names; +extern ulint* srv_data_file_sizes; +extern ulint* srv_data_file_is_raw_partition; + +extern my_bool srv_track_changed_pages; +extern ulonglong srv_max_bitmap_file_size; + +extern +ulonglong srv_max_changed_pages; + +extern ibool srv_auto_extend_last_data_file; +extern ulint srv_last_file_size_max; +extern char* srv_log_group_home_dir; +#ifndef UNIV_HOTBACKUP +extern ulong srv_auto_extend_increment; + +extern ibool srv_created_new_raw; + +/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */ +#define SRV_N_LOG_FILES_MAX 100 +extern ulong srv_n_log_files; +extern ib_uint64_t srv_log_file_size; +extern ib_uint64_t srv_log_file_size_requested; +extern ulint srv_log_buffer_size; +extern uint srv_flush_log_at_timeout; +extern char srv_use_global_flush_log_at_trx_commit; +extern char srv_adaptive_flushing; + +/* If this flag is TRUE, then we will load the indexes' (and tables') metadata +even if they are marked as "corrupted". 
Mostly it is for DBA to process +corrupted index and table */ +extern my_bool srv_load_corrupted; + +extern ulint srv_show_locks_held; +extern ulint srv_show_verbose_locks; + +/* The sort order table of the MySQL latin1_swedish_ci character set +collation */ +extern const byte* srv_latin1_ordering; +#ifndef UNIV_HOTBACKUP +extern my_bool srv_use_sys_malloc; +#else +extern ibool srv_use_sys_malloc; +#endif /* UNIV_HOTBACKUP */ +extern ulint srv_buf_pool_size; /*!< requested size in bytes */ +extern my_bool srv_buf_pool_populate; /*!< virtual page preallocation */ +extern ulint srv_buf_pool_instances; /*!< requested number of buffer pool instances */ +extern ulong srv_n_page_hash_locks; /*!< number of locks to + protect buf_pool->page_hash */ +extern ulong srv_LRU_scan_depth; /*!< Scan depth for LRU + flush batch */ +extern ulong srv_flush_neighbors; /*!< whether or not to flush + neighbors of a block */ +extern ulint srv_buf_pool_old_size; /*!< previously requested size */ +extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */ +extern ulint srv_mem_pool_size; +extern ulint srv_lock_table_size; + +extern ulint srv_foreground_preflush;/*!< Query thread preflush algorithm */ + +extern ulint srv_cleaner_max_lru_time;/*!< the maximum time limit for a + single LRU tail flush iteration by the + page cleaner thread */ + +extern ulint srv_cleaner_max_flush_time;/*!< the maximum time limit for a + single flush list flush iteration by + the page cleaner thread */ + +extern ulint srv_cleaner_flush_chunk_size; + /*!< page cleaner flush list flush + batches are further divided into this + chunk size */ + +extern ulint srv_cleaner_lru_chunk_size; + /*!< page cleaner LRU list flush + batches are further divided into this + chunk size */ + +extern ulint srv_cleaner_free_list_lwm;/*!< if free list length is lower + than this percentage of + srv_LRU_scan_depth, page cleaner LRU + flushes will issue flush batches to the + same instance in a row */ + +extern my_bool srv_cleaner_eviction_factor; + /*!< if TRUE, page cleaner heuristics + use evicted instead of flushed page + counts for its heuristics */ + +extern ulong srv_cleaner_lsn_age_factor; + /*!< page cleaner LSN age factor + formula option */ + +extern ulong srv_empty_free_list_algorithm; + /*!< Empty free list for a query thread + handling algorithm option */ + +extern ulint srv_n_file_io_threads; +extern my_bool srv_random_read_ahead; +extern ulong srv_read_ahead_threshold; +extern ulint srv_n_read_io_threads; +extern ulint srv_n_write_io_threads; + +/* Number of IO operations per second the server can do */ +extern ulong srv_io_capacity; + +/* We use this dummy default value at startup for max_io_capacity. +The real value is set based on the value of io_capacity. */ +#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL) +#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL) +extern ulong srv_max_io_capacity; +/* Returns the number of IO operations that is X percent of the +capacity. PCT_IO(5) -> returns the number of IO operations that +is 5% of the max where max is srv_io_capacity. */ +#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) (p) / 100.0))) + +/* The "innodb_stats_method" setting, decides how InnoDB is going +to treat NULL value when collecting statistics. It is not defined +as enum type because the configure option takes unsigned integer type. 
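+(The recognized settings correspond to treating NULLs as equal, as unequal,
+or as ignored when sampling index pages for statistics.)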
*/ +extern ulong srv_innodb_stats_method; + +#ifdef UNIV_LOG_ARCHIVE +extern ibool srv_log_archive_on; +extern ibool srv_archive_recovery; +extern ib_uint64_t srv_archive_recovery_limit_lsn; +#endif /* UNIV_LOG_ARCHIVE */ + +extern char* srv_file_flush_method_str; +extern ulint srv_unix_file_flush_method; +extern ulint srv_win_file_flush_method; + +extern ulint srv_max_n_open_files; + +extern ulong srv_max_dirty_pages_pct; +extern ulong srv_max_dirty_pages_pct_lwm; + +extern ulong srv_adaptive_flushing_lwm; +extern ulong srv_flushing_avg_loops; + +extern ulong srv_force_recovery; +#ifndef DBUG_OFF +extern ulong srv_force_recovery_crash; +#endif /* !DBUG_OFF */ + +extern ulint srv_fast_shutdown; /*!< If this is 1, do not do a + purge and index buffer merge. + If this 2, do not even flush the + buffer pool to data files at the + shutdown: we effectively 'crash' + InnoDB (but lose no committed + transactions). */ +extern ibool srv_innodb_status; + +extern unsigned long long srv_stats_transient_sample_pages; +extern my_bool srv_stats_persistent; +extern unsigned long long srv_stats_persistent_sample_pages; +extern my_bool srv_stats_auto_recalc; + +extern ibool srv_use_doublewrite_buf; +extern ulong srv_doublewrite_batch_size; +extern ibool srv_use_atomic_writes; +#ifdef HAVE_POSIX_FALLOCATE +extern ibool srv_use_posix_fallocate; +#endif +extern ulong srv_checksum_algorithm; + +extern ulong srv_log_arch_expire_sec; + +extern ulong srv_max_buf_pool_modified_pct; +extern ulong srv_max_purge_lag; +extern ulong srv_max_purge_lag_delay; + +extern ulong srv_replication_delay; + +extern ulint srv_pass_corrupt_table; + +extern ulint srv_log_checksum_algorithm; + +/* Helper macro to support srv_pass_corrupt_table checks. If 'cond' is FALSE, +execute 'code' if srv_pass_corrupt_table is non-zero, or trigger a fatal error +otherwise. The break statement in 'code' will obviously not work as +expected. */ + +#define SRV_CORRUPT_TABLE_CHECK(cond,code) \ + do { \ + if (UNIV_UNLIKELY(!(cond))) { \ + if (srv_pass_corrupt_table) { \ + code \ + } else { \ + ut_error; \ + } \ + } \ + } while(0) + +/*-------------------------------------------*/ + +extern ulint srv_read_views_memory; +extern ulint srv_descriptors_memory; + +extern my_bool srv_print_innodb_monitor; +extern my_bool srv_print_innodb_lock_monitor; +extern ibool srv_print_innodb_tablespace_monitor; +extern ibool srv_print_verbose_log; +#define DEPRECATED_MSG_INNODB_TABLE_MONITOR \ + "Using innodb_table_monitor is deprecated and it may be removed " \ + "in future releases. 
+
+/*-------------------------------------------*/
+
+extern ulint	srv_read_views_memory;
+extern ulint	srv_descriptors_memory;
+
+extern my_bool	srv_print_innodb_monitor;
+extern my_bool	srv_print_innodb_lock_monitor;
+extern ibool	srv_print_innodb_tablespace_monitor;
+extern ibool	srv_print_verbose_log;
+#define DEPRECATED_MSG_INNODB_TABLE_MONITOR \
+	"Using innodb_table_monitor is deprecated and it may be removed " \
+	"in future releases. Please use the InnoDB INFORMATION_SCHEMA " \
+	"tables instead, see " REFMAN "innodb-i_s-tables.html"
+extern ibool	srv_print_innodb_table_monitor;
+
+extern ibool	srv_monitor_active;
+extern ibool	srv_error_monitor_active;
+
+/* TRUE during the lifetime of the buffer pool dump/load thread */
+extern ibool	srv_buf_dump_thread_active;
+
+/* TRUE during the lifetime of the stats thread */
+extern ibool	srv_dict_stats_thread_active;
+
+extern ulong	srv_n_spin_wait_rounds;
+extern ulong	srv_n_free_tickets_to_enter;
+extern ulong	srv_thread_sleep_delay;
+extern ulong	srv_spin_wait_delay;
+extern ibool	srv_priority_boost;
+
+extern ulint	srv_truncated_status_writes;
+extern ulint	srv_available_undo_logs;
+
+extern ulint	srv_mem_pool_size;
+extern ulint	srv_lock_table_size;
+
+#ifdef UNIV_DEBUG
+extern ibool	srv_print_thread_releases;
+extern ibool	srv_print_lock_waits;
+extern ibool	srv_print_buf_io;
+extern ibool	srv_print_log_io;
+extern ibool	srv_print_latch_waits;
+#else /* UNIV_DEBUG */
+# define srv_print_thread_releases	FALSE
+# define srv_print_lock_waits		FALSE
+# define srv_print_buf_io		FALSE
+# define srv_print_log_io		FALSE
+# define srv_print_latch_waits		FALSE
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+extern my_bool	srv_ibuf_disable_background_merge;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+extern my_bool	srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+extern ulint	srv_fatal_semaphore_wait_threshold;
+#define SRV_SEMAPHORE_WAIT_EXTENSION	7200
+extern ulint	srv_dml_needed_delay;
+extern lint	srv_kill_idle_transaction;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+extern ib_mutex_t	server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+#define SRV_MAX_N_IO_THREADS	130
+
+#define SRV_MAX_N_PURGE_THREADS	32
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+extern const char* srv_io_thread_op_info[];
+extern const char* srv_io_thread_function[];
+
+/* The tid of the cleaner thread */
+extern os_tid_t	srv_cleaner_tid;
+
+/* The tid of the LRU manager thread */
+extern os_tid_t	srv_lru_manager_tid;
+
+/* The tids of the purge threads */
+extern os_tid_t	srv_purge_tids[];
+
+/* The tids of the I/O threads */
+extern os_tid_t	srv_io_tids[];
+
+/* The tid of the master thread */
+extern os_tid_t	srv_master_tid;
+
+/* The relative scheduling priority of the cleaner and LRU manager threads */
+extern ulint	srv_sched_priority_cleaner;
+
+/* The relative scheduling priority of the purge threads */
+extern ulint	srv_sched_priority_purge;
+
+/* The relative scheduling priority of the I/O threads */
+extern ulint	srv_sched_priority_io;
+
+/* The relative scheduling priority of the master thread */
+extern ulint	srv_sched_priority_master;
+
+/* The relative priority of the purge coordinator and worker threads. */
+extern my_bool	srv_purge_thread_priority;
+
+/* The relative priority of the I/O threads. */
+extern my_bool	srv_io_thread_priority;
+
+/* The relative priority of the cleaner thread. */
+extern my_bool	srv_cleaner_thread_priority;
+
+/* The relative priority of the master thread. */
+extern my_bool	srv_master_thread_priority;
+
+/* the number of purge threads to use from the worker pool (currently 0 or 1) */
+extern ulong srv_n_purge_threads;
+
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
+
+/* the number of sync wait arrays */
+extern ulong srv_sync_array_size;
+
+/* print all user-level transaction deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
+extern my_bool	srv_cmp_per_index_enabled;
+
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
+
+/** Global counters */
+extern srv_stats_t	srv_stats;
+
+/** When TRUE, fake change transactions take S rather than X row locks.
+When FALSE, row locks are not taken at all. */
+extern my_bool	srv_fake_changes_locks;
+
+
+# ifdef UNIV_PFS_THREAD
+/* Keys to register InnoDB threads with performance schema */
+extern mysql_pfs_key_t	buf_page_cleaner_thread_key;
+extern mysql_pfs_key_t	buf_lru_manager_thread_key;
+extern mysql_pfs_key_t	trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t	io_handler_thread_key;
+extern mysql_pfs_key_t	srv_lock_timeout_thread_key;
+extern mysql_pfs_key_t	srv_error_monitor_thread_key;
+extern mysql_pfs_key_t	srv_monitor_thread_key;
+extern mysql_pfs_key_t	srv_master_thread_key;
+extern mysql_pfs_key_t	srv_purge_thread_key;
+extern mysql_pfs_key_t	recv_writer_thread_key;
+extern mysql_pfs_key_t	srv_log_tracking_thread_key;
+
+/* This macro registers the current thread and its key with performance
+schema */
+# define pfs_register_thread(key)			\
+do {							\
+	struct PSI_thread* psi = PSI_THREAD_CALL(new_thread)(key, NULL, 0);\
+	PSI_THREAD_CALL(set_thread)(psi);		\
+} while (0)
+
+/* This macro delists the current thread from performance schema */
+# define pfs_delete_thread()				\
+do {							\
+	PSI_THREAD_CALL(delete_current_thread)();	\
+} while (0)
+# endif /* UNIV_PFS_THREAD */
+
+#endif /* !UNIV_HOTBACKUP */
+
+/** Types of raw partitions in innodb_data_file_path */
+enum {
+	SRV_NOT_RAW = 0,	/*!< Not a raw partition */
+	SRV_NEW_RAW,		/*!< A 'newraw' partition, only to be
+				initialized */
+	SRV_OLD_RAW		/*!< An initialized raw partition */
+};
+
+/** Alternatives for the file flush option in Unix; see the InnoDB manual
+about what these mean */
+enum {
+	SRV_UNIX_FSYNC = 1,	/*!< fsync, the default */
+	SRV_UNIX_O_DSYNC,	/*!< open log files in O_SYNC mode */
+	SRV_UNIX_LITTLESYNC,	/*!< do not call os_file_flush()
+				when writing data files, but do flush
+				after writing to log files */
+	SRV_UNIX_NOSYNC,	/*!< do not flush after writing */
+	SRV_UNIX_O_DIRECT,	/*!< invoke os_file_set_nocache() on
+				data files. This implies using
+				non-buffered IO but still using fsync,
+				the reason for which is that some FS
+				do not flush meta-data when
+				unbuffered IO happens */
+	SRV_UNIX_O_DIRECT_NO_FSYNC,
+				/*!< do not use fsync() when using
+				direct IO i.e.: it can be set to avoid
+				the fsync() call that we make when
+				using SRV_UNIX_O_DIRECT. However, in
+				this case user/DBA should be sure about
+				the integrity of the meta-data */
+	SRV_UNIX_ALL_O_DIRECT	/*!< similar to O_DIRECT, invokes
+				os_file_set_nocache() on data and log files.
+				This implies using non-buffered IO but still
+				using fsync for data but not log files. */
+};
+
+/** Alternatives for file i/o in Windows */
+enum {
+	SRV_WIN_IO_NORMAL = 1,	/*!< buffered I/O */
+	SRV_WIN_IO_UNBUFFERED	/*!< unbuffered I/O; this is the default */
+};
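
For context, srv_unix_file_flush_method (declared earlier in this header) holds one of the SRV_UNIX_* values above, selected by the innodb_flush_method option. A hedged sketch of the kind of guard the I/O layer applies; the real call sites live in os0file.cc and fil0fil.cc and differ in detail:

	/* sketch only: bypass the OS cache for data files under O_DIRECT */
	if (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
	    || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {

		os_file_set_nocache(fd, file_name, "open");
	}
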
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that intact tables and
+rows can be dumped with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+	SRV_FORCE_IGNORE_CORRUPT = 1,	/*!< let the server run even if it
+					detects a corrupt page */
+	SRV_FORCE_NO_BACKGROUND	= 2,	/*!< prevent the main thread from
+					running: if a crash would occur
+					in purge, this prevents it */
+	SRV_FORCE_NO_TRX_UNDO = 3,	/*!< do not run trx rollback after
+					recovery */
+	SRV_FORCE_NO_IBUF_MERGE = 4,	/*!< prevent also ibuf operations:
+					if they would cause a crash, better
+					not do them */
+	SRV_FORCE_NO_UNDO_LOG_SCAN = 5,	/*!< do not look at undo logs when
+					starting the database: InnoDB will
+					treat even incomplete transactions
+					as committed */
+	SRV_FORCE_NO_LOG_REDO = 6	/*!< do not do the log roll-forward
+					in connection with recovery */
+};
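
Internally these levels are consulted as simple threshold comparisons, so a higher setting implies every restriction below it. A minimal sketch of the pattern (the call site shown is hypothetical, not from this patch):

	/* sketch: skip background work at SRV_FORCE_NO_BACKGROUND and above */
	if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {

		/* safe to start the purge coordinator here */
	}
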
+
+/* Alternatives for srv_innodb_stats_method, which could be changed by
+setting innodb_stats_method */
+enum srv_stats_method_name_enum {
+	SRV_STATS_NULLS_EQUAL,		/* All NULL values are treated as
+					equal. This is the default setting
+					for innodb_stats_method */
+	SRV_STATS_NULLS_UNEQUAL,	/* All NULL values are treated as
+					NOT equal. */
+	SRV_STATS_NULLS_IGNORED		/* NULL values are ignored */
+};
+
+typedef enum srv_stats_method_name_enum	srv_stats_method_name_t;
+
+#ifndef UNIV_HOTBACKUP
+/** Types of threads existing in the system. */
+enum srv_thread_type {
+	SRV_NONE,			/*!< None */
+	SRV_WORKER,			/*!< threads serving parallelized
+					queries and queries released from
+					lock wait */
+	SRV_PURGE,			/*!< Purge coordinator thread */
+	SRV_MASTER			/*!< the master thread, (whose type
+					number must be the biggest) */
+};
+
+/*********************************************************************//**
+Boots Innobase server. */
+UNIV_INTERN
+void
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void);
+/*==========*/
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void);
+/*==================*/
+/*********************************************************************//**
+Sets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+	ulint		i,	/*!< in: the 'segment' of the i/o thread */
+	const char*	str);	/*!< in: constant char string describing the
+				state */
+/*********************************************************************//**
+Resets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_reset_io_thread_op_info();
+/*=========================*/
+/*******************************************************************//**
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping). Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our operation with the srv_sys_t::mutex, for
+performance reasons). */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void);
+/*=====================================*/
+/*******************************************************************//**
+Tells the Innobase server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void);
+/*===============================*/
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void);
+/*========================*/
+/******************************************************************//**
+A thread which follows the redo log and outputs the changed page bitmap.
+@return a dummy value */
+extern "C"
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_redo_log_follow_thread)(
+/*=======================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for the
+				lock_sys_t::mutex */
+	ulint*	trx_start,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end);	/*!< out: file position of the end of
+				the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void);
+/*==========================*/
+/*************************************************************//**
+Removes old archived transaction log files.
+The two parameters cannot both be provided at the same time.
+@return DB_SUCCESS on success, otherwise DB_ERROR */
+UNIV_INTERN
+dberr_t
+purge_archived_logs(
+	time_t	before_date,	/*!< in: all files modified
+				before timestamp should be removed */
+	lsn_t	before_lsn);	/*!< in: files with this lsn in name
+				and earlier should be removed */
+/*==========================*/
+/*******************************************************************//**
+Get current server activity count. We don't hold srv_sys::mutex while
+reading this value as it is only used in heuristics.
+@return activity count. */
+UNIV_INTERN
+ulint
+srv_get_activity_count(void);
+/*========================*/
+/*******************************************************************//**
+Check if there has been any activity.
+@return FALSE if no change in activity counter. */
+UNIV_INTERN
+ibool
+srv_check_activity(
+/*===============*/
+	ulint		old_activity_count);	/*!< old activity count */
+/******************************************************************//**
+Increment the server activity counter. */
+UNIV_INTERN
+void
+srv_inc_activity_count(void);
+/*=========================*/
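
Together, the three declarations above form a polling heuristic: a background thread snapshots the counter, sleeps, and then asks whether anything happened in between. A minimal sketch; the one-second interval is illustrative, not mandated by this header:

	ulint	count = srv_get_activity_count();

	os_thread_sleep(1000000);	/* microseconds, i.e. one second */

	if (srv_check_activity(count)) {

		/* user activity occurred; stay in the active loop */
	}
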
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/**********************************************************************//**
+Check whether any background thread is active. If so, return the thread
+type.
+@return SRV_NONE if all are suspended or have exited, thread
+type if any are still active. */
+UNIV_INTERN
+enum srv_thread_type
+srv_get_active_thread_type(void);
+/*============================*/
+
+extern "C" {
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+
+/*********************************************************************//**
+The master thread controlling the server.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_master_thread)(
+/*==============================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+
+/*************************************************************************
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+
+/*********************************************************************//**
+Purge coordinator thread that schedules the purge tasks.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_purge_coordinator_thread)(
+/*=========================================*/
+	void*	arg __attribute__((unused)));	/*!< in: a dummy parameter
+						required by os_thread_create */
+
+/*********************************************************************//**
+Worker thread that reads tasks from the work queue and executes them.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_worker_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)));	/*!< in: a dummy parameter
+						required by os_thread_create */
+}	/* extern "C" */
+
+/**********************************************************************//**
+Get count of tasks in the queue.
+@return number of tasks in queue */
+UNIV_INTERN
+ulint
+srv_get_task_queue_length(void);
+/*===========================*/
+
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+	enum srv_thread_type	type,	/*!< in: thread type */
+	ulint			n);	/*!< in: number of threads to release */
+
+/**********************************************************************//**
+Check whether any background threads are active. If so, print which thread
+is active and send the threads a wakeup signal.
+@return name of thread that is active or NULL */
+UNIV_INTERN
+const char*
+srv_any_background_threads_are_active(void);
+/*=======================================*/
+
+/**********************************************************************//**
+Wake up the purge threads. */
+UNIV_INTERN
+void
+srv_purge_wakeup(void);
+/*==================*/
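
A hedged sketch of how a shutdown path can poll the declaration above; the loop and the logging shown are illustrative, not taken from this patch:

	const char*	thread_name;

	while ((thread_name = srv_any_background_threads_are_active()) != NULL) {

		fprintf(stderr, "Waiting for %s to exit\n", thread_name);
		os_thread_sleep(100000);	/* 100 ms */
	}
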
+
+/** Status variables to be passed to MySQL */
+struct export_var_t{
+	ulint innodb_adaptive_hash_hash_searches;
+	ulint innodb_adaptive_hash_non_hash_searches;
+	ulint innodb_background_log_sync;
+	ulint innodb_data_pending_reads;	/*!< Pending reads */
+	ulint innodb_data_pending_writes;	/*!< Pending writes */
+	ulint innodb_data_pending_fsyncs;	/*!< Pending fsyncs */
+	ulint innodb_data_fsyncs;		/*!< Number of fsyncs so far */
+	ulint innodb_data_read;			/*!< Data bytes read */
+	ulint innodb_data_writes;		/*!< I/O write requests */
+	ulint innodb_data_written;		/*!< Data bytes written */
+	ulint innodb_data_reads;		/*!< I/O read requests */
+	char  innodb_buffer_pool_dump_status[512];/*!< Buf pool dump status */
+	char  innodb_buffer_pool_load_status[512];/*!< Buf pool load status */
+	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
+	ulint innodb_buffer_pool_pages_data;	/*!< Data pages */
+	ulint innodb_buffer_pool_bytes_data;	/*!< File bytes used */
+	ulint innodb_buffer_pool_pages_dirty;	/*!< Dirty data pages */
+	ulint innodb_buffer_pool_bytes_dirty;	/*!< File bytes modified */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+	ulint innodb_buffer_pool_pages_free;	/*!< Free pages */
+#ifdef UNIV_DEBUG
+	ulint innodb_buffer_pool_pages_latched;	/*!< Latched pages */
+#endif /* UNIV_DEBUG */
+	ulint innodb_buffer_pool_pages_made_not_young;
+	ulint innodb_buffer_pool_pages_made_young;
+	ulint innodb_buffer_pool_pages_old;
+	ulint innodb_buffer_pool_read_requests;	/*!< buf_pool->stat.n_page_gets */
+	ulint innodb_buffer_pool_reads;		/*!< srv_buf_pool_reads */
+	ulint innodb_buffer_pool_wait_free;	/*!< srv_buf_pool_wait_free */
+	ulint innodb_buffer_pool_pages_flushed;	/*!< srv_buf_pool_flushed */
+	ulint innodb_buffer_pool_pages_LRU_flushed;	/*!< buf_lru_flush_page_count */
+	ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
+	ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
+	ulint innodb_buffer_pool_read_ahead;	/*!< srv_read_ahead */
+	ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
+	ulint innodb_checkpoint_age;
+	ulint innodb_checkpoint_max_age;
+	ulint innodb_dblwr_pages_written;	/*!< srv_dblwr_pages_written */
+	ulint innodb_dblwr_writes;		/*!< srv_dblwr_writes */
+	ulint innodb_deadlocks;
+	ibool innodb_have_atomic_builtins;	/*!< HAVE_ATOMIC_BUILTINS */
+	ulint innodb_history_list_length;
+	ulint innodb_ibuf_size;
+	ulint innodb_ibuf_free_list;
+	ulint innodb_ibuf_segment_size;
+	ulint innodb_ibuf_merges;
+	ulint innodb_ibuf_merged_inserts;
+	ulint innodb_ibuf_merged_delete_marks;
+	ulint innodb_ibuf_merged_deletes;
+	ulint innodb_ibuf_discarded_inserts;
+	ulint innodb_ibuf_discarded_delete_marks;
+	ulint innodb_ibuf_discarded_deletes;
+	ulint innodb_log_waits;			/*!< srv_log_waits */
+	ulint innodb_log_write_requests;	/*!< srv_log_write_requests */
+	ulint innodb_log_writes;		/*!< srv_log_writes */
+	lsn_t innodb_os_log_written;		/*!< srv_os_log_written */
+	lsn_t innodb_lsn_current;
+	lsn_t innodb_lsn_flushed;
+	lsn_t innodb_lsn_last_checkpoint;
+	ulint innodb_master_thread_active_loops;/*!< srv_main_active_loops */
+	ulint
innodb_master_thread_idle_loops; /*!< srv_main_idle_loops */ + ib_int64_t innodb_max_trx_id; + ulint innodb_mem_adaptive_hash; + ulint innodb_mem_dictionary; + ulint innodb_mem_total; + ib_int64_t innodb_mutex_os_waits; + ib_int64_t innodb_mutex_spin_rounds; + ib_int64_t innodb_mutex_spin_waits; + ib_int64_t innodb_oldest_view_low_limit_trx_id; + ulint innodb_os_log_fsyncs; /*!< fil_n_log_flushes */ + ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */ + ulint innodb_os_log_pending_fsyncs; /*!< fil_n_pending_log_flushes */ + ulint innodb_page_size; /*!< UNIV_PAGE_SIZE */ + ulint innodb_pages_created; /*!< buf_pool->stat.n_pages_created */ + ulint innodb_pages_read; /*!< buf_pool->stat.n_pages_read */ + ulint innodb_pages_written; /*!< buf_pool->stat.n_pages_written */ + ib_int64_t innodb_purge_trx_id; + ib_int64_t innodb_purge_undo_no; + ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ + ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */ + ib_int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time + / 1000 */ + ulint innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time + / 1000 + / srv_n_lock_wait_count */ + ulint innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time + / 1000 */ + ulint innodb_current_row_locks; + ulint innodb_rows_read; /*!< srv_n_rows_read */ + ulint innodb_rows_inserted; /*!< srv_n_rows_inserted */ + ulint innodb_rows_updated; /*!< srv_n_rows_updated */ + ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */ + ulint innodb_num_open_files; /*!< fil_n_file_opened */ + ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */ + ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */ + ulint innodb_read_views_memory; /*!< srv_read_views_memory */ + ulint innodb_descriptors_memory; /*!< srv_descriptors_memory */ + ib_int64_t innodb_s_lock_os_waits; + ib_int64_t innodb_s_lock_spin_rounds; + ib_int64_t innodb_s_lock_spin_waits; + ib_int64_t innodb_x_lock_os_waits; + ib_int64_t innodb_x_lock_spin_rounds; + ib_int64_t innodb_x_lock_spin_waits; +#ifdef UNIV_DEBUG + ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ + ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id + - purged view's min trx_id */ +#endif /* UNIV_DEBUG */ +}; + +/** Thread slot in the thread table. */ +struct srv_slot_t{ + srv_thread_type type; /*!< thread type: user, + utility etc. */ + ibool in_use; /*!< TRUE if this slot + is in use */ + ibool suspended; /*!< TRUE if the thread is + waiting for the event of this + slot */ + ib_time_t suspend_time; /*!< time when the thread was + suspended. Initialized by + lock_wait_table_reserve_slot() + for lock wait */ + ulong wait_timeout; /*!< wait time that if exceeded + the thread will be timed out. 
+ Initialized by + lock_wait_table_reserve_slot() + for lock wait */ + os_event_t event; /*!< event used in suspending + the thread when it has nothing + to do */ + que_thr_t* thr; /*!< suspended query thread + (only used for user threads) */ +}; + +#else /* !UNIV_HOTBACKUP */ +# define srv_use_adaptive_hash_indexes FALSE +# define srv_use_native_aio FALSE +# define srv_force_recovery 0UL +# define srv_set_io_thread_op_info(t,info) ((void) 0) +# define srv_reset_io_thread_op_info() ((void) 0) +# define srv_is_being_started 0 +# define srv_win_file_flush_method SRV_WIN_IO_UNBUFFERED +# define srv_unix_file_flush_method SRV_UNIX_O_DSYNC +# define srv_start_raw_disk_in_use 0 +# define srv_file_per_table 1 +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/xtradb/include/srv0srv.ic b/storage/xtradb/include/srv0srv.ic new file mode 100644 index 00000000000..53405c06f97 --- /dev/null +++ b/storage/xtradb/include/srv0srv.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0srv.ic +Server main program + +Created 10/4/1995 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/srv0start.h b/storage/xtradb/include/srv0start.h new file mode 100644 index 00000000000..40d502f4459 --- /dev/null +++ b/storage/xtradb/include/srv0start.h @@ -0,0 +1,167 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0start.h +Starts the Innobase database server + +Created 10/10/1995 Heikki Tuuri +*******************************************************/ + +#ifndef srv0start_h +#define srv0start_h + +#include "univ.i" +#include "log0log.h" +#include "ut0byte.h" + +#ifdef __WIN__ +#define SRV_PATH_SEPARATOR '\\' +#else +#define SRV_PATH_SEPARATOR '/' +#endif + +/*********************************************************************//** +Normalizes a directory path for Windows: converts slashes to backslashes. */ +UNIV_INTERN +void +srv_normalize_path_for_win( +/*=======================*/ + char* str); /*!< in/out: null-terminated character string */ +/*********************************************************************//** +Reads the data files and their sizes from a character string given in +the .cnf file. +@return TRUE if ok, FALSE on parse error */ +UNIV_INTERN +ibool +srv_parse_data_file_paths_and_sizes( +/*================================*/ + char* str); /*!< in/out: the data file path string */ +/*********************************************************************//** +Frees the memory allocated by srv_parse_data_file_paths_and_sizes() +and srv_parse_log_group_home_dirs(). */ +UNIV_INTERN +void +srv_free_paths_and_sizes(void); +/*==========================*/ +/*********************************************************************//** +Adds a slash or a backslash to the end of a string if it is missing +and the string is not empty. +@return string which has the separator if the string is not empty */ +UNIV_INTERN +char* +srv_add_path_separator_if_needed( +/*=============================*/ + char* str); /*!< in: null-terminated character string */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Starts Innobase and creates a new database if database files +are not found and the user wants. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +innobase_start_or_create_for_mysql(void); +/*====================================*/ +/****************************************************************//** +Shuts down the Innobase database. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +innobase_shutdown_for_mysql(void); + +/******************************************************************** +Signal all per-table background threads to shutdown, and wait for them to do +so. */ +UNIV_INTERN +void +srv_shutdown_table_bg_threads(void); +/*=============================*/ + +/*************************************************************//** +Copy the file path component of the physical file to parameter. It will +copy up to and including the terminating path separator. +@return number of bytes copied or ULINT_UNDEFINED if destination buffer + is smaller than the path to be copied. 
*/ +UNIV_INTERN +ulint +srv_path_copy( +/*==========*/ + char* dest, /*!< out: destination buffer */ + ulint dest_len, /*!< in: max bytes to copy */ + const char* basedir, /*!< in: base directory */ + const char* table_name) /*!< in: source table name */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Get the meta-data filename from the table name. */ +UNIV_INTERN +void +srv_get_meta_data_filename( +/*======================*/ + dict_table_t* table, /*!< in: table */ + char* filename, /*!< out: filename */ + ulint max_len) /*!< in: filename max length */ + __attribute__((nonnull)); + +/** Log sequence number at shutdown */ +extern lsn_t srv_shutdown_lsn; +/** Log sequence number immediately after startup */ +extern lsn_t srv_start_lsn; + +#ifdef HAVE_DARWIN_THREADS +/** TRUE if the F_FULLFSYNC option is available */ +extern ibool srv_have_fullfsync; +#endif + +/** TRUE if the server is being started */ +extern ibool srv_is_being_started; +/** TRUE if the server was successfully started */ +extern ibool srv_was_started; +/** TRUE if the server is being started, before rolling back any +incomplete transactions */ +extern ibool srv_startup_is_before_trx_rollback_phase; + +/** TRUE if a raw partition is in use */ +extern ibool srv_start_raw_disk_in_use; + + +/** Shutdown state */ +enum srv_shutdown_state { + SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */ + SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in + logs_empty_and_mark_files_at_shutdown() */ + SRV_SHUTDOWN_FLUSH_PHASE,/*!< At this phase the master and the + purge threads must have completed their + work. Once we enter this phase the + page_cleaner can clean up the buffer + pool and exit */ + SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that + the buffer pool can be freed: flush + all file spaces and close all files */ + SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */ +}; + +/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to +SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ +extern enum srv_shutdown_state srv_shutdown_state; +#endif /* !UNIV_HOTBACKUP */ + +/** Log 'spaces' have id's >= this */ +#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL + +#endif diff --git a/storage/xtradb/include/sync0arr.h b/storage/xtradb/include/sync0arr.h new file mode 100644 index 00000000000..15dbdcb540d --- /dev/null +++ b/storage/xtradb/include/sync0arr.h @@ -0,0 +1,155 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.h
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+
+/** Synchronization wait array cell */
+struct sync_cell_t;
+/** Synchronization wait array */
+struct sync_array_t;
+
+/******************************************************************//**
+Get an instance of the sync wait array and reserve a wait array cell
+in the instance for waiting for an object. The event of the cell is
+reset to nonsignalled state.
+If reserving a cell in one instance fails, try another instance until
+an empty cell can be reserved.
+@return the instance found, never NULL. */
+UNIV_INLINE
+sync_array_t*
+sync_array_get_and_reserve_cell(
+/*============================*/
+	void*		object,	/*!< in: pointer to the object to wait for */
+	ulint		type,	/*!< in: lock request type */
+	const char*	file,	/*!< in: file where requested */
+	ulint		line,	/*!< in: line where requested */
+	ulint*		index);	/*!< out: index of the reserved cell */
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state.
+@return true if free cell is found, otherwise false */
+UNIV_INTERN
+bool
+sync_array_reserve_cell(
+/*====================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	void*		object,	/*!< in: pointer to the object to wait for */
+	ulint		type,	/*!< in: lock request type */
+	const char*	file,	/*!< in: file where requested */
+	ulint		line,	/*!< in: line where requested */
+	ulint*		index);	/*!< out: index of the reserved cell */
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case it prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	ulint		index);	/*!< in: index of the reserved cell */
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	ulint		index);	/*!< in: index of the cell in array */
+/**********************************************************************//**
+Note that one of the wait objects was signalled. */
+UNIV_INTERN
+void
+sync_array_object_signalled(void);
+/*=============================*/
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about once every second in the server. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void);
+/*====================================*/
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(
+/*========================*/
+	os_thread_id_t*	waiter,	/*!< out: longest waiting thread */
+	const void**	sema)	/*!< out: longest-waited-for semaphore */
+	__attribute__((nonnull));
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+	sync_array_t*	arr);	/*!< in: sync wait array */
+/**********************************************************************//**
+Prints info of the wait array. */
+UNIV_INTERN
+void
+sync_array_print(
+/*=============*/
+	FILE*		file);	/*!< in: file where to print */
+
+/**********************************************************************//**
+Create the primary system wait array(s); they are protected by an OS mutex */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+	ulint		n_threads);	/*!< in: Number of slots to create */
+/**********************************************************************//**
+Close sync array wait sub-system. */
+UNIV_INTERN
+void
+sync_array_close(void);
+/*==================*/
+
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void);
+/*================*/
+
+#ifndef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/sync0arr.ic b/storage/xtradb/include/sync0arr.ic
new file mode 100644
index 00000000000..18a46dd0a41
--- /dev/null
+++ b/storage/xtradb/include/sync0arr.ic
@@ -0,0 +1,64 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.ic
+The wait array for synchronization primitives
+
+Inline code
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+/** User configured sync array size */
+extern ulong	srv_sync_array_size;
+
+/******************************************************************//**
+Get an instance of the sync wait array and reserve a wait array cell
+in the instance for waiting for an object. The event of the cell is
+reset to nonsignalled state.
+If reserving a cell in one instance fails, try another instance until
+an empty cell can be reserved.
+@return the instance found, never NULL. */
+UNIV_INLINE
+sync_array_t*
+sync_array_get_and_reserve_cell(
+/*============================*/
+	void*		object,	/*!< in: pointer to the object to wait for */
+	ulint		type,	/*!< in: lock request type */
+	const char*	file,	/*!< in: file where requested */
+	ulint		line,	/*!< in: line where requested */
+	ulint*		index)	/*!< out: index of the reserved cell */
+{
+	sync_array_t*	sync_arr;
+	bool		reserved = false;
+
+	for (ulint i = 0; i < srv_sync_array_size && !reserved; ++i) {
+		sync_arr = sync_array_get();
+		reserved = sync_array_reserve_cell(sync_arr, object, type,
+						   file, line, index);
+	}
+
+	/* The loop above is expected to reserve a cell in one of the
+	srv_sync_array_size instances; the assertion makes that
+	assumption explicit. */
+	ut_a(reserved);
+
+	return sync_arr;
+}
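
The helper above is one half of the wait-array protocol. A hedged sketch of the full reserve, re-check, wait sequence that a latch implementation follows; the second locking attempt shown is hypothetical:

	ulint		index;
	sync_array_t*	sync_arr;

	sync_arr = sync_array_get_and_reserve_cell(
		mutex, SYNC_MUTEX, __FILE__, __LINE__, &index);

	if (mutex_try_lock_again(mutex)) {	/* hypothetical retry */

		/* got the latch after all: release the cell explicitly */
		sync_array_free_cell(sync_arr, index);
	} else {

		/* suspends the thread; the cell is freed on wakeup */
		sync_array_wait_event(sync_arr, index);
	}
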
+
diff --git a/storage/xtradb/include/sync0rw.h b/storage/xtradb/include/sync0rw.h
new file mode 100644
index 00000000000..84ac40bab78
--- /dev/null
+++ b/storage/xtradb/include/sync0rw.h
@@ -0,0 +1,1094 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.h
+The read-write lock (for threads, not for database transactions)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0rw_h
+#define sync0rw_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "ut0lst.h"
+#include "ut0counter.h"
+#include "sync0sync.h"
+#include "os0sync.h"
+
+/* The following undef is to prevent a name conflict with a macro
+in MySQL: */
+#undef rw_lock_t
+#endif /* !UNIV_HOTBACKUP */
+
+/** Counters for RW locks. */
+struct rw_lock_stats_t {
+	typedef ib_counter_t<ib_int64_t, IB_N_SLOTS> ib_int64_counter_t;
+
+	/** number of spin waits on rw-latches,
+	incurred during shared (read) locks */
+	ib_int64_counter_t	rw_s_spin_wait_count;
+
+	/** number of spin loop rounds on rw-latches,
+	incurred during shared (read) locks */
+	ib_int64_counter_t	rw_s_spin_round_count;
+
+	/** number of OS waits on rw-latches,
+	incurred during shared (read) locks */
+	ib_int64_counter_t	rw_s_os_wait_count;
+
+	/** number of unlocks (that unlock shared locks),
+	set only when UNIV_SYNC_PERF_STAT is defined */
+	ib_int64_counter_t	rw_s_exit_count;
+
+	/** number of spin waits on rw-latches,
+	incurred during exclusive (write) locks */
+	ib_int64_counter_t	rw_x_spin_wait_count;
+
+	/** number of spin loop rounds on rw-latches,
+	incurred during exclusive (write) locks */
+	ib_int64_counter_t	rw_x_spin_round_count;
+
+	/** number of OS waits on rw-latches,
+	incurred during exclusive (write) locks */
+	ib_int64_counter_t	rw_x_os_wait_count;
+
+	/** number of unlocks (that unlock exclusive locks),
+	set only when UNIV_SYNC_PERF_STAT is defined */
+	ib_int64_counter_t	rw_x_exit_count;
+};
+
+/* Latch types; these are used also in btr0btr.h: keep the numerical values
+smaller than 30 and the order of the numerical values like below! */
+#define RW_S_LATCH	1
+#define RW_X_LATCH	2
+#define RW_NO_LATCH	3
+
+#ifndef UNIV_HOTBACKUP
+/* We decrement lock_word by this amount for each x_lock. It is also the
+start value for the lock_word, meaning that it limits the maximum number
+of concurrent read locks before the rw_lock breaks. The current value of
+0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
+#define X_LOCK_DECR		0x00100000
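
The two limits quoted in the comment follow from this constant: s-locks decrement lock_word by 1 from a start value of X_LOCK_DECR, so at most X_LOCK_DECR - 1 = 1,048,575 readers fit before the word reaches 0, while each x-lock subtracts a full X_LOCK_DECR, and a 32-bit signed lock_word leaves room for 0x80000000 / 0x00100000 - 1 = 2047 such extra decrements by a recursing writer. A sketch of the resulting encoding, as implied here (the authoritative description is in sync0rw.ic):

	/* lock_word == X_LOCK_DECR        unlocked
	   0 < lock_word < X_LOCK_DECR    (X_LOCK_DECR - lock_word) s-locks
	   lock_word == 0                 one x-lock
	   lock_word < 0                  waiting or recursive x-locking */
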
+
+struct rw_lock_t;
+struct prio_rw_lock_t;
+#ifdef UNIV_SYNC_DEBUG
+struct rw_lock_debug_t;
+#endif /* UNIV_SYNC_DEBUG */
+
+typedef UT_LIST_BASE_NODE_T(rw_lock_t)	rw_lock_list_t;
+
+extern rw_lock_list_t	rw_lock_list;
+extern ib_mutex_t	rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+extern ib_mutex_t	rw_lock_debug_mutex;
+extern os_event_t	rw_lock_debug_event;	/*!< If deadlock detection does
+					not immediately get the mutex it
+					may wait for this event */
+extern ibool		rw_lock_debug_waiters;	/*!< This is set to TRUE if
+					there may be waiters for the event */
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Counters for RW locks. */
+extern rw_lock_stats_t	rw_lock_stats;
+
+#ifdef UNIV_PFS_RWLOCK
+/* Following are rwlock keys used to register with MySQL
+performance schema */
+# ifdef UNIV_LOG_ARCHIVE
+extern	mysql_pfs_key_t	archive_lock_key;
+# endif /* UNIV_LOG_ARCHIVE */
+extern	mysql_pfs_key_t btr_search_latch_key;
+extern	mysql_pfs_key_t	buf_block_lock_key;
+# ifdef UNIV_SYNC_DEBUG
+extern	mysql_pfs_key_t	buf_block_debug_latch_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern	mysql_pfs_key_t	dict_operation_lock_key;
+extern	mysql_pfs_key_t	checkpoint_lock_key;
+extern	mysql_pfs_key_t	fil_space_latch_key;
+extern	mysql_pfs_key_t	fts_cache_rw_lock_key;
+extern	mysql_pfs_key_t	fts_cache_init_rw_lock_key;
+extern	mysql_pfs_key_t	trx_i_s_cache_lock_key;
+extern	mysql_pfs_key_t	trx_purge_latch_key;
+extern	mysql_pfs_key_t	index_tree_rw_lock_key;
+extern	mysql_pfs_key_t	index_online_log_key;
+extern	mysql_pfs_key_t	dict_table_stats_key;
+extern	mysql_pfs_key_t	trx_sys_rw_lock_key;
+extern	mysql_pfs_key_t	hash_table_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+
+#ifndef UNIV_PFS_RWLOCK
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed.
+If MySQL performance schema is enabled and "UNIV_PFS_RWLOCK" is
+defined, the rw-locks are instrumented with performance schema probes. */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(K, L, level)			\
+	rw_lock_create_func((L), (level), __FILE__, __LINE__, #L)
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(K, L, level)			\
+	rw_lock_create_func((L), __FILE__, __LINE__, #L)
+# endif/* UNIV_SYNC_DEBUG */
+# else /* UNIV_DEBUG */
+# define rw_lock_create(K, L, level)			\
+	rw_lock_create_func((L), #L)
+# endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+NOTE! The following macros should be used in rw locking and
+unlocking, not the corresponding functions.
*/ + +# define rw_lock_s_lock(M) \ + rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_s_lock_inline(M, P, F, L) \ + rw_lock_s_lock_func((M), (P), (F), (L)) + +# define rw_lock_s_lock_gen(M, P) \ + rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_gen_nowait(M, P) \ + rw_lock_s_lock_low((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_nowait(M, F, L) \ + rw_lock_s_lock_low((M), 0, (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L) +# else +# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L) +# endif + + +# define rw_lock_x_lock(M) \ + rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_x_lock_inline(M, P, F, L) \ + rw_lock_x_lock_func((M), (P), (F), (L)) + +# define rw_lock_x_lock_gen(M, P) \ + rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_x_lock_nowait(M) \ + rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) + +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + rw_lock_x_lock_func_nowait((M), (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L) +# else +# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L) +# endif + +# define rw_lock_free(M) rw_lock_free_func(M) + +#else /* !UNIV_PFS_RWLOCK */ + +/* Following macros point to Performance Schema instrumented functions. */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), (level), __FILE__, __LINE__, #L) +# else /* UNIV_SYNC_DEBUG */ +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), __FILE__, __LINE__, #L) +# endif/* UNIV_SYNC_DEBUG */ +# else /* UNIV_DEBUG */ +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), #L) +# endif /* UNIV_DEBUG */ + +/****************************************************************** +NOTE! The following macros should be used in rw locking and +unlocking, not the corresponding function. 
*/ + +# define rw_lock_s_lock(M) \ + pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_s_lock_inline(M, P, F, L) \ + pfs_rw_lock_s_lock_func((M), (P), (F), (L)) + +# define rw_lock_s_lock_gen(M, P) \ + pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_gen_nowait(M, P) \ + pfs_rw_lock_s_lock_low((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_nowait(M, F, L) \ + pfs_rw_lock_s_lock_low((M), 0, (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(P, L) +# else +# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(L) +# endif + +# define rw_lock_x_lock(M) \ + pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_x_lock_inline(M, P, F, L) \ + pfs_rw_lock_x_lock_func((M), (P), (F), (L)) + +# define rw_lock_x_lock_gen(M, P) \ + pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_x_lock_nowait(M) \ + pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) + +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + pfs_rw_lock_x_lock_func_nowait((M), (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(P, L) +# else +# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(L) +# endif + +# define rw_lock_free(M) pfs_rw_lock_free_func(M) + +#endif /* UNIV_PFS_RWLOCK */ + +#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0) +#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0) + +/******************************************************************//** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +rw_lock_create_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name); /*!< in: mutex name */ +/******************************************************************//** +Creates, or rather, initializes a priority rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +rw_lock_create_func( +/*================*/ + prio_rw_lock_t* lock, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name); /*!< in: mutex name */ +/******************************************************************//** +Calling this function is obligatory only if the memory buffer containing +the rw-lock is freed. Removes an rw-lock object from the global list. The +rw-lock is checked to be in the non-locked state. 
*/ +UNIV_INTERN +void +rw_lock_free_func( +/*==============*/ + rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Calling this function is obligatory only if the memory buffer containing +the priority rw-lock is freed. Removes an rw-lock object from the global list. +The rw-lock is checked to be in the non-locked state. */ +UNIV_INTERN +void +rw_lock_free_func( +/*==============*/ + prio_rw_lock_t* lock); /*!< in: rw-lock */ +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks that the rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. +@return TRUE */ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Checks that the priority rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. +@return TRUE */ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + prio_rw_lock_t* lock); /*!< in: rw-lock */ +#endif /* UNIV_DEBUG */ +/******************************************************************//** +Low-level function which tries to lock an rw-lock in s-mode. Performs no +spinning. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass __attribute__((unused)), + /*!< in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function, except if +you supply the file name and line number. Lock an rw-lock in shared mode +for the current thread. If the rw-lock is locked in exclusive mode, or +there is an exclusive lock request waiting, the function spins a preset +time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before +suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function, except if +you supply the file name and line number. Lock a priority rw-lock in shared +mode for the current thread, using the relative thread priority. If the +rw-lock is locked in exclusive mode, or there is an exclusive lock request +waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), +waiting for the lock, before suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. 
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	rw_lock_t*	lock);	/*!< in/out: rw-lock */
+
+/******************************************************************//**
+Releases a shared mode priority lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	prio_rw_lock_t*	lock);	/*!< in/out: rw-lock */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line,	/*!< in: line where requested */
+	bool		priority_lock = false,
+				/*!< in: whether the lock is a priority lock */
+	bool		high_priority = false);
+				/*!< in: whether we are acquiring a priority
+				lock with high priority */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock a priority
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+	prio_rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
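
As the comments above state, x-latching is recursive for the same thread when pass == 0. A minimal sketch (the lock variable is hypothetical):

	rw_lock_x_lock(&lock);
	rw_lock_x_lock(&lock);		/* same thread: recursion, does not block */

	rw_lock_x_unlock(&lock);
	rw_lock_x_unlock(&lock);	/* every lock needs a matching unlock */
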
*/ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Releases an exclusive mode priority lock. */ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + prio_rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +This function is used in the insert buffer to move the ownership of an +x-latch on a buffer frame to the current thread. The x-latch was set by +the buffer read operation and it protected the buffer frame while the +read was done. The ownership is moved because we want that the current +thread is able to acquire a second x-latch which is stored in an mtr. +This, in turn, is needed to pass the debug checks of index page +operations. */ +UNIV_INTERN +void +rw_lock_x_lock_move_ownership( +/*==========================*/ + rw_lock_t* lock); /*!< in: lock which was x-locked in the + buffer read */ +/******************************************************************//** +Returns the value of writer_count for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. +@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the value of writer_count for the priority lock. Does not reserve the +lock mutex, so the caller must be sure it is not changed during the call. +@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const prio_rw_lock_t* lock); /*!< in: rw-lock */ +/********************************************************************//** +Check if there are threads waiting for the rw-lock. +@return 1 if waiters, 0 otherwise */ +UNIV_INLINE +ulint +rw_lock_get_waiters( +/*================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/********************************************************************//** +Check if there are threads waiting for the priority rw-lock. +@return 1 if waiters, 0 otherwise */ +UNIV_INLINE +ulint +rw_lock_get_waiters( +/*================*/ + const prio_rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the write-status of the lock - this function made more sense +with the old rw_lock implementation. +@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the write-status of the priority lock - this function made more sense +with the old rw_lock implementation. +@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const prio_rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the number of readers. 
+@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the number of readers. +@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const prio_rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Decrements lock_word the specified amount if it is greater than 0. +This is used by both s_lock and x_lock operations. +@return TRUE if decr occurs */ +UNIV_INLINE +ibool +rw_lock_lock_word_decr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount); /*!< in: amount to decrement */ +/******************************************************************//** +Increments lock_word the specified amount and returns new value. +@return lock->lock_word after increment */ +UNIV_INLINE +lint +rw_lock_lock_word_incr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount); /*!< in: amount to increment */ +/******************************************************************//** +This function sets the lock->writer_thread and lock->recursive fields. +For platforms where we are using atomic builtins instead of lock->mutex +it sets the lock->writer_thread field using atomics to ensure memory +ordering. Note that it is assumed that the caller of this function +effectively owns the lock i.e.: nobody else is allowed to modify +lock->writer_thread at this point in time. +The protocol is that lock->writer_thread MUST be updated BEFORE the +lock->recursive flag is set. */ +UNIV_INLINE +void +rw_lock_set_writer_id_and_recursion_flag( +/*=====================================*/ + rw_lock_t* lock, /*!< in/out: lock to work on */ + ibool recursive); /*!< in: TRUE if recursion + allowed */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Checks if the thread has locked the rw-lock in the specified mode, with +the pass value == 0. */ +UNIV_INTERN +ibool +rw_lock_own( +/*========*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ + __attribute__((warn_unused_result)); +/******************************************************************//** +Checks if the thread has locked the priority rw-lock in the specified mode, +with the pass value == 0. */ +UNIV_INTERN +ibool +rw_lock_own( +/*========*/ + prio_rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ + __attribute__((warn_unused_result)); +#endif /* UNIV_SYNC_DEBUG */ +/******************************************************************//** +Checks if somebody has locked the rw-lock in the specified mode. */ +UNIV_INTERN +ibool +rw_lock_is_locked( +/*==============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +#ifdef UNIV_SYNC_DEBUG +/***************************************************************//** +Prints debug info of an rw-lock. */ +UNIV_INTERN +void +rw_lock_print( +/*==========*/ + rw_lock_t* lock); /*!< in: rw-lock */ +/***************************************************************//** +Prints debug info of currently locked rw-locks. 
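Because rw_lock_own() is declared warn_unused_result, its value is meant to be consumed inside debug assertions. A typical pattern, sketched with the ut_ad() debug-assert macro used throughout this code:

#ifdef UNIV_SYNC_DEBUG
	/* assert the caller holds the x-latch, and no s-latch, before
	touching the protected structure */
	ut_ad(rw_lock_own(lock, RW_LOCK_EX));
	ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */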
*/ +UNIV_INTERN +void +rw_lock_list_print_info( +/*====================*/ + FILE* file); /*!< in: file where to print */ +/***************************************************************//** +Returns the number of currently locked rw-locks. +Works only in the debug version. +@return number of locked rw-locks */ +UNIV_INTERN +ulint +rw_lock_n_locked(void); +/*==================*/ + +/*#####################################################################*/ + +/******************************************************************//** +Acquires the debug mutex. We cannot use the mutex defined in sync0sync, +because the debug mutex is also acquired in sync0arr while holding the OS +mutex protecting the sync array, and the ordinary mutex_enter might +recursively call routines in sync0arr, leading to a deadlock on the OS +mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_enter(void); +/*===========================*/ +/******************************************************************//** +Releases the debug mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_exit(void); +/*==========================*/ +/*********************************************************************//** +Prints info of a debug struct. */ +UNIV_INTERN +void +rw_lock_debug_print( +/*================*/ + FILE* f, /*!< in: output stream */ + rw_lock_debug_t* info); /*!< in: debug struct */ +#endif /* UNIV_SYNC_DEBUG */ + +/* NOTE! The structure appears here only for the compiler to know its size. +Do not use its fields directly! */ + +/** The structure used in the spin lock implementation of a read-write +lock. Several threads may have a shared lock simultaneously in this +lock, but only one writer may have an exclusive lock, in which case no +shared locks are allowed. To prevent starving of a writer blocked by +readers, a writer may queue for x-lock by decrementing lock_word: no +new readers will be let in while the thread waits for readers to +exit. */ +struct rw_lock_t { + volatile lint lock_word; + /*!< Holds the state of the lock. */ + volatile ulint waiters;/*!< 1: there are waiters */ + volatile ibool recursive;/*!< Default value FALSE which means the lock + is non-recursive. The value is typically set + to TRUE making normal rw_locks recursive. In + case of asynchronous IO, when a non-zero + value of 'pass' is passed then we keep the + lock non-recursive. + This flag also tells us about the state of + writer_thread field. If this flag is set + then writer_thread MUST contain the thread + id of the current x-holder or wait-x thread. + This flag must be reset in x_unlock + functions before incrementing the lock_word */ + volatile os_thread_id_t writer_thread; + /*!< Thread id of writer thread. Is only + guaranteed to have sane and non-stale + value iff recursive flag is set. */ + os_event_t event; /*!< Used by sync0arr.cc for thread queueing */ + os_event_t wait_ex_event; + /*!< Event for next-writer to wait on. A thread + must decrement lock_word before waiting. */ +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + ib_mutex_t mutex; /*!< The mutex protecting rw_lock_t */ +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + UT_LIST_NODE_T(rw_lock_t) list; + /*!< All allocated rw locks are put into a + list */ +#ifdef UNIV_SYNC_DEBUG + UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; + /*!< In the debug version: pointer to the debug + info list of the lock */ + ulint level; /*!< Level in the global latching order. 
*/ +#endif /* UNIV_SYNC_DEBUG */ +#ifdef UNIV_PFS_RWLOCK + struct PSI_rwlock *pfs_psi;/*!< The instrumentation hook */ +#endif + ulint count_os_wait; /*!< Count of os_waits. May not be accurate */ + //const char* cfile_name;/*!< File name where lock created */ + const char* lock_name;/*!< lock name */ + /* last s-lock file/line is not guaranteed to be correct */ + const char* last_s_file_name;/*!< File name where last s-locked */ + const char* last_x_file_name;/*!< File name where last x-locked */ + ibool writer_is_wait_ex; + /*!< This is TRUE if the writer field is + RW_LOCK_WAIT_EX; this field is located far + from the memory update hotspot fields which + are at the start of this struct, thus we can + peek this field without causing much memory + bus traffic */ + //unsigned cline:14; /*!< Line where created */ + unsigned last_s_line:14; /*!< Line number where last time s-locked */ + unsigned last_x_line:14; /*!< Line number where last time x-locked */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< RW_LOCK_MAGIC_N */ +/** Value of rw_lock_t::magic_n */ +#define RW_LOCK_MAGIC_N 22643 +#endif /* UNIV_DEBUG */ +}; + +/** The structure implementing a priority rw lock. */ +struct prio_rw_lock_t { + struct rw_lock_t base_lock; /* The regular rw latch + provides the lock word etc. for + the priority rw lock */ + volatile ulint high_priority_s_waiters; + /* Number of high priority S + waiters */ + os_event_t high_priority_s_event; /* High priority wait + array event for S waiters */ + volatile ulint high_priority_x_waiters; + /* Number of high priority X + waiters */ + os_event_t high_priority_x_event; + /* High priority wait array + event for X waiters */ + volatile ulint high_priority_wait_ex_waiter; + /* If 1, a waiting next-writer + exists and is high-priority */ +}; + +#ifdef UNIV_SYNC_DEBUG +/** The structure for storing debug info of an rw-lock. All access to this +structure must be protected by rw_lock_debug_mutex_enter(). */ +struct rw_lock_debug_t { + + os_thread_id_t thread_id; /*!< The thread id of the thread which + locked the rw-lock */ + ulint pass; /*!< Pass value given in the lock operation */ + ulint lock_type; /*!< Type of the lock: RW_LOCK_EX, + RW_LOCK_SHARED, RW_LOCK_WAIT_EX */ + const char* file_name;/*!< File name where the lock was obtained */ + ulint line; /*!< Line where the rw-lock was locked */ + UT_LIST_NODE_T(rw_lock_debug_t) list; + /*!< Debug structs are linked in a two-way + list */ +}; +#endif /* UNIV_SYNC_DEBUG */ + +/* For performance schema instrumentation, a new set of rwlock +wrap functions is created if "UNIV_PFS_RWLOCK" is defined. +The instrumentation is not planted directly into the original +functions, so that the underlying functions are kept as they are. +If a user wants to take some rwlock out of instrumentation even +though performance schema (UNIV_PFS_RWLOCK) is defined, this can +be done by reinstating APIs that link directly to the original +underlying functions. +The instrumented function names have the prefix "pfs_rw_lock_" vs. +the original prefix "rw_lock_". The following functions have been +instrumented: + +rw_lock_create() +rw_lock_x_lock() +rw_lock_x_lock_gen() +rw_lock_x_lock_nowait() +rw_lock_x_unlock_gen() +rw_lock_s_lock() +rw_lock_s_lock_gen() +rw_lock_s_lock_nowait() +rw_lock_s_unlock_gen() +rw_lock_free() +*/ + +#ifdef UNIV_PFS_RWLOCK +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func() +NOTE! 
Please use the corresponding macro rw_lock_create(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + PSI_rwlock_key key, /*!< in: key registered with + performance schema */ + rw_lock_t* lock, /*!< in: rw lock */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name); /*!< in: mutex name */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func() +NOTE! Please use the corresponding macro rw_lock_create(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + PSI_rwlock_key key, /*!< in: key registered with + performance schema */ + prio_rw_lock_t* lock, /*!< in: rw lock */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name); /*!< in: mutex name */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ + +/******************************************************************//** +Performance schema instrumented wrap function for +rw_lock_x_lock_func_nowait() +NOTE! Please use the corresponding macro, not directly this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_x_lock_func_nowait( +/*===========================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_s_lock_low( +/*===================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_s_lock_low( +/*===================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + prio_rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_unlock_func() +NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_unlock_func() +NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + prio_rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + prio_rw_lock_t* lock); /*!< in: rw-lock */ +#endif /* UNIV_PFS_RWLOCK */ + + +#ifndef UNIV_NONINL +#include "sync0rw.ic" +#endif +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/xtradb/include/sync0rw.ic b/storage/xtradb/include/sync0rw.ic new file mode 100644 index 00000000000..8aadc406132 --- /dev/null +++ b/storage/xtradb/include/sync0rw.ic @@ -0,0 +1,1258 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0rw.ic +The read-write lock (for threads) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +/******************************************************************//** +Lock a regular or priority rw-lock in shared mode for the current thread. If +the rw-lock is locked in exclusive mode, or there is an exclusive lock request +waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), +waiting for the lock before suspending the thread. */ +UNIV_INTERN +void +rw_lock_s_lock_spin( +/*================*/ + void* _lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + bool priority_lock, + /*!< in: whether the lock is a priority lock */ + bool high_priority, + /*!< in: whether we are acquiring a priority + lock with high priority */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Inserts the debug information for an rw-lock. */ +UNIV_INTERN +void +rw_lock_add_debug_info( +/*===================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type, /*!< in: lock type */ + const char* file_name, /*!< in: file where requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Removes a debug information struct for an rw-lock. */ +UNIV_INTERN +void +rw_lock_remove_debug_info( +/*======================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type); /*!< in: lock type */ +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************//** +Check if there are threads waiting for the rw-lock. +@return 1 if waiters, 0 otherwise */ +UNIV_INLINE +ulint +rw_lock_get_waiters( +/*================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + return(lock->waiters); +} + +/********************************************************************//** +Check if there are threads waiting for the priority rw-lock. +@return 1 if waiters, 0 otherwise */ +UNIV_INLINE +ulint +rw_lock_get_waiters( +/*================*/ + const prio_rw_lock_t* lock) /*!< in: rw-lock */ +{ + return rw_lock_get_waiters(&lock->base_lock) + || lock->high_priority_s_waiters + || lock->high_priority_x_waiters; +} + +/********************************************************************//** +Sets lock->waiters to 1. It is not an error if lock->waiters is already +1. On platforms where ATOMIC builtins are used this function enforces a +memory barrier. 
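On atomics-enabled builds the barrier mentioned above comes from a full-barrier compare-and-swap. A self-contained sketch of what os_compare_and_swap_ulint() plausibly reduces to with GCC builtins (an assumption; the real definition lives elsewhere in the tree):

#include <stdbool.h>

/* sketch: full-barrier CAS in the style of os_compare_and_swap_ulint() */
static inline bool
cas_ulint(volatile unsigned long* ptr, unsigned long oldv, unsigned long newv)
{
	/* __sync_bool_compare_and_swap() implies a full memory barrier,
	which is what makes the waiter-flag store globally visible */
	return(__sync_bool_compare_and_swap(ptr, oldv, newv));
}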
*/ +UNIV_INLINE +void +rw_lock_set_waiter_flag( +/*====================*/ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + (void) os_compare_and_swap_ulint(&lock->waiters, 0, 1); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lock->waiters = 1; + os_wmb; +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/********************************************************************//** +Resets lock->waiters to 0. It is not an error if lock->waiters is already +0. On platforms where ATOMIC builtins are used this function enforces a +memory barrier. */ +UNIV_INLINE +void +rw_lock_reset_waiter_flag( +/*======================*/ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + (void) os_compare_and_swap_ulint(&lock->waiters, 1, 0); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lock->waiters = 0; + os_wmb; +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +Returns the write-status of the lock - this function made more sense +with the old rw_lock implementation. +@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + lint lock_word = lock->lock_word; + if (lock_word > 0) { + /* return NOT_LOCKED in s-lock state, like the writer + member of the old lock implementation. */ + return(RW_LOCK_NOT_LOCKED); + } else if ((lock_word == 0) || (lock_word <= -X_LOCK_DECR)) { + return(RW_LOCK_EX); + } else { + ut_ad(lock_word > -X_LOCK_DECR); + return(RW_LOCK_WAIT_EX); + } +} + +/******************************************************************//** +Returns the write-status of the priority lock - this function made more sense +with the old rw_lock implementation. +@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const prio_rw_lock_t* lock) /*!< in: rw-lock */ +{ + return(rw_lock_get_writer(&lock->base_lock)); +} + +/******************************************************************//** +Returns the number of readers. +@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + lint lock_word = lock->lock_word; + if (lock_word > 0) { + /* s-locked, no x-waiters */ + return(X_LOCK_DECR - lock_word); + } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) { + /* s-locked, with x-waiters */ + return((ulint)(-lock_word)); + } + return(0); +} + +/******************************************************************//** +Returns the number of readers. +@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const prio_rw_lock_t* lock) /*!< in: rw-lock */ +{ + return(rw_lock_get_reader_count(&lock->base_lock)); +} + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS +UNIV_INLINE +ib_mutex_t* +rw_lock_get_mutex( +/*==============*/ + rw_lock_t* lock) +{ + return(&(lock->mutex)); +} +#endif + +/******************************************************************//** +Returns the value of writer_count for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. 
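All of the readers above decode the single lock_word field. A standalone walk through that state space; the X_LOCK_DECR value is assumed here for illustration (the real constant is defined elsewhere in sync0rw.h):

#include <stdio.h>

#define X_LOCK_DECR 0x00100000	/* assumed value, illustration only */

static void
decode_lock_word(long w)
{
	if (w == X_LOCK_DECR) {
		printf("unlocked\n");
	} else if (w > 0) {
		printf("s-locked by %ld readers\n", (long) (X_LOCK_DECR - w));
	} else if (w == 0 || w <= -X_LOCK_DECR) {
		printf("x-locked (RW_LOCK_EX)\n");
	} else {
		printf("%ld readers draining, writer in RW_LOCK_WAIT_EX\n",
		       -w);
	}
}

int
main(void)
{
	decode_lock_word(X_LOCK_DECR);		/* free */
	decode_lock_word(X_LOCK_DECR - 3);	/* three s-locks */
	decode_lock_word(0);			/* a single x-lock */
	decode_lock_word(-2);			/* writer queued behind 2 readers */
	return(0);
}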
+@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + lint lock_copy = lock->lock_word; + if ((lock_copy != 0) && (lock_copy > -X_LOCK_DECR)) { + return(0); + } + return((lock_copy == 0) ? 1 : (2 - (lock_copy + X_LOCK_DECR))); +} + +/******************************************************************//** +Returns the value of writer_count for the priority lock. Does not reserve the +lock mutex, so the caller must be sure it is not changed during the call. +@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const prio_rw_lock_t* lock) /*!< in: rw-lock */ +{ + return(rw_lock_get_x_lock_count(&lock->base_lock)); +} + +/******************************************************************//** +Two different implementations for decrementing the lock_word of a rw_lock: +one for systems supporting atomic operations, one for others. This +does not support recursive x-locks: they should be handled by the caller and +need not be atomic since they are performed by the current lock holder. +Returns true if the decrement was made, false if not. +@return TRUE if decr occurs */ +UNIV_INLINE +ibool +rw_lock_lock_word_decr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount) /*!< in: amount to decrement */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + lint local_lock_word; + + os_rmb; + local_lock_word = lock->lock_word; + while (local_lock_word > 0) { + if (os_compare_and_swap_lint(&lock->lock_word, + local_lock_word, + local_lock_word - amount)) { + return(TRUE); + } + local_lock_word = lock->lock_word; + } + return(FALSE); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + ibool success = FALSE; + mutex_enter(&(lock->mutex)); + if (lock->lock_word > 0) { + lock->lock_word -= amount; + success = TRUE; + } + mutex_exit(&(lock->mutex)); + return(success); +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +Increments lock_word the specified amount and returns new value. +@return lock->lock_word after increment */ +UNIV_INLINE +lint +rw_lock_lock_word_incr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount) /*!< in: amount of increment */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + return(os_atomic_increment_lint(&lock->lock_word, amount)); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lint local_lock_word; + + mutex_enter(&(lock->mutex)); + + lock->lock_word += amount; + local_lock_word = lock->lock_word; + + mutex_exit(&(lock->mutex)); + + return(local_lock_word); +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +This function sets the lock->writer_thread and lock->recursive fields. +For platforms where we are using atomic builtins instead of lock->mutex +it sets the lock->writer_thread field using atomics to ensure memory +ordering. Note that it is assumed that the caller of this function +effectively owns the lock i.e.: nobody else is allowed to modify +lock->writer_thread at this point in time. +The protocol is that lock->writer_thread MUST be updated BEFORE the +lock->recursive flag is set. 
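A minimal sketch of that store-ordering contract, using a plain GCC barrier rather than the actual InnoDB primitives: the writer id must become visible before the recursive flag, so any thread that observes recursive != 0 can trust writer_thread.

static void
publish_writer(volatile unsigned long* writer_thread,
	       volatile int* recursive,
	       unsigned long self_id)
{
	*writer_thread = self_id;	/* step 1: store the writer id */
	__sync_synchronize();		/* full barrier: id visible first */
	*recursive = 1;			/* step 2: only now set the flag */
}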
*/ +UNIV_INLINE +void +rw_lock_set_writer_id_and_recursion_flag( +/*=====================================*/ + rw_lock_t* lock, /*!< in/out: lock to work on */ + ibool recursive) /*!< in: TRUE if recursion + allowed */ +{ + os_thread_id_t curr_thread = os_thread_get_curr_id(); + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + os_thread_id_t local_thread; + ibool success; + + /* Prevent Valgrind warnings about writer_thread being + uninitialized. It does not matter if writer_thread is + uninitialized, because we are comparing writer_thread against + itself, and the operation should always succeed. */ + UNIV_MEM_VALID(&lock->writer_thread, sizeof lock->writer_thread); + + local_thread = lock->writer_thread; + success = os_compare_and_swap_thread_id( + &lock->writer_thread, local_thread, curr_thread); + ut_a(success); + lock->recursive = recursive; + +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + + mutex_enter(&lock->mutex); + lock->writer_thread = curr_thread; + lock->recursive = recursive; + mutex_exit(&lock->mutex); + +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +Low-level function which tries to lock an rw-lock in s-mode. Performs no +spinning. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass __attribute__((unused)), + /*!< in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + if (!rw_lock_lock_word_decr(lock, 1)) { + /* Locking did not succeed */ + return(FALSE); + } + +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line); +#endif + /* These debugging values are not set safely: they may be incorrect + or even refer to a line that is invalid for the file name. */ + lock->last_s_file_name = file_name; + lock->last_s_line = line; + + return(TRUE); /* locking succeeded */ +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in shared mode for the current thread. If the rw-lock is locked +in exclusive mode, or there is an exclusive lock request waiting, the +function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for +the lock, before suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + /* NOTE: As we do not know the thread ids for threads which have + s-locked a latch, and s-lockers will be served only after waiting + x-lock requests have been fulfilled, then if this thread already + owns an s-lock here, it may end up in a deadlock with another thread + which requests an x-lock here. Therefore, we will forbid recursive + s-locking of a latch: the following assert will warn the programmer + of the possibility of this kind of a deadlock. If we want to implement + safe recursive s-locking, we should keep in a list the thread ids of + the threads which have s-locked a latch. This would use some CPU + time. 
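The forbidden recursion spelled out as a timeline (illustrative; L is any rw_lock_t):

/*
   thread A: rw_lock_s_lock(L)    lock_word = X_LOCK_DECR - 1
   thread B: rw_lock_x_lock(L)    queues for the x-lock, waits for the
                                  reader to drain (lock_word < 0)
   thread A: rw_lock_s_lock(L)    must wait behind B's x-request,
                                  but B is waiting for A: deadlock.

   Debug builds catch the second s-lock attempt with the assertion
   used below: ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)). */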
*/ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ + ut_ad(!rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (rw_lock_s_lock_low(lock, pass, file_name, line)) { + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ + + rw_lock_s_lock_spin(lock, pass, false, false, file_name, line); + + return; + } +} + +/******************************************************************//** +Return true if waiters of higher priority than the current thread +exist. +@return true if waiters of higher priority exist */ +UNIV_INLINE +bool +rw_lock_higher_prio_waiters_exist( +/*==============================*/ + bool priority_lock, /*!< in: whether the lock is a priority lock */ + bool high_priority, /*!< in: whether we are acquiring a priority + lock with high priority */ + void* lock) /*!< in: rw lock */ +{ + if (high_priority || !priority_lock) { + ut_ad(!(!priority_lock && high_priority)); + return(false); + } + + ut_ad(priority_lock && !high_priority); + + prio_rw_lock_t *prio_rw_lock = (prio_rw_lock_t *) lock; + return prio_rw_lock->high_priority_wait_ex_waiter > 0 + || prio_rw_lock->high_priority_s_waiters > 0 + || prio_rw_lock->high_priority_x_waiters > 0; +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function, except if +you supply the file name and line number. Lock a priority rw-lock in shared +mode for the current thread, using the relative thread priority. If the +rw-lock is locked in exclusive mode, or there is an exclusive lock request +waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), +waiting for the lock, before suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ + ut_ad(!rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + bool high_priority = srv_current_thread_priority > 0; + + /* Do not attempt to acquire a low-priority S latch if there are + high-priority waiters, even if such an attempt would be successful. + This is to prevent a high priority X request from being starved by a + sequence of overlapping regular priority S requests. */ + + if (!rw_lock_higher_prio_waiters_exist(true, high_priority, lock) + && rw_lock_s_lock_low(&lock->base_lock, pass, file_name, line)) { + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ + rw_lock_s_lock_spin(lock, pass, true, high_priority, file_name, + line); + + return; + } +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. 
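Sketched try-lock usage of the nowait variant via its macro form, assumed to be rw_lock_x_lock_nowait() per the instrumented-function list in sync0rw.h; do_deferred_work() is a hypothetical fallback:

if (rw_lock_x_lock_nowait(lock)) {
	/* got the x-latch without spinning or sleeping */
	/* ... modify the protected structure ... */
	rw_lock_x_unlock(lock);
} else {
	do_deferred_work();	/* contended: take another path */
}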
+@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_x_lock_func_nowait( +/*=======================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + ibool success; + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + success = os_compare_and_swap_lint(&lock->lock_word, X_LOCK_DECR, 0); +#else + + success = FALSE; + mutex_enter(&(lock->mutex)); + if (lock->lock_word == X_LOCK_DECR) { + lock->lock_word = 0; + success = TRUE; + } + mutex_exit(&(lock->mutex)); + +#endif + if (success) { + rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); + + } else if (lock->recursive + && os_thread_eq(lock->writer_thread, + os_thread_get_curr_id())) { + /* Relock: this lock_word modification is safe since no other + threads can modify (lock, unlock, or reserve) lock_word while + there is an exclusive writer and this is the writer thread. */ + if (lock->lock_word == 0) { + lock->lock_word = -X_LOCK_DECR; + } else { + lock->lock_word--; + } + + /* Watch for too many recursive locks */ + ut_ad(lock->lock_word < 0); + + } else { + /* Failure */ + return(FALSE); + } +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); +#endif + + lock->last_x_file_name = file_name; + lock->last_x_line = line; + + ut_ad(rw_lock_validate(lock)); + + return(TRUE); +} + +/******************************************************************//** +Releases a shared mode lock. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ut_ad(lock->lock_word > -X_LOCK_DECR); + ut_ad(lock->lock_word != 0); + ut_ad(lock->lock_word < X_LOCK_DECR); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); +#endif + + /* Increment lock_word to indicate 1 less reader */ + if (rw_lock_lock_word_incr(lock, 1) == 0) { + + /* wait_ex waiter exists. It may not be asleep, but we signal + anyway. We do not wake other waiters, because they can't + exist without wait_ex waiter and wait_ex waiter goes first.*/ + os_event_set(lock->wait_ex_event); + sync_array_object_signalled(); + + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_s_exit_count++; +#endif +} + +/******************************************************************//** +Releases a shared mode priority lock. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + prio_rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + lint lock_word; + + ut_ad(lock->base_lock.lock_word > -X_LOCK_DECR); + ut_ad(lock->base_lock.lock_word != 0); + ut_ad(lock->base_lock.lock_word < X_LOCK_DECR); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(&lock->base_lock, pass, RW_LOCK_SHARED); +#endif + + /* Increment lock_word to indicate 1 less reader */ + lock_word = rw_lock_lock_word_incr(&lock->base_lock, 1); + if (lock_word == 0) { + + /* A waiting next-writer exists, either high priority or + regular, sharing the same wait event. 
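A worked trace of the signalling rule above (X_LOCK_DECR value assumed, as before):

/*
   two readers hold the latch      lock_word = X_LOCK_DECR - 2
   a writer queues for the x-lock  lock_word -= X_LOCK_DECR  ->  -2
   reader 1 s-unlocks: incr -> -1  result != 0, nobody signalled
   reader 2 s-unlocks: incr ->  0  result == 0: only the queued
                                   next-writer can be waiting, so
                                   wait_ex_event alone is signalled */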
*/ + os_event_set(lock->base_lock.wait_ex_event); + sync_array_object_signalled(); + + } else if (lock_word == X_LOCK_DECR) { + + /* S-waiters may exist during an S unlock if a high-priority + thread released it, because low-priority threads are prevented + from acquiring S lock while high-priority thread holds it. */ + if (lock->base_lock.waiters) { + + rw_lock_reset_waiter_flag(&lock->base_lock); + os_event_set(lock->base_lock.event); + sync_array_object_signalled(); + } + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_s_exit_count++; +#endif +} + +/******************************************************************//** +Prepares an exclusive mode lock release: resets the recursion flag and removes +the debug information if needed and returns the required lock word increment +value. +@return lock word increment value to perform the unlock */ +UNIV_INLINE +ulint +rw_lock_x_prepare_unlock( +/*=====================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ut_ad(lock->lock_word == 0 || lock->lock_word <= -X_LOCK_DECR); + + /* lock->recursive flag also indicates if lock->writer_thread is + valid or stale. If we are the last of the recursive callers + then we must unset lock->recursive flag to indicate that the + lock->writer_thread is now stale. + Note that since we still hold the x-lock we can safely read the + lock_word. */ + if (lock->lock_word == 0) { + /* Last caller in a possible recursive chain. */ + lock->recursive = FALSE; + } + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); +#endif + + ulint x_lock_incr; + if (lock->lock_word == 0) { + x_lock_incr = X_LOCK_DECR; + } else if (lock->lock_word == -X_LOCK_DECR) { + x_lock_incr = X_LOCK_DECR; + } else { + ut_ad(lock->lock_word < -X_LOCK_DECR); + x_lock_incr = 1; + } + + return(x_lock_incr); +} + +/******************************************************************//** +Releases an exclusive mode lock. */ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ulint x_lock_incr = rw_lock_x_prepare_unlock( +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + lock); + + if (rw_lock_lock_word_incr(lock, x_lock_incr) == X_LOCK_DECR) { + /* Lock is now free. May have to signal read/write waiters. + We do not need to signal wait_ex waiters, since they cannot + exist when there is a writer. */ + + if (lock->waiters) { + rw_lock_reset_waiter_flag(lock); + os_event_set(lock->event); + sync_array_object_signalled(); + } + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_x_exit_count++; +#endif +} + +/******************************************************************//** +Releases an exclusive mode priority lock. */ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + prio_rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ulint x_lock_incr = rw_lock_x_prepare_unlock( +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + &lock->base_lock); + + ut_ad(lock->high_priority_wait_ex_waiter == 0); + + if (rw_lock_lock_word_incr(&lock->base_lock, x_lock_incr) + == X_LOCK_DECR) { + + /* Priority lock is now free. 
Signal any waiters in this + order: 1) high priority X waiters; 2) high priority S waiters; + 3) regular priority waiters. + We do not need to signal wait_ex waiters, since they cannot + exist when there is a writer. */ + + if (lock->high_priority_x_waiters) { + + os_event_set(lock->high_priority_x_event); + sync_array_object_signalled(); + } else if (lock->high_priority_s_waiters) { + + os_event_set(lock->high_priority_s_event); + sync_array_object_signalled(); + } else if (lock->base_lock.waiters) { + + rw_lock_reset_waiter_flag(&lock->base_lock); + os_event_set(lock->base_lock.event); + sync_array_object_signalled(); + } + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_x_exit_count++; +#endif +} + +#ifdef UNIV_PFS_RWLOCK + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func(). +NOTE! Please use the corresponding macro rw_lock_create(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: key registered with + performance schema */ + rw_lock_t* lock, /*!< in: pointer to memory */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +# endif /* UNIV_DEBUG */ + const char* cmutex_name) /*!< in: mutex name */ +{ + /* Initialize the rwlock for performance schema */ + lock->pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock); + + /* The actual function to initialize an rwlock */ + rw_lock_create_func(lock, +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + level, +# endif /* UNIV_SYNC_DEBUG */ + cfile_name, + cline, +# endif /* UNIV_DEBUG */ + cmutex_name); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func(). +NOTE! Please use the corresponding macro rw_lock_create(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: key registered with + performance schema */ + prio_rw_lock_t* lock, /*!< in: pointer to memory */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +# endif /* UNIV_DEBUG */ + const char* cmutex_name) /*!< in: mutex name */ +{ + /* Initialize the rwlock for performance schema */ + lock->base_lock.pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock); + + /* The actual function to initialize an rwlock */ + rw_lock_create_func(lock, +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + level, +# endif /* UNIV_SYNC_DEBUG */ + cfile_name, + cline, +# endif /* UNIV_DEBUG */ + cmutex_name); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + if (lock->pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Record the entry of rw x lock request in performance schema */ + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, + file_name, static_cast<uint>(line)); + + rw_lock_x_lock_func( + lock, pass, file_name, static_cast<uint>(line)); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0); + } + } + else + { + rw_lock_x_lock_func(lock, pass, file_name, line); + } +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + if (lock->base_lock.pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Record the entry of rw x lock request in performance schema */ + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->base_lock.pfs_psi, PSI_RWLOCK_WRITELOCK, + file_name, line); + + rw_lock_x_lock_func(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0); + } + } + else + { + rw_lock_x_lock_func(lock, pass, file_name, line); + } +} + +/******************************************************************//** +Performance schema instrumented wrap function for +rw_lock_x_lock_func_nowait() +NOTE! Please use the corresponding macro rw_lock_x_lock_nowait(), +not directly this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_x_lock_func_nowait( +/*===========================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock + requested */ + ulint line) /*!< in: line where requested */ +{ + ibool ret; + + if (lock->pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Record the entry of rw x lock request in performance schema */ + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, + file_name, static_cast<uint>(line)); + + ret = rw_lock_x_lock_func_nowait(lock, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)( + locker, static_cast<int>(ret)); + } + } + else + { + ret = rw_lock_x_lock_func_nowait(lock, file_name, line); + } + + return(ret); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! 
*/ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + rw_lock_t* lock) /*!< in: pointer to rw-lock */ +{ + if (lock->pfs_psi != NULL) + { + PSI_RWLOCK_CALL(destroy_rwlock)(lock->pfs_psi); + lock->pfs_psi = NULL; + } + + rw_lock_free_func(lock); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + prio_rw_lock_t* lock) /*!< in: pointer to rw-lock */ +{ + if (lock->base_lock.pfs_psi != NULL) + { + PSI_RWLOCK_CALL(destroy_rwlock)(lock->base_lock.pfs_psi); + lock->base_lock.pfs_psi = NULL; + } + + rw_lock_free_func(lock); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name,/*!< in: file name where lock + requested */ + ulint line) /*!< in: line where requested */ +{ + if (lock->pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are acquiring a shared rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( + &state, lock->pfs_psi, PSI_RWLOCK_READLOCK, + file_name, static_cast<uint>(line)); + + rw_lock_s_lock_func(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0); + } + } + else + { + rw_lock_s_lock_func(lock, pass, file_name, line); + } + + return; +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name,/*!< in: file name where lock + requested */ + ulint line) /*!< in: line where requested */ +{ + if (lock->base_lock.pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are acquiring a shared rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( + &state, lock->base_lock.pfs_psi, PSI_RWLOCK_READLOCK, + file_name, line); + + rw_lock_s_lock_func(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0); + } + } + else + { + rw_lock_s_lock_func(lock, pass, file_name, line); + } + + return; +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not +directly this function! 
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ibool ret;
+
+ if (lock->pfs_psi != NULL)
+ {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+ /* Instrumented to inform we are acquiring a shared rwlock */
+ locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_READLOCK,
+ file_name, static_cast<uint>(line));
+
+ ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(
+ locker, static_cast<int>(ret));
+ }
+ }
+ else
+ {
+ ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+ }
+
+ return(ret);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_low()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ return(pfs_rw_lock_s_lock_low(&lock->base_lock, pass,
+ file_name, line));
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Inform performance schema we are unlocking the lock */
+ if (lock->pfs_psi != NULL)
+ PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
+
+ rw_lock_x_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ lock);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ prio_rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Inform performance schema we are unlocking the lock */
+ if (lock->base_lock.pfs_psi != NULL)
+ PSI_RWLOCK_CALL(unlock_rwlock)(lock->base_lock.pfs_psi);
+
+ rw_lock_x_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ lock);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro pfs_rw_lock_s_unlock(), not
+directly this function!
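+
+The pass argument exists because a latch taken for asynchronous I/O may be
+released by a different thread than the one that acquired it. A hedged
+sketch using the _gen macros, which take the pass value explicitly (the
+buffer-pool block and the BUF_IO_READ pass value are illustrative):
+@code
+        /* submitting thread */
+        rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);  /* pass != 0 */
+        /* ... hand the block over to the asynchronous read path ... */
+
+        /* I/O completion thread */
+        rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
+@endcode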
*/ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + /* Inform performance schema we are unlocking the lock */ + if (lock->pfs_psi != NULL) + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); + + rw_lock_s_unlock_func( +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + lock); + +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro pfs_rw_lock_s_unlock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + prio_rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + /* Inform performance schema we are unlocking the lock */ + if (lock->base_lock.pfs_psi != NULL) + PSI_RWLOCK_CALL(unlock_rwlock)(lock->base_lock.pfs_psi); + + rw_lock_s_unlock_func( +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + lock); + +} + +#endif /* UNIV_PFS_RWLOCK */ diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h new file mode 100644 index 00000000000..e3fe3028ed1 --- /dev/null +++ b/storage/xtradb/include/sync0sync.h @@ -0,0 +1,1055 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0sync.h +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0sync_h +#define sync0sync_h + +#include "univ.i" +#include "sync0types.h" +#include "ut0lst.h" +#include "ut0mem.h" +#include "os0thread.h" +#include "os0sync.h" +#include "sync0arr.h" +#include "ut0counter.h" + +#if defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP) +extern "C" my_bool timed_mutexes; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +#ifdef HAVE_WINDOWS_ATOMICS +typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates + on LONG variable */ +#elif defined(HAVE_ATOMIC_BUILTINS) && !defined(HAVE_ATOMIC_BUILTINS_BYTE) +typedef ulint lock_word_t; +#else +typedef byte lock_word_t; +#endif + +#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK + +/* By default, buffer mutexes and rwlocks will be excluded from +instrumentation due to their large number of instances. */ +# define PFS_SKIP_BUFFER_MUTEX_RWLOCK + +/* By default, event->mutex will also be excluded from instrumentation */ +# define PFS_SKIP_EVENT_MUTEX + +#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +/* Key defines to register InnoDB mutexes with performance schema */ +extern mysql_pfs_key_t autoinc_mutex_key; +extern mysql_pfs_key_t buffer_block_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_mutex_key; +extern mysql_pfs_key_t buf_pool_LRU_list_mutex_key; +extern mysql_pfs_key_t buf_pool_free_list_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_free_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_hash_mutex_key; +extern mysql_pfs_key_t buf_pool_flush_state_mutex_key; +extern mysql_pfs_key_t cache_last_read_mutex_key; +extern mysql_pfs_key_t dict_foreign_err_mutex_key; +extern mysql_pfs_key_t dict_sys_mutex_key; +extern mysql_pfs_key_t file_format_max_mutex_key; +extern mysql_pfs_key_t fil_system_mutex_key; +extern mysql_pfs_key_t flush_list_mutex_key; +extern mysql_pfs_key_t fts_bg_threads_mutex_key; +extern mysql_pfs_key_t fts_delete_mutex_key; +extern mysql_pfs_key_t fts_optimize_mutex_key; +extern mysql_pfs_key_t fts_doc_id_mutex_key; +extern mysql_pfs_key_t fts_pll_tokenize_mutex_key; +extern mysql_pfs_key_t hash_table_mutex_key; +extern mysql_pfs_key_t ibuf_bitmap_mutex_key; +extern mysql_pfs_key_t ibuf_mutex_key; +extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +extern mysql_pfs_key_t log_bmp_sys_mutex_key; +extern mysql_pfs_key_t log_sys_mutex_key; +extern mysql_pfs_key_t log_flush_order_mutex_key; +# ifndef HAVE_ATOMIC_BUILTINS +extern mysql_pfs_key_t server_mutex_key; +# endif /* !HAVE_ATOMIC_BUILTINS */ +# ifdef UNIV_MEM_DEBUG +extern mysql_pfs_key_t mem_hash_mutex_key; +# endif /* UNIV_MEM_DEBUG */ +extern mysql_pfs_key_t mem_pool_mutex_key; +extern mysql_pfs_key_t mutex_list_mutex_key; +extern mysql_pfs_key_t purge_sys_bh_mutex_key; +extern mysql_pfs_key_t recv_sys_mutex_key; +extern mysql_pfs_key_t recv_writer_mutex_key; +extern mysql_pfs_key_t rseg_mutex_key; +# ifdef UNIV_SYNC_DEBUG +extern mysql_pfs_key_t rw_lock_debug_mutex_key; +# endif /* UNIV_SYNC_DEBUG */ +extern mysql_pfs_key_t rw_lock_list_mutex_key; +extern mysql_pfs_key_t 
rw_lock_mutex_key;
+extern mysql_pfs_key_t srv_dict_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_threads_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+# ifdef UNIV_SYNC_DEBUG
+extern mysql_pfs_key_t sync_thread_mutex_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern mysql_pfs_key_t buf_dblwr_mutex_key;
+extern mysql_pfs_key_t trx_undo_mutex_key;
+extern mysql_pfs_key_t trx_mutex_key;
+extern mysql_pfs_key_t lock_sys_mutex_key;
+extern mysql_pfs_key_t lock_sys_wait_mutex_key;
+extern mysql_pfs_key_t trx_sys_mutex_key;
+extern mysql_pfs_key_t srv_sys_mutex_key;
+extern mysql_pfs_key_t srv_sys_tasks_mutex_key;
+#ifndef HAVE_ATOMIC_BUILTINS
+extern mysql_pfs_key_t srv_conc_mutex_key;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+#ifndef HAVE_ATOMIC_BUILTINS_64
+extern mysql_pfs_key_t monitor_mutex_key;
+#endif /* !HAVE_ATOMIC_BUILTINS_64 */
+extern mysql_pfs_key_t event_os_mutex_key;
+extern mysql_pfs_key_t ut_list_mutex_key;
+extern mysql_pfs_key_t os_mutex_key;
+extern mysql_pfs_key_t zip_pad_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/******************************************************************//**
+Initializes the synchronization data structures. */
+UNIV_INTERN
+void
+sync_init(void);
+/*===========*/
+/******************************************************************//**
+Frees the resources in synchronization data structures. */
+UNIV_INTERN
+void
+sync_close(void);
+/*===========*/
+
+#undef mutex_free /* Fix for MacOS X */
+
+#ifdef UNIV_PFS_MUTEX
+/**********************************************************************
+The following mutex APIs are performance schema instrumented
+if "UNIV_PFS_MUTEX" is defined:
+
+mutex_create
+mutex_enter
+mutex_enter_first
+mutex_enter_last
+mutex_exit
+mutex_enter_nowait
+mutex_free
+
+These mutex APIs will point to corresponding wrapper functions that contain
+the performance schema instrumentation if "UNIV_PFS_MUTEX" is defined.
+The instrumented wrapper functions have the prefix "pfs_".
+
+NOTE! The following macros should be used in mutex operations, not the
+corresponding functions. */
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object to a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed.
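+
+An illustrative sketch of the full life cycle (the key name is a
+placeholder; SYNC_NO_ORDER_CHECK suppresses latching-order checking):
+@code
+        ib_mutex_t      my_mutex;       /* appropriately aligned storage */
+
+        mutex_create(my_mutex_key, &my_mutex, SYNC_NO_ORDER_CHECK);
+
+        mutex_enter(&my_mutex);
+        /* ... critical section ... */
+        mutex_exit(&my_mutex);
+
+        mutex_free(&my_mutex);  /* required only before freeing the memory */
+@endcode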
*/ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define mutex_create(K, M, level) \ + pfs_mutex_create_func((K), (M), (level), __FILE__, __LINE__, #M) +# else +# define mutex_create(K, M, level) \ + pfs_mutex_create_func((K), (M), __FILE__, __LINE__, #M) +# endif/* UNIV_SYNC_DEBUG */ +# else +# define mutex_create(K, M, level) \ + pfs_mutex_create_func((K), (M), #M) +# endif /* UNIV_DEBUG */ + +# define mutex_enter(M) \ + pfs_mutex_enter_func((M), __FILE__, __LINE__) + +# define mutex_enter_nowait(M) \ + pfs_mutex_enter_nowait_func((M), __FILE__, __LINE__) + +# define mutex_enter_first(M) \ + pfs_mutex_enter_func((M), __FILE__, __LINE__, HIGH_PRIO) + +# define mutex_enter_last(M) \ + pfs_mutex_enter_func((M), __FILE__, __LINE__, LOW_PRIO) + +# define mutex_exit(M) pfs_mutex_exit_func(M) + +# define mutex_free(M) pfs_mutex_free_func(M) + +#else /* UNIV_PFS_MUTEX */ + +/* If "UNIV_PFS_MUTEX" is not defined, the mutex APIs point to +original non-instrumented functions */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define mutex_create(K, M, level) \ + mutex_create_func((M), (level), __FILE__, __LINE__, #M) +# else /* UNIV_SYNC_DEBUG */ +# define mutex_create(K, M, level) \ + mutex_create_func((M), __FILE__, __LINE__, #M) +# endif /* UNIV_SYNC_DEBUG */ +# else /* UNIV_DEBUG */ +# define mutex_create(K, M, level) \ + mutex_create_func((M), #M) +# endif /* UNIV_DEBUG */ + +# define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__) + +# define mutex_enter_nowait(M) \ + mutex_enter_nowait_func((M), __FILE__, __LINE__) + +# define mutex_enter_first(M) \ + mutex_enter_func((M), __FILE__, __LINE__, HIGH_PRIO) + +# define mutex_enter_last(M) \ + mutex_enter_func((M), __FILE__, __LINE__, LOW_PRIO) + +# define mutex_exit(M) mutex_exit_func(M) + +# define mutex_free(M) mutex_free_func(M) + +#endif /* UNIV_PFS_MUTEX */ + +/******************************************************************//** +Creates, or rather, initializes a mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + ib_mutex_t* mutex, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name); /*!< in: mutex name */ + +/******************************************************************//** +Creates, or rather, initializes a priority mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + ib_prio_mutex_t* mutex, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where + created */ + ulint cline, /*!< in: file line where + created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name); /*!< in: mutex name */ +/******************************************************************//** +NOTE! Use the corresponding macro mutex_free(), not directly this function! 
+Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a mutex object from the mutex list. The mutex +is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free_func( +/*============*/ + ib_mutex_t* mutex); /*!< in: mutex */ +/******************************************************************//** +NOTE! Use the corresponding macro mutex_free(), not directly this function! +Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a priority mutex object from the mutex list. The +mutex is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free_func( +/*============*/ + ib_prio_mutex_t* mutex); /*!< in: mutex */ +/**************************************************************//** +NOTE! The following macro should be used in mutex locking, not the +corresponding function. */ + +/* NOTE! currently same as mutex_enter! */ + +#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__) +/******************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Locks a mutex for the current thread. If the mutex is reserved +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting +for the mutex before suspending the thread. */ +UNIV_INLINE +void +mutex_enter_func( +/*=============*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where locked */ + ulint line); /*!< in: line where locked */ +/******************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Locks a priority mutex for the current thread. If the mutex is +reserved the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) +waiting for the mutex before suspending the thread. If the thread is suspended, +the priority argument value determines the relative order for its wake up. Any +HIGH_PRIO waiters will be woken up before any LOW_PRIO waiters. In case of +DEFAULT_PRIO, the relative priority will be set according to +srv_current_thread_priority. */ +UNIV_INLINE +void +mutex_enter_func( +/*=============*/ + ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where + locked */ + ulint line, /*!< in: line where locked */ + enum ib_sync_priority priority = DEFAULT_PRIO); + /*!<in: mutex acquisition + priority */ +/********************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. +@return 0 if succeed, 1 if not */ +UNIV_INTERN +ulint +mutex_enter_nowait_func( +/*====================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line); /*!< in: line where requested */ +/********************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. 
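+
+An illustrative try-lock pattern (sketch only; `my_mutex` is a placeholder):
+@code
+        if (mutex_enter_nowait(&my_mutex) == 0) {
+                /* acquired without waiting */
+                /* ... critical section ... */
+                mutex_exit(&my_mutex);
+        } else {
+                /* contended: do other work, or fall back to mutex_enter() */
+        }
+@endcode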
+@return 0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+ ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where
+ requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+ ib_mutex_t* mutex); /*!< in: pointer to mutex */
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
+Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+ ib_prio_mutex_t* mutex); /*!< in: pointer to mutex */
+
+
+#ifdef UNIV_PFS_MUTEX
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+ PSI_mutex_key key, /*!< in: Performance Schema key */
+ ib_mutex_t* mutex, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline, /*!< in: file line where created */
+# endif /* UNIV_DEBUG */
+ const char* cmutex_name);
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the performance mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+ PSI_mutex_key key, /*!< in: Performance Schema
+ key */
+ ib_prio_mutex_t* mutex, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cfile_name, /*!< in: file name where
+ created */
+ ulint cline, /*!< in: file line where
+ created */
+# endif /* UNIV_DEBUG */
+ const char* cmutex_name);
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func(). */
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line); /*!< in: line where locked */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func().
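+
+The three entry macros differ only in the priority they request; an
+illustrative sketch (alternative forms, one per call site; `prio_mutex`
+is a placeholder):
+@code
+        mutex_enter(&prio_mutex);       /* DEFAULT_PRIO: decided by
+                                        srv_current_thread_priority */
+        mutex_enter_first(&prio_mutex); /* HIGH_PRIO: woken before any
+                                        LOW_PRIO waiter */
+        mutex_enter_last(&prio_mutex);  /* LOW_PRIO */
+@endcode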
*/
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+ ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where
+ locked */
+ ulint line, /*!< in: line where locked */
+ enum ib_sync_priority priority = DEFAULT_PRIO);
+ /*!<in: mutex acquisition
+ priority */
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func().
+@return 0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func().
+@return 0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where
+ requested */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrap function of mutex_exit_func() with performance schema instrumentation.
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ ib_mutex_t* mutex); /*!< in: pointer to mutex */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrap function of mutex_exit_func() with performance schema instrumentation.
+Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ ib_prio_mutex_t* mutex); /*!< in: pointer to mutex */
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+ ib_mutex_t* mutex); /*!< in: mutex */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the priority mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+ ib_prio_mutex_t* mutex); /*!< in: mutex */
+
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Returns TRUE if no mutex or rw-lock is currently locked.
+Works only in the debug version.
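+
+Illustrative use, e.g. to assert a clean state near shutdown in debug
+builds (sketch only):
+@code
+        ut_a(sync_all_freed());
+@endcode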
+@return TRUE if no mutexes and rw-locks reserved */ +UNIV_INTERN +ibool +sync_all_freed(void); +/*================*/ +#endif /* UNIV_SYNC_DEBUG */ +/*##################################################################### +FUNCTION PROTOTYPES FOR DEBUGGING */ +/*******************************************************************//** +Prints wait info of the sync system. */ +UNIV_INTERN +void +sync_print_wait_info( +/*=================*/ + FILE* file); /*!< in: file where to print */ +/*******************************************************************//** +Prints info of the sync system. */ +UNIV_INTERN +void +sync_print( +/*=======*/ + FILE* file); /*!< in: file where to print */ +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks that the mutex has been initialized. +@return TRUE */ +UNIV_INTERN +ibool +mutex_validate( +/*===========*/ + const ib_mutex_t* mutex); /*!< in: mutex */ +/******************************************************************//** +Checks that the current thread owns the mutex. Works only +in the debug version. +@return TRUE if owns */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + const ib_mutex_t* mutex) /*!< in: mutex */ + __attribute__((warn_unused_result)); +/******************************************************************//** +Checks that the current thread owns the priority mutex. Works only +in the debug version. +@return TRUE if owns */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + const ib_prio_mutex_t* mutex) /*!< in: priority mutex */ + __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Adds a latch and its level in the thread level array. Allocates the memory +for the array if called first time for this OS thread. Makes the checks +against other latch levels stored in the array for this thread. */ +UNIV_INTERN +void +sync_thread_add_level( +/*==================*/ + void* latch, /*!< in: pointer to a mutex or an rw-lock */ + ulint level, /*!< in: level in the latching order; if + SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /*!< in: TRUE if re-entering an x-lock */ + __attribute__((nonnull)); +/******************************************************************//** +Removes a latch from the thread level array if it is found there. +@return TRUE if found in the array; it is no error if the latch is +not found, as we presently are not able to determine the level for +every latch reservation the program does */ +UNIV_INTERN +ibool +sync_thread_reset_level( +/*====================*/ + void* latch); /*!< in: pointer to a mutex or an rw-lock */ +/******************************************************************//** +Checks if the level array for the current thread contains a +mutex or rw-latch at the specified level. +@return a matching latch, or NULL if not found */ +UNIV_INTERN +void* +sync_thread_levels_contains( +/*========================*/ + ulint level); /*!< in: latching order level + (SYNC_DICT, ...)*/ +/******************************************************************//** +Checks that the level array for the current thread is empty. 
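+
+Illustrative use through the convenience macro defined below (sketch only):
+@code
+        /* the thread must hold no latches, except possibly the
+        dictionary mutex */
+        ut_ad(sync_thread_levels_empty_except_dict());
+@endcode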
+@return a latch, or NULL if empty except the exceptions specified below */ +UNIV_INTERN +void* +sync_thread_levels_nonempty_gen( +/*============================*/ + ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is + allowed to be owned by the thread */ + __attribute__((warn_unused_result)); +/******************************************************************//** +Checks if the level array for the current thread is empty, +except for data dictionary latches. */ +#define sync_thread_levels_empty_except_dict() \ + (!sync_thread_levels_nonempty_gen(TRUE)) +/******************************************************************//** +Checks if the level array for the current thread is empty, +except for the btr_search_latch. +@return a latch, or NULL if empty except the exceptions specified below */ +UNIV_INTERN +void* +sync_thread_levels_nonempty_trx( +/*============================*/ + ibool has_search_latch) + /*!< in: TRUE if and only if the thread + is supposed to hold btr_search_latch */ + __attribute__((warn_unused_result)); + +/******************************************************************//** +Gets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_get_debug_info( +/*=================*/ + ib_mutex_t* mutex, /*!< in: mutex */ + const char** file_name, /*!< out: file where requested */ + ulint* line, /*!< out: line where requested */ + os_thread_id_t* thread_id); /*!< out: id of the thread which owns + the mutex */ +/******************************************************************//** +Counts currently reserved mutexes. Works only in the debug version. +@return number of reserved mutexes */ +UNIV_INTERN +ulint +mutex_n_reserved(void); +/*==================*/ +#endif /* UNIV_SYNC_DEBUG */ +/******************************************************************//** +NOT to be used outside this module except in debugging! Gets the value +of the lock word. */ +UNIV_INLINE +lock_word_t +mutex_get_lock_word( +/*================*/ + const ib_mutex_t* mutex); /*!< in: mutex */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +NOT to be used outside this module except in debugging! Gets the waiters +field in a mutex. +@return value to set */ +UNIV_INLINE +ulint +mutex_get_waiters( +/*==============*/ + const ib_mutex_t* mutex); /*!< in: mutex */ +#endif /* UNIV_SYNC_DEBUG */ + +/* + LATCHING ORDER WITHIN THE DATABASE + ================================== + +The mutex or latch in the central memory object, for instance, a rollback +segment object, must be acquired before acquiring the latch or latches to +the corresponding file data structure. In the latching order below, these +file page object latches are placed immediately below the corresponding +central memory object latch or mutex. + +Synchronization object Notes +---------------------- ----- + +Dictionary mutex If we have a pointer to a dictionary +| object, e.g., a table, it can be +| accessed without reserving the +| dictionary mutex. We must have a +| reservation, a memoryfix, to the +| appropriate table object in this case, +| and the table must be explicitly +| released later. +V +Dictionary header +| +V +Secondary index tree latch The tree latch protects also all +| the B-tree non-leaf pages. These +V can be read with the page only +Secondary index non-leaf bufferfixed to save CPU time, +| no s-latch is needed on the page. +| Modification of a page requires an +| x-latch on the page, however. 
If a +| thread owns an x-latch to the tree, +| it is allowed to latch non-leaf pages +| even after it has acquired the fsp +| latch. +V +Secondary index leaf The latch on the secondary index leaf +| can be kept while accessing the +| clustered index, to save CPU time. +V +Clustered index tree latch To increase concurrency, the tree +| latch is usually released when the +| leaf page latch has been acquired. +V +Clustered index non-leaf +| +V +Clustered index leaf +| +V +Transaction system header +| +V +Transaction undo mutex The undo log entry must be written +| before any index page is modified. +| Transaction undo mutex is for the undo +| logs the analogue of the tree latch +| for a B-tree. If a thread has the +| trx undo mutex reserved, it is allowed +| to latch the undo log pages in any +| order, and also after it has acquired +| the fsp latch. +V +Rollback segment mutex The rollback segment mutex must be +| reserved, if, e.g., a new page must +| be added to an undo log. The rollback +| segment and the undo logs in its +| history list can be seen as an +| analogue of a B-tree, and the latches +| reserved similarly, using a version of +| lock-coupling. If an undo log must be +| extended by a page when inserting an +| undo log record, this corresponds to +| a pessimistic insert in a B-tree. +V +Rollback segment header +| +V +Purge system latch +| +V +Undo log pages If a thread owns the trx undo mutex, +| or for a log in the history list, the +| rseg mutex, it is allowed to latch +| undo log pages in any order, and even +| after it has acquired the fsp latch. +| If a thread does not have the +| appropriate mutex, it is allowed to +| latch only a single undo log page in +| a mini-transaction. +V +File space management latch If a mini-transaction must allocate +| several file pages, it can do that, +| because it keeps the x-latch to the +| file space management in its memo. +V +File system pages +| +V +lock_sys_wait_mutex Mutex protecting lock timeout data +| +V +lock_sys_mutex Mutex protecting lock_sys_t +| +V +trx_sys->mutex Mutex protecting trx_sys_t +| +V +Threads mutex Background thread scheduling mutex +| +V +query_thr_mutex Mutex protecting query threads +| +V +trx_mutex Mutex protecting trx_t fields +| +V +Search system mutex +| +V +Buffer pool mutexes +| +V +Log mutex +| +Any other latch +| +V +Memory pool mutex */ + +/* Latching order levels. If you modify these, you have to also update +sync_thread_add_level(). */ + +/* User transaction locks are higher than any of the latch levels below: +no latches are allowed when a thread goes to wait for a normal table +or row lock! */ +#define SYNC_USER_TRX_LOCK 9999 +#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress + latching order checking */ +#define SYNC_LEVEL_VARYING 2000 /* Level is varying. Only used with + buffer pool page locks, which do not + have a fixed level, but instead have + their level set after the page is + locked; see e.g. + ibuf_bitmap_get_map_page(). */ +#define SYNC_TRX_I_S_RWLOCK 1910 /* Used for + trx_i_s_cache_t::rw_lock */ +#define SYNC_TRX_I_S_LAST_READ 1900 /* Used for + trx_i_s_cache_t::last_read_mutex */ +#define SYNC_FILE_FORMAT_TAG 1200 /* Used to serialize access to the + file format tag */ +#define SYNC_DICT_OPERATION 1010 /* table create, drop, etc. 
reserve
+ this in X-mode; implicit or background
+ operations purge, rollback, foreign
+ key checks reserve this in S-mode */
+#define SYNC_FTS_CACHE 1005 /* FTS cache rwlock */
+#define SYNC_DICT 1000
+#define SYNC_DICT_AUTOINC_MUTEX 999
+#define SYNC_STATS_AUTO_RECALC 997
+#define SYNC_DICT_HEADER 995
+#define SYNC_IBUF_HEADER 914
+#define SYNC_IBUF_PESS_INSERT_MUTEX 912
+/*-------------------------------*/
+#define SYNC_INDEX_TREE 900
+#define SYNC_TREE_NODE_NEW 892
+#define SYNC_TREE_NODE_FROM_HASH 891
+#define SYNC_TREE_NODE 890
+#define SYNC_PURGE_LATCH 800
+#define SYNC_TRX_UNDO 700
+#define SYNC_RSEG 600
+#define SYNC_RSEG_HEADER_NEW 591
+#define SYNC_RSEG_HEADER 590
+#define SYNC_TRX_UNDO_PAGE 570
+#define SYNC_EXTERN_STORAGE 500
+#define SYNC_FSP 400
+#define SYNC_FSP_PAGE 395
+/*------------------------------------- Change buffer headers */
+#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */
+/*------------------------------------- Change buffer tree */
+#define SYNC_IBUF_INDEX_TREE 360
+#define SYNC_IBUF_TREE_NODE_NEW 359
+#define SYNC_IBUF_TREE_NODE 358
+#define SYNC_IBUF_BITMAP_MUTEX 351
+#define SYNC_IBUF_BITMAP 350
+/*------------------------------------- Change log for online create index */
+#define SYNC_INDEX_ONLINE_LOG 340
+/*------------------------------------- MySQL query cache mutex */
+/*------------------------------------- MySQL binlog mutex */
+/*-------------------------------*/
+#define SYNC_LOCK_WAIT_SYS 300
+#define SYNC_LOCK_SYS 299
+#define SYNC_TRX_SYS 298
+#define SYNC_TRX 297
+#define SYNC_THREADS 295
+#define SYNC_REC_LOCK 294
+#define SYNC_TRX_SYS_HEADER 290
+#define SYNC_PURGE_QUEUE 200
+#define SYNC_LOG_ONLINE 175
+#define SYNC_LOG 170
+#define SYNC_LOG_FLUSH_ORDER 147
+#define SYNC_RECV 168
+#define SYNC_FTS_TOKENIZE 167
+#define SYNC_FTS_CACHE_INIT 166 /* Used for FTS cache initialization */
+#define SYNC_FTS_BG_THREADS 165
+#define SYNC_FTS_OPTIMIZE 164 // FIXME: is this correct number, test
+#define SYNC_WORK_QUEUE 162
+#define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory
+ heap that can be extended to the
+ buffer pool, its logical level is
+ SYNC_SEARCH_SYS, as memory allocation
+ can call routines there! Otherwise
+ the level is SYNC_MEM_HASH. */
+#define SYNC_BUF_LRU_LIST 151
+#define SYNC_BUF_PAGE_HASH 149 /* buf_pool->page_hash rw_lock */
+#define SYNC_BUF_BLOCK 146 /* Block mutex */
+#define SYNC_BUF_FREE_LIST 145
+#define SYNC_BUF_ZIP_FREE 144
+#define SYNC_BUF_ZIP_HASH 143
+#define SYNC_BUF_FLUSH_STATE 142
+#define SYNC_BUF_FLUSH_LIST 141 /* Buffer flush list mutex */
+#define SYNC_DOUBLEWRITE 139
+#define SYNC_ANY_LATCH 135
+#define SYNC_MEM_HASH 131
+#define SYNC_MEM_POOL 130
+
+/* Codes used to designate lock operations */
+#define RW_LOCK_NOT_LOCKED 350
+#define RW_LOCK_EX 351
+#define RW_LOCK_EXCLUSIVE 351
+#define RW_LOCK_SHARED 352
+#define RW_LOCK_WAIT_EX 353
+#define SYNC_MUTEX 354
+#define SYNC_PRIO_MUTEX 355
+#define PRIO_RW_LOCK_EX 356
+#define PRIO_RW_LOCK_SHARED 357
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! The structure is used in the spin lock
+implementation of a mutual exclusion semaphore. */
+
+/** InnoDB mutex */
+struct ib_mutex_t {
+ os_event_t event; /*!< Used by sync0arr.cc for the wait queue */
+ volatile lock_word_t lock_word; /*!< lock_word is the target
+ of the atomic test-and-set instruction when
+ atomic operations are enabled.
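+
+For reference, a hedged sketch of the fast acquire path on top of the GCC
+builtin used when HAVE_ATOMIC_BUILTINS is set (this mirrors
+ib_mutex_test_and_set() in sync0sync.ic, not a separate mechanism):
+@code
+        /* the builtin returns the previous value; 0 means we got the lock */
+        if (__sync_lock_test_and_set(&mutex->lock_word, 1) == 0) {
+                /* acquired on the fast path */
+        } else {
+                /* held by another thread: spin, then wait */
+        }
+@endcode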
*/ + +#if !defined(HAVE_ATOMIC_BUILTINS) + os_fast_mutex_t + os_fast_mutex; /*!< We use this OS mutex in place of lock_word + when atomic operations are not enabled */ +#endif + ulint waiters; /*!< This ulint is set to 1 if there are (or + may be) threads waiting in the global wait + array for this mutex to be released. + Otherwise, this is 0. */ + UT_LIST_NODE_T(ib_mutex_t) list; /*!< All allocated mutexes are put into + a list. Pointers to the next and prev. */ +#ifdef UNIV_SYNC_DEBUG + const char* file_name; /*!< File where the mutex was locked */ + ulint line; /*!< Line where the mutex was locked */ + ulint level; /*!< Level in the global latching order */ +#endif /* UNIV_SYNC_DEBUG */ +#ifdef UNIV_DEBUG + const char* cfile_name;/*!< File name where mutex created */ + ulint cline; /*!< Line where created */ +#endif + ulong count_os_wait; /*!< count of os_wait */ +#ifdef UNIV_DEBUG + +/** Value of mutex_t::magic_n */ +# define MUTEX_MAGIC_N 979585UL + + os_thread_id_t thread_id; /*!< The thread id of the thread + which locked the mutex. */ + ulint magic_n; /*!< MUTEX_MAGIC_N */ + ulint ib_mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name; /*!< mutex name */ +#ifdef UNIV_PFS_MUTEX + struct PSI_mutex* pfs_psi; /*!< The performance schema + instrumentation hook */ +#endif +}; + +/** XtraDB priority mutex */ +struct ib_prio_mutex_t { + ib_mutex_t base_mutex; /* The regular mutex provides the lock + word etc. for the priority mutex */ + os_event_t high_priority_event; /* High priority wait array + event */ + volatile ulint high_priority_waiters; /* Number of threads that asked + for this mutex to be acquired with high + priority in the global wait array + waiting for this mutex to be + released. */ + UT_LIST_NODE_T(ib_prio_mutex_t) list; +}; + +/** Constant determining how long spin wait is continued before suspending +the thread. A value 600 rounds on a 1995 100 MHz Pentium seems to correspond +to 20 microseconds. */ + +#define SYNC_SPIN_ROUNDS srv_n_spin_wait_rounds + +/** The number of iterations in the mutex_spin_wait() spin loop. +Intended for performance monitoring. */ +extern ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_round_count; +/** The number of mutex_spin_wait() calls. Intended for +performance monitoring. */ +extern ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_wait_count; +/** The number of OS waits in mutex_spin_wait(). Intended for +performance monitoring. */ +extern ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_os_wait_count; + +/** The number of mutex_exit calls. Intended for performance monitoring. */ +extern ib_int64_t mutex_exit_count; + +#ifdef UNIV_SYNC_DEBUG +/** Latching order checks start when this is set TRUE */ +extern ibool sync_order_checks_on; +#endif /* UNIV_SYNC_DEBUG */ + +/** This variable is set to TRUE when sync_init is called */ +extern ibool sync_initialized; + +/** Global list of database mutexes (not OS mutexes) created. */ +typedef UT_LIST_BASE_NODE_T(ib_mutex_t) ut_list_base_node_t; +/** Global list of database mutexes (not OS mutexes) created. 
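+
+Illustrative traversal in the style of the sync_print() diagnostics
+(sketch only; the list must be protected by mutex_list_mutex):
+@code
+        ib_mutex_t*     m;
+
+        mutex_enter(&mutex_list_mutex);
+
+        for (m = UT_LIST_GET_FIRST(mutex_list);
+             m != NULL;
+             m = UT_LIST_GET_NEXT(list, m)) {
+                /* inspect m->cmutex_name, m->count_os_wait, ... */
+        }
+
+        mutex_exit(&mutex_list_mutex);
+@endcode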
*/ +extern ut_list_base_node_t mutex_list; + +/** Mutex protecting the mutex_list variable */ +extern ib_mutex_t mutex_list_mutex; + +#ifndef HAVE_ATOMIC_BUILTINS +/**********************************************************//** +Function that uses a mutex to decrement a variable atomically */ +UNIV_INLINE +void +os_atomic_dec_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the + decrement */ + volatile ulint* var, /*!< in/out: variable to + decrement */ + ulint delta); /*!< in: delta to decrement */ +/**********************************************************//** +Function that uses a mutex to increment a variable atomically */ +UNIV_INLINE +void +os_atomic_inc_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the + increment */ + volatile ulint* var, /*!< in/out: variable to + increment */ + ulint delta); /*!< in: delta to increment */ +#endif /* !HAVE_ATOMIC_BUILTINS */ + +#ifndef UNIV_NONINL +#include "sync0sync.ic" +#endif + +#endif diff --git a/storage/xtradb/include/sync0sync.ic b/storage/xtradb/include/sync0sync.ic new file mode 100644 index 00000000000..a252d73e432 --- /dev/null +++ b/storage/xtradb/include/sync0sync.ic @@ -0,0 +1,659 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0sync.ic +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +/******************************************************************//** +Sets the waiters field in a mutex. */ +UNIV_INTERN +void +mutex_set_waiters( +/*==============*/ + ib_mutex_t* mutex, /*!< in: mutex */ + ulint n); /*!< in: value to set */ +/******************************************************************//** +Reserves a mutex or a priority mutex for the current thread. If the mutex is +reserved, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) +waiting for the mutex before suspending the thread. 
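+
+The slow path has roughly the following shape, given here as a hedged
+pseudocode sketch (the real logic, including the wait-array handshake and
+the high-priority queueing, lives in sync0sync.cc):
+@code
+        while (spins++ < SYNC_SPIN_ROUNDS
+               && mutex_get_lock_word(mutex) != 0) {
+                ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+        }
+
+        if (!ib_mutex_test_and_set(mutex)) {
+                return;         /* acquired while spinning */
+        }
+
+        /* otherwise: reserve a sync array cell, set the waiters flag,
+        re-check the lock word, and suspend on the mutex event */
+@endcode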
*/ +UNIV_INTERN +void +mutex_spin_wait( +/*============*/ + void* _mutex, /*!< in: pointer to mutex */ + bool high_priority, /*!< in: whether the mutex is a + priority mutex with high priority + specified */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line); /*!< in: line where requested */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Sets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_set_debug_info( +/*=================*/ + ib_mutex_t* mutex, /*!< in: mutex */ + const char* file_name, /*!< in: file where requested */ + ulint line); /*!< in: line where requested */ +#endif /* UNIV_SYNC_DEBUG */ +/******************************************************************//** +Releases the threads waiting in the primary wait array for this mutex. */ +UNIV_INTERN +void +mutex_signal_object( +/*================*/ + ib_mutex_t* mutex); /*!< in: mutex */ + +/******************************************************************//** +Performs an atomic test-and-set instruction to the lock_word field of a +mutex. +@return the previous value of lock_word: 0 or 1 */ +UNIV_INLINE +byte +ib_mutex_test_and_set( +/*===============*/ + ib_mutex_t* mutex) /*!< in: mutex */ +{ +#if defined(HAVE_ATOMIC_BUILTINS) +# if defined(HAVE_ATOMIC_BUILTINS_BYTE) + return(os_atomic_test_and_set_byte(&mutex->lock_word, 1)); +# else + return(os_atomic_test_and_set_ulint(&mutex->lock_word, 1)); +# endif +#else + ibool ret; + + ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex)); + + if (ret == 0) { + /* We check that os_fast_mutex_trylock does not leak + and allow race conditions */ + ut_a(mutex->lock_word == 0); + + mutex->lock_word = 1; + os_wmb; + } + + return((byte) ret); +#endif +} + +/******************************************************************//** +Performs a reset instruction to the lock_word field of a mutex. This +instruction also serializes memory operations to the program order. */ +UNIV_INLINE +void +mutex_reset_lock_word( +/*==================*/ + ib_mutex_t* mutex) /*!< in: mutex */ +{ +#if defined(HAVE_ATOMIC_BUILTINS) + /* In theory __sync_lock_release should be used to release the lock. + Unfortunately, it does not work properly alone. The workaround is + that more conservative __sync_lock_test_and_set is used instead. */ +# if defined(HAVE_ATOMIC_BUILTINS_BYTE) + os_atomic_test_and_set_byte(&mutex->lock_word, 0); +# else + os_atomic_test_and_set_ulint(&mutex->lock_word, 0); +# endif +#else + mutex->lock_word = 0; + + os_fast_mutex_unlock(&(mutex->os_fast_mutex)); +#endif +} + +/******************************************************************//** +Gets the value of the lock word. */ +UNIV_INLINE +lock_word_t +mutex_get_lock_word( +/*================*/ + const ib_mutex_t* mutex) /*!< in: mutex */ +{ + ut_ad(mutex); + + return(mutex->lock_word); +} + +/******************************************************************//** +Gets the waiters field in a mutex. +@return value to set */ +UNIV_INLINE +ulint +mutex_get_waiters( +/*==============*/ + const ib_mutex_t* mutex) /*!< in: mutex */ +{ + const volatile ulint* ptr; /*!< declared volatile to ensure that + the value is read from memory */ + ut_ad(mutex); + + ptr = &(mutex->waiters); + + return(*ptr); /* Here we assume that the read of a single + word from memory is atomic */ +} + +/******************************************************************//** +NOTE! Use the corresponding macro mutex_exit(), not directly this function! 
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+ ib_mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ ut_ad(mutex_own(mutex));
+
+ ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_reset_level(mutex);
+#endif
+ mutex_reset_lock_word(mutex);
+
+ /* A problem: we assume that mutex_reset_lock_word
+ is a memory barrier, that is, when we read the waiters
+ field next, the read must be serialized in memory
+ after the reset. A speculative processor might
+ perform the read first, which could leave a waiting
+ thread hanging indefinitely.
+
+ Our current solution is to call
+ sync_arr_wake_threads_if_sema_free()
+ every second to wake up possible hanging
+ threads that mutex_signal_object has missed. */
+
+ if (mutex_get_waiters(mutex) != 0) {
+
+ mutex_signal_object(mutex);
+ }
+
+#ifdef UNIV_SYNC_PERF_STAT
+ mutex_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
+Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+ ib_prio_mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ ut_ad(mutex_own(mutex));
+
+ ut_d(mutex->base_mutex.thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_reset_level(&mutex->base_mutex);
+#endif
+ mutex_reset_lock_word(&mutex->base_mutex);
+
+ /* A problem: we assume that mutex_reset_lock_word
+ is a memory barrier, that is, when we read the waiters
+ field next, the read must be serialized in memory
+ after the reset. A speculative processor might
+ perform the read first, which could leave a waiting
+ thread hanging indefinitely.
+
+ Our current solution is to call
+ sync_arr_wake_threads_if_sema_free()
+ every second to wake up possible hanging
+ threads that mutex_signal_object has missed. */
+
+ /* Wake up any high priority waiters first. */
+ if (mutex->high_priority_waiters != 0) {
+
+ os_event_set(mutex->high_priority_event);
+ sync_array_object_signalled();
+
+ } else if (mutex_get_waiters(&mutex->base_mutex) != 0) {
+
+ mutex_signal_object(&mutex->base_mutex);
+ }
+
+#ifdef UNIV_SYNC_PERF_STAT
+ mutex_exit_count++;
+#endif
+
+}
+
+
+/******************************************************************//**
+Locks a mutex for the current thread. If the mutex is reserved, the function
+spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex
+before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line) /*!< in: line where locked */
+{
+ ut_ad(mutex_validate(mutex));
+ ut_ad(!mutex_own(mutex));
+
+ /* Note that we do not peek at the value of lock_word before trying
+ the atomic test_and_set; we could peek, and possibly save time. */
+
+ if (!ib_mutex_test_and_set(mutex)) {
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+ return; /* Succeeded! */
+ }
+
+ mutex_spin_wait(mutex, false, file_name, line);
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a priority mutex for the current thread.
If the mutex is +reserved the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) +waiting for the mutex before suspending the thread. If the thread is suspended, +the priority argument value determines the relative order for its wake up. Any +HIGH_PRIO waiters will be woken up before any LOW_PRIO waiters. In case of +DEFAULT_PRIO, the relative priority will be set according to +srv_current_thread_priority. */ +UNIV_INLINE +void +mutex_enter_func( +/*=============*/ + ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where + locked */ + ulint line, /*!< in: line where locked */ + enum ib_sync_priority priority) + /*!<in: mutex acquisition + priority */ +{ + bool high_priority; + + ut_ad(mutex_validate(&mutex->base_mutex)); + ut_ad(!mutex_own(mutex)); + + /* Note that we do not peek at the value of lock_word before trying + the atomic test_and_set; we could peek, and possibly save time. */ + + if (!ib_mutex_test_and_set(&mutex->base_mutex)) { + ut_d(mutex->base_mutex.thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(&mutex->base_mutex, file_name, line); +#endif + return; /* Succeeded! */ + } + + if (UNIV_LIKELY(priority == DEFAULT_PRIO)) { + high_priority = srv_current_thread_priority; + } else { + high_priority = (priority == HIGH_PRIO); + } + mutex_spin_wait(mutex, high_priority, file_name, line); +} + +/********************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. +@return 0 if succeed, 1 if not */ +UNIV_INLINE +ulint +mutex_enter_nowait_func( +/*====================*/ + ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line) /*!< in: line where + requested */ +{ + return mutex_enter_nowait_func(&mutex->base_mutex, file_name, line); +} + +#ifdef UNIV_PFS_MUTEX +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_enter(), not directly +this function! +This is a performance schema instrumented wrapper function for +mutex_enter_func(). */ +UNIV_INLINE +void +pfs_mutex_enter_func( +/*=================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where locked */ + ulint line) /*!< in: line where locked */ +{ + if (mutex->pfs_psi != NULL) { + PSI_mutex_locker* locker; + PSI_mutex_locker_state state; + + locker = PSI_MUTEX_CALL(start_mutex_wait)( + &state, mutex->pfs_psi, + PSI_MUTEX_LOCK, file_name, + static_cast<uint>(line)); + + mutex_enter_func(mutex, file_name, line); + + if (locker != NULL) { + PSI_MUTEX_CALL(end_mutex_wait)(locker, 0); + } + } else { + mutex_enter_func(mutex, file_name, line); + } +} + +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_enter(), not directly +this function! +This is a performance schema instrumented wrapper function for +mutex_enter_func(). 
*/
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+ ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where
+ locked */
+ ulint line, /*!< in: line where locked */
+ enum ib_sync_priority priority) /*!<in: mutex acquisition
+ priority */
+{
+ if (mutex->base_mutex.pfs_psi != NULL) {
+ PSI_mutex_locker* locker;
+ PSI_mutex_locker_state state;
+
+ locker = PSI_MUTEX_CALL(start_mutex_wait)(
+ &state, mutex->base_mutex.pfs_psi,
+ PSI_MUTEX_LOCK, file_name, line);
+
+ mutex_enter_func(mutex, file_name, line, priority);
+
+ if (locker != NULL) {
+ PSI_MUTEX_CALL(end_mutex_wait)(locker, 0);
+ }
+ } else {
+ mutex_enter_func(mutex, file_name, line, priority);
+ }
+}
+
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func().
+@return 0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint ret;
+
+ if (mutex->pfs_psi != NULL) {
+ PSI_mutex_locker* locker;
+ PSI_mutex_locker_state state;
+
+ locker = PSI_MUTEX_CALL(start_mutex_wait)(
+ &state, mutex->pfs_psi,
+ PSI_MUTEX_TRYLOCK, file_name,
+ static_cast<uint>(line));
+
+ ret = mutex_enter_nowait_func(mutex, file_name, line);
+
+ if (locker != NULL) {
+ PSI_MUTEX_CALL(end_mutex_wait)(locker, (int) ret);
+ }
+ } else {
+ ret = mutex_enter_nowait_func(mutex, file_name, line);
+ }
+
+ return(ret);
+}
+
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func().
+@return 0 if succeed, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ ib_prio_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line) /*!< in: line where
+ requested */
+{
+ return pfs_mutex_enter_nowait_func(&mutex->base_mutex, file_name,
+ line);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrap function of mutex_exit_func() with performance schema instrumentation.
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ ib_mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ if (mutex->pfs_psi != NULL) {
+ PSI_MUTEX_CALL(unlock_mutex)(mutex->pfs_psi);
+ }
+
+ mutex_exit_func(mutex);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrap function of mutex_exit_func() with performance schema instrumentation.
+Unlocks a priority mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ ib_prio_mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ if (mutex->base_mutex.pfs_psi != NULL) {
+ PSI_MUTEX_CALL(unlock_mutex)(mutex->base_mutex.pfs_psi);
+ }
+
+ mutex_exit_func(mutex);
+}
+
+
+/******************************************************************//**
+NOTE!
Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+	mysql_pfs_key_t	key,		/*!< in: Performance Schema key */
+	ib_mutex_t*	mutex,		/*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline,		/*!< in: file line where created */
+# endif /* UNIV_DEBUG */
+	const char*	cmutex_name)	/*!< in: mutex name */
+{
+	mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex);
+
+	mutex_create_func(mutex,
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+			  level,
+# endif /* UNIV_SYNC_DEBUG */
+			  cfile_name,
+			  cline,
+# endif /* UNIV_DEBUG */
+			  cmutex_name);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the priority mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+	PSI_mutex_key	key,		/*!< in: Performance Schema
+					key */
+	ib_prio_mutex_t*	mutex,	/*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where
+					created */
+	ulint		cline,		/*!< in: file line where
+					created */
+# endif /* UNIV_DEBUG */
+	const char*	cmutex_name)
+{
+	mutex->base_mutex.pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex);
+
+	mutex_create_func(mutex,
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+			  level,
+# endif /* UNIV_SYNC_DEBUG */
+			  cfile_name,
+			  cline,
+# endif /* UNIV_DEBUG */
+			  cmutex_name);
+}
+
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+	ib_mutex_t*	mutex)	/*!< in: mutex */
+{
+	if (mutex->pfs_psi != NULL) {
+		PSI_MUTEX_CALL(destroy_mutex)(mutex->pfs_psi);
+		mutex->pfs_psi = NULL;
+	}
+
+	mutex_free_func(mutex);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func().
Also destroys the performance +schema probes when freeing the priority mutex */ +UNIV_INLINE +void +pfs_mutex_free_func( +/*================*/ + ib_prio_mutex_t* mutex) /*!< in: mutex */ +{ + if (mutex->base_mutex.pfs_psi != NULL) { + PSI_MUTEX_CALL(destroy_mutex)(mutex->base_mutex.pfs_psi); + mutex->base_mutex.pfs_psi = NULL; + } + + mutex_free_func(mutex); +} + + +#endif /* UNIV_PFS_MUTEX */ + +#ifndef HAVE_ATOMIC_BUILTINS +/**********************************************************//** +Function that uses a mutex to decrement a variable atomically */ +UNIV_INLINE +void +os_atomic_dec_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the dec */ + volatile ulint* var, /*!< in/out: variable to decrement */ + ulint delta) /*!< in: delta to decrement */ +{ + mutex_enter(mutex); + + /* I don't think we will encounter a situation where + this check will not be required. */ + ut_ad(*var >= delta); + + *var -= delta; + + mutex_exit(mutex); +} + +/**********************************************************//** +Function that uses a mutex to increment a variable atomically */ +UNIV_INLINE +void +os_atomic_inc_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the increment */ + volatile ulint* var, /*!< in/out: variable to increment */ + ulint delta) /*!< in: delta to increment */ +{ + mutex_enter(mutex); + + *var += delta; + + mutex_exit(mutex); +} +#endif /* !HAVE_ATOMIC_BUILTINS */ diff --git a/storage/xtradb/include/sync0types.h b/storage/xtradb/include/sync0types.h new file mode 100644 index 00000000000..67f613ab8ae --- /dev/null +++ b/storage/xtradb/include/sync0types.h @@ -0,0 +1,44 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0types.h +Global types for sync + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0types_h +#define sync0types_h + +struct ib_mutex_t; + +/* The relative priority of the current thread. If 0, low priority; if 1, high +priority. */ +extern UNIV_THREAD_LOCAL ulint srv_current_thread_priority; + +struct ib_prio_mutex_t; + +/** Priority mutex and rwlatch acquisition priorities */ +enum ib_sync_priority { + DEFAULT_PRIO, + LOW_PRIO, + HIGH_PRIO +}; + +#endif diff --git a/storage/xtradb/include/trx0i_s.h b/storage/xtradb/include/trx0i_s.h new file mode 100644 index 00000000000..ac5e00c6834 --- /dev/null +++ b/storage/xtradb/include/trx0i_s.h @@ -0,0 +1,315 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "dict0types.h"
+#include "ut0ut.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT	16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN	8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN	1024
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_operation_state */
+#define TRX_I_S_TRX_OP_STATE_MAX_LEN	64
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_foreign_key_error */
+#define TRX_I_S_TRX_FK_ERROR_MAX_LEN	256
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_isolation_level */
+#define TRX_I_S_TRX_ISOLATION_LEVEL_MAX_LEN	16
+
+/** Safely copy strings into the INNODB_TRX table's
+string-based columns */
+#define TRX_I_S_STRING_COPY(data, field, constraint, tcache)	\
+do {	\
+	if (strlen(data) > constraint) {	\
+		char	buff[constraint + 1];	\
+		strncpy(buff, data, constraint);	\
+		buff[constraint] = '\0';	\
+	\
+		field = static_cast<const char*>(	\
+			ha_storage_put_memlim(	\
+			(tcache)->storage, buff, constraint + 1,\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	} else {	\
+		field = static_cast<const char*>(	\
+			ha_storage_put_str_memlim(	\
+			(tcache)->storage, data,	\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	}	\
+} while (0)
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+struct i_s_locks_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t {
+	i_s_locks_row_t*	value;	/*!< row of
+					INFORMATION_SCHEMA.innodb_locks*/
+	i_s_hash_chain_t*	next;	/*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_t {
+	trx_id_t	lock_trx_id;	/*!< transaction identifier */
+	const char*	lock_mode;	/*!< lock mode from
+					lock_get_mode_str() */
+	const char*	lock_type;	/*!< lock type from
+					lock_get_type_str() */
+	const char*	lock_table;	/*!< table name from
+					lock_get_table_name() */
+	const char*	lock_index;	/*!< index name from
+					lock_rec_get_index_name() */
+	/** Information for record locks.
All these are
+	ULINT_UNDEFINED for table locks. */
+	/* @{ */
+	ulint		lock_space;	/*!< tablespace identifier */
+	ulint		lock_page;	/*!< page number within the space */
+	ulint		lock_rec;	/*!< heap number of the record
+					on the page */
+	const char*	lock_data;	/*!< (some) content of the record */
+	/* @} */
+
+	/** The following are auxiliary and not included in the table */
+	/* @{ */
+	table_id_t	lock_table_id;
+					/*!< table identifier from
+					lock_get_table_id */
+	i_s_hash_chain_t hash_chain;	/*!< hash table chain node for
+					trx_i_s_cache_t::locks_hash */
+	/* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_t {
+	trx_id_t		trx_id;		/*!< transaction identifier */
+	const char*		trx_state;	/*!< transaction state from
+						trx_get_que_state_str() */
+	ib_time_t		trx_started;	/*!< trx_t::start_time */
+	const i_s_locks_row_t*	requested_lock_row;
+						/*!< pointer to a row
+						in innodb_locks if trx
+						is waiting, or NULL */
+	ib_time_t		trx_wait_started; /*!< trx_t::wait_started */
+	ullint			trx_weight;	/*!< TRX_WEIGHT() */
+	ulint			trx_mysql_thread_id; /*!< thd_get_thread_id() */
+	const char*		trx_query;	/*!< MySQL statement being
+						executed in the transaction */
+	struct charset_info_st*	trx_query_cs;
+						/*!< charset used to encode the
+						MySQL statement */
+	const char*		trx_operation_state; /*!< trx_t::op_info */
+	ulint			trx_tables_in_use;/*!< n_mysql_tables_in_use in
+						trx_t */
+	ulint			trx_tables_locked;
+						/*!< mysql_n_tables_locked in
+						trx_t */
+	ulint			trx_lock_structs;/*!< list len of trx_locks in
+						trx_t */
+	ulint			trx_lock_memory_bytes;
+						/*!< mem_heap_get_size(
+						trx->lock_heap) */
+	ulint			trx_rows_locked;/*!< lock_number_of_rows_locked() */
+	ullint			trx_rows_modified;/*!< trx_t::undo_no */
+	ulint			trx_concurrency_tickets;
+						/*!< n_tickets_to_enter_innodb in
+						trx_t */
+	const char*		trx_isolation_level;
+						/*!< isolation_level in trx_t */
+	ibool			trx_unique_checks;
+						/*!< check_unique_secondary in trx_t*/
+	ibool			trx_foreign_key_checks;
+						/*!< check_foreigns in trx_t */
+	const char*		trx_foreign_key_error;
+						/*!< detailed_error in trx_t */
+	ibool			trx_has_search_latch;
+						/*!< has_search_latch in trx_t */
+	ulint			trx_search_latch_timeout;
+						/*!< search_latch_timeout in trx_t */
+	ulint			trx_is_read_only;
+						/*!< trx_t::read_only */
+	ulint			trx_is_autocommit_non_locking;
+						/*!< trx_is_autocommit_non_locking(trx)
+						*/
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_t {
+	const i_s_locks_row_t*	requested_lock_row;	/*!< requested lock */
+	const i_s_locks_row_t*	blocking_lock_row;	/*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+	I_S_INNODB_TRX,		/*!< INFORMATION_SCHEMA.innodb_trx */
+	I_S_INNODB_LOCKS,	/*!< INFORMATION_SCHEMA.innodb_locks */
+	I_S_INNODB_LOCK_WAITS	/*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t*	trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache.
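+
+A hedged usage sketch (editorial addition, not in the original file): once
+the cache has been initialised, readers are expected to bracket their
+access with the shared-lock calls declared below:
+
+	trx_i_s_cache_start_read(trx_i_s_cache);
+	(copy the needed rows out of the cache)
+	trx_i_s_cache_end_read(trx_i_s_cache);
+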
*/
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table);	/*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table,	/*!< in: which table */
+	ulint			n);	/*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache */
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/** The maximum length of a lock id produced by trx_i_s_create_lock_id(),
+not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN	(TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
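+
+Illustrative call (editorial addition, sized per the note above):
+
+	char	lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+
+	trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id));
+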
+@return resulting lock id */ +UNIV_INTERN +char* +trx_i_s_create_lock_id( +/*===================*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + char* lock_id,/*!< out: resulting lock_id */ + ulint lock_id_size);/*!< in: size of the lock id + buffer */ + +UNIV_INTERN +void +trx_i_s_get_lock_sys_memory_usage(ulint *constant, ulint *variable); + +#endif /* trx0i_s_h */ diff --git a/storage/xtradb/include/trx0purge.h b/storage/xtradb/include/trx0purge.h new file mode 100644 index 00000000000..a862523c092 --- /dev/null +++ b/storage/xtradb/include/trx0purge.h @@ -0,0 +1,222 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0purge.h +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0purge_h +#define trx0purge_h + +#include "univ.i" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "que0types.h" +#include "page0page.h" +#include "usr0sess.h" +#include "fil0fil.h" + +/** The global data structure coordinating a purge */ +extern trx_purge_t* purge_sys; + +/** A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +extern trx_undo_rec_t trx_purge_dummy_rec; + +/********************************************************************//** +Calculates the file address of an undo log header when we have the file +address of its history list node. +@return file address of the log */ +UNIV_INLINE +fil_addr_t +trx_purge_get_log_from_hist( +/*========================*/ + fil_addr_t node_addr); /*!< in: file address of the history + list node of the log */ +/********************************************************************//** +Creates the global purge system control structure and inits the history +mutex. */ +UNIV_INTERN +void +trx_purge_sys_create( +/*=================*/ + ulint n_purge_threads,/*!< in: number of purge threads */ + ib_bh_t* ib_bh); /*!< in/own: UNDO log min binary heap*/ +/********************************************************************//** +Frees the global purge system control structure. */ +UNIV_INTERN +void +trx_purge_sys_close(void); +/*======================*/ +/************************************************************************ +Adds the update undo log as the first log in the history list. Removes the +update undo log segment from the rseg slot if it is too big for reuse. 
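+
+Editorial note (not in the original file): the history list node that this
+function links in sits at a fixed offset, TRX_UNDO_HISTORY_NODE from
+trx0undo.h, inside the undo log header, which is why
+trx_purge_get_log_from_hist() above can recover the header address with
+plain offset arithmetic:
+
+	fil_addr_t	hdr_addr = trx_purge_get_log_from_hist(node_addr);
+	ut_a(hdr_addr.page == node_addr.page);
+	ut_a(hdr_addr.boffset == node_addr.boffset - TRX_UNDO_HISTORY_NODE);
+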
*/
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+	trx_t*	trx,		/*!< in: transaction */
+	page_t*	undo_page,	/*!< in: update undo log header page,
+				x-latched */
+	mtr_t*	mtr);		/*!< in: mtr */
+/*******************************************************************//**
+This function runs a purge batch.
+@return number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(
+/*======*/
+	ulint	n_purge_threads,	/*!< in: number of purge tasks to
+					submit to task queue. */
+	ulint	limit,			/*!< in: the maximum number of
+					records to purge in one batch */
+	bool	truncate);		/*!< in: truncate history if true */
+/*******************************************************************//**
+Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */
+UNIV_INTERN
+void
+trx_purge_stop(void);
+/*================*/
+/*******************************************************************//**
+Resume purge, move to PURGE_STATE_RUN. */
+UNIV_INTERN
+void
+trx_purge_run(void);
+/*================*/
+
+/** Purge states */
+enum purge_state_t {
+	PURGE_STATE_INIT,	/*!< Purge instance created */
+	PURGE_STATE_RUN,	/*!< Purge should be running */
+	PURGE_STATE_STOP,	/*!< Purge should be stopped */
+	PURGE_STATE_EXIT,	/*!< Purge has been shutdown */
+	PURGE_STATE_DISABLED	/*!< Purge was never started */
+};
+
+/*******************************************************************//**
+Get the purge state.
+@return purge state. */
+UNIV_INTERN
+purge_state_t
+trx_purge_state(void);
+/*=================*/
+
+/** This is the purge pointer/iterator. We need both the undo no and the
+transaction no up to which purge has parsed and applied the records. */
+struct purge_iter_t {
+	trx_id_t	trx_no;		/*!< Purge has advanced past all
+					transactions whose number is less
+					than this */
+	undo_no_t	undo_no;	/*!< Purge has advanced past all records
+					whose undo number is less than this */
+};
+
+/** The control structure used in the purge operation */
+struct trx_purge_t{
+	sess_t*		sess;		/*!< System session running the purge
+					query */
+	trx_t*		trx;		/*!< System transaction running the
+					purge query: this trx is not in the
+					trx list of the trx system and it
+					never ends */
+	prio_rw_lock_t	latch;		/*!< The latch protecting the purge
+					view. A purge operation must acquire an
+					x-latch here for the instant at which
+					it changes the purge view: an undo
+					log operation can prevent this by
+					obtaining an s-latch here. It also
+					protects state and running */
+	os_event_t	event;		/*!< State signal event */
+	ulint		n_stop;		/*!< Counter to track the number of
+					stops */
+	volatile bool	running;	/*!< true, if purge is active,
+					we check this without the latch too */
+	volatile purge_state_t	state;	/*!< Purge coordinator thread states,
+					we check this in several places
+					without holding the latch.
*/
+	que_t*		query;		/*!< The query graph which will do the
+					parallelized purge operation */
+	read_view_t*	view;		/*!< The purge will not remove undo logs
+					which are >= this view (purge view) */
+	read_view_t*	prebuilt_clone;	/*!< Pre-built view which is used as a
+					temporary clone of the oldest view in
+					read_view_purge_open() */
+	read_view_t*	prebuilt_view;	/*!< Pre-built view array */
+	volatile ulint	n_submitted;	/*!< Count of total tasks submitted
+					to the task queue */
+	volatile ulint	n_completed;	/*!< Count of total tasks completed */
+
+	/*------------------------------*/
+	/* The following two fields form the 'purge pointer' which advances
+	during a purge, and which is used in history list truncation */
+
+	purge_iter_t	iter;		/* Limit up to which we have read and
+					parsed the UNDO log records. Not
+					necessarily purged from the indexes.
+					Note that this can never be less than
+					the limit below, we check for this
+					invariant in trx0purge.cc */
+	purge_iter_t	limit;		/* The 'purge pointer' which advances
+					during a purge, and which is used in
+					history list truncation */
+#ifdef UNIV_DEBUG
+	purge_iter_t	done;		/* Indicates the 'purge pointer'
+					position up to which records have
+					actually been purged. */
+#endif /* UNIV_DEBUG */
+	/*-----------------------------*/
+	ibool		next_stored;	/*!< TRUE if the info of the next record
+					to purge is stored below: if yes, then
+					the transaction number and the undo
+					number of the record are stored in
+					purge_trx_no and purge_undo_no above */
+	trx_rseg_t*	rseg;		/*!< Rollback segment for the next undo
+					record to purge */
+	ulint		page_no;	/*!< Page number for the next undo
+					record to purge, page number of the
+					log header, if dummy record */
+	ulint		offset;		/*!< Page offset for the next undo
+					record to purge, 0 if the dummy
+					record */
+	ulint		hdr_page_no;	/*!< Header page of the undo log where
+					the next record to purge belongs */
+	ulint		hdr_offset;	/*!< Header byte offset on the page */
+	/*-----------------------------*/
+	mem_heap_t*	heap;		/*!< Temporary storage used during a
+					purge: can be emptied after purge
+					completes */
+	/*-----------------------------*/
+	ib_bh_t*	ib_bh;		/*!< Binary min-heap, ordered on
+					rseg_queue_t::trx_no. It is protected
+					by the bh_mutex */
+	ib_mutex_t	bh_mutex;	/*!< Mutex protecting ib_bh */
+};
+
+/** Info required to purge a record */
+struct trx_purge_rec_t {
+	trx_undo_rec_t*	undo_rec;	/*!< Record to purge */
+	roll_ptr_t	roll_ptr;	/*!< File pointer to UNDO record */
+};
+
+#ifndef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0purge.ic b/storage/xtradb/include/trx0purge.ic
new file mode 100644
index 00000000000..ca9cc1fb894
--- /dev/null
+++ b/storage/xtradb/include/trx0purge.ic
@@ -0,0 +1,62 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.ic
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+	fil_addr_t	node_addr)	/*!< in: file address of the history
+					list node of the log */
+{
+	node_addr.boffset -= TRX_UNDO_HISTORY_NODE;
+
+	return(node_addr);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Checks that the purge 'limit' position never runs ahead of the purge
+'iter' position.
+@return TRUE if purge_sys_t::limit <= purge_sys_t::iter*/
+UNIV_INLINE
+ibool
+trx_purge_check_limit(void)
+/*=======================*/
+{
+	ut_ad(purge_sys->limit.trx_no <= purge_sys->iter.trx_no);
+
+	if (purge_sys->limit.trx_no == purge_sys->iter.trx_no) {
+		ut_ad(purge_sys->limit.undo_no <= purge_sys->iter.undo_no);
+	}
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
diff --git a/storage/xtradb/include/trx0rec.h b/storage/xtradb/include/trx0rec.h
new file mode 100644
index 00000000000..96e7d595035
--- /dev/null
+++ b/storage/xtradb/include/trx0rec.h
@@ -0,0 +1,326 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rec_h
+#define trx0rec_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+#include "data0data.h"
+#include "rem0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "que0types.h"
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+	const trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	mem_heap_t*		heap);		/*!< in: heap where copied */
+/**********************************************************************//**
+Reads the undo log record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+	undo_no_t	undo_no)	/*!< in: undo no read from node */
+	__attribute__((const));
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no)	\
+	((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+	trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	ulint*		type,		/*!< out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	ulint*		cmpl_info,	/*!< out: compiler info, relevant only
+					for update type records */
+	bool*		updated_extern,	/*!< out: true if we updated an
+					externally stored field */
+	undo_no_t*	undo_no,	/*!< out: undo log record number */
+	table_id_t*	table_id)	/*!< out: table id */
+	__attribute__((nonnull));
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+	byte*		ptr,	/*!< in: remaining part of a copy of an undo log
+				record, at the start of the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the row reference is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t**	ref,	/*!< out, own: row reference */
+	mem_heap_t*	heap);	/*!< in: memory heap from which the memory
+				needed is allocated */
+/*******************************************************************//**
+Skips a row reference from an undo log record.
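+
+A hedged parsing sketch (editorial addition, not in the original file):
+the general parameters are read first with trx_undo_rec_get_pars(), and
+the returned pointer is then passed to the row-reference readers declared
+here:
+
+	ulint		type;
+	ulint		cmpl_info;
+	bool		updated_extern;
+	undo_no_t	undo_no;
+	table_id_t	table_id;
+	byte*		ptr = trx_undo_rec_get_pars(
+		undo_rec, &type, &cmpl_info, &updated_extern,
+		&undo_no, &table_id);
+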
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record, at the start of the row reference */
+	dict_index_t*	index);	/*!< in: clustered index */
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+	byte*		ptr,		/*!< in: remaining part of undo
+					log record after reading
+					general parameters */
+	trx_id_t*	trx_id,		/*!< out: trx id */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
+	ulint*		info_bits);	/*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record, after reading the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the update vector is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: TRX_UNDO_UPD_EXIST_REC,
+				TRX_UNDO_UPD_DEL_REC, or
+				TRX_UNDO_DEL_MARK_REC; in the last case,
+				only trx id and roll ptr fields are added to
+				the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
+	ulint		info_bits,/*!< in: info bits from this undo record */
+	trx_t*		trx,	/*!< in: transaction */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	upd_t**		upd);	/*!< out, own: update vector */
+/*******************************************************************//**
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record of a suitable type, at the start of
+				the stored index columns;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the partial row is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t**	row,	/*!< out, own: partial row */
+	ibool		ignore_prefix,	/*!< in: flag to indicate if we
+				expect blob prefixes in undo. Used
+				only in the assertion. */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+	__attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +trx_undo_report_row_operation( +/*==========================*/ + ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is + set, does nothing */ + ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or + TRX_UNDO_MODIFY_OP */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* clust_entry, /*!< in: in the case of an insert, + index entry to insert into the + clustered index, otherwise NULL */ + const upd_t* update, /*!< in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + const rec_t* rec, /*!< in: case of an update or delete + marking, the record in the clustered + index, otherwise NULL */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the + inserted undo log record, + 0 if BTR_NO_UNDO_LOG + flag was specified */ + __attribute__((nonnull(4,10), warn_unused_result)); +/******************************************************************//** +Copies an undo record to heap. This function can be called if we know that +the undo log record exists. +@return own: copy of the record */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_undo_rec_low( +/*======================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Build a previous version of a clustered index record. The caller must +hold a latch on the index page of the clustered index record. +@retval true if previous version was built, or if it was an insert +or the table has been rebuilt +@retval false if the previous version is earlier than purge_view, +which means that it may have been removed */ +UNIV_INTERN +bool +trx_undo_prev_version_build( +/*========================*/ + const rec_t* index_rec,/*!< in: clustered index record in the + index tree */ + mtr_t* index_mtr,/*!< in: mtr which contains the latch to + index_rec page and purge_view */ + const rec_t* rec, /*!< in: version of a clustered index record */ + dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers)/*!< out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Parses a redo log record of adding an undo log record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_add_undo_rec( +/*========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page); /*!< in: page or NULL */ +/***********************************************************//** +Parses a redo log record of erasing of an undo page end. 
+@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_erase_page_end( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ + +#ifndef UNIV_HOTBACKUP + +/* Types of an undo log record: these have to be smaller than 16, as the +compilation info multiplied by 16 is ORed to this value in an undo log +record */ + +#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */ +#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked + record */ +#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to + a not delete marked record; also the + fields of the record can change */ +#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields + do not change */ +#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by + this and ORed to the type above */ +#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl + to denote that we updated external + storage fields: used by purge to + free the external storage */ + +/* Operation type flags used in trx_undo_report_row_operation */ +#define TRX_UNDO_INSERT_OP 1 +#define TRX_UNDO_MODIFY_OP 2 + +#ifndef UNIV_NONINL +#include "trx0rec.ic" +#endif + +#endif /* !UNIV_HOTBACKUP */ + +#endif /* trx0rec_h */ diff --git a/storage/xtradb/include/trx0rec.ic b/storage/xtradb/include/trx0rec.ic new file mode 100644 index 00000000000..08704f6b821 --- /dev/null +++ b/storage/xtradb/include/trx0rec.ic @@ -0,0 +1,113 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rec.ic +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Reads from an undo log record the record type. +@return record type */ +UNIV_INLINE +ulint +trx_undo_rec_get_type( +/*==================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1)); +} + +/**********************************************************************//** +Reads from an undo log record the record compiler info. 
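+
+Worked example (editorial addition, not in the original file): the byte at
+offset 2 packs both fields, so a stored value of 28 decodes as
+type = 28 & (TRX_UNDO_CMPL_INFO_MULT - 1) = 12, i.e. TRX_UNDO_UPD_EXIST_REC,
+and cmpl_info = 28 / TRX_UNDO_CMPL_INFO_MULT = 1, given that
+TRX_UNDO_CMPL_INFO_MULT is defined as 16 in trx0rec.h.
+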
+@return compiler info */ +UNIV_INLINE +ulint +trx_undo_rec_get_cmpl_info( +/*=======================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT); +} + +/**********************************************************************//** +Returns TRUE if an undo log record contains an extern storage field. +@return TRUE if extern */ +UNIV_INLINE +ibool +trx_undo_rec_get_extern_storage( +/*============================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) { + + return(TRUE); + } + + return(FALSE); +} + +/**********************************************************************//** +Reads the undo log record number. +@return undo no */ +UNIV_INLINE +undo_no_t +trx_undo_rec_get_undo_no( +/*=====================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + const byte* ptr; + + ptr = undo_rec + 3; + + return(mach_ull_read_much_compressed(ptr)); +} + +/**********************************************************************//** +Returns the start of the undo record data area. +@return offset to the data area */ +UNIV_INLINE +ulint +trx_undo_rec_get_offset( +/*====================*/ + undo_no_t undo_no) /*!< in: undo no read from node */ +{ + return(3 + mach_ull_get_much_compressed_size(undo_no)); +} + +/***********************************************************************//** +Copies the undo record to the heap. +@return own: copy of undo log record */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_rec_copy( +/*==============*/ + const trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + mem_heap_t* heap) /*!< in: heap where copied */ +{ + ulint len; + + len = mach_read_from_2(undo_rec) + - ut_align_offset(undo_rec, UNIV_PAGE_SIZE); + ut_ad(len < UNIV_PAGE_SIZE); + return((trx_undo_rec_t*) mem_heap_dup(heap, undo_rec, len)); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/trx0roll.h b/storage/xtradb/include/trx0roll.h new file mode 100644 index 00000000000..aa3dbb1f6cd --- /dev/null +++ b/storage/xtradb/include/trx0roll.h @@ -0,0 +1,296 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0roll.h +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0roll_h +#define trx0roll_h + +#include "univ.i" +#include "btr0types.h" +#include "trx0trx.h" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" + +/*******************************************************************//** +Determines if this transaction is rolling back an incomplete transaction +in crash recovery. +@return TRUE if trx is an incomplete transaction that is being rolled +back in crash recovery */ +UNIV_INTERN +ibool +trx_is_recv( +/*========*/ + const trx_t* trx); /*!< in: transaction */ +/*******************************************************************//** +Returns a transaction savepoint taken at this point in time. +@return savepoint */ +UNIV_INTERN +trx_savept_t +trx_savept_take( +/*============*/ + trx_t* trx); /*!< in: transaction */ +/*******************************************************************//** +Frees an undo number array. */ +UNIV_INTERN +void +trx_undo_arr_free( +/*==============*/ + trx_undo_arr_t* arr); /*!< in: undo number array */ +/*******************************************************************//** +Returns pointer to nth element in an undo number array. +@return pointer to the nth element */ +UNIV_INLINE +trx_undo_inf_t* +trx_undo_arr_get_nth_info( +/*======================*/ + trx_undo_arr_t* arr, /*!< in: undo number array */ + ulint n); /*!< in: position */ +/********************************************************************//** +Pops the topmost record when the two undo logs of a transaction are seen +as a single stack of records ordered by their undo numbers. Inserts the +undo number of the popped undo record to the array of currently processed +undo numbers in the transaction. When the query thread finishes processing +of this undo record, it must be released with trx_undo_rec_release. +@return undo log record copied to heap, NULL if none left, or if the +undo number of the top record would be less than the limit */ +UNIV_INTERN +trx_undo_rec_t* +trx_roll_pop_top_rec_of_trx( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + undo_no_t limit, /*!< in: least undo number we need */ + roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */ + mem_heap_t* heap); /*!< in: memory heap where copied */ +/********************************************************************//** +Reserves an undo log record for a query thread to undo. This should be +called if the query thread gets the undo log record not using the pop +function above. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +trx_undo_rec_reserve( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no);/*!< in: undo number of the record */ +/*******************************************************************//** +Releases a reserved undo record. 
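+
+A hedged pairing sketch (editorial addition, not in the original file): a
+query thread that obtains an undo record without going through
+trx_roll_pop_top_rec_of_trx() first reserves it and releases it when the
+undo work is done:
+
+	if (trx_undo_rec_reserve(trx, undo_no)) {
+		(apply the undo record)
+		trx_undo_rec_release(trx, undo_no);
+	}
+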
*/ +UNIV_INTERN +void +trx_undo_rec_release( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no);/*!< in: undo number */ +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. */ +UNIV_INTERN +void +trx_rollback_or_clean_recovered( +/*============================*/ + ibool all); /*!< in: FALSE=roll back dictionary transactions; + TRUE=roll back all non-PREPARED transactions */ +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( +/*================================================*/ + void* arg __attribute__((unused))); + /*!< in: a dummy parameter required by + os_thread_create */ +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +UNIV_INTERN +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_for_mysql( +/*===================*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/*******************************************************************//** +Rollback the latest SQL statement for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/*******************************************************************//** +Rollback a transaction to a given savepoint or do a complete rollback. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_to_savepoint( +/*======================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if + partial rollback requested, or NULL for + complete rollback */ + __attribute__((nonnull(1))); +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. 
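+
+Illustrative lifecycle (editorial addition; trx_savepoint_for_mysql() is
+declared further below, and binlog_cache_pos stands for the caller's
+binlog position):
+
+	ib_int64_t	binlog_pos;
+
+	trx_savepoint_for_mysql(trx, "sp1", binlog_cache_pos);
+	(execute further statements)
+	err = trx_rollback_to_savepoint_for_mysql(trx, "sp1", &binlog_pos);
+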
+@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. +@return always DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_savepoint_for_mysql( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ + __attribute__((nonnull)); +/*******************************************************************//** +Releases a named savepoint. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_release_savepoint_for_mysql( +/*============================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name) /*!< in: savepoint name */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Frees savepoint structs starting from savep. 
*/ +UNIV_INTERN +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep); /*!< in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ + +/** A cell of trx_undo_arr_t; used during a rollback and a purge */ +struct trx_undo_inf_t{ + ibool in_use; /*!< true if cell is being used */ + trx_id_t trx_no; /*!< transaction number: not defined during + a rollback */ + undo_no_t undo_no;/*!< undo number of an undo record */ +}; + +/** During a rollback and a purge, undo numbers of undo records currently being +processed are stored in this array */ + +struct trx_undo_arr_t{ + ulint n_cells; /*!< number of cells in the array */ + ulint n_used; /*!< number of cells in use */ + trx_undo_inf_t* infos; /*!< the array of undo infos */ + mem_heap_t* heap; /*!< memory heap from which allocated */ +}; + +/** Rollback node states */ +enum roll_node_state { + ROLL_NODE_NONE = 0, /*!< Unknown state */ + ROLL_NODE_SEND, /*!< about to send a rollback signal to + the transaction */ + ROLL_NODE_WAIT /*!< rollback signal sent to the + transaction, waiting for completion */ +}; + +/** Rollback command node in a query graph */ +struct roll_node_t{ + que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */ + enum roll_node_state state; /*!< node execution state */ + ibool partial;/*!< TRUE if we want a partial + rollback */ + trx_savept_t savept; /*!< savepoint to which to + roll back, in the case of a + partial rollback */ + que_thr_t* undo_thr;/*!< undo query graph */ +}; + +/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */ +struct trx_named_savept_t{ + char* name; /*!< savepoint name */ + trx_savept_t savept; /*!< the undo number corresponding to + the savepoint */ + ib_int64_t mysql_binlog_cache_pos; + /*!< the MySQL binlog cache position + corresponding to this savepoint, not + defined if the MySQL binlogging is not + enabled */ + UT_LIST_NODE_T(trx_named_savept_t) + trx_savepoints; /*!< the list of savepoints of a + transaction */ +}; + +#ifndef UNIV_NONINL +#include "trx0roll.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0roll.ic b/storage/xtradb/include/trx0roll.ic new file mode 100644 index 00000000000..178e9bb730a --- /dev/null +++ b/storage/xtradb/include/trx0roll.ic @@ -0,0 +1,40 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0roll.ic +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/*******************************************************************//** +Returns pointer to nth element in an undo number array. +@return pointer to the nth element */ +UNIV_INLINE +trx_undo_inf_t* +trx_undo_arr_get_nth_info( +/*======================*/ + trx_undo_arr_t* arr, /*!< in: undo number array */ + ulint n) /*!< in: position */ +{ + ut_ad(arr); + ut_ad(n < arr->n_cells); + + return(arr->infos + n); +} diff --git a/storage/xtradb/include/trx0rseg.h b/storage/xtradb/include/trx0rseg.h new file mode 100644 index 00000000000..b9c84ef2b06 --- /dev/null +++ b/storage/xtradb/include/trx0rseg.h @@ -0,0 +1,230 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rseg.h +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0rseg_h +#define trx0rseg_h + +#include "univ.i" +#include "trx0types.h" +#include "trx0sys.h" +#include "ut0bh.h" + +/******************************************************************//** +Gets a rollback segment header. +@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get( +/*==========*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Gets a newly created rollback segment header. +@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get_new( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Gets the file page number of the nth undo log slot. 
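+
+Editorial sketch (not in the original file): trx_rsegf_undo_find_free()
+below is essentially a scan over these slots; by our reading of the
+implementation a free slot stores the page number FIL_NULL, and there are
+TRX_RSEG_N_SLOTS of them per rollback segment header:
+
+	for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+		if (trx_rsegf_get_nth_undo(rsegf, i, mtr) == FIL_NULL) {
+			break;	(free slot found)
+		}
+	}
+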
+@return page number of the undo log segment */ +UNIV_INLINE +ulint +trx_rsegf_get_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Sets the file page number of the nth undo log slot. */ +UNIV_INLINE +void +trx_rsegf_set_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + ulint page_no,/*!< in: page number of the undo log segment */ + mtr_t* mtr); /*!< in: mtr */ +/****************************************************************//** +Looks for a free slot for an undo log segment. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INLINE +ulint +trx_rsegf_undo_find_free( +/*=====================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Looks for a rollback segment, based on the rollback segment id. +@return rollback segment */ +UNIV_INLINE +trx_rseg_t* +trx_rseg_get_on_id( +/*===============*/ + ulint id); /*!< in: rollback segment id */ +/****************************************************************//** +Creates a rollback segment header. This function is called only when +a new rollback segment is created in the database. +@return page number of the created segment, FIL_NULL on failure */ +UNIV_INTERN +ulint +trx_rseg_header_create( +/*===================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint max_size, /*!< in: max size in pages */ + ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************************//** +Creates the memory copies for rollback segments and initializes the +rseg array in trx_sys at database startup. */ +UNIV_INTERN +void +trx_rseg_array_init( +/*================*/ + trx_sysf_t* sys_header, /*!< in/out: trx system header */ + ib_bh_t* ib_bh, /*!< in: rseg queue */ + mtr_t* mtr); /*!< in/out: mtr */ +/*************************************************************************** +Frees an instance of the rollback segment in memory. */ +UNIV_INTERN +void +trx_rseg_mem_free( +/*==============*/ + trx_rseg_t* rseg); /*!< in, own: instance to free */ + +/********************************************************************* +Creates a rollback segment. */ +UNIV_INTERN +trx_rseg_t* +trx_rseg_create( +/*============*/ + ulint space); /*!< in: id of UNDO tablespace */ + +/******************************************************************** +Get the number of unique rollback tablespaces in use except space id 0. +The last space id will be the sentinel value ULINT_UNDEFINED. The array +will be sorted on space id. Note: space_ids should have space for +TRX_SYS_N_RSEGS + 1 elements. +@return number of unique rollback tablespaces in use.
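An illustrative use of the contract just stated (output sorted on space id, ULINT_UNDEFINED sentinel, room for TRX_SYS_N_RSEGS + 1 entries):

	ulint	space_ids[TRX_SYS_N_RSEGS + 1];
	ulint	n_undo;

	n_undo = trx_rseg_get_n_undo_tablespaces(space_ids);
	/* space_ids[0 .. n_undo - 1] hold the undo space ids, sorted;
	space_ids[n_undo] == ULINT_UNDEFINED terminates the array */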
*/ +UNIV_INTERN +ulint +trx_rseg_get_n_undo_tablespaces( +/*============================*/ + ulint* space_ids); /*!< out: array of space ids of + UNDO tablespaces */ +/* Number of undo log slots in a rollback segment file copy */ +#define TRX_RSEG_N_SLOTS (UNIV_PAGE_SIZE / 16) + +/* Maximum number of transactions supported by a single rollback segment */ +#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2) + +/* The rollback segment memory object */ +struct trx_rseg_t{ + /*--------------------------------------------------------*/ + ulint id; /*!< rollback segment id == the index of + its slot in the trx system file copy */ + ib_prio_mutex_t mutex; /*!< mutex protecting the fields in this + struct except id, which is constant */ + ulint space; /*!< space where the rollback segment + header is placed */ + ulint zip_size;/* compressed page size of space + in bytes, or 0 for uncompressed spaces */ + ulint page_no;/* page number of the rollback segment + header */ + ulint max_size;/* maximum allowed size in pages */ + ulint curr_size;/* current size in pages */ + /*--------------------------------------------------------*/ + /* Fields for update undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list; + /* List of update undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached; + /* List of update undo log segments + cached for fast reuse */ + /*--------------------------------------------------------*/ + /* Fields for insert undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list; + /* List of insert undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached; + /* List of insert undo log segments + cached for fast reuse */ + /*--------------------------------------------------------*/ + ulint last_page_no; /*!< Page number of the last not yet + purged log header in the history list; + FIL_NULL if the whole list is purged */ + ulint last_offset; /*!< Byte offset of the last not yet + purged log header */ + trx_id_t last_trx_no; /*!< Transaction number of the last not + yet purged log */ + ibool last_del_marks; /*!< TRUE if the last not yet purged log + needs purging */ +}; + +/** For prioritising the rollback segments for purge.
*/ +struct rseg_queue_t { + trx_id_t trx_no; /*!< trx_rseg_t::last_trx_no */ + trx_rseg_t* rseg; /*!< Rollback segment */ +}; + +/* Undo log segment slot in a rollback segment header */ +/*-------------------------------------------------------------*/ +#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of + an undo log segment */ +/*-------------------------------------------------------------*/ +/* Slot size */ +#define TRX_RSEG_SLOT_SIZE 4 + +/* The offset of the rollback segment header on its page */ +#define TRX_RSEG FSEG_PAGE_DATA + +/* Transaction rollback segment header */ +/*-------------------------------------------------------------*/ +#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback + segment in pages */ +#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied + by the logs in the history list */ +#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed + transactions */ +#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE) + /* Header for the file segment where + this page is placed */ +#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE) + /* Undo log segment slots */ +/*-------------------------------------------------------------*/ + +#ifndef UNIV_NONINL +#include "trx0rseg.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0rseg.ic b/storage/xtradb/include/trx0rseg.ic new file mode 100644 index 00000000000..30743da9b8c --- /dev/null +++ b/storage/xtradb/include/trx0rseg.ic @@ -0,0 +1,167 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rseg.ic +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "srv0srv.h" +#include "mtr0log.h" +#include "trx0sys.h" + +/******************************************************************//** +Gets a rollback segment header. +@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get( +/*==========*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + trx_rsegf_t* header; + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER); + + header = TRX_RSEG + buf_block_get_frame(block); + + return(header); +} + +/******************************************************************//** +Gets a newly created rollback segment header. 
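Taken together, the offsets above fix where undo slot n sits on the rseg header page. A worked example, assuming the stock InnoDB values FSEG_PAGE_DATA = 38, FLST_BASE_NODE_SIZE = 16 and FSEG_HEADER_SIZE = 10 (all defined elsewhere in the tree):

	/* byte offset of undo slot n within the rseg header page */
	offset = 38		/* TRX_RSEG == FSEG_PAGE_DATA	      */
	       + 8 + 16 + 10	/* TRX_RSEG_UNDO_SLOTS == 34	      */
	       + n * 4;		/* TRX_RSEG_SLOT_SIZE bytes per slot  */
	/* slot 0 is at byte 72, slot 1 at byte 76, and so on */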
+@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get_new( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + trx_rsegf_t* header; + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); + + header = TRX_RSEG + buf_block_get_frame(block); + + return(header); +} + +/***************************************************************//** +Gets the file page number of the nth undo log slot. +@return page number of the undo log segment */ +UNIV_INLINE +ulint +trx_rsegf_get_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (n >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: trying to get slot %lu of rseg\n", + (ulong) n); + ut_error; + } + + return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr)); +} + +/***************************************************************//** +Sets the file page number of the nth undo log slot. */ +UNIV_INLINE +void +trx_rsegf_set_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + ulint page_no,/*!< in: page number of the undo log segment */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (n >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: trying to set slot %lu of rseg\n", + (ulong) n); + ut_error; + } + + mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE, + page_no, MLOG_4BYTES, mtr); +} + +/****************************************************************//** +Looks for a free slot for an undo log segment. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INLINE +ulint +trx_rsegf_undo_find_free( +/*=====================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint i; + ulint page_no; + + for (i = 0; +#ifndef UNIV_DEBUG + i < TRX_RSEG_N_SLOTS; +#else + i < (trx_rseg_n_slots_debug ? trx_rseg_n_slots_debug : TRX_RSEG_N_SLOTS); +#endif + i++) { + + page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/******************************************************************//** +Looks for a rollback segment, based on the rollback segment id. +@return rollback segment */ +UNIV_INLINE +trx_rseg_t* +trx_rseg_get_on_id( +/*===============*/ + ulint id) /*!< in: rollback segment id */ +{ + ut_a(id < TRX_SYS_N_RSEGS); + + return(trx_sys->rseg_array[id]); +} + diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h new file mode 100644 index 00000000000..7b97c6e99cd --- /dev/null +++ b/storage/xtradb/include/trx0sys.h @@ -0,0 +1,722 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0sys.h +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0sys_h +#define trx0sys_h + +#include "univ.i" + +#include "trx0types.h" +#include "fsp0types.h" +#include "fil0fil.h" +#include "buf0buf.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0mtr.h" +#include "ut0byte.h" +#include "mem0mem.h" +#include "sync0sync.h" +#include "ut0lst.h" +#include "ut0bh.h" +#include "read0types.h" +#include "page0types.h" +#include "ut0bh.h" + +typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t; + +/** In a MySQL replication slave, in crash recovery we store the master log +file name and position here. */ +/* @{ */ +/** Master binlog file name */ +extern char trx_sys_mysql_master_log_name[]; +/** Master binlog file position. We have successfully got the updates +up to this position. -1 means that no crash recovery was needed, or +there was no master log position info inside InnoDB.*/ +extern ib_int64_t trx_sys_mysql_master_log_pos; +/* @} */ + +/** If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. */ +/* @{ */ +/** Binlog file name */ +extern char trx_sys_mysql_bin_log_name[]; +/** Binlog file position, or -1 if unknown */ +extern ib_int64_t trx_sys_mysql_bin_log_pos; +/* @} */ + +/** The transaction system */ +extern trx_sys_t* trx_sys; + +/***************************************************************//** +Checks if a page address is the trx sys header page. +@return TRUE if trx sys header page */ +UNIV_INLINE +ibool +trx_sys_hdr_page( +/*=============*/ + ulint space, /*!< in: space */ + ulint page_no);/*!< in: page number */ +/*****************************************************************//** +Creates and initializes the central memory structures for the transaction +system. This is called when the database is started. +@return min binary heap of rsegs to purge */ +UNIV_INTERN +ib_bh_t* +trx_sys_init_at_db_start(void); +/*==========================*/ +/*****************************************************************//** +Creates the trx_sys instance and initializes ib_bh and mutex. */ +UNIV_INTERN +void +trx_sys_create(void); +/*================*/ +/*****************************************************************//** +Creates and initializes the transaction system at the database creation. */ +UNIV_INTERN +void +trx_sys_create_sys_pages(void); +/*==========================*/ +/****************************************************************//** +Looks for a free slot for a rollback segment in the trx system file copy. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +trx_sysf_rseg_find_free( +/*====================*/ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Gets the pointer in the nth slot of the rseg array. 
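A typical traversal of the rseg slot array, sketched for illustration (trx_sys and TRX_SYS_N_RSEGS are both declared in this header; the loop body is a placeholder):

	ulint		i;
	trx_rseg_t*	rseg;

	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
		rseg = trx_sys_get_nth_rseg(trx_sys, i);

		if (rseg == NULL) {
			continue;	/* slot not in use */
		}
		/* ... e.g. accumulate rseg->curr_size ... */
	}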
+@return pointer to rseg object, NULL if slot not in use */ +UNIV_INLINE +trx_rseg_t* +trx_sys_get_nth_rseg( +/*=================*/ + trx_sys_t* sys, /*!< in: trx system */ + ulint n); /*!< in: index of slot */ +/**********************************************************************//** +Gets a pointer to the transaction system file copy and x-locks its page. +@return pointer to system file copy, page x-locked */ +UNIV_INLINE +trx_sysf_t* +trx_sysf_get( +/*=========*/ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Gets the space of the nth rollback segment slot in the trx system +file copy. +@return space id */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Gets the page number of the nth rollback segment slot in the trx system +file copy. +@return page number, FIL_NULL if slot unused */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Sets the space id of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + ulint space, /*!< in: space id */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Sets the page number of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + ulint page_no, /*!< in: page number, FIL_NULL if + the slot is reset to unused */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Allocates a new transaction id. +@return new, allocated trx id */ +UNIV_INLINE +trx_id_t +trx_sys_get_new_trx_id(void); +/*========================*/ +/*****************************************************************//** +Determines the maximum transaction id. +@return maximum currently allocated trx id; will be stale after the +next call to trx_sys_get_new_trx_id() */ +UNIV_INLINE +trx_id_t +trx_sys_get_max_trx_id(void); +/*========================*/ + +/*************************************************************//** +Find a slot for a given trx ID in a descriptors array. +@return: slot pointer */ +UNIV_INLINE +trx_id_t* +trx_find_descriptor( +/*================*/ + const trx_id_t* descriptors, /*!< in: descriptors array */ + ulint n_descr, /*!< in: array size */ + trx_id_t trx_id); /*!< in: trx pointer */ + +#ifdef UNIV_DEBUG +/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ +extern uint trx_rseg_n_slots_debug; +#endif + +/*****************************************************************//** +Writes a trx id to an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_write_... 
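On disk a transaction id occupies DATA_TRX_ID_LEN = 6 bytes, big-endian (the implementation in trx0sys.ic below delegates to mach_write_to_6). A byte-level sketch of what the wrapper does, for illustration only:

	void
	write_trx_id_sketch(byte* ptr, trx_id_t id)
	{
		ptr[0] = (byte) (id >> 40);	/* most significant byte */
		ptr[1] = (byte) (id >> 32);
		ptr[2] = (byte) (id >> 24);
		ptr[3] = (byte) (id >> 16);
		ptr[4] = (byte) (id >> 8);
		ptr[5] = (byte) (id);		/* least significant byte */
	}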
*/ +UNIV_INLINE +void +trx_write_trx_id( +/*=============*/ + byte* ptr, /*!< in: pointer to memory where written */ + trx_id_t id); /*!< in: id */ +/*****************************************************************//** +Reads a trx id from an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_read_... +@return id */ +UNIV_INLINE +trx_id_t +trx_read_trx_id( +/*============*/ + const byte* ptr); /*!< in: pointer to memory from where to read */ +/****************************************************************//** +Looks for the trx instance with the given id in the rw trx_list. +The caller must be holding trx_sys->mutex. +@return the trx handle or NULL if not found; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_get_rw_trx_by_id( +/*=================*/ + trx_id_t trx_id);/*!< in: trx id to search for */ +/****************************************************************//** +Returns the minimum trx id in rw trx list. This is the smallest id for which +the trx can possibly be active. (But, you must look at the trx->state to +find out if the minimum trx id transaction itself is active, or already +committed.) +@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ +UNIV_INLINE +trx_id_t +trx_rw_min_trx_id(void); +/*===================*/ +/****************************************************************//** +Returns pointer to a transaction instance if a rw transaction with the given id +is active. Caller must hold trx_sys->mutex. If the caller is not holding +lock_sys->mutex, the transaction may already have been committed. +@return transaction instance if active, or NULL; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_rw_get_active_trx_by_id( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt); /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +/****************************************************************//** +Checks if a rw transaction with the given id is active. Caller must hold +trx_sys->mutex. If the caller is not holding lock_sys->mutex, the +transaction may already have been committed. +@return true if a rw transaction with the given id is active. */ +UNIV_INLINE +bool +trx_rw_is_active_low( +/*=================*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt); /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +/****************************************************************//** +Checks if a rw transaction with the given id is active. If the caller is +not holding lock_sys->mutex, the transaction may already have been +committed. +@return true if a rw transaction with the given id is active. */ +UNIV_INLINE +bool +trx_rw_is_active( +/*=============*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt); /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +#ifdef UNIV_DEBUG +/****************************************************************//** +Checks whether a trx is in one of rw_trx_list or ro_trx_list.
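The corrupt out-flag of the two checks above separates "not active" from "impossible id". An illustrative caller pattern:

	ibool	corrupt = FALSE;

	if (trx_rw_is_active(trx_id, &corrupt)) {
		/* a read-write transaction with this id exists; unless
		lock_sys->mutex is held it may commit at any moment */
	} else if (corrupt) {
		/* trx_id >= trx_sys->max_trx_id: the id read from the
		page cannot have been assigned yet, i.e. corruption */
	}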
+@return TRUE if is in */ +UNIV_INTERN +ibool +trx_in_trx_list( +/*============*/ + const trx_t* in_trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/***********************************************************//** +Assert that a transaction has been recovered. +@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ + __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +/*****************************************************************//** +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave updates the latest master binlog position up to which +replication has proceeded. */ +UNIV_INTERN +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + const char* file_name,/*!< in: MySQL log file name */ + ib_int64_t offset, /*!< in: position in that log file */ + ulint field, /*!< in: offset of the MySQL log info field in + the trx sys header */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Prints to stderr the MySQL binlog offset info in the trx system header if +the magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset(void); +/*===================================*/ +/*****************************************************************//** +Prints to stderr the MySQL master log offset info in the trx system header if +the magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_master_log_pos(void); +/*====================================*/ +/*****************************************************************//** +Initializes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_init(void); +/*==========================*/ +/*****************************************************************//** +Closes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_close(void); +/*===========================*/ +/********************************************************************//** +Tags the system table space with minimum format id if it has not been +tagged yet. +WARNING: This function is only called during the startup and AFTER the +redo log application during recovery has finished. */ +UNIV_INTERN +void +trx_sys_file_format_tag_init(void); +/*==============================*/ +/*****************************************************************//** +Shutdown/Close the transaction system. */ +UNIV_INTERN +void +trx_sys_close(void); +/*===============*/ +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id); /*!< in: id of the file format */ +/*****************************************************************//** +Set the file format id unconditionally except if it's already the +same value. +@return TRUE if value updated */ +UNIV_INTERN +ibool +trx_sys_file_format_max_set( +/*========================*/ + ulint format_id, /*!< in: file format id */ + const char** name); /*!< out: max file format name or + NULL if not needed. 
*/ +/********************************************************************* +Creates the rollback segments +@return number of rollback segments that are active. */ +UNIV_INTERN +ulint +trx_sys_create_rsegs( +/*=================*/ + ulint n_spaces, /*!< number of tablespaces for UNDO logs */ + ulint n_rsegs); /*!< number of rollback segments to create */ +/*****************************************************************//** +Get the number of transaction in the system, independent of their state. +@return count of transactions in trx_sys_t::trx_list */ +UNIV_INLINE +ulint +trx_sys_get_n_rw_trx(void); +/*======================*/ + +/********************************************************************* +Check if there are any active (non-prepared) transactions. +@return total number of active transactions or 0 if none */ +UNIV_INTERN +ulint +trx_sys_any_active_transactions(void); +/*=================================*/ +#else /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Prints to stderr the MySQL binlog info in the system header if the +magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset_from_page( +/*========================================*/ + const byte* page); /*!< in: buffer containing the trx + system header page, i.e., page number + TRX_SYS_PAGE_NO in the tablespace */ +/*****************************************************************//** +Reads the file format id from the first system table space file. +Even if the call succeeds and returns TRUE, the returned format id +may be ULINT_UNDEFINED signalling that the format id was not present +in the data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_file_format_id( +/*========================*/ + const char *pathname, /*!< in: pathname of the first system + table space file */ + ulint *format_id); /*!< out: file format of the system table + space */ +/*****************************************************************//** +Reads the file format id from the given per-table data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_pertable_file_format_id( +/*=================================*/ + const char *pathname, /*!< in: pathname of a per-table + datafile */ + ulint *format_id); /*!< out: file format of the per-table + data file */ +#endif /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the max format name */ +UNIV_INTERN +const char* +trx_sys_file_format_max_get(void); +/*=============================*/ +/*****************************************************************//** +Check for the max file format tag stored on disk. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +trx_sys_file_format_max_check( +/*==========================*/ + ulint max_format_id); /*!< in: the max format id to check */ +/********************************************************************//** +Update the file format tag in the system tablespace only if the given +format id is greater than the known max id. +@return TRUE if format_id was bigger than the known max id */ +UNIV_INTERN +ibool +trx_sys_file_format_max_upgrade( +/*============================*/ + const char** name, /*!< out: max file format name */ + ulint format_id); /*!< in: file format identifier */ +/*****************************************************************//** +Get the name representation of the file format from its id. 
+@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id); /*!< in: id of the file format */ + +#ifdef UNIV_DEBUG +/*************************************************************//** +Validate the trx_sys_t::trx_list. */ +UNIV_INTERN +ibool +trx_sys_validate_trx_list(void); +/*===========================*/ +#endif /* UNIV_DEBUG */ + +/* The automatically created system rollback segment has this id */ +#define TRX_SYS_SYSTEM_RSEG_ID 0 + +/* Space id and page no where the trx system file copy resides */ +#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */ +#include "fsp0fsp.h" +#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO + +/* The offset of the transaction system header on the page */ +#define TRX_SYS FSEG_PAGE_DATA + +/** Transaction system header */ +/*------------------------------------------------------------- @{ */ +#define TRX_SYS_TRX_ID_STORE 0 /*!< the maximum trx id or trx + number modulo + TRX_SYS_TRX_ID_UPDATE_MARGIN + written to a file page by any + transaction; the assignment of + transaction ids continues from + this number rounded up by + TRX_SYS_TRX_ID_UPDATE_MARGIN + plus + TRX_SYS_TRX_ID_UPDATE_MARGIN + when the database is + started */ +#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the + tablespace segment the trx + system is created into */ +#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE) + /*!< the start of the array of + rollback segment specification + slots */ +/*------------------------------------------------------------- @} */ + +/* Max number of rollback segments: the number of segment specification slots +in the transaction system array; rollback segment id must fit in one (signed) +byte, therefore 128; each slot is currently 8 bytes in size. If you want +to raise the level to 256 then you will need to fix some assertions that +impose the 7 bit restriction. e.g., mach_write_to_3() */ +#define TRX_SYS_N_RSEGS 128 +/* Originally, InnoDB defined TRX_SYS_N_RSEGS as 256 but created only one +rollback segment. It initialized some arrays with this number of entries. +We must remember this limit in order to keep file compatibility. */ +#define TRX_SYS_OLD_N_RSEGS 256 + +/** Maximum length of MySQL binlog file name, in bytes. 
+@see trx_sys_mysql_master_log_name +@see trx_sys_mysql_bin_log_name */ +#define TRX_SYS_MYSQL_LOG_NAME_LEN 512 +/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */ +#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 + +#if UNIV_PAGE_SIZE_MIN < 4096 +# error "UNIV_PAGE_SIZE_MIN < 4096" +#endif +/** The offset of the MySQL replication info in the trx system header; +this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */ +#define TRX_SYS_MYSQL_MASTER_LOG_INFO (UNIV_PAGE_SIZE - 2000) + +/** The offset of the MySQL binlog offset info in the trx system header */ +#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000) +#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is + TRX_SYS_MYSQL_LOG_MAGIC_N + if we have valid data in the + MySQL binlog info */ +#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH 4 /*!< high 4 bytes of the offset + within that file */ +#define TRX_SYS_MYSQL_LOG_OFFSET_LOW 8 /*!< low 4 bytes of the offset + within that file */ +#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ + +/** Doublewrite buffer */ +/* @{ */ +/** The offset of the doublewrite buffer header on the trx system header page */ +#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200) +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg + containing the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE + /*!< 4-byte magic number which + shows if we already have + created the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE) + /*!< page number of the + first page in the first + sequence of 64 + (= FSP_EXTENT_SIZE) consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE) + /*!< page number of the + first page in the second + sequence of 64 consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_BLOCK1, + TRX_SYS_DOUBLEWRITE_BLOCK2 + so that if the trx sys + header is half-written + to disk, we still may + be able to recover the + information */ +/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, +we must reset the doublewrite buffer, because starting from 4.1.x the +space id of a data page is stored into +FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE) + +/*-------------------------------------------------------------*/ +/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */ +#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 +/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */ +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386 + +/** Size of the doublewrite block in pages */ +#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE +/* @} */ + +/** File format tag */ +/* @{ */ +/** The offset of the file format tag on the trx system header page +(TRX_SYS_PAGE_NO of TRX_SYS_SPACE) */ +#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16) + +/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format +identifier is added to this constant. */ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL +/** Contents of TRX_SYS_FILE_FORMAT_TAG+4 when valid */ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL +/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format +identifier is added to this 64-bit constant. 
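Since the stored tag is simply the 64-bit magic defined next plus the format id, decoding it is a subtraction. A sketch, with validity checking omitted (page is a placeholder pointing at the trx system header page; mach_read_from_8 is the standard 8-byte big-endian reader defined elsewhere in the tree):

	format_id = mach_read_from_8(page + TRX_SYS_FILE_FORMAT_TAG)
		  - TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;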
*/ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N \ + ((ib_uint64_t) TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH << 32 \ + | TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW) +/* @} */ + +#define TRX_DESCR_ARRAY_INITIAL_SIZE 1000 + +#ifndef UNIV_HOTBACKUP +/** The transaction system central memory data structure. */ +struct trx_sys_t{ + + ib_mutex_t mutex; /*!< mutex protecting most fields in + this structure except when noted + otherwise */ + ulint n_prepared_trx; /*!< Number of transactions currently + in the XA PREPARED state */ + ulint n_prepared_recovered_trx; /*!< Number of transactions + currently in XA PREPARED state that are + also recovered. Such transactions cannot + be added during runtime. They can only + occur after recovery if mysqld crashed + while there were XA PREPARED + transactions. We disable query cache + if such transactions exist. */ + trx_id_t max_trx_id; /*!< The smallest number not yet + assigned as a transaction id or + transaction number */ + char pad1[64]; /*!< Ensure max_trx_id does not share + cache line with other fields. */ + trx_id_t* descriptors; /*!< Array of trx descriptors */ + ulint descr_n_max; /*!< The current size of the descriptors + array. */ + char pad2[64]; /*!< Ensure static descriptor fields + do not share cache lines with + descr_n_used */ + ulint descr_n_used; /*!< Number of used elements in the + descriptors array. */ + char pad3[64]; /*!< Ensure descriptors do not share + cache line with other fields */ +#ifdef UNIV_DEBUG + trx_id_t rw_max_trx_id; /*!< Max trx id of read-write transactions + which exist or existed */ +#endif + trx_list_t rw_trx_list; /*!< List of active and committed in + memory read-write transactions, sorted + on trx id, biggest first. Recovered + transactions are always on this list. */ + char pad4[64]; /*!< Ensure list base nodes do not + share cache line with other fields */ + trx_list_t ro_trx_list; /*!< List of active and committed in + memory read-only transactions, sorted + on trx id, biggest first. NOTE: + The order for read-only transactions + is not necessary. We should exploit + this and increase concurrency during + add/remove. */ + char pad5[64]; /*!< Ensure list base nodes do not + share cache line with other fields */ + trx_list_t mysql_trx_list; /*!< List of transactions created + for MySQL. All transactions on + ro_trx_list are on mysql_trx_list. The + rw_trx_list can contain system + transactions and recovered transactions + that will not be in the mysql_trx_list. + There can be active non-locking + auto-commit read only transactions that + are on this list but not on ro_trx_list. + mysql_trx_list may additionally contain + transactions that have not yet been + started in InnoDB. 
*/ + char pad6[64]; /*!< Ensure list base nodes do not + share cache line with other fields */ + trx_list_t trx_serial_list; + /*!< trx->no ordered List of + transactions in either TRX_PREPARED or + TRX_ACTIVE which have already been + assigned a serialization number */ + char pad7[64]; /*!< Ensure list base nodes do not + share cache line with other fields */ + trx_rseg_t* const rseg_array[TRX_SYS_N_RSEGS]; + /*!< Pointer array to rollback + segments; NULL if slot not in use; + created and destroyed in + single-threaded mode; not protected + by any mutex, because it is read-only + during multi-threaded operation */ + ulint rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY + list (update undo logs for committed + transactions), protected by + rseg->mutex */ + UT_LIST_BASE_NODE_T(read_view_t) view_list; + /*!< List of read views sorted + on trx no, biggest first */ +}; + +/** When a trx id which is zero modulo this number (which must be a power of +two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system +page is updated */ +#define TRX_SYS_TRX_ID_WRITE_MARGIN 256 +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic new file mode 100644 index 00000000000..699148cff6d --- /dev/null +++ b/storage/xtradb/include/trx0sys.ic @@ -0,0 +1,565 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0sys.ic +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0trx.h" +#include "data0type.h" +#ifndef UNIV_HOTBACKUP +# include "srv0srv.h" +# include "mtr0log.h" + +/* The typedef for rseg slot in the file copy */ +typedef byte trx_sysf_rseg_t; + +/* Rollback segment specification slot offsets */ +/*-------------------------------------------------------------*/ +#define TRX_SYS_RSEG_SPACE 0 /* space where the segment + header is placed; starting with + MySQL/InnoDB 5.1.7, this is + UNIV_UNDEFINED if the slot is unused */ +#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the segment + header is placed; this is FIL_NULL + if the slot is unused */ +/*-------------------------------------------------------------*/ +/* Size of a rollback segment specification slot */ +#define TRX_SYS_RSEG_SLOT_SIZE 8 + +/*****************************************************************//** +Writes the value of max_trx_id to the file based trx system header. 
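A worked example of the margin mechanics, using the value of 256 defined above:

	/* The store is flushed whenever the id about to be handed out is
	a multiple of 256 (see trx_sys_get_new_trx_id() below). If the
	server crashes with an in-memory max_trx_id of 5000, the last
	value flushed was 4864, and any id below 5120 (= 4864 + 256) may
	already have been given out. Rounding the stored value up to the
	margin and adding one more margin, as described at
	TRX_SYS_TRX_ID_STORE, resumes assignment at 5120, safely past
	every id that can have been used. */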
*/ +UNIV_INTERN +void +trx_sys_flush_max_trx_id(void); +/*==========================*/ + +/***************************************************************//** +Checks if a page address is the trx sys header page. +@return TRUE if trx sys header page */ +UNIV_INLINE +ibool +trx_sys_hdr_page( +/*=============*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + if ((space == TRX_SYS_SPACE) && (page_no == TRX_SYS_PAGE_NO)) { + + return(TRUE); + } + + return(FALSE); +} + +/***************************************************************//** +Gets the pointer in the nth slot of the rseg array. +@return pointer to rseg object, NULL if slot not in use */ +UNIV_INLINE +trx_rseg_t* +trx_sys_get_nth_rseg( +/*=================*/ + trx_sys_t* sys, /*!< in: trx system */ + ulint n) /*!< in: index of slot */ +{ + ut_ad(n < TRX_SYS_N_RSEGS); + + return(sys->rseg_array[n]); +} + +/**********************************************************************//** +Gets a pointer to the transaction system header and x-latches its page. +@return pointer to system header, page x-latched. */ +UNIV_INLINE +trx_sysf_t* +trx_sysf_get( +/*=========*/ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + trx_sysf_t* header; + + ut_ad(mtr); + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + header = TRX_SYS + buf_block_get_frame(block); + + return(header); +} + +/*****************************************************************//** +Gets the space of the nth rollback segment slot in the trx system +file copy. +@return space id */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys header */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr)); +} + +/*****************************************************************//** +Gets the page number of the nth rollback segment slot in the trx system +header. +@return page number, FIL_NULL if slot unused */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx system header */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr)); +} + +/*****************************************************************//** +Sets the space id of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + ulint space, /*!< in: space id */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + mlog_write_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE, + space, + MLOG_4BYTES, mtr); +} + +/*****************************************************************//** +Sets the page number of the nth rollback segment slot in the trx system +header. 
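Concretely, with the stock values TRX_SYS = FSEG_PAGE_DATA = 38 and FSEG_HEADER_SIZE = 10 (assumed here; both are defined elsewhere in the tree), the accessors above address rseg specification slot i at:

	/* absolute byte offsets on the trx system page */
	space_field   = 38 + (8 + 10) + i * 8;		/* + TRX_SYS_RSEG_SPACE   */
	page_no_field = 38 + (8 + 10) + i * 8 + 4;	/* + TRX_SYS_RSEG_PAGE_NO */
	/* slot 0 thus occupies bytes 56..63: space id first, then page no */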
*/ +UNIV_INLINE +void +trx_sysf_rseg_set_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx sys header */ + ulint i, /*!< in: slot index == rseg id */ + ulint page_no, /*!< in: page number, FIL_NULL if the + slot is reset to unused */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + mlog_write_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_PAGE_NO, + page_no, + MLOG_4BYTES, mtr); +} +#endif /* !UNIV_HOTBACKUP */ + +/*****************************************************************//** +Writes a trx id to an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_trx_id( +/*=============*/ + byte* ptr, /*!< in: pointer to memory where written */ + trx_id_t id) /*!< in: id */ +{ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(ptr, id); +} + +#ifndef UNIV_HOTBACKUP +/*****************************************************************//** +Reads a trx id from an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_read_... +@return id */ +UNIV_INLINE +trx_id_t +trx_read_trx_id( +/*============*/ + const byte* ptr) /*!< in: pointer to memory from where to read */ +{ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + return(mach_read_from_6(ptr)); +} + +/****************************************************************//** +Looks for the trx handle with the given id in rw_trx_list. +The caller must be holding trx_sys->mutex. +@return the trx handle or NULL if not found; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_get_rw_trx_by_id( +/*=================*/ + trx_id_t trx_id) /*!< in: trx id to search for */ +{ + trx_t* trx; + ulint len; + trx_t* first; + + ut_ad(mutex_own(&trx_sys->mutex)); + + len = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + + if (len == 0) { + return(NULL); + } + + /* Because the list is ordered on trx id in descending order, + we try to speed things up a bit. */ + + trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + assert_trx_in_rw_list(trx); + + if (trx_id == trx->id) { + return(trx); + } else if (len == 1 || trx_id > trx->id) { + return(NULL); + } + + first = trx; + + trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list); + assert_trx_in_rw_list(trx); + + if (trx_id == trx->id) { + return(trx); + } else if (len == 2 || trx_id < trx->id) { + return(NULL); + } + + /* Search the list from the lower end (tail). */ + if (trx_id < (first->id + trx->id) >> 1) { + for (trx = UT_LIST_GET_PREV(trx_list, trx); + trx != NULL && trx_id > trx->id; + trx = UT_LIST_GET_PREV(trx_list, trx)) { + assert_trx_in_rw_list(trx); + } + } else { + for (trx = UT_LIST_GET_NEXT(trx_list, first); + trx != NULL && trx_id < trx->id; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + assert_trx_in_rw_list(trx); + } + } + + return((trx != NULL && trx->id == trx_id) ? trx : NULL); +} + +/****************************************************************//** +Returns the minimum trx id in trx list. This is the smallest id for which +the trx can possibly be active. (But, you must look at the trx->state +to find out if the minimum trx id transaction itself is active, or already +committed.). The caller must be holding the trx_sys_t::mutex in shared mode. 
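A worked trace of the two-ended search in trx_get_rw_trx_by_id() above, with a hypothetical rw_trx_list holding ids 50, 40, 30, 20, 10 (descending, as the list is ordered) and trx_id = 20:

	/* head (50): no match, and 20 < 50, so 20 may be in the list;
	tail (10): no match, and 20 > 10, so keep searching;
	midpoint (50 + 10) >> 1 == 30: since 20 < 30, walk backwards
	from the tail with UT_LIST_GET_PREV; the first node visited
	has id 20, the loop guard (trx_id > trx->id) fails at once,
	and that node is returned. */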
+@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ +UNIV_INLINE +trx_id_t +trx_rw_min_trx_id_low(void) +/*=======================*/ +{ + trx_id_t id; + const trx_t* trx; + + ut_ad(mutex_own(&trx_sys->mutex)); + + trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list); + + if (trx == NULL) { + id = trx_sys->max_trx_id; + } else { + assert_trx_in_rw_list(trx); + id = trx->id; + } + + return(id); +} + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/***********************************************************//** +Assert that a transaction has been recovered. +@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ +{ + const trx_t* trx; + + mutex_enter(&trx_sys->mutex); + + trx = trx_get_rw_trx_by_id(trx_id); + ut_a(trx->is_recovered); + + mutex_exit(&trx_sys->mutex); + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + +/****************************************************************//** +Returns the minimum trx id in rw trx list. This is the smallest id for which +the rw trx can possibly be active. (But, you must look at the trx->state +to find out if the minimum trx id transaction itself is active, or already +committed.) +@return the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */ +UNIV_INLINE +trx_id_t +trx_rw_min_trx_id(void) +/*===================*/ +{ + trx_id_t id; + + mutex_enter(&trx_sys->mutex); + + id = trx_rw_min_trx_id_low(); + + mutex_exit(&trx_sys->mutex); + + return(id); +} + +/****************************************************************//** +Returns pointer to a transaction instance if a rw transaction with the given id +is active. Caller must hold trx_sys->mutex. If the caller is not holding +lock_sys->mutex, the transaction may already have been committed. +@return transaction instance if active, or NULL; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_rw_get_active_trx_by_id( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt) /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +{ + trx_t* trx; + + ut_ad(mutex_own(&trx_sys->mutex)); + + if (trx_id < trx_rw_min_trx_id_low()) { + + trx = NULL; + } else if (trx_id >= trx_sys->max_trx_id) { + + /* There must be corruption: we let the caller handle the + diagnostic prints in this case. */ + + trx = NULL; + if (corrupt != NULL) { + *corrupt = TRUE; + } + } else { + trx = trx_get_rw_trx_by_id(trx_id); + + if (trx != NULL + && trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) { + + trx = NULL; + } + } + + return(trx); +} + +/****************************************************************//** +Checks if a rw transaction with the given id is active. Caller must hold +trx_sys->mutex. If the caller is not holding lock_sys->mutex, the +transaction may already have been committed. +@return true if rw transaction it with a given id is active. */ +UNIV_INLINE +bool +trx_rw_is_active_low( +/*=================*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt) /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + if (UNIV_UNLIKELY(trx_id >= trx_sys->max_trx_id)) { + + /* There must be corruption: we let the caller handle the + diagnostic prints in this case. 
*/ + + if (corrupt != NULL) { + *corrupt = TRUE; + } + + return(false); + } + + return(trx_find_descriptor(trx_sys->descriptors, trx_sys->descr_n_used, + trx_id) != NULL); +} + +/****************************************************************//** +Checks if a rw transaction with the given id is active. If the caller is +not holding lock_sys->mutex, the transaction may already have been +committed. +@return true if rw transaction it with a given id is active. */ +UNIV_INLINE +bool +trx_rw_is_active( +/*=============*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt) /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +{ + bool res; + + mutex_enter(&trx_sys->mutex); + + res = trx_rw_is_active_low(trx_id, corrupt); + + mutex_exit(&trx_sys->mutex); + + return(res); +} + +/*****************************************************************//** +Allocates a new transaction id. +@return new, allocated trx id */ +UNIV_INLINE +trx_id_t +trx_sys_get_new_trx_id(void) +/*========================*/ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if + will evaluate to TRUE when this function is first time called, + and the value for trx id will be written to disk-based header! + Thus trx id values will not overlap when the database is + repeatedly started! */ + + if (!(trx_sys->max_trx_id % (trx_id_t) TRX_SYS_TRX_ID_WRITE_MARGIN)) { + + trx_sys_flush_max_trx_id(); + } + + return(trx_sys->max_trx_id++); +} + +/*****************************************************************//** +Determines the maximum transaction id. +@return maximum currently allocated trx id; will be stale after the +next call to trx_sys_get_new_trx_id() */ +UNIV_INLINE +trx_id_t +trx_sys_get_max_trx_id(void) +/*========================*/ +{ +#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN + trx_id_t max_trx_id; +#endif + + ut_ad(!mutex_own(&trx_sys->mutex)); + +#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN + /* Avoid torn reads. */ + mutex_enter(&trx_sys->mutex); + max_trx_id = trx_sys->max_trx_id; + mutex_exit(&trx_sys->mutex); + return(max_trx_id); +#else + /* Perform a dirty read. Callers should be prepared for stale + values, and we know that the value fits in a machine word, so + that it will be read and written atomically. */ + return(trx_sys->max_trx_id); +#endif +} + +/*****************************************************************//** +Get the number of transaction in the system, independent of their state. +@return count of transactions in trx_sys_t::rw_trx_list */ +UNIV_INLINE +ulint +trx_sys_get_n_rw_trx(void) +/*======================*/ +{ + ulint n_trx; + + mutex_enter(&trx_sys->mutex); + + n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + + mutex_exit(&trx_sys->mutex); + + return(n_trx); +} + + +/*************************************************************//** +Find a slot for a given trx ID in a descriptors array. 
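The implementation that follows delegates to bsearch(), so trx_descr_cmp (defined elsewhere in the tree, not shown in this diff) must impose the order the descriptors array is kept in, presumably ascending by id since ids are assigned monotonically. A sketch of what such a comparator looks like:

	static int
	trx_descr_cmp_sketch(const void* a, const void* b)
	{
		const trx_id_t*	da = (const trx_id_t*) a;
		const trx_id_t*	db = (const trx_id_t*) b;

		/* three-way compare, avoiding overflow-prone subtraction */
		return(*da < *db ? -1 : *da > *db);
	}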
+@return: slot pointer */ +UNIV_INLINE +trx_id_t* +trx_find_descriptor( +/*================*/ + const trx_id_t* descriptors, /*!< in: descriptors array */ + ulint n_descr, /*!< in: array size */ + trx_id_t trx_id) /*!< in: trx id */ +{ + ut_ad(descriptors != trx_sys->descriptors || + mutex_own(&trx_sys->mutex)); + + if (UNIV_UNLIKELY(n_descr == 0)) { + + return(NULL); + } + + return((trx_id_t *) bsearch(&trx_id, descriptors, n_descr, + sizeof(trx_id_t), trx_descr_cmp)); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h new file mode 100644 index 00000000000..9ae25ef2d7d --- /dev/null +++ b/storage/xtradb/include/trx0trx.h @@ -0,0 +1,1162 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.h +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0trx_h +#define trx0trx_h + +#include "univ.i" +#include "trx0types.h" +#include "dict0types.h" +#ifndef UNIV_HOTBACKUP +#include "lock0types.h" +#include "log0log.h" +#include "usr0types.h" +#include "que0types.h" +#include "mem0mem.h" +#include "read0types.h" +#include "trx0xa.h" +#include "ut0vec.h" +#include "fts0fts.h" + +/** Dummy session used currently in MySQL interface */ +extern sess_t* trx_dummy_sess; + +/********************************************************************//** +In XtraDB it is impossible for a transaction to own a search latch outside of +InnoDB code, so there is nothing to release on demand. We keep this function to +simplify maintenance.*/ +UNIV_INLINE +void +trx_search_latch_release_if_reserved( +/*=================================*/ + trx_t* trx __attribute__((unused))); /*!< in: transaction */ +/******************************************************************//** +Set detailed error message for the transaction. */ +UNIV_INTERN +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /*!< in: transaction struct */ + const char* msg); /*!< in: detailed error message */ +/*************************************************************//** +Set detailed error message for the transaction from a file. Note that the +file is rewinded before reading from it. */ +UNIV_INTERN +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /*!< in: transaction struct */ + FILE* file); /*!< in: file to read message from */ +/****************************************************************//** +Retrieves the error_info field from a trx. 
+@return the error info */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + const trx_t* trx); /*!< in: trx object */ +/********************************************************************//** +Creates a transaction object for MySQL. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_mysql(void); +/*========================*/ +/********************************************************************//** +Creates a transaction object for background operations by the master thread. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_background(void); +/*=============================*/ +/********************************************************************//** +Frees a transaction object of a background operation of the master thread. */ +UNIV_INTERN +void +trx_free_for_background( +/*====================*/ + trx_t* trx); /*!< in, own: trx object */ +/********************************************************************//** +At shutdown, frees a transaction object that is in the PREPARED state. */ +UNIV_INTERN +void +trx_free_prepared( +/*==============*/ + trx_t* trx) /*!< in, own: trx object */ + UNIV_COLD __attribute__((nonnull)); +/********************************************************************//** +Frees a transaction object for MySQL. */ +UNIV_INTERN +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx); /*!< in, own: trx object */ +/****************************************************************//** +Creates trx objects for transactions and initializes the trx list of +trx_sys at database start. Rollback segment and undo log lists must +already exist when this function is called, because the lists of +transactions to be rolled back or cleaned up are built based on the +undo log lists. */ +UNIV_INTERN +void +trx_lists_init_at_db_start(void); +/*============================*/ + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started_xa(t) \ + { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_xa_low((t)); \ + } +#else +#define trx_start_if_not_started_xa(t) \ + trx_start_if_not_started_xa_low((t)) +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_xa_low( +/*============================*/ + trx_t* trx); /*!< in: transaction */ +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_low( +/*=========================*/ + trx_t* trx); /*!< in: transaction */ + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started(t) \ + { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_low((t)); \ + } +#else +#define trx_start_if_not_started(t) \ + trx_start_if_not_started_low((t)) +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Starts the transaction for a DDL operation. 
*/ +UNIV_INTERN +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op) /*!< in: dictionary operation type */ + __attribute__((nonnull)); + +#ifdef UNIV_DEBUG +#define trx_start_for_ddl(t, o) \ + { \ + ut_ad((t)->start_file == 0); \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_for_ddl_low((t), (o)); \ + } +#else +#define trx_start_for_ddl(t, o) \ + trx_start_for_ddl_low((t), (o)) +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit( +/*=======*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/****************************************************************//** +Commits a transaction and a mini-transaction. */ +UNIV_INTERN +void +trx_commit_low( +/*===========*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction (will be committed), + or NULL if trx made no modifications */ + __attribute__((nonnull(1))); +/****************************************************************//** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, and we cannot roll it back. */ +UNIV_INTERN +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx); /*!< in/out: transaction */ +/**********************************************************************//** +Does the transaction prepare for MySQL. */ +UNIV_INTERN +void +trx_prepare_for_mysql( +/*==================*/ + trx_t* trx); /*!< in/out: trx handle */ +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions */ +UNIV_INTERN +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + ulint len); /*!< in: number of slots in xid_list */ +/*******************************************************************//** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state +@return trx or NULL; on match, the trx->xid will be invalidated; +note that the trx may have been committed, unless the caller is +holding lock_sys->mutex */ +UNIV_INTERN +trx_t * +trx_get_trx_by_xid( +/*===============*/ + const XID* xid); /*!< in: X/Open XA transaction identifier */ +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +UNIV_INTERN +void +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/**********************************************************************//** +Marks the latest SQL statement ended. */ +UNIV_INTERN +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx); /*!< in: trx handle */ +/********************************************************************//** +Assigns a read view for a consistent read query. 
All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a new started transaction. +@return consistent read view */ +UNIV_INTERN +read_view_t* +trx_assign_read_view( +/*=================*/ + trx_t* trx); /*!< in: active transaction */ +/********************************************************************//** +Clones the read view from another transaction. All the consistent reads within +the receiver transaction will get the same read view as the donor transaction +@return read view clone */ +UNIV_INTERN +read_view_t* +trx_clone_read_view( +/*================*/ + trx_t* trx, /*!< in: receiver transaction */ + trx_t* from_trx) /*!< in: donor transaction */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +UNIV_INTERN +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a commit command node struct. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/**********************************************************************//** +Prints info about a transaction. +Caller must hold trx_sys->mutex. */ +UNIV_INTERN +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: lock_number_of_rows_locked(&trx->lock) */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size) + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Prints info about a transaction. +The caller must hold lock_sys->mutex and trx_sys->mutex. +When possible, use trx_print() instead. */ +UNIV_INTERN +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ + __attribute__((nonnull)); + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys->mutex and trx_sys->mutex. */ +UNIV_INTERN +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ + __attribute__((nonnull)); + +/**********************************************************************//** +Determine if a transaction is a dictionary operation. 
+@return dictionary operation mode */ +UNIV_INLINE +enum trx_dict_op_t +trx_get_dict_operation( +/*===================*/ + const trx_t* trx) /*!< in: transaction */ + __attribute__((pure)); +/**********************************************************************//** +Flag a transaction a dictionary operation. */ +UNIV_INLINE +void +trx_set_dict_operation( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + enum trx_dict_op_t op); /*!< in: operation, not + TRX_DICT_OP_NONE */ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx_sys->mutex, or it must be the thread +that is serving a running transaction. +A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list +unless it is a non-locking autocommit read only transaction, which is only +in trx_sys->mysql_trx_list. +@return TRUE if trx->state == state */ +UNIV_INLINE +ibool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state) /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + __attribute__((nonnull, warn_unused_result)); +# ifdef UNIV_DEBUG +/**********************************************************************//** +Asserts that a transaction has been started. +The caller must hold trx_sys->mutex. +@return TRUE if started */ +UNIV_INTERN +ibool +trx_assert_started( +/*===============*/ + const trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. +@return TRUE if interrupted */ +UNIV_INTERN +ibool +trx_is_interrupted( +/*===============*/ + const trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +Determines if the currently running transaction is in strict mode. +@return TRUE if strict */ +UNIV_INTERN +ibool +trx_is_strict( +/*==========*/ + trx_t* trx); /*!< in: transaction */ +#else /* !UNIV_HOTBACKUP */ +#define trx_is_interrupted(trx) FALSE +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************************//** +Calculates the "weight" of a transaction. The weight of one transaction +is estimated as the number of altered rows + the number of locked rows. +@param t transaction +@return transaction weight */ +#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks)) + +/*******************************************************************//** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. +@return TRUE if weight(a) >= weight(b) */ +UNIV_INTERN +ibool +trx_weight_ge( +/*==========*/ + const trx_t* a, /*!< in: the first transaction to be compared */ + const trx_t* b); /*!< in: the second transaction to be compared */ + +/* Maximum length of a string that can be returned by +trx_get_que_state_str(). */ +#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */ + +/*******************************************************************//** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. 
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+	const trx_t*	trx);	/*!< in: transaction */
+
+/****************************************************************//**
+Assign a read-only transaction a rollback-segment, if it is attempting
+to write to a TEMPORARY table. */
+UNIV_INTERN
+void
+trx_assign_rseg(
+/*============*/
+	trx_t*		trx);	/*!< A read-only transaction that
+				needs to be assigned a RBS. */
+
+/*************************************************************//**
+Callback function for trx_find_descriptor() to compare trx IDs. */
+UNIV_INTERN
+int
+trx_descr_cmp(
+/*==========*/
+	const void *a,	/*!< in: pointer to first comparison argument */
+	const void *b);	/*!< in: pointer to second comparison argument */
+
+/*************************************************************//**
+Release a slot for a given trx in the global descriptors array. */
+UNIV_INTERN
+void
+trx_release_descriptor(
+/*===================*/
+	trx_t* trx);	/*!< in: trx pointer */
+
+/*******************************************************************//**
+Transactions that aren't started by the MySQL server don't set
+the trx_t::mysql_thd field. For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx transaction
+@return lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(trx)					\
+	((trx)->mysql_thd != NULL					\
+	 ? thd_lock_wait_timeout((trx)->mysql_thd)			\
+	 : 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+(implied read-only).
+@param t transaction
+@return true if non-locking autocommit select transaction. */
+#define trx_is_autocommit_non_locking(t)				\
+((t)->auto_commit && (t)->will_lock == 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+with an explicit check for the read-only status.
+@param t transaction
+@return true if non-locking autocommit read-only transaction. */
+#define trx_is_ac_nl_ro(t)						\
+((t)->read_only && trx_is_autocommit_non_locking((t)))
+
+/*******************************************************************//**
+Assert that the transaction is in the trx_sys_t::rw_trx_list */
+#define assert_trx_in_rw_list(t) do {					\
+	ut_ad(!(t)->read_only);						\
+	assert_trx_in_list(t);						\
+} while (0)
+
+/*******************************************************************//**
+Assert that the transaction is either in trx_sys->ro_trx_list or
+trx_sys->rw_trx_list but not both and it cannot be an autocommit
+non-locking select */
+#define assert_trx_in_list(t) do {					\
+	ut_ad((t)->in_ro_trx_list == (t)->read_only);			\
+	ut_ad((t)->in_rw_trx_list == !(t)->read_only);			\
+	ut_ad(!trx_is_autocommit_non_locking((t)));			\
+	switch ((t)->state) {						\
+	case TRX_STATE_PREPARED:					\
+		/* fall through */					\
+	case TRX_STATE_ACTIVE:						\
+	case TRX_STATE_COMMITTED_IN_MEMORY:				\
+		continue;						\
+	case TRX_STATE_NOT_STARTED:					\
+		break;							\
+	}								\
+	ut_error;							\
+} while (0)
+
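As an illustration of how these predicates and the TRX_WEIGHT() macro above
fit together, a hypothetical victim-selection helper for deadlock resolution
might look as follows; this is only a sketch, not the InnoDB implementation,
which performs the equivalent comparison inside its deadlock checker:

	/* Prefer to roll back the "lighter" transaction: fewer undo log
	records plus held row locks per TRX_WEIGHT().  trx_weight_ge()
	(declared above) additionally treats transactions that modified
	non-transactional tables as heavier, so they survive. */
	static trx_t*
	pick_deadlock_victim(trx_t* a, trx_t* b)
	{
		return(trx_weight_ge(a, b) ? b : a);
	}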
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Assert that an autocommit non-locking select cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The transaction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(t)				\
+	do {								\
+		if (trx_is_autocommit_non_locking(t)) {			\
+			trx_state_t	t_state = (t)->state;		\
+			ut_ad((t)->read_only);				\
+			ut_ad(!(t)->is_recovered);			\
+			ut_ad(!(t)->in_ro_trx_list);			\
+			ut_ad(!(t)->in_rw_trx_list);			\
+			ut_ad((t)->in_mysql_trx_list);			\
+			ut_ad(t_state == TRX_STATE_NOT_STARTED		\
+			      || t_state == TRX_STATE_ACTIVE);		\
+		} else {						\
+			assert_trx_in_list(t);				\
+		}							\
+	} while (0)
+#else /* UNIV_DEBUG */
+/*******************************************************************//**
+Assert that an autocommit non-locking select cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The transaction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(trx) ((void)0)
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Latching protocol for trx_lock_t::que_state. trx_lock_t::que_state
+captures the state of the query thread during the execution of a query.
+This is different from a transaction state. The query state of a transaction
+can be updated asynchronously by other threads. The other threads can be
+system threads, like the timeout monitor thread, or user threads executing
+other queries. Another thing to be mindful of is that there is a delay between
+when a query thread is put into LOCK_WAIT state and when it actually starts
+waiting. Between these two events it is possible that the query thread is
+granted the lock it was waiting for, which implies that the state can be changed
+asynchronously.
+
+All these operations take place within the context of locking. Therefore state
+changes within the locking code must acquire both the lock mutex and the
+trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
+trx->lock.wait_lock to non-NULL, but when the lock wait ends it is sufficient
+to only acquire the trx->mutex.
+To query the state, either of the mutexes is sufficient within the locking
+code, and no mutex is required when the query thread is no longer waiting. */
+
+/** The locks and state of an active transaction. Protected by
+lock_sys->mutex, trx->mutex or both. */
+struct trx_lock_t {
+	ulint		n_active_thrs;	/*!< number of active query threads */
+
+	trx_que_t	que_state;	/*!< valid when trx->state
+					== TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
+					TRX_QUE_LOCK_WAIT, ... */
+
+	lock_t*		wait_lock;	/*!< if trx execution state is
+					TRX_QUE_LOCK_WAIT, this points to
+					the lock request, otherwise this is
+					NULL; set to non-NULL when holding
+					both trx->mutex and lock_sys->mutex;
+					set to NULL when holding
+					lock_sys->mutex; readers should
+					hold lock_sys->mutex, except when
+					they are holding trx->mutex and
+					wait_lock==NULL */
+	ib_uint64_t	deadlock_mark;	/*!< A mark field that is initialized
+					to and checked against lock_mark_counter
+					by lock_deadlock_recursive(). */
+	ibool		was_chosen_as_deadlock_victim;
+					/*!< when the transaction decides to
+					wait for a lock, it sets this to FALSE;
+					if another transaction chooses this
+					transaction as a victim in deadlock
+					resolution, it sets this to TRUE.
+					Protected by trx->mutex. */
+	time_t		wait_started;	/*!< lock wait started at this time,
+					protected only by lock_sys->mutex */
+
+	que_thr_t*	wait_thr;	/*!< query thread belonging to this
+					trx that is in QUE_THR_LOCK_WAIT
+					state. For threads suspended in a
+					lock wait, this is protected by
+					lock_sys->mutex. Otherwise, this may
+					only be modified by the thread that is
+					serving the running transaction.
*/ + + mem_heap_t* lock_heap; /*!< memory heap for trx_locks; + protected by lock_sys->mutex */ + + UT_LIST_BASE_NODE_T(lock_t) + trx_locks; /*!< locks requested + by the transaction; + insertions are protected by trx->mutex + and lock_sys->mutex; removals are + protected by lock_sys->mutex */ + + ib_vector_t* table_locks; /*!< All table locks requested by this + transaction, including AUTOINC locks */ + + ibool cancel; /*!< TRUE if the transaction is being + rolled back either via deadlock + detection or due to lock timeout. The + caller has to acquire the trx_t::mutex + in order to cancel the locks. In + lock_trx_table_locks_remove() we + check for this cancel of a transaction's + locks and avoid reacquiring the trx + mutex to prevent recursive deadlocks. + Protected by both the lock sys mutex + and the trx_t::mutex. */ +}; + +#define TRX_MAGIC_N 91118598 + +/** The transaction handle + +Normally, there is a 1:1 relationship between a transaction handle +(trx) and a session (client connection). One session is associated +with exactly one user transaction. There are some exceptions to this: + +* For DDL operations, a subtransaction is allocated that modifies the +data dictionary tables. Lock waits and deadlocks are prevented by +acquiring the dict_operation_lock before starting the subtransaction +and releasing it after committing the subtransaction. + +* The purge system uses a special transaction that is not associated +with any session. + +* If the system crashed or it was quickly shut down while there were +transactions in the ACTIVE or PREPARED state, these transactions would +no longer be associated with a session when the server is restarted. + +A session may be served by at most one thread at a time. The serving +thread of a session might change in some MySQL implementations. +Therefore we do not have os_thread_get_curr_id() assertions in the code. + +Normally, only the thread that is currently associated with a running +transaction may access (read and modify) the trx object, and it may do +so without holding any mutex. The following are exceptions to this: + +* trx_rollback_resurrected() may access resurrected (connectionless) +transactions while the system is already processing new user +transactions. The trx_sys->mutex prevents a race condition between it +and lock_trx_release_locks() [invoked by trx_commit()]. + +* trx_print_low() may access transactions not associated with the current +thread. The caller must be holding trx_sys->mutex and lock_sys->mutex. + +* When a transaction handle is in the trx_sys->mysql_trx_list or +trx_sys->trx_list, some of its fields must not be modified without +holding trx_sys->mutex exclusively. + +* The locking code (in particular, lock_deadlock_recursive() and +lock_rec_convert_impl_to_expl()) will access transactions associated +to other connections. The locks of transactions are protected by +lock_sys->mutex and sometimes by trx->mutex. */ + +struct trx_t{ + ulint magic_n; + + ib_mutex_t mutex; /*!< Mutex protecting the fields + state and lock + (except some fields of lock, which + are protected by lock_sys->mutex) */ + + /** State of the trx from the point of view of concurrency control + and the valid state transitions. 
+ + Possible states: + + TRX_STATE_NOT_STARTED + TRX_STATE_ACTIVE + TRX_STATE_PREPARED + TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED) + + Valid state transitions are: + + Regular transactions: + * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED + + Auto-commit non-locking read-only: + * NOT_STARTED -> ACTIVE -> NOT_STARTED + + XA (2PC): + * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED + + Recovered XA: + * NOT_STARTED -> PREPARED -> COMMITTED -> (freed) + + XA (2PC) (shutdown before ROLLBACK or COMMIT): + * NOT_STARTED -> PREPARED -> (freed) + + Latching and various transaction lists membership rules: + + XA (2PC) transactions are always treated as non-autocommit. + + Transitions to ACTIVE or NOT_STARTED occur when + !in_rw_trx_list and !in_ro_trx_list (no trx_sys->mutex needed). + + Autocommit non-locking read-only transactions move between states + without holding any mutex. They are !in_rw_trx_list, !in_ro_trx_list. + + When a transaction is NOT_STARTED, it can be in_mysql_trx_list if + it is a user transaction. It cannot be in ro_trx_list or rw_trx_list. + + ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list. + The transition ACTIVE->PREPARED is protected by trx_sys->mutex. + + ACTIVE->COMMITTED is possible when the transaction is in + ro_trx_list or rw_trx_list. + + Transitions to COMMITTED are protected by both lock_sys->mutex + and trx->mutex. + + NOTE: Some of these state change constraints are an overkill, + currently only required for a consistent view for printing stats. + This unnecessarily adds a huge cost for the general case. + + NOTE: In the future we should add read only transactions to the + ro_trx_list the first time they try to acquire a lock ie. by default + we treat all read-only transactions as non-locking. */ + trx_state_t state; + + trx_lock_t lock; /*!< Information about the transaction + locks and state. Protected by + trx->mutex or lock_sys->mutex + or both */ + ulint is_recovered; /*!< 0=normal transaction, + 1=recovered, must be rolled back, + protected by trx_sys->mutex when + trx->in_rw_trx_list holds */ + + /* These fields are not protected by any mutex. */ + const char* op_info; /*!< English text describing the + current operation, or an empty + string */ + ulint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */ + ulint check_foreigns; /*!< normally TRUE, but if the user + wants to suppress foreign key checks, + (in table imports, for example) we + set this FALSE */ + /*------------------------------*/ + /* MySQL has a transaction coordinator to coordinate two phase + commit between multiple storage engines and the binary log. When + an engine participates in a transaction, it's responsible for + registering itself using the trans_register_ha() API. */ + unsigned is_registered:1;/* This flag is set to 1 after the + transaction has been registered with + the coordinator using the XA API, and + is set to 0 after commit or rollback. */ + unsigned owns_prepare_mutex:1;/* 1 if owns prepare mutex, if + this is set to 1 then registered should + also be set to 1. 
This is used in the + XA code */ + /*------------------------------*/ + ulint check_unique_secondary; + /*!< normally TRUE, but if the user + wants to speed up inserts by + suppressing unique key checks + for secondary indexes when we decide + if we can use the insert buffer for + them, we set this FALSE */ + ulint support_xa; /*!< normally we do the XA two-phase + commit steps, but by setting this to + FALSE, one can save CPU time and about + 150 bytes in the undo log size as then + we skip XA steps */ + ulint fake_changes; + ulint flush_log_later;/* In 2PC, we hold the + prepare_commit mutex across + both phases. In that case, we + defer flush of the logs to disk + until after we release the + mutex. */ + ulint must_flush_log_later;/*!< this flag is set to TRUE in + trx_commit() if flush_log_later was + TRUE, and there were modifications by + the transaction; in that case we must + flush the log in + trx_commit_complete_for_mysql() */ + ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ + bool has_search_latch; + /*!< true if this trx has latched any + search system latch in S-mode */ + ulint search_latch_timeout; + /*!< If we notice that someone is + waiting for our S-lock on the search + latch to be released, we wait in + row0sel.cc for BTR_SEA_TIMEOUT new + searches until we try to keep + the search latch again over + calls from MySQL; this is intended + to reduce contention on the search + latch */ + trx_dict_op_t dict_operation; /**< @see enum trx_dict_op */ + + /* Fields protected by the srv_conc_mutex. */ + ulint declared_to_be_inside_innodb; + /*!< this is TRUE if we have declared + this transaction in + srv_conc_enter_innodb to be inside the + InnoDB engine */ + ulint n_tickets_to_enter_innodb; + /*!< this can be > 0 only when + declared_to_... is TRUE; when we come + to srv_conc_innodb_enter, if the value + here is > 0, we decrement this by 1 */ + ulint dict_operation_lock_mode; + /*!< 0, RW_S_LATCH, or RW_X_LATCH: + the latch mode trx currently holds + on dict_operation_lock. Protected + by dict_operation_lock. */ + + trx_id_t no; /*!< transaction serialization number: + max trx id shortly before the + transaction is moved to + COMMITTED_IN_MEMORY state. + Protected by trx_sys_t::mutex + when trx->in_rw_trx_list. Initially + set to TRX_ID_MAX. */ + + time_t start_time; /*!< time the trx state last time became + TRX_STATE_ACTIVE */ + trx_id_t id; /*!< transaction id */ + XID xid; /*!< X/Open XA transaction + identification to identify a + transaction branch */ + lsn_t commit_lsn; /*!< lsn at the time of the commit */ + table_id_t table_id; /*!< Table to drop iff dict_operation + == TRX_DICT_OP_TABLE, or 0. 
*/ + /*------------------------------*/ + THD* mysql_thd; /*!< MySQL thread handle corresponding + to this trx, or NULL */ + const char* mysql_log_file_name; + /*!< if MySQL binlog is used, this field + contains a pointer to the latest file + name; this is NULL if binlog is not + used */ + ib_int64_t mysql_log_offset; + /*!< if MySQL binlog is used, this + field contains the end offset of the + binlog entry */ + time_t idle_start; + ib_int64_t last_stmt_start; + /*------------------------------*/ + ulint n_mysql_tables_in_use; /*!< number of Innobase tables + used in the processing of the current + SQL statement in MySQL */ + ulint mysql_n_tables_locked; + /*!< how many tables the current SQL + statement uses, except those + in consistent read */ + /*------------------------------*/ + UT_LIST_NODE_T(trx_t) + trx_list; /*!< list of transactions; + protected by trx_sys->mutex. + The same node is used for both + trx_sys_t::ro_trx_list and + trx_sys_t::rw_trx_list */ +#ifdef UNIV_DEBUG + /** The following two fields are mutually exclusive. */ + /* @{ */ + + ibool in_ro_trx_list; /*!< TRUE if in trx_sys->ro_trx_list */ + ibool in_rw_trx_list; /*!< TRUE if in trx_sys->rw_trx_list */ + /* @} */ +#endif /* UNIV_DEBUG */ + UT_LIST_NODE_T(trx_t) + mysql_trx_list; /*!< list of transactions created for + MySQL; protected by trx_sys->mutex */ +#ifdef UNIV_DEBUG + ibool in_mysql_trx_list; + /*!< TRUE if in + trx_sys->mysql_trx_list */ +#endif /* UNIV_DEBUG */ + UT_LIST_NODE_T(trx_t) + trx_serial_list;/*!< list node for + trx_sys->trx_serial_list */ + bool in_trx_serial_list; + /* Set when transaction is in the + trx_serial_list */ + /*------------------------------*/ + dberr_t error_state; /*!< 0 if no error, otherwise error + number; NOTE That ONLY the thread + doing the transaction is allowed to + set this field: this is NOT protected + by any mutex */ + const dict_index_t*error_info; /*!< if the error number indicates a + duplicate key error, a pointer to + the problematic index is stored here */ + ulint error_key_num; /*!< if the index creation fails to a + duplicate key error, a mysql key + number of that index is stored here */ + sess_t* sess; /*!< session of the trx, NULL if none */ + que_t* graph; /*!< query currently run in the session, + or NULL if none; NOTE that the query + belongs to the session, and it can + survive over a transaction commit, if + it is a stored procedure with a COMMIT + WORK statement, for instance */ + read_view_t* global_read_view; + /*!< consistent read view associated + to a transaction or NULL */ + read_view_t* read_view; /*!< consistent read view used in the + transaction or NULL, this read view + if defined can be normal read view + associated to a transaction (i.e. + same as global_read_view) or read view + associated to a cursor */ + read_view_t* prebuilt_view; /* pre-built view array */ + /*------------------------------*/ + UT_LIST_BASE_NODE_T(trx_named_savept_t) + trx_savepoints; /*!< savepoints set with SAVEPOINT ..., + oldest first */ + /*------------------------------*/ + ib_mutex_t undo_mutex; /*!< mutex protecting the fields in this + section (down to undo_no_arr), EXCEPT + last_sql_stat_start, which can be + accessed only when we know that there + cannot be any activity in the undo + logs! 
*/
+	undo_no_t	undo_no;	/*!< next undo log record number to
+					assign; since the undo log is
+					private for a transaction, this
+					is a simple ascending sequence
+					with no gaps; thus it represents
+					the number of modified/inserted
+					rows in a transaction */
+	trx_savept_t	last_sql_stat_start;
+					/*!< undo_no when the last sql statement
+					was started: in case of an error, trx
+					is rolled back down to this undo
+					number; see note at undo_mutex! */
+	trx_rseg_t*	rseg;		/*!< rollback segment assigned to the
+					transaction, or NULL if not assigned
+					yet */
+	trx_undo_t*	insert_undo;	/*!< pointer to the insert undo log, or
+					NULL if no inserts performed yet */
+	trx_undo_t*	update_undo;	/*!< pointer to the update undo log, or
+					NULL if no update performed yet */
+	undo_no_t	roll_limit;	/*!< least undo number to undo during
+					a rollback */
+	ulint		pages_undone;	/*!< number of undo log pages undone
+					since the last undo log truncation */
+	trx_undo_arr_t*	undo_no_arr;	/*!< array of undo numbers of undo log
+					records which are currently processed
+					by a rollback operation */
+	/*------------------------------*/
+	ulint		n_autoinc_rows;	/*!< no. of AUTO-INC rows required for
+					an SQL statement. This is useful for
+					multi-row INSERTs */
+	ib_vector_t*	autoinc_locks;	/* AUTOINC locks held by this
+					transaction. Note that these are
+					also in the lock list trx_locks. This
+					vector needs to be freed explicitly
+					when the trx instance is destroyed.
+					Protected by lock_sys->mutex. */
+	/*------------------------------*/
+	ibool		read_only;	/*!< TRUE if transaction is flagged
+					as a READ-ONLY transaction.
+					if !auto_commit || will_lock > 0
+					then it will be added to the list
+					trx_sys_t::ro_trx_list. A read only
+					transaction will not be assigned an
+					UNDO log. A non-locking auto-commit
+					read-only transaction will not be on
+					either list. */
+	ibool		auto_commit;	/*!< TRUE if it is an autocommit */
+	ulint		will_lock;	/*!< Will acquire some locks. Increment
+					each time we determine that a lock will
+					be acquired by the MySQL layer. */
+	bool		ddl;		/*!< true if it is a transaction that
+					is being started for a DDL operation */
+	/*------------------------------*/
+	fts_trx_t*	fts_trx;	/*!< FTS information, or NULL if
+					transaction hasn't modified tables
+					with FTS indexes (yet). */
+	doc_id_t	fts_next_doc_id;/* The document id used for updates */
+	/*------------------------------*/
+	ulint		flush_tables;	/*!< if "covering" a FLUSH TABLES,
+					the count of tables being flushed. */
+
+	/*------------------------------*/
+#ifdef UNIV_DEBUG
+	ulint		start_line;	/*!< Track where it was started from */
+	const char*	start_file;	/*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+	/*------------------------------*/
+	bool		api_trx;	/*!< trx started by InnoDB API */
+	bool		api_auto_commit;/*!< automatic commit */
+	bool		read_write;	/*!< if read and write operation */
+
+	/*------------------------------*/
+	char detailed_error[256];	/*!< detailed error message for last
+					error, or empty. */
+	/*------------------------------*/
+	ulint		io_reads;
+	ib_uint64_t	io_read;
+	ulint		io_reads_wait_timer;
+	ib_uint64_t	lock_que_wait_ustarted;
+	ulint		lock_que_wait_timer;
+	ulint		innodb_que_wait_timer;
+	ulint		distinct_page_access;
+#define	DPAH_SIZE	8192
+	byte*		distinct_page_access_hash;
+	ibool		take_stats;
+};
+
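The state diagram documented inside trx_t above packs a lot of rules into
prose; as a reading aid, here is a small hypothetical checker (not InnoDB
code) that encodes just the documented transitions:

	/* Mirrors trx_state_t. */
	enum trx_state { NOT_STARTED, ACTIVE, PREPARED, COMMITTED_IN_MEMORY };

	/* NOT_STARTED -> PREPARED covers XA transactions resurrected from
	the undo logs at crash recovery; ACTIVE -> NOT_STARTED covers
	auto-commit non-locking read-only transactions, which skip the
	COMMITTED_IN_MEMORY step. */
	static int
	trx_transition_is_valid(enum trx_state from, enum trx_state to)
	{
		switch (from) {
		case NOT_STARTED:
			return(to == ACTIVE || to == PREPARED);
		case ACTIVE:
			return(to == PREPARED || to == COMMITTED_IN_MEMORY
			       || to == NOT_STARTED);
		case PREPARED:
			return(to == COMMITTED_IN_MEMORY);
		case COMMITTED_IN_MEMORY:
			return(to == NOT_STARTED);
		}
		return(0);
	}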
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED	0	/* dirty read: non-locking
+						SELECTs are performed so that
+						we do not look at a possible
+						earlier version of a record;
+						thus they are not 'consistent'
+						reads under this isolation
+						level; otherwise like level
+						2 */
+
+#define TRX_ISO_READ_COMMITTED		1	/* somewhat Oracle-like
+						isolation, except that in
+						range UPDATE and DELETE we
+						must block phantom rows
+						with next-key locks;
+						SELECT ... FOR UPDATE and ...
+						LOCK IN SHARE MODE only lock
+						the index records, NOT the
+						gaps before them, and thus
+						allow free inserting;
+						each consistent read reads its
+						own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ		2	/* this is the default;
+						all consistent reads in the
+						same trx read the same
+						snapshot;
+						full next-key locking used
+						in locking reads to block
+						insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE		3	/* all plain SELECTs are
+						converted to LOCK IN SHARE
+						MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE	1	/* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE	2	/* duplicate rows are to be replaced */
+
+
+/* Types of a trx signal */
+#define TRX_SIG_NO_SIGNAL		0
+#define TRX_SIG_TOTAL_ROLLBACK		1
+#define TRX_SIG_ROLLBACK_TO_SAVEPT	2
+#define TRX_SIG_COMMIT			3
+#define TRX_SIG_BREAK_EXECUTION		5
+
+/* Sender types of a signal */
+#define TRX_SIG_SELF		0	/* sent by the session itself, or
+					by an error occurring within this
+					session */
+#define TRX_SIG_OTHER_SESS	1	/* sent by another session (which
+					must hold rights to this) */
+
+/** Commit node states */
+enum commit_node_state {
+	COMMIT_NODE_SEND = 1,	/*!< about to send a commit signal to
+				the transaction */
+	COMMIT_NODE_WAIT	/*!< commit signal sent to the transaction,
+				waiting for completion */
+};
+
+/** Commit command node in a query graph */
+struct commit_node_t{
+	que_common_t	common;	/*!< node type: QUE_NODE_COMMIT */
+	enum commit_node_state
+			state;	/*!< node execution state */
+};
+
+
+/** Test if trx->mutex is owned. */
+#define trx_mutex_own(t) mutex_own(&t->mutex)
+
+/** Acquire the trx->mutex. */
+#define trx_mutex_enter(t) do {			\
+	mutex_enter(&t->mutex);			\
+} while (0)
+
+/** Release the trx->mutex. */
+#define trx_mutex_exit(t) do {			\
+	mutex_exit(&t->mutex);			\
+} while (0)
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/ +extern prio_rw_lock_t* btr_search_latch_arr; + +#ifndef UNIV_NONINL +#include "trx0trx.ic" +#endif +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/xtradb/include/trx0trx.ic b/storage/xtradb/include/trx0trx.ic new file mode 100644 index 00000000000..787931dc4b6 --- /dev/null +++ b/storage/xtradb/include/trx0trx.ic @@ -0,0 +1,177 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.ic +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx_sys->mutex, or it must be the thread +that is serving a running transaction. +A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list +unless it is a non-locking autocommit read only transaction, which is only +in trx_sys->mysql_trx_list. +@return TRUE if trx->state == state */ +UNIV_INLINE +ibool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state) /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ +{ +#ifdef UNIV_DEBUG + switch (trx->state) { + case TRX_STATE_PREPARED: + ut_ad(!trx_is_autocommit_non_locking(trx)); + return(trx->state == state); + + case TRX_STATE_ACTIVE: + assert_trx_nonlocking_or_in_list(trx); + return(state == trx->state); + + case TRX_STATE_COMMITTED_IN_MEMORY: + assert_trx_in_list(trx); + return(state == trx->state); + + case TRX_STATE_NOT_STARTED: + /* This state is not allowed for running transactions. */ + ut_a(state == TRX_STATE_NOT_STARTED); + ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_ro_trx_list); + return(state == trx->state); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(trx->state == state); +} + +/****************************************************************//** +Retrieves the error_info field from a trx. +@return the error info */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + const trx_t* trx) /*!< in: trx object */ +{ + return(trx->error_info); +} + +/*******************************************************************//** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. 
+@return string in the data segment */ +UNIV_INLINE +const char* +trx_get_que_state_str( +/*==================*/ + const trx_t* trx) /*!< in: transaction */ +{ + /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */ + switch (trx->lock.que_state) { + case TRX_QUE_RUNNING: + return("RUNNING"); + case TRX_QUE_LOCK_WAIT: + return("LOCK WAIT"); + case TRX_QUE_ROLLING_BACK: + return("ROLLING BACK"); + case TRX_QUE_COMMITTING: + return("COMMITTING"); + default: + return("UNKNOWN"); + } +} + +/**********************************************************************//** +Determine if a transaction is a dictionary operation. +@return dictionary operation mode */ +UNIV_INLINE +enum trx_dict_op_t +trx_get_dict_operation( +/*===================*/ + const trx_t* trx) /*!< in: transaction */ +{ + trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation); + +#ifdef UNIV_DEBUG + switch (op) { + case TRX_DICT_OP_NONE: + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + return(op); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(op); +} +/**********************************************************************//** +Flag a transaction a dictionary operation. */ +UNIV_INLINE +void +trx_set_dict_operation( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + enum trx_dict_op_t op) /*!< in: operation, not + TRX_DICT_OP_NONE */ +{ +#ifdef UNIV_DEBUG + enum trx_dict_op_t old_op = trx_get_dict_operation(trx); + + switch (op) { + case TRX_DICT_OP_NONE: + ut_error; + break; + case TRX_DICT_OP_TABLE: + switch (old_op) { + case TRX_DICT_OP_NONE: + case TRX_DICT_OP_INDEX: + case TRX_DICT_OP_TABLE: + goto ok; + } + ut_error; + break; + case TRX_DICT_OP_INDEX: + ut_ad(old_op == TRX_DICT_OP_NONE); + break; + } +ok: +#endif /* UNIV_DEBUG */ + + trx->ddl = true; + trx->dict_operation = op; +} + +/********************************************************************//** +In XtraDB it is impossible for a transaction to own a search latch outside of +InnoDB code, so there is nothing to release on demand. We keep this function to +simplify maintenance.*/ +UNIV_INLINE +void +trx_search_latch_release_if_reserved( +/*=================================*/ + trx_t* trx __attribute__((unused))) /*!< in: transaction */ +{ + ut_ad(!trx->has_search_latch); +} diff --git a/storage/xtradb/include/trx0types.h b/storage/xtradb/include/trx0types.h new file mode 100644 index 00000000000..7ca95131328 --- /dev/null +++ b/storage/xtradb/include/trx0types.h @@ -0,0 +1,147 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0types.h +Transaction system global type definitions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0types_h +#define trx0types_h + +#include "ut0byte.h" + +/** printf(3) format used for printing DB_TRX_ID and other system fields */ +#define TRX_ID_FMT IB_ID_FMT + +/** maximum length that a formatted trx_t::id could take, not including +the terminating NUL character. */ +#define TRX_ID_MAX_LEN 17 + +/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */ +enum trx_que_t { + TRX_QUE_RUNNING, /*!< transaction is running */ + TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for + a lock */ + TRX_QUE_ROLLING_BACK, /*!< transaction is rolling back */ + TRX_QUE_COMMITTING /*!< transaction is committing */ +}; + +/** Transaction states (trx_t::state) */ +enum trx_state_t { + TRX_STATE_NOT_STARTED, + TRX_STATE_ACTIVE, + TRX_STATE_PREPARED, /* Support for 2PC/XA */ + TRX_STATE_COMMITTED_IN_MEMORY +}; + +/** Type of data dictionary operation */ +enum trx_dict_op_t { + /** The transaction is not modifying the data dictionary. */ + TRX_DICT_OP_NONE = 0, + /** The transaction is creating a table or an index, or + dropping a table. The table must be dropped in crash + recovery. This and TRX_DICT_OP_NONE are the only possible + operation modes in crash recovery. */ + TRX_DICT_OP_TABLE = 1, + /** The transaction is creating or dropping an index in an + existing table. In crash recovery, the data dictionary + must be locked, but the table must not be dropped. 
*/ + TRX_DICT_OP_INDEX = 2 +}; + +/** Memory objects */ +/* @{ */ +/** Transaction */ +struct trx_t; +/** The locks and state of an active transaction */ +struct trx_lock_t; +/** Transaction system */ +struct trx_sys_t; +/** Signal */ +struct trx_sig_t; +/** Rollback segment */ +struct trx_rseg_t; +/** Transaction undo log */ +struct trx_undo_t; +/** Array of undo numbers of undo records being rolled back or purged */ +struct trx_undo_arr_t; +/** A cell of trx_undo_arr_t */ +struct trx_undo_inf_t; +/** The control structure used in the purge operation */ +struct trx_purge_t; +/** Rollback command node in a query graph */ +struct roll_node_t; +/** Commit command node in a query graph */ +struct commit_node_t; +/** SAVEPOINT command node in a query graph */ +struct trx_named_savept_t; +/* @} */ + +/** Rollback contexts */ +enum trx_rb_ctx { + RB_NONE = 0, /*!< no rollback */ + RB_NORMAL, /*!< normal rollback */ + RB_RECOVERY_PURGE_REC, + /*!< rolling back an incomplete transaction, + in crash recovery, rolling back an + INSERT that was performed by updating a + delete-marked record; if the delete-marked record + no longer exists in an active read view, it will + be purged */ + RB_RECOVERY /*!< rolling back an incomplete transaction, + in crash recovery */ +}; + +/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */ +typedef ib_id_t row_id_t; +/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */ +typedef ib_id_t trx_id_t; +/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */ +typedef ib_id_t roll_ptr_t; +/** Undo number */ +typedef ib_id_t undo_no_t; + +/** Maximum transaction identifier */ +#define TRX_ID_MAX IB_ID_MAX + +/** Transaction savepoint */ +struct trx_savept_t{ + undo_no_t least_undo_no; /*!< least undo number to undo */ +}; + +/** File objects */ +/* @{ */ +/** Transaction system header */ +typedef byte trx_sysf_t; +/** Rollback segment header */ +typedef byte trx_rsegf_t; +/** Undo segment header */ +typedef byte trx_usegf_t; +/** Undo log header */ +typedef byte trx_ulogf_t; +/** Undo log page header */ +typedef byte trx_upagef_t; + +/** Undo log record */ +typedef byte trx_undo_rec_t; +/* @} */ + +#endif diff --git a/storage/xtradb/include/trx0undo.h b/storage/xtradb/include/trx0undo.h new file mode 100644 index 00000000000..61b0dabb1e6 --- /dev/null +++ b/storage/xtradb/include/trx0undo.h @@ -0,0 +1,604 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.h +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0undo_h +#define trx0undo_h + +#include "univ.i" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "page0types.h" +#include "trx0xa.h" + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + ibool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + ulint page_no, /*!< in: page number */ + ulint offset); /*!< in: offset of the undo entry within page */ +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + ibool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + ulint* page_no, /*!< out: page number */ + ulint* offset); /*!< out: offset of the undo + entry within page */ +/***********************************************************************//** +Returns TRUE if the roll pointer is of the insert type. +@return TRUE if insert undo log */ +UNIV_INLINE +ibool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr); /*!< in: roll pointer */ +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ + __attribute__((nonnull, pure, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Writes a roll ptr to an index page. In case that the size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_roll_ptr( +/*===============*/ + byte* ptr, /*!< in: pointer to memory where + written */ + roll_ptr_t roll_ptr); /*!< in: roll ptr */ +/*****************************************************************//** +Reads a roll ptr from an index page. In case that the roll ptr size +changes in some future version, this function should be used instead of +mach_read_... +@return roll ptr */ +UNIV_INLINE +roll_ptr_t +trx_read_roll_ptr( +/*==============*/ + const byte* ptr); /*!< in: pointer to memory from where to read */ +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Gets an undo log page and x-latches it. 
+@return pointer to page x-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Gets an undo log page and s-latches it. +@return pointer to page s-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get_s_latched( +/*========================*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Returns the previous undo record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_prev_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header offset on page */ +/******************************************************************//** +Returns the next undo log record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_next_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header offset on page */ +/******************************************************************//** +Returns the last undo record on the page in the specified undo log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_last_rec( +/*=======================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset); /*!< in: undo log header offset on page */ +/******************************************************************//** +Returns the first undo record on the page in the specified undo log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_first_rec( +/*========================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header offset on page */ +/***********************************************************************//** +Gets the previous record in an undo log. +@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_prev_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + bool shared, /*!< in: true=S-latch, false=X-latch */ + mtr_t* mtr); /*!< in: mtr */ +/***********************************************************************//** +Gets the next record in an undo log. 
+@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_next_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + mtr_t* mtr); /*!< in: mtr */ +/***********************************************************************//** +Gets the first record in an undo log. +@return undo log record, the page latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_first_rec( +/*===================*/ + ulint space, /*!< in: undo log header space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr); /*!< in: mtr */ +/********************************************************************//** +Tries to add a page to the undo log segment where the undo log is placed. +@return X-latched block if success, else NULL */ +UNIV_INTERN +buf_block_t* +trx_undo_add_page( +/*==============*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory object */ + mtr_t* mtr) /*!< in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN +void +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(trx,undo,mtr) +#else /* UNIV_DEBUG */ +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(undo,mtr) +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Truncates an undo log from the end. This function is used during a rollback +to free space from an undo log. */ +UNIV_INTERN +void +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log */ + undo_no_t limit) /*!< in: all undo records with undo number + >= this value should be truncated */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(trx,undo,limit) +#else /* UNIV_DEBUG */ +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(undo,limit) +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Truncates an undo log from the start. This function is used during a purge +operation. 
*/ +UNIV_INTERN +void +trx_undo_truncate_start( +/*====================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ulint space, /*!< in: space id of the log */ + ulint hdr_page_no, /*!< in: header page number */ + ulint hdr_offset, /*!< in: header offset on the page */ + undo_no_t limit); /*!< in: all undo pages with + undo numbers < this value + should be truncated; NOTE that + the function only frees whole + pages; the header page is not + freed, but emptied, if all the + records there are < limit */ +/********************************************************************//** +Initializes the undo log lists for a rollback segment memory copy. +This function is only called when the database is started or a new +rollback segment created. +@return the combined size of undo log segments in pages */ +UNIV_INTERN +ulint +trx_undo_lists_init( +/*================*/ + trx_rseg_t* rseg); /*!< in: rollback segment memory object */ +/**********************************************************************//** +Assigns an undo log for a transaction. A new undo log is created or a cached +undo log reused. +@return DB_SUCCESS if undo log assign successful, possible error codes +are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY +DB_OUT_OF_MEMORY */ +UNIV_INTERN +dberr_t +trx_undo_assign_undo( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Sets the state of the undo log segment at a transaction finish. +@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_finish( +/*=========================*/ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Sets the state of the undo log segment at a transaction prepare. +@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr); /*!< in: mtr */ + +/**********************************************************************//** +Adds the update undo log header as the first in the history list, and +frees the memory object, or puts it to the list of cached update undo log +segments. */ +UNIV_INTERN +void +trx_undo_update_cleanup( +/*====================*/ + trx_t* trx, /*!< in: trx owning the update undo log */ + page_t* undo_page, /*!< in: update undo log header page, + x-latched */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Frees or caches an insert undo log after a transaction commit or rollback. +Knowledge of inserts is not needed after a commit or rollback, therefore +the data can be discarded. */ +UNIV_INTERN +void +trx_undo_insert_cleanup( +/*====================*/ + trx_t* trx); /*!< in: transaction handle */ + +/********************************************************************//** +At shutdown, frees the undo logs of a PREPARED transaction. */ +UNIV_INTERN +void +trx_undo_free_prepared( +/*===================*/ + trx_t* trx) /*!< in/out: PREPARED transaction */ + UNIV_COLD __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Parses the redo log entry of an undo log page initialization. 
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/************************************************************************
+Frees an undo log memory copy. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+ trx_undo_t* undo); /* in: the undo object to be freed */
+
+/* Types of an undo log segment */
+#define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */
+#define TRX_UNDO_UPDATE 2 /* contains undo entries for updates
+ and delete markings: in short,
+ modifies (the name 'UPDATE' is a
+ historical relic) */
+/* States of an undo log segment */
+#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active
+ transaction */
+#define TRX_UNDO_CACHED 2 /* cached for quick reuse */
+#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */
+#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be
+ reused: it can be freed in purge when
+ all undo data in it is removed */
+#define TRX_UNDO_PREPARED 5 /* contains an undo log of a
+ prepared transaction */
+
+#ifndef UNIV_HOTBACKUP
+/** Transaction undo log memory object; this is protected by the undo_mutex
+in the corresponding transaction object */
+
+struct trx_undo_t{
+ /*-----------------------------*/
+ ulint id; /*!< undo log slot number within the
+ rollback segment */
+ ulint type; /*!< TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint state; /*!< state of the corresponding undo log
+ segment */
+ ibool del_marks; /*!< relevant only in an update undo
+ log: this is TRUE if the transaction may
+ have delete marked records, because of
+ a delete of a row or an update of an
+ indexed field; purge is then
+ necessary; also TRUE if the transaction
+ has updated an externally stored
+ field */
+ trx_id_t trx_id; /*!< id of the trx assigned to the undo
+ log */
+ XID xid; /*!< X/Open XA transaction
+ identification */
+ ibool dict_operation; /*!< TRUE if a dict operation trx */
+ table_id_t table_id; /*!< if a dict operation, then the table
+ id */
+ trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
+ /*-----------------------------*/
+ ulint space; /*!< space id where the undo log is
+ placed */
+ ulint zip_size; /*!< compressed page size of space
+ in bytes, or 0 for uncompressed */
+ ulint hdr_page_no; /*!< page number of the header page in
+ the undo log */
+ ulint hdr_offset; /*!< header offset of the undo log on
+ the page */
+ ulint last_page_no; /*!< page number of the last page in the
+ undo log; this may differ from
+
top_page_no during a rollback */ + ulint size; /*!< current size in pages */ + /*-----------------------------*/ + ulint empty; /*!< TRUE if the stack of undo log + records is currently empty */ + ulint top_page_no; /*!< page number where the latest undo + log record was catenated; during + rollback the page from which the latest + undo record was chosen */ + ulint top_offset; /*!< offset of the latest undo record, + i.e., the topmost element in the undo + log if we think of it as a stack */ + undo_no_t top_undo_no; /*!< undo number of the latest record */ + buf_block_t* guess_block; /*!< guess for the buffer block where + the top page might reside */ + /*-----------------------------*/ + UT_LIST_NODE_T(trx_undo_t) undo_list; + /*!< undo log objects in the rollback + segment are chained into lists */ +}; +#endif /* !UNIV_HOTBACKUP */ + +/** The offset of the undo log page header on pages of the undo log */ +#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA +/*-------------------------------------------------------------*/ +/** Transaction undo log page header offsets */ +/* @{ */ +#define TRX_UNDO_PAGE_TYPE 0 /*!< TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ +#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log + records for the LATEST transaction + start on this page (remember that + in an update undo log, the first page + can contain several undo logs) */ +#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this + field contains the byte offset of the + first free byte on the page */ +#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain + of undo log pages */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE) + /*!< Size of the transaction undo + log page header, in bytes */ +/* @} */ + +/** An update undo segment with just one page can be reused if it has +at most this many bytes used; we must leave space at least for one new undo +log header on the page */ + +#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4) + +/* An update undo log segment may contain several undo logs on its first page +if the undo logs took so little space that the segment could be cached and +reused. All the undo log headers are then on the first page, and the last one +owns the undo log records on subsequent pages if the segment is bigger than +one page. If an undo log is stored in a segment, then on the first page it is +allowed to have zero undo records, but if the segment extends to several +pages, then all the rest of the pages must contain at least one undo log +record. */ + +/** The offset of the undo log segment header on the first page of the undo +log segment */ + +#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) +/** Undo log segment header */ +/* @{ */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... */ +#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header + on the segment header page, 0 if + none */ +#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which + the undo log segment occupies */ +#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE) + /*!< Base node for the list of pages in + the undo log segment; defined only on + the undo log segment's first page */ +/*-------------------------------------------------------------*/ +/** Size of the undo log segment header */ +#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE) +/* @} */ + + +/** The undo log header. 
There can be several undo log headers on the first +page of an update undo log segment. */ +/* @{ */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_TRX_ID 0 /*!< Transaction id */ +#define TRX_UNDO_TRX_NO 8 /*!< Transaction number of the + transaction; defined only if the log + is in a history list */ +#define TRX_UNDO_DEL_MARKS 16 /*!< Defined only in an update undo + log: TRUE if the transaction may have + done delete markings of records, and + thus purge is necessary */ +#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record + of this log on the header page; purge + may remove undo log record from the + log start, and therefore this is not + necessarily the same as this log + header end offset */ +#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table + create, index create, or drop + transaction: in recovery + the transaction cannot be rolled back + in the usual way: a 'rollback' rather + means dropping the created or dropped + table, if it still exists */ +#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding + field is TRUE */ +#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header + on this page, 0 if none */ +#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log + header on this page, 0 if none */ +#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history + list, the file list node is here */ +/*-------------------------------------------------------------*/ +/** Size of the undo log header without XID information */ +#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) + +/* Note: the writing of the undo log old header is coded by a log record +MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the +header is logged separately. In this sense, the XID is not really a member +of the undo log header. TODO: do not append the XID to the log header if XA +is not needed by the user. The XID wastes about 150 bytes of space in every +undo log. In the history list we may have millions of undo logs, which means +quite a large overhead. */ + +/** X/Open XA Transaction Identification (XID) */ +/* @{ */ +/** xid_t::formatID */ +#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE) +/** xid_t::gtrid_length */ +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +/** xid_t::bqual_length */ +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +/** Distributed transaction identifier data */ +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +/*--------------------------------------------------------------*/ +#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE) + /*!< Total size of the undo log header + with the XA XID */ +/* @} */ + +#ifndef UNIV_NONINL +#include "trx0undo.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0undo.ic b/storage/xtradb/include/trx0undo.ic new file mode 100644 index 00000000000..577759d6c3d --- /dev/null +++ b/storage/xtradb/include/trx0undo.ic @@ -0,0 +1,363 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.ic +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" +#include "page0page.h" + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + ibool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + ulint page_no, /*!< in: page number */ + ulint offset) /*!< in: offset of the undo entry within page */ +{ + roll_ptr_t roll_ptr; +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + ut_ad(is_insert == 0 || is_insert == 1); + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + ut_ad(offset < 65536); + + roll_ptr = (roll_ptr_t) is_insert << 55 + | (roll_ptr_t) rseg_id << 48 + | (roll_ptr_t) page_no << 16 + | offset; + return(roll_ptr); +} + +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + ibool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + ulint* page_no, /*!< out: page number */ + ulint* offset) /*!< out: offset of the undo + entry within page */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif +#if TRUE != 1 +# error "TRUE != 1" +#endif + ut_ad(roll_ptr < (1ULL << 56)); + *offset = (ulint) roll_ptr & 0xFFFF; + roll_ptr >>= 16; + *page_no = (ulint) roll_ptr & 0xFFFFFFFF; + roll_ptr >>= 32; + *rseg_id = (ulint) roll_ptr & 0x7F; + roll_ptr >>= 7; + *is_insert = (ibool) roll_ptr; /* TRUE==1 */ +} + +/***********************************************************************//** +Returns TRUE if the roll pointer is of the insert type. +@return TRUE if insert undo log */ +UNIV_INLINE +ibool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr) /*!< in: roll pointer */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif +#if TRUE != 1 +# error "TRUE != 1" +#endif + ut_ad(roll_ptr < (1ULL << 56)); + return((ibool) (roll_ptr >> 55)); +} + +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ +{ +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error +#endif + return(static_cast<bool>(trx_id[DATA_TRX_ID_LEN] >> 7)); +} +#endif /* !UNIV_HOTBACKUP */ + +/*****************************************************************//** +Writes a roll ptr to an index page. 
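+
+The 7 bytes (DATA_ROLL_PTR_LEN) written here pack, from the most
+significant bit downwards, a 1-bit insert flag, a 7-bit rollback segment
+id, a 32-bit page number and a 16-bit page offset, exactly as assembled
+by trx_undo_build_roll_ptr() above. An illustrative round-trip sketch
+(the values are arbitrary):
+
+        roll_ptr_t      roll_ptr;
+        ibool           is_insert;
+        ulint           rseg_id;
+        ulint           page_no;
+        ulint           offset;
+
+        roll_ptr = trx_undo_build_roll_ptr(TRUE, 3, 17, 430);
+        trx_undo_decode_roll_ptr(roll_ptr, &is_insert,
+                                 &rseg_id, &page_no, &offset);
+        ut_ad(is_insert == TRUE);
+        ut_ad(rseg_id == 3 && page_no == 17 && offset == 430);
+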
In case that the size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_roll_ptr( +/*===============*/ + byte* ptr, /*!< in: pointer to memory where + written */ + roll_ptr_t roll_ptr) /*!< in: roll ptr */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(ptr, roll_ptr); +} + +/*****************************************************************//** +Reads a roll ptr from an index page. In case that the roll ptr size +changes in some future version, this function should be used instead of +mach_read_... +@return roll ptr */ +UNIV_INLINE +roll_ptr_t +trx_read_roll_ptr( +/*==============*/ + const byte* ptr) /*!< in: pointer to memory from where to read */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + return(mach_read_from_7(ptr)); +} + +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Gets an undo log page and x-latches it. +@return pointer to page x-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block = buf_page_get(space, zip_size, page_no, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + return(buf_block_get_frame(block)); +} + +/******************************************************************//** +Gets an undo log page and s-latches it. +@return pointer to page s-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get_s_latched( +/*========================*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block = buf_page_get(space, zip_size, page_no, + RW_S_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + return(buf_block_get_frame(block)); +} + +/******************************************************************//** +Returns the start offset of the undo log records of the specified undo +log on the page. +@return start offset */ +UNIV_INLINE +ulint +trx_undo_page_get_start( +/*====================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + ulint start; + + if (page_no == page_get_page_no(undo_page)) { + + start = mach_read_from_2(offset + undo_page + + TRX_UNDO_LOG_START); + } else { + start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; + } + + return(start); +} + +/******************************************************************//** +Returns the end offset of the undo log records of the specified undo +log on the page. 
+@return end offset */ +UNIV_INLINE +ulint +trx_undo_page_get_end( +/*==================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + trx_ulogf_t* log_hdr; + ulint end; + + if (page_no == page_get_page_no(undo_page)) { + + log_hdr = undo_page + offset; + + end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); + + if (end == 0) { + end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + } + } else { + end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + } + + return(end); +} + +/******************************************************************//** +Returns the previous undo record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_prev_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + page_t* undo_page; + ulint start; + + undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); + + start = trx_undo_page_get_start(undo_page, page_no, offset); + + if (start + undo_page == rec) { + + return(NULL); + } + + return(undo_page + mach_read_from_2(rec - 2)); +} + +/******************************************************************//** +Returns the next undo log record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_next_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + page_t* undo_page; + ulint end; + ulint next; + + undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); + + end = trx_undo_page_get_end(undo_page, page_no, offset); + + next = mach_read_from_2(rec); + + if (next == end) { + + return(NULL); + } + + return(undo_page + next); +} + +/******************************************************************//** +Returns the last undo record on the page in the specified undo log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_last_rec( +/*=======================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + ulint start; + ulint end; + + start = trx_undo_page_get_start(undo_page, page_no, offset); + end = trx_undo_page_get_end(undo_page, page_no, offset); + + if (start == end) { + + return(NULL); + } + + return(undo_page + mach_read_from_2(undo_page + end - 2)); +} + +/******************************************************************//** +Returns the first undo record on the page in the specified undo log, or +NULL if none exists. 
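+
+Together with trx_undo_page_get_next_rec() above, this supports a scan
+that stays within a single undo page and takes no additional latches; a
+minimal illustrative sketch:
+
+        trx_undo_rec_t* rec;
+
+        rec = trx_undo_page_get_first_rec(undo_page, page_no, offset);
+
+        while (rec != NULL) {
+                ... rec points directly into undo_page ...
+                rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+        }
+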
+@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_first_rec( +/*========================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + ulint start; + ulint end; + + start = trx_undo_page_get_start(undo_page, page_no, offset); + end = trx_undo_page_get_end(undo_page, page_no, offset); + + if (start == end) { + + return(NULL); + } + + return(undo_page + start); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/trx0xa.h b/storage/xtradb/include/trx0xa.h new file mode 100644 index 00000000000..7caddfb7ba4 --- /dev/null +++ b/storage/xtradb/include/trx0xa.h @@ -0,0 +1,70 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +/** Sizes of transaction identifier */ +#define XIDDATASIZE 128 /*!< maximum size of a transaction + identifier, in bytes */ +#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */ + +/** X/Open XA distributed transaction identifier */ +struct xid_t { + long formatID; /*!< format identifier; -1 + means that the XID is null */ + long gtrid_length; /*!< value from 1 through 64 */ + long bqual_length; /*!< value from 1 through 64 */ + char data[XIDDATASIZE]; /*!< distributed transaction + identifier */ +}; +/** X/Open XA distributed transaction identifier */ +typedef struct xid_t XID; +#endif +/** X/Open XA distributed transaction status codes */ +/* @{ */ +#define XA_OK 0 /*!< normal execution */ +#define XAER_ASYNC -2 /*!< asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /*!< a resource manager error + occurred in the transaction + branch */ +#define XAER_NOTA -4 /*!< the XID is not valid */ +#define XAER_INVAL -5 /*!< invalid arguments were given */ +#define XAER_PROTO -6 /*!< routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /*!< resource manager unavailable */ +#define XAER_DUPID -8 /*!< the XID already exists */ +#define XAER_OUTSIDE -9 /*!< resource manager doing + work outside transaction */ +/* @} */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */ diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i new file mode 100644 index 00000000000..afb967d6680 --- /dev/null +++ b/storage/xtradb/include/univ.i @@ -0,0 +1,686 @@ +/***************************************************************************** + 
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***********************************************************************//** +@file include/univ.i +Version control for database, common definitions, and include files + +Created 1/20/1994 Heikki Tuuri +****************************************************************************/ + +#ifndef univ_i +#define univ_i + +#ifdef UNIV_HOTBACKUP +#include "hb_univ.i" +#endif /* UNIV_HOTBACKUP */ + +/* aux macros to convert M into "123" (string) if M is defined like +#define M 123 */ +#define _IB_TO_STR(s) #s +#define IB_TO_STR(s) _IB_TO_STR(s) + +#define INNODB_VERSION_MAJOR MYSQL_VERSION_MAJOR +#define INNODB_VERSION_MINOR MYSQL_VERSION_MINOR +#define INNODB_VERSION_BUGFIX MYSQL_VERSION_PATCH + +#ifndef PERCONA_INNODB_VERSION +#define PERCONA_INNODB_VERSION 72.0 +#endif + +/* Enable UNIV_LOG_ARCHIVE in XtraDB */ +#define UNIV_LOG_ARCHIVE 1 + +/* The following is the InnoDB version as shown in +SELECT plugin_version FROM information_schema.plugins; +calculated in make_version_string() in sql/sql_show.cc like this: +"version >> 8" . "version & 0xff" +because the version is shown with only one dot, we skip the last +component, i.e. we show M.N.P as M.N */ +#define INNODB_VERSION_SHORT \ + (INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR) + +#define INNODB_VERSION_STR \ + IB_TO_STR(INNODB_VERSION_MAJOR) "." \ + IB_TO_STR(INNODB_VERSION_MINOR) "." \ + IB_TO_STR(INNODB_VERSION_BUGFIX) "-" \ + IB_TO_STR(PERCONA_INNODB_VERSION) + +#define REFMAN "http://dev.mysql.com/doc/refman/" \ + IB_TO_STR(MYSQL_VERSION_MAJOR) "." \ + IB_TO_STR(MYSQL_VERSION_MINOR) "/en/" + +#ifdef MYSQL_DYNAMIC_PLUGIN +/* In the dynamic plugin, redefine some externally visible symbols +in order not to conflict with the symbols of a builtin InnoDB. */ + +/* Rename all C++ classes that contain virtual functions, because we +have not figured out how to apply the visibility=hidden attribute to +the virtual method table (vtable) in GCC 3. */ +# define ha_innobase ha_innodb +#endif /* MYSQL_DYNAMIC_PLUGIN */ + +#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER) && !defined(__WIN__) +# undef __WIN__ +# define __WIN__ + +# include <windows.h> + +# ifdef _NT_ +# define __NT__ +# endif + +#else +/* The defines used with MySQL */ + +/* Include two header files from MySQL to make the Unix flavor used +in compiling more Posix-compatible. 
These headers also define __WIN__
+if we are compiling on Windows. */
+
+#ifndef UNIV_HOTBACKUP
+# include <my_global.h>
+# include <my_pthread.h>
+#endif /* UNIV_HOTBACKUP */
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
+# include <sys/stat.h>
+# if !defined(__WIN__)
+# include <sys/mman.h> /* mmap() for os0proc.cc */
+# endif
+
+/* Include the header file generated by GNU autoconf */
+# ifndef __WIN__
+# ifndef UNIV_HOTBACKUP
+# include "config.h"
+# endif /* UNIV_HOTBACKUP */
+# endif
+
+# ifdef HAVE_SCHED_H
+# include <sched.h>
+# endif
+
+# ifdef HAVE_MALLOC_H
+# include <malloc.h>
+# endif
+
+/* We only try to do explicit inlining of functions with gcc and
+Sun Studio */
+
+# ifdef HAVE_PREAD
+# define HAVE_PWRITE
+# endif
+
+#endif /* #if (defined(WIN32) || ... */
+
+#ifndef __WIN__
+#define __STDC_FORMAT_MACROS /* Enable C99 printf format macros */
+#include <inttypes.h>
+#endif /* !__WIN__ */
+
+/* The following defines enable performance schema
+instrumentation in each of four InnoDB modules if
+HAVE_PSI_INTERFACE is defined. */
+#if defined HAVE_PSI_INTERFACE && !defined UNIV_HOTBACKUP
+# define UNIV_PFS_MUTEX
+# define UNIV_PFS_RWLOCK
+/* For I/O instrumentation, the performance schema relies
+on a native descriptor to identify the file; this
+descriptor could conflict with our OS level descriptor.
+Disable I/O instrumentation on Windows until this is
+resolved. */
+# ifndef __WIN__
+# define UNIV_PFS_IO
+# endif
+# define UNIV_PFS_THREAD
+
+/* There are mutexes/rwlocks that we want to exclude from
+instrumentation even if their corresponding performance schema
+define is set. This PFS_NOT_INSTRUMENTED value is used
+as the key to identify those objects that should
+be excluded from instrumentation. */
+# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED
+
+# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED)
+
+#endif /* HAVE_PSI_INTERFACE */
+
+#ifdef __WIN__
+# define YY_NO_UNISTD_H 1
+#endif /* __WIN__ */
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+
+/* When this macro is defined, additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have a "test_" prefix. These functions are not called from anywhere in
+the code, but they can be called from gdb after
+innobase_start_or_create_for_mysql() has executed, using the call
+command. Not tested on Windows. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+*/
+
+#if defined HAVE_VALGRIND
+# define UNIV_DEBUG_VALGRIND
+#endif /* HAVE_VALGRIND */
+#if 0
+#define UNIV_DEBUG_VALGRIND /* Enable extra
+ Valgrind instrumentation */
+#define UNIV_DEBUG_PRINT /* Enable the compilation of
+ some debug print functions */
+#define UNIV_AHI_DEBUG /* Enable adaptive hash index
+ debugging without UNIV_DEBUG */
+#define UNIV_BUF_DEBUG /* Enable buffer pool
+ debugging without UNIV_DEBUG */
+#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column
+ debugging without UNIV_DEBUG */
+#define UNIV_DEBUG /* Enable ut_ad() assertions
+ and disable UNIV_INLINE */
+#define UNIV_DEBUG_LOCK_VALIDATE /* Enable
+ ut_ad(lock_rec_validate_page())
+ assertions. */
+#define UNIV_DEBUG_FILE_ACCESSES /* Enable freed block access
+ debugging without UNIV_DEBUG */
+#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG /* debug HASH_ macros */
+#define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */
+#define UNIV_LOG_LSN_DEBUG /* write LSN to the redo log;
+this will break redo log file compatibility, but it may be useful when
+debugging redo log application problems. */
+#define UNIV_MEM_DEBUG /* detect memory leaks etc */
+#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_BLOB_DEBUG /* track BLOB ownership;
+assumes that no BLOBs survive server restart */
+#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer;
+this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
+and the insert buffer must be empty when the database is started */
+#define UNIV_PERF_DEBUG /* debug flag that enables
+ light weight performance
+ related stuff. */
+#define UNIV_SYNC_DEBUG /* debug mutex and latch
+operations (very slow); also UNIV_DEBUG must be defined */
+#define UNIV_SEARCH_DEBUG /* debug B-tree comparisons */
+#define UNIV_SYNC_PERF_STAT /* operation counts for
+ rw-locks and mutexes */
+#define UNIV_SEARCH_PERF_STAT /* statistics for the
+ adaptive hash index */
+#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
+ in sync0sync.cc */
+#define UNIV_BTR_PRINT /* enable functions for
+ printing B-trees */
+#define UNIV_ZIP_DEBUG /* extensive consistency checks
+ for compressed pages */
+#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
+ more often */
+#define UNIV_AIO_DEBUG /* prints info about
+ submitted and reaped AIO
+ requests to the log. */
+#define UNIV_STATS_DEBUG /* prints various stats
+ related debug info from
+ dict0stats.c */
+#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging
+ info output */
+#endif
+
+#define UNIV_BTR_DEBUG /* check B-tree links */
+#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
+
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+ /* the above option prevents forcing of log to disk
+ at a buffer page write: it should be tested with this
+ option off; also some ibuf tests are suppressed */
+
+/* Linkage specifier for non-static InnoDB symbols (variables and functions)
+that are only referenced from within InnoDB, not from MySQL. We disable the
+GCC visibility directive on all Sun operating systems because there is no
+easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER)
+# define UNIV_INTERN __attribute__((visibility ("hidden")))
+#else
+# define UNIV_INTERN
+#endif
+#if defined(INNODB_COMPILER_HINTS) \
+ && defined __GNUC__ \
+ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 3)
+/** Starting with GCC 4.3, the "cold" attribute is used to inform the
+compiler that a function is unlikely to be executed. The function is
+optimized for size rather than speed, and on many targets it is placed
+into a special subsection of the text section so that all cold functions
+appear close together, improving the code locality of the non-cold parts
+of the program. The paths leading to calls of cold functions within the
+code are marked as unlikely by the branch prediction mechanism. In short,
+the attribute optimizes a rarely invoked function for size instead of
+speed.
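+
+For example, this attribute is applied to the rarely invoked shutdown
+helper trx_undo_free_prepared(), declared earlier in this changeset's
+trx0undo.h:
+
+        UNIV_INTERN
+        void
+        trx_undo_free_prepared(trx_t* trx)
+                UNIV_COLD __attribute__((nonnull));
+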
*/ +# define UNIV_COLD __attribute__((cold)) +#else +# define UNIV_COLD /* empty */ +#endif + +#ifdef UNIV_LINUX +# define UNIV_THREAD_LOCAL __thread +#else +/* FIXME: the TLS variables are silently broken on other platforms for now */ +# define UNIV_THREAD_LOCAL +#endif + +#ifndef UNIV_MUST_NOT_INLINE +/* Definition for inline version */ + +#define UNIV_INLINE static inline + +#else /* !UNIV_MUST_NOT_INLINE */ +/* If we want to compile a noninlined version we use the following macro +definitions: */ + +#define UNIV_NONINL +#define UNIV_INLINE UNIV_INTERN + +#endif /* !UNIV_MUST_NOT_INLINE */ + +#ifdef _WIN32 +#define UNIV_WORD_SIZE 4 +#elif defined(_WIN64) +#define UNIV_WORD_SIZE 8 +#else +/** MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */ +#define UNIV_WORD_SIZE SIZEOF_LONG +#endif + +/** The following alignment is used in memory allocations in memory heap +management to ensure correct alignment for doubles etc. */ +#define UNIV_MEM_ALIGNMENT 8 + +/** The following alignment is used in aligning lints etc. */ +#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE + +/* + DATABASE VERSION CONTROL + ======================== +*/ + +/** There are currently two InnoDB file formats which are used to group +features with similar restrictions and dependencies. Using an enum allows +switch statements to give a compiler warning when a new one is introduced. */ +enum innodb_file_formats_enum { + /** Antelope File Format: InnoDB/MySQL up to 5.1. + This format includes REDUNDANT and COMPACT row formats */ + UNIV_FORMAT_A = 0, + + /** Barracuda File Format: Introduced in InnoDB plugin for 5.1: + This format includes COMPRESSED and DYNAMIC row formats. It + includes the ability to create secondary indexes from data that + is not on the clustered index page and the ability to store more + data off the clustered index page. */ + UNIV_FORMAT_B = 1 +}; + +typedef enum innodb_file_formats_enum innodb_file_formats_t; + +/** Minimum supported file format */ +#define UNIV_FORMAT_MIN UNIV_FORMAT_A + +/** Maximum supported file format */ +#define UNIV_FORMAT_MAX UNIV_FORMAT_B + +/** The 2-logarithm of UNIV_PAGE_SIZE: */ +#define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift + +/** The universal page size of the database */ +#define UNIV_PAGE_SIZE ((ulint) srv_page_size) + +/** log2 of smallest compressed page size (1<<10 == 1024 bytes) +Note: This must never change! */ +#define UNIV_ZIP_SIZE_SHIFT_MIN 10 + +/** log2 of largest compressed page size (1<<14 == 16384 bytes). +A compressed page directory entry reserves 14 bits for the start offset +and 2 bits for flags. This limits the uncompressed page size to 16k. +Even though a 16k uncompressed page can theoretically be compressed +into a larger compressed page, it is not a useful feature so we will +limit both with this same constant. */ +#define UNIV_ZIP_SIZE_SHIFT_MAX 14 + +/* Define the Min, Max, Default page sizes. */ +/** Minimum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MIN 12 +/** Maximum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MAX 14 +/** Default Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_DEF 14 +/** Original 16k InnoDB Page Size Shift, in case the default changes */ +#define UNIV_PAGE_SIZE_SHIFT_ORIG 14 + +/** Minimum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MIN (1 << UNIV_PAGE_SIZE_SHIFT_MIN) +/** Maximum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MAX (1 << UNIV_PAGE_SIZE_SHIFT_MAX) +/** Default page size for InnoDB tablespaces. 
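+(With the shift values defined above this works out to 1 << 14 = 16384
+bytes; the supported range is 1 << 12 = 4096 bytes to 1 << 14 = 16384
+bytes.)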
*/
+#define UNIV_PAGE_SIZE_DEF (1 << UNIV_PAGE_SIZE_SHIFT_DEF)
+/** Original 16k page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_ORIG (1 << UNIV_PAGE_SIZE_SHIFT_ORIG)
+
+/** Smallest compressed page size */
+#define UNIV_ZIP_SIZE_MIN (1 << UNIV_ZIP_SIZE_SHIFT_MIN)
+
+/** Largest compressed page size */
+#define UNIV_ZIP_SIZE_MAX (1 << UNIV_ZIP_SIZE_SHIFT_MAX)
+
+/** Number of supported page sizes (The convention 'ssize' is used
+for 'log2 minus 9' or the number of shifts starting with 512.)
+This number varies depending on UNIV_PAGE_SIZE. */
+#define UNIV_PAGE_SSIZE_MAX \
+ (UNIV_PAGE_SIZE_SHIFT - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/** Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/** This is the "mbmaxlen" for my_charset_filename (defined in
+strings/ctype-utf8.c), which is used to encode File and Database names. */
+#define FILENAME_CHARSET_MAXNAMLEN 5
+
+/** The maximum length of an encoded table name in bytes. The max
+table and database names are NAME_CHAR_LEN (64) characters. After the
+encoding, the max length would be NAME_CHAR_LEN (64) *
+FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a
+terminating '\0'. InnoDB can handle longer names internally */
+#define MAX_TABLE_NAME_LEN 320
+
+/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is
+MySQL's NAME_LEN, see check_and_convert_db_name(). */
+#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN
+
+/** MAX_FULL_NAME_LEN defines the full name path including the
+database name and table name. In addition, 14 bytes are added for:
+ 2 for surrounding quotes around table name
+ 1 for the separating dot (.)
+ 9 for the #mysql50# prefix */
+#define MAX_FULL_NAME_LEN \
+ (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14)
+
+/** The maximum length in bytes that a database name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_DB_UTF8_LEN (NAME_LEN + 1)
+
+/** The maximum length in bytes that a table name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix))
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/* Note that inside MySQL 'byte' is defined as char on Linux! */
+#define byte unsigned char
+
+/* Another basic type we use is the unsigned long integer, which should be
+equal to the word size of the machine, that is on a 32-bit platform 32 bits,
+and on a 64-bit platform 64 bits. We also give the printf format for the
+type as a macro ULINTPF. */
+
+
+#ifdef __WIN__
+/* Use the integer types and formatting strings defined in Visual Studio. */
+# define UINT32PF "%I32u"
+# define INT64PF "%I64d"
+# define UINT64PF "%I64u"
+# define UINT64PFx "%016I64x"
+# define DBUG_LSN_PF "%llu"
+typedef __int64 ib_int64_t;
+typedef unsigned __int64 ib_uint64_t;
+typedef unsigned __int32 ib_uint32_t;
+#else
+/* Use the integer types and formatting strings defined in the C99 standard.
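+
+These macros are spliced into format strings via string-literal
+concatenation; an illustrative sketch:
+
+        ib_uint64_t     n_rows = 42;
+
+        fprintf(stderr, "rows: " UINT64PF ", bytes: " ULINTPF "\n",
+                n_rows, (ulint) sizeof(n_rows));
+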
*/ +# define UINT32PF "%" PRIu32 +# define INT64PF "%" PRId64 +# define UINT64PF "%" PRIu64 +# define UINT64PFx "%016" PRIx64 +# define DBUG_LSN_PF UINT64PF +typedef int64_t ib_int64_t; +typedef uint64_t ib_uint64_t; +typedef uint32_t ib_uint32_t; +# endif /* __WIN__ */ + +# define IB_ID_FMT UINT64PF + +#ifdef _WIN64 +typedef unsigned __int64 ulint; +typedef __int64 lint; +# define ULINTPF UINT64PF +#else +typedef unsigned long int ulint; +typedef long int lint; +# define ULINTPF "%lu" +#endif /* _WIN64 */ + +#ifndef UNIV_HOTBACKUP +typedef unsigned long long int ullint; +#endif /* UNIV_HOTBACKUP */ + +#ifndef __WIN__ +#if SIZEOF_LONG != SIZEOF_VOIDP +#error "Error: InnoDB's ulint must be of the same size as void*" +#endif +#endif + +/** The 'undefined' value for a ulint */ +#define ULINT_UNDEFINED ((ulint)(-1)) + +#define ULONG_UNDEFINED ((ulong)(-1)) + +/** The 'undefined' value for a ib_uint64_t */ +#define UINT64_UNDEFINED ((ib_uint64_t)(-1)) + +/** The bitmask of 32-bit unsigned integer */ +#define ULINT32_MASK 0xFFFFFFFF +/** The undefined 32-bit unsigned integer */ +#define ULINT32_UNDEFINED ULINT32_MASK + +/** Maximum value for a ulint */ +#define ULINT_MAX ((ulint)(-2)) + +/** Maximum value for ib_uint64_t */ +#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL)) + +/** The generic InnoDB system object identifier data type */ +typedef ib_uint64_t ib_id_t; +#define IB_ID_MAX IB_UINT64_MAX + +/** The 'undefined' value for a ullint */ +#define ULLINT_UNDEFINED ((ullint)(-1)) + +/** This 'ibool' type is used within Innobase. Remember that different included +headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */ +#define ibool ulint + +#ifndef TRUE + +#define TRUE 1 +#define FALSE 0 + +#endif + +#define UNIV_NOTHROW + +/** The following number as the length of a logical field means that the field +has the SQL NULL as its value. NOTE that because we assume that the length +of a field is a 32-bit integer when we store it, for example, to an undo log +on disk, we must have also this number fit in 32 bits, also in 64-bit +computers! */ + +#define UNIV_SQL_NULL ULINT32_UNDEFINED + +/** Lengths which are not UNIV_SQL_NULL, but bigger than the following +number indicate that a field contains a reference to an externally +stored part of the field in the tablespace. The length field then +contains the sum of the following flag and the locally stored len. */ + +#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_MAX) + +#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER) +#define HAVE_GCC_GT_2 +/* Tell the compiler that variable/function is unused. */ +# define UNIV_UNUSED __attribute__ ((unused)) +#else +# define UNIV_UNUSED +#endif /* CHECK FOR GCC VER_GT_2 */ + +/* Some macros to improve branch prediction and reduce cache misses */ +#if defined(INNODB_COMPILER_HINTS) && defined(HAVE_GCC_GT_2) +/* Tell the compiler that 'expr' probably evaluates to 'constant'. */ +# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant) +/* Tell the compiler that a pointer is likely to be NULL */ +# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) ptr, 0) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read. */ +# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read or written. 
*/ +# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3) + +/* Sun Studio includes sun_prefetch.h as of version 5.9 */ +#elif (defined(__SUNPRO_C) && __SUNPRO_C >= 0x590) \ + || (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x590) + +# include <sun_prefetch.h> + +#if __SUNPRO_C >= 0x550 +# undef UNIV_INTERN +# define UNIV_INTERN __hidden +#endif /* __SUNPRO_C >= 0x550 */ + +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) + +# if defined(INNODB_COMPILER_HINTS) +//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr) +# else +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +# endif /* INNODB_COMPILER_HINTS */ + +#else +/* Dummy versions of the macros */ +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +#endif + +/* Tell the compiler that cond is likely to hold */ +#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE) +/* Tell the compiler that cond is unlikely to hold */ +#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE) + +/* Compile-time constant of the given array's size. */ +#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0])) + +/* The return type from a thread's start function differs between Unix and +Windows, so define a typedef for it and a macro to use at the end of such +functions. */ + +#ifdef __WIN__ +typedef ulint os_thread_ret_t; +#define OS_THREAD_DUMMY_RETURN return(0) +#else +typedef void* os_thread_ret_t; +#define OS_THREAD_DUMMY_RETURN return(NULL) +#endif + +#include <stdio.h> +#include "ut0dbg.h" +#include "ut0ut.h" +#include "db0err.h" +#ifdef UNIV_DEBUG_VALGRIND +# include <valgrind/memcheck.h> +# define UNIV_MEM_VALID(addr, size) VALGRIND_MAKE_MEM_DEFINED(addr, size) +# define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size) +# define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size) +# define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size) +# define UNIV_MEM_DESC(addr, size) VALGRIND_CREATE_BLOCK(addr, size, #addr) +# define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b) +# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do { \ + const void* _p = (const void*) (ulint) \ + VALGRIND_CHECK_MEM_IS_DEFINED(addr, size); \ + if (UNIV_LIKELY_NULL(_p)) { \ + fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n", \ + __FILE__, __LINE__, \ + (const void*) (addr), (unsigned) (size), (long) \ + (((const char*) _p) - ((const char*) (addr)))); \ + if (should_abort) { \ + ut_error; \ + } \ + } \ +} while (0) +# define UNIV_MEM_ASSERT_RW(addr, size) \ + UNIV_MEM_ASSERT_RW_LOW(addr, size, false) +# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) \ + UNIV_MEM_ASSERT_RW_LOW(addr, size, true) +# define UNIV_MEM_ASSERT_W(addr, size) do { \ + const void* _p = (const void*) (ulint) \ + VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size); \ + if (UNIV_LIKELY_NULL(_p)) \ + fprintf(stderr, "%s:%d: %p[%u] unwritable at %ld\n", \ + __FILE__, __LINE__, \ + (const void*) (addr), (unsigned) (size), (long) \ + (((const char*) _p) - ((const char*) (addr)))); \ + } while (0) +# define UNIV_MEM_TRASH(addr, c, size) do { \ + ut_d(memset(addr, c, size)); \ + UNIV_MEM_INVALID(addr, size); \ + } while (0) +#else +# define UNIV_MEM_VALID(addr, size) do {} while(0) +# define UNIV_MEM_INVALID(addr, size) do {} while(0) +# define UNIV_MEM_FREE(addr, size) do {} 
while(0) +# define UNIV_MEM_ALLOC(addr, size) do {} while(0) +# define UNIV_MEM_DESC(addr, size) do {} while(0) +# define UNIV_MEM_UNDESC(b) do {} while(0) +# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do {} while(0) +# define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0) +# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) do {} while(0) +# define UNIV_MEM_ASSERT_W(addr, size) do {} while(0) +# define UNIV_MEM_TRASH(addr, c, size) do {} while(0) +#endif +#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do { \ + UNIV_MEM_ASSERT_W(addr, size); \ + UNIV_MEM_FREE(addr, size); \ +} while (0) +#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do { \ + UNIV_MEM_ASSERT_W(addr, size); \ + UNIV_MEM_ALLOC(addr, size); \ +} while (0) + +extern ulong srv_page_size_shift; +extern ulong srv_page_size; + +#endif diff --git a/storage/xtradb/include/usr0sess.h b/storage/xtradb/include/usr0sess.h new file mode 100644 index 00000000000..b5c80b97b43 --- /dev/null +++ b/storage/xtradb/include/usr0sess.h @@ -0,0 +1,77 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/usr0sess.h +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ + +#ifndef usr0sess_h +#define usr0sess_h + +#include "univ.i" +#include "ut0byte.h" +#include "trx0types.h" +#include "srv0srv.h" +#include "trx0types.h" +#include "usr0types.h" +#include "que0types.h" +#include "data0data.h" +#include "rem0rec.h" + +/*********************************************************************//** +Opens a session. +@return own: session object */ +UNIV_INTERN +sess_t* +sess_open(void); +/*============*/ +/*********************************************************************//** +Closes a session, freeing the memory occupied by it. */ +UNIV_INTERN +void +sess_close( +/*=======*/ + sess_t* sess); /* in, own: session object */ + +/* The session handle. This data structure is only used by purge and is +not really necessary. We should get rid of it. 
*/ +struct sess_t{ + ulint state; /*!< state of the session */ + trx_t* trx; /*!< transaction object permanently + assigned for the session: the + transaction instance designated by the + trx id changes, but the memory + structure is preserved */ + UT_LIST_BASE_NODE_T(que_t) + graphs; /*!< query graphs belonging to this + session */ +}; + +/* Session states */ +#define SESS_ACTIVE 1 +#define SESS_ERROR 2 /* session contains an error message + which has not yet been communicated + to the client */ +#ifndef UNIV_NONINL +#include "usr0sess.ic" +#endif + +#endif diff --git a/storage/xtradb/include/usr0sess.ic b/storage/xtradb/include/usr0sess.ic new file mode 100644 index 00000000000..284e59537fe --- /dev/null +++ b/storage/xtradb/include/usr0sess.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/usr0sess.ic +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/usr0types.h b/storage/xtradb/include/usr0types.h new file mode 100644 index 00000000000..6ba937cacc8 --- /dev/null +++ b/storage/xtradb/include/usr0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/usr0types.h +Users and sessions global types + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ + +#ifndef usr0types_h +#define usr0types_h + +struct sess_t; + +#endif diff --git a/storage/xtradb/include/ut0bh.h b/storage/xtradb/include/ut0bh.h new file mode 100644 index 00000000000..1085736c7ab --- /dev/null +++ b/storage/xtradb/include/ut0bh.h @@ -0,0 +1,152 @@ +/***************************************************************************//** + +Copyright (c) 2011, 2013, Oracle Corpn. 
All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0bh.h +Binary min-heap interface. + +Created 2010-05-28 by Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_UT0BH_H +#define INNOBASE_UT0BH_H + +#include "univ.i" + +/** Comparison function for objects in the binary heap. */ +typedef int (*ib_bh_cmp_t)(const void* p1, const void* p2); + +struct ib_bh_t; + +/**********************************************************************//** +Get the number of elements in the binary heap. +@return number of elements */ +UNIV_INLINE +ulint +ib_bh_size( +/*=======*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Test if binary heap is empty. +@return TRUE if empty. */ +UNIV_INLINE +ibool +ib_bh_is_empty( +/*===========*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Test if binary heap is full. +@return TRUE if full. */ +UNIV_INLINE +ibool +ib_bh_is_full( +/*===========*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Get a pointer to the element. +@return pointer to element */ +UNIV_INLINE +void* +ib_bh_get( +/*=======*/ + ib_bh_t* ib_bh, /*!< in: instance */ + ulint i); /*!< in: index */ + +/**********************************************************************//** +Copy an element to the binary heap. +@return pointer to copied element */ +UNIV_INLINE +void* +ib_bh_set( +/*======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + ulint i, /*!< in: index */ + const void* elem); /*!< in: element to add */ + +/**********************************************************************//** +Return the first element from the binary heap. +@return pointer to first element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_first( +/*========*/ + ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Return the last element from the binary heap. +@return pointer to last element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_last( +/*========*/ + ib_bh_t* ib_bh); /*!< in/out: instance */ + +/**********************************************************************//** +Create a binary heap. +@return a new binary heap */ +UNIV_INTERN +ib_bh_t* +ib_bh_create( +/*=========*/ + ib_bh_cmp_t compare, /*!< in: comparator */ + ulint sizeof_elem, /*!< in: size of one element */ + ulint max_elems); /*!< in: max elements allowed */ + +/**********************************************************************//** +Free a binary heap. 
+The instance must not be accessed after this call. */ +UNIV_INTERN +void +ib_bh_free( +/*=======*/ + ib_bh_t* ib_bh); /*!< in,own: instance */ + +/**********************************************************************//** +Add an element to the binary heap. Note: The element is copied. +@return pointer to added element or NULL if full. */ +UNIV_INTERN +void* +ib_bh_push( +/*=======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + const void* elem); /*!< in: element to add */ + +/**********************************************************************//** +Remove the first element from the binary heap. */ +UNIV_INTERN +void +ib_bh_pop( +/*======*/ + ib_bh_t* ib_bh); /*!< in/out: instance */ + +/** Binary heap data structure */ +struct ib_bh_t { + ulint max_elems; /*!< max elements allowed */ + ulint n_elems; /*!< current size */ + ulint sizeof_elem; /*!< sizeof element */ + ib_bh_cmp_t compare; /*!< comparator */ +}; + +#ifndef UNIV_NONINL +#include "ut0bh.ic" +#endif + +#endif /* INNOBASE_UT0BH_H */ diff --git a/storage/xtradb/include/ut0bh.ic b/storage/xtradb/include/ut0bh.ic new file mode 100644 index 00000000000..b11de5b8b3e --- /dev/null +++ b/storage/xtradb/include/ut0bh.ic @@ -0,0 +1,125 @@ +/***************************************************************************//** + +Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0bh.ic +Binary min-heap implementation. + +Created 2011-01-15 by Sunny Bains +*******************************************************/ + +#include "ut0bh.h" +#include "ut0mem.h" /* For ut_memcpy() */ + +/**********************************************************************//** +Get the number of elements in the binary heap. +@return number of elements */ +UNIV_INLINE +ulint +ib_bh_size( +/*=======*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh->n_elems); +} + +/**********************************************************************//** +Test if binary heap is empty. +@return TRUE if empty. */ +UNIV_INLINE +ibool +ib_bh_is_empty( +/*===========*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_size(ib_bh) == 0); +} + +/**********************************************************************//** +Test if binary heap is full. +@return TRUE if full. */ +UNIV_INLINE +ibool +ib_bh_is_full( +/*===========*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_size(ib_bh) >= ib_bh->max_elems); +} + +/**********************************************************************//** +Get a pointer to the element. 
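[Editorial usage sketch, not part of this commit: the heap is created with a qsort-style comparator and a fixed capacity; elements are pushed by value and the root always orders first. The names int_cmp and heap_example are illustrative.]

#include "ut0bh.h"
#include "ut0dbg.h"

static int int_cmp(const void* p1, const void* p2)
{
	return(*(const int*) p1 - *(const int*) p2);
}

static void heap_example(void)
{
	ib_bh_t*	ib_bh = ib_bh_create(int_cmp, sizeof(int), 16);
	int		v;

	v = 42; ib_bh_push(ib_bh, &v);	/* the element is copied in */
	v = 7; ib_bh_push(ib_bh, &v);

	ut_a(*(int*) ib_bh_first(ib_bh) == 7);	/* minimum at the root */

	ib_bh_pop(ib_bh);	/* remove the root */
	ib_bh_free(ib_bh);	/* instance is invalid afterwards */
}
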
+@return pointer to element */ +UNIV_INLINE +void* +ib_bh_get( +/*=======*/ + ib_bh_t* ib_bh, /*!< in: instance */ + ulint i) /*!< in: index */ +{ + byte* ptr = (byte*) (ib_bh + 1); + + ut_a(i < ib_bh_size(ib_bh)); + + return(ptr + (ib_bh->sizeof_elem * i)); +} + +/**********************************************************************//** +Copy an element to the binary heap. +@return pointer to copied element */ +UNIV_INLINE +void* +ib_bh_set( +/*======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + ulint i, /*!< in: index */ + const void* elem) /*!< in: element to add */ +{ + void* ptr = ib_bh_get(ib_bh, i); + + ut_memcpy(ptr, elem, ib_bh->sizeof_elem); + + return(ptr); +} + +/**********************************************************************//** +Return the first element from the binary heap. +@return pointer to first element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_first( +/*========*/ + ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_is_empty(ib_bh) ? NULL : ib_bh_get(ib_bh, 0)); +} + +/**********************************************************************//** +Return the last element from the binary heap. +@return pointer to last element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_last( +/*========*/ + ib_bh_t* ib_bh) /*!< in/out: instance */ +{ + return(ib_bh_is_empty(ib_bh) + ? NULL + : ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1)); +} + diff --git a/storage/xtradb/include/ut0byte.h b/storage/xtradb/include/ut0byte.h new file mode 100644 index 00000000000..5bdd553ca80 --- /dev/null +++ b/storage/xtradb/include/ut0byte.h @@ -0,0 +1,119 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0byte.h +Utilities for byte operations + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0byte_h +#define ut0byte_h + + + +#include "univ.i" + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ + __attribute__((const)); + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. 
+@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/*********************************************************//** +The following function rounds up a pointer to the nearest aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align( +/*=====*/ + const void* ptr, /*!< in: pointer */ + ulint align_no); /*!< in: align by this number */ +/*********************************************************//** +The following function rounds down a pointer to the nearest +aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align_down( +/*==========*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ + __attribute__((const)); +/*********************************************************//** +The following function computes the offset of a pointer from the nearest +aligned address. +@return distance from aligned pointer */ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ + __attribute__((const)); +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n); /*!< in: nth bit requested */ +/*****************************************************************//** +Sets the nth bit of a ulint. +@return the ulint with the bit set as requested */ +UNIV_INLINE +ulint +ut_bit_set_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n, /*!< in: nth bit requested */ + ibool val); /*!< in: value for the bit to set */ + +#ifndef UNIV_NONINL +#include "ut0byte.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0byte.ic b/storage/xtradb/include/ut0byte.ic new file mode 100644 index 00000000000..873d98c727e --- /dev/null +++ b/storage/xtradb/include/ut0byte.ic @@ -0,0 +1,173 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0byte.ic +Utilities for byte operations + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ +{ + ut_ad(high <= ULINT32_MASK); + ut_ad(low <= ULINT32_MASK); + return(((ib_uint64_t) high) << 32 | low); +} + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return(n & ~((ib_uint64_t) align_no - 1)); +} + +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ib_uint64_t align_1 = (ib_uint64_t) align_no - 1; + + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return((n + align_1) & ~align_1); +} + +/*********************************************************//** +The following function rounds up a pointer to the nearest aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align( +/*=====*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return((void*)((((ulint) ptr) + align_no - 1) & ~(align_no - 1))); +} + +/*********************************************************//** +The following function rounds down a pointer to the nearest +aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align_down( +/*==========*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return((void*)((((ulint) ptr)) & ~(align_no - 1))); +} + +/*********************************************************//** +The following function computes the offset of a pointer from the nearest +aligned address. +@return distance from aligned pointer */ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return(((ulint) ptr) & (align_no - 1)); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. 
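[Editorial usage sketch, not part of this commit: the alignment helpers above are pure bit arithmetic, so the expected values can be stated directly; align_no must be a power of 2. align_example is an illustrative name.]

#include "ut0byte.h"
#include "ut0dbg.h"

static void align_example(void)
{
	ib_uint64_t	lsn = ut_ull_create(1, 0);	/* == 1ULL << 32 */

	ut_a(ut_uint64_align_down(1000, 512) == 512);
	ut_a(ut_uint64_align_up(1000, 512) == 1024);

	byte	buf[1024];
	void*	frame = ut_align(buf, 512);	/* round up to boundary */

	ut_a(ut_align_offset(frame, 512) == 0);
	ut_a(ut_align_down(frame, 512) == frame);
	(void) lsn;
}
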
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n) /*!< in: nth bit requested */ +{ + ut_ad(n < 8 * sizeof(ulint)); +#if TRUE != 1 +# error "TRUE != 1" +#endif + return(1 & (a >> n)); +} + +/*****************************************************************//** +Sets the nth bit of a ulint. +@return the ulint with the bit set as requested */ +UNIV_INLINE +ulint +ut_bit_set_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n, /*!< in: nth bit requested */ + ibool val) /*!< in: value for the bit to set */ +{ + ut_ad(n < 8 * sizeof(ulint)); +#if TRUE != 1 +# error "TRUE != 1" +#endif + if (val) { + return(((ulint) 1 << n) | a); + } else { + return(~((ulint) 1 << n) & a); + } +} diff --git a/storage/xtradb/include/ut0counter.h b/storage/xtradb/include/ut0counter.h new file mode 100644 index 00000000000..fe0f36dfff2 --- /dev/null +++ b/storage/xtradb/include/ut0counter.h @@ -0,0 +1,203 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0counter.h + +Counter utility class + +Created 2012/04/12 by Sunny Bains +*******************************************************/ + +#ifndef UT0COUNTER_H +#define UT0COUNTER_H + +#include "univ.i" +#include <string.h> +#include "os0thread.h" + +/** CPU cache line size */ +#define CACHE_LINE_SIZE 64 + +/** Default number of slots to use in ib_counter_t */ +#define IB_N_SLOTS 64 + +/** Get the offset into the counter array. */ +template <typename Type, int N> +struct generic_indexer_t { + /** Default constructor/destructor should be OK. */ + + /** @return offset within m_counter */ + size_t offset(size_t index) const UNIV_NOTHROW { + return(((index % N) + 1) * (CACHE_LINE_SIZE / sizeof(Type))); + } +}; + +#ifdef HAVE_SCHED_GETCPU +#include <utmpx.h> +/** Use the cpu id to index into the counter array. If it fails then +use the thread id. */ +template <typename Type, int N> +struct get_sched_indexer_t : public generic_indexer_t<Type, N> { + /** Default constructor/destructor should be OK. */ + + /* @return result from sched_getcpu(), the thread id if it fails. */ + size_t get_rnd_index() const UNIV_NOTHROW { + + size_t cpu = sched_getcpu(); + if (cpu == -1) { + cpu = (lint) os_thread_get_curr_id(); + } + + return(cpu); + } +}; +#endif /* HAVE_SCHED_GETCPU */ + +/** Use the thread id to index into the counter array. */ +template <typename Type, int N> +struct thread_id_indexer_t : public generic_indexer_t<Type, N> { + /** Default constructor/destructor should be OK. */ + + /* @return a random number, currently we use the thread id. 
Where + thread id is represented as a pointer, it may not work as + effectively. */ + size_t get_rnd_index() const UNIV_NOTHROW { + return((lint) os_thread_get_curr_id()); + } +}; + +/** For counters where N=1 */ +template <typename Type, int N=1> +struct single_indexer_t { + /** Default constructor/destructor should be OK. */ + + /** @return offset within m_counter */ + size_t offset(size_t index) const UNIV_NOTHROW { + ut_ad(N == 1); + return((CACHE_LINE_SIZE / sizeof(Type))); + } + + /* @return 1 */ + size_t get_rnd_index() const UNIV_NOTHROW { + ut_ad(N == 1); + return(1); + } +}; + +/** Class for using fuzzy counters. The counter is not protected by any +mutex and the results are not guaranteed to be 100% accurate but close +enough. Creates an array of counters and separates each element by +CACHE_LINE_SIZE bytes */ +template < + typename Type, + int N = IB_N_SLOTS, + template<typename, int> class Indexer = thread_id_indexer_t> +class ib_counter_t { +public: + ib_counter_t() { memset(m_counter, 0x0, sizeof(m_counter)); } + + ~ib_counter_t() + { + ut_ad(validate()); + } + + bool validate() UNIV_NOTHROW { +#ifdef UNIV_DEBUG + size_t n = (CACHE_LINE_SIZE / sizeof(Type)); + + /* Check that we aren't writing outside our defined bounds. */ + for (size_t i = 0; i < UT_ARR_SIZE(m_counter); i += n) { + for (size_t j = 1; j < n - 1; ++j) { + ut_ad(m_counter[i + j] == 0); + } + } +#endif /* UNIV_DEBUG */ + return(true); + } + + /** If you can't use a good index id. Increment by 1. */ + void inc() UNIV_NOTHROW { add(1); } + + /** If you can't use a good index id. + * @param n - is the amount to increment */ + void add(Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(m_policy.get_rnd_index()); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] += n; + } + + /** Use this if you can use a unique identifier, saves a + call to get_rnd_index(). + @param i - index into a slot + @param n - amount to increment */ + void add(size_t index, Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(index); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] += n; + } + + /** If you can't use a good index id. Decrement by 1. */ + void dec() UNIV_NOTHROW { sub(1); } + + /** If you can't use a good index id. + * @param n - is the amount to decrement */ + void sub(Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(m_policy.get_rnd_index()); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] -= n; + } + + /** Use this if you can use a unique identifier, saves a + call to get_rnd_index(). + @param i - index into a slot + @param n - amount to decrement */ + void sub(size_t index, Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(index); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] -= n; + } + + /* @return total value - not 100% accurate, since it is not atomic. */ + operator Type() const UNIV_NOTHROW { + Type total = 0; + + for (size_t i = 0; i < N; ++i) { + total += m_counter[m_policy.offset(i)]; + } + + return(total); + } + +private: + /** Indexer into the array */ + Indexer<Type, N>m_policy; + + /** Slot 0 is unused. */ + Type m_counter[(N + 1) * (CACHE_LINE_SIZE / sizeof(Type))]; +}; + +#endif /* UT0COUNTER_H */ diff --git a/storage/xtradb/include/ut0crc32.h b/storage/xtradb/include/ut0crc32.h new file mode 100644 index 00000000000..86217692764 --- /dev/null +++ b/storage/xtradb/include/ut0crc32.h @@ -0,0 +1,51 @@ +/***************************************************************************** + +Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. 
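[Editorial usage sketch, not part of this commit: each add() lands in a cache-line-sized slot chosen by the indexer (the thread id by default), so concurrent writers rarely share a cache line; reading the counter sums all slots and is only approximately consistent. byte_counter_t and the function names are illustrative.]

#include "ut0counter.h"

typedef ib_counter_t<ib_uint64_t, IB_N_SLOTS> byte_counter_t;

static byte_counter_t n_bytes_written;

static void on_write(ulint len)
{
	n_bytes_written.add(len);	/* slot picked via thread id */
}

static ib_uint64_t bytes_written_so_far(void)
{
	return(n_bytes_written);	/* operator Type(): fuzzy sum */
}
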
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0crc32.h +CRC32 implementation + +Created Aug 10, 2011 Vasil Dimov +*******************************************************/ + +#ifndef ut0crc32_h +#define ut0crc32_h + +#include "univ.i" + +/********************************************************************//** +Initializes the data structures used by ut_crc32(). Does not do any +allocations, would not hurt if called twice, but would be pointless. */ +UNIV_INTERN +void +ut_crc32_init(); +/*===========*/ + +/********************************************************************//** +Calculates CRC32. +@param ptr - data over which to calculate CRC32. +@param len - data length in bytes. +@return CRC32 (CRC-32C, using the GF(2) primitive polynomial 0x11EDC6F41, +or 0x1EDC6F41 without the high-order bit) */ +typedef ib_uint32_t (*ib_ut_crc32_t)(const byte* ptr, ulint len); + +extern ib_ut_crc32_t ut_crc32; + +extern bool ut_crc32_sse2_enabled; + +#endif /* ut0crc32_h */ diff --git a/storage/xtradb/include/ut0dbg.h b/storage/xtradb/include/ut0dbg.h new file mode 100644 index 00000000000..6a4afe99597 --- /dev/null +++ b/storage/xtradb/include/ut0dbg.h @@ -0,0 +1,132 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*****************************************************************//** +@file include/ut0dbg.h +Debug utilities for Innobase + +Created 1/30/1994 Heikki Tuuri +**********************************************************************/ + +#ifndef ut0dbg_h +#define ut0dbg_h + +#ifdef UNIV_INNOCHECKSUM +#define ut_a assert +#define ut_ad assert +#define ut_error assert(0) +#else /* !UNIV_INNOCHECKSUM */ + +#include "univ.i" +#include <stdlib.h> +#include "os0thread.h" + +#if defined(__GNUC__) && (__GNUC__ > 2) +/** Test if an assertion fails. 
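[Editorial usage sketch, not part of this commit: ut_crc32 is a function pointer that ut_crc32_init() points at either a hardware-accelerated or a software implementation; it is normally initialized once at server startup. checksum_page is an illustrative name.]

#include "ut0crc32.h"

static ib_uint32_t checksum_page(const byte* page, ulint len)
{
	ut_crc32_init();	/* idempotent; usually done at startup */

	return(ut_crc32(page, len));
}
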
+@param EXPR assertion expression +@return nonzero if EXPR holds, zero if not */ +# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR))) +#else +/** This is used to eliminate compiler warnings */ +extern ulint ut_dbg_zero; +/** Test if an assertion fails. +@param EXPR assertion expression +@return nonzero if EXPR holds, zero if not */ +# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero) +#endif + +/*************************************************************//** +Report a failed assertion. */ +UNIV_INTERN +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /*!< in: the failed assertion */ + const char* file, /*!< in: source file containing the assertion */ + ulint line) /*!< in: line number of the assertion */ + UNIV_COLD __attribute__((nonnull(2))); + +/** Abort the execution. */ +# define UT_DBG_PANIC abort() + +/** Abort execution if EXPR does not evaluate to nonzero. +@param EXPR assertion expression that should hold */ +#define ut_a(EXPR) do { \ + if (UT_DBG_FAIL(EXPR)) { \ + ut_dbg_assertion_failed(#EXPR, \ + __FILE__, (ulint) __LINE__); \ + UT_DBG_PANIC; \ + } \ +} while (0) + +/** Abort execution. */ +#define ut_error do { \ + ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__); \ + UT_DBG_PANIC; \ +} while (0) + +#ifdef UNIV_DEBUG +/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_ad(EXPR) ut_a(EXPR) +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) do {EXPR;} while (0) +#else +/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_ad(EXPR) +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) +#endif + +/** Silence warnings about an unused variable by doing a null assignment. +@param A the unused variable */ +#define UT_NOT_USED(A) A = A + +#ifdef UNIV_COMPILE_TEST_FUNCS + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +/** structure used for recording usage statistics */ +struct speedo_t { + struct rusage ru; /*!< getrusage() result */ + struct timeval tv; /*!< gettimeofday() result */ +}; + +/*******************************************************************//** +Resets a speedo (records the current time in it). */ +UNIV_INTERN +void +speedo_reset( +/*=========*/ + speedo_t* speedo); /*!< out: speedo */ + +/*******************************************************************//** +Shows the time elapsed and usage statistics since the last reset of a +speedo. */ +UNIV_INTERN +void +speedo_show( +/*========*/ + const speedo_t* speedo); /*!< in: speedo */ + +#endif /* UNIV_COMPILE_TEST_FUNCS */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/xtradb/include/ut0list.h b/storage/xtradb/include/ut0list.h new file mode 100644 index 00000000000..29fc8669ce4 --- /dev/null +++ b/storage/xtradb/include/ut0list.h @@ -0,0 +1,180 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
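[Editorial usage sketch, not part of this commit: ut_a() is compiled into release builds and aborts via ut_dbg_assertion_failed(); ut_ad() and ut_d() disappear entirely unless UNIV_DEBUG is defined. checked_div is an illustrative name.]

#include "ut0dbg.h"

static ulint checked_div(ulint a, ulint b)
{
	ut_a(b > 0);	/* hard invariant, checked in release builds too */
	ut_ad(a >= b);	/* debug-only expectation */

	return(a / b);
}
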
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.h +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A double-linked list. This differs from the one in ut0lst.h in that in this +one, each list node contains a pointer to the data, whereas the one in +ut0lst.h uses a strategy where the list pointers are embedded in the data +items themselves. + +Use this one when you need to store arbitrary data in the list where you +can't embed the list pointers in the data, if a data item needs to be +stored in multiple lists, etc. + +Note about the memory management: ib_list_t is a fixed-size struct whose +allocation/deallocation is done through ib_list_create/ib_list_free, but the +memory for the list nodes is allocated through a user-given memory heap, +which can either be the same for all nodes or vary per node. Most users will +probably want to create a memory heap to store the item-specific data, and +pass in this same heap to the list node creation functions, thus +automatically freeing the list node when the item's heap is freed. + +************************************************************************/ + +#ifndef IB_LIST_H +#define IB_LIST_H + +#include "mem0mem.h" + +struct ib_list_t; +struct ib_list_node_t; + +/****************************************************************//** +Create a new list using mem_alloc. Lists created with this function must be +freed with ib_list_free. +@return list */ +UNIV_INTERN +ib_list_t* +ib_list_create(void); +/*=================*/ + + +/****************************************************************//** +Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for +lists created with this function. +@return list */ +UNIV_INTERN +ib_list_t* +ib_list_create_heap( +/*================*/ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Free a list. */ +UNIV_INTERN +void +ib_list_free( +/*=========*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Add the data to the start of the list. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_first( +/*==============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Add the data to the end of the list. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_last( +/*=============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Add the data after the indicated node. 
+@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_after( +/*==============*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* prev_node, /*!< in: node preceding new node (can + be NULL) */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Remove the node from the list. */ +UNIV_INTERN +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* node); /*!< in: node to remove */ + +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list); /*!< in: list */ + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list); /* in: list */ + +/* List. */ +struct ib_list_t { + ib_list_node_t* first; /*!< first node */ + ib_list_node_t* last; /*!< last node */ + ibool is_heap_list; /*!< TRUE if this list was + allocated through a heap */ +}; + +/* A list node. */ +struct ib_list_node_t { + ib_list_node_t* prev; /*!< previous node */ + ib_list_node_t* next; /*!< next node */ + void* data; /*!< user data */ +}; + +/* Quite often, the only additional piece of data you need is the per-item +memory heap, so we have this generic struct available to use in those +cases. */ +struct ib_list_helper_t { + mem_heap_t* heap; /*!< memory heap */ + void* data; /*!< user data */ +}; + +#ifndef UNIV_NONINL +#include "ut0list.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0list.ic b/storage/xtradb/include/ut0list.ic new file mode 100644 index 00000000000..d9dcb2eac99 --- /dev/null +++ b/storage/xtradb/include/ut0list.ic @@ -0,0 +1,60 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.ic +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/****************************************************************//** +Get the first node in the list. 
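[Editorial usage sketch, not part of this commit: the nodes are allocated from a caller-supplied heap, so freeing that heap releases the nodes together with the item data, as the note above describes. list_example is an illustrative name.]

#include "ut0list.h"

static void list_example(void)
{
	mem_heap_t*	heap = mem_heap_create(256);
	ib_list_t*	list = ib_list_create();

	int*	item = (int*) mem_heap_alloc(heap, sizeof(int));
	*item = 42;

	ib_list_add_last(list, item, heap);	/* node comes from heap */

	for (ib_list_node_t* node = ib_list_get_first(list);
	     node != NULL;
	     node = node->next) {
		/* node->data points at the stored item */
	}

	ib_list_free(list);	/* created with ib_list_create() */
	mem_heap_free(heap);	/* frees nodes and items together */
}
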
+@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->first); +} + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->last); +} + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list) /* in: list */ +{ + return(!(list->first || list->last)); +} diff --git a/storage/xtradb/include/ut0lst.h b/storage/xtradb/include/ut0lst.h new file mode 100644 index 00000000000..b53e7ade4c1 --- /dev/null +++ b/storage/xtradb/include/ut0lst.h @@ -0,0 +1,408 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0lst.h +List utilities + +Created 9/10/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0lst_h +#define ut0lst_h + +#include "univ.i" + +/*******************************************************************//** +Return offset of F in POD T. +@param T - POD pointer +@param F - Field in T */ +#define IB_OFFSETOF(T, F) \ + (reinterpret_cast<byte*>(&(T)->F) - reinterpret_cast<byte*>(T)) + +/* This module implements the two-way linear list which should be used +if a list is used in the database. Note that a single struct may belong +to two or more lists, provided that the lists are given different names. +An example of the usage of the lists can be found in fil0fil.cc. */ + +/*******************************************************************//** +This macro expands to the unnamed type definition of a struct which acts +as the two-way list base node. The base node contains pointers +to both ends of the list and a count of nodes in the list (excluding +the base node from the count). +@param TYPE the name of the list node data type */ +template <typename TYPE> +struct ut_list_base { + typedef TYPE elem_type; + + ulint count; /*!< count of nodes in list */ + TYPE* start; /*!< pointer to list start, NULL if empty */ + TYPE* end; /*!< pointer to list end, NULL if empty */ +}; + +#define UT_LIST_BASE_NODE_T(TYPE) ut_list_base<TYPE> + +/*******************************************************************//** +This macro expands to the unnamed type definition of a struct which +should be embedded in the nodes of the list, the node type must be a struct. 
+This struct contains the pointers to next and previous nodes in the list. +The name of the field in the node struct should be the name given +to the list. +@param TYPE the list node type name */ +/* Example: +struct LRU_node_t { + UT_LIST_NODE_T(LRU_node_t) LRU_list; + ... +} +The example implements an LRU list of name LRU_list. Its nodes are of type +LRU_node_t. */ + +template <typename TYPE> +struct ut_list_node { + TYPE* prev; /*!< pointer to the previous node, + NULL if start of list */ + TYPE* next; /*!< pointer to next node, NULL if end of list */ +}; + +#define UT_LIST_NODE_T(TYPE) ut_list_node<TYPE> + +/*******************************************************************//** +Get the list node at offset. +@param elem - list element +@param offset - offset within element. +@return reference to list node. */ +template <typename Type> +ut_list_node<Type>& +ut_elem_get_node(Type& elem, size_t offset) +{ + ut_a(offset < sizeof(elem)); + + return(*reinterpret_cast<ut_list_node<Type>*>( + reinterpret_cast<byte*>(&elem) + offset)); +} + +/*******************************************************************//** +Initializes the base node of a two-way list. +@param BASE the list base node +*/ +#define UT_LIST_INIT(BASE)\ +{\ + (BASE).count = 0;\ + (BASE).start = NULL;\ + (BASE).end = NULL;\ +}\ + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param list the base node (not a pointer to it) +@param elem the element to add +@param offset offset of list node in elem. */ +template <typename List, typename Type> +void +ut_list_prepend( + List& list, + Type& elem, + size_t offset) +{ + ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset); + + elem_node.prev = 0; + elem_node.next = list.start; + + if (list.start != 0) { + ut_list_node<Type>& base_node = + ut_elem_get_node(*list.start, offset); + + ut_ad(list.start != &elem); + + base_node.prev = &elem; + } + + list.start = &elem; + + if (list.end == 0) { + list.end = &elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param NAME list name +@param LIST the base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_FIRST(NAME, LIST, ELEM) \ + ut_list_prepend(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME)) + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param list list +@param elem the element to add +@param offset offset of list node in elem */ +template <typename List, typename Type> +void +ut_list_append( + List& list, + Type& elem, + size_t offset) +{ + ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset); + + elem_node.next = 0; + elem_node.prev = list.end; + + if (list.end != 0) { + ut_list_node<Type>& base_node = + ut_elem_get_node(*list.end, offset); + + ut_ad(list.end != &elem); + + base_node.next = &elem; + } + + list.end = &elem; + + if (list.start == 0) { + list.start = &elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. 
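[Editorial usage sketch, not part of this commit, expanding the LRU_node_t example from the comment above: the links are embedded in the element itself, and the add/get/remove macros declared in this file take the link field name as their first argument. Names are illustrative.]

struct LRU_node_t {
	int				value;
	UT_LIST_NODE_T(LRU_node_t)	LRU_list;	/* embedded links */
};

static UT_LIST_BASE_NODE_T(LRU_node_t)	lru;

static void lru_example(LRU_node_t* node)
{
	UT_LIST_INIT(lru);
	UT_LIST_ADD_LAST(LRU_list, lru, node);

	for (LRU_node_t* n = UT_LIST_GET_FIRST(lru);
	     n != NULL;
	     n = UT_LIST_GET_NEXT(LRU_list, n)) {
		/* visit n */
	}

	UT_LIST_REMOVE(LRU_list, lru, node);
}
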
+@param NAME list name +@param LIST list +@param ELEM the element to add */ +#define UT_LIST_ADD_LAST(NAME, LIST, ELEM)\ + ut_list_append(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME)) + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after elem1 +@param offset offset of list node in elem1 and elem2 */ +template <typename List, typename Type> +void +ut_list_insert( + List& list, + Type& elem1, + Type& elem2, + size_t offset) +{ + ut_ad(&elem1 != &elem2); + + ut_list_node<Type>& elem1_node = ut_elem_get_node(elem1, offset); + ut_list_node<Type>& elem2_node = ut_elem_get_node(elem2, offset); + + elem2_node.prev = &elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + ut_list_node<Type>& next_node = + ut_elem_get_node(*elem1_node.next, offset); + + next_node.prev = &elem2; + } + + elem1_node.next = &elem2; + + if (list.end == &elem1) { + list.end = &elem2; + } + + ++list.count; +} + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param NAME list name +@param LIST the base node +@param ELEM1 node after which ELEM2 is inserted +@param ELEM2 node being inserted after ELEM1 */ +#define UT_LIST_INSERT_AFTER(NAME, LIST, ELEM1, ELEM2)\ + ut_list_insert(LIST, *ELEM1, *ELEM2, IB_OFFSETOF(ELEM1, NAME)) + +#ifdef UNIV_LIST_DEBUG +/** Invalidate the pointers in a list node. +@param NAME list name +@param N pointer to the node that was removed */ +# define UT_LIST_REMOVE_CLEAR(N) \ + (N).next = (Type*) -1; \ + (N).prev = (N).next +#else +/** Invalidate the pointers in a list node. +@param NAME list name +@param N pointer to the node that was removed */ +# define UT_LIST_REMOVE_CLEAR(N) +#endif /* UNIV_LIST_DEBUG */ + +/*******************************************************************//** +Removes a node from a two-way linked list. +@param list the base node (not a pointer to it) +@param elem node to be removed from the list +@param offset offset of list node within elem */ +template <typename List, typename Type> +void +ut_list_remove( + List& list, + Type& elem, + size_t offset) +{ + ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset); + + ut_a(list.count > 0); + + if (elem_node.next != NULL) { + ut_list_node<Type>& next_node = + ut_elem_get_node(*elem_node.next, offset); + + next_node.prev = elem_node.prev; + } else { + list.end = elem_node.prev; + } + + if (elem_node.prev != NULL) { + ut_list_node<Type>& prev_node = + ut_elem_get_node(*elem_node.prev, offset); + + prev_node.next = elem_node.next; + } else { + list.start = elem_node.next; + } + + UT_LIST_REMOVE_CLEAR(elem_node); + + --list.count; +} + +/*******************************************************************//** +Removes a node from a two-way linked list. +@param NAME list name +@param LIST the base node (not a pointer to it) +@param ELEM node to be removed from the list */ +#define UT_LIST_REMOVE(NAME, LIST, ELEM) \ + ut_list_remove(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME)) + +/********************************************************************//** +Gets the next node in a two-way list. +@param NAME list name +@param N pointer to a node +@return the successor of N in NAME, or NULL */ +#define UT_LIST_GET_NEXT(NAME, N)\ + (((N)->NAME).next) + +/********************************************************************//** +Gets the previous node in a two-way list. 
+@param NAME list name +@param N pointer to a node +@return the predecessor of N in NAME, or NULL */ +#define UT_LIST_GET_PREV(NAME, N)\ + (((N)->NAME).prev) + +/********************************************************************//** +Alternative macro to get the number of nodes in a two-way list, i.e., +its length. +@param BASE the base node (not a pointer to it). +@return the number of nodes in the list */ +#define UT_LIST_GET_LEN(BASE)\ + (BASE).count + +/********************************************************************//** +Gets the first node in a two-way list. +@param BASE the base node (not a pointer to it) +@return first node, or NULL if the list is empty */ +#define UT_LIST_GET_FIRST(BASE)\ + (BASE).start + +/********************************************************************//** +Gets the last node in a two-way list. +@param BASE the base node (not a pointer to it) +@return last node, or NULL if the list is empty */ +#define UT_LIST_GET_LAST(BASE)\ + (BASE).end + +struct NullValidate { void operator()(const void* elem) { } }; + +/********************************************************************//** +Iterate over all the elements and call the functor for each element. +@param list base node (not a pointer to it) +@param functor Functor that is called for each element in the list +@param node pointer to member node within list element */ +template <typename List, class Functor> +void +ut_list_map( + List& list, + ut_list_node<typename List::elem_type> + List::elem_type::*node, + Functor functor) +{ + ulint count = 0; + + for (typename List::elem_type* elem = list.start; + elem != 0; + elem = (elem->*node).next, ++count) { + + functor(elem); + } + + ut_a(count == list.count); +} + +/********************************************************************//** +Checks the consistency of a two-way list. +@param list base node (not a pointer to it) +@param functor Functor that is called for each element in the list +@param node pointer to member node within list element */ +template <typename List, class Functor> +void +ut_list_validate( + List& list, + ut_list_node<typename List::elem_type> + List::elem_type::*node, + Functor functor = NullValidate()) +{ + ut_list_map(list, node, functor); + + ulint count = 0; + + for (typename List::elem_type* elem = list.end; + elem != 0; + elem = (elem->*node).prev, ++count) { + + functor(elem); + } + + ut_a(count == list.count); +} + +/********************************************************************//** +Checks the consistency of a two-way list. +@param NAME the name of the list +@param TYPE node type +@param LIST base node (not a pointer to it) +@param FUNCTOR called for each list element */ +#define UT_LIST_VALIDATE(NAME, TYPE, LIST, FUNCTOR) \ + ut_list_validate(LIST, &TYPE::NAME, FUNCTOR) + +#define UT_LIST_CHECK(NAME, TYPE, LIST) \ + ut_list_validate(LIST, &TYPE::NAME, NullValidate()) + +#endif /* ut0lst.h */ diff --git a/storage/xtradb/include/ut0mem.h b/storage/xtradb/include/ut0mem.h new file mode 100644 index 00000000000..af7eb4e9b1d --- /dev/null +++ b/storage/xtradb/include/ut0mem.h @@ -0,0 +1,261 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
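[Editorial usage sketch, not part of this commit, reusing LRU_node_t and lru from the sketch earlier in this file: ut_list_validate() walks the list in both directions, checks both walk lengths against the stored count, and applies the functor to every element. CheckValue is an illustrative functor.]

struct CheckValue {
	void operator()(const LRU_node_t* elem) const
	{
		ut_a(elem->value >= 0);	/* per-element invariant */
	}
};

static void lru_validate(void)
{
	UT_LIST_VALIDATE(LRU_list, LRU_node_t, lru, CheckValue());
}
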
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0mem.h +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#ifndef ut0mem_h +#define ut0mem_h + +#include "univ.i" +#include <string.h> +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" + +/** The total amount of memory currently allocated from the operating +system with os_mem_alloc_large() or malloc(). Does not count malloc() +if srv_use_sys_malloc is set. Protected by ut_list_mutex. */ +extern ulint ut_total_allocated_memory; + +/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */ +extern os_fast_mutex_t ut_list_mutex; +#endif /* !UNIV_HOTBACKUP */ + +/** Wrapper for memcpy(3). Copy memory area when the source and +target are not overlapping. +* @param dest in: copy to +* @param sour in: copy from +* @param n in: number of bytes to copy +* @return dest */ +UNIV_INLINE +void* +ut_memcpy(void* dest, const void* sour, ulint n); + +/** Wrapper for memmove(3). Copy memory area when the source and +target are overlapping. +* @param dest in: copy to +* @param sour in: copy from +* @param n in: number of bytes to copy +* @return dest */ +UNIV_INLINE +void* +ut_memmove(void* dest, const void* sour, ulint n); + +/** Wrapper for memcmp(3). Compare memory areas. +* @param str1 in: first memory block to compare +* @param str2 in: second memory block to compare +* @param n in: number of bytes to compare +* @return negative, 0, or positive if str1 is smaller, equal, + or greater than str2, respectively. */ +UNIV_INLINE +int +ut_memcmp(const void* str1, const void* str2, ulint n); + +/**********************************************************************//** +Initializes the mem block list at database startup. */ +UNIV_INTERN +void +ut_mem_init(void); +/*=============*/ + +/**********************************************************************//** +Allocates memory. +@return own: allocated memory */ +UNIV_INTERN +void* +ut_malloc_low( +/*==========*/ + ulint n, /*!< in: number of bytes to allocate */ + ibool assert_on_error) /*!< in: if TRUE, we crash mysqld if + the memory cannot be allocated */ + __attribute__((malloc)); +/**********************************************************************//** +Allocates memory. */ +#define ut_malloc(n) ut_malloc_low(n, TRUE) +/**********************************************************************//** +Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is +a nop. */ +UNIV_INTERN +void +ut_free( +/*====*/ + void* ptr); /*!< in, own: memory block, can be NULL */ +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not +use this function because the allocation functions in mem0mem.h are the +recommended ones in InnoDB. 
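[Editorial usage sketch, not part of this commit: ut_malloc() crashes the server on allocation failure (it is ut_malloc_low() with assert_on_error == TRUE), ut_free(NULL) is a no-op, and ut_realloc(), declared just below, keeps the surviving contents. buffer_example is an illustrative name.]

#include "ut0mem.h"

static void buffer_example(void)
{
	char*	buf = (char*) ut_malloc(128);	/* aborts rather than
						return NULL */

	buf = (char*) ut_realloc(buf, 256);	/* grow, contents kept */

	ut_free(buf);
}
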
+ +man realloc in Linux, 2004: + + realloc() changes the size of the memory block pointed to + by ptr to size bytes. The contents will be unchanged to + the minimum of the old and new sizes; newly allocated mem- + ory will be uninitialized. If ptr is NULL, the call is + equivalent to malloc(size); if size is equal to zero, the + call is equivalent to free(ptr). Unless ptr is NULL, it + must have been returned by an earlier call to malloc(), + calloc() or realloc(). + +RETURN VALUE + realloc() returns a pointer to the newly allocated memory, + which is suitably aligned for any kind of variable and may + be different from ptr, or NULL if the request fails. If + size was equal to 0, either NULL or a pointer suitable to + be passed to free() is returned. If realloc() fails the + original block is left untouched - it is not freed or + moved. +@return own: pointer to new mem block or NULL */ +UNIV_INTERN +void* +ut_realloc( +/*=======*/ + void* ptr, /*!< in: pointer to old block or NULL */ + ulint size); /*!< in: desired size */ +/**********************************************************************//** +Frees in shutdown all allocated memory not freed yet. */ +UNIV_INTERN +void +ut_free_all_mem(void); +/*=================*/ +#endif /* !UNIV_HOTBACKUP */ + +/** Wrapper for strcpy(3). Copy a NUL-terminated string. +* @param dest in: copy to +* @param sour in: copy from +* @return dest */ +UNIV_INLINE +char* +ut_strcpy(char* dest, const char* sour); + +/** Wrapper for strlen(3). Determine the length of a NUL-terminated string. +* @param str in: string +* @return length of the string in bytes, excluding the terminating NUL */ +UNIV_INLINE +ulint +ut_strlen(const char* str); + +/** Wrapper for strcmp(3). Compare NUL-terminated strings. +* @param str1 in: first string to compare +* @param str2 in: second string to compare +* @return negative, 0, or positive if str1 is smaller, equal, + or greater than str2, respectively. */ +UNIV_INLINE +int +ut_strcmp(const char* str1, const char* str2); + +/**********************************************************************//** +Copies up to size - 1 characters from the NUL-terminated string src to +dst, NUL-terminating the result. Returns strlen(src), so truncation +occurred if the return value >= size. +@return strlen(src) */ +UNIV_INTERN +ulint +ut_strlcpy( +/*=======*/ + char* dst, /*!< in: destination buffer */ + const char* src, /*!< in: source buffer */ + ulint size); /*!< in: size of destination buffer */ + +/**********************************************************************//** +Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last +(size - 1) bytes of src, not the first. +@return strlen(src) */ +UNIV_INTERN +ulint +ut_strlcpy_rev( +/*===========*/ + char* dst, /*!< in: destination buffer */ + const char* src, /*!< in: source buffer */ + ulint size); /*!< in: size of destination buffer */ + +/**********************************************************************//** +Return the number of times s2 occurs in s1. Overlapping instances of s2 +are only counted once. +@return the number of times s2 occurs in s1 */ +UNIV_INTERN +ulint +ut_strcount( +/*========*/ + const char* s1, /*!< in: string to search in */ + const char* s2); /*!< in: string to search for */ + +/**********************************************************************//** +Replace every occurrence of s1 in str with s2. Overlapping instances of s1 +are only replaced once. 
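[Editorial usage sketch, not part of this commit: because ut_strlcpy() returns strlen(src), comparing the result against the buffer size is the documented truncation test. copy_name is an illustrative name.]

#include "ut0mem.h"

static void copy_name(const char* name)
{
	char	buf[16];

	if (ut_strlcpy(buf, name, sizeof(buf)) >= sizeof(buf)) {
		/* name did not fit; buf holds a NUL-terminated
		truncated prefix */
	}
}
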
+@return own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+	const char*	str,	/*!< in: string to operate on */
+	const char*	s1,	/*!< in: string to replace */
+	const char*	s2);	/*!< in: string to replace s1 with */
+
+/********************************************************************
+Concatenate 3 strings.*/
+
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with mem_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3);	/* in: string 3 */
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+	const void*	raw,		/*!< in: raw data */
+	ulint		raw_size,	/*!< in: "raw" length in bytes */
+	char*		hex,		/*!< out: hex string */
+	ulint		hex_size);	/*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+	const char*	str,		/*!< in: string */
+	ulint		str_len,	/*!< in: string length in bytes */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size);	/*!< in: output buffer size
+					in bytes */
+
+#ifndef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0mem.ic b/storage/xtradb/include/ut0mem.ic
new file mode 100644
index 00000000000..5c9071d52cc
--- /dev/null
+++ b/storage/xtradb/include/ut0mem.ic
@@ -0,0 +1,317 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.ic
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/** Wrapper for memcpy(3). Copy memory area when the source and
+target are not overlapping.
+* @param dest	in: copy to
+* @param sour	in: copy from
+* @param n	in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n)
+{
+	return(memcpy(dest, sour, n));
+}
+
+/** Wrapper for memmove(3). Copy memory area when the source and
+target are overlapping.
+* @param dest	in: copy to
+* @param sour	in: copy from
+* @param n	in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n)
+{
+	return(memmove(dest, sour, n));
+}
+
+/** Wrapper for memcmp(3). Compare memory areas.
+* @param str1	in: first memory block to compare
+* @param str2	in: second memory block to compare
+* @param n	in: number of bytes to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+	or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n)
+{
+	return(memcmp(str1, str2, n));
+}
+
+/** Wrapper for strcpy(3). Copy a NUL-terminated string.
+* @param dest	in: copy to
+* @param sour	in: copy from
+* @return dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour)
+{
+	return(strcpy(dest, sour));
+}
+
+/** Wrapper for strlen(3). Determine the length of a NUL-terminated string.
+* @param str	in: string
+* @return length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str)
+{
+	return(strlen(str));
+}
+
+/** Wrapper for strcmp(3). Compare NUL-terminated strings.
+* @param str1	in: first string to compare
+* @param str2	in: second string to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+	or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2)
+{
+	return(strcmp(str1, str2));
+}
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
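+
+A sizing sketch (editor's illustration, not from the original sources;
+hex_size of 2 * raw_size + 1 = 9 avoids truncation):
+
+	byte	raw[4] = {0xDE, 0xAD, 0xBE, 0xEF};
+	char	hex[9];
+	ulint	n = ut_raw_to_hex(raw, sizeof(raw), hex, sizeof(hex));
+
+Afterwards hex holds "DEADBEEF" and n == 9 (the NUL included).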
+@return number of chars written */ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + const void* raw, /*!< in: raw data */ + ulint raw_size, /*!< in: "raw" length in bytes */ + char* hex, /*!< out: hex string */ + ulint hex_size) /*!< in: "hex" size in bytes */ +{ + +#ifdef WORDS_BIGENDIAN + +#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b)) + +#define UINT16_GET_A(u) ((unsigned char) ((u) >> 8)) +#define UINT16_GET_B(u) ((unsigned char) ((u) & 0xFF)) + +#else /* WORDS_BIGENDIAN */ + +#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a)) + +#define UINT16_GET_A(u) ((unsigned char) ((u) & 0xFF)) +#define UINT16_GET_B(u) ((unsigned char) ((u) >> 8)) + +#endif /* WORDS_BIGENDIAN */ + +#define MK_ALL_UINT16_WITH_A(a) \ + MK_UINT16(a, '0'), \ + MK_UINT16(a, '1'), \ + MK_UINT16(a, '2'), \ + MK_UINT16(a, '3'), \ + MK_UINT16(a, '4'), \ + MK_UINT16(a, '5'), \ + MK_UINT16(a, '6'), \ + MK_UINT16(a, '7'), \ + MK_UINT16(a, '8'), \ + MK_UINT16(a, '9'), \ + MK_UINT16(a, 'A'), \ + MK_UINT16(a, 'B'), \ + MK_UINT16(a, 'C'), \ + MK_UINT16(a, 'D'), \ + MK_UINT16(a, 'E'), \ + MK_UINT16(a, 'F') + + static const uint16 hex_map[256] = { + MK_ALL_UINT16_WITH_A('0'), + MK_ALL_UINT16_WITH_A('1'), + MK_ALL_UINT16_WITH_A('2'), + MK_ALL_UINT16_WITH_A('3'), + MK_ALL_UINT16_WITH_A('4'), + MK_ALL_UINT16_WITH_A('5'), + MK_ALL_UINT16_WITH_A('6'), + MK_ALL_UINT16_WITH_A('7'), + MK_ALL_UINT16_WITH_A('8'), + MK_ALL_UINT16_WITH_A('9'), + MK_ALL_UINT16_WITH_A('A'), + MK_ALL_UINT16_WITH_A('B'), + MK_ALL_UINT16_WITH_A('C'), + MK_ALL_UINT16_WITH_A('D'), + MK_ALL_UINT16_WITH_A('E'), + MK_ALL_UINT16_WITH_A('F') + }; + const unsigned char* rawc; + ulint read_bytes; + ulint write_bytes; + ulint i; + + rawc = (const unsigned char*) raw; + + if (hex_size == 0) { + + return(0); + } + + if (hex_size <= 2 * raw_size) { + + read_bytes = hex_size / 2; + write_bytes = hex_size; + } else { + + read_bytes = raw_size; + write_bytes = 2 * raw_size + 1; + } + +#define LOOP_READ_BYTES(ASSIGN) \ + for (i = 0; i < read_bytes; i++) { \ + ASSIGN; \ + hex += 2; \ + rawc++; \ + } + + if (ut_align_offset(hex, 2) == 0) { + + LOOP_READ_BYTES( + *(uint16*) hex = hex_map[*rawc] + ); + } else { + + LOOP_READ_BYTES( + *hex = UINT16_GET_A(hex_map[*rawc]); + *(hex + 1) = UINT16_GET_B(hex_map[*rawc]) + ); + } + + if (hex_size <= 2 * raw_size && hex_size % 2 == 0) { + + hex--; + } + + *hex = '\0'; + + return(write_bytes); +} + +/*******************************************************************//** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. Returns the number of bytes that were written to "buf" +(including the terminating NUL). If buf_size is too small then the +trailing bytes from "str" are discarded. 
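+For example (editor's note): with a large enough buffer, abc is formatted
+as 'abc' and 6 is returned, while ab'c becomes 'ab''c' and 8 is returned.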
+@return number of bytes that were written */ +UNIV_INLINE +ulint +ut_str_sql_format( +/*==============*/ + const char* str, /*!< in: string */ + ulint str_len, /*!< in: string length in bytes */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint str_i; + ulint buf_i; + + buf_i = 0; + + switch (buf_size) { + case 3: + + if (str_len == 0) { + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\''; + buf_i++; + } + /* FALLTHROUGH */ + case 2: + case 1: + + buf[buf_i] = '\0'; + buf_i++; + /* FALLTHROUGH */ + case 0: + + return(buf_i); + } + + /* buf_size >= 4 */ + + buf[0] = '\''; + buf_i = 1; + + for (str_i = 0; str_i < str_len; str_i++) { + + char ch; + + if (buf_size - buf_i == 2) { + + break; + } + + ch = str[str_i]; + + switch (ch) { + case '\0': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = '\\'; + buf_i++; + buf[buf_i] = '0'; + buf_i++; + break; + case '\'': + case '\\': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = ch; + buf_i++; + /* FALLTHROUGH */ + default: + + buf[buf_i] = ch; + buf_i++; + } + } + +func_exit: + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\0'; + buf_i++; + + return(buf_i); +} diff --git a/storage/xtradb/include/ut0rbt.h b/storage/xtradb/include/ut0rbt.h new file mode 100644 index 00000000000..5c25104b5d7 --- /dev/null +++ b/storage/xtradb/include/ut0rbt.h @@ -0,0 +1,346 @@ +/***************************************************************************//** + +Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/******************************************************************//** +@file include/ut0rbt.h +Various utilities + +Created 2007-03-20 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_UT0RBT_H +#define INNOBASE_UT0RBT_H + +#if !defined(IB_RBT_TESTING) +#include "univ.i" +#include "ut0mem.h" +#else +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#define ut_malloc malloc +#define ut_free free +#define ulint unsigned long +#define ut_a(c) assert(c) +#define ut_error assert(0) +#define ibool unsigned int +#define TRUE 1 +#define FALSE 0 +#endif + +struct ib_rbt_node_t; +typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node); +typedef int (*ib_rbt_compare)(const void* p1, const void* p2); +typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2); + +/** Red black tree color types */ +enum ib_rbt_color_t { + IB_RBT_RED, + IB_RBT_BLACK +}; + +/** Red black tree node */ +struct ib_rbt_node_t { + ib_rbt_color_t color; /* color of this node */ + + ib_rbt_node_t* left; /* points left child */ + ib_rbt_node_t* right; /* points right child */ + ib_rbt_node_t* parent; /* points parent node */ + + char value[1]; /* Data value */ +}; + +/** Red black tree instance.*/ +struct ib_rbt_t { + ib_rbt_node_t* nil; /* Black colored node that is + used as a sentinel. This is + pre-allocated too.*/ + + ib_rbt_node_t* root; /* Root of the tree, this is + pre-allocated and the first + data node is the left child.*/ + + ulint n_nodes; /* Total number of data nodes */ + + ib_rbt_compare compare; /* Fn. to use for comparison */ + ib_rbt_arg_compare + compare_with_arg; /* Fn. to use for comparison + with argument */ + ulint sizeof_value; /* Sizeof the item in bytes */ + void* cmp_arg; /* Compare func argument */ +}; + +/** The result of searching for a key in the tree, this is useful for +a speedy lookup and insert if key doesn't exist.*/ +struct ib_rbt_bound_t { + const ib_rbt_node_t* + last; /* Last node visited */ + + int result; /* Result of comparing with + the last non-nil node that + was visited */ +}; + +/* Size in elements (t is an rb tree instance) */ +#define rbt_size(t) (t->n_nodes) + +/* Check whether the rb tree is empty (t is an rb tree instance) */ +#define rbt_empty(t) (rbt_size(t) == 0) + +/* Get data value (t is the data type, n is an rb tree node instance) */ +#define rbt_value(t, n) ((t*) &n->value[0]) + +/* Compare a key with the node value (t is tree, k is key, n is node)*/ +#define rbt_compare(t, k, n) (t->compare(k, n->value)) + +/* Node size. FIXME: name might clash, but currently it does not, so for easier + maintenance do not rename it for now. 
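+ Editor's note: the - 1 below compensates for the one byte that the
+ value[1] member already contributes to sizeof(ib_rbt_node_t), so a node
+ carrying t->sizeof_value bytes of payload occupies exactly
+ sizeof(ib_rbt_node_t) + t->sizeof_value - 1 bytes.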
*/ +#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1) + +/**********************************************************************//** +Free an instance of a red black tree */ +UNIV_INTERN +void +rbt_free( +/*=====*/ + ib_rbt_t* tree); /*!< in: rb tree to free */ +/**********************************************************************//** +Create an instance of a red black tree +@return rb tree instance */ +UNIV_INTERN +ib_rbt_t* +rbt_create( +/*=======*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_compare compare); /*!< in: comparator */ +/**********************************************************************//** +Create an instance of a red black tree, whose comparison function takes +an argument +@return rb tree instance */ +UNIV_INTERN +ib_rbt_t* +rbt_create_arg_cmp( +/*===============*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_arg_compare + compare, /*!< in: comparator */ + void* cmp_arg); /*!< in: compare fn arg */ +/**********************************************************************//** +Delete a node from the red black tree, identified by key */ +UNIV_INTERN +ibool +rbt_delete( +/*=======*/ + /* in: TRUE on success */ + ib_rbt_t* tree, /* in: rb tree */ + const void* key); /* in: key to delete */ +/**********************************************************************//** +Remove a node from the red black tree, NOTE: This function will not delete +the node instance, THAT IS THE CALLERS RESPONSIBILITY. +@return the deleted node with the const. */ +UNIV_INTERN +ib_rbt_node_t* +rbt_remove_node( +/*============*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* + node); /*!< in: node to delete, this + is a fudge and declared const + because the caller has access + only to const nodes.*/ +/**********************************************************************//** +Return a node from the red black tree, identified by +key, NULL if not found +@return node if found else return NULL */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_lookup( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree to search */ + const void* key); /*!< in: key to lookup */ +/**********************************************************************//** +Add data to the red black tree, identified by key (no dups yet!) +@return inserted node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_insert( +/*=======*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const void* key, /*!< in: key for ordering */ + const void* value); /*!< in: data that will be + copied to the node.*/ +/**********************************************************************//** +Add a new node to the tree, useful for data that is pre-sorted. +@return appended node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_add_node( +/*=========*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + const void* value); /*!< in: this value is copied + to the node */ +/****************************************************************//** +Add a new caller-provided node to tree at the specified position. +The node must have its key fields initialized correctly. 
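+As with rbt_add_node() above, the position comes from a preceding
+rbt_search(); a typical sequence, for orientation (editor's sketch with
+hypothetical my_tree, my_key and my_value):
+
+	ib_rbt_bound_t	parent;
+
+	if (rbt_search(my_tree, &parent, my_key) != 0) {
+		rbt_add_node(my_tree, &parent, my_value);
+	}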
+@return	added node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_preallocated_node(
+/*======================*/
+	ib_rbt_t*	tree,	/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,	/*!< in: parent */
+	ib_rbt_node_t*	node);	/*!< in: node */
+/**********************************************************************//**
+Return the left most data node in the tree
+@return	left most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+	const ib_rbt_t*	tree);	/*!< in: rb tree */
+/**********************************************************************//**
+Return the right most data node in the tree
+@return	right most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+	const ib_rbt_t*	tree);	/*!< in: rb tree */
+/**********************************************************************//**
+Return the next node from current.
+@return	successor node to current that is passed in. */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+	const ib_rbt_t*	tree,	/*!< in: rb tree */
+	const ib_rbt_node_t*	/* in: current node */
+			current);
+/**********************************************************************//**
+Return the prev node from current.
+@return	predecessor node to current that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+	const ib_rbt_t*	tree,	/*!< in: rb tree */
+	const ib_rbt_node_t*	/* in: current node */
+			current);
+/**********************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return	node that satisfies the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+	const ib_rbt_t*	tree,	/*!< in: rb tree */
+	const void*	key);	/*!< in: key to search */
+/**********************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return	node that satisfies the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+	const ib_rbt_t*	tree,	/*!< in: rb tree */
+	const void*	key);	/*!< in: key to search */
+/**********************************************************************//**
+Search for the key; a node will be returned in parent.last, whether it
+was found or not. If not found, then parent.last will contain the
+parent node for the possibly new key; otherwise the matching node.
+@return	result of last comparison */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+	const ib_rbt_t*	tree,	/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,	/*!< in: search bounds */
+	const void*	key);	/*!< in: key to search */
+/**********************************************************************//**
+Search for the key; a node will be returned in parent.last, whether it
+was found or not. If not found, then parent.last will contain the
+parent node for the possibly new key; otherwise the matching node.
+@return	result of last comparison */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+	const ib_rbt_t*	tree,	/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,	/*!< in: search bounds */
+	const void*	key,	/*!< in: key to search */
+	ib_rbt_compare	compare,	/*!< in: comparator */
+	ib_rbt_arg_compare
+			arg_compare);	/*!< in: fn to compare items
+					with argument */
+/**********************************************************************//**
+Clear the tree; deletes (and frees) all the nodes.
*/
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+	ib_rbt_t*	tree);	/*!< in: rb tree */
+/****************************************************************//**
+Clear the tree without deleting and freeing its nodes. */
+UNIV_INTERN
+void
+rbt_reset(
+/*======*/
+	ib_rbt_t*	tree);	/*!< in: rb tree */
+/**********************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+@return	no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+	ib_rbt_t*	dst,	/*!< in: dst rb tree */
+	const ib_rbt_t*	src);	/*!< in: src rb tree */
+/**********************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+Delete the nodes from src after copying them to dst. As a side effect
+the duplicates will be left untouched in the src, since we don't support
+duplicates (yet). NOTE: src and dst must be similar, the function doesn't
+check for this condition (yet).
+@return	no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+	ib_rbt_t*	dst,	/*!< in: dst rb tree */
+	ib_rbt_t*	src);	/*!< in: src rb tree */
+/**********************************************************************//**
+Verify the integrity of the RB tree. For debugging.
+@return	TRUE if OK, FALSE if the tree is invalid */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+	const ib_rbt_t*	tree);	/*!< in: tree to validate */
+/**********************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+	const ib_rbt_t*		tree,	/*!< in: tree to traverse */
+	ib_rbt_print_node	print);	/*!< in: print function */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/xtradb/include/ut0rnd.h b/storage/xtradb/include/ut0rnd.h
new file mode 100644
index 00000000000..53b769849a5
--- /dev/null
+++ b/storage/xtradb/include/ut0rnd.h
@@ -0,0 +1,148 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "ut0byte.h"
+
+/** The 'character code' for end of field or string (used
+in folding records) */
+#define UT_END_OF_FIELD		257
+
+/********************************************************//**
+This is used to set the random number seed.
*/ +UNIV_INLINE +void +ut_rnd_set_seed( +/*============*/ + ulint seed); /*!< in: seed */ +/********************************************************//** +The following function generates a series of 'random' ulint integers. +@return the next 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_next_ulint( +/*==================*/ + ulint rnd); /*!< in: the previous random number value */ +/*********************************************************//** +The following function generates 'random' ulint integers which +enumerate the value space (let there be N of them) of ulint integers +in a pseudo-random fashion. Note that the same integer is repeated +always after N calls to the generator. +@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_ulint(void); +/*==================*/ +/********************************************************//** +Generates a random integer from a given interval. +@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_interval( +/*============*/ + ulint low, /*!< in: low limit; can generate also this value */ + ulint high); /*!< in: high limit; can generate also this value */ +/*********************************************************//** +Generates a random iboolean value. +@return the random value */ +UNIV_INLINE +ibool +ut_rnd_gen_ibool(void); +/*=================*/ +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime or some +random number to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size); /*!< in: hash table size */ +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ + __attribute__((const)); +/*************************************************************//** +Folds a character string ending in the null character. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + const char* str) /*!< in: null-terminated string */ + __attribute__((pure)); +/***********************************************************//** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. +@return prime */ +UNIV_INTERN +ulint +ut_find_prime( +/*==========*/ + ulint n) /*!< in: positive number > 100 */ + __attribute__((const)); + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ + __attribute__((const)); +/*************************************************************//** +Folds a binary string. 
+@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ + __attribute__((pure)); + + +#ifndef UNIV_NONINL +#include "ut0rnd.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0rnd.ic b/storage/xtradb/include/ut0rnd.ic new file mode 100644 index 00000000000..024c59e553b --- /dev/null +++ b/storage/xtradb/include/ut0rnd.ic @@ -0,0 +1,255 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0rnd.ic +Random numbers and hashing + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#define UT_HASH_RANDOM_MASK 1463735687 +#define UT_HASH_RANDOM_MASK2 1653893711 + +#ifndef UNIV_INNOCHECKSUM + +#define UT_RND1 151117737 +#define UT_RND2 119785373 +#define UT_RND3 85689495 +#define UT_RND4 76595339 +#define UT_SUM_RND2 98781234 +#define UT_SUM_RND3 126792457 +#define UT_SUM_RND4 63498502 +#define UT_XOR_RND1 187678878 +#define UT_XOR_RND2 143537923 + +/** Seed value of ut_rnd_gen_ulint() */ +extern ulint ut_rnd_ulint_counter; + +/********************************************************//** +This is used to set the random number seed. */ +UNIV_INLINE +void +ut_rnd_set_seed( +/*============*/ + ulint seed) /*!< in: seed */ +{ + ut_rnd_ulint_counter = seed; +} + +/********************************************************//** +The following function generates a series of 'random' ulint integers. +@return the next 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_next_ulint( +/*==================*/ + ulint rnd) /*!< in: the previous random number value */ +{ + ulint n_bits; + + n_bits = 8 * sizeof(ulint); + + rnd = UT_RND2 * rnd + UT_SUM_RND3; + rnd = UT_XOR_RND1 ^ rnd; + rnd = (rnd << 20) + (rnd >> (n_bits - 20)); + rnd = UT_RND3 * rnd + UT_SUM_RND4; + rnd = UT_XOR_RND2 ^ rnd; + rnd = (rnd << 20) + (rnd >> (n_bits - 20)); + rnd = UT_RND1 * rnd + UT_SUM_RND2; + + return(rnd); +} + +/********************************************************//** +The following function generates 'random' ulint integers which +enumerate the value space of ulint integers in a pseudo random +fashion. Note that the same integer is repeated always after +2 to power 32 calls to the generator (if ulint is 32-bit). 
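+(Editor's note: the counter itself is stepped as a linear congruential
+sequence, counter = UT_RND1 * counter + UT_RND2, modulo the word size;
+since UT_RND1 % 4 == 1 and UT_RND2 is odd, the Hull-Dobell theorem
+guarantees the full period claimed above.)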
+@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_ulint(void) +/*==================*/ +{ + ulint rnd; + + ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2; + + rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter); + + return(rnd); +} + +/********************************************************//** +Generates a random integer from a given interval. +@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_interval( +/*============*/ + ulint low, /*!< in: low limit; can generate also this value */ + ulint high) /*!< in: high limit; can generate also this value */ +{ + ulint rnd; + + ut_ad(high >= low); + + if (low == high) { + + return(low); + } + + rnd = ut_rnd_gen_ulint(); + + return(low + (rnd % (high - low))); +} + +/*********************************************************//** +Generates a random iboolean value. +@return the random value */ +UNIV_INLINE +ibool +ut_rnd_gen_ibool(void) +/*=================*/ +{ + ulint x; + + x = ut_rnd_gen_ulint(); + + if (((x >> 20) + (x >> 15)) & 1) { + + return(TRUE); + } + + return(FALSE); +} + +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime +or some random number for the hash table to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size) /*!< in: hash table size */ +{ + ut_ad(table_size); + key = key ^ UT_HASH_RANDOM_MASK2; + + return(key % table_size); +} + +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ +{ + return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK, + (ulint) (d >> 32))); +} + +/*************************************************************//** +Folds a character string ending in the null character. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + const char* str) /*!< in: null-terminated string */ +{ + ulint fold = 0; + + ut_ad(str); + + while (*str != '\0') { + fold = ut_fold_ulint_pair(fold, (ulint)(*str)); + str++; + } + + return(fold); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ +{ + return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1) + ^ UT_HASH_RANDOM_MASK) + n2); +} + +/*************************************************************//** +Folds a binary string. 
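+(Editor's note: the implementation below folds eight bytes per loop
+iteration and then falls through the final switch, case by case, to
+fold the remaining len % 8 bytes; the missing break statements are
+deliberate.)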
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length */
+{
+	ulint		fold = 0;
+	const byte*	str_end	= str + (len & 0xFFFFFFF8);
+
+	ut_ad(str || !len);
+
+	while (str < str_end) {
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	}
+
+	switch (len & 0x7) {
+	case 7:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 6:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 5:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 4:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 3:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 2:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 1:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	}
+
+	return(fold);
+}
diff --git a/storage/xtradb/include/ut0sort.h b/storage/xtradb/include/ut0sort.h
new file mode 100644
index 00000000000..75648b5c317
--- /dev/null
+++ b/storage/xtradb/include/ut0sort.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+#include "univ.i"
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort, which has an
+O(n log n) worst case.
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array.
+Also the comparison function has to be defined individually
+for each array cell type. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments.
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1 if the first is bigger,
+0 if they are equal, and -1 if the second is bigger.
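+
+An instantiation sketch (editor's illustration; my_sort and my_cmp are
+hypothetical, the latter returning 1, 0 or -1 as described above):
+
+	static void
+	my_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high)
+	{
+		UT_SORT_FUNCTION_BODY(my_sort, arr, aux_arr,
+				      low, high, my_cmp);
+	}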
*/ + +#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\ +{\ + ulint ut_sort_mid77;\ + ulint ut_sort_i77;\ + ulint ut_sort_low77;\ + ulint ut_sort_high77;\ +\ + ut_ad((LOW) < (HIGH));\ + ut_ad(ARR);\ + ut_ad(AUX_ARR);\ +\ + if ((LOW) == (HIGH) - 1) {\ + return;\ + } else if ((LOW) == (HIGH) - 2) {\ + if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\ + (AUX_ARR)[LOW] = (ARR)[LOW];\ + (ARR)[LOW] = (ARR)[(HIGH) - 1];\ + (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\ + }\ + return;\ + }\ +\ + ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\ +\ + SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\ + SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\ +\ + ut_sort_low77 = (LOW);\ + ut_sort_high77 = ut_sort_mid77;\ +\ + for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\ +\ + if (ut_sort_low77 >= ut_sort_mid77) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else if (ut_sort_high77 >= (HIGH)) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + } else if (CMP_FUN((ARR)[ut_sort_low77],\ + (ARR)[ut_sort_high77]) > 0) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + }\ + }\ +\ + memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\ + ((HIGH) - (LOW)) * sizeof *(ARR));\ +}\ + + +#endif + diff --git a/storage/xtradb/include/ut0ut.h b/storage/xtradb/include/ut0ut.h new file mode 100644 index 00000000000..0caf379d8fa --- /dev/null +++ b/storage/xtradb/include/ut0ut.h @@ -0,0 +1,497 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0ut.h +Various utilities + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0ut_h +#define ut0ut_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "db0err.h" + +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */ +#endif /* UNIV_HOTBACKUP */ + +#include <time.h> +#ifndef MYSQL_SERVER +#include <ctype.h> +#endif + +#include <stdarg.h> /* for va_list */ + +/** Index name prefix in fast index creation */ +#define TEMP_INDEX_PREFIX '\377' +/** Index name prefix in fast index creation, as a string constant */ +#define TEMP_INDEX_PREFIX_STR "\377" + +/** Time stamp */ +typedef time_t ib_time_t; + +/* In order to call a piece of code, when a function returns or when the +scope ends, use this utility class. It will invoke the given function +object in its destructor. 
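+
+Usage sketch (editor's illustration; latch_releaser is a hypothetical
+functor whose operator() releases a latch):
+
+	latch_releaser	rel;
+	ut_when_dtor<latch_releaser>	guard(rel);
+
+Any return from the enclosing scope, early or not, now invokes rel().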
*/ +template<typename F> +struct ut_when_dtor { + ut_when_dtor(F& p) : f(p) {} + ~ut_when_dtor() { + f(); + } +private: + F& f; +}; + +#ifndef UNIV_HOTBACKUP +# if defined(HAVE_PAUSE_INSTRUCTION) + /* According to the gcc info page, asm volatile means that the + instruction has important side-effects and must not be removed. + Also asm volatile may trigger a memory barrier (spilling all registers + to memory). */ +# ifdef __SUNPRO_CC +# define UT_RELAX_CPU() asm ("pause" ) +# else +# define UT_RELAX_CPU() __asm__ __volatile__ ("pause") +# endif /* __SUNPRO_CC */ + +# elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) +# define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop") +# elif defined(HAVE_ATOMIC_BUILTINS) +# define UT_RELAX_CPU() do { \ + volatile lint volatile_var; \ + os_compare_and_swap_lint(&volatile_var, 0, 1); \ + } while (0) +# elif defined(HAVE_WINDOWS_ATOMICS) + /* In the Win32 API, the x86 PAUSE instruction is executed by calling + the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- + independent way by using YieldProcessor. */ +# define UT_RELAX_CPU() YieldProcessor() +# else +# define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */ +# endif + +/*********************************************************************//** +Delays execution for at most max_wait_us microseconds or returns earlier +if cond becomes true. +@param cond in: condition to wait for; evaluated every 2 ms +@param max_wait_us in: maximum delay to wait, in microseconds */ +#define UT_WAIT_FOR(cond, max_wait_us) \ +do { \ + ullint start_us; \ + start_us = ut_time_us(NULL); \ + while (!(cond) \ + && ut_time_us(NULL) - start_us < (max_wait_us)) {\ + \ + os_thread_sleep(2000 /* 2 ms */); \ + } \ +} while (0) +#endif /* !UNIV_HOTBACKUP */ + +template <class T> T ut_min(T a, T b) { return(a < b ? a : b); } +template <class T> T ut_max(T a, T b) { return(a > b ? a : b); } + +/******************************************************//** +Calculates the minimum of two ulints. +@return minimum */ +UNIV_INLINE +ulint +ut_min( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2); /*!< in: second number */ +/******************************************************//** +Calculates the maximum of two ulints. +@return maximum */ +UNIV_INLINE +ulint +ut_max( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2); /*!< in: second number */ +/****************************************************************//** +Calculates minimum of two ulint-pairs. */ +UNIV_INLINE +void +ut_pair_min( +/*========*/ + ulint* a, /*!< out: more significant part of minimum */ + ulint* b, /*!< out: less significant part of minimum */ + ulint a1, /*!< in: more significant part of first pair */ + ulint b1, /*!< in: less significant part of first pair */ + ulint a2, /*!< in: more significant part of second pair */ + ulint b2); /*!< in: less significant part of second pair */ +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b); /*!< in: ulint */ +/*******************************************************//** +Compares two pairs of ulints. 
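+The comparison is lexicographic (editor's note): the more significant
+parts are compared first and the less significant parts only break ties,
+so e.g. the pair (1, 9) sorts before (2, 0).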
+@return -1 if a < b, 0 if a == b, 1 if a > b */ +UNIV_INLINE +int +ut_pair_cmp( +/*========*/ + ulint a1, /*!< in: more significant part of first pair */ + ulint a2, /*!< in: less significant part of first pair */ + ulint b1, /*!< in: more significant part of second pair */ + ulint b2); /*!< in: less significant part of second pair */ +/*************************************************************//** +Determines if a number is zero or a power of two. +@param n in: number +@return nonzero if n is zero or a power of two; zero otherwise */ +#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1))) +/*************************************************************//** +Calculates fast the remainder of n/m when m is a power of two. +@param n in: numerator +@param m in: denominator, must be a power of two +@return the remainder of n/m */ +#define ut_2pow_remainder(n, m) ((n) & ((m) - 1)) +/*************************************************************//** +Calculates the biggest multiple of m that is not bigger than n +when m is a power of two. In other words, rounds n down to m * k. +@param n in: number to round down +@param m in: alignment, must be a power of two +@return n rounded down to the biggest possible integer multiple of m */ +#define ut_2pow_round(n, m) ((n) & ~((m) - 1)) +/** Align a number down to a multiple of a power of two. +@param n in: number to round down +@param m in: alignment, must be a power of two +@return n rounded down to the biggest possible integer multiple of m */ +#define ut_calc_align_down(n, m) ut_2pow_round(n, m) +/********************************************************//** +Calculates the smallest multiple of m that is not smaller than n +when m is a power of two. In other words, rounds n up to m * k. +@param n in: number to round up +@param m in: alignment, must be a power of two +@return n rounded up to the smallest possible integer multiple of m */ +#define ut_calc_align(n, m) (((n) + ((m) - 1)) & ~((m) - 1)) +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. +@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n); /*!< in: number */ +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n); /*!< in: number */ +/*************************************************************//** +Calculates fast the number rounded up to the nearest power of 2. +@return first power of 2 which is >= n */ +UNIV_INTERN +ulint +ut_2_power_up( +/*==========*/ + ulint n) /*!< in: number != 0 */ + __attribute__((const)); + +/** Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. +@param b in: bits +@return number of bytes (octets) needed to represent b */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8) + +/**********************************************************//** +Returns system time. We do not specify the format of the time returned: +the only way to manipulate it is to use the function ut_difftime. +@return system time */ +UNIV_INTERN +ib_time_t +ut_time(void); +/*=========*/ +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Returns system time. +Upon successful completion, the value 0 is returned; otherwise the +value -1 is returned and the global variable errno is set to indicate the +error. 
+@return 0 on success, -1 otherwise */ +UNIV_INTERN +int +ut_usectime( +/*========*/ + ulint* sec, /*!< out: seconds since the Epoch */ + ulint* ms); /*!< out: microseconds since the Epoch+*sec */ + +/**********************************************************//** +Returns the number of microseconds since epoch. Similar to +time(3), the return value is also stored in *tloc, provided +that tloc is non-NULL. +@return us since epoch */ +UNIV_INTERN +ullint +ut_time_us( +/*=======*/ + ullint* tloc); /*!< out: us since epoch, if non-NULL */ +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +UNIV_INTERN +ulint +ut_time_ms(void); +/*============*/ +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +UNIV_INTERN +ulint +ut_time_ms(void); +/*============*/ + +/**********************************************************//** +Returns the difference of two times in seconds. +@return time2 - time1 expressed in seconds */ +UNIV_INTERN +double +ut_difftime( +/*========*/ + ib_time_t time2, /*!< in: time */ + ib_time_t time1); /*!< in: time */ + +#endif /* !UNIV_INNOCHECKSUM */ + +/**********************************************************//** +Prints a timestamp to a file. */ +UNIV_INTERN +void +ut_print_timestamp( +/*===============*/ + FILE* file) /*!< in: file where to print */ + UNIV_COLD __attribute__((nonnull)); + +#ifndef UNIV_INNOCHECKSUM + +/**********************************************************//** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */ +UNIV_INTERN +void +ut_sprintf_timestamp( +/*=================*/ + char* buf); /*!< in: buffer where to sprintf */ +#ifdef UNIV_HOTBACKUP +/**********************************************************//** +Sprintfs a timestamp to a buffer with no spaces and with ':' characters +replaced by '_'. */ +UNIV_INTERN +void +ut_sprintf_timestamp_without_extra_chars( +/*=====================================*/ + char* buf); /*!< in: buffer where to sprintf */ +/**********************************************************//** +Returns current year, month, day. */ +UNIV_INTERN +void +ut_get_year_month_day( +/*==================*/ + ulint* year, /*!< out: current year */ + ulint* month, /*!< out: month */ + ulint* day); /*!< out: day */ +#else /* UNIV_HOTBACKUP */ +/*************************************************************//** +Runs an idle loop on CPU. The argument gives the desired delay +in microseconds on 100 MHz Pentium + Visual C++. +@return dummy value */ +UNIV_INTERN +ulint +ut_delay( +/*=====*/ + ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */ +#endif /* UNIV_HOTBACKUP */ +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +UNIV_INTERN +void +ut_print_buf( +/*=========*/ + FILE* file, /*!< in: file where to print */ + const void* buf, /*!< in: memory buffer */ + ulint len); /*!< in: length of the buffer */ + +/**********************************************************************//** +Outputs a NUL-terminated file name, quoted with apostrophes. 
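+For example (editor's note): the name ibdata1 is printed as 'ibdata1',
+and an embedded apostrophe is doubled, so a'b is printed as 'a''b'.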
*/ +UNIV_INTERN +void +ut_print_filename( +/*==============*/ + FILE* f, /*!< in: output stream */ + const char* name); /*!< in: name to print */ + +#ifndef UNIV_HOTBACKUP +/* Forward declaration of transaction handle */ +struct trx_t; + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_name( +/*==========*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ibool table_id,/*!< in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name); /*!< in: name to print */ + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_namel( +/*===========*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction (NULL=no quotes) */ + ibool table_id,/*!< in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name, /*!< in: name to print */ + ulint namelen);/*!< in: length of name */ + +/**********************************************************************//** +Formats a table or index name, quoted as an SQL identifier. If the name +contains a slash '/', the result will contain two identifiers separated by +a period (.), as in SQL database_name.identifier. +@return pointer to 'formatted' */ +UNIV_INTERN +char* +ut_format_name( +/*===========*/ + const char* name, /*!< in: table or index name, must be + '\0'-terminated */ + ibool is_table, /*!< in: if TRUE then 'name' is a table + name */ + char* formatted, /*!< out: formatted result, will be + '\0'-terminated */ + ulint formatted_size);/*!< out: no more than this number of + bytes will be written to 'formatted' */ + +/**********************************************************************//** +Catenate files. */ +UNIV_INTERN +void +ut_copy_file( +/*=========*/ + FILE* dest, /*!< in: output file */ + FILE* src); /*!< in: input file to be appended to output */ +#endif /* !UNIV_HOTBACKUP */ + +#ifdef __WIN__ +/**********************************************************************//** +A substitute for vsnprintf(3), formatted output conversion into +a limited buffer. Note: this function DOES NOT return the number of +characters that would have been printed if the buffer was unlimited because +VC's _vsnprintf() returns -1 in this case and we would need to call +_vscprintf() in addition to estimate that but we would need another copy +of "ap" for that and VC does not provide va_copy(). */ +UNIV_INTERN +void +ut_vsnprintf( +/*=========*/ + char* str, /*!< out: string */ + size_t size, /*!< in: str size */ + const char* fmt, /*!< in: format */ + va_list ap); /*!< in: format values */ + +/**********************************************************************//** +A substitute for snprintf(3), formatted output conversion into +a limited buffer. +@return number of characters that would have been printed if the size +were unlimited, not including the terminating '\0'. 
*/ +UNIV_INTERN +int +ut_snprintf( +/*========*/ + char* str, /*!< out: string */ + size_t size, /*!< in: str size */ + const char* fmt, /*!< in: format */ + ...); /*!< in: format values */ +#else +/**********************************************************************//** +A wrapper for vsnprintf(3), formatted output conversion into +a limited buffer. Note: this function DOES NOT return the number of +characters that would have been printed if the buffer was unlimited because +VC's _vsnprintf() returns -1 in this case and we would need to call +_vscprintf() in addition to estimate that but we would need another copy +of "ap" for that and VC does not provide va_copy(). */ +# define ut_vsnprintf(buf, size, fmt, ap) \ + ((void) vsnprintf(buf, size, fmt, ap)) +/**********************************************************************//** +A wrapper for snprintf(3), formatted output conversion into +a limited buffer. */ +# define ut_snprintf snprintf +#endif /* __WIN__ */ + +/*************************************************************//** +Convert an error number to a human readable text message. The +returned string is static and should not be freed or modified. +@return string, describing the error */ +UNIV_INTERN +const char* +ut_strerr( +/*======*/ + dberr_t num); /*!< in: error number */ + +/**************************************************************** +Sort function for ulint arrays. */ +UNIV_INTERN +void +ut_ulint_sort( +/*==========*/ + ulint* arr, /*!< in/out: array to sort */ + ulint* aux_arr, /*!< in/out: aux array to use in sort */ + ulint low, /*!< in: lower bound */ + ulint high) /*!< in: upper bound */ + __attribute__((nonnull)); + +#ifndef UNIV_NONINL +#include "ut0ut.ic" +#endif + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif + diff --git a/storage/xtradb/include/ut0ut.ic b/storage/xtradb/include/ut0ut.ic new file mode 100644 index 00000000000..4e0f76e1957 --- /dev/null +++ b/storage/xtradb/include/ut0ut.ic @@ -0,0 +1,162 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0ut.ic +Various utilities + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/******************************************************//** +Calculates the minimum of two ulints. +@return minimum */ +UNIV_INLINE +ulint +ut_min( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2) /*!< in: second number */ +{ + return((n1 <= n2) ? n1 : n2); +} + +/******************************************************//** +Calculates the maximum of two ulints. 
+@return maximum */ +UNIV_INLINE +ulint +ut_max( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2) /*!< in: second number */ +{ + return((n1 <= n2) ? n2 : n1); +} + +/****************************************************************//** +Calculates minimum of two ulint-pairs. */ +UNIV_INLINE +void +ut_pair_min( +/*========*/ + ulint* a, /*!< out: more significant part of minimum */ + ulint* b, /*!< out: less significant part of minimum */ + ulint a1, /*!< in: more significant part of first pair */ + ulint b1, /*!< in: less significant part of first pair */ + ulint a2, /*!< in: more significant part of second pair */ + ulint b2) /*!< in: less significant part of second pair */ +{ + if (a1 == a2) { + *a = a1; + *b = ut_min(b1, b2); + } else if (a1 < a2) { + *a = a1; + *b = b1; + } else { + *a = a2; + *b = b2; + } +} + +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b) /*!< in: ulint */ +{ + if (a < b) { + return(-1); + } else if (a == b) { + return(0); + } else { + return(1); + } +} + +/*******************************************************//** +Compares two pairs of ulints. +@return -1 if a < b, 0 if a == b, 1 if a > b */ +UNIV_INLINE +int +ut_pair_cmp( +/*========*/ + ulint a1, /*!< in: more significant part of first pair */ + ulint a2, /*!< in: less significant part of first pair */ + ulint b1, /*!< in: more significant part of second pair */ + ulint b2) /*!< in: less significant part of second pair */ +{ + if (a1 > b1) { + return(1); + } else if (a1 < b1) { + return(-1); + } else if (a2 > b2) { + return(1); + } else if (a2 < b2) { + return(-1); + } else { + return(0); + } +} + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. +@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n) /*!< in: number != 0 */ +{ + ulint res; + + res = 0; + + ut_ad(n > 0); + + n = n - 1; + + for (;;) { + n = n / 2; + + if (n == 0) { + break; + } + + res++; + } + + return(res + 1); +} + +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n) /*!< in: number */ +{ + return((ulint) 1 << n); +} diff --git a/storage/xtradb/include/ut0vec.h b/storage/xtradb/include/ut0vec.h new file mode 100644 index 00000000000..432fb348a09 --- /dev/null +++ b/storage/xtradb/include/ut0vec.h @@ -0,0 +1,337 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "univ.i"
+#include "mem0mem.h"
+
+struct ib_alloc_t;
+struct ib_vector_t;
+
+typedef void* (*ib_mem_alloc_t)(
+					/* out: Pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	ulint		size);		/* in: Number of bytes to allocate */
+
+typedef void (*ib_mem_free_t)(
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	void*		ptr);		/* in: Memory to free */
+
+typedef void* (*ib_mem_resize_t)(
+					/* out: Pointer to resized memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator */
+	void*		ptr,		/* in: Memory to resize */
+	ulint		old_size,	/* in: Old memory size in bytes */
+	ulint		new_size);	/* in: New size in bytes */
+
+typedef int (*ib_compare_t)(const void*, const void*);
+
+/* An automatically resizing vector datatype with the following properties:
+
+ -All memory allocation is done through an allocator, which is responsible for
+freeing it when done with the vector.
+*/
+
+/* This is useful shorthand for elements of type void* */
+#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n))
+#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n))
+
+#define ib_vector_allocator(v) (v->allocator)
+
+/********************************************************************
+Create a new vector with the given initial size. */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+					/* out: vector */
+	ib_alloc_t*	alloc,		/* in: Allocator */
+					/* in: size of the data item */
+	ulint		sizeof_value,
+	ulint		size);		/* in: initial size */
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Push a new element onto the vector, increasing its size if necessary.
+If elem is not NULL, then elem is copied into the vector. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+					/* out: pointer to the "new" element */
+	ib_vector_t*	vec,		/* in/out: vector */
+	const void*	elem);		/* in: data element */
+
+/********************************************************************
+Pop the last element from the vector. */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+					/* out: pointer to the popped element */
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/*******************************************************************//**
+Remove an element from the vector.
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	const void*	elem);	/*!< in: value to remove */
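A minimal usage sketch of the interface declared above (hypothetical code,
not part of the patch; it assumes mem_heap_create() from mem0mem.h and
omits all error handling):

	static void
	vec_usage_sketch(void)
	{
		mem_heap_t*	heap = mem_heap_create(1024);
		ib_alloc_t*	alloc = ib_heap_allocator_create(heap);
		ib_vector_t*	vec = ib_vector_create(alloc, sizeof(ulint), 4);
		ulint		value = 42;

		ib_vector_push(vec, &value);	/* copies 'value' in */
		ut_a(*(ulint*) ib_vector_get(vec, 0) == 42);

		ib_vector_free(vec);	/* heap-backed: one mem_heap_free()
					releases vector and elements */
	}
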
+
+/********************************************************************
+Get the number of elements in the vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+					/* out: number of elements in vector */
+	const ib_vector_t*	vec);	/* in: vector */
+
+/********************************************************************
+Increase the size of the vector. */
+UNIV_INTERN
+void
+ib_vector_resize(
+/*=============*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+	const ib_vector_t*	vec);	/*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	ulint		n);	/*!< in: element index to get */
+
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n);	/* in: element index to get */
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+	ib_vector_t*	vec);	/*!< in: vector */
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in/out: vector */
+	ulint		n,	/*!< in: element index to set */
+	void*		elem);	/*!< in: data element */
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+	ib_vector_t*	vec);	/* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+					/* out: pointer to last element */
+	ib_vector_t*	vec);	/* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: pointer to last element */
+	const ib_vector_t*	vec);	/* in: vector */
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+	ib_vector_t*	vec,	/* in/out: vector */
+	ib_compare_t	compare);	/* in: the comparator to use for sort */
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		ptr);		/* in: memory to free */
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+					/* out: pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size);		/* in: size in bytes */
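ib_vector_resize() itself is defined in ut0vec.cc and is not part of this
hunk; a hedged sketch of its job (the doubling growth policy is an
assumption) shows why the allocator needs a mem_resize hook at all: a heap
arena cannot grow an allocation in place, so growth is allocate-then-copy:

	static void
	vector_resize_sketch(ib_vector_t* vec)
	{
		ulint	new_total = vec->total * 2;
		ulint	old_size = vec->used * vec->sizeof_value;
		ulint	new_size = new_total * vec->sizeof_value;

		/* dispatches to ib_heap_resize() or ib_ut_resize() */
		vec->data = vec->allocator->mem_resize(
			vec->allocator, vec->data, old_size, new_size);

		vec->total = new_total;
	}
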
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+					/* out: pointer to reallocated
+					memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size);	/* in: new size in bytes */
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+					/* out: heap allocator instance */
+	mem_heap_t*	heap);		/* in: heap to use */
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instance to free */
+
+/********************************************************************
+Wrapper for ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		ptr);		/* in: memory to free */
+
+/********************************************************************
+Wrapper for ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+					/* out: pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size);		/* in: size in bytes */
+
+/********************************************************************
+Wrapper for ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+					/* out: pointer to reallocated
+					memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size);	/* in: new size in bytes */
+
+/********************************************************************
+Create an allocator that uses ut_malloc()/ut_free(). */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void);
+/*=========================*/
+
+/********************************************************************
+Free a ut allocator. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instance to free */
+
+/* Allocator used by ib_vector_t. */
+struct ib_alloc_t {
+	ib_mem_alloc_t	mem_malloc;	/* For allocating memory */
+	ib_mem_free_t	mem_release;	/* For freeing memory */
+	ib_mem_resize_t	mem_resize;	/* For resizing memory */
+	void*		arg;		/* Currently if not NULL then it
+					points to the heap instance */
+};
+
+/* See comment at beginning of file. */
+struct ib_vector_t {
+	ib_alloc_t*	allocator;	/* Allocator, because one size
+					doesn't fit all */
+	void*		data;		/* data elements */
+	ulint		used;		/* number of elements currently used */
+	ulint		total;		/* number of elements allocated */
+					/* Size of a data item */
+	ulint		sizeof_value;
+};
+
+#ifndef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+
+#endif /* IB_VECTOR_H */
diff --git a/storage/xtradb/include/ut0vec.ic b/storage/xtradb/include/ut0vec.ic
new file mode 100644
index 00000000000..f41a85e1d1d
--- /dev/null
+++ b/storage/xtradb/include/ut0vec.ic
@@ -0,0 +1,425 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.ic
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#define IB_VEC_OFFSET(v, i) ((v)->sizeof_value * (i))
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size)		/* in: size in bytes */
+{
+	mem_heap_t*	heap = (mem_heap_t*) allocator->arg;
+
+	return(mem_heap_alloc(heap, size));
+}
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		ptr UNIV_UNUSED)	/* in: memory to free */
+{
+	/* We can't free individual elements. */
+}
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size)	/* in: new size in bytes */
+{
+	void*		new_ptr;
+	mem_heap_t*	heap = (mem_heap_t*) allocator->arg;
+
+	new_ptr = mem_heap_alloc(heap, new_size);
+	memcpy(new_ptr, old_ptr, old_size);
+
+	return(new_ptr);
+}
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+	mem_heap_t*	heap)		/* in: heap to use */
+{
+	ib_alloc_t*	heap_alloc;
+
+	heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc));
+
+	heap_alloc->arg = heap;
+	heap_alloc->mem_release = ib_heap_free;
+	heap_alloc->mem_malloc = ib_heap_malloc;
+	heap_alloc->mem_resize = ib_heap_resize;
+
+	return(heap_alloc);
+}
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instance to free */
+{
+	mem_heap_free((mem_heap_t*) ib_ut_alloc->arg);
+}
+
+/********************************************************************
+Wrapper around ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	ulint		size)		/* in: size in bytes */
+{
+	return(ut_malloc(size));
+}
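The malloc-backed flavor below is symmetric to the heap flavor above; a
hypothetical usage sketch (assuming ut_malloc()/ut_free() behave like
malloc()/free()):

	static void
	ut_alloc_sketch(void)
	{
		ib_alloc_t*	alloc = ib_ut_allocator_create();
		ib_vector_t*	vec = ib_vector_create(alloc,
						       sizeof(void*), 8);

		/* ... use vec ... */

		ib_vector_free(vec);	/* ut-backed: data, vector and
					allocator are released one by one;
					see ib_vector_free() below */
	}
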
+
+/********************************************************************
+Wrapper around ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		ptr)		/* in: memory to free */
+{
+	ut_free(ptr);
+}
+
+/********************************************************************
+Wrapper around ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size UNIV_UNUSED,/* in: old size in bytes */
+	ulint		new_size)	/* in: new size in bytes */
+{
+	return(ut_realloc(old_ptr, new_size));
+}
+
+/********************************************************************
+Create a ut allocator. */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void)
+/*========================*/
+{
+	ib_alloc_t*	ib_ut_alloc;
+
+	ib_ut_alloc = (ib_alloc_t*) ut_malloc(sizeof(*ib_ut_alloc));
+
+	ib_ut_alloc->arg = NULL;
+	ib_ut_alloc->mem_release = ib_ut_free;
+	ib_ut_alloc->mem_malloc = ib_ut_malloc;
+	ib_ut_alloc->mem_resize = ib_ut_resize;
+
+	return(ib_ut_alloc);
+}
+
+/********************************************************************
+Free a ut allocator. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instance to free */
+{
+	ut_free(ib_ut_alloc);
+}
+
+/********************************************************************
+Get number of elements in vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+					/* out: number of elements in vector*/
+	const ib_vector_t*	vec)	/* in: vector */
+{
+	return(vec->used);
+}
+
+/****************************************************************//**
+Get n'th element. */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	ulint		n)	/*!< in: element index to get */
+{
+	ut_a(n < vec->used);
+
+	return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n)	/* in: element index to get */
+{
+	ut_a(n < vec->used);
+
+	return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+	ib_vector_t*	vec)	/*!< in: vector */
+{
+	ut_a(vec->used > 0);
+
+	return((byte*) ib_vector_get(vec, vec->used - 1));
+}
+
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in/out: vector */
+	ulint		n,	/*!< in: element index to set */
+	void*		elem)	/*!< in: data element */
+{
+	void*		slot;
+
+	ut_a(n < vec->used);
+
+	slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+	memcpy(slot, elem, vec->sizeof_value);
+}
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+					/* out: void */
+	ib_vector_t*	vec)	/* in: vector */
+{
+	vec->used = 0;
+}
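For vectors whose element type is itself a pointer, the ib_vector_getp()
shorthand from the header dereferences the stored slot; note that
ib_vector_remove() below matches elements by exactly this pointer value.
An illustrative fragment (hypothetical, assuming vec was created with
sizeof(void*) elements):

	static void
	ptr_vec_sketch(ib_vector_t* vec)
	{
		int	x = 7;
		int*	ptr = &x;

		ib_vector_push(vec, &ptr);	/* the element is 'ptr' */
		ut_a((int*) ib_vector_getp(vec, 0) == &x);
	}
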
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+					/* out: pointer to last element */
+	ib_vector_t*	vec)	/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+
+	return(ib_vector_get(vec, ib_vector_size(vec) - 1));
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: pointer to last element */
+	const ib_vector_t*	vec)	/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+
+	return(ib_vector_get_const(vec, ib_vector_size(vec) - 1));
+}
+
+/****************************************************************//**
+Remove the last element from the vector.
+@return last vector element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+				/* out: pointer to element */
+	ib_vector_t*	vec)	/* in: vector */
+{
+	void*	elem;
+
+	ut_a(vec->used > 0);
+
+	elem = ib_vector_last(vec);
+	--vec->used;
+
+	return(elem);
+}
+
+/********************************************************************
+Append an element to the vector; if elem != NULL then copy the data
+from elem. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+				/* out: pointer to the "new" element */
+	ib_vector_t*	vec,	/* in: vector */
+	const void*	elem)	/* in: element to add (can be NULL) */
+{
+	void*	last;
+
+	if (vec->used >= vec->total) {
+		ib_vector_resize(vec);
+	}
+
+	last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used);
+
+#ifdef UNIV_DEBUG
+	memset(last, 0, vec->sizeof_value);
+#endif
+
+	if (elem) {
+		memcpy(last, elem, vec->sizeof_value);
+	}
+
+	++vec->used;
+
+	return(last);
+}
+
+/*******************************************************************//**
+Remove an element from the vector.
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	const void*	elem)	/*!< in: value to remove */
+{
+	void*	current = NULL;
+	void*	next;
+	ulint	i;
+	ulint	old_used_count = vec->used;
+
+	for (i = 0; i < vec->used; i++) {
+		current = ib_vector_get(vec, i);
+
+		if (*(void**) current == elem) {
+			if (i == vec->used - 1) {
+				return(ib_vector_pop(vec));
+			}
+
+			next = ib_vector_get(vec, i + 1);
+			memmove(current, next, vec->sizeof_value
+				* (vec->used - i - 1));
+			--vec->used;
+			break;
+		}
+	}
+
+	return((old_used_count != vec->used) ? current : NULL);
+}
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+				/* out: void */
+	ib_vector_t*	vec,	/* in: vector */
+	ib_compare_t	compare)/* in: the comparator to use for sort */
+{
+	qsort(vec->data, vec->used, vec->sizeof_value, compare);
+}
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*	vec)	/* in, own: vector */
+{
+	/* Currently we only support two types of allocators, heap
+	and ut_malloc(). When the heap is freed, all the elements are
+	freed too. With the ut allocator, we need to free the elements,
+	the vector instance and the allocator separately. */
+
+	/* Only the heap allocator uses the arg field.
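+	(Illustrative note: ib_heap_allocator_create() stores the heap
+	in the arg field, while ib_ut_allocator_create() leaves it NULL,
+	so the test below doubles as an allocator-type check.)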
*/ + if (vec->allocator->arg) { + mem_heap_free((mem_heap_t*) vec->allocator->arg); + } else { + ib_alloc_t* allocator; + + allocator = vec->allocator; + + allocator->mem_release(allocator, vec->data); + allocator->mem_release(allocator, vec); + + ib_ut_allocator_free(allocator); + } +} + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec) /*!< in: vector */ +{ + return(ib_vector_size(vec) == 0); +} diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h new file mode 100644 index 00000000000..33385ddf2d4 --- /dev/null +++ b/storage/xtradb/include/ut0wqueue.h @@ -0,0 +1,105 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0wqueue.h +A work queue + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A Work queue. Threads can add work items to the queue and other threads can +wait for work items to be available and take them off the queue for +processing. +************************************************************************/ + +#ifndef IB_WORK_QUEUE_H +#define IB_WORK_QUEUE_H + +#include "ut0list.h" +#include "mem0mem.h" +#include "os0sync.h" +#include "sync0types.h" + +struct ib_wqueue_t; + +/****************************************************************//** +Create a new work queue. +@return work queue */ +UNIV_INTERN +ib_wqueue_t* +ib_wqueue_create(void); +/*===================*/ + +/****************************************************************//** +Free a work queue. */ +UNIV_INTERN +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +/****************************************************************//** +Add a work item to the queue. */ +UNIV_INTERN +void +ib_wqueue_add( +/*==========*/ + ib_wqueue_t* wq, /*!< in: work queue */ + void* item, /*!< in: work item */ + mem_heap_t* heap); /*!< in: memory heap to use for allocating the + list node */ + +/******************************************************************** +Check if queue is empty. */ + +ibool +ib_wqueue_is_empty( +/*===============*/ + /* out: TRUE if queue empty + else FALSE */ + const ib_wqueue_t* wq); /* in: work queue */ + +/****************************************************************//** +Wait for a work item to appear in the queue. 
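A hypothetical producer/consumer sketch of this interface (sizes and the
work item are illustrative; the caller-supplied heap only provides the
list node for ib_wqueue_add()):

	static void
	wqueue_sketch(void)
	{
		ib_wqueue_t*	wq = ib_wqueue_create();
		mem_heap_t*	heap = mem_heap_create(512);
		static int	job;

		ib_wqueue_add(wq, &job, heap);		/* producer side */

		void*	item = ib_wqueue_wait(wq);	/* blocks until an
							item is available */
		ut_a(item == &job);

		mem_heap_free(heap);
		ib_wqueue_free(wq);
	}
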
+@return work item */ +UNIV_INTERN +void* +ib_wqueue_wait( +/*===========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +/******************************************************************** +Wait for a work item to appear in the queue for specified time. */ + +void* +ib_wqueue_timedwait( +/*================*/ + /* out: work item or NULL on timeout*/ + ib_wqueue_t* wq, /* in: work queue */ + ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ + +/* Work queue. */ +struct ib_wqueue_t { + ib_mutex_t mutex; /*!< mutex protecting everything */ + ib_list_t* items; /*!< work item list */ + os_event_t event; /*!< event we use to signal additions to list */ +}; + +#endif diff --git a/storage/xtradb/lock/lock0iter.cc b/storage/xtradb/lock/lock0iter.cc new file mode 100644 index 00000000000..b424d2fc757 --- /dev/null +++ b/storage/xtradb/lock/lock0iter.cc @@ -0,0 +1,111 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file lock/lock0iter.cc +Lock queue iterator. Can iterate over table and record +lock queues. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "univ.i" +#include "lock0iter.h" +#include "lock0lock.h" +#include "lock0priv.h" +#include "ut0dbg.h" +#include "ut0lst.h" + +/*******************************************************************//** +Initialize lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). There is exactly one bit set in the bitmap + of a wait lock. */ +UNIV_INTERN +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /*!< out: iterator */ + const lock_t* lock, /*!< in: lock to start from */ + ulint bit_no) /*!< in: record number in the + heap */ +{ + ut_ad(lock_mutex_own()); + + iter->current_lock = lock; + + if (bit_no != ULINT_UNDEFINED) { + + iter->bit_no = bit_no; + } else { + + switch (lock_get_type_low(lock)) { + case LOCK_TABLE: + iter->bit_no = ULINT_UNDEFINED; + break; + case LOCK_REC: + iter->bit_no = lock_rec_find_set_bit(lock); + ut_a(iter->bit_no != ULINT_UNDEFINED); + break; + default: + ut_error; + } + } +} + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. 
the current lock is the first one). The iterator is +receded (if not-NULL is returned). +@return previous lock or NULL */ +UNIV_INTERN +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter) /*!< in/out: iterator */ +{ + const lock_t* prev_lock; + + ut_ad(lock_mutex_own()); + + switch (lock_get_type_low(iter->current_lock)) { + case LOCK_REC: + prev_lock = lock_rec_get_prev( + iter->current_lock, iter->bit_no); + break; + case LOCK_TABLE: + prev_lock = UT_LIST_GET_PREV( + un_member.tab_lock.locks, iter->current_lock); + break; + default: + ut_error; + } + + if (prev_lock != NULL) { + + iter->current_lock = prev_lock; + } + + return(prev_lock); +} diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc new file mode 100644 index 00000000000..6cfb8aa0f72 --- /dev/null +++ b/storage/xtradb/lock/lock0lock.cc @@ -0,0 +1,7215 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file lock/lock0lock.cc +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "lock0lock.h" +#include "lock0priv.h" + +#ifdef UNIV_NONINL +#include "lock0lock.ic" +#include "lock0priv.ic" +#endif + +#include "ha_prototypes.h" +#include "usr0sess.h" +#include "trx0purge.h" +#include "dict0mem.h" +#include "dict0boot.h" +#include "trx0sys.h" +#include "pars0pars.h" /* pars_complete_graph_for_exec() */ +#include "que0que.h" /* que_node_get_parent() */ +#include "row0mysql.h" /* row_mysql_handle_errors() */ +#include "row0sel.h" /* sel_node_create(), sel_node_t */ +#include "row0types.h" /* sel_node_t */ +#include "srv0mon.h" +#include "ut0vec.h" +#include "btr0btr.h" +#include "dict0boot.h" +#include <set> + +/* Restricts the length of search we will do in the waits-for +graph of transactions */ +#define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000 + +/* Restricts the search depth we will do in the waits-for graph of +transactions */ +#define LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK 200 + +/* When releasing transaction locks, this specifies how often we release +the lock mutex for a moment to give also others access to it */ + +#define LOCK_RELEASE_INTERVAL 1000 + +/* Safety margin when creating a new record lock: this many extra records +can be inserted to the page without need to create a lock with a bigger +bitmap */ + +#define LOCK_PAGE_BITMAP_MARGIN 64 + +/* An explicit record lock affects both the record and the gap before it. +An implicit x-lock does not affect the gap, it only locks the index +record from read or update. 
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user so that there is
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record and thus may, or may not,
+bump into an x-lock set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record.
+It does not matter if these lock requests are granted or waiting, gap bit
+set or not, with the exception that a gap type request set by another
+transaction to wait for its turn to do an insert is ignored. On the other
+hand, an implicit x-lock by another transaction does not prevent an insert,
+which allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieve the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or new inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+	There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This record cannot be
+updated but the lock prevents insert of a user record to the end of
+the page.
+	Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+	What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock? If it was set by a searched update, then there
+is implicitly an s-lock, too, and the insert should be prevented.
+What if our transaction owns an x-lock to the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should see also the new inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the new inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+	We could have the convention that granted explicit record locks
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+	How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record.
In the case of a +modification, not an insert, is this necessary? A secondary index record +is modified only by setting or resetting its deleted flag. A secondary index +record contains fields to uniquely determine the corresponding clustered +index record. A secondary index record is therefore only modified if we +also modify the clustered index record, and the trx id checking is done +on the clustered index record, before we come to modify the secondary index +record. So, in the case of delete marking or unmarking a secondary index +record, we do not have to care about trx ids, only the locks in the lock +table must be checked. In the case of a select from a secondary index, the +trx id is relevant, and in this case we may have to search the clustered +index record. + +PROBLEM: How to update record locks when page is split or merged, or +-------------------------------------------------------------------- +a record is deleted or updated? +If the size of fields in a record changes, we perform the update by +a delete followed by an insert. How can we retain the locks set or +waiting on the record? Because a record lock is indexed in the bitmap +by the heap number of the record, when we remove the record from the +record list, it is possible still to keep the lock bits. If the page +is reorganized, we could make a table of old and new heap numbers, +and permute the bitmaps in the locks accordingly. We can add to the +table a row telling where the updated record ended. If the update does +not require a reorganization of the page, we can simply move the lock +bits for the updated record to the position determined by its new heap +number (we may have to allocate a new lock, if we run out of the bitmap +in the old one). + A more complicated case is the one where the reinsertion of the +updated record is done pessimistically, because the structure of the +tree may change. + +PROBLEM: If a supremum record is removed in a page merge, or a record +--------------------------------------------------------------------- +removed in a purge, what to do to the waiting lock requests? In a split to +the right, we just move the lock requests to the new supremum. If a record +is removed, we could move the waiting lock request to its inheritor, the +next record in the index. But, the next record may already have lock +requests on its own queue. A new deadlock check should be made then. Maybe +it is easier just to release the waiting transactions. They can then enqueue +new lock requests on appropriate records. + +PROBLEM: When a record is inserted, what locks should it inherit from the +------------------------------------------------------------------------- +upper neighbor? An insert of a new supremum record in a page split is +always possible, but an insert of a new user record requires that the upper +neighbor does not have any lock requests by other transactions, granted or +waiting, in its lock queue. Solution: We can copy the locks as gap type +locks, so that also the waiting locks are transformed to granted gap type +locks on the inserted record. */ + +#define LOCK_STACK_SIZE OS_THREAD_MAX_N + +/* LOCK COMPATIBILITY MATRIX + * IS IX S X AI + * IS + + + - + + * IX + + - - + + * S + - + - - + * X - - - - - + * AI + + - - - + * + * Note that for rows, InnoDB only acquires S or X locks. + * For tables, InnoDB normally acquires IS or IX locks. + * S or X table locks are only acquired for LOCK TABLES. + * Auto-increment (AI) locks are needed because of + * statement-level MySQL binlog. 
+ * See also lock_mode_compatible().
+ */
+static const byte lock_compatibility_matrix[5][5] = {
+ /**         IS     IX       S     X       AI */
+ /* IS */ {  TRUE,  TRUE,  TRUE,  FALSE,  TRUE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  TRUE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  FALSE, FALSE, FALSE, FALSE,  FALSE},
+ /* AI */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE}
+};
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ *    IS IX S  X  AI
+ * IS +  -  -  -  -
+ * IX +  +  -  -  -
+ * S  +  -  +  -  -
+ * X  +  +  +  +  +
+ * AI -  -  -  -  +
+ * See lock_mode_stronger_or_eq().
+ */
+static const byte lock_strength_matrix[5][5] = {
+ /**         IS     IX       S     X       AI */
+ /* IS */ {  TRUE,  FALSE, FALSE,  FALSE, FALSE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  TRUE,  TRUE,  TRUE,  TRUE,   TRUE},
+ /* AI */ {  FALSE, FALSE, FALSE, FALSE,  TRUE}
+};
+
+/** Deadlock check context. */
+struct lock_deadlock_ctx_t {
+	const trx_t*	start;		/*!< Joining transaction that is
+					requesting a lock in an incompatible
+					mode */
+
+	const lock_t*	wait_lock;	/*!< Lock that trx wants */
+
+	ib_uint64_t	mark_start;	/*!< Value of lock_mark_count at
+					the start of the deadlock check. */
+
+	ulint		depth;		/*!< Stack depth */
+
+	ulint		cost;		/*!< Calculation steps thus far */
+
+	ibool		too_deep;	/*!< TRUE if search was too deep and
+					was aborted */
+};
+
+/** DFS visited node information used during deadlock checking. */
+struct lock_stack_t {
+	const lock_t*	lock;		/*!< Current lock */
+	const lock_t*	wait_lock;	/*!< Waiting for lock */
+	ulint		heap_no;	/*!< heap number if rec lock */
+};
+
+/** Stack to use during DFS search. Currently only a single stack is required
+because there is no parallel deadlock check. This stack is protected by
+the lock_sys_t::mutex. */
+static lock_stack_t*	lock_stack;
+
+/** The count of the types of locks. */
+static const ulint	lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	lock_sys_mutex_key;
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	lock_sys_wait_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool	lock_print_waits	= FALSE;
+
+/*********************************************************************//**
+Validates the lock system.
+@return TRUE if ok */
+static
+bool
+lock_validate();
+/*============*/
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+@return TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+	const buf_block_t*	block)	/*!< in: buffer block */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/* The lock system */
+UNIV_INTERN lock_sys_t*	lock_sys = NULL;
+
+/** We store info on the latest deadlock error to this buffer. InnoDB
+Monitor will then fetch it and print it. */
+UNIV_INTERN ibool	lock_deadlock_found = FALSE;
+/** Only created if !srv_read_only_mode */
+static FILE*		lock_latest_err_file;
+
+/********************************************************************//**
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found, this function will resolve the deadlock by choosing a victim
+transaction and rolling it back. It will attempt to resolve all deadlocks.
+The returned transaction id will be the joining transaction id, or 0 if some
+other transaction was chosen as a victim and rolled back, or no deadlock
+was found.
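The depth and cost fields of the deadlock context above drive the search
cut-off. The actual check appears further down in this file; its shape is
roughly the following sketch, built from the two LOCK_MAX_* constants
defined earlier:

	static ibool
	deadlock_too_deep_sketch(const lock_deadlock_ctx_t* ctx)
	{
		/* Give up, treating the search as "too deep", rather
		than walk an unbounded waits-for graph while holding
		the global lock mutex. */
		return(ctx->cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK
		       || ctx->depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK);
	}
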
+ +@return id of transaction chosen as victim or 0 */ +static +trx_id_t +lock_deadlock_check_and_resolve( +/*===========================*/ + const lock_t* lock, /*!< in: lock the transaction is requesting */ + const trx_t* trx); /*!< in: transaction */ + +/*********************************************************************//** +Gets the nth bit of a record lock. +@return TRUE if bit set also if i == ULINT_UNDEFINED return FALSE*/ +UNIV_INLINE +ibool +lock_rec_get_nth_bit( +/*=================*/ + const lock_t* lock, /*!< in: record lock */ + ulint i) /*!< in: index of the bit */ +{ + const byte* b; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + if (i >= lock->un_member.rec_lock.n_bits) { + + return(FALSE); + } + + b = ((const byte*) &lock[1]) + (i / 8); + + return(1 & *b >> (i % 8)); +} + +/*********************************************************************//** +Reports that a transaction id is insensible, i.e., in the future. */ +UNIV_INTERN +void +lock_report_trx_id_insanity( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ + trx_id_t max_trx_id) /*!< in: trx_sys_get_max_trx_id() */ +{ + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: transaction id associated with record\n", + stderr); + rec_print_new(stderr, rec, offsets); + fputs("InnoDB: in ", stderr); + dict_index_name_print(stderr, NULL, index); + fprintf(stderr, "\n" + "InnoDB: is " TRX_ID_FMT " which is higher than the" + " global trx id counter " TRX_ID_FMT "!\n" + "InnoDB: The table is corrupt. You have to do" + " dump + drop + reimport.\n", + trx_id, max_trx_id); +} + +/*********************************************************************//** +Checks that a transaction id is sensible, i.e., not in the future. +@return true if ok */ +#ifdef UNIV_DEBUG +UNIV_INTERN +#else +static __attribute__((nonnull, warn_unused_result)) +#endif +bool +lock_check_trx_id_sanity( +/*=====================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ +{ + bool is_ok; + trx_id_t max_trx_id; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + max_trx_id = trx_sys_get_max_trx_id(); + is_ok = trx_id < max_trx_id; + + if (UNIV_UNLIKELY(!is_ok)) { + lock_report_trx_id_insanity(trx_id, + rec, index, offsets, max_trx_id); + } + + return(is_ok); +} + +/*********************************************************************//** +Checks that a record is seen in a consistent read. +@return true if sees, or false if an earlier version of the record +should be retrieved */ +UNIV_INTERN +bool +lock_clust_rec_cons_read_sees( +/*==========================*/ + const rec_t* rec, /*!< in: user record which should be read or + passed over by a read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + read_view_t* view) /*!< in: consistent read view */ +{ + trx_id_t trx_id; + + ut_ad(dict_index_is_clust(index)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* NOTE that we call this function while holding the search + system latch. 
*/ + + trx_id = row_get_rec_trx_id(rec, index, offsets); + + return(read_view_sees_trx_id(view, trx_id)); +} + +/*********************************************************************//** +Checks that a non-clustered index record is seen in a consistent read. + +NOTE that a non-clustered index page contains so little information on +its modifications that also in the case false, the present version of +rec may be the right, but we must check this from the clustered index +record. + +@return true if certainly sees, or false if an earlier version of the +clustered index record might be needed */ +UNIV_INTERN +bool +lock_sec_rec_cons_read_sees( +/*========================*/ + const rec_t* rec, /*!< in: user record which + should be read or passed over + by a read cursor */ + const read_view_t* view) /*!< in: consistent read view */ +{ + trx_id_t max_trx_id; + + ut_ad(page_rec_is_user_rec(rec)); + + /* NOTE that we might call this function while holding the search + system latch. */ + + if (recv_recovery_is_on()) { + + return(false); + } + + max_trx_id = page_get_max_trx_id(page_align(rec)); + ut_ad(max_trx_id); + + return(max_trx_id < view->up_limit_id); +} + +/*********************************************************************//** +Creates the lock system at database start. */ +UNIV_INTERN +void +lock_sys_create( +/*============*/ + ulint n_cells) /*!< in: number of slots in lock hash table */ +{ + ulint lock_sys_sz; + + lock_sys_sz = sizeof(*lock_sys) + + OS_THREAD_MAX_N * sizeof(srv_slot_t); + + lock_sys = static_cast<lock_sys_t*>(mem_zalloc(lock_sys_sz)); + + lock_stack = static_cast<lock_stack_t*>( + mem_zalloc(sizeof(*lock_stack) * LOCK_STACK_SIZE)); + + void* ptr = &lock_sys[1]; + + lock_sys->waiting_threads = static_cast<srv_slot_t*>(ptr); + + lock_sys->last_slot = lock_sys->waiting_threads; + + mutex_create(lock_sys_mutex_key, &lock_sys->mutex, SYNC_LOCK_SYS); + + mutex_create(lock_sys_wait_mutex_key, + &lock_sys->wait_mutex, SYNC_LOCK_WAIT_SYS); + + lock_sys->timeout_event = os_event_create(); + + lock_sys->rec_hash = hash_create(n_cells); + lock_sys->rec_num = 0; + + if (!srv_read_only_mode) { + lock_latest_err_file = os_file_create_tmpfile(); + ut_a(lock_latest_err_file); + } +} + +/*********************************************************************//** +Closes the lock system at database shutdown. */ +UNIV_INTERN +void +lock_sys_close(void) +/*================*/ +{ + if (lock_latest_err_file != NULL) { + fclose(lock_latest_err_file); + lock_latest_err_file = NULL; + } + + hash_table_free(lock_sys->rec_hash); + + mutex_free(&lock_sys->mutex); + mutex_free(&lock_sys->wait_mutex); + + mem_free(lock_stack); + mem_free(lock_sys); + + lock_sys = NULL; + lock_stack = NULL; +} + +/*********************************************************************//** +Gets the size of a lock struct. +@return size in bytes */ +UNIV_INTERN +ulint +lock_get_size(void) +/*===============*/ +{ + return((ulint) sizeof(lock_t)); +} + +/*********************************************************************//** +Gets the mode of a lock. +@return mode */ +UNIV_INLINE +enum lock_mode +lock_get_mode( +/*==========*/ + const lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock); + + return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK)); +} + +/*********************************************************************//** +Gets the wait flag of a lock. 
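lock_get_mode() above and lock_get_wait() below read the same type_mode
word; an illustrative fragment of the bit packing (LOCK_S, LOCK_REC,
LOCK_GAP, LOCK_WAIT and LOCK_MODE_MASK come from lock0lock.h):

	ulint	type_mode = LOCK_REC | LOCK_S | LOCK_GAP | LOCK_WAIT;

	/* the low bits carry the lock mode ... */
	ut_a((enum lock_mode) (type_mode & LOCK_MODE_MASK) == LOCK_S);
	/* ... and the remaining bits are type and flag bits that are
	tested individually */
	ut_a((type_mode & LOCK_WAIT) != 0);
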
+@return LOCK_WAIT if waiting, 0 if not */ +UNIV_INLINE +ulint +lock_get_wait( +/*==========*/ + const lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_WAIT); +} + +/*********************************************************************//** +Gets the source table of an ALTER TABLE transaction. The table must be +covered by an IX or IS table lock. +@return the source table of transaction, if it is covered by an IX or +IS table lock; dest if there is no source table, and NULL if the +transaction is locking more than two tables or an inconsistency is +found */ +UNIV_INTERN +dict_table_t* +lock_get_src_table( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* dest, /*!< in: destination of ALTER TABLE */ + enum lock_mode* mode) /*!< out: lock mode of the source table */ +{ + dict_table_t* src; + lock_t* lock; + + ut_ad(!lock_mutex_own()); + + src = NULL; + *mode = LOCK_NONE; + + /* The trx mutex protects the trx_locks for our purposes. + Other transactions could want to convert one of our implicit + record locks to an explicit one. For that, they would need our + trx mutex. Waiting locks can be removed while only holding + lock_sys->mutex, but this is a running transaction and cannot + thus be holding any waiting locks. */ + trx_mutex_enter(trx); + + for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + lock_table_t* tab_lock; + enum lock_mode lock_mode; + if (!(lock_get_type_low(lock) & LOCK_TABLE)) { + /* We are only interested in table locks. */ + continue; + } + tab_lock = &lock->un_member.tab_lock; + if (dest == tab_lock->table) { + /* We are not interested in the destination table. */ + continue; + } else if (!src) { + /* This presumably is the source table. */ + src = tab_lock->table; + if (UT_LIST_GET_LEN(src->locks) != 1 + || UT_LIST_GET_FIRST(src->locks) != lock) { + /* We only support the case when + there is only one lock on this table. */ + src = NULL; + goto func_exit; + } + } else if (src != tab_lock->table) { + /* The transaction is locking more than + two tables (src and dest): abort */ + src = NULL; + goto func_exit; + } + + /* Check that the source table is locked by + LOCK_IX or LOCK_IS. */ + lock_mode = lock_get_mode(lock); + if (lock_mode == LOCK_IX || lock_mode == LOCK_IS) { + if (*mode != LOCK_NONE && *mode != lock_mode) { + /* There are multiple locks on src. */ + src = NULL; + goto func_exit; + } + *mode = lock_mode; + } + } + + if (!src) { + /* No source table lock found: flag the situation to caller */ + src = dest; + } + +func_exit: + trx_mutex_exit(trx); + return(src); +} + +/*********************************************************************//** +Determine if the given table is exclusively "owned" by the given +transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC +on the table. +@return TRUE if table is only locked by trx, with LOCK_IX, and +possibly LOCK_AUTO_INC */ +UNIV_INTERN +ibool +lock_is_table_exclusive( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const trx_t* trx) /*!< in: transaction */ +{ + const lock_t* lock; + ibool ok = FALSE; + + ut_ad(table); + ut_ad(trx); + + lock_mutex_enter(); + + for (lock = UT_LIST_GET_FIRST(table->locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) { + if (lock->trx != trx) { + /* A lock on the table is held + by some other transaction. 
*/ + goto not_ok; + } + + if (!(lock_get_type_low(lock) & LOCK_TABLE)) { + /* We are interested in table locks only. */ + continue; + } + + switch (lock_get_mode(lock)) { + case LOCK_IX: + ok = TRUE; + break; + case LOCK_AUTO_INC: + /* It is allowed for trx to hold an + auto_increment lock. */ + break; + default: +not_ok: + /* Other table locks than LOCK_IX are not allowed. */ + ok = FALSE; + goto func_exit; + } + } + +func_exit: + lock_mutex_exit(); + + return(ok); +} + +/*********************************************************************//** +Sets the wait flag of a lock and the back pointer in trx to lock. */ +UNIV_INLINE +void +lock_set_lock_and_trx_wait( +/*=======================*/ + lock_t* lock, /*!< in: lock */ + trx_t* trx) /*!< in/out: trx */ +{ + ut_ad(lock); + ut_ad(lock->trx == trx); + ut_ad(trx->lock.wait_lock == NULL); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(trx)); + + trx->lock.wait_lock = lock; + lock->type_mode |= LOCK_WAIT; +} + +/**********************************************************************//** +The back pointer to a waiting lock request in the transaction is set to NULL +and the wait bit in lock type_mode is reset. */ +UNIV_INLINE +void +lock_reset_lock_and_trx_wait( +/*=========================*/ + lock_t* lock) /*!< in/out: record lock */ +{ + ut_ad(lock->trx->lock.wait_lock == lock); + ut_ad(lock_get_wait(lock)); + ut_ad(lock_mutex_own()); + + lock->trx->lock.wait_lock = NULL; + lock->type_mode &= ~LOCK_WAIT; +} + +/*********************************************************************//** +Gets the gap flag of a record lock. +@return LOCK_GAP or 0 */ +UNIV_INLINE +ulint +lock_rec_get_gap( +/*=============*/ + const lock_t* lock) /*!< in: record lock */ +{ + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + return(lock->type_mode & LOCK_GAP); +} + +/*********************************************************************//** +Gets the LOCK_REC_NOT_GAP flag of a record lock. +@return LOCK_REC_NOT_GAP or 0 */ +UNIV_INLINE +ulint +lock_rec_get_rec_not_gap( +/*=====================*/ + const lock_t* lock) /*!< in: record lock */ +{ + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + return(lock->type_mode & LOCK_REC_NOT_GAP); +} + +/*********************************************************************//** +Gets the waiting insert flag of a record lock. +@return LOCK_INSERT_INTENTION or 0 */ +UNIV_INLINE +ulint +lock_rec_get_insert_intention( +/*==========================*/ + const lock_t* lock) /*!< in: record lock */ +{ + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + return(lock->type_mode & LOCK_INSERT_INTENTION); +} + +/*********************************************************************//** +Calculates if lock mode 1 is stronger or equal to lock mode 2. +@return nonzero if mode1 stronger or equal to mode2 */ +UNIV_INLINE +ulint +lock_mode_stronger_or_eq( +/*=====================*/ + enum lock_mode mode1, /*!< in: lock mode */ + enum lock_mode mode2) /*!< in: lock mode */ +{ + ut_ad((ulint) mode1 < lock_types); + ut_ad((ulint) mode2 < lock_types); + + return(lock_strength_matrix[mode1][mode2]); +} + +/*********************************************************************//** +Calculates if lock mode 1 is compatible with lock mode 2. 
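+Compatibility is read from lock_compatibility_matrix, defined earlier in
+this file; as a rough sketch of that matrix (+ = compatible, - = conflict),
+with AI denoting LOCK_AUTO_INC:
+
+		IS	IX	S	X	AI
+	IS	+	+	+	-	+
+	IX	+	+	-	-	+
+	S	+	-	+	-	-
+	X	-	-	-	-	-
+	AI	+	+	-	-	-
+
+For example, lock_mode_compatible(LOCK_IS, LOCK_IX) is nonzero, while
+lock_mode_compatible(LOCK_S, LOCK_X) is zero.
+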
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+	enum lock_mode	mode1,	/*!< in: lock mode */
+	enum lock_mode	mode2)	/*!< in: lock mode */
+{
+	ut_ad((ulint) mode1 < lock_types);
+	ut_ad((ulint) mode2 < lock_types);
+
+	return(lock_compatibility_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Checks if a lock request for a new lock has to wait for request lock2.
+@return TRUE if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+ibool
+lock_rec_has_to_wait(
+/*=================*/
+	const trx_t*	trx,	/*!< in: trx of new lock */
+	ulint		type_mode,/*!< in: precise mode of the new lock
+				to set: LOCK_S or LOCK_X, possibly
+				ORed to LOCK_GAP or LOCK_REC_NOT_GAP,
+				LOCK_INSERT_INTENTION */
+	const lock_t*	lock2,	/*!< in: another record lock; NOTE that
+				it is assumed that this has a lock bit
+				set on the same record as in the new
+				lock we are setting */
+	ibool		lock_is_on_supremum)
+				/*!< in: TRUE if we are setting the
+				lock on the 'supremum' record of an
+				index page: we know then that the lock
+				request is really for a 'gap' type lock */
+{
+	ut_ad(trx && lock2);
+	ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+	if (trx != lock2->trx
+	    && !lock_mode_compatible(static_cast<enum lock_mode>(
+				     LOCK_MODE_MASK & type_mode),
+				     lock_get_mode(lock2))) {
+
+		/* We have somewhat complex rules when gap type record locks
+		cause waits */
+
+		if ((lock_is_on_supremum || (type_mode & LOCK_GAP))
+		    && !(type_mode & LOCK_INSERT_INTENTION)) {
+
+			/* Gap type locks without LOCK_INSERT_INTENTION flag
+			do not need to wait for anything. This is because
+			different users can have conflicting lock types
+			on gaps. */
+
+			return(FALSE);
+		}
+
+		if (!(type_mode & LOCK_INSERT_INTENTION)
+		    && lock_rec_get_gap(lock2)) {
+
+			/* A record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP)
+			does not need to wait for a gap type lock */
+
+			return(FALSE);
+		}
+
+		if ((type_mode & LOCK_GAP)
+		    && lock_rec_get_rec_not_gap(lock2)) {
+
+			/* Lock on gap does not need to wait for
+			a LOCK_REC_NOT_GAP type lock */
+
+			return(FALSE);
+		}
+
+		if (lock_rec_get_insert_intention(lock2)) {
+
+			/* No lock request needs to wait for an insert
+			intention lock to be removed. This is ok since our
+			rules allow conflicting locks on gaps. This eliminates
+			a spurious deadlock caused by a next-key lock waiting
+			for an insert intention lock; when the insert
+			intention lock was granted, the insert deadlocked on
+			the waiting next-key lock.
+
+			Also, insert intention locks do not disturb each
+			other. */
+
+			return(FALSE);
+		}
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2. 
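+For two record locks the decision is delegated to lock_rec_has_to_wait()
+above; e.g. a next-key LOCK_X request waits for another transaction's
+granted LOCK_S on the same record, but never for that transaction's
+LOCK_X | LOCK_INSERT_INTENTION request, since insert intention locks are
+exempted there. For table locks, plain mode compatibility decides.
+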
+@return TRUE if lock1 has to wait for lock2 to be removed */ +UNIV_INTERN +ibool +lock_has_to_wait( +/*=============*/ + const lock_t* lock1, /*!< in: waiting lock */ + const lock_t* lock2) /*!< in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +{ + ut_ad(lock1 && lock2); + + if (lock1->trx != lock2->trx + && !lock_mode_compatible(lock_get_mode(lock1), + lock_get_mode(lock2))) { + if (lock_get_type_low(lock1) == LOCK_REC) { + ut_ad(lock_get_type_low(lock2) == LOCK_REC); + + /* If this lock request is for a supremum record + then the second bit on the lock bitmap is set */ + + return(lock_rec_has_to_wait(lock1->trx, + lock1->type_mode, lock2, + lock_rec_get_nth_bit( + lock1, 1))); + } + + return(TRUE); + } + + return(FALSE); +} + +/*============== RECORD LOCK BASIC FUNCTIONS ============================*/ + +/*********************************************************************//** +Gets the number of bits in a record lock bitmap. +@return number of bits */ +UNIV_INLINE +ulint +lock_rec_get_n_bits( +/*================*/ + const lock_t* lock) /*!< in: record lock */ +{ + return(lock->un_member.rec_lock.n_bits); +} + +/**********************************************************************//** +Sets the nth bit of a record lock to TRUE. */ +UNIV_INLINE +void +lock_rec_set_nth_bit( +/*=================*/ + lock_t* lock, /*!< in: record lock */ + ulint i) /*!< in: index of the bit */ +{ + ulint byte_index; + ulint bit_index; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(i < lock->un_member.rec_lock.n_bits); + + byte_index = i / 8; + bit_index = i % 8; + + ((byte*) &lock[1])[byte_index] |= 1 << bit_index; +} + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +UNIV_INTERN +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock) /*!< in: record lock with at least one bit set */ +{ + ulint i; + + for (i = 0; i < lock_rec_get_n_bits(lock); i++) { + + if (lock_rec_get_nth_bit(lock, i)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Resets the nth bit of a record lock. */ +UNIV_INLINE +void +lock_rec_reset_nth_bit( +/*===================*/ + lock_t* lock, /*!< in: record lock */ + ulint i) /*!< in: index of the bit which must be set to TRUE + when this function is called */ +{ + ulint byte_index; + ulint bit_index; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(i < lock->un_member.rec_lock.n_bits); + + byte_index = i / 8; + bit_index = i % 8; + + ((byte*) &lock[1])[byte_index] &= ~(1 << bit_index); +} + +/*********************************************************************//** +Gets the first or next record lock on a page. 
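+All record locks of one page live in a single lock_sys->rec_hash chain
+keyed by (space, page_no), so this follows the hash chain, skipping locks
+that merely collide in the same hash cell; the typical page scan is
+(sketch only):
+
+	for (lock = lock_rec_get_first_on_page(block);
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
+		... one lock struct covering records of this page ...
+	}
+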
+@return next lock, NULL if none exists */ +UNIV_INLINE +const lock_t* +lock_rec_get_next_on_page_const( +/*============================*/ + const lock_t* lock) /*!< in: a record lock */ +{ + ulint space; + ulint page_no; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + space = lock->un_member.rec_lock.space; + page_no = lock->un_member.rec_lock.page_no; + + for (;;) { + lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock)); + + if (!lock) { + + break; + } + + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no)) { + + break; + } + } + + return(lock); +} + +/*********************************************************************//** +Gets the first or next record lock on a page. +@return next lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_next_on_page( +/*======================*/ + lock_t* lock) /*!< in: a record lock */ +{ + return((lock_t*) lock_rec_get_next_on_page_const(lock)); +} + +/*********************************************************************//** +Gets the first record lock on a page, where the page is identified by its +file address. +@return first lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_first_on_page_addr( +/*============================*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + lock_t* lock; + + ut_ad(lock_mutex_own()); + + for (lock = static_cast<lock_t*>( + HASH_GET_FIRST(lock_sys->rec_hash, + lock_rec_hash(space, page_no))); + lock != NULL; + lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) { + + if (lock->un_member.rec_lock.space == space + && lock->un_member.rec_lock.page_no == page_no) { + + break; + } + } + + return(lock); +} + +/*********************************************************************//** +Determines if there are explicit record locks on a page. +@return an explicit record lock on the page, or NULL if there are none */ +UNIV_INTERN +lock_t* +lock_rec_expl_exist_on_page( +/*========================*/ + ulint space, /*!< in: space id */ + ulint page_no)/*!< in: page number */ +{ + lock_t* lock; + + lock_mutex_enter(); + lock = lock_rec_get_first_on_page_addr(space, page_no); + lock_mutex_exit(); + + return(lock); +} + +/*********************************************************************//** +Gets the first record lock on a page, where the page is identified by a +pointer to it. +@return first lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_first_on_page( +/*=======================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + ulint hash; + lock_t* lock; + ulint space = buf_block_get_space(block); + ulint page_no = buf_block_get_page_no(block); + + ut_ad(lock_mutex_own()); + + hash = buf_block_get_lock_hash_val(block); + + for (lock = static_cast<lock_t*>( + HASH_GET_FIRST( lock_sys->rec_hash, hash)); + lock != NULL; + lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) { + + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no)) { + + break; + } + } + + return(lock); +} + +/*********************************************************************//** +Gets the next explicit lock request on a record. 
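+Combined with lock_rec_get_first() below, this yields the standard walk
+over the lock queue of a single record, used throughout this file (sketch):
+
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+		... one explicit lock whose bitmap covers heap_no ...
+	}
+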
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */ +UNIV_INLINE +lock_t* +lock_rec_get_next( +/*==============*/ + ulint heap_no,/*!< in: heap number of the record */ + lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock_mutex_own()); + + do { + ut_ad(lock_get_type_low(lock) == LOCK_REC); + lock = lock_rec_get_next_on_page(lock); + } while (lock && !lock_rec_get_nth_bit(lock, heap_no)); + + return(lock); +} + +/*********************************************************************//** +Gets the next explicit lock request on a record. +@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */ +UNIV_INLINE +const lock_t* +lock_rec_get_next_const( +/*====================*/ + ulint heap_no,/*!< in: heap number of the record */ + const lock_t* lock) /*!< in: lock */ +{ + return(lock_rec_get_next(heap_no, (lock_t*) lock)); +} + +/*********************************************************************//** +Gets the first explicit lock request on a record. +@return first lock, NULL if none exists */ +UNIV_INLINE +lock_t* +lock_rec_get_first( +/*===============*/ + const buf_block_t* block, /*!< in: block containing the record */ + ulint heap_no)/*!< in: heap number of the record */ +{ + lock_t* lock; + + ut_ad(lock_mutex_own()); + + for (lock = lock_rec_get_first_on_page(block); lock; + lock = lock_rec_get_next_on_page(lock)) { + if (lock_rec_get_nth_bit(lock, heap_no)) { + break; + } + } + + return(lock); +} + +/*********************************************************************//** +Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock +pointer in the transaction! This function is used in lock object creation +and resetting. */ +static +void +lock_rec_bitmap_reset( +/*==================*/ + lock_t* lock) /*!< in: record lock */ +{ + ulint n_bytes; + + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + /* Reset to zero the bitmap which resides immediately after the lock + struct */ + + n_bytes = lock_rec_get_n_bits(lock) / 8; + + ut_ad((lock_rec_get_n_bits(lock) % 8) == 0); + + memset(&lock[1], 0, n_bytes); +} + +/*********************************************************************//** +Copies a record lock to heap. +@return copy of lock */ +static +lock_t* +lock_rec_copy( +/*==========*/ + const lock_t* lock, /*!< in: record lock */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint size; + + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8; + + return(static_cast<lock_t*>(mem_heap_dup(heap, lock, size))); +} + +/*********************************************************************//** +Gets the previous record lock set on a record. 
+@return previous lock on the same record, NULL if none exists */ +UNIV_INTERN +const lock_t* +lock_rec_get_prev( +/*==============*/ + const lock_t* in_lock,/*!< in: record lock */ + ulint heap_no)/*!< in: heap number of the record */ +{ + lock_t* lock; + ulint space; + ulint page_no; + lock_t* found_lock = NULL; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_type_low(in_lock) == LOCK_REC); + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + + for (lock = lock_rec_get_first_on_page_addr(space, page_no); + /* No op */; + lock = lock_rec_get_next_on_page(lock)) { + + ut_ad(lock); + + if (lock == in_lock) { + + return(found_lock); + } + + if (lock_rec_get_nth_bit(lock, heap_no)) { + + found_lock = lock; + } + } +} + +/*============= FUNCTIONS FOR ANALYZING TABLE LOCK QUEUE ================*/ + +/*********************************************************************//** +Checks if a transaction has the specified table lock, or stronger. This +function should only be called by the thread that owns the transaction. +@return lock or NULL */ +UNIV_INLINE +const lock_t* +lock_table_has( +/*===========*/ + const trx_t* trx, /*!< in: transaction */ + const dict_table_t* table, /*!< in: table */ + enum lock_mode mode) /*!< in: lock mode */ +{ + lint i; + + if (ib_vector_is_empty(trx->lock.table_locks)) { + return(NULL); + } + + /* Look for stronger locks the same trx already has on the table */ + + for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) { + const lock_t* lock; + enum lock_mode lock_mode; + + lock = *static_cast<const lock_t**>( + ib_vector_get(trx->lock.table_locks, i)); + + if (lock == NULL) { + continue; + } + + lock_mode = lock_get_mode(lock); + + ut_ad(trx == lock->trx); + ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + ut_ad(lock->un_member.tab_lock.table != NULL); + + if (table == lock->un_member.tab_lock.table + && lock_mode_stronger_or_eq(lock_mode, mode)) { + + ut_ad(!lock_get_wait(lock)); + + return(lock); + } + } + + return(NULL); +} + +/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/ + +/*********************************************************************//** +Checks if a transaction has a GRANTED explicit lock on rec stronger or equal +to precise_mode. 
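+The check applies lock_mode_stronger_or_eq() to the basic mode and then
+relaxes the gap/not-gap qualifiers; e.g. a transaction already holding
+LOCK_X | LOCK_REC_NOT_GAP on the record satisfies a
+LOCK_S | LOCK_REC_NOT_GAP request, so a caller may test (sketch):
+
+	if (lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP,
+			      block, heap_no, trx->id)) {
+		... no new explicit lock is needed ...
+	}
+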
+@return lock or NULL */ +UNIV_INLINE +lock_t* +lock_rec_has_expl( +/*==============*/ + ulint precise_mode,/*!< in: LOCK_S or LOCK_X + possibly ORed to LOCK_GAP or + LOCK_REC_NOT_GAP, for a + supremum record we regard this + always a gap type request */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of the record */ + trx_id_t trx_id) /*!< in: transaction id */ +{ + lock_t* lock; + + ut_ad(lock_mutex_own()); + ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S + || (precise_mode & LOCK_MODE_MASK) == LOCK_X); + ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); + + for (lock = lock_rec_get_first(block, heap_no); + lock != NULL; + lock = lock_rec_get_next(heap_no, lock)) { + + if (lock->trx->id == trx_id + && !lock_rec_get_insert_intention(lock) + && lock_mode_stronger_or_eq( + lock_get_mode(lock), + static_cast<enum lock_mode>( + precise_mode & LOCK_MODE_MASK)) + && !lock_get_wait(lock) + && (!lock_rec_get_rec_not_gap(lock) + || (precise_mode & LOCK_REC_NOT_GAP) + || heap_no == PAGE_HEAP_NO_SUPREMUM) + && (!lock_rec_get_gap(lock) + || (precise_mode & LOCK_GAP) + || heap_no == PAGE_HEAP_NO_SUPREMUM)) { + + return(lock); + } + } + + return(NULL); +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Checks if some other transaction has a lock request in the queue. +@return lock or NULL */ +static __attribute__((nonnull, warn_unused_result)) +const lock_t* +lock_rec_other_has_expl_req( +/*========================*/ + enum lock_mode mode, /*!< in: LOCK_S or LOCK_X */ + ulint gap, /*!< in: LOCK_GAP if also gap + locks are taken into account, + or 0 if not */ + ulint wait, /*!< in: LOCK_WAIT if also + waiting locks are taken into + account, or 0 if not */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of the record */ + trx_id_t trx_id) /*!< in: transaction */ +{ + const lock_t* lock; + + ut_ad(lock_mutex_own()); + ut_ad(mode == LOCK_X || mode == LOCK_S); + ut_ad(gap == 0 || gap == LOCK_GAP); + ut_ad(wait == 0 || wait == LOCK_WAIT); + + for (lock = lock_rec_get_first(block, heap_no); + lock != NULL; + lock = lock_rec_get_next_const(heap_no, lock)) { + + if (lock->trx->id != trx_id + && (gap + || !(lock_rec_get_gap(lock) + || heap_no == PAGE_HEAP_NO_SUPREMUM)) + && (wait || !lock_get_wait(lock)) + && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) { + + return(lock); + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Checks if some other transaction has a conflicting explicit lock request +in the queue, so that we have to wait. 
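+This is the wait decision used by lock_rec_lock_slow(): a non-NULL result
+makes the requester enqueue a waiting lock. For example, a
+LOCK_X | LOCK_REC_NOT_GAP request conflicts with another transaction's
+granted LOCK_S on the record itself, but not with a pure gap lock
+(LOCK_S | LOCK_GAP) held on it, per the rules in lock_rec_has_to_wait().
+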
+@return lock or NULL */
+static
+const lock_t*
+lock_rec_other_has_conflicting(
+/*===========================*/
+	enum lock_mode		mode,	/*!< in: LOCK_S or LOCK_X,
+					possibly ORed to LOCK_GAP or
+					LOCK_REC_NOT_GAP,
+					LOCK_INSERT_INTENTION */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	const trx_t*		trx)	/*!< in: our transaction */
+{
+	const lock_t*	lock;
+	ibool		is_supremum;
+
+	ut_ad(lock_mutex_own());
+
+	is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);
+
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
+
+		if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) {
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Looks for a suitable record lock struct of the same type by the same trx on
+the same page. This can be used to save space when a new record lock should
+be set on a page: no new struct is needed if a suitable old one is found.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_find_similar_on_page(
+/*==========================*/
+	ulint		type_mode,	/*!< in: lock type_mode field */
+	ulint		heap_no,	/*!< in: heap number of the record */
+	lock_t*		lock,		/*!< in: lock_rec_get_first_on_page() */
+	const trx_t*	trx)		/*!< in: transaction */
+{
+	ut_ad(lock_mutex_own());
+
+	for (/* No op */;
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
+
+		if (lock->trx == trx
+		    && lock->type_mode == type_mode
+		    && lock_rec_get_n_bits(lock) > heap_no) {
+
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a secondary
+index.
+@return transaction id of the transaction which has the x-lock, or 0;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active(). */
+static
+trx_id_t
+lock_sec_rec_some_has_impl(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: user record */
+	dict_index_t*	index,	/*!< in: secondary index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	trx_id_t	trx_id;
+	trx_id_t	max_trx_id;
+	const page_t*	page = page_align(rec);
+
+	ut_ad(!lock_mutex_own());
+	ut_ad(!mutex_own(&trx_sys->mutex));
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(page_rec_is_user_rec(rec));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	max_trx_id = page_get_max_trx_id(page);
+
+	/* Some transaction may have an implicit x-lock on the record only
+	if the max trx id for the page >= min trx id for the trx list, or
+	database recovery is running. We do not write the changes of a page
+	max trx id to the log, and therefore during recovery, this value
+	for a page may be incorrect. */
+
+	if (max_trx_id < trx_rw_min_trx_id() && !recv_recovery_is_on()) {
+
+		trx_id = 0;
+
+	} else if (!lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) {
+
+		buf_page_print(page, 0, 0);
+
+		/* The page is corrupt: try to avoid a crash by returning 0 */
+		trx_id = 0;
+
+	/* In this case it is possible that some transaction has an implicit
+	x-lock. We have to look in the clustered index. 
*/
+
+	} else {
+		trx_id = row_vers_impl_x_locked(rec, index, offsets);
+	}
+
+	return(trx_id);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks if some transaction, other than given trx_id, has an explicit
+lock on the given rec, in the given precise_mode.
+@return the transaction whose id is not equal to trx_id and that holds
+an explicit lock on the given rec in the given precise_mode, or NULL. */
+static
+trx_t*
+lock_rec_other_trx_holds_expl(
+/*==========================*/
+	ulint			precise_mode,	/*!< in: LOCK_S or LOCK_X
+						possibly ORed to LOCK_GAP or
+						LOCK_REC_NOT_GAP. */
+	trx_id_t		trx_id,		/*!< in: trx holding implicit
+						lock on rec */
+	const rec_t*		rec,		/*!< in: user record */
+	const buf_block_t*	block)		/*!< in: buffer block
+						containing the record */
+{
+	trx_t*	holds = NULL;
+
+	lock_mutex_enter();
+	mutex_enter(&trx_sys->mutex);
+
+	trx_id_t* impl_trx_desc = trx_find_descriptor(trx_sys->descriptors,
+						      trx_sys->descr_n_used,
+						      trx_id);
+	if (impl_trx_desc) {
+		ut_ad(trx_id == *impl_trx_desc);
+		ulint heap_no = page_rec_get_heap_no(rec);
+		ulint rw_trx_count = trx_sys->descr_n_used;
+		trx_id_t* rw_trx_snapshot = static_cast<trx_id_t *>
+			(ut_malloc(sizeof(trx_id_t) * rw_trx_count));
+		memcpy(rw_trx_snapshot, trx_sys->descriptors,
+		       sizeof(trx_id_t) * rw_trx_count);
+
+		mutex_exit(&trx_sys->mutex);
+
+		for (ulint i = 0; i < rw_trx_count; i++) {
+
+			lock_t* expl_lock = lock_rec_has_expl(precise_mode,
+							      block, heap_no,
+							      rw_trx_snapshot[i]);
+			if (expl_lock && expl_lock->trx->id != trx_id) {
+				/* An explicit lock is held by trx other than
+				the trx holding the implicit lock. */
+				holds = expl_lock->trx;
+				break;
+			}
+		}
+
+		ut_free(rw_trx_snapshot);
+
+	} else {
+		mutex_exit(&trx_sys->mutex);
+	}
+
+	lock_mutex_exit();
+
+	return(holds);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Return the approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
+{
+	const lock_t*	lock;
+	ulint		n_records = 0;
+
+	ut_ad(lock_mutex_own());
+
+	for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+		if (lock_get_type_low(lock) == LOCK_REC) {
+			ulint	n_bit;
+			ulint	n_bits = lock_rec_get_n_bits(lock);
+
+			for (n_bit = 0; n_bit < n_bits; n_bit++) {
+				if (lock_rec_get_nth_bit(lock, n_bit)) {
+					n_records++;
+				}
+			}
+		}
+	}
+
+	return(n_records);
+}
+
+/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
+
+/*********************************************************************//**
+Creates a new record lock and inserts it to the lock queue. Does NOT check
+for deadlocks or lock compatibility! 
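+The struct and its bitmap are allocated as one block from the caller's
+trx->lock.lock_heap: sizeof(lock_t) bytes of header immediately followed
+by 1 + n_bits / 8 bytes of bitmap, where n_bits is the page heap size
+plus LOCK_PAGE_BITMAP_MARGIN spare bits for records inserted later.
+Record heap_no n maps to byte n / 8, bit n % 8 of that bitmap. A worked
+example, assuming a 100-slot page heap and a margin of 64 bits:
+
+	n_bits  = 100 + 64;		... 164 bits ...
+	n_bytes = 1 + n_bits / 8;	... 21 bytes of bitmap ...
+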
+@return created lock */ +static +lock_t* +lock_rec_create( +/*============*/ + ulint type_mode,/*!< in: lock mode and wait + flag, type is ignored and + replaced by LOCK_REC */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of the record */ + dict_index_t* index, /*!< in: index of record */ + trx_t* trx, /*!< in/out: transaction */ + ibool caller_owns_trx_mutex) + /*!< in: TRUE if caller owns + trx mutex */ +{ + lock_t* lock; + ulint page_no; + ulint space; + ulint n_bits; + ulint n_bytes; + const page_t* page; + + ut_ad(lock_mutex_own()); + ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx)); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + + /* Non-locking autocommit read-only transactions should not set + any locks. */ + assert_trx_in_list(trx); + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + page = block->frame; + + btr_assert_not_corrupted(block, index); + + /* If rec is the supremum record, then we reset the gap and + LOCK_REC_NOT_GAP bits, as all locks on the supremum are + automatically of the gap type */ + + if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + /* Make lock bitmap bigger by a safety margin */ + n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN; + n_bytes = 1 + n_bits / 8; + + lock = static_cast<lock_t*>( + mem_heap_alloc(trx->lock.lock_heap, sizeof(lock_t) + n_bytes)); + + lock->trx = trx; + + lock->type_mode = (type_mode & ~LOCK_TYPE_MASK) | LOCK_REC; + lock->index = index; + + lock->un_member.rec_lock.space = space; + lock->un_member.rec_lock.page_no = page_no; + lock->un_member.rec_lock.n_bits = n_bytes * 8; + + /* Reset to zero the bitmap which resides immediately after the + lock struct */ + + lock_rec_bitmap_reset(lock); + + /* Set the bit corresponding to rec */ + lock_rec_set_nth_bit(lock, heap_no); + + index->table->n_rec_locks++; + + ut_ad(index->table->n_ref_count > 0 || !index->table->can_be_evicted); + + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); + + lock_sys->rec_num++; + + if (!caller_owns_trx_mutex) { + trx_mutex_enter(trx); + } + ut_ad(trx_mutex_own(trx)); + + if (type_mode & LOCK_WAIT) { + + lock_set_lock_and_trx_wait(lock, trx); + } + + UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock); + + if (!caller_owns_trx_mutex) { + trx_mutex_exit(trx); + } + + MONITOR_INC(MONITOR_RECLOCK_CREATED); + MONITOR_INC(MONITOR_NUM_RECLOCK); + + return(lock); +} + +/*********************************************************************//** +Enqueues a waiting request for a lock which cannot be granted immediately. +Checks for deadlocks. 
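+The waiting lock is created first, with LOCK_WAIT set, and only then run
+through lock_deadlock_check_and_resolve(); if this transaction itself is
+chosen as the victim, the wait flag and the bitmap bit are undone and
+DB_DEADLOCK is returned. A caller therefore has to handle three live
+outcomes (sketch):
+
+	DB_LOCK_WAIT		... suspend the query thread ...
+	DB_DEADLOCK		... roll back this transaction ...
+	DB_SUCCESS_LOCKED_REC	... another victim was chosen and the
+				    lock is already granted ...
+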
+@return DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED, or +DB_SUCCESS_LOCKED_REC; DB_SUCCESS_LOCKED_REC means that +there was a deadlock, but another transaction was chosen as a victim, +and we got the lock immediately: no need to wait then */ +static +dberr_t +lock_rec_enqueue_waiting( +/*=====================*/ + ulint type_mode,/*!< in: lock mode this + transaction is requesting: + LOCK_S or LOCK_X, possibly + ORed with LOCK_GAP or + LOCK_REC_NOT_GAP, ORed with + LOCK_INSERT_INTENTION if this + waiting lock request is set + when performing an insert of + an index record */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of the record */ + dict_index_t* index, /*!< in: index of record */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + lock_t* lock; + trx_id_t victim_trx_id; + ulint sec; + ulint ms; + + + ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + + trx = thr_get_trx(thr); + + ut_ad(trx_mutex_own(trx)); + + /* Test if there already is some other reason to suspend thread: + we do not enqueue a lock request if the query thread should be + stopped anyway */ + + if (que_thr_stop(thr)) { + ut_error; + + return(DB_QUE_THR_SUSPENDED); + } + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + break; + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: a record lock wait happens" + " in a dictionary operation!\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs(".\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + stderr); + ut_ad(0); + } + + /* Enqueue the lock request that will wait to be granted, note that + we already own the trx mutex. */ + lock = lock_rec_create( + type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE); + + /* Release the mutex to obey the latching order. + This is safe, because lock_deadlock_check_and_resolve() + is invoked when a lock wait is enqueued for the currently + running transaction. Because trx is a running transaction + (it is not currently suspended because of a lock wait), + its state can only be changed by this thread, which is + currently associated with the transaction. */ + + trx_mutex_exit(trx); + + victim_trx_id = lock_deadlock_check_and_resolve(lock, trx); + + trx_mutex_enter(trx); + + if (victim_trx_id != 0) { + + ut_ad(victim_trx_id == trx->id); + + lock_reset_lock_and_trx_wait(lock); + lock_rec_reset_nth_bit(lock, heap_no); + + return(DB_DEADLOCK); + + } else if (trx->lock.wait_lock == NULL) { + + /* If there was a deadlock but we chose another + transaction as a victim, it is possible that we + already have the lock now granted! */ + + return(DB_SUCCESS_LOCKED_REC); + } + + trx->lock.que_state = TRX_QUE_LOCK_WAIT; + + trx->lock.was_chosen_as_deadlock_victim = FALSE; + trx->lock.wait_started = ut_time(); + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms; + } + + ut_a(que_thr_stop(thr)); + +#ifdef UNIV_DEBUG + if (lock_print_waits) { + fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " in index ", + trx->id); + ut_print_name(stderr, trx, FALSE, index->name); + } +#endif /* UNIV_DEBUG */ + + MONITOR_INC(MONITOR_LOCKREC_WAIT); + + return(DB_LOCK_WAIT); +} + +/*********************************************************************//** +Adds a record lock request in the record queue. 
The request is normally +added as the last in the queue, but if there are no waiting lock requests +on the record, and the request to be added is not a waiting request, we +can reuse a suitable record lock object already existing on the same page, +just setting the appropriate bit in its bitmap. This is a low-level function +which does NOT check for deadlocks or lock compatibility! +@return lock where the bit was set */ +static +lock_t* +lock_rec_add_to_queue( +/*==================*/ + ulint type_mode,/*!< in: lock mode, wait, gap + etc. flags; type is ignored + and replaced by LOCK_REC */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of the record */ + dict_index_t* index, /*!< in: index of record */ + trx_t* trx, /*!< in/out: transaction */ + ibool caller_owns_trx_mutex) + /*!< in: TRUE if caller owns the + transaction mutex */ +{ + lock_t* lock; + lock_t* first_lock; + + ut_ad(lock_mutex_own()); + ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx)); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); +#ifdef UNIV_DEBUG + switch (type_mode & LOCK_MODE_MASK) { + case LOCK_X: + case LOCK_S: + break; + default: + ut_error; + } + + if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) { + enum lock_mode mode = (type_mode & LOCK_MODE_MASK) == LOCK_S + ? LOCK_X + : LOCK_S; + const lock_t* other_lock + = lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT, + block, heap_no, trx->id); + ut_a(!other_lock); + } +#endif /* UNIV_DEBUG */ + + type_mode |= LOCK_REC; + + /* If rec is the supremum record, then we can reset the gap bit, as + all locks on the supremum are automatically of the gap type, and we + try to avoid unnecessary memory consumption of a new record lock + struct for a gap type lock */ + + if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + + /* There should never be LOCK_REC_NOT_GAP on a supremum + record, but let us play safe */ + + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + /* Look for a waiting lock request on the same record or on a gap */ + + for (first_lock = lock = lock_rec_get_first_on_page(block); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (lock_get_wait(lock) + && lock_rec_get_nth_bit(lock, heap_no)) { + + goto somebody_waits; + } + } + + if (UNIV_LIKELY(!(type_mode & LOCK_WAIT))) { + + /* Look for a similar record lock on the same page: + if one is found and there are no waiting lock requests, + we can just set the bit */ + + lock = lock_rec_find_similar_on_page( + type_mode, heap_no, first_lock, trx); + + if (lock) { + + lock_rec_set_nth_bit(lock, heap_no); + + return(lock); + } + } + +somebody_waits: + return(lock_rec_create( + type_mode, block, heap_no, index, trx, + caller_owns_trx_mutex)); +} + +/** Record locking request status */ +enum lock_rec_req_status { + /** Failed to acquire a lock */ + LOCK_REC_FAIL, + /** Succeeded in acquiring a lock (implicit or already acquired) */ + LOCK_REC_SUCCESS, + /** Explicitly created a new lock */ + LOCK_REC_SUCCESS_CREATED +}; + +/*********************************************************************//** +This is a fast routine for locking a record in the most common cases: +there are no explicit locks on the page, or there is just one lock, owned +by this transaction, and of the right type_mode. This is a low-level function +which does NOT look at implicit locks! Checks lock compatibility within +explicit locks. 
This function sets a normal next-key lock, or in the case of +a page supremum record, a gap type lock. +@return whether the locking succeeded */ +UNIV_INLINE +enum lock_rec_req_status +lock_rec_lock_fast( +/*===============*/ + ibool impl, /*!< in: if TRUE, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + ulint mode, /*!< in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of record */ + dict_index_t* index, /*!< in: index of record */ + que_thr_t* thr) /*!< in: query thread */ +{ + lock_t* lock; + trx_t* trx; + enum lock_rec_req_status status = LOCK_REC_SUCCESS; + + ut_ad(lock_mutex_own()); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + + DBUG_EXECUTE_IF("innodb_report_deadlock", return(LOCK_REC_FAIL);); + + lock = lock_rec_get_first_on_page(block); + + trx = thr_get_trx(thr); + + if (lock == NULL) { + if (!impl) { + /* Note that we don't own the trx mutex. */ + lock = lock_rec_create( + mode, block, heap_no, index, trx, FALSE); + + } + status = LOCK_REC_SUCCESS_CREATED; + } else { + trx_mutex_enter(trx); + + if (lock_rec_get_next_on_page(lock) + || lock->trx != trx + || lock->type_mode != (mode | LOCK_REC) + || lock_rec_get_n_bits(lock) <= heap_no) { + + status = LOCK_REC_FAIL; + } else if (!impl) { + /* If the nth bit of the record lock is already set + then we do not set a new lock bit, otherwise we do + set */ + if (!lock_rec_get_nth_bit(lock, heap_no)) { + lock_rec_set_nth_bit(lock, heap_no); + status = LOCK_REC_SUCCESS_CREATED; + } + } + + trx_mutex_exit(trx); + } + + return(status); +} + +/*********************************************************************//** +This is the general, and slower, routine for locking a record. This is a +low-level function which does NOT look at implicit locks! Checks lock +compatibility within explicit locks. This function sets a normal next-key +lock, or in the case of a page supremum record, a gap type lock. 
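+The decision sequence is: first check whether this transaction already
+holds a sufficient explicit lock (lock_rec_has_expl()), then whether some
+other transaction holds a conflicting one
+(lock_rec_other_has_conflicting()); only then enqueue a waiting request.
+In outline:
+
+	if (already strong enough)	-> DB_SUCCESS
+	else if (conflict exists)	-> lock_rec_enqueue_waiting()
+	else if (!impl)			-> add to queue, DB_SUCCESS_LOCKED_REC
+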
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +static +dberr_t +lock_rec_lock_slow( +/*===============*/ + ibool impl, /*!< in: if TRUE, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + ulint mode, /*!< in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of record */ + dict_index_t* index, /*!< in: index of record */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + dberr_t err = DB_SUCCESS; + + ut_ad(lock_mutex_own()); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + + DBUG_EXECUTE_IF("innodb_report_deadlock", return(DB_DEADLOCK);); + + trx = thr_get_trx(thr); + trx_mutex_enter(trx); + + if (lock_rec_has_expl(mode, block, heap_no, trx->id)) { + + /* The trx already has a strong enough lock on rec: do + nothing */ + + } else if (lock_rec_other_has_conflicting( + static_cast<enum lock_mode>(mode), + block, heap_no, trx)) { + + /* If another transaction has a non-gap conflicting + request in the queue, as this transaction does not + have a lock strong enough already granted on the + record, we have to wait. */ + + err = lock_rec_enqueue_waiting( + mode, block, heap_no, index, thr); + + } else if (!impl) { + /* Set the requested lock on the record, note that + we already own the transaction mutex. */ + + lock_rec_add_to_queue( + LOCK_REC | mode, block, heap_no, index, trx, TRUE); + + err = DB_SUCCESS_LOCKED_REC; + } + + trx_mutex_exit(trx); + + return(err); +} + +/*********************************************************************//** +Tries to lock the specified record in the mode requested. If not immediately +possible, enqueues a waiting lock request. This is a low-level function +which does NOT look at implicit locks! Checks lock compatibility within +explicit locks. This function sets a normal next-key lock, or in the case +of a page supremum record, a gap type lock. 
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +static +dberr_t +lock_rec_lock( +/*==========*/ + ibool impl, /*!< in: if TRUE, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + ulint mode, /*!< in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of record */ + dict_index_t* index, /*!< in: index of record */ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(lock_mutex_own()); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP + || mode - (LOCK_MODE_MASK & mode) == 0); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + + /* We try a simplified and faster subroutine for the most + common cases */ + switch (lock_rec_lock_fast(impl, mode, block, heap_no, index, thr)) { + case LOCK_REC_SUCCESS: + return(DB_SUCCESS); + case LOCK_REC_SUCCESS_CREATED: + return(DB_SUCCESS_LOCKED_REC); + case LOCK_REC_FAIL: + return(lock_rec_lock_slow(impl, mode, block, + heap_no, index, thr)); + } + + ut_error; + return(DB_ERROR); +} + +/*********************************************************************//** +Checks if a waiting record lock request still has to wait in a queue. +@return lock that is causing the wait */ +static +const lock_t* +lock_rec_has_to_wait_in_queue( +/*==========================*/ + const lock_t* wait_lock) /*!< in: waiting record lock */ +{ + const lock_t* lock; + ulint space; + ulint page_no; + ulint heap_no; + ulint bit_mask; + ulint bit_offset; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_wait(wait_lock)); + ut_ad(lock_get_type_low(wait_lock) == LOCK_REC); + + space = wait_lock->un_member.rec_lock.space; + page_no = wait_lock->un_member.rec_lock.page_no; + heap_no = lock_rec_find_set_bit(wait_lock); + + bit_offset = heap_no / 8; + bit_mask = static_cast<ulint>(1 << (heap_no % 8)); + + for (lock = lock_rec_get_first_on_page_addr(space, page_no); + lock != wait_lock; + lock = lock_rec_get_next_on_page_const(lock)) { + + const byte* p = (const byte*) &lock[1]; + + if (heap_no < lock_rec_get_n_bits(lock) + && (p[bit_offset] & bit_mask) + && lock_has_to_wait(wait_lock, lock)) { + + return(lock); + } + } + + return(NULL); +} + +/*************************************************************//** +Grants a lock to a waiting lock request and releases the waiting transaction. +The caller must hold lock_sys->mutex but not lock->trx->mutex. 
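+
+A request is granted once lock_rec_has_to_wait_in_queue() finds no
+conflicting lock ahead of it, so releasing one lock can wake several
+waiters at once; e.g. for one record's queue:
+
+	X(granted, trx1) <- S(waiting, trx2) <- S(waiting, trx3)
+	trx1 releases its lock => both S requests are granted
+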
*/ +static +void +lock_grant( +/*=======*/ + lock_t* lock) /*!< in/out: waiting lock request */ +{ + ut_ad(lock_mutex_own()); + + lock_reset_lock_and_trx_wait(lock); + + trx_mutex_enter(lock->trx); + + if (lock_get_mode(lock) == LOCK_AUTO_INC) { + dict_table_t* table = lock->un_member.tab_lock.table; + + if (UNIV_UNLIKELY(table->autoinc_trx == lock->trx)) { + fprintf(stderr, + "InnoDB: Error: trx already had" + " an AUTO-INC lock!\n"); + } else { + table->autoinc_trx = lock->trx; + + ib_vector_push(lock->trx->autoinc_locks, &lock); + } + } + +#ifdef UNIV_DEBUG + if (lock_print_waits) { + fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " ends\n", + lock->trx->id); + } +#endif /* UNIV_DEBUG */ + + /* If we are resolving a deadlock by choosing another transaction + as a victim, then our original transaction may not be in the + TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait + for it */ + + if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + que_thr_t* thr; + + thr = que_thr_end_lock_wait(lock->trx); + + if (thr != NULL) { + lock_wait_release_thread_if_suspended(thr); + } + } + + trx_mutex_exit(lock->trx); +} + +/*************************************************************//** +Cancels a waiting record lock request and releases the waiting transaction +that requested it. NOTE: does NOT check if waiting lock requests behind this +one can now be granted! */ +static +void +lock_rec_cancel( +/*============*/ + lock_t* lock) /*!< in: waiting record lock request */ +{ + que_thr_t* thr; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + /* Reset the bit (there can be only one set bit) in the lock bitmap */ + lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock)); + + /* Reset the wait flag and the back pointer to lock in trx */ + + lock_reset_lock_and_trx_wait(lock); + + /* The following function releases the trx from lock wait */ + + trx_mutex_enter(lock->trx); + + thr = que_thr_end_lock_wait(lock->trx); + + if (thr != NULL) { + lock_wait_release_thread_if_suspended(thr); + } + + trx_mutex_exit(lock->trx); +} + +/*************************************************************//** +Removes a record lock request, waiting or granted, from the queue and +grants locks to other transactions in the queue if they now are entitled +to a lock. NOTE: all record locks contained in in_lock are removed. */ +static +void +lock_rec_dequeue_from_page( +/*=======================*/ + lock_t* in_lock) /*!< in: record lock object: all + record locks which are contained in + this lock object are removed; + transactions waiting behind will + get their lock requests granted, + if they are now qualified to it */ +{ + ulint space; + ulint page_no; + lock_t* lock; + trx_lock_t* trx_lock; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_type_low(in_lock) == LOCK_REC); + /* We may or may not be holding in_lock->trx->mutex here. */ + + trx_lock = &in_lock->trx->lock; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + + in_lock->index->table->n_rec_locks--; + + HASH_DELETE(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), in_lock); + lock_sys->rec_num--; + + UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock); + + MONITOR_INC(MONITOR_RECLOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_RECLOCK); + + /* Check if waiting locks in the queue can now be granted: grant + locks if there are no conflicting locks ahead. Stop at the first + X lock that is waiting or has been granted. 
*/ + + for (lock = lock_rec_get_first_on_page_addr(space, page_no); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock); + } + } +} + +/*************************************************************//** +Removes a record lock request, waiting or granted, from the queue. */ +static +void +lock_rec_discard( +/*=============*/ + lock_t* in_lock) /*!< in: record lock object: all + record locks which are contained + in this lock object are removed */ +{ + ulint space; + ulint page_no; + trx_lock_t* trx_lock; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_type_low(in_lock) == LOCK_REC); + + trx_lock = &in_lock->trx->lock; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + + in_lock->index->table->n_rec_locks--; + + HASH_DELETE(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), in_lock); + lock_sys->rec_num--; + + UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock); + + MONITOR_INC(MONITOR_RECLOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_RECLOCK); +} + +/*************************************************************//** +Removes record lock objects set on an index page which is discarded. This +function does not move locks, or check for waiting locks, therefore the +lock bitmaps must already be reset when this function is called. */ +static +void +lock_rec_free_all_from_discard_page( +/*================================*/ + const buf_block_t* block) /*!< in: page to be discarded */ +{ + ulint space; + ulint page_no; + lock_t* lock; + lock_t* next_lock; + + ut_ad(lock_mutex_own()); + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + + lock = lock_rec_get_first_on_page_addr(space, page_no); + + while (lock != NULL) { + ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED); + ut_ad(!lock_get_wait(lock)); + + next_lock = lock_rec_get_next_on_page(lock); + + lock_rec_discard(lock); + + lock = next_lock; + } +} + +/*============= RECORD LOCK MOVING AND INHERITING ===================*/ + +/*************************************************************//** +Resets the lock bits for a single record. Releases transactions waiting for +lock requests here. */ +static +void +lock_rec_reset_and_release_wait( +/*============================*/ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no)/*!< in: heap number of record */ +{ + lock_t* lock; + + ut_ad(lock_mutex_own()); + + for (lock = lock_rec_get_first(block, heap_no); + lock != NULL; + lock = lock_rec_get_next(heap_no, lock)) { + + if (lock_get_wait(lock)) { + lock_rec_cancel(lock); + } else { + lock_rec_reset_nth_bit(lock, heap_no); + } + } +} + +/*************************************************************//** +Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of +the other record. Also waiting lock requests on rec are inherited as +GRANTED gap locks. 
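+
+For example, when a record is purged or deleted, its locks are inherited
+by the successor record as gap locks, so no phantom can be inserted into
+the gap that opens up. Under READ COMMITTED (or with
+srv_locks_unsafe_for_binlog) inherited X locks are deliberately dropped,
+while S locks set by consistency constraints are still inherited.
+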
*/ +static +void +lock_rec_inherit_to_gap( +/*====================*/ + const buf_block_t* heir_block, /*!< in: block containing the + record which inherits */ + const buf_block_t* block, /*!< in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /*!< in: heap_no of the + inheriting record */ + ulint heap_no) /*!< in: heap_no of the + donating record */ +{ + lock_t* lock; + + ut_ad(lock_mutex_own()); + + /* If srv_locks_unsafe_for_binlog is TRUE or session is using + READ COMMITTED isolation level, we do not want locks set + by an UPDATE or a DELETE to be inherited as gap type locks. But we + DO want S-locks set by a consistency constraint to be inherited also + then. */ + + for (lock = lock_rec_get_first(block, heap_no); + lock != NULL; + lock = lock_rec_get_next(heap_no, lock)) { + + if (!lock_rec_get_insert_intention(lock) + && !((srv_locks_unsafe_for_binlog + || lock->trx->isolation_level + <= TRX_ISO_READ_COMMITTED) + && lock_get_mode(lock) == LOCK_X)) { + + lock_rec_add_to_queue( + LOCK_REC | LOCK_GAP | lock_get_mode(lock), + heir_block, heir_heap_no, lock->index, + lock->trx, FALSE); + } + } +} + +/*************************************************************//** +Makes a record to inherit the gap locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of the +other record. Also waiting lock requests are inherited as GRANTED gap locks. */ +static +void +lock_rec_inherit_to_gap_if_gap_lock( +/*================================*/ + const buf_block_t* block, /*!< in: buffer block */ + ulint heir_heap_no, /*!< in: heap_no of + record which inherits */ + ulint heap_no) /*!< in: heap_no of record + from which inherited; + does NOT reset the locks + on this record */ +{ + lock_t* lock; + + lock_mutex_enter(); + + for (lock = lock_rec_get_first(block, heap_no); + lock != NULL; + lock = lock_rec_get_next(heap_no, lock)) { + + if (!lock_rec_get_insert_intention(lock) + && (heap_no == PAGE_HEAP_NO_SUPREMUM + || !lock_rec_get_rec_not_gap(lock))) { + + lock_rec_add_to_queue( + LOCK_REC | LOCK_GAP | lock_get_mode(lock), + block, heir_heap_no, lock->index, + lock->trx, FALSE); + } + } + + lock_mutex_exit(); +} + +/*************************************************************//** +Moves the locks of a record to another record and resets the lock bits of +the donating record. */ +static +void +lock_rec_move( +/*==========*/ + const buf_block_t* receiver, /*!< in: buffer block containing + the receiving record */ + const buf_block_t* donator, /*!< in: buffer block containing + the donating record */ + ulint receiver_heap_no,/*!< in: heap_no of the record + which gets the locks; there + must be no lock requests + on it! 
*/ + ulint donator_heap_no)/*!< in: heap_no of the record + which gives the locks */ +{ + lock_t* lock; + + ut_ad(lock_mutex_own()); + + ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL); + + for (lock = lock_rec_get_first(donator, donator_heap_no); + lock != NULL; + lock = lock_rec_get_next(donator_heap_no, lock)) { + + const ulint type_mode = lock->type_mode; + + lock_rec_reset_nth_bit(lock, donator_heap_no); + + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + lock_reset_lock_and_trx_wait(lock); + } + + /* Note that we FIRST reset the bit, and then set the lock: + the function works also if donator == receiver */ + + lock_rec_add_to_queue( + type_mode, receiver, receiver_heap_no, + lock->index, lock->trx, FALSE); + } + + ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL); +} + +/*************************************************************//** +Updates the lock table when we have reorganized a page. NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +UNIV_INTERN +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock) /*!< in: copy of the old, not + reorganized page */ +{ + lock_t* lock; + UT_LIST_BASE_NODE_T(lock_t) old_locks; + mem_heap_t* heap = NULL; + ulint comp; + + lock_mutex_enter(); + + lock = lock_rec_get_first_on_page(block); + + if (lock == NULL) { + lock_mutex_exit(); + + return; + } + + heap = mem_heap_create(256); + + /* Copy first all the locks on the page to heap and reset the + bitmaps in the original locks; chain the copies of the locks + using the trx_locks field in them. */ + + UT_LIST_INIT(old_locks); + + do { + /* Make a copy of the lock */ + lock_t* old_lock = lock_rec_copy(lock, heap); + + UT_LIST_ADD_LAST(trx_locks, old_locks, old_lock); + + /* Reset bitmap of lock */ + lock_rec_bitmap_reset(lock); + + if (lock_get_wait(lock)) { + + lock_reset_lock_and_trx_wait(lock); + } + + lock = lock_rec_get_next_on_page(lock); + } while (lock != NULL); + + comp = page_is_comp(block->frame); + ut_ad(comp == page_is_comp(oblock->frame)); + + for (lock = UT_LIST_GET_FIRST(old_locks); lock; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + /* NOTE: we copy also the locks set on the infimum and + supremum of the page; the infimum may carry locks if an + update of a record is occurring on the page, and its locks + were temporarily stored on the infimum */ + page_cur_t cur1; + page_cur_t cur2; + + page_cur_set_before_first(block, &cur1); + page_cur_set_before_first(oblock, &cur2); + + /* Set locks according to old locks */ + for (;;) { + ulint old_heap_no; + ulint new_heap_no; + + ut_ad(comp || !memcmp(page_cur_get_rec(&cur1), + page_cur_get_rec(&cur2), + rec_get_data_size_old( + page_cur_get_rec( + &cur2)))); + if (UNIV_LIKELY(comp)) { + old_heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + new_heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + old_heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + new_heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + } + + if (lock_rec_get_nth_bit(lock, old_heap_no)) { + + /* Clear the bit in old_lock. */ + ut_d(lock_rec_reset_nth_bit(lock, + old_heap_no)); + + /* NOTE that the old lock bitmap could be too + small for the new heap number! 
*/ + + lock_rec_add_to_queue( + lock->type_mode, block, new_heap_no, + lock->index, lock->trx, FALSE); + + /* if (new_heap_no == PAGE_HEAP_NO_SUPREMUM + && lock_get_wait(lock)) { + fprintf(stderr, + "---\n--\n!!!Lock reorg: supr type %lu\n", + lock->type_mode); + } */ + } + + if (UNIV_UNLIKELY + (new_heap_no == PAGE_HEAP_NO_SUPREMUM)) { + + ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + + page_cur_move_to_next(&cur1); + page_cur_move_to_next(&cur2); + } + +#ifdef UNIV_DEBUG + { + ulint i = lock_rec_find_set_bit(lock); + + /* Check that all locks were moved. */ + if (UNIV_UNLIKELY(i != ULINT_UNDEFINED)) { + fprintf(stderr, + "lock_move_reorganize_page():" + " %lu not moved in %p\n", + (ulong) i, (void*) lock); + ut_error; + } + } +#endif /* UNIV_DEBUG */ + } + + lock_mutex_exit(); + + mem_heap_free(heap); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(block)); +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +UNIV_INTERN +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec) /*!< in: record on page: this + is the first record moved */ +{ + lock_t* lock; + const ulint comp = page_rec_is_comp(rec); + + lock_mutex_enter(); + + /* Note: when we move locks from record to record, waiting locks + and possible granted gap type locks behind them are enqueued in + the original order, because new elements are inserted to a hash + table to the end of the hash chain, and lock_rec_add_to_queue + does not reuse locks if there are waiters in the queue. */ + + for (lock = lock_rec_get_first_on_page(block); lock; + lock = lock_rec_get_next_on_page(lock)) { + page_cur_t cur1; + page_cur_t cur2; + const ulint type_mode = lock->type_mode; + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1)) { + page_cur_move_to_next(&cur1); + } + + page_cur_set_before_first(new_block, &cur2); + page_cur_move_to_next(&cur2); + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + while (!page_cur_is_after_last(&cur1)) { + ulint heap_no; + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + ut_ad(!memcmp(page_cur_get_rec(&cur1), + page_cur_get_rec(&cur2), + rec_get_data_size_old( + page_cur_get_rec(&cur2)))); + } + + if (lock_rec_get_nth_bit(lock, heap_no)) { + lock_rec_reset_nth_bit(lock, heap_no); + + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + lock_reset_lock_and_trx_wait(lock); + } + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + } + + lock_rec_add_to_queue( + type_mode, new_block, heap_no, + lock->index, lock->trx, FALSE); + } + + page_cur_move_to_next(&cur1); + page_cur_move_to_next(&cur2); + } + } + + lock_mutex_exit(); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(block)); + ut_ad(lock_rec_validate_page(new_block)); +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. 
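+Unlike lock_move_rec_list_end(), the scan starts from the infimum of the
+old page and stops at rec, the first record that was NOT copied.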
*/ +UNIV_INTERN +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to + move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end) /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +{ + lock_t* lock; + const ulint comp = page_rec_is_comp(rec); + + ut_ad(block->frame == page_align(rec)); + ut_ad(new_block->frame == page_align(old_end)); + + lock_mutex_enter(); + + for (lock = lock_rec_get_first_on_page(block); lock; + lock = lock_rec_get_next_on_page(lock)) { + page_cur_t cur1; + page_cur_t cur2; + const ulint type_mode = lock->type_mode; + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + page_cur_position(old_end, new_block, &cur2); + page_cur_move_to_next(&cur2); + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + while (page_cur_get_rec(&cur1) != rec) { + ulint heap_no; + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + ut_ad(!memcmp(page_cur_get_rec(&cur1), + page_cur_get_rec(&cur2), + rec_get_data_size_old( + page_cur_get_rec( + &cur2)))); + } + + if (lock_rec_get_nth_bit(lock, heap_no)) { + lock_rec_reset_nth_bit(lock, heap_no); + + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + lock_reset_lock_and_trx_wait(lock); + } + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + } + + lock_rec_add_to_queue( + type_mode, new_block, heap_no, + lock->index, lock->trx, FALSE); + } + + page_cur_move_to_next(&cur1); + page_cur_move_to_next(&cur2); + } + +#ifdef UNIV_DEBUG + if (page_rec_is_supremum(rec)) { + ulint i; + + for (i = PAGE_HEAP_NO_USER_LOW; + i < lock_rec_get_n_bits(lock); i++) { + if (UNIV_UNLIKELY + (lock_rec_get_nth_bit(lock, i))) { + + fprintf(stderr, + "lock_move_rec_list_start():" + " %lu not moved in %p\n", + (ulong) i, (void*) lock); + ut_error; + } + } + } +#endif /* UNIV_DEBUG */ + } + + lock_mutex_exit(); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(block)); +#endif +} + +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +UNIV_INTERN +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block) /*!< in: left page */ +{ + ulint heap_no = lock_get_min_heap_no(right_block); + + lock_mutex_enter(); + + /* Move the locks on the supremum of the left page to the supremum + of the right page */ + + lock_rec_move(right_block, left_block, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of left page from the successor + of the infimum on right page */ + + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, heap_no); + + lock_mutex_exit(); +} + +/*************************************************************//** +Updates the lock table when a page is merged to the right. 
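+The locks on the supremum of the left page are inherited as gap locks by
+orig_succ, the supremum locks are then released, and all lock structs of
+the discarded left page are freed.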
*/ +UNIV_INTERN +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block) /*!< in: merged index + page which will be + discarded */ +{ + lock_mutex_enter(); + + /* Inherit the locks from the supremum of the left page to the + original successor of infimum on the right page, to which the left + page was merged */ + + lock_rec_inherit_to_gap(right_block, left_block, + page_rec_get_heap_no(orig_succ), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, releasing + waiting transactions */ + + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + + lock_rec_free_all_from_discard_page(left_block); + + lock_mutex_exit(); +} + +/*************************************************************//** +Updates the lock table when the root page is copied to another in +btr_root_raise_and_insert. Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +UNIV_INTERN +void +lock_update_root_raise( +/*===================*/ + const buf_block_t* block, /*!< in: index page to which copied */ + const buf_block_t* root) /*!< in: root page */ +{ + lock_mutex_enter(); + + /* Move the locks on the supremum of the root to the supremum + of block */ + + lock_rec_move(block, root, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_mutex_exit(); +} + +/*************************************************************//** +Updates the lock table when a page is copied to another and the original page +is removed from the chain of leaf pages, except if page is the root! */ +UNIV_INTERN +void +lock_update_copy_and_discard( +/*=========================*/ + const buf_block_t* new_block, /*!< in: index page to + which copied */ + const buf_block_t* block) /*!< in: index page; + NOT the root! */ +{ + lock_mutex_enter(); + + /* Move the locks on the supremum of the old page to the supremum + of new_page */ + + lock_rec_move(new_block, block, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(block); + + lock_mutex_exit(); +} + +/*************************************************************//** +Updates the lock table when a page is split to the left. */ +UNIV_INTERN +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block) /*!< in: left page */ +{ + ulint heap_no = lock_get_min_heap_no(right_block); + + lock_mutex_enter(); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, heap_no); + + lock_mutex_exit(); +} + +/*************************************************************//** +Updates the lock table when a page is merged to the left. 
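+This mirrors lock_update_merge_right(): the supremum locks of the left
+page are inherited by the first record moved from the right page (if any),
+and the supremum locks of the right page are moved to the supremum of the
+left page before the right page is discarded.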
*/ +UNIV_INTERN +void +lock_update_merge_left( +/*===================*/ + const buf_block_t* left_block, /*!< in: left page to + which merged */ + const rec_t* orig_pred, /*!< in: original predecessor + of supremum on the left page + before merge */ + const buf_block_t* right_block) /*!< in: merged index page + which will be discarded */ +{ + const rec_t* left_next_rec; + + ut_ad(left_block->frame == page_align(orig_pred)); + + lock_mutex_enter(); + + left_next_rec = page_rec_get_next_const(orig_pred); + + if (!page_rec_is_supremum(left_next_rec)) { + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + + lock_rec_inherit_to_gap(left_block, left_block, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + } + + /* Move the locks from the supremum of right page to the supremum + of the left page */ + + lock_rec_move(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + + lock_rec_free_all_from_discard_page(right_block); + + lock_mutex_exit(); +} + +/*************************************************************//** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec. */ +UNIV_INTERN +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t* heir_block, /*!< in: block containing the + record which inherits */ + const buf_block_t* block, /*!< in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /*!< in: heap_no of the + inheriting record */ + ulint heap_no) /*!< in: heap_no of the + donating record */ +{ + lock_mutex_enter(); + + lock_rec_reset_and_release_wait(heir_block, heir_heap_no); + + lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no); + + lock_mutex_exit(); +} + +/*************************************************************//** +Updates the lock table when a page is discarded. 
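+Every record on the discarded page, from the infimum up to and including
+the supremum, donates its locks to the heir record as gap type locks.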
*/ +UNIV_INTERN +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /*!< in: index page + which will inherit the locks */ + ulint heir_heap_no, /*!< in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block) /*!< in: index page + which will be discarded */ +{ + const page_t* page = block->frame; + const rec_t* rec; + ulint heap_no; + + lock_mutex_enter(); + + if (!lock_rec_get_first_on_page(block)) { + /* No locks exist on page, nothing to do */ + + lock_mutex_exit(); + + return; + } + + /* Inherit all the locks on the page to the record and reset all + the locks on the page */ + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + do { + heap_no = rec_get_heap_no_new(rec); + + lock_rec_inherit_to_gap(heir_block, block, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait(block, heap_no); + + rec = page + rec_get_next_offs(rec, TRUE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } else { + rec = page + PAGE_OLD_INFIMUM; + + do { + heap_no = rec_get_heap_no_old(rec); + + lock_rec_inherit_to_gap(heir_block, block, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait(block, heap_no); + + rec = page + rec_get_next_offs(rec, FALSE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } + + lock_rec_free_all_from_discard_page(block); + + lock_mutex_exit(); +} + +/*************************************************************//** +Updates the lock table when a new user record is inserted. */ +UNIV_INTERN +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: the inserted record */ +{ + ulint receiver_heap_no; + ulint donator_heap_no; + + ut_ad(block->frame == page_align(rec)); + + /* Inherit the gap-locking locks for rec, in gap mode, from the next + record */ + + if (page_rec_is_comp(rec)) { + receiver_heap_no = rec_get_heap_no_new(rec); + donator_heap_no = rec_get_heap_no_new( + page_rec_get_next_low(rec, TRUE)); + } else { + receiver_heap_no = rec_get_heap_no_old(rec); + donator_heap_no = rec_get_heap_no_old( + page_rec_get_next_low(rec, FALSE)); + } + + lock_rec_inherit_to_gap_if_gap_lock( + block, receiver_heap_no, donator_heap_no); +} + +/*************************************************************//** +Updates the lock table when a record is removed. */ +UNIV_INTERN +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: the record to be removed */ +{ + const page_t* page = block->frame; + ulint heap_no; + ulint next_heap_no; + + ut_ad(page == page_align(rec)); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(rec); + next_heap_no = rec_get_heap_no_new(page + + rec_get_next_offs(rec, + TRUE)); + } else { + heap_no = rec_get_heap_no_old(rec); + next_heap_no = rec_get_heap_no_old(page + + rec_get_next_offs(rec, + FALSE)); + } + + lock_mutex_enter(); + + /* Let the next record inherit the locks from rec, in gap mode */ + + lock_rec_inherit_to_gap(block, block, next_heap_no, heap_no); + + /* Reset the lock bits on rec and release waiting transactions */ + + lock_rec_reset_and_release_wait(block, heap_no); + + lock_mutex_exit(); +} + +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. 
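+(An update that changes the stored size of a record is carried out as a
+delete followed by an insert.)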
The record +is moved in such an update, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +UNIV_INTERN +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +{ + ulint heap_no = page_rec_get_heap_no(rec); + + ut_ad(block->frame == page_align(rec)); + + lock_mutex_enter(); + + lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no); + + lock_mutex_exit(); +} + +/*********************************************************************//** +Restores the state of explicit lock requests on a single record, where the +state was stored on the infimum of the page. */ +UNIV_INTERN +void +lock_rec_restore_from_page_infimum( +/*===============================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record whose lock state + is restored */ + const buf_block_t* donator)/*!< in: page (rec is not + necessarily on this page) + whose infimum stored the lock + state; lock bits are reset on + the infimum */ +{ + ulint heap_no = page_rec_get_heap_no(rec); + + lock_mutex_enter(); + + lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM); + + lock_mutex_exit(); +} + +/*=========== DEADLOCK CHECKING ======================================*/ + +/*********************************************************************//** +rewind(3) the file used for storing the latest detected deadlock and +print a heading message to stderr if printing of all deadlocks to stderr +is enabled. */ +UNIV_INLINE +void +lock_deadlock_start_print() +/*=======================*/ +{ + ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); + + rewind(lock_latest_err_file); + ut_print_timestamp(lock_latest_err_file); + + if (srv_print_all_deadlocks) { + ut_print_timestamp(stderr); + fprintf(stderr, "InnoDB: transactions deadlock detected, " + "dumping detailed information.\n"); + ut_print_timestamp(stderr); + } +} + +/*********************************************************************//** +Print a message to the deadlock file and possibly to stderr. */ +UNIV_INLINE +void +lock_deadlock_fputs( +/*================*/ + const char* msg) /*!< in: message to print */ +{ + if (!srv_read_only_mode) { + fputs(msg, lock_latest_err_file); + + if (srv_print_all_deadlocks) { + fputs(msg, stderr); + } + } +} + +/*********************************************************************//** +Print transaction data to the deadlock file and possibly to stderr. 
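+Output is duplicated to stderr only if srv_print_all_deadlocks is set.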
*/ +UNIV_INLINE +void +lock_deadlock_trx_print( +/*====================*/ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); + + ulint n_rec_locks = lock_number_of_rows_locked(&trx->lock); + ulint n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + ulint heap_size = mem_heap_get_size(trx->lock.lock_heap); + + mutex_enter(&trx_sys->mutex); + + trx_print_low(lock_latest_err_file, trx, max_query_len, + n_rec_locks, n_trx_locks, heap_size); + + if (srv_print_all_deadlocks) { + trx_print_low(stderr, trx, max_query_len, + n_rec_locks, n_trx_locks, heap_size); + } + + mutex_exit(&trx_sys->mutex); +} + +/*********************************************************************//** +Print lock data to the deadlock file and possibly to stderr. */ +UNIV_INLINE +void +lock_deadlock_lock_print( +/*=====================*/ + const lock_t* lock) /*!< in: record or table type lock */ +{ + ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); + + if (lock_get_type_low(lock) == LOCK_REC) { + lock_rec_print(lock_latest_err_file, lock); + + if (srv_print_all_deadlocks) { + lock_rec_print(stderr, lock); + } + } else { + lock_table_print(lock_latest_err_file, lock); + + if (srv_print_all_deadlocks) { + lock_table_print(stderr, lock); + } + } +} + +/** Used in deadlock tracking. Protected by lock_sys->mutex. */ +static ib_uint64_t lock_mark_counter = 0; + +/** Check if the search is too deep. */ +#define lock_deadlock_too_deep(c) \ + (c->depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK \ + || c->cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK) + +/********************************************************************//** +Get the next lock in the queue that is owned by a transaction whose +sub-tree has not already been searched. +@return next lock or NULL if at end of queue */ +static +const lock_t* +lock_get_next_lock( +/*===============*/ + const lock_deadlock_ctx_t* + ctx, /*!< in: deadlock context */ + const lock_t* lock, /*!< in: lock in the queue */ + ulint heap_no)/*!< in: heap no if rec lock else + ULINT_UNDEFINED */ +{ + ut_ad(lock_mutex_own()); + + do { + if (lock_get_type_low(lock) == LOCK_REC) { + ut_ad(heap_no != ULINT_UNDEFINED); + lock = lock_rec_get_next_const(heap_no, lock); + } else { + ut_ad(heap_no == ULINT_UNDEFINED); + ut_ad(lock_get_type_low(lock) == LOCK_TABLE); + + lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock); + } + } while (lock != NULL + && lock->trx->lock.deadlock_mark > ctx->mark_start); + + ut_ad(lock == NULL + || lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock)); + + return(lock); +} + +/********************************************************************//** +Get the first lock to search. The search starts from the current +wait_lock. What we are really interested in is an edge from the +current wait_lock's owning transaction to another transaction that has +a lock ahead in the queue. We skip locks where the owning transaction's +sub-tree has already been searched. 
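+Such transactions carry a deadlock_mark newer than ctx->mark_start and
+are stepped over by lock_get_next_lock(). In graph terms, each lock
+returned here represents a waits-for edge, in sketch form:
+
+        trx(ctx->wait_lock)  ---->  trx(lock ahead in the queue)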
+@return first lock or NULL */
+static
+const lock_t*
+lock_get_first_lock(
+/*================*/
+        const lock_deadlock_ctx_t*
+                        ctx,    /*!< in: deadlock context */
+        ulint*          heap_no)/*!< out: heap no if rec lock,
+                                else ULINT_UNDEFINED */
+{
+        const lock_t*   lock;
+
+        ut_ad(lock_mutex_own());
+
+        lock = ctx->wait_lock;
+
+        if (lock_get_type_low(lock) == LOCK_REC) {
+
+                *heap_no = lock_rec_find_set_bit(lock);
+                ut_ad(*heap_no != ULINT_UNDEFINED);
+
+                lock = lock_rec_get_first_on_page_addr(
+                        lock->un_member.rec_lock.space,
+                        lock->un_member.rec_lock.page_no);
+
+                /* Position on the first lock on the physical record. */
+                if (!lock_rec_get_nth_bit(lock, *heap_no)) {
+                        lock = lock_rec_get_next_const(*heap_no, lock);
+                }
+
+        } else {
+                *heap_no = ULINT_UNDEFINED;
+                ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+                lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
+        }
+
+        ut_a(lock != NULL);
+        ut_a(lock != ctx->wait_lock);
+        ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
+
+        return(lock);
+}
+
+/********************************************************************//**
+Notify that a deadlock has been detected and print the conflicting
+transaction info. */
+static
+void
+lock_deadlock_notify(
+/*=================*/
+        const lock_deadlock_ctx_t*      ctx,    /*!< in: deadlock context */
+        const lock_t*                   lock)   /*!< in: lock causing
+                                                deadlock */
+{
+        ut_ad(lock_mutex_own());
+        ut_ad(!srv_read_only_mode);
+
+        lock_deadlock_start_print();
+
+        lock_deadlock_fputs("\n*** (1) TRANSACTION:\n");
+
+        lock_deadlock_trx_print(ctx->wait_lock->trx, 3000);
+
+        lock_deadlock_fputs("*** (1) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+        lock_deadlock_lock_print(ctx->wait_lock);
+
+        lock_deadlock_fputs("*** (2) TRANSACTION:\n");
+
+        lock_deadlock_trx_print(lock->trx, 3000);
+
+        lock_deadlock_fputs("*** (2) HOLDS THE LOCK(S):\n");
+
+        lock_deadlock_lock_print(lock);
+
+        /* It is possible that the joining transaction was granted its
+        lock when we rolled back some other waiting transaction. */
+
+        if (ctx->start->lock.wait_lock != 0) {
+                lock_deadlock_fputs(
+                        "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+                lock_deadlock_lock_print(ctx->start->lock.wait_lock);
+        }
+
+#ifdef UNIV_DEBUG
+        if (lock_print_waits) {
+                fputs("Deadlock detected\n", stderr);
+        }
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Select the victim transaction that should be rolled back.
+@return victim transaction */
+static
+const trx_t*
+lock_deadlock_select_victim(
+/*========================*/
+        const lock_deadlock_ctx_t*      ctx)    /*!< in: deadlock context */
+{
+        ut_ad(lock_mutex_own());
+        ut_ad(ctx->start->lock.wait_lock != 0);
+        ut_ad(ctx->wait_lock->trx != ctx->start);
+
+        if (trx_weight_ge(ctx->wait_lock->trx, ctx->start)) {
+                /* The joining transaction is 'smaller',
+                choose it as the victim and roll it back. */
+
+                return(ctx->start);
+        }
+
+        return(ctx->wait_lock->trx);
+}
+
+/********************************************************************//**
+Pop the deadlock search state from the stack.
+@return stack slot instance that was on top of the stack. */
+static
+const lock_stack_t*
+lock_deadlock_pop(
+/*==============*/
+        lock_deadlock_ctx_t*    ctx)    /*!< in/out: context */
+{
+        ut_ad(lock_mutex_own());
+
+        ut_ad(ctx->depth > 0);
+
+        return(&lock_stack[--ctx->depth]);
+}
+
+/********************************************************************//**
+Push the deadlock search state onto the stack. 
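+Returns NULL when all LOCK_STACK_SIZE slots are in use; the caller then
+treats the search as too deep.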
+@return slot that was used in the stack */ +static +lock_stack_t* +lock_deadlock_push( +/*===============*/ + lock_deadlock_ctx_t* ctx, /*!< in/out: context */ + const lock_t* lock, /*!< in: current lock */ + ulint heap_no) /*!< in: heap number */ +{ + ut_ad(lock_mutex_own()); + + /* Save current search state. */ + + if (LOCK_STACK_SIZE > ctx->depth) { + lock_stack_t* stack; + + stack = &lock_stack[ctx->depth++]; + + stack->lock = lock; + stack->heap_no = heap_no; + stack->wait_lock = ctx->wait_lock; + + return(stack); + } + + return(NULL); +} + +/********************************************************************//** +Looks iteratively for a deadlock. Note: the joining transaction may +have been granted its lock by the deadlock checks. +@return 0 if no deadlock else the victim transaction id.*/ +static +trx_id_t +lock_deadlock_search( +/*=================*/ + lock_deadlock_ctx_t* ctx) /*!< in/out: deadlock context */ +{ + const lock_t* lock; + ulint heap_no; + + ut_ad(lock_mutex_own()); + ut_ad(!trx_mutex_own(ctx->start)); + + ut_ad(ctx->start != NULL); + ut_ad(ctx->wait_lock != NULL); + assert_trx_in_list(ctx->wait_lock->trx); + ut_ad(ctx->mark_start <= lock_mark_counter); + + /* Look at the locks ahead of wait_lock in the lock queue. */ + lock = lock_get_first_lock(ctx, &heap_no); + + for (;;) { + + /* We should never visit the same sub-tree more than once. */ + ut_ad(lock == NULL + || lock->trx->lock.deadlock_mark <= ctx->mark_start); + + while (ctx->depth > 0 && lock == NULL) { + const lock_stack_t* stack; + + /* Restore previous search state. */ + + stack = lock_deadlock_pop(ctx); + + lock = stack->lock; + heap_no = stack->heap_no; + ctx->wait_lock = stack->wait_lock; + + lock = lock_get_next_lock(ctx, lock, heap_no); + } + + if (lock == NULL) { + break; + } else if (lock == ctx->wait_lock) { + + /* We can mark this subtree as searched */ + ut_ad(lock->trx->lock.deadlock_mark <= ctx->mark_start); + + lock->trx->lock.deadlock_mark = ++lock_mark_counter; + + /* We are not prepared for an overflow. This 64-bit + counter should never wrap around. At 10^9 increments + per second, it would take 10^3 years of uptime. */ + + ut_ad(lock_mark_counter > 0); + + lock = NULL; + + } else if (!lock_has_to_wait(ctx->wait_lock, lock)) { + + /* No conflict, next lock */ + lock = lock_get_next_lock(ctx, lock, heap_no); + + } else if (lock->trx == ctx->start) { + + /* Found a cycle. */ + + lock_deadlock_notify(ctx, lock); + + return(lock_deadlock_select_victim(ctx)->id); + + } else if (lock_deadlock_too_deep(ctx)) { + + /* Search too deep to continue. */ + + ctx->too_deep = TRUE; + + /* Select the joining transaction as the victim. */ + return(ctx->start->id); + + } else if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + /* Another trx ahead has requested a lock in an + incompatible mode, and is itself waiting for a lock. */ + + ++ctx->cost; + + /* Save current search state. */ + if (!lock_deadlock_push(ctx, lock, heap_no)) { + + /* Unable to save current search state, stack + size not big enough. */ + + ctx->too_deep = TRUE; + + return(ctx->start->id); + } + + ctx->wait_lock = lock->trx->lock.wait_lock; + lock = lock_get_first_lock(ctx, &heap_no); + + if (lock->trx->lock.deadlock_mark > ctx->mark_start) { + lock = lock_get_next_lock(ctx, lock, heap_no); + } + + } else { + lock = lock_get_next_lock(ctx, lock, heap_no); + } + } + + ut_a(lock == NULL && ctx->depth == 0); + + /* No deadlock found. 
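+        Returning 0 tells the caller that no victim needs to be
+        rolled back.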
*/
+        return(0);
+}
+
+/********************************************************************//**
+Print info about transaction that was rolled back. */
+static
+void
+lock_deadlock_joining_trx_print(
+/*============================*/
+        const trx_t*    trx,    /*!< in: transaction rolled back */
+        const lock_t*   lock)   /*!< in: lock trx wants */
+{
+        ut_ad(lock_mutex_own());
+        ut_ad(!srv_read_only_mode);
+
+        /* If the lock search exceeds the max step
+        or the max depth, the current trx will be
+        the victim. Print its information. */
+        lock_deadlock_start_print();
+
+        lock_deadlock_fputs(
+                "TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+                " WAITS-FOR GRAPH, WE WILL ROLL BACK"
+                " FOLLOWING TRANSACTION \n\n"
+                "*** TRANSACTION:\n");
+
+        lock_deadlock_trx_print(trx, 3000);
+
+        lock_deadlock_fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+        lock_deadlock_lock_print(lock);
+}
+
+/********************************************************************//**
+Roll back the transaction selected as the victim. */
+static
+void
+lock_deadlock_trx_rollback(
+/*=======================*/
+        lock_deadlock_ctx_t*    ctx)    /*!< in: deadlock context */
+{
+        trx_t*  trx;
+
+        ut_ad(lock_mutex_own());
+
+        trx = ctx->wait_lock->trx;
+
+        lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (1)\n");
+
+        trx_mutex_enter(trx);
+
+        trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+        lock_cancel_waiting_and_release(trx->lock.wait_lock);
+
+        trx_mutex_exit(trx);
+}
+
+/********************************************************************//**
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found, this function resolves the deadlock by choosing a victim transaction
+and rolling it back. It attempts to resolve all deadlocks. The returned
+transaction id is the joining transaction's id if it was chosen as the
+victim, and 0 otherwise: either some other transaction was chosen as a
+victim and rolled back, or no deadlock was found.
+
+@return id of transaction chosen as victim or 0 */
+static
+trx_id_t
+lock_deadlock_check_and_resolve(
+/*============================*/
+        const lock_t*   lock,   /*!< in: lock the transaction is requesting */
+        const trx_t*    trx)    /*!< in: transaction */
+{
+        trx_id_t        victim_trx_id;
+
+        ut_ad(trx != NULL);
+        ut_ad(lock != NULL);
+        ut_ad(lock_mutex_own());
+        assert_trx_in_list(trx);
+
+        /* Try and resolve as many deadlocks as possible. */
+        do {
+                lock_deadlock_ctx_t     ctx;
+
+                /* Reset the context. */
+                ctx.cost = 0;
+                ctx.depth = 0;
+                ctx.start = trx;
+                ctx.too_deep = FALSE;
+                ctx.wait_lock = lock;
+                ctx.mark_start = lock_mark_counter;
+
+                victim_trx_id = lock_deadlock_search(&ctx);
+
+                /* Search too deep, we roll back the joining transaction. */
+                if (ctx.too_deep) {
+
+                        ut_a(trx == ctx.start);
+                        ut_a(victim_trx_id == trx->id);
+
+                        if (!srv_read_only_mode) {
+                                lock_deadlock_joining_trx_print(trx, lock);
+                        }
+
+                        MONITOR_INC(MONITOR_DEADLOCK);
+
+                } else if (victim_trx_id != 0 && victim_trx_id != trx->id) {
+
+                        ut_ad(victim_trx_id == ctx.wait_lock->trx->id);
+                        lock_deadlock_trx_rollback(&ctx);
+
+                        lock_deadlock_found = TRUE;
+
+                        MONITOR_INC(MONITOR_DEADLOCK);
+                }
+
+        } while (victim_trx_id != 0 && victim_trx_id != trx->id);
+
+        /* If the joining transaction was selected as the victim, only
+        the bookkeeping below is done here; the caller performs the
+        actual rollback. 
*/ + if (victim_trx_id != 0) { + ut_a(victim_trx_id == trx->id); + + srv_stats.lock_deadlock_count.inc(); + + lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (2)\n"); + + lock_deadlock_found = TRUE; + } + + return(victim_trx_id); +} + +/*========================= TABLE LOCKS ==============================*/ + +/*********************************************************************//** +Creates a table lock object and adds it as the last in the lock queue +of the table. Does NOT check for deadlocks or lock compatibility. +@return own: new lock object */ +UNIV_INLINE +lock_t* +lock_table_create( +/*==============*/ + dict_table_t* table, /*!< in/out: database table + in dictionary cache */ + ulint type_mode,/*!< in: lock mode possibly ORed with + LOCK_WAIT */ + trx_t* trx) /*!< in: trx */ +{ + lock_t* lock; + + ut_ad(table && trx); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(trx)); + + /* Non-locking autocommit read-only transactions should not set + any locks. */ + assert_trx_in_list(trx); + + if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) { + ++table->n_waiting_or_granted_auto_inc_locks; + } + + /* For AUTOINC locking we reuse the lock instance only if + there is no wait involved else we allocate the waiting lock + from the transaction lock heap. */ + if (type_mode == LOCK_AUTO_INC) { + + lock = table->autoinc_lock; + + table->autoinc_trx = trx; + + ib_vector_push(trx->autoinc_locks, &lock); + } else { + lock = static_cast<lock_t*>( + mem_heap_alloc(trx->lock.lock_heap, sizeof(*lock))); + } + + lock->type_mode = type_mode | LOCK_TABLE; + lock->trx = trx; + + lock->un_member.tab_lock.table = table; + + ut_ad(table->n_ref_count > 0 || !table->can_be_evicted); + + UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock); + UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); + + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + + lock_set_lock_and_trx_wait(lock, trx); + } + + ib_vector_push(lock->trx->lock.table_locks, &lock); + + MONITOR_INC(MONITOR_TABLELOCK_CREATED); + MONITOR_INC(MONITOR_NUM_TABLELOCK); + + return(lock); +} + +/*************************************************************//** +Pops autoinc lock requests from the transaction's autoinc_locks. We +handle the case where there are gaps in the array and they need to +be popped off the stack. */ +UNIV_INLINE +void +lock_table_pop_autoinc_locks( +/*=========================*/ + trx_t* trx) /*!< in/out: transaction that owns the AUTOINC locks */ +{ + ut_ad(lock_mutex_own()); + ut_ad(!ib_vector_is_empty(trx->autoinc_locks)); + + /* Skip any gaps, gaps are NULL lock entries in the + trx->autoinc_locks vector. */ + + do { + ib_vector_pop(trx->autoinc_locks); + + if (ib_vector_is_empty(trx->autoinc_locks)) { + return; + } + + } while (*(lock_t**) ib_vector_get_last(trx->autoinc_locks) == NULL); +} + +/*************************************************************//** +Removes an autoinc lock request from the transaction's autoinc_locks. */ +UNIV_INLINE +void +lock_table_remove_autoinc_lock( +/*===========================*/ + lock_t* lock, /*!< in: table lock */ + trx_t* trx) /*!< in/out: transaction that owns the lock */ +{ + lock_t* autoinc_lock; + lint i = ib_vector_size(trx->autoinc_locks) - 1; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_mode(lock) == LOCK_AUTO_INC); + ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + ut_ad(!ib_vector_is_empty(trx->autoinc_locks)); + + /* With stored functions and procedures the user may drop + a table within the same "statement". 
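+(For example, a stored procedure invoked by the statement may itself drop
+one of the tables on which AUTOINC locks are still held.)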
This special case has + to be handled by deleting only those AUTOINC locks that were + held by the table being dropped. */ + + autoinc_lock = *static_cast<lock_t**>( + ib_vector_get(trx->autoinc_locks, i)); + + /* This is the default fast case. */ + + if (autoinc_lock == lock) { + lock_table_pop_autoinc_locks(trx); + } else { + /* The last element should never be NULL */ + ut_a(autoinc_lock != NULL); + + /* Handle freeing the locks from within the stack. */ + + while (--i >= 0) { + autoinc_lock = *static_cast<lock_t**>( + ib_vector_get(trx->autoinc_locks, i)); + + if (UNIV_LIKELY(autoinc_lock == lock)) { + void* null_var = NULL; + ib_vector_set(trx->autoinc_locks, i, &null_var); + return; + } + } + + /* Must find the autoinc lock. */ + ut_error; + } +} + +/*************************************************************//** +Removes a table lock request from the queue and the trx list of locks; +this is a low-level function which does NOT check if waiting requests +can now be granted. */ +UNIV_INLINE +void +lock_table_remove_low( +/*==================*/ + lock_t* lock) /*!< in/out: table lock */ +{ + trx_t* trx; + dict_table_t* table; + + ut_ad(lock_mutex_own()); + + trx = lock->trx; + table = lock->un_member.tab_lock.table; + + /* Remove the table from the transaction's AUTOINC vector, if + the lock that is being released is an AUTOINC lock. */ + if (lock_get_mode(lock) == LOCK_AUTO_INC) { + + /* The table's AUTOINC lock can get transferred to + another transaction before we get here. */ + if (table->autoinc_trx == trx) { + table->autoinc_trx = NULL; + } + + /* The locks must be freed in the reverse order from + the one in which they were acquired. This is to avoid + traversing the AUTOINC lock vector unnecessarily. + + We only store locks that were granted in the + trx->autoinc_locks vector (see lock_table_create() + and lock_grant()). Therefore it can be empty and we + need to check for that. */ + + if (!lock_get_wait(lock) + && !ib_vector_is_empty(trx->autoinc_locks)) { + + lock_table_remove_autoinc_lock(lock, trx); + } + + ut_a(table->n_waiting_or_granted_auto_inc_locks > 0); + table->n_waiting_or_granted_auto_inc_locks--; + } + + UT_LIST_REMOVE(trx_locks, trx->lock.trx_locks, lock); + UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock); + + MONITOR_INC(MONITOR_TABLELOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_TABLELOCK); +} + +/*********************************************************************//** +Enqueues a waiting request for a table lock which cannot be granted +immediately. Checks for deadlocks. 
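+The caller must hold lock_sys->mutex and the mutex of the transaction
+requesting the lock.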
+@return DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED, or +DB_SUCCESS; DB_SUCCESS means that there was a deadlock, but another +transaction was chosen as a victim, and we got the lock immediately: +no need to wait then */ +static +dberr_t +lock_table_enqueue_waiting( +/*=======================*/ + ulint mode, /*!< in: lock mode this transaction is + requesting */ + dict_table_t* table, /*!< in/out: table */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + lock_t* lock; + trx_id_t victim_trx_id; + ulint sec; + ulint ms; + + ut_ad(lock_mutex_own()); + ut_ad(!srv_read_only_mode); + + trx = thr_get_trx(thr); + ut_ad(trx_mutex_own(trx)); + + /* Test if there already is some other reason to suspend thread: + we do not enqueue a lock request if the query thread should be + stopped anyway */ + + if (que_thr_stop(thr)) { + ut_error; + + return(DB_QUE_THR_SUSPENDED); + } + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + break; + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: a table lock wait happens" + " in a dictionary operation!\n" + "InnoDB: Table name ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(".\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + stderr); + ut_ad(0); + } + + /* Enqueue the lock request that will wait to be granted */ + + lock = lock_table_create(table, mode | LOCK_WAIT, trx); + + /* Release the mutex to obey the latching order. + This is safe, because lock_deadlock_check_and_resolve() + is invoked when a lock wait is enqueued for the currently + running transaction. Because trx is a running transaction + (it is not currently suspended because of a lock wait), + its state can only be changed by this thread, which is + currently associated with the transaction. */ + + trx_mutex_exit(trx); + + victim_trx_id = lock_deadlock_check_and_resolve(lock, trx); + + trx_mutex_enter(trx); + + if (victim_trx_id != 0) { + ut_ad(victim_trx_id == trx->id); + + /* The order here is important, we don't want to + lose the state of the lock before calling remove. */ + lock_table_remove_low(lock); + lock_reset_lock_and_trx_wait(lock); + + return(DB_DEADLOCK); + } else if (trx->lock.wait_lock == NULL) { + /* Deadlock resolution chose another transaction as a victim, + and we accidentally got our lock granted! */ + + return(DB_SUCCESS); + } + + trx->lock.que_state = TRX_QUE_LOCK_WAIT; + + trx->lock.wait_started = ut_time(); + trx->lock.was_chosen_as_deadlock_victim = FALSE; + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms; + } + + ut_a(que_thr_stop(thr)); + + MONITOR_INC(MONITOR_TABLELOCK_WAIT); + + return(DB_LOCK_WAIT); +} + +/*********************************************************************//** +Checks if other transactions have an incompatible mode lock request in +the lock queue. 
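+The queue is scanned from the newest request backwards; the first
+incompatible lock found is returned.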
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_other_has_incompatible(
+/*==============================*/
+        const trx_t*            trx,    /*!< in: transaction, or NULL if all
+                                        transactions should be included */
+        ulint                   wait,   /*!< in: LOCK_WAIT if also
+                                        waiting locks are taken into
+                                        account, or 0 if not */
+        const dict_table_t*     table,  /*!< in: table */
+        enum lock_mode          mode)   /*!< in: lock mode */
+{
+        const lock_t*   lock;
+
+        ut_ad(lock_mutex_own());
+
+        for (lock = UT_LIST_GET_LAST(table->locks);
+             lock != NULL;
+             lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
+
+                if (lock->trx != trx
+                    && !lock_mode_compatible(lock_get_mode(lock), mode)
+                    && (wait || !lock_get_wait(lock))) {
+
+                        return(lock);
+                }
+        }
+
+        return(NULL);
+}
+
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_table(
+/*=======*/
+        ulint           flags,  /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+                                does nothing */
+        dict_table_t*   table,  /*!< in/out: database table
+                                in dictionary cache */
+        enum lock_mode  mode,   /*!< in: lock mode */
+        que_thr_t*      thr)    /*!< in: query thread */
+{
+        trx_t*          trx;
+        dberr_t         err;
+        const lock_t*   wait_for;
+
+        ut_ad(table && thr);
+
+        if (flags & BTR_NO_LOCKING_FLAG) {
+
+                return(DB_SUCCESS);
+        }
+
+        ut_a(flags == 0);
+
+        trx = thr_get_trx(thr);
+
+        if (UNIV_UNLIKELY(trx->fake_changes && mode == LOCK_IX)) {
+                mode = LOCK_IS;
+        }
+
+        /* Look for equal or stronger locks the same trx already
+        has on the table. No need to acquire the lock mutex here
+        because only this transaction can add/access table locks
+        to/from trx_t::table_locks. */
+
+        if (lock_table_has(trx, table, mode)) {
+
+                return(DB_SUCCESS);
+        }
+
+        lock_mutex_enter();
+
+        /* We have to check if the new lock is compatible with any locks
+        other transactions have in the table lock queue. */
+
+        wait_for = lock_table_other_has_incompatible(
+                trx, LOCK_WAIT, table, mode);
+
+        trx_mutex_enter(trx);
+
+        /* Another trx has a request on the table in an incompatible
+        mode: this trx may have to wait */
+
+        if (wait_for != NULL) {
+                err = lock_table_enqueue_waiting(mode | flags, table, thr);
+        } else {
+                lock_table_create(table, mode | flags, trx);
+
+                ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
+
+                err = DB_SUCCESS;
+        }
+
+        lock_mutex_exit();
+
+        trx_mutex_exit(trx);
+
+        return(err);
+}
+
+/*********************************************************************//**
+Creates a table IX lock object for a resurrected transaction. */
+UNIV_INTERN
+void
+lock_table_ix_resurrect(
+/*====================*/
+        dict_table_t*   table,  /*!< in/out: table */
+        trx_t*          trx)    /*!< in/out: transaction */
+{
+        ut_ad(trx->is_recovered);
+
+        if (lock_table_has(trx, table, LOCK_IX)) {
+                return;
+        }
+
+        lock_mutex_enter();
+
+        /* We have to check if the new lock is compatible with any locks
+        other transactions have in the table lock queue. */
+
+        ut_ad(!lock_table_other_has_incompatible(
+                      trx, LOCK_WAIT, table, LOCK_IX));
+
+        trx_mutex_enter(trx);
+        lock_table_create(table, LOCK_IX, trx);
+        lock_mutex_exit();
+        trx_mutex_exit(trx);
+}
+
+/*********************************************************************//**
+Checks if a waiting table lock request still has to wait in a queue. 
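+Only locks ahead of wait_lock in the table's lock queue are examined.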
+@return TRUE if still has to wait */ +static +ibool +lock_table_has_to_wait_in_queue( +/*============================*/ + const lock_t* wait_lock) /*!< in: waiting table lock */ +{ + const dict_table_t* table; + const lock_t* lock; + + ut_ad(lock_mutex_own()); + ut_ad(lock_get_wait(wait_lock)); + + table = wait_lock->un_member.tab_lock.table; + + for (lock = UT_LIST_GET_FIRST(table->locks); + lock != wait_lock; + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) { + + if (lock_has_to_wait(wait_lock, lock)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*************************************************************//** +Removes a table lock request, waiting or granted, from the queue and grants +locks to other transactions in the queue, if they now are entitled to a +lock. */ +static +void +lock_table_dequeue( +/*===============*/ + lock_t* in_lock)/*!< in/out: table lock object; transactions waiting + behind will get their lock requests granted, if + they are now qualified to it */ +{ + lock_t* lock; + + ut_ad(lock_mutex_own()); + ut_a(lock_get_type_low(in_lock) == LOCK_TABLE); + + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock); + + lock_table_remove_low(in_lock); + + /* Check if waiting locks in the queue can now be granted: grant + locks if there are no conflicting locks ahead. */ + + for (/* No op */; + lock != NULL; + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) { + + if (lock_get_wait(lock) + && !lock_table_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + ut_ad(in_lock->trx != lock->trx); + lock_grant(lock); + } + } +} + +/*=========================== LOCK RELEASE ==============================*/ + +/*************************************************************//** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. */ +UNIV_INTERN +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record */ + enum lock_mode lock_mode)/*!< in: LOCK_S or LOCK_X */ +{ + lock_t* first_lock; + lock_t* lock; + ulint heap_no; + const char* stmt; + size_t stmt_len; + + ut_ad(trx); + ut_ad(rec); + ut_ad(block->frame == page_align(rec)); + ut_ad(!trx->lock.wait_lock); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + + heap_no = page_rec_get_heap_no(rec); + + lock_mutex_enter(); + trx_mutex_enter(trx); + + first_lock = lock_rec_get_first(block, heap_no); + + /* Find the last lock with the same lock_mode and transaction + on the record. 
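+        (The queue is scanned from the front; the first lock of this
+        transaction with the given mode is released below.)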
*/
+
+        for (lock = first_lock; lock != NULL;
+             lock = lock_rec_get_next(heap_no, lock)) {
+                if (lock->trx == trx && lock_get_mode(lock) == lock_mode) {
+                        goto released;
+                }
+        }
+
+        lock_mutex_exit();
+        trx_mutex_exit(trx);
+
+        stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
+        ut_print_timestamp(stderr);
+        fprintf(stderr,
+                " InnoDB: Error: unlock row could not"
+                " find a %lu mode lock on the record\n",
+                (ulong) lock_mode);
+        ut_print_timestamp(stderr);
+        fprintf(stderr, " InnoDB: current statement: %.*s\n",
+                (int) stmt_len, stmt);
+
+        return;
+
+released:
+        ut_a(!lock_get_wait(lock));
+        lock_rec_reset_nth_bit(lock, heap_no);
+
+        /* Check if we can now grant waiting lock requests */
+
+        for (lock = first_lock; lock != NULL;
+             lock = lock_rec_get_next(heap_no, lock)) {
+                if (lock_get_wait(lock)
+                    && !lock_rec_has_to_wait_in_queue(lock)) {
+
+                        /* Grant the lock */
+                        ut_ad(trx != lock->trx);
+                        lock_grant(lock);
+                }
+        }
+
+        lock_mutex_exit();
+        trx_mutex_exit(trx);
+}
+
+/*********************************************************************//**
+Releases transaction locks, and releases possible other transactions waiting
+because of these locks. */
+static
+void
+lock_release(
+/*=========*/
+        trx_t*  trx)    /*!< in/out: transaction */
+{
+        lock_t*         lock;
+        ulint           count = 0;
+        trx_id_t        max_trx_id;
+
+        ut_ad(lock_mutex_own());
+        ut_ad(!trx_mutex_own(trx));
+
+        max_trx_id = trx_sys_get_max_trx_id();
+
+        for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+             lock != NULL;
+             lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) {
+
+                if (lock_get_type_low(lock) == LOCK_REC) {
+
+#ifdef UNIV_DEBUG
+                        /* Check if the transaction locked a record
+                        in a system table in X mode. It should have set
+                        the dict_op code correctly if it did. */
+                        if (lock->index->table->id < DICT_HDR_FIRST_ID
+                            && lock_get_mode(lock) == LOCK_X) {
+
+                                ut_ad(lock_get_mode(lock) != LOCK_IX);
+                                ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+                        }
+#endif /* UNIV_DEBUG */
+
+                        lock_rec_dequeue_from_page(lock);
+                } else {
+                        dict_table_t*   table;
+
+                        table = lock->un_member.tab_lock.table;
+#ifdef UNIV_DEBUG
+                        ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+                        /* Check if the transaction locked a system table
+                        in IX mode. It should have set the dict_op code
+                        correctly if it did. */
+                        if (table->id < DICT_HDR_FIRST_ID
+                            && (lock_get_mode(lock) == LOCK_X
+                                || lock_get_mode(lock) == LOCK_IX)) {
+
+                                ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+                        }
+#endif /* UNIV_DEBUG */
+
+                        if (lock_get_mode(lock) != LOCK_IS
+                            && trx->undo_no != 0) {
+
+                                /* The trx may have modified the table. We
+                                block the use of the MySQL query cache for
+                                all currently active transactions. */
+
+                                table->query_cache_inv_trx_id = max_trx_id;
+                        }
+
+                        lock_table_dequeue(lock);
+                }
+
+                if (count == LOCK_RELEASE_INTERVAL) {
+                        /* Release the mutex for a while, so that we
+                        do not monopolize it */
+
+                        lock_mutex_exit();
+
+                        lock_mutex_enter();
+
+                        count = 0;
+                }
+
+                ++count;
+        }
+
+        /* We don't remove the locks one by one from the vector for
+        efficiency reasons. We simply reset it because we would have
+        released all the locks anyway. 
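+        (Removing the entries one by one would require a linear scan
+        of the vector for each lock.)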
*/ + + ib_vector_reset(trx->lock.table_locks); + + ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + ut_a(ib_vector_is_empty(trx->lock.table_locks)); + + mem_heap_empty(trx->lock.lock_heap); +} + +/* True if a lock mode is S or X */ +#define IS_LOCK_S_OR_X(lock) \ + (lock_get_mode(lock) == LOCK_S \ + || lock_get_mode(lock) == LOCK_X) + +/*********************************************************************//** +Removes table locks of the transaction on a table to be dropped. */ +static +void +lock_trx_table_locks_remove( +/*========================*/ + const lock_t* lock_to_remove) /*!< in: lock to remove */ +{ + lint i; + trx_t* trx = lock_to_remove->trx; + + ut_ad(lock_mutex_own()); + + /* It is safe to read this because we are holding the lock mutex */ + if (!trx->lock.cancel) { + trx_mutex_enter(trx); + } else { + ut_ad(trx_mutex_own(trx)); + } + + for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) { + const lock_t* lock; + + lock = *static_cast<lock_t**>( + ib_vector_get(trx->lock.table_locks, i)); + + if (lock == NULL) { + continue; + } + + ut_a(trx == lock->trx); + ut_a(lock_get_type_low(lock) & LOCK_TABLE); + ut_a(lock->un_member.tab_lock.table != NULL); + + if (lock == lock_to_remove) { + void* null_var = NULL; + ib_vector_set(trx->lock.table_locks, i, &null_var); + + if (!trx->lock.cancel) { + trx_mutex_exit(trx); + } + + return; + } + } + + if (!trx->lock.cancel) { + trx_mutex_exit(trx); + } + + /* Lock must exist in the vector. */ + ut_error; +} + +/*********************************************************************//** +Removes locks of a transaction on a table to be dropped. +If remove_also_table_sx_locks is TRUE then table-level S and X locks are +also removed in addition to other table-level and record-level locks. +No lock that is going to be removed is allowed to be a wait lock. */ +static +void +lock_remove_all_on_table_for_trx( +/*=============================*/ + dict_table_t* table, /*!< in: table to be dropped */ + trx_t* trx, /*!< in: a transaction */ + ibool remove_also_table_sx_locks)/*!< in: also removes + table S and X locks */ +{ + lock_t* lock; + lock_t* prev_lock; + + ut_ad(lock_mutex_own()); + + for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks); + lock != NULL; + lock = prev_lock) { + + prev_lock = UT_LIST_GET_PREV(trx_locks, lock); + + if (lock_get_type_low(lock) == LOCK_REC + && lock->index->table == table) { + ut_a(!lock_get_wait(lock)); + + lock_rec_discard(lock); + } else if (lock_get_type_low(lock) & LOCK_TABLE + && lock->un_member.tab_lock.table == table + && (remove_also_table_sx_locks + || !IS_LOCK_S_OR_X(lock))) { + + ut_a(!lock_get_wait(lock)); + + lock_trx_table_locks_remove(lock); + lock_table_remove_low(lock); + } + } +} + +/*******************************************************************//** +Remove any explicit record locks held by recovering transactions on +the table. 
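+Matching table locks of recovered transactions are removed as well.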
+@return number of recovered transactions examined */ +static +ulint +lock_remove_recovered_trx_record_locks( +/*===================================*/ + dict_table_t* table) /*!< in: check if there are any locks + held on records in this table or on the + table itself */ +{ + trx_t* trx; + ulint n_recovered_trx = 0; + + ut_a(table != NULL); + ut_ad(lock_mutex_own()); + + mutex_enter(&trx_sys->mutex); + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + lock_t* lock; + lock_t* next_lock; + + assert_trx_in_rw_list(trx); + + if (!trx->is_recovered) { + continue; + } + + /* Because we are holding the lock_sys->mutex, + implicit locks cannot be converted to explicit ones + while we are scanning the explicit locks. */ + + for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); + lock != NULL; + lock = next_lock) { + + ut_a(lock->trx == trx); + + /* Recovered transactions can't wait on a lock. */ + + ut_a(!lock_get_wait(lock)); + + next_lock = UT_LIST_GET_NEXT(trx_locks, lock); + + switch (lock_get_type_low(lock)) { + default: + ut_error; + case LOCK_TABLE: + if (lock->un_member.tab_lock.table == table) { + lock_trx_table_locks_remove(lock); + lock_table_remove_low(lock); + } + break; + case LOCK_REC: + if (lock->index->table == table) { + lock_rec_discard(lock); + } + } + } + + ++n_recovered_trx; + } + + mutex_exit(&trx_sys->mutex); + + return(n_recovered_trx); +} + +/*********************************************************************//** +Removes locks on a table to be dropped or truncated. +If remove_also_table_sx_locks is TRUE then table-level S and X locks are +also removed in addition to other table-level and record-level locks. +No lock, that is going to be removed, is allowed to be a wait lock. */ +UNIV_INTERN +void +lock_remove_all_on_table( +/*=====================*/ + dict_table_t* table, /*!< in: table to be dropped + or truncated */ + ibool remove_also_table_sx_locks)/*!< in: also removes + table S and X locks */ +{ + lock_t* lock; + + lock_mutex_enter(); + + for (lock = UT_LIST_GET_FIRST(table->locks); + lock != NULL; + /* No op */) { + + lock_t* prev_lock; + + prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock); + + /* If we should remove all locks (remove_also_table_sx_locks + is TRUE), or if the lock is not table-level S or X lock, + then check we are not going to remove a wait lock. */ + if (remove_also_table_sx_locks + || !(lock_get_type(lock) == LOCK_TABLE + && IS_LOCK_S_OR_X(lock))) { + + ut_a(!lock_get_wait(lock)); + } + + lock_remove_all_on_table_for_trx( + table, lock->trx, remove_also_table_sx_locks); + + if (prev_lock == NULL) { + if (lock == UT_LIST_GET_FIRST(table->locks)) { + /* lock was not removed, pick its successor */ + lock = UT_LIST_GET_NEXT( + un_member.tab_lock.locks, lock); + } else { + /* lock was removed, pick the first one */ + lock = UT_LIST_GET_FIRST(table->locks); + } + } else if (UT_LIST_GET_NEXT(un_member.tab_lock.locks, + prev_lock) != lock) { + /* If lock was removed by + lock_remove_all_on_table_for_trx() then pick the + successor of prev_lock ... */ + lock = UT_LIST_GET_NEXT( + un_member.tab_lock.locks, prev_lock); + } else { + /* ... otherwise pick the successor of lock. */ + lock = UT_LIST_GET_NEXT( + un_member.tab_lock.locks, lock); + } + } + + /* Note: Recovered transactions don't have table level IX or IS locks + but can have implicit record locks that have been converted to explicit + record locks. 
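+(The conversion typically happens when another transaction checks whether
+the implicit lock conflicts with its own request.)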
Such record locks cannot be freed by traversing the + transaction lock list in dict_table_t (as above). */ + + if (!lock_sys->rollback_complete + && lock_remove_recovered_trx_record_locks(table) == 0) { + + lock_sys->rollback_complete = TRUE; + } + + lock_mutex_exit(); +} + +/*===================== VALIDATION AND DEBUGGING ====================*/ + +/*********************************************************************//** +Prints info of a table lock. */ +UNIV_INTERN +void +lock_table_print( +/*=============*/ + FILE* file, /*!< in: file where to print */ + const lock_t* lock) /*!< in: table type lock */ +{ + ut_ad(lock_mutex_own()); + ut_a(lock_get_type_low(lock) == LOCK_TABLE); + + fputs("TABLE LOCK table ", file); + ut_print_name(file, lock->trx, TRUE, + lock->un_member.tab_lock.table->name); + fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id); + + if (lock_get_mode(lock) == LOCK_S) { + fputs(" lock mode S", file); + } else if (lock_get_mode(lock) == LOCK_X) { + fputs(" lock mode X", file); + } else if (lock_get_mode(lock) == LOCK_IS) { + fputs(" lock mode IS", file); + } else if (lock_get_mode(lock) == LOCK_IX) { + fputs(" lock mode IX", file); + } else if (lock_get_mode(lock) == LOCK_AUTO_INC) { + fputs(" lock mode AUTO-INC", file); + } else { + fprintf(file, " unknown lock mode %lu", + (ulong) lock_get_mode(lock)); + } + + if (lock_get_wait(lock)) { + fputs(" waiting", file); + } + + putc('\n', file); +} + +/*********************************************************************//** +Prints info of a record lock. */ +UNIV_INTERN +void +lock_rec_print( +/*===========*/ + FILE* file, /*!< in: file where to print */ + const lock_t* lock) /*!< in: record type lock */ +{ + const buf_block_t* block; + ulint space; + ulint page_no; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(lock_mutex_own()); + ut_a(lock_get_type_low(lock) == LOCK_REC); + + space = lock->un_member.rec_lock.space; + page_no = lock->un_member.rec_lock.page_no; + + fprintf(file, "RECORD LOCKS space id %lu page no %lu n bits %lu ", + (ulong) space, (ulong) page_no, + (ulong) lock_rec_get_n_bits(lock)); + dict_index_name_print(file, lock->trx, lock->index); + fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id); + + if (lock_get_mode(lock) == LOCK_S) { + fputs(" lock mode S", file); + } else if (lock_get_mode(lock) == LOCK_X) { + fputs(" lock_mode X", file); + } else { + ut_error; + } + + if (lock_rec_get_gap(lock)) { + fputs(" locks gap before rec", file); + } + + if (lock_rec_get_rec_not_gap(lock)) { + fputs(" locks rec but not gap", file); + } + + if (lock_rec_get_insert_intention(lock)) { + fputs(" insert intention", file); + } + + if (lock_get_wait(lock)) { + fputs(" waiting", file); + } + + mtr_start(&mtr); + + putc('\n', file); + + if ( srv_show_verbose_locks ) { + block = buf_page_try_get(space, page_no, &mtr); + + for (i = 0; i < lock_rec_get_n_bits(lock); ++i) { + + if (!lock_rec_get_nth_bit(lock, i)) { + continue; + } + + fprintf(file, "Record lock, heap no %lu", (ulong) i); + + if (block) { + const rec_t* rec; + + rec = page_find_rec_with_heap_no( + buf_block_get_frame(block), i); + + offsets = rec_get_offsets( + rec, lock->index, offsets, + ULINT_UNDEFINED, &heap); + + putc(' ', file); + rec_print_new(file, rec, offsets); + } + + putc('\n', file); + } + } + + mtr_commit(&mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +#ifdef UNIV_DEBUG +/* Print the number of lock structs from 
lock_print_info_summary() only
+in non-production builds for performance reasons, see
+http://bugs.mysql.com/36942 */
+#define PRINT_NUM_OF_LOCK_STRUCTS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+/*********************************************************************//**
+Calculates the number of record lock structs in the record lock hash table.
+@return number of record locks */
+static
+ulint
+lock_get_n_rec_locks(void)
+/*======================*/
+{
+	ulint	n_locks	= 0;
+	ulint	i;
+
+	ut_ad(lock_mutex_own());
+
+	for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+		const lock_t*	lock;
+
+		for (lock = static_cast<const lock_t*>(
+				HASH_GET_FIRST(lock_sys->rec_hash, i));
+		     lock != 0;
+		     lock = static_cast<const lock_t*>(
+				HASH_GET_NEXT(hash, lock))) {
+
+			n_locks++;
+		}
+	}
+
+	return(n_locks);
+}
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if the lock mutex could not be obtained; in that case the
+function exits without printing the lock info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+	FILE*	file,	/*!< in: file where to print */
+	ibool	nowait)	/*!< in: TRUE: give up immediately instead of
+			waiting for the lock mutex */
+{
+	/* If nowait is FALSE, wait on the lock mutex; otherwise return
+	immediately if we fail to obtain the mutex. */
+	if (!nowait) {
+		lock_mutex_enter();
+	} else if (lock_mutex_enter_nowait()) {
+		fputs("FAIL TO OBTAIN LOCK MUTEX, "
+		      "SKIP LOCK INFO PRINTING\n", file);
+		return(FALSE);
+	}
+
+	if (lock_deadlock_found) {
+		fputs("------------------------\n"
+		      "LATEST DETECTED DEADLOCK\n"
+		      "------------------------\n", file);
+
+		if (!srv_read_only_mode) {
+			ut_copy_file(file, lock_latest_err_file);
+		}
+	}
+
+	fputs("------------\n"
+	      "TRANSACTIONS\n"
+	      "------------\n", file);
+
+	fprintf(file, "Trx id counter " TRX_ID_FMT "\n",
+		trx_sys_get_max_trx_id());
+
+	fprintf(file,
+		"Purge done for trx's n:o < " TRX_ID_FMT
+		" undo n:o < " TRX_ID_FMT " state: ",
+		purge_sys->iter.trx_no,
+		purge_sys->iter.undo_no);
+
+	/* Note: We read the state without the latch, first because taking
+	the latch here would violate the latching order, and second because
+	we are merely querying the state of the variable for display. */
+
+	switch (purge_sys->state) {
+	case PURGE_STATE_INIT:
+		/* Should never be in this state while the system is running. */
+		ut_error;
+
+	case PURGE_STATE_EXIT:
+		fprintf(file, "exited");
+		break;
+
+	case PURGE_STATE_DISABLED:
+		fprintf(file, "disabled");
+		break;
+
+	case PURGE_STATE_RUN:
+		fprintf(file, "running");
+		/* Check if it is waiting for more data to arrive. */
+		if (!purge_sys->running) {
+			fprintf(file, " but idle");
+		}
+		break;
+
+	case PURGE_STATE_STOP:
+		fprintf(file, "stopped");
+		break;
+	}
+
+	fprintf(file, "\n");
+
+	fprintf(file,
+		"History list length %lu\n",
+		(ulong) trx_sys->rseg_history_len);
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+	fprintf(file,
+		"Total number of lock structs in row lock hash table %lu\n",
+		(ulong) lock_get_n_rec_locks());
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and, more importantly, that it will release
+the lock mutex on behalf of the caller. (This should be fixed in the
+future.)
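+A typical caller therefore pairs the two functions, taking the mutex
+through lock_print_info_summary() and letting this function release it;
+a minimal sketch (hypothetical caller, variable names assumed):
+
+	if (lock_print_info_summary(file, nowait)) {
+		/* the lock mutex is now held; the call below
+		prints the details and releases it again */
+		lock_print_info_all_transactions(file);
+	}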
*/
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file)	/*!< in: file where to print */
+{
+	const lock_t*	lock;
+	ibool		load_page_first = TRUE;
+	ulint		nth_trx		= 0;
+	ulint		nth_lock	= 0;
+	ulint		i;
+	mtr_t		mtr;
+	const trx_t*	trx;
+	trx_list_t*	trx_list = &trx_sys->rw_trx_list;
+
+	fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+	ut_ad(lock_mutex_own());
+
+	mutex_enter(&trx_sys->mutex);
+
+	/* First print info on non-active transactions */
+
+	/* NOTE: information of auto-commit non-locking read-only
+	transactions will be omitted here. The information will be
+	available from INFORMATION_SCHEMA.INNODB_TRX. */
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) {
+
+		ut_ad(trx->in_mysql_trx_list);
+
+		/* See state transitions and locking rules in trx0trx.h */
+
+		if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
+			fputs("---", file);
+			trx_print_latched(file, trx, 600);
+		}
+	}
+
+loop:
+	/* Since we temporarily release lock_sys->mutex and
+	trx_sys->mutex when reading a database page below, the
+	trx pointer may have become obsolete, so we must loop
+	through the trx list again to find (probably) the same trx,
+	or some other trx. */
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list), i = 0;
+	     trx && (i < nth_trx);
+	     trx = UT_LIST_GET_NEXT(trx_list, trx), i++) {
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+	}
+
+	ut_ad(trx == NULL
+	      || trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+	if (trx == NULL) {
+		/* Check the read-only transaction list next. */
+		if (trx_list == &trx_sys->rw_trx_list) {
+			trx_list = &trx_sys->ro_trx_list;
+			nth_trx = 0;
+			nth_lock = 0;
+			goto loop;
+		}
+
+		lock_mutex_exit();
+		mutex_exit(&trx_sys->mutex);
+
+		ut_ad(lock_validate());
+
+		return;
+	}
+
+	assert_trx_in_list(trx);
+
+	if (nth_lock == 0) {
+		fputs("---", file);
+
+		trx_print_latched(file, trx, 600);
+
+		if (trx->read_view) {
+			fprintf(file,
+				"Trx read view will not see trx with"
+				" id >= " TRX_ID_FMT
+				", sees < " TRX_ID_FMT "\n",
+				trx->read_view->low_limit_id,
+				trx->read_view->up_limit_id);
+		}
+
+		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+			fprintf(file,
+				"------- TRX HAS BEEN WAITING %lu SEC"
+				" FOR THIS LOCK TO BE GRANTED:\n",
+				(ulong) difftime(ut_time(),
+						 trx->lock.wait_started));
+
+			if (lock_get_type_low(trx->lock.wait_lock) == LOCK_REC) {
+				lock_rec_print(file, trx->lock.wait_lock);
+			} else {
+				lock_table_print(file, trx->lock.wait_lock);
+			}
+
+			fputs("------------------\n", file);
+		}
+	}
+
+	if (!srv_print_innodb_lock_monitor && !srv_show_locks_held) {
+		nth_trx++;
+		goto loop;
+	}
+
+	i = 0;
+
+	/* See the note above the trx loop on why we loop here:
+	lock may be an obsolete pointer now.
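+	For example, if nth_lock == 3 when the mutexes have been
+	re-acquired, the walk below skips the first three locks of the
+	transaction and resumes at the fourth, which may or may not be
+	the same lock object that was current before the mutexes were
+	released.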
*/
+
+	lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+
+	while (lock && (i < nth_lock)) {
+		lock = UT_LIST_GET_NEXT(trx_locks, lock);
+		i++;
+	}
+
+	if (lock == NULL) {
+		nth_trx++;
+		nth_lock = 0;
+
+		goto loop;
+	}
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+		if (load_page_first) {
+			ulint	space	= lock->un_member.rec_lock.space;
+			ulint	zip_size = fil_space_get_zip_size(space);
+			ulint	page_no = lock->un_member.rec_lock.page_no;
+			ibool	tablespace_being_deleted = FALSE;
+
+			if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+
+				/* It is a single table tablespace and
+				the .ibd file is missing (TRUNCATE
+				TABLE probably stole the locks): just
+				print the lock without attempting to
+				load the page in the buffer pool. */
+
+				fprintf(file, "RECORD LOCKS on"
+					" non-existing space %lu\n",
+					(ulong) space);
+				goto print_rec;
+			}
+
+			lock_mutex_exit();
+			mutex_exit(&trx_sys->mutex);
+
+			if (srv_show_verbose_locks) {
+
+				DEBUG_SYNC_C("innodb_monitor_before_lock_page_read");
+
+				/* Check whether the space exists. Only
+				when the space is valid, try to get the
+				page. */
+				tablespace_being_deleted
+					= fil_inc_pending_ops(space, false);
+
+				if (!tablespace_being_deleted) {
+					mtr_start(&mtr);
+
+					buf_page_get_gen(space, zip_size,
+							 page_no, RW_NO_LATCH,
+							 NULL,
+							 BUF_GET_POSSIBLY_FREED,
+							 __FILE__, __LINE__,
+							 &mtr);
+
+					mtr_commit(&mtr);
+
+					fil_decr_pending_ops(space);
+				} else {
+					fprintf(file, "RECORD LOCKS on"
+						" non-existing space %lu\n",
+						(ulong) space);
+				}
+			}
+
+			load_page_first = FALSE;
+
+			lock_mutex_enter();
+
+			mutex_enter(&trx_sys->mutex);
+
+			goto loop;
+		}
+
+print_rec:
+		lock_rec_print(file, lock);
+	} else {
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+		lock_table_print(file, lock);
+	}
+
+	load_page_first = TRUE;
+
+	nth_lock++;
+
+	if (nth_lock >= srv_show_locks_held) {
+		fputs("TOO MANY LOCKS PRINTED FOR THIS TRX:"
+		      " SUPPRESSING FURTHER PRINTS\n",
+		      file);
+
+		nth_trx++;
+		nth_lock = 0;
+	}
+
+	goto loop;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Find the lock in the trx_t::trx_lock_t::table_locks vector.
+@return TRUE if found */
+static
+ibool
+lock_trx_table_locks_find(
+/*======================*/
+	trx_t*		trx,		/*!< in: trx to validate */
+	const lock_t*	find_lock)	/*!< in: lock to find */
+{
+	lint		i;
+	ibool		found = FALSE;
+
+	trx_mutex_enter(trx);
+
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
+
+		lock = *static_cast<const lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
+
+		if (lock == NULL) {
+			continue;
+		} else if (lock == find_lock) {
+			/* Can't be duplicates. */
+			ut_a(!found);
+			found = TRUE;
+		}
+
+		ut_a(trx == lock->trx);
+		ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_a(lock->un_member.tab_lock.table != NULL);
+	}
+
+	trx_mutex_exit(trx);
+
+	return(found);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a table.
+@return TRUE if ok */
+static
+ibool
+lock_table_queue_validate(
+/*======================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	const lock_t*	lock;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+		/* lock->trx->state cannot change from or to NOT_STARTED
+		while we are holding the trx_sys->mutex.
It may change + from ACTIVE to PREPARED, but it may not change to + COMMITTED, because we are holding the lock_sys->mutex. */ + ut_ad(trx_assert_started(lock->trx)); + + if (!lock_get_wait(lock)) { + + ut_a(!lock_table_other_has_incompatible( + lock->trx, 0, table, + lock_get_mode(lock))); + } else { + + ut_a(lock_table_has_to_wait_in_queue(lock)); + } + + ut_a(lock_trx_table_locks_find(lock->trx, lock)); + } + + return(TRUE); +} + +/*********************************************************************//** +Validates the lock queue on a single record. +@return TRUE if ok */ +static +ibool +lock_rec_queue_validate( +/*====================*/ + ibool locked_lock_trx_sys, + /*!< in: if the caller holds + both the lock mutex and + trx_sys_t->lock. */ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record to look at */ + const dict_index_t* index, /*!< in: index, or NULL if not known */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + const lock_t* lock; + ulint heap_no; + + ut_a(rec); + ut_a(block->frame == page_align(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + ut_ad(lock_mutex_own() == locked_lock_trx_sys); + ut_ad(!index || dict_index_is_clust(index) + || !dict_index_is_online_ddl(index)); + + heap_no = page_rec_get_heap_no(rec); + + if (!locked_lock_trx_sys) { + lock_mutex_enter(); + mutex_enter(&trx_sys->mutex); + } + + if (!page_rec_is_user_rec(rec)) { + + for (lock = lock_rec_get_first(block, heap_no); + lock != NULL; + lock = lock_rec_get_next_const(heap_no, lock)) { + + ut_a(trx_in_trx_list(lock->trx)); + + if (lock_get_wait(lock)) { + ut_a(lock_rec_has_to_wait_in_queue(lock)); + } + + if (index) { + ut_a(lock->index == index); + } + } + + goto func_exit; + } + + if (!index); + else if (dict_index_is_clust(index)) { + trx_id_t trx_id; + trx_id_t* trx_desc; + + /* Unlike the non-debug code, this invariant can only succeed + if the check and assertion are covered by the lock mutex. */ + + trx_id = lock_clust_rec_some_has_impl(rec, index, offsets); + trx_desc = trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx_id); + + ut_ad(lock_mutex_own()); + /* trx_id cannot be committed until lock_mutex_exit() + because lock_trx_release_locks() acquires lock_sys->mutex */ + + if (trx_desc != NULL + && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, + block, heap_no, trx_id)) { + + ut_ad(trx_id == *trx_desc); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, trx_id)); + } + } + + for (lock = lock_rec_get_first(block, heap_no); + lock != NULL; + lock = lock_rec_get_next_const(heap_no, lock)) { + + ut_a(trx_in_trx_list(lock->trx)); + + if (index) { + ut_a(lock->index == index); + } + + if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) { + + enum lock_mode mode; + + if (lock_get_mode(lock) == LOCK_S) { + mode = LOCK_X; + } else { + mode = LOCK_S; + } + ut_a(!lock_rec_other_has_expl_req( + mode, 0, 0, block, heap_no, + lock->trx->id)); + + } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { + + ut_a(lock_rec_has_to_wait_in_queue(lock)); + } + } + +func_exit: + if (!locked_lock_trx_sys) { + lock_mutex_exit(); + mutex_exit(&trx_sys->mutex); + } + + return(TRUE); +} + +/*********************************************************************//** +Validates the record lock queues on a page. 
+@return TRUE if ok */ +static +ibool +lock_rec_validate_page( +/*===================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + const lock_t* lock; + const rec_t* rec; + ulint nth_lock = 0; + ulint nth_bit = 0; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!lock_mutex_own()); + + lock_mutex_enter(); + mutex_enter(&trx_sys->mutex); +loop: + lock = lock_rec_get_first_on_page_addr(buf_block_get_space(block), + buf_block_get_page_no(block)); + + if (!lock) { + goto function_exit; + } + +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + ut_a(!block->page.file_page_was_freed); +#endif + + for (i = 0; i < nth_lock; i++) { + + lock = lock_rec_get_next_on_page_const(lock); + + if (!lock) { + goto function_exit; + } + } + + ut_a(trx_in_trx_list(lock->trx)); + +# ifdef UNIV_SYNC_DEBUG + /* Only validate the record queues when this thread is not + holding a space->latch. Deadlocks are possible due to + latching order violation when UNIV_DEBUG is defined while + UNIV_SYNC_DEBUG is not. */ + if (!sync_thread_levels_contains(SYNC_FSP)) +# endif /* UNIV_SYNC_DEBUG */ + for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) { + + if (i == 1 || lock_rec_get_nth_bit(lock, i)) { + + rec = page_find_rec_with_heap_no(block->frame, i); + ut_a(rec); + offsets = rec_get_offsets(rec, lock->index, offsets, + ULINT_UNDEFINED, &heap); +#if 0 + fprintf(stderr, + "Validating %u %u\n", + block->page.space, block->page.offset); +#endif + /* If this thread is holding the file space + latch (fil_space_t::latch), the following + check WILL break the latching order and may + cause a deadlock of threads. */ + + lock_rec_queue_validate( + TRUE, block, rec, lock->index, offsets); + + nth_bit = i + 1; + + goto loop; + } + } + + nth_bit = 0; + nth_lock++; + + goto loop; + +function_exit: + lock_mutex_exit(); + mutex_exit(&trx_sys->mutex); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(TRUE); +} + +/*********************************************************************//** +Validates the table locks. +@return TRUE if ok */ +static +ibool +lock_validate_table_locks( +/*======================*/ + const trx_list_t* trx_list) /*!< in: trx list */ +{ + const trx_t* trx; + + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_ad(trx_list == &trx_sys->rw_trx_list + || trx_list == &trx_sys->ro_trx_list); + + for (trx = UT_LIST_GET_FIRST(*trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + const lock_t* lock; + + assert_trx_in_list(trx); + ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list)); + + for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + + if (lock_get_type_low(lock) & LOCK_TABLE) { + + lock_table_queue_validate( + lock->un_member.tab_lock.table); + } + } + } + + return(TRUE); +} + +/*********************************************************************//** +Validate record locks up to a limit. 
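+The limit is a 64-bit cursor built with ut_ull_create() from the page
+address, roughly (space << 32) | page_no, so the scan can be resumed
+past the last validated page after the caller has dropped and
+re-acquired the mutexes; the caller's loop is essentially the
+following (see lock_validate() below):
+
+	ib_uint64_t	limit = 0;
+
+	while ((lock = lock_rec_validate(i, &limit)) != 0) {
+		/* remember (space, page_no) of the returned lock */
+	}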
+@return lock at limit or NULL if no more locks in the hash bucket */ +static __attribute__((nonnull, warn_unused_result)) +const lock_t* +lock_rec_validate( +/*==============*/ + ulint start, /*!< in: lock_sys->rec_hash + bucket */ + ib_uint64_t* limit) /*!< in/out: upper limit of + (space, page_no) */ +{ + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + + for (const lock_t* lock = static_cast<const lock_t*>( + HASH_GET_FIRST(lock_sys->rec_hash, start)); + lock != NULL; + lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) { + + ib_uint64_t current; + + ut_a(trx_in_trx_list(lock->trx)); + ut_a(lock_get_type(lock) == LOCK_REC); + + current = ut_ull_create( + lock->un_member.rec_lock.space, + lock->un_member.rec_lock.page_no); + + if (current > *limit) { + *limit = current + 1; + return(lock); + } + } + + return(0); +} + +/*********************************************************************//** +Validate a record lock's block */ +static +void +lock_rec_block_validate( +/*====================*/ + ulint space, + ulint page_no) +{ + /* The lock and the block that it is referring to may be freed at + this point. We pass BUF_GET_POSSIBLY_FREED to skip a debug check. + If the lock exists in lock_rec_validate_page() we assert + !block->page.file_page_was_freed. */ + + buf_block_t* block; + mtr_t mtr; + + /* Make sure that the tablespace is not deleted while we are + trying to access the page. */ + if (!fil_inc_pending_ops(space, true)) { + mtr_start(&mtr); + block = buf_page_get_gen( + space, fil_space_get_zip_size(space), + page_no, RW_X_LATCH, NULL, + BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, &mtr); + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + ut_ad(lock_rec_validate_page(block)); + mtr_commit(&mtr); + + fil_decr_pending_ops(space); + } +} + +/*********************************************************************//** +Validates the lock system. +@return TRUE if ok */ +static +bool +lock_validate() +/*===========*/ +{ + typedef std::pair<ulint, ulint> page_addr_t; + typedef std::set<page_addr_t> page_addr_set; + page_addr_set pages; + + lock_mutex_enter(); + mutex_enter(&trx_sys->mutex); + + ut_a(lock_validate_table_locks(&trx_sys->rw_trx_list)); + ut_a(lock_validate_table_locks(&trx_sys->ro_trx_list)); + + /* Iterate over all the record locks and validate the locks. We + don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex. + Release both mutexes during the validation check. */ + + for (ulint i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { + const lock_t* lock; + ib_uint64_t limit = 0; + + while ((lock = lock_rec_validate(i, &limit)) != 0) { + + ulint space = lock->un_member.rec_lock.space; + ulint page_no = lock->un_member.rec_lock.page_no; + + pages.insert(std::make_pair(space, page_no)); + } + } + + mutex_exit(&trx_sys->mutex); + lock_mutex_exit(); + + for (page_addr_set::const_iterator it = pages.begin(); + it != pages.end(); + ++it) { + lock_rec_block_validate((*it).first, (*it).second); + } + + return(true); +} +#endif /* UNIV_DEBUG */ +/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/ + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a record. If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. 
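+A hypothetical caller in the row-insert path would act on the result
+roughly as follows (sketch; the handling shown is illustrative only):
+
+	err = lock_rec_insert_check_and_lock(
+		0, rec, block, index, thr, mtr, &inherit);
+
+	switch (err) {
+	case DB_SUCCESS:
+		break;		/* no conflict: go ahead and insert */
+	case DB_LOCK_WAIT:
+		/* suspend this thread; retry after the wait ends */
+		break;
+	default:
+		/* DB_DEADLOCK etc.: roll back the statement */
+		break;
+	}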
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_rec_insert_check_and_lock( +/*===========================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is + set, does nothing */ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool* inherit)/*!< out: set to TRUE if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ +{ + const rec_t* next_rec; + trx_t* trx; + lock_t* lock; + dberr_t err; + ulint next_rec_heap_no; + ibool inherit_in = *inherit; + + ut_ad(block->frame == page_align(rec)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad((flags & BTR_NO_LOCKING_FLAG) || thr); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + trx = thr_get_trx(thr); + + if (UNIV_UNLIKELY(trx->fake_changes)) { + return(DB_SUCCESS); + } + + next_rec = page_rec_get_next_const(rec); + next_rec_heap_no = page_rec_get_heap_no(next_rec); + + lock_mutex_enter(); + /* Because this code is invoked for a running transaction by + the thread that is serving the transaction, it is not necessary + to hold trx->mutex here. */ + + /* When inserting a record into an index, the table must be at + least IX-locked. When we are building an index, we would pass + BTR_NO_LOCKING_FLAG and skip the locking altogether. */ + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); + + lock = lock_rec_get_first(block, next_rec_heap_no); + + if (UNIV_LIKELY(lock == NULL)) { + /* We optimize CPU time usage in the simplest case */ + + lock_mutex_exit(); + + if (inherit_in && !dict_index_is_clust(index)) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + trx->id, mtr); + } + + *inherit = FALSE; + + return(DB_SUCCESS); + } + + *inherit = TRUE; + + /* If another transaction has an explicit lock request which locks + the gap, waiting or granted, on the successor, the insert has to wait. + + An exception is the case where the lock by the another transaction + is a gap type lock which it placed to wait for its turn to insert. We + do not consider that kind of a lock conflicting with our insert. This + eliminates an unnecessary deadlock which resulted when 2 transactions + had to wait for their insert. Both had waiting gap type lock requests + on the successor, which produced an unnecessary deadlock. */ + + if (lock_rec_other_has_conflicting( + static_cast<enum lock_mode>( + LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION), + block, next_rec_heap_no, trx)) { + + /* Note that we may get DB_SUCCESS also here! */ + trx_mutex_enter(trx); + + err = lock_rec_enqueue_waiting( + LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, + block, next_rec_heap_no, index, thr); + + trx_mutex_exit(trx); + } else { + err = DB_SUCCESS; + } + + lock_mutex_exit(); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + if (!inherit_in || dict_index_is_clust(index)) { + break; + } + /* Update the page max trx id field */ + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + trx->id, mtr); + default: + /* We only care about the two return values. 
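+		(i.e. DB_SUCCESS and DB_SUCCESS_LOCKED_REC, which are the
+		only ones that require the page max trx id to be updated;
+		every other error code is passed back to the caller
+		unchanged).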
*/ + break; + } + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(next_rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + ut_ad(lock_rec_queue_validate( + FALSE, block, next_rec, index, offsets)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ + + return(err); +} + +/*********************************************************************//** +If a transaction has an implicit x-lock on a record, but no explicit x-lock +set on the record, sets one for it. */ +static +void +lock_rec_convert_impl_to_expl( +/*==========================*/ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record on page */ + dict_index_t* index, /*!< in: index of record */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + trx_id_t trx_id; + + ut_ad(!lock_mutex_own()); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + + if (dict_index_is_clust(index)) { + trx_id = lock_clust_rec_some_has_impl(rec, index, offsets); + /* The clustered index record was last modified by + this transaction. The transaction may have been + committed a long time ago. */ + } else { + ut_ad(!dict_index_is_online_ddl(index)); + trx_id = lock_sec_rec_some_has_impl(rec, index, offsets); + /* The transaction can be committed before the + trx_is_active(trx_id, NULL) check below, because we are not + holding lock_mutex. */ + + ut_ad(!lock_rec_other_trx_holds_expl(LOCK_S | LOCK_REC_NOT_GAP, + trx_id, rec, block)); + } + + if (trx_id != 0) { + trx_id_t* impl_trx_desc; + ulint heap_no = page_rec_get_heap_no(rec); + + lock_mutex_enter(); + + /* If the transaction is still active and has no + explicit x-lock set on the record, set one for it */ + + mutex_enter(&trx_sys->mutex); + impl_trx_desc = trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx_id); + mutex_exit(&trx_sys->mutex); + + /* trx_id cannot be committed until lock_mutex_exit() + because lock_trx_release_locks() acquires lock_sys->mutex */ + + if (impl_trx_desc != NULL + && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, + heap_no, trx_id)) { + ulint type_mode = (LOCK_REC | LOCK_X + | LOCK_REC_NOT_GAP); + + mutex_enter(&trx_sys->mutex); + trx_t* impl_trx = trx_rw_get_active_trx_by_id(trx_id, + NULL); + mutex_exit(&trx_sys->mutex); + ut_ad(impl_trx != NULL); + + lock_rec_add_to_queue( + type_mode, block, heap_no, index, + impl_trx, FALSE); + } + + lock_mutex_exit(); + } +} + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify (update, +delete mark, or delete unmark) of a clustered index record. If they do, +first tests if the query thread should anyway be suspended for some +reason; if not, then puts the transaction and the query thread to the +lock wait state and inserts a waiting request for a record x-lock to the +lock queue. 
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_modify_check_and_lock( +/*=================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ulint heap_no; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(dict_index_is_clust(index)); + ut_ad(block->frame == page_align(rec)); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + return(DB_SUCCESS); + } + + heap_no = rec_offs_comp(offsets) + ? rec_get_heap_no_new(rec) + : rec_get_heap_no_old(rec); + + /* If a transaction has no explicit x-lock set on the record, set one + for it */ + + lock_rec_convert_impl_to_expl(block, rec, index, offsets); + + lock_mutex_enter(); + + ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, index, thr); + + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + + lock_mutex_exit(); + + ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets)); + + if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) { + err = DB_SUCCESS; + } + + return(err); +} + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify (delete +mark or delete unmark) of a secondary index record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_sec_rec_modify_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified; NOTE: as this is a secondary + index, we always have to modify the + clustered index record first: see the + comment below */ + dict_index_t* index, /*!< in: secondary index */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dberr_t err; + ulint heap_no; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG)); + ut_ad(block->frame == page_align(rec)); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + return(DB_SUCCESS); + } + + heap_no = page_rec_get_heap_no(rec); + + /* Another transaction cannot have an implicit lock on the record, + because when we come here, we already have modified the clustered + index record, and this would not have been possible if another active + transaction had modified this secondary index record. 
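+	The expected ordering for a caller is therefore (sketch):
+
+		lock_clust_rec_modify_check_and_lock(...);
+		/* ... modify the clustered index record ... */
+		lock_sec_rec_modify_check_and_lock(...);
+		/* ... delete-mark or -unmark the secondary index record ... */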
*/ + + lock_mutex_enter(); + + ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, index, thr); + + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + + lock_mutex_exit(); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + ut_ad(lock_rec_queue_validate( + FALSE, block, rec, index, offsets)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ + + if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) { + /* Update the page max trx id field */ + /* It might not be necessary to do this if + err == DB_SUCCESS (no new lock created), + but it should not cost too much performance. */ + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + thr_get_trx(thr)->id, mtr); + err = DB_SUCCESS; + } + + return(err); +} + +/*********************************************************************//** +Like lock_clust_rec_read_check_and_lock(), but reads a +secondary index record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_sec_rec_read_check_and_lock( +/*=============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: secondary index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ulint heap_no; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(block->frame == page_align(rec)); + ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mode == LOCK_X || mode == LOCK_S); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + if (UNIV_UNLIKELY((thr && thr_get_trx(thr)->fake_changes))) { + if (!srv_fake_changes_locks) { + return(DB_SUCCESS); + } + if (mode == LOCK_X) { + mode = LOCK_S; + } + } + + heap_no = page_rec_get_heap_no(rec); + + /* Some transaction may have an implicit x-lock on the record only + if the max trx id for the page >= min trx id for the trx list or a + database recovery is running. 
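+	For example, if page_get_max_trx_id() for this page returns 95
+	while the smallest transaction id still in the rw trx list is
+	100, every transaction that has modified the page must have
+	committed already, so no implicit lock can exist and the
+	conversion can be skipped.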
*/ + + if ((page_get_max_trx_id(block->frame) >= trx_rw_min_trx_id() + || recv_recovery_is_on()) + && !page_rec_is_supremum(rec)) { + + lock_rec_convert_impl_to_expl(block, rec, index, offsets); + } + + lock_mutex_enter(); + + ut_ad(mode != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad(mode != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + + err = lock_rec_lock(FALSE, mode | gap_mode, + block, heap_no, index, thr); + + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + + lock_mutex_exit(); + + ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets)); + + return(err); +} + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_read_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ulint heap_no; + + ut_ad(dict_index_is_clust(index)); + ut_ad(block->frame == page_align(rec)); + ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); + ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP + || gap_mode == LOCK_REC_NOT_GAP); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + if (UNIV_UNLIKELY((thr && thr_get_trx(thr)->fake_changes))) { + if (!srv_fake_changes_locks) { + return(DB_SUCCESS); + } + if (mode == LOCK_X) { + mode = LOCK_S; + } + } + + heap_no = page_rec_get_heap_no(rec); + + if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) { + + lock_rec_convert_impl_to_expl(block, rec, index, offsets); + } + + lock_mutex_enter(); + + ut_ad(mode != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad(mode != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + + err = lock_rec_lock(FALSE, mode | gap_mode, + block, heap_no, index, thr); + + MONITOR_INC(MONITOR_NUM_RECLOCK_REQ); + + lock_mutex_exit(); + + ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets)); + + return(err); +} +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. 
If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + dberr_t err; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + err = lock_clust_rec_read_check_and_lock(flags, block, rec, index, + offsets, mode, gap_mode, thr); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) { + err = DB_SUCCESS; + } + + return(err); +} + +/*******************************************************************//** +Release the last lock from the transaction's autoinc locks. */ +UNIV_INLINE +void +lock_release_autoinc_last_lock( +/*===========================*/ + ib_vector_t* autoinc_locks) /*!< in/out: vector of AUTOINC locks */ +{ + ulint last; + lock_t* lock; + + ut_ad(lock_mutex_own()); + ut_a(!ib_vector_is_empty(autoinc_locks)); + + /* The lock to be release must be the last lock acquired. */ + last = ib_vector_size(autoinc_locks) - 1; + lock = *static_cast<lock_t**>(ib_vector_get(autoinc_locks, last)); + + /* Should have only AUTOINC locks in the vector. */ + ut_a(lock_get_mode(lock) == LOCK_AUTO_INC); + ut_a(lock_get_type(lock) == LOCK_TABLE); + + ut_a(lock->un_member.tab_lock.table != NULL); + + /* This will remove the lock from the trx autoinc_locks too. */ + lock_table_dequeue(lock); + + /* Remove from the table vector too. */ + lock_trx_table_locks_remove(lock); +} + +/*******************************************************************//** +Check if a transaction holds any autoinc locks. +@return TRUE if the transaction holds any AUTOINC locks. */ +static +ibool +lock_trx_holds_autoinc_locks( +/*=========================*/ + const trx_t* trx) /*!< in: transaction */ +{ + ut_a(trx->autoinc_locks != NULL); + + return(!ib_vector_is_empty(trx->autoinc_locks)); +} + +/*******************************************************************//** +Release all the transaction's autoinc locks. */ +static +void +lock_release_autoinc_locks( +/*=======================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_ad(lock_mutex_own()); + /* If this is invoked for a running transaction by the thread + that is serving the transaction, then it is not necessary to + hold trx->mutex here. */ + + ut_a(trx->autoinc_locks != NULL); + + /* We release the locks in the reverse order. 
This is to + avoid searching the vector for the element to delete at + the lower level. See (lock_table_remove_low()) for details. */ + while (!ib_vector_is_empty(trx->autoinc_locks)) { + + /* lock_table_remove_low() will also remove the lock from + the transaction's autoinc_locks vector. */ + lock_release_autoinc_last_lock(trx->autoinc_locks); + } + + /* Should release all locks. */ + ut_a(ib_vector_is_empty(trx->autoinc_locks)); +} + +/*******************************************************************//** +Gets the type of a lock. Non-inline version for using outside of the +lock module. +@return LOCK_TABLE or LOCK_REC */ +UNIV_INTERN +ulint +lock_get_type( +/*==========*/ + const lock_t* lock) /*!< in: lock */ +{ + return(lock_get_type_low(lock)); +} + +/*******************************************************************//** +Gets the id of the transaction owning a lock. +@return transaction id */ +UNIV_INTERN +trx_id_t +lock_get_trx_id( +/*============*/ + const lock_t* lock) /*!< in: lock */ +{ + return(lock->trx->id); +} + +/*******************************************************************//** +Gets the mode of a lock in a human readable string. +The string should not be free()'d or modified. +@return lock mode */ +UNIV_INTERN +const char* +lock_get_mode_str( +/*==============*/ + const lock_t* lock) /*!< in: lock */ +{ + ibool is_gap_lock; + + is_gap_lock = lock_get_type_low(lock) == LOCK_REC + && lock_rec_get_gap(lock); + + switch (lock_get_mode(lock)) { + case LOCK_S: + if (is_gap_lock) { + return("S,GAP"); + } else { + return("S"); + } + case LOCK_X: + if (is_gap_lock) { + return("X,GAP"); + } else { + return("X"); + } + case LOCK_IS: + if (is_gap_lock) { + return("IS,GAP"); + } else { + return("IS"); + } + case LOCK_IX: + if (is_gap_lock) { + return("IX,GAP"); + } else { + return("IX"); + } + case LOCK_AUTO_INC: + return("AUTO_INC"); + default: + return("UNKNOWN"); + } +} + +/*******************************************************************//** +Gets the type of a lock in a human readable string. +The string should not be free()'d or modified. +@return lock type */ +UNIV_INTERN +const char* +lock_get_type_str( +/*==============*/ + const lock_t* lock) /*!< in: lock */ +{ + switch (lock_get_type_low(lock)) { + case LOCK_REC: + return("RECORD"); + case LOCK_TABLE: + return("TABLE"); + default: + return("UNKNOWN"); + } +} + +/*******************************************************************//** +Gets the table on which the lock is. +@return table */ +UNIV_INLINE +dict_table_t* +lock_get_table( +/*===========*/ + const lock_t* lock) /*!< in: lock */ +{ + switch (lock_get_type_low(lock)) { + case LOCK_REC: + ut_ad(dict_index_is_clust(lock->index) + || !dict_index_is_online_ddl(lock->index)); + return(lock->index->table); + case LOCK_TABLE: + return(lock->un_member.tab_lock.table); + default: + ut_error; + return(NULL); + } +} + +/*******************************************************************//** +Gets the id of the table on which the lock is. +@return id of the table */ +UNIV_INTERN +table_id_t +lock_get_table_id( +/*==============*/ + const lock_t* lock) /*!< in: lock */ +{ + dict_table_t* table; + + table = lock_get_table(lock); + + return(table->id); +} + +/*******************************************************************//** +Gets the name of the table on which the lock is. +The string should not be free()'d or modified. 
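+Like the other lock_get_*() accessors above, this is meant for use
+outside the lock module, for example by the code that fills the
+INFORMATION_SCHEMA lock tables; a hypothetical consumer (sketch):
+
+	fprintf(stderr, "%s lock on table %s in mode %s\n",
+		lock_get_type_str(lock),
+		lock_get_table_name(lock),
+		lock_get_mode_str(lock));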
+@return name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	dict_table_t*	table;
+
+	table = lock_get_table(lock);
+
+	return(table->name);
+}
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	ut_ad(dict_index_is_clust(lock->index)
+	      || !dict_index_is_online_ddl(lock->index));
+
+	return(lock->index);
+}
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	ut_ad(dict_index_is_clust(lock->index)
+	      || !dict_index_is_online_ddl(lock->index));
+
+	return(lock->index->name);
+}
+
+/*******************************************************************//**
+For a record lock, gets the tablespace number on which the lock is.
+@return tablespace number */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+	return(lock->un_member.rec_lock.space);
+}
+
+/*******************************************************************//**
+For a record lock, gets the page number on which the lock is.
+@return page number */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+	return(lock->un_member.rec_lock.page_no);
+}
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+	lock_t*	lock)	/*!< in/out: waiting lock request */
+{
+	que_thr_t*	thr;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(lock->trx));
+
+	lock->trx->lock.cancel = TRUE;
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+
+		lock_rec_dequeue_from_page(lock);
+	} else {
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+		if (lock->trx->autoinc_locks != NULL) {
+			/* Release the transaction's AUTOINC locks. */
+			lock_release_autoinc_locks(lock->trx);
+		}
+
+		lock_table_dequeue(lock);
+	}
+
+	/* Reset the wait flag and the back pointer to lock in trx. */
+
+	lock_reset_lock_and_trx_wait(lock);
+
+	/* The following function releases the trx from lock wait. */
+
+	thr = que_thr_end_lock_wait(lock->trx);
+
+	if (thr != NULL) {
+		lock_wait_release_thread_if_suspended(thr);
+	}
+
+	lock->trx->lock.cancel = FALSE;
+}
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd).
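+For instance, a statement-end hook in the server layer could simply
+call (sketch):
+
+	lock_unlock_table_autoinc(trx);
+
+so that an AUTO-INC lock is held for at most the duration of a single
+SQL statement.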
*/ +UNIV_INTERN +void +lock_unlock_table_autoinc( +/*======================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_ad(!lock_mutex_own()); + ut_ad(!trx_mutex_own(trx)); + ut_ad(!trx->lock.wait_lock); + /* This can be invoked on NOT_STARTED, ACTIVE, PREPARED, + but not COMMITTED transactions. */ + ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED) + || !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); + + /* This function is invoked for a running transaction by the + thread that is serving the transaction. Therefore it is not + necessary to hold trx->mutex here. */ + + if (lock_trx_holds_autoinc_locks(trx)) { + lock_mutex_enter(); + + lock_release_autoinc_locks(trx); + + lock_mutex_exit(); + } +} + +/*********************************************************************//** +Releases a transaction's locks, and releases possible other transactions +waiting because of these locks. Change the state of the transaction to +TRX_STATE_COMMITTED_IN_MEMORY. */ +UNIV_INTERN +void +lock_trx_release_locks( +/*===================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + assert_trx_in_list(trx); + + if (trx_state_eq(trx, TRX_STATE_PREPARED)) { + mutex_enter(&trx_sys->mutex); + ut_a(trx_sys->n_prepared_trx > 0); + trx_sys->n_prepared_trx--; + if (trx->is_recovered) { + ut_a(trx_sys->n_prepared_recovered_trx > 0); + trx_sys->n_prepared_recovered_trx--; + } + mutex_exit(&trx_sys->mutex); + } else { + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + } + + /* The transition of trx->state to TRX_STATE_COMMITTED_IN_MEMORY + is protected by both the lock_sys->mutex and the trx->mutex. + We also lock trx_sys->mutex, because state transition to + TRX_STATE_COMMITTED_IN_MEMORY must be atomic with removing trx + from the descriptors array. */ + lock_mutex_enter(); + mutex_enter(&trx_sys->mutex); + trx_mutex_enter(trx); + + /* The following assignment makes the transaction committed in memory + and makes its changes to data visible to other transactions. + NOTE that there is a small discrepancy from the strict formal + visibility rules here: a human user of the database can see + modifications made by another transaction T even before the necessary + log segment has been flushed to the disk. If the database happens to + crash before the flush, the user has seen modifications from T which + will never be a committed transaction. However, any transaction T2 + which sees the modifications of the committing transaction T, and + which also itself makes modifications to the database, will get an lsn + larger than the committing transaction T. In the case where the log + flush fails, and T never gets committed, also T2 will never get + committed. */ + + /*--------------------------------------*/ + trx->state = TRX_STATE_COMMITTED_IN_MEMORY; + /* The following also removes trx from trx_serial_list */ + trx_release_descriptor(trx); + /*--------------------------------------*/ + + /* If the background thread trx_rollback_or_clean_recovered() + is still active then there is a chance that the rollback + thread may see this trx as COMMITTED_IN_MEMORY and goes ahead + to clean it up calling trx_cleanup_at_db_startup(). This can + happen in the case we are committing a trx here that is left + in PREPARED state during the crash. Note that commit of the + rollback of a PREPARED trx happens in the recovery thread + while the rollback of other transactions happen in the + background thread. To avoid this race we unconditionally unset + the is_recovered flag. 
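+	Note that the lock_sys, trx_sys and trx mutexes are all still
+	held at this point, so the reset below cannot be observed
+	half-done.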
*/ + + trx->is_recovered = FALSE; + + trx_mutex_exit(trx); + + mutex_exit(&trx_sys->mutex); + + lock_release(trx); + + lock_mutex_exit(); +} + +/*********************************************************************//** +Check whether the transaction has already been rolled back because it +was selected as a deadlock victim, or if it has to wait then cancel +the wait lock. +@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */ +UNIV_INTERN +dberr_t +lock_trx_handle_wait( +/*=================*/ + trx_t* trx) /*!< in/out: trx lock state */ +{ + dberr_t err; + + lock_mutex_enter(); + + trx_mutex_enter(trx); + + if (trx->lock.was_chosen_as_deadlock_victim) { + err = DB_DEADLOCK; + } else if (trx->lock.wait_lock != NULL) { + lock_cancel_waiting_and_release(trx->lock.wait_lock); + err = DB_LOCK_WAIT; + } else { + /* The lock was probably granted before we got here. */ + err = DB_SUCCESS; + } + + lock_mutex_exit(); + trx_mutex_exit(trx); + + return(err); +} + +/*********************************************************************//** +Get the number of locks on a table. +@return number of locks */ +UNIV_INTERN +ulint +lock_table_get_n_locks( +/*===================*/ + const dict_table_t* table) /*!< in: table */ +{ + ulint n_table_locks; + + lock_mutex_enter(); + + n_table_locks = UT_LIST_GET_LEN(table->locks); + + lock_mutex_exit(); + + return(n_table_locks); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Do an exhaustive check for any locks (table or rec) against the table. +@return lock if found */ +static +const lock_t* +lock_table_locks_lookup( +/*====================*/ + const dict_table_t* table, /*!< in: check if there are + any locks held on records in + this table or on the table + itself */ + const trx_list_t* trx_list) /*!< in: trx list to check */ +{ + trx_t* trx; + + ut_a(table != NULL); + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_ad(trx_list == &trx_sys->rw_trx_list + || trx_list == &trx_sys->ro_trx_list); + + for (trx = UT_LIST_GET_FIRST(*trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + const lock_t* lock; + + assert_trx_in_list(trx); + ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list)); + + for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + + ut_a(lock->trx == trx); + + if (lock_get_type_low(lock) == LOCK_REC) { + ut_ad(!dict_index_is_online_ddl(lock->index) + || dict_index_is_clust(lock->index)); + if (lock->index->table == table) { + return(lock); + } + } else if (lock->un_member.tab_lock.table == table) { + return(lock); + } + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Check if there are any locks (table or rec) against table. +@return TRUE if table has either table or record locks. 
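+A hypothetical caller that must not proceed (e.g. with dropping the
+table) while such locks exist could poll this predicate (sketch; the
+sleep interval is illustrative only):
+
+	while (lock_table_has_locks(table)) {
+		os_thread_sleep(100000);	/* 0.1 sec */
+	}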
*/ +UNIV_INTERN +ibool +lock_table_has_locks( +/*=================*/ + const dict_table_t* table) /*!< in: check if there are any locks + held on records in this table or on the + table itself */ +{ + ibool has_locks; + + lock_mutex_enter(); + + has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0; + +#ifdef UNIV_DEBUG + if (!has_locks) { + mutex_enter(&trx_sys->mutex); + + ut_ad(!lock_table_locks_lookup(table, &trx_sys->rw_trx_list)); + ut_ad(!lock_table_locks_lookup(table, &trx_sys->ro_trx_list)); + + mutex_exit(&trx_sys->mutex); + } +#endif /* UNIV_DEBUG */ + + lock_mutex_exit(); + + return(has_locks); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Check if the transaction holds any locks on the sys tables +or its records. +@return the strongest lock found on any sys table or 0 for none */ +UNIV_INTERN +const lock_t* +lock_trx_has_sys_table_locks( +/*=========================*/ + const trx_t* trx) /*!< in: transaction to check */ +{ + lint i; + const lock_t* strongest_lock = 0; + lock_mode strongest = LOCK_NONE; + + lock_mutex_enter(); + + /* Find a valid mode. Note: ib_vector_size() can be 0. */ + for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) { + const lock_t* lock; + + lock = *static_cast<const lock_t**>( + ib_vector_get(trx->lock.table_locks, i)); + + if (lock != NULL + && dict_is_sys_table(lock->un_member.tab_lock.table->id)) { + + strongest = lock_get_mode(lock); + ut_ad(strongest != LOCK_NONE); + strongest_lock = lock; + break; + } + } + + if (strongest == LOCK_NONE) { + lock_mutex_exit(); + return(NULL); + } + + for (/* No op */; i >= 0; --i) { + const lock_t* lock; + + lock = *static_cast<const lock_t**>( + ib_vector_get(trx->lock.table_locks, i)); + + if (lock == NULL) { + continue; + } + + ut_ad(trx == lock->trx); + ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + ut_ad(lock->un_member.tab_lock.table != NULL); + + lock_mode mode = lock_get_mode(lock); + + if (dict_is_sys_table(lock->un_member.tab_lock.table->id) + && lock_mode_stronger_or_eq(mode, strongest)) { + + strongest = mode; + strongest_lock = lock; + } + } + + lock_mutex_exit(); + + return(strongest_lock); +} + +/*******************************************************************//** +Check if the transaction holds an exclusive lock on a record. 
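+As it is compiled only under UNIV_DEBUG, it is intended for use in
+assertions, e.g. (sketch):
+
+	ut_ad(lock_trx_has_rec_x_lock(trx, index->table, block,
+				      page_rec_get_heap_no(rec)));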
+@return whether the locks are held */ +UNIV_INTERN +bool +lock_trx_has_rec_x_lock( +/*====================*/ + const trx_t* trx, /*!< in: transaction to check */ + const dict_table_t* table, /*!< in: table to check */ + const buf_block_t* block, /*!< in: buffer block of the record */ + ulint heap_no)/*!< in: record heap number */ +{ + enum lock_mode intention_lock; + enum lock_mode rec_lock; + ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM); + + if (UNIV_UNLIKELY(trx->fake_changes)) { + + intention_lock = LOCK_IS; + rec_lock = LOCK_S; + } else { + + intention_lock = LOCK_IX; + rec_lock = LOCK_X; + } + lock_mutex_enter(); + ut_a(lock_table_has(trx, table, intention_lock)); + if (UNIV_LIKELY(srv_fake_changes_locks)) { + + ut_a(lock_rec_has_expl(rec_lock | LOCK_REC_NOT_GAP, + block, heap_no, trx->id)); + } + lock_mutex_exit(); + return(true); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/xtradb/lock/lock0wait.cc b/storage/xtradb/lock/lock0wait.cc new file mode 100644 index 00000000000..a1c35e20ead --- /dev/null +++ b/storage/xtradb/lock/lock0wait.cc @@ -0,0 +1,543 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file lock/lock0wait.cc +The transaction lock system + +Created 25/5/2010 Sunny Bains +*******************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "srv0mon.h" +#include "que0que.h" +#include "lock0lock.h" +#include "row0mysql.h" +#include "srv0start.h" +#include "ha_prototypes.h" +#include "lock0priv.h" + +/*********************************************************************//** +Print the contents of the lock_sys_t::waiting_threads array. */ +static +void +lock_wait_table_print(void) +/*=======================*/ +{ + ulint i; + const srv_slot_t* slot; + + ut_ad(lock_wait_mutex_own()); + + slot = lock_sys->waiting_threads; + + for (i = 0; i < OS_THREAD_MAX_N; i++, ++slot) { + + fprintf(stderr, + "Slot %lu: thread type %lu," + " in use %lu, susp %lu, timeout %lu, time %lu\n", + (ulong) i, + (ulong) slot->type, + (ulong) slot->in_use, + (ulong) slot->suspended, + slot->wait_timeout, + (ulong) difftime(ut_time(), slot->suspend_time)); + } +} + +/*********************************************************************//** +Release a slot in the lock_sys_t::waiting_threads. Adjust the array last pointer +if there are empty slots towards the end of the table. 
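+For example, with slots [in use, in use, free, free] and last_slot
+pointing just past slot 1, releasing slot 1 makes the backward scan
+stop at slot 0 (still in use) and set last_slot back to point at
+slot 1, so the freed tail of the array can be reused.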
*/ +static +void +lock_wait_table_release_slot( +/*=========================*/ + srv_slot_t* slot) /*!< in: slot to release */ +{ +#ifdef UNIV_DEBUG + srv_slot_t* upper = lock_sys->waiting_threads + OS_THREAD_MAX_N; +#endif /* UNIV_DEBUG */ + + lock_wait_mutex_enter(); + + ut_ad(slot->in_use); + ut_ad(slot->thr != NULL); + ut_ad(slot->thr->slot != NULL); + ut_ad(slot->thr->slot == slot); + + /* Must be within the array boundaries. */ + ut_ad(slot >= lock_sys->waiting_threads); + ut_ad(slot < upper); + + /* Note: When we reserve the slot we use the trx_t::mutex to update + the slot values to change the state to reserved. Here we are using the + lock mutex to change the state of the slot to free. This is by design, + because when we query the slot state we always hold both the lock and + trx_t::mutex. To reduce contention on the lock mutex when reserving the + slot we avoid acquiring the lock mutex. */ + + lock_mutex_enter(); + + slot->thr->slot = NULL; + slot->thr = NULL; + slot->in_use = FALSE; + + lock_mutex_exit(); + + /* Scan backwards and adjust the last free slot pointer. */ + for (slot = lock_sys->last_slot; + slot > lock_sys->waiting_threads && !slot->in_use; + --slot) { + /* No op */ + } + + /* Either the array is empty or the last scanned slot is in use. */ + ut_ad(slot->in_use || slot == lock_sys->waiting_threads); + + lock_sys->last_slot = slot + 1; + + /* The last slot is either outside of the array boundary or it's + on an empty slot. */ + ut_ad(lock_sys->last_slot == upper || !lock_sys->last_slot->in_use); + + ut_ad(lock_sys->last_slot >= lock_sys->waiting_threads); + ut_ad(lock_sys->last_slot <= upper); + + lock_wait_mutex_exit(); +} + +/*********************************************************************//** +Reserves a slot in the thread table for the current user OS thread. +@return reserved slot */ +static +srv_slot_t* +lock_wait_table_reserve_slot( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread associated + with the user OS thread */ + ulong wait_timeout) /*!< in: lock wait timeout value */ +{ + ulint i; + srv_slot_t* slot; + + ut_ad(lock_wait_mutex_own()); + ut_ad(trx_mutex_own(thr_get_trx(thr))); + + slot = lock_sys->waiting_threads; + + for (i = OS_THREAD_MAX_N; i--; ++slot) { + if (!slot->in_use) { + slot->in_use = TRUE; + slot->thr = thr; + slot->thr->slot = slot; + + if (slot->event == NULL) { + slot->event = os_event_create(); + ut_a(slot->event); + } + + os_event_reset(slot->event); + slot->suspended = TRUE; + slot->suspend_time = ut_time(); + slot->wait_timeout = wait_timeout; + + if (slot == lock_sys->last_slot) { + ++lock_sys->last_slot; + } + + ut_ad(lock_sys->last_slot + <= lock_sys->waiting_threads + OS_THREAD_MAX_N); + + return(slot); + } + } + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: There appear to be %lu user" + " threads currently waiting\n" + "InnoDB: inside InnoDB, which is the" + " upper limit. Cannot continue operation.\n" + "InnoDB: As a last thing, we print" + " a list of waiting threads.\n", (ulong) OS_THREAD_MAX_N); + + lock_wait_table_print(); + + ut_error; + return(NULL); +} + +/***************************************************************//** +Puts a user OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. 
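Before the function itself, here is the shape of the suspend/release handshake it implements, reduced to standard C++ as a sketch; std::condition_variable stands in for the per-slot os_event, and the names and enum values are illustrative:

#include <condition_variable>
#include <mutex>

enum db_err { DB_SUCCESS_MODEL, DB_DEADLOCK_MODEL };

struct waiter {
    std::mutex              m;
    std::condition_variable event;      /* plays the per-slot os_event */
    bool                    released = false;
    db_err                  error_state = DB_SUCCESS_MODEL;
};

/* Waiting side: park until released, then report the outcome. */
db_err suspend(waiter& w) {
    std::unique_lock<std::mutex> lk(w.m);
    w.event.wait(lk, [&w] { return w.released; });
    return w.error_state;
}

/* Releasing side, mirroring lock_wait_release_thread_if_suspended():
   record the deadlock verdict first, then signal the event. */
void release(waiter& w, bool chosen_as_victim) {
    std::lock_guard<std::mutex> lk(w.m);
    if (chosen_as_victim) {
        w.error_state = DB_DEADLOCK_MODEL;
    }
    w.released = true;
    w.event.notify_one();
}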
*/ +UNIV_INTERN +void +lock_wait_suspend_thread( +/*=====================*/ + que_thr_t* thr) /*!< in: query thread associated with the + user OS thread */ +{ + srv_slot_t* slot; + double wait_time; + trx_t* trx; + ulint had_dict_lock; + ibool was_declared_inside_innodb; + ib_int64_t start_time = 0; + ib_int64_t finish_time; + ulint sec; + ulint ms; + ulong lock_wait_timeout; + + trx = thr_get_trx(thr); + + if (trx->mysql_thd != 0) { + DEBUG_SYNC_C("lock_wait_suspend_thread_enter"); + } + + /* InnoDB system transactions (such as the purge, and + incomplete transactions that are being rolled back after crash + recovery) will use the global value of + innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */ + lock_wait_timeout = trx_lock_wait_timeout_get(trx); + + lock_wait_mutex_enter(); + + trx_mutex_enter(trx); + + trx->error_state = DB_SUCCESS; + + if (thr->state == QUE_THR_RUNNING) { + + ut_ad(thr->is_active); + + /* The lock has already been released or this transaction + was chosen as a deadlock victim: no need to suspend */ + + if (trx->lock.was_chosen_as_deadlock_victim) { + + trx->error_state = DB_DEADLOCK; + trx->lock.was_chosen_as_deadlock_victim = FALSE; + } + + lock_wait_mutex_exit(); + trx_mutex_exit(trx); + return; + } + + ut_ad(!thr->is_active); + + slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout); + + if (thr->lock_state == QUE_THR_LOCK_ROW) { + srv_stats.n_lock_wait_count.inc(); + srv_stats.n_lock_wait_current_count.inc(); + + if (ut_usectime(&sec, &ms) == -1) { + start_time = -1; + } else { + start_time = (ib_int64_t) sec * 1000000 + ms; + } + } + + /* Wake the lock timeout monitor thread, if it is suspended */ + + os_event_set(lock_sys->timeout_event); + + lock_wait_mutex_exit(); + trx_mutex_exit(trx); + + ulint lock_type = ULINT_UNDEFINED; + + lock_mutex_enter(); + + if (const lock_t* wait_lock = trx->lock.wait_lock) { + lock_type = lock_get_type_low(wait_lock); + } + + lock_mutex_exit(); + + had_dict_lock = trx->dict_operation_lock_mode; + + switch (had_dict_lock) { + case 0: + break; + case RW_S_LATCH: + /* Release foreign key check latch */ + row_mysql_unfreeze_data_dictionary(trx); + + DEBUG_SYNC_C("lock_wait_release_s_latch_before_sleep"); + break; + default: + /* There should never be a lock wait when the + dictionary latch is reserved in X mode. Dictionary + transactions should only acquire locks on dictionary + tables, not other tables. All access to dictionary + tables should be covered by dictionary + transactions. */ + ut_error; + } + + ut_a(trx->dict_operation_lock_mode == 0); + + /* Suspend this thread and wait for the event. */ + + was_declared_inside_innodb = trx->declared_to_be_inside_innodb; + + if (was_declared_inside_innodb) { + /* We must declare this OS thread to exit InnoDB, since a + possible other thread holding a lock which this thread waits + for must be allowed to enter, sooner or later */ + + srv_conc_force_exit_innodb(trx); + } + + /* Unknown is also treated like a record lock */ + if (lock_type == ULINT_UNDEFINED || lock_type == LOCK_REC) { + thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK); + } else { + ut_ad(lock_type == LOCK_TABLE); + thd_wait_begin(trx->mysql_thd, THD_WAIT_TABLE_LOCK); + } + + os_event_wait(slot->event); + + thd_wait_end(trx->mysql_thd); + + /* After resuming, reacquire the data dictionary latch if + necessary. 
*/ + + if (was_declared_inside_innodb) { + + /* Return back inside InnoDB */ + + srv_conc_force_enter_innodb(trx); + } + + if (had_dict_lock) { + + row_mysql_freeze_data_dictionary(trx); + } + + wait_time = ut_difftime(ut_time(), slot->suspend_time); + + /* Release the slot for others to use */ + + lock_wait_table_release_slot(slot); + + if (thr->lock_state == QUE_THR_LOCK_ROW) { + ulint diff_time; + + if (ut_usectime(&sec, &ms) == -1) { + finish_time = -1; + } else { + finish_time = (ib_int64_t) sec * 1000000 + ms; + } + + diff_time = (finish_time > start_time) ? + (ulint) (finish_time - start_time) : 0; + + srv_stats.n_lock_wait_current_count.dec(); + srv_stats.n_lock_wait_time.add(diff_time); + + /* Only update the variable if we successfully + retrieved the start and finish times. See Bug#36819. */ + if (diff_time > lock_sys->n_lock_max_wait_time + && start_time != -1 + && finish_time != -1) { + + lock_sys->n_lock_max_wait_time = diff_time; + } + + /* Record the lock wait time for this thread */ + thd_set_lock_wait_time(trx->mysql_thd, diff_time); + + } + + if (lock_wait_timeout < 100000000 + && wait_time > (double) lock_wait_timeout) { + + trx->error_state = DB_LOCK_WAIT_TIMEOUT; + + MONITOR_INC(MONITOR_TIMEOUT); + } + + if (trx_is_interrupted(trx)) { + + trx->error_state = DB_INTERRUPTED; + } +} + +/********************************************************************//** +Releases a user OS thread waiting for a lock to be released, if the +thread is already suspended. */ +UNIV_INTERN +void +lock_wait_release_thread_if_suspended( +/*==================================*/ + que_thr_t* thr) /*!< in: query thread associated with the + user OS thread */ +{ + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(thr_get_trx(thr))); + + /* We own both the lock mutex and the trx_t::mutex but not the + lock wait mutex. This is OK because other threads will see the state + of this slot as being in use and no other thread can change the state + of the slot to free unless that thread also owns the lock mutex. */ + + if (thr->slot != NULL && thr->slot->in_use && thr->slot->thr == thr) { + trx_t* trx = thr_get_trx(thr); + + if (trx->lock.was_chosen_as_deadlock_victim) { + + trx->error_state = DB_DEADLOCK; + trx->lock.was_chosen_as_deadlock_victim = FALSE; + } + + os_event_set(thr->slot->event); + } +} + +/*********************************************************************//** +Check if the thread lock wait has timed out. Release its locks if the +wait has actually timed out. 
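The timeout test that the next function applies to each slot can be shown in isolation. This sketch reuses the source's 100000000 "no timeout" sentinel and its guard against the system clock jumping backwards; the function name is illustrative:

#include <ctime>

/* Mirror of the cancellation test in lock_wait_check_and_cancel():
   cancel when interrupted, when the wait exceeded its timeout, or when
   the clock wrapped (negative wait time). */
bool should_cancel(std::time_t suspend_time, unsigned long wait_timeout,
                   bool interrupted)
{
    double wait_time = std::difftime(std::time(nullptr), suspend_time);

    return interrupted
        || (wait_timeout < 100000000
            && (wait_time > (double) wait_timeout || wait_time < 0));
}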
*/ +static +void +lock_wait_check_and_cancel( +/*=======================*/ + const srv_slot_t* slot) /*!< in: slot reserved by a user + thread when the wait started */ +{ + trx_t* trx; + double wait_time; + ib_time_t suspend_time = slot->suspend_time; + + ut_ad(lock_wait_mutex_own()); + + ut_ad(slot->in_use); + + ut_ad(slot->suspended); + + wait_time = ut_difftime(ut_time(), suspend_time); + + trx = thr_get_trx(slot->thr); + + if (trx_is_interrupted(trx) + || (slot->wait_timeout < 100000000 + && (wait_time > (double) slot->wait_timeout + || wait_time < 0))) { + + /* Timeout exceeded or a wrap-around in system + time counter: cancel the lock request queued + by the transaction and release possible + other transactions waiting behind; it is + possible that the lock has already been + granted: in that case do nothing */ + + lock_mutex_enter(); + + trx_mutex_enter(trx); + + if (trx->lock.wait_lock) { + + ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT); + + lock_cancel_waiting_and_release(trx->lock.wait_lock); + } + + lock_mutex_exit(); + + trx_mutex_exit(trx); + } + +} + +/*********************************************************************//** +A thread which wakes up threads whose lock wait may have lasted too long. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(lock_wait_timeout_thread)( +/*=====================================*/ + void* arg __attribute__((unused))) + /* in: a dummy parameter required by + os_thread_create */ +{ + ib_int64_t sig_count = 0; + os_event_t event = lock_sys->timeout_event; + + ut_ad(!srv_read_only_mode); + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(srv_lock_timeout_thread_key); +#endif /* UNIV_PFS_THREAD */ + + lock_sys->timeout_thread_active = true; + + do { + srv_slot_t* slot; + + /* When someone is waiting for a lock, we wake up every second + and check if a timeout has passed for a lock wait */ + + os_event_wait_time_low(event, 1000000, sig_count); + sig_count = os_event_reset(event); + + if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) { + break; + } + + lock_wait_mutex_enter(); + + /* Check all slots for user threads that are waiting + on locks, and if they have exceeded the time limit. */ + + for (slot = lock_sys->waiting_threads; + slot < lock_sys->last_slot; + ++slot) { + + /* We are doing a read without the lock mutex + and/or the trx mutex. This is OK because a slot + can't be freed or reserved without the lock wait + mutex. */ + + if (slot->in_use) { + lock_wait_check_and_cancel(slot); + } + } + + sig_count = os_event_reset(event); + + lock_wait_mutex_exit(); + + } while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); + + lock_sys->timeout_thread_active = false; + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} diff --git a/storage/xtradb/log/log0log.cc b/storage/xtradb/log/log0log.cc new file mode 100644 index 00000000000..18736636c98 --- /dev/null +++ b/storage/xtradb/log/log0log.cc @@ -0,0 +1,3955 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. 
The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file log/log0log.cc +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#include "log0log.h" + +#ifdef UNIV_NONINL +#include "log0log.ic" +#endif + +#ifndef UNIV_HOTBACKUP +#include "mem0mem.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "srv0srv.h" +#include "log0recv.h" +#include "fil0fil.h" +#include "dict0boot.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "srv0mon.h" + +/* +General philosophy of InnoDB redo-logs: + +1) Every change to the contents of a data page must be done +through mtr, which in mtr_commit() writes log records +to the InnoDB redo log. + +2) Normally these changes are performed using a mlog_write_ulint() +or similar function. + +3) In some page-level operations only the code number of a +C function and its parameters are written to the log to +reduce the size of the log. + + 3a) You should not add parameters to these kinds of functions + (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()) + + 3b) You should not add functionality which either changes the + behaviour compared with the old one or depends on data + outside of the page. These kinds of functions should implement + a self-contained page transformation, and they should stay + unchanged unless you have very essential reasons to change the + log semantics or format.
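A toy model of rules 1) and 2): every page change is funneled through a mini-transaction object that both applies the change and appends a redo record describing it. All types and names here are illustrative, not InnoDB's, and real records also carry a type code, space id and page number:

#include <cstdint>
#include <cstring>
#include <vector>

/* A mini-transaction both applies a page change and appends a record
   that can redo it; nothing touches the page except through here. */
struct mini_trx {
    std::vector<std::uint8_t> records;

    void write_u32(std::uint8_t* page, std::uint32_t offset,
                   std::uint32_t value)
    {
        std::memcpy(page + offset, &value, 4);          /* the change  */

        records.push_back(0x01);                        /* record type */
        const std::uint8_t* o = (const std::uint8_t*) &offset;
        const std::uint8_t* v = (const std::uint8_t*) &value;
        records.insert(records.end(), o, o + 4);        /* where       */
        records.insert(records.end(), v, v + 4);        /* what        */
    }
    /* commit() would copy `records` into the log buffer under the log
       mutex -- that is what log_write_low() below does for real. */
};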
+ +*/ + +/* Global log system variable */ +UNIV_INTERN log_t* log_sys = NULL; + +/** Pointer to the log checksum calculation function */ +UNIV_INTERN log_checksum_func_t log_checksum_algorithm_ptr = + log_block_calc_checksum_innodb; + +#ifdef UNIV_PFS_RWLOCK +UNIV_INTERN mysql_pfs_key_t checkpoint_lock_key; +# ifdef UNIV_LOG_ARCHIVE +UNIV_INTERN mysql_pfs_key_t archive_lock_key; +# endif +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t log_sys_mutex_key; +UNIV_INTERN mysql_pfs_key_t log_flush_order_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_DEBUG +UNIV_INTERN ibool log_do_write = TRUE; +#endif /* UNIV_DEBUG */ + +/* These control how often we print warnings if the last checkpoint is too +old */ +UNIV_INTERN ibool log_has_printed_chkp_warning = FALSE; +UNIV_INTERN time_t log_last_warning_time; + +#ifdef UNIV_LOG_ARCHIVE +/* Pointer to this variable is used as the i/o-message when we do i/o to an +archive */ +UNIV_INTERN byte log_archive_io; +#endif /* UNIV_LOG_ARCHIVE */ + +/* A margin for free space in the log buffer before a log entry is catenated */ +#define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE) + +/* Margins for free space in the log buffer after a log entry is catenated */ +#define LOG_BUF_FLUSH_RATIO 2 +#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE) + +/* Margin for the free space in the smallest log group, before a new query +step which modifies the database, is started */ + +#define LOG_CHECKPOINT_FREE_PER_THREAD (4 * UNIV_PAGE_SIZE) +#define LOG_CHECKPOINT_EXTRA_FREE (8 * UNIV_PAGE_SIZE) + +/* This parameter controls asynchronous making of a new checkpoint; the value +should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */ + +#define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32 + +/* This parameter controls synchronous preflushing of modified buffer pages */ +#define LOG_POOL_PREFLUSH_RATIO_SYNC 16 + +/* The same ratio for asynchronous preflushing; this value should be less than +the previous */ +#define LOG_POOL_PREFLUSH_RATIO_ASYNC 8 + +/* Extra margin, in addition to one log file, used in archiving */ +#define LOG_ARCHIVE_EXTRA_MARGIN (4 * UNIV_PAGE_SIZE) + +/* This parameter controls asynchronous writing to the archive */ +#define LOG_ARCHIVE_RATIO_ASYNC 16 + +/* Codes used in unlocking flush latches */ +#define LOG_UNLOCK_NONE_FLUSHED_LOCK 1 +#define LOG_UNLOCK_FLUSH_LOCK 2 + +/* States of an archiving operation */ +#define LOG_ARCHIVE_READ 1 +#define LOG_ARCHIVE_WRITE 2 + +/******************************************************//** +Completes a checkpoint write i/o to a log file. */ +static +void +log_io_complete_checkpoint(void); +/*============================*/ +#ifdef UNIV_LOG_ARCHIVE +/******************************************************//** +Completes an archiving i/o. */ +static +void +log_io_complete_archive(void); +/*=========================*/ +#endif /* UNIV_LOG_ARCHIVE */ + +/****************************************************************//** +Returns the oldest modified block lsn in the pool, or log_sys->lsn if none +exists. 
+@return LSN of oldest modification */ +static +lsn_t +log_buf_pool_get_oldest_modification(void) +/*======================================*/ +{ + lsn_t lsn; + + ut_ad(mutex_own(&(log_sys->mutex))); + + lsn = buf_pool_get_oldest_modification(); + + if (!lsn) { + + lsn = log_sys->lsn; + } + + return(lsn); +} + +/****************************************************************//** +Checks if the log groups have a big enough margin of free space +so that a new log entry can be written without overwriting log data +that has not yet been read by the changed page bitmap thread. +@return TRUE if there is not enough free space. */ +static +ibool +log_check_tracking_margin( + ulint lsn_advance) /*!< in: an upper limit on how much log data we + plan to write. If zero, the margin will be + checked for the already-written log. */ +{ + lsn_t tracked_lsn; + lsn_t tracked_lsn_age; + + if (!srv_track_changed_pages) { + return FALSE; + } + + ut_ad(mutex_own(&(log_sys->mutex))); + + tracked_lsn = log_get_tracked_lsn(); + tracked_lsn_age = log_sys->lsn - tracked_lsn; + + /* The overwrite would happen when log_sys->log_group_capacity is + exceeded, but we use max_checkpoint_age for an extra safety margin. */ + return tracked_lsn_age + lsn_advance > log_sys->max_checkpoint_age; +} + +/** Extends the log buffer. +@param[in] len requested minimum size in bytes */ +static +void +log_buffer_extend( + ulint len) +{ + ulint move_start; + ulint move_end; + byte* tmp_buf = static_cast<byte *>(alloca(OS_FILE_LOG_BLOCK_SIZE)); + + mutex_enter(&(log_sys->mutex)); + + while (log_sys->is_extending) { + /* Another thread is already extending the buffer; + wait for it to finish. */ + mutex_exit(&(log_sys->mutex)); + + log_buffer_flush_to_disk(); + + mutex_enter(&(log_sys->mutex)); + + if (srv_log_buffer_size > len / UNIV_PAGE_SIZE) { + /* Already extended enough by the other thread */ + mutex_exit(&(log_sys->mutex)); + return; + } + } + + log_sys->is_extending = true; + + while (log_sys->n_pending_writes != 0 + || ut_calc_align_down(log_sys->buf_free, + OS_FILE_LOG_BLOCK_SIZE) + != ut_calc_align_down(log_sys->buf_next_to_write, + OS_FILE_LOG_BLOCK_SIZE)) { + /* The buffer might still have more than one block to write. */ + mutex_exit(&(log_sys->mutex)); + + log_buffer_flush_to_disk(); + + mutex_enter(&(log_sys->mutex)); + } + + move_start = ut_calc_align_down( + log_sys->buf_free, + OS_FILE_LOG_BLOCK_SIZE); + move_end = log_sys->buf_free; + + /* save the last, incomplete log block */ + ut_memcpy(tmp_buf, log_sys->buf + move_start, + move_end - move_start); + + log_sys->buf_free -= move_start; + log_sys->buf_next_to_write -= move_start; + + /* reallocate the log buffer */ + srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1; + mem_free(log_sys->buf_ptr); + log_sys->buf_ptr = static_cast<byte*>( + mem_zalloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE)); + log_sys->buf = static_cast<byte*>( + ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); + log_sys->buf_size = LOG_BUFFER_SIZE; + log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO + - LOG_BUF_FLUSH_MARGIN; + + /* restore the last log block */ + ut_memcpy(log_sys->buf, tmp_buf, move_end - move_start); + + ut_ad(log_sys->is_extending); + log_sys->is_extending = false; + + mutex_exit(&(log_sys->mutex)); + + ib_logf(IB_LOG_LEVEL_INFO, + "innodb_log_buffer_size was extended to %lu.", + LOG_BUFFER_SIZE); +}
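The delicate step in log_buffer_extend() above is preserving the last, still-open log block across the reallocation, because buf_free points into it. A standalone model of that copy-out/reallocate/copy-back sequence, with std::vector standing in for the aligned buffer and alignment handling omitted:

#include <cstddef>
#include <cstring>
#include <vector>

const std::size_t BLOCK = 512;          /* OS_FILE_LOG_BLOCK_SIZE */

void extend_buffer(std::vector<unsigned char>& buf, std::size_t& buf_free,
                   std::size_t new_size)
{
    unsigned char tmp[BLOCK];
    std::size_t move_start = buf_free - buf_free % BLOCK; /* align down */
    std::size_t tail_len   = buf_free - move_start;

    std::memcpy(tmp, &buf[move_start], tail_len);   /* save the open block */
    buf.assign(new_size, 0);                        /* reallocate, zeroed  */
    std::memcpy(&buf[0], tmp, tail_len);            /* put it back first   */
    buf_free = tail_len;                            /* rebased free offset */
}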
+/************************************************************//** +Opens the log for log_write_low. The log must be closed with log_close. +@return start lsn of the log record */ +UNIV_INTERN +lsn_t +log_open( +/*=====*/ + ulint len) /*!< in: length of data to be catenated */ +{ + log_t* log = log_sys; + ulint len_upper_limit; +#ifdef UNIV_LOG_ARCHIVE + ulint archived_lsn_age; + ulint dummy; +#endif /* UNIV_LOG_ARCHIVE */ + ulint count = 0; + ulint tcount = 0; + + if (len >= log->buf_size / 2) { + DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash", + DBUG_SUICIDE();); + + /* The log buffer is too small: try to extend it instead of + crashing. */ + ib_logf(IB_LOG_LEVEL_WARN, + "The transaction log size is too large" + " for innodb_log_buffer_size (%lu >= %lu / 2). " + "Trying to extend it.", + len, LOG_BUFFER_SIZE); + + log_buffer_extend((len + 1) * 2); + } +loop: + ut_ad(!recv_no_log_write); + + if (log->is_extending) { + + mutex_exit(&(log->mutex)); + + /* The log buffer is being extended. Writing up to the next + block must wait until the extension has finished. */ + + os_thread_sleep(100000); + + ut_ad(++count < 50); + + goto loop; + } + + /* Calculate an upper limit for the space the string may take in the + log buffer */ + + len_upper_limit = LOG_BUF_WRITE_MARGIN + (5 * len) / 4; + + if (log->buf_free + len_upper_limit > log->buf_size) { + + mutex_exit(&(log->mutex)); + + /* Not enough free space, do a synchronous flush of the log + buffer */ + + log_buffer_flush_to_disk(); + + srv_stats.log_waits.inc(); + + ut_ad(++count < 50); + + mutex_enter(&(log->mutex)); + + goto loop; + } + +#ifdef UNIV_LOG_ARCHIVE + if (log->archiving_state != LOG_ARCH_OFF) { + + archived_lsn_age = log->lsn - log->archived_lsn; + if (archived_lsn_age + len_upper_limit + > log->max_archived_lsn_age) { + /* Not enough free archived space in log groups: do a + synchronous archive write batch: */ + + mutex_exit(&(log->mutex)); + + ut_ad(len_upper_limit <= log->max_archived_lsn_age); + + log_archive_do(TRUE, &dummy); + + ut_ad(++count < 50); + + mutex_enter(&(log->mutex)); + + goto loop; + } + } +#endif /* UNIV_LOG_ARCHIVE */ + + if (log_check_tracking_margin(len_upper_limit) && + (++tcount + count < 50)) { + + /* This log write would violate the untracked LSN free space + margin. Limit this to 50 retries as there might be situations + where we have no choice but to proceed anyway, e.g. if the log + is about to overflow, log tracking or not. */ + mutex_exit(&(log->mutex)); + + os_thread_sleep(10000); + + mutex_enter(&(log->mutex)); + + goto loop; + } + +#ifdef UNIV_LOG_DEBUG + log->old_buf_free = log->buf_free; + log->old_lsn = log->lsn; +#endif + return(log->lsn); +} + +/************************************************************//** +Writes the given string to the log. It is assumed that the caller holds the +log mutex.
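The core arithmetic of the function that follows is deciding how many bytes of a record still fit in the current 512-byte block once the 4-byte trailer is reserved. In isolation, with constants mirroring the source's and an illustrative function name:

#include <cstddef>

const std::size_t BLOCK    = 512;   /* OS_FILE_LOG_BLOCK_SIZE */
const std::size_t TRL_SIZE = 4;     /* LOG_BLOCK_TRL_SIZE     */

/* How many bytes of a record fit into the block that buf_free points
   into, once the trailer is reserved -- the split rule of
   log_write_low(). buf_free % BLOCK already counts the block header. */
std::size_t bytes_fitting(std::size_t buf_free, std::size_t str_len)
{
    std::size_t used = buf_free % BLOCK;

    if (used + str_len <= BLOCK - TRL_SIZE) {
        return str_len;                     /* whole record fits */
    }
    return BLOCK - used - TRL_SIZE;         /* fill up to the trailer */
}

When a block fills, the source also advances the lsn over the header and trailer bytes, initializes the next block's header, and loops with the remainder of the string.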
*/ +UNIV_INTERN +void +log_write_low( +/*==========*/ + byte* str, /*!< in: string */ + ulint str_len) /*!< in: string length */ +{ + log_t* log = log_sys; + ulint len; + ulint data_len; + byte* log_block; + + ut_ad(mutex_own(&(log->mutex))); +part_loop: + ut_ad(!recv_no_log_write); + /* Calculate a part length */ + + data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; + + if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + + /* The string fits within the current log block */ + + len = str_len; + } else { + data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; + + len = OS_FILE_LOG_BLOCK_SIZE + - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + - LOG_BLOCK_TRL_SIZE; + } + + ut_memcpy(log->buf + log->buf_free, str, len); + + str_len -= len; + str = str + len; + + log_block = static_cast<byte*>( + ut_align_down( + log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE)); + + log_block_set_data_len(log_block, data_len); + + if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + /* This block became full */ + log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); + log_block_set_checkpoint_no(log_block, + log_sys->next_checkpoint_no); + len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE; + + log->lsn += len; + + /* Initialize the next block header */ + log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn); + } else { + log->lsn += len; + } + + log->buf_free += len; + + ut_ad(log->buf_free <= log->buf_size); + + if (str_len > 0) { + goto part_loop; + } + + srv_stats.log_write_requests.inc(); +} + +/************************************************************//** +Closes the log. +@return lsn */ +UNIV_INTERN +lsn_t +log_close(void) +/*===========*/ +{ + byte* log_block; + ulint first_rec_group; + lsn_t oldest_lsn; + lsn_t lsn; + lsn_t tracked_lsn; + lsn_t tracked_lsn_age; + log_t* log = log_sys; + lsn_t checkpoint_age; + + ut_ad(mutex_own(&(log->mutex))); + ut_ad(!recv_no_log_write); + + lsn = log->lsn; + + log_block = static_cast<byte*>( + ut_align_down( + log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE)); + + first_rec_group = log_block_get_first_rec_group(log_block); + + if (first_rec_group == 0) { + /* We initialized a new log block which was not written + full by the current mtr: the next mtr log record group + will start within this block at the offset data_len */ + + log_block_set_first_rec_group( + log_block, log_block_get_data_len(log_block)); + } + + if (log->buf_free > log->max_buf_free) { + + log->check_flush_or_checkpoint = TRUE; + } + + if (srv_track_changed_pages) { + + tracked_lsn = log_get_tracked_lsn(); + tracked_lsn_age = lsn - tracked_lsn; + + if (tracked_lsn_age >= log->log_group_capacity) { + + fprintf(stderr, "InnoDB: Error: the age of the " + "oldest untracked record exceeds the log " + "group capacity!\n"); + fprintf(stderr, "InnoDB: Error: stopping the log " + "tracking thread at LSN " LSN_PF "\n", + tracked_lsn); + srv_track_changed_pages = FALSE; + } + } + + checkpoint_age = lsn - log->last_checkpoint_lsn; + + if (checkpoint_age >= log->log_group_capacity) { + /* TODO: split btr_store_big_rec_extern_fields() into small + steps so that we can release all latches in the middle, and + call log_free_check() to ensure we never write over log written + after the latest checkpoint. In principle, we should split all + big_rec operations, but other operations are smaller. 
*/ + + if (!log_has_printed_chkp_warning + || difftime(time(NULL), log_last_warning_time) > 15) { + + log_has_printed_chkp_warning = TRUE; + log_last_warning_time = time(NULL); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: the age of the last" + " checkpoint is " LSN_PF ",\n" + "InnoDB: which exceeds the log group" + " capacity " LSN_PF ".\n" + "InnoDB: If you are using big" + " BLOB or TEXT rows, you must set the\n" + "InnoDB: combined size of log files" + " at least 10 times bigger than the\n" + "InnoDB: largest such row.\n", + checkpoint_age, + log->log_group_capacity); + } + } + + if (checkpoint_age <= log->max_modified_age_sync) { + + goto function_exit; + } + + oldest_lsn = buf_pool_get_oldest_modification(); + + if (!oldest_lsn + || lsn - oldest_lsn > log->max_modified_age_sync + || checkpoint_age > log->max_checkpoint_age_async) { + + log->check_flush_or_checkpoint = TRUE; + } +function_exit: + +#ifdef UNIV_LOG_DEBUG + log_check_log_recs(log->buf + log->old_buf_free, + log->buf_free - log->old_buf_free, log->old_lsn); +#endif + + return(lsn); +} + +#ifdef UNIV_LOG_ARCHIVE +/******************************************************//** +Pads the current log block full with dummy log records. Used in producing +consistent archived log files. */ +static +void +log_pad_current_log_block(void) +/*===========================*/ +{ + byte b = MLOG_DUMMY_RECORD; + ulint pad_length; + ulint i; + lsn_t lsn; + + /* We retrieve lsn only because otherwise gcc crashed on HP-UX */ + lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE); + + pad_length = OS_FILE_LOG_BLOCK_SIZE + - (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE) + - LOG_BLOCK_TRL_SIZE; + if (pad_length + == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE + - LOG_BLOCK_TRL_SIZE)) { + + pad_length = 0; + } + + for (i = 0; i < pad_length; i++) { + log_write_low(&b, 1); + } + + lsn = log_sys->lsn; + + log_close(); + log_release(); + + ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE); +} +#endif /* UNIV_LOG_ARCHIVE */ + +/******************************************************//** +Calculates the data capacity of a log group, when the log file headers are not +included. +@return capacity in bytes */ +UNIV_INTERN +lsn_t +log_group_get_capacity( +/*===================*/ + const log_group_t* group) /*!< in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files); +} + +/******************************************************//** +Calculates the offset within a log group, when the log file headers are not +included. +@return size offset (<= offset) */ +UNIV_INLINE +lsn_t +log_group_calc_size_offset( +/*=======================*/ + lsn_t offset, /*!< in: real offset within the + log group */ + const log_group_t* group) /*!< in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size)); +} + +/******************************************************//** +Calculates the offset within a log group, when the log file headers are +included. 
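The two offset conversions here are inverses outside of the header areas: one strips one file header per file crossed, the other re-inserts them. A self-contained check of that round trip; HDR and FILE_SIZE are example values (LOG_FILE_HDR_SIZE being 2048 bytes is an assumption of this sketch):

#include <cassert>
#include <cstdint>

const std::uint64_t HDR       = 2048;     /* assumed LOG_FILE_HDR_SIZE     */
const std::uint64_t FILE_SIZE = 1 << 20;  /* example file size, w/ header  */

std::uint64_t size_offset(std::uint64_t real) {
    return real - HDR * (1 + real / FILE_SIZE);
}
std::uint64_t real_offset(std::uint64_t size) {
    return size + HDR * (1 + size / (FILE_SIZE - HDR));
}

int main() {
    /* Round-trips for every data position; the header areas themselves
       have no size-offset representation and are skipped. */
    for (std::uint64_t r = HDR; r < 3 * FILE_SIZE; r += 512) {
        if (r % FILE_SIZE < HDR) continue;
        assert(real_offset(size_offset(r)) == r);
    }
    return 0;
}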
+@return real offset (>= offset) */ +UNIV_INLINE +lsn_t +log_group_calc_real_offset( +/*=======================*/ + lsn_t offset, /*!< in: size offset within the + log group */ + const log_group_t* group) /*!< in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + return(offset + LOG_FILE_HDR_SIZE + * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE))); +} + +/******************************************************//** +Calculates the offset of an lsn within a log group. +@return offset within the log group */ +static +lsn_t +log_group_calc_lsn_offset( +/*======================*/ + lsn_t lsn, /*!< in: lsn */ + const log_group_t* group) /*!< in: log group */ +{ + lsn_t gr_lsn; + lsn_t gr_lsn_size_offset; + lsn_t difference; + lsn_t group_size; + lsn_t offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + + gr_lsn = group->lsn; + + gr_lsn_size_offset = log_group_calc_size_offset(group->lsn_offset, group); + + group_size = log_group_get_capacity(group); + + if (lsn >= gr_lsn) { + + difference = lsn - gr_lsn; + } else { + difference = gr_lsn - lsn; + + difference = difference % group_size; + + difference = group_size - difference; + } + + offset = (gr_lsn_size_offset + difference) % group_size; + + /* fprintf(stderr, + "Offset is " LSN_PF " gr_lsn_offset is " LSN_PF + " difference is " LSN_PF "\n", + offset, gr_lsn_size_offset, difference); + */ + + return(log_group_calc_real_offset(offset, group)); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +UNIV_INTERN ibool log_debug_writes = FALSE; +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Calculates where in log files we find a specified lsn. +@return log file number */ +UNIV_INTERN +ulint +log_calc_where_lsn_is( +/*==================*/ + ib_int64_t* log_file_offset, /*!< out: offset in that file + (including the header) */ + ib_uint64_t first_header_lsn, /*!< in: first log file start + lsn */ + ib_uint64_t lsn, /*!< in: lsn whose position to + determine */ + ulint n_log_files, /*!< in: total number of log + files */ + ib_int64_t log_file_size) /*!< in: log file size + (including the header) */ +{ + ib_int64_t capacity = log_file_size - LOG_FILE_HDR_SIZE; + ulint file_no; + ib_int64_t add_this_many; + + if (lsn < first_header_lsn) { + add_this_many = 1 + (first_header_lsn - lsn) + / (capacity * (ib_int64_t) n_log_files); + lsn += add_this_many + * capacity * (ib_int64_t) n_log_files; + } + + ut_a(lsn >= first_header_lsn); + + file_no = ((ulint)((lsn - first_header_lsn) / capacity)) + % n_log_files; + *log_file_offset = (lsn - first_header_lsn) % capacity; + + *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE; + + return(file_no); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Sets the field values in group to correspond to a given lsn. For this function +to work, the values must already be correctly initialized to correspond to +some lsn, for instance, a checkpoint lsn. */ +UNIV_INTERN +void +log_group_set_fields( +/*=================*/ + log_group_t* group, /*!< in/out: group */ + lsn_t lsn) /*!< in: lsn for which the values should be + set */ +{ + group->lsn_offset = log_group_calc_lsn_offset(lsn, group); + group->lsn = lsn; +} + +/*****************************************************************//** +Calculates the recommended highest values for lsn - last_checkpoint_lsn, +lsn - buf_get_oldest_modification(), and lsn - max_archive_lsn_age. 
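log_calc_where_lsn_is() above reduces to modular arithmetic over the per-file payload. A simplified standalone version, which assumes lsn >= first_header_lsn and so skips the wrap-forward adjustment in the source; the names are illustrative:

#include <cstdint>

struct log_pos { std::uint64_t file_no; std::uint64_t offset; };

log_pos where_lsn_is(std::uint64_t lsn, std::uint64_t first_header_lsn,
                     std::uint64_t n_files, std::uint64_t file_size,
                     std::uint64_t hdr_size)
{
    std::uint64_t capacity = file_size - hdr_size;  /* payload per file */
    std::uint64_t rel      = lsn - first_header_lsn;

    log_pos p;
    p.file_no = (rel / capacity) % n_files;   /* wraps around the ring */
    p.offset  = rel % capacity + hdr_size;    /* skip the file header  */
    return p;
}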
+@return error value FALSE if the smallest log group is too small to +accommodate the number of OS threads in the database server */ +static +ibool +log_calc_max_ages(void) +/*===================*/ +{ + log_group_t* group; + lsn_t margin; + ulint free; + ibool success = TRUE; + lsn_t smallest_capacity; + lsn_t archive_margin; + lsn_t smallest_archive_margin; + + mutex_enter(&(log_sys->mutex)); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + ut_ad(group); + + smallest_capacity = LSN_MAX; + smallest_archive_margin = LSN_MAX; + + while (group) { + if (log_group_get_capacity(group) < smallest_capacity) { + + smallest_capacity = log_group_get_capacity(group); + } + + archive_margin = log_group_get_capacity(group) + - (group->file_size - LOG_FILE_HDR_SIZE) + - LOG_ARCHIVE_EXTRA_MARGIN; + + if (archive_margin < smallest_archive_margin) { + + smallest_archive_margin = archive_margin; + } + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + /* Add extra safety */ + smallest_capacity = smallest_capacity - smallest_capacity / 10; + + /* For each OS thread we must reserve so much free space in the + smallest log group that it can accommodate the log entries produced + by single query steps: running out of free log space is a serious + system error which requires rebooting the database. */ + + free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency) + + LOG_CHECKPOINT_EXTRA_FREE; + if (free >= smallest_capacity / 2) { + success = FALSE; + + goto failure; + } else { + margin = smallest_capacity - free; + } + + margin = margin - margin / 10; /* Add still some extra safety */ + + log_sys->log_group_capacity = smallest_capacity; + + log_sys->max_modified_age_async = margin + - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC; + log_sys->max_modified_age_sync = margin + - margin / LOG_POOL_PREFLUSH_RATIO_SYNC; + + log_sys->max_checkpoint_age_async = margin - margin + / LOG_POOL_CHECKPOINT_RATIO_ASYNC; + log_sys->max_checkpoint_age = margin; + +#ifdef UNIV_LOG_ARCHIVE + log_sys->max_archived_lsn_age = smallest_archive_margin; + + log_sys->max_archived_lsn_age_async = smallest_archive_margin + - smallest_archive_margin / LOG_ARCHIVE_RATIO_ASYNC; +#endif /* UNIV_LOG_ARCHIVE */ +failure: + mutex_exit(&(log_sys->mutex)); + + if (!success) { + fprintf(stderr, + "InnoDB: Error: ib_logfiles are too small" + " for innodb_thread_concurrency %lu.\n" + "InnoDB: The combined size of ib_logfiles" + " should be bigger than\n" + "InnoDB: 200 kB * innodb_thread_concurrency.\n" + "InnoDB: To get mysqld to start up, set" + " innodb_thread_concurrency in my.cnf\n" + "InnoDB: to a lower value, for example, to 8." + " After an ERROR-FREE shutdown\n" + "InnoDB: of mysqld you can adjust the size of" + " ib_logfiles, as explained in\n" + "InnoDB: " REFMAN "adding-and-removing.html\n" + "InnoDB: Cannot continue operation." + " Calling exit(1).\n", + (ulong) srv_thread_concurrency); + + exit(1); + } + + return(success); +} + +/******************************************************//** +Initializes the log. 
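The margin arithmetic of log_calc_max_ages() condensed into one function, using the ratio macros defined earlier in this file. The failure path for too-small log files is omitted, and taking UNIV_PAGE_SIZE as 16 KiB is an assumption of the sketch:

#include <cstdint>

struct log_margins {
    std::uint64_t max_modified_age_async;
    std::uint64_t max_modified_age_sync;
    std::uint64_t max_checkpoint_age_async;
    std::uint64_t max_checkpoint_age;
};

log_margins calc_margins(std::uint64_t smallest_capacity,
                         std::uint64_t thread_concurrency)
{
    const std::uint64_t page = 16384;               /* assumed UNIV_PAGE_SIZE */

    smallest_capacity -= smallest_capacity / 10;    /* 10% extra safety */

    std::uint64_t free_space =
        4 * page * (10 + thread_concurrency)        /* ..._FREE_PER_THREAD */
        + 8 * page;                                 /* ..._EXTRA_FREE      */

    std::uint64_t margin = smallest_capacity - free_space;
    margin -= margin / 10;                          /* still more safety */

    log_margins m;
    m.max_checkpoint_age       = margin;
    m.max_checkpoint_age_async = margin - margin / 32; /* CHECKPOINT_RATIO_ASYNC */
    m.max_modified_age_sync    = margin - margin / 16; /* PREFLUSH_RATIO_SYNC    */
    m.max_modified_age_async   = margin - margin / 8;  /* PREFLUSH_RATIO_ASYNC   */
    return m;
}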
*/ +UNIV_INTERN +void +log_init(void) +/*==========*/ +{ + log_sys = static_cast<log_t*>(mem_alloc(sizeof(log_t))); + + mutex_create(log_sys_mutex_key, &log_sys->mutex, SYNC_LOG); + + mutex_create(log_flush_order_mutex_key, + &log_sys->log_flush_order_mutex, + SYNC_LOG_FLUSH_ORDER); + + mutex_enter(&(log_sys->mutex)); + + /* Start the lsn from one log block from zero: this way every + log record has a start lsn != zero, a fact which we will use */ + + log_sys->lsn = LOG_START_LSN; + + ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE); + ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE); + + log_sys->buf_ptr = static_cast<byte*>( + mem_zalloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE)); + + log_sys->buf = static_cast<byte*>( + ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); + + log_sys->buf_size = LOG_BUFFER_SIZE; + log_sys->is_extending = false; + + log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO + - LOG_BUF_FLUSH_MARGIN; + log_sys->check_flush_or_checkpoint = TRUE; + UT_LIST_INIT(log_sys->log_groups); + + log_sys->n_log_ios = 0; + + log_sys->n_log_ios_old = log_sys->n_log_ios; + log_sys->last_printout_time = time(NULL); + /*----------------------------*/ + + log_sys->buf_next_to_write = 0; + + log_sys->write_lsn = 0; + log_sys->current_flush_lsn = 0; + log_sys->flushed_to_disk_lsn = 0; + + log_sys->written_to_some_lsn = log_sys->lsn; + log_sys->written_to_all_lsn = log_sys->lsn; + + log_sys->n_pending_writes = 0; + + log_sys->no_flush_event = os_event_create(); + + os_event_set(log_sys->no_flush_event); + + log_sys->one_flushed_event = os_event_create(); + + os_event_set(log_sys->one_flushed_event); + + /*----------------------------*/ + + log_sys->next_checkpoint_no = 0; + log_sys->last_checkpoint_lsn = log_sys->lsn; + log_sys->n_pending_checkpoint_writes = 0; + + + rw_lock_create(checkpoint_lock_key, &log_sys->checkpoint_lock, + SYNC_NO_ORDER_CHECK); + + log_sys->checkpoint_buf_ptr = static_cast<byte*>( + mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE)); + + log_sys->checkpoint_buf = static_cast<byte*>( + ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); + + /*----------------------------*/ + +#ifdef UNIV_LOG_ARCHIVE + /* Under MySQL, log archiving is always off */ + log_sys->archiving_state = LOG_ARCH_OFF; + log_sys->archived_lsn = log_sys->lsn; + log_sys->next_archived_lsn = 0; + + log_sys->n_pending_archive_ios = 0; + + rw_lock_create(archive_lock_key, &log_sys->archive_lock, + SYNC_NO_ORDER_CHECK); + + log_sys->archive_buf_ptr = static_cast<byte*>( + mem_zalloc(LOG_ARCHIVE_BUF_SIZE + OS_FILE_LOG_BLOCK_SIZE)); + + log_sys->archive_buf = static_cast<byte*>( + ut_align(log_sys->archive_buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); + + log_sys->archive_buf_size = LOG_ARCHIVE_BUF_SIZE; + + log_sys->archiving_on = os_event_create(); +#endif /* UNIV_LOG_ARCHIVE */ + + log_sys->tracked_lsn = 0; + + /*----------------------------*/ + + log_block_init(log_sys->buf, log_sys->lsn); + log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); + + log_sys->buf_free = LOG_BLOCK_HDR_SIZE; + log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE; + + MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, + log_sys->lsn - log_sys->last_checkpoint_lsn); + + mutex_exit(&(log_sys->mutex)); + +#ifdef UNIV_LOG_DEBUG + recv_sys_create(); + recv_sys_init(buf_pool_get_curr_size()); + + recv_sys->parse_start_lsn = log_sys->lsn; + recv_sys->scanned_lsn = log_sys->lsn; + recv_sys->scanned_checkpoint_no = 0; + recv_sys->recovered_lsn = log_sys->lsn; + recv_sys->limit_lsn = LSN_MAX; +#endif +} + 
+/******************************************************************//** +Inits a log group to the log system. */ +UNIV_INTERN +void +log_group_init( +/*===========*/ + ulint id, /*!< in: group id */ + ulint n_files, /*!< in: number of log files */ + lsn_t file_size, /*!< in: log file size in bytes */ + ulint space_id, /*!< in: space id of the file space + which contains the log files of this + group */ + ulint archive_space_id __attribute__((unused))) + /*!< in: space id of the file space + which contains some archived log + files for this group; currently, only + for the first log group this is + used */ +{ + ulint i; + + log_group_t* group; + + group = static_cast<log_group_t*>(mem_alloc(sizeof(log_group_t))); + + group->id = id; + group->n_files = n_files; + group->file_size = file_size; + group->space_id = space_id; + group->state = LOG_GROUP_OK; + group->lsn = LOG_START_LSN; + group->lsn_offset = LOG_FILE_HDR_SIZE; + group->n_pending_writes = 0; + + group->file_header_bufs_ptr = static_cast<byte**>( + mem_zalloc(sizeof(byte*) * n_files)); + + group->file_header_bufs = static_cast<byte**>( + mem_zalloc(sizeof(byte**) * n_files)); + +#ifdef UNIV_LOG_ARCHIVE + group->archive_file_header_bufs_ptr = static_cast<byte**>( + mem_zalloc( sizeof(byte*) * n_files)); + + group->archive_file_header_bufs = static_cast<byte**>( + mem_zalloc(sizeof(byte*) * n_files)); +#endif /* UNIV_LOG_ARCHIVE */ + + for (i = 0; i < n_files; i++) { + group->file_header_bufs_ptr[i] = static_cast<byte*>( + mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE)); + + group->file_header_bufs[i] = static_cast<byte*>( + ut_align(group->file_header_bufs_ptr[i], + OS_FILE_LOG_BLOCK_SIZE)); + +#ifdef UNIV_LOG_ARCHIVE + group->archive_file_header_bufs_ptr[i] = static_cast<byte*>( + mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE)); + + group->archive_file_header_bufs[i] = static_cast<byte*>( + ut_align(group->archive_file_header_bufs_ptr[i], + OS_FILE_LOG_BLOCK_SIZE)); +#endif /* UNIV_LOG_ARCHIVE */ + } + +#ifdef UNIV_LOG_ARCHIVE + group->archive_space_id = archive_space_id; + + group->archived_file_no = LOG_START_LSN; + group->archived_offset = 0; +#endif /* UNIV_LOG_ARCHIVE */ + + group->checkpoint_buf_ptr = static_cast<byte*>( + mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE)); + + group->checkpoint_buf = static_cast<byte*>( + ut_align(group->checkpoint_buf_ptr,OS_FILE_LOG_BLOCK_SIZE)); + + UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group); + + ut_a(log_calc_max_ages()); +} + +/******************************************************************//** +Does the unlockings needed in flush i/o completion. */ +UNIV_INLINE +void +log_flush_do_unlocks( +/*=================*/ + ulint code) /*!< in: any ORed combination of LOG_UNLOCK_FLUSH_LOCK + and LOG_UNLOCK_NONE_FLUSHED_LOCK */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + /* NOTE that we must own the log mutex when doing the setting of the + events: this is because transactions will wait for these events to + be set, and at that moment the log flush they were waiting for must + have ended. If the log mutex were not reserved here, the i/o-thread + calling this function might be preempted for a while, and when it + resumed execution, it might be that a new flush had been started, and + this function would erroneously signal the NEW flush as completed. + Thus, the changes in the state of these events are performed + atomically in conjunction with the changes in the state of + log_sys->n_pending_writes etc. 
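The rule spelled out in the comment above, signalling completion events only while still holding the log mutex, looks like this in standard C++; names are illustrative and std::condition_variable plays the os_event:

#include <condition_variable>
#include <mutex>

std::mutex              log_mutex;      /* plays log_sys->mutex */
std::condition_variable no_flush;       /* plays no_flush_event */
int  n_pending_writes = 1;
bool flush_done       = false;

/* I/O completion: the counter update and the signal happen in one
   critical section, so a waiter can never see the event "set" for a
   flush other than the one whose counters it observed. */
void complete_flush() {
    std::lock_guard<std::mutex> lk(log_mutex);
    if (--n_pending_writes == 0) {
        flush_done = true;
        no_flush.notify_all();          /* still holding the mutex */
    }
}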
*/ + + if (code & LOG_UNLOCK_NONE_FLUSHED_LOCK) { + os_event_set(log_sys->one_flushed_event); + } + + if (code & LOG_UNLOCK_FLUSH_LOCK) { + os_event_set(log_sys->no_flush_event); + } +} + +/******************************************************************//** +Checks if a flush is completed for a log group and does the completion +routine if yes. +@return LOG_UNLOCK_NONE_FLUSHED_LOCK or 0 */ +UNIV_INLINE +ulint +log_group_check_flush_completion( +/*=============================*/ + log_group_t* group) /*!< in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + if (!log_sys->one_flushed && group->n_pending_writes == 0) { +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Log flushed first to group %lu\n", + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + log_sys->written_to_some_lsn = log_sys->write_lsn; + log_sys->one_flushed = TRUE; + + return(LOG_UNLOCK_NONE_FLUSHED_LOCK); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes && (group->n_pending_writes == 0)) { + + fprintf(stderr, "Log flushed to group %lu\n", + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + return(0); +} + +/******************************************************//** +Checks if a flush is completed and does the completion routine if yes. +@return LOG_UNLOCK_FLUSH_LOCK or 0 */ +static +ulint +log_sys_check_flush_completion(void) +/*================================*/ +{ + ulint move_start; + ulint move_end; + + ut_ad(mutex_own(&(log_sys->mutex))); + + if (log_sys->n_pending_writes == 0) { + + log_sys->written_to_all_lsn = log_sys->write_lsn; + log_sys->buf_next_to_write = log_sys->write_end_offset; + + if (log_sys->write_end_offset > log_sys->max_buf_free / 2) { + /* Move the log buffer content to the start of the + buffer */ + + move_start = ut_calc_align_down( + log_sys->write_end_offset, + OS_FILE_LOG_BLOCK_SIZE); + move_end = ut_calc_align(log_sys->buf_free, + OS_FILE_LOG_BLOCK_SIZE); + + ut_memmove(log_sys->buf, log_sys->buf + move_start, + move_end - move_start); + log_sys->buf_free -= move_start; + + log_sys->buf_next_to_write -= move_start; + } + + return(LOG_UNLOCK_FLUSH_LOCK); + } + + return(0); +} + +/******************************************************//** +Completes an i/o to a log file. */ +UNIV_INTERN +void +log_io_complete( +/*============*/ + log_group_t* group) /*!< in: log group or a dummy pointer */ +{ + ulint unlock; + +#ifdef UNIV_LOG_ARCHIVE + if ((byte*) group == &log_archive_io) { + /* It was an archive write */ + + log_io_complete_archive(); + + return; + } +#endif /* UNIV_LOG_ARCHIVE */ + + if ((ulint) group & 0x1UL) { + /* It was a checkpoint write */ + group = (log_group_t*)((ulint) group - 1); + + if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + + fil_flush(group->space_id); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Checkpoint info written to group %lu\n", + group->id); + } +#endif /* UNIV_DEBUG */ + log_io_complete_checkpoint(); + + return; + } + + ut_error; /*!< We currently use synchronous writing of the + logs and cannot end up here! 
*/ + + if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && thd_flush_log_at_trx_commit(NULL) != 2) { + + fil_flush(group->space_id); + } + + mutex_enter(&(log_sys->mutex)); + ut_ad(!recv_no_log_write); + + ut_a(group->n_pending_writes > 0); + ut_a(log_sys->n_pending_writes > 0); + + group->n_pending_writes--; + log_sys->n_pending_writes--; + MONITOR_DEC(MONITOR_PENDING_LOG_WRITE); + + unlock = log_group_check_flush_completion(group); + unlock = unlock | log_sys_check_flush_completion(); + + log_flush_do_unlocks(unlock); + + mutex_exit(&(log_sys->mutex)); +} + +/******************************************************//** +Writes a log file header to a log file space. */ +static +void +log_group_file_header_flush( +/*========================*/ + log_group_t* group, /*!< in: log group */ + ulint nth_file, /*!< in: header to the nth file in the + log file space */ + lsn_t start_lsn) /*!< in: log file data starts at this + lsn */ +{ + byte* buf; + lsn_t dest_offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_ad(!recv_no_log_write); + ut_a(nth_file < group->n_files); + + buf = *(group->file_header_bufs + nth_file); + + mach_write_to_4(buf + LOG_GROUP_ID, group->id); + mach_write_to_8(buf + LOG_FILE_START_LSN, start_lsn); + + /* Wipe over possible label of mysqlbackup --restore */ + memcpy(buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, " ", 4); + + mach_write_to_4(buf + LOG_FILE_OS_FILE_LOG_BLOCK_SIZE, + srv_log_block_size); + + dest_offset = nth_file * group->file_size; + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Writing log file header to group %lu file %lu\n", + (ulong) group->id, (ulong) nth_file); + } +#endif /* UNIV_DEBUG */ + if (log_do_write) { + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + srv_stats.os_log_pending_writes.inc(); + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0, + (ulint) (dest_offset / UNIV_PAGE_SIZE), + (ulint) (dest_offset % UNIV_PAGE_SIZE), + OS_FILE_LOG_BLOCK_SIZE, + buf, group); + + srv_stats.os_log_pending_writes.dec(); + } +} + +/******************************************************//** +Stores a 4-byte checksum to the trailer checksum field of a log block +before writing it to a log file. This checksum is used in recovery to +check the consistency of a log block. */ +static +void +log_block_store_checksum( +/*=====================*/ + byte* block) /*!< in/out: pointer to a log block */ +{ + log_block_set_checksum(block, log_block_calc_checksum(block)); +} + +/******************************************************//** +Writes a buffer to a log file group. 
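A model of the per-block trailer checksum stored before each write. The checksum function here is only a placeholder, since the real one is dispatched through log_checksum_algorithm_ptr; the big-endian store mimics mach_write_to_4():

#include <cstddef>
#include <cstdint>

const std::size_t BLOCK = 512;          /* OS_FILE_LOG_BLOCK_SIZE */

/* Placeholder checksum over the block body. */
std::uint32_t toy_checksum(const std::uint8_t* block) {
    std::uint32_t sum = 1;
    for (std::size_t i = 0; i < BLOCK - 4; i++) {
        sum = sum * 31 + block[i];
    }
    return sum;
}

/* Store it in the 4-byte trailer, as log_block_store_checksum() does. */
void store_checksum(std::uint8_t* block) {
    std::uint32_t c   = toy_checksum(block);
    std::uint8_t* trl = block + BLOCK - 4;

    trl[0] = (std::uint8_t)(c >> 24);
    trl[1] = (std::uint8_t)(c >> 16);
    trl[2] = (std::uint8_t)(c >> 8);
    trl[3] = (std::uint8_t)(c);
}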
*/ +UNIV_INTERN +void +log_group_write_buf( +/*================*/ + log_group_t* group, /*!< in: log group */ + byte* buf, /*!< in: buffer */ + ulint len, /*!< in: buffer len; must be divisible + by OS_FILE_LOG_BLOCK_SIZE */ + lsn_t start_lsn, /*!< in: start lsn of the buffer; must + be divisible by + OS_FILE_LOG_BLOCK_SIZE */ + ulint new_data_offset)/*!< in: start offset of new data in + buf: this parameter is used to decide + if we have to write a new log file + header */ +{ + ulint write_len; + ibool write_header; + lsn_t next_offset; + ulint i; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_ad(!recv_no_log_write); + ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + + if (new_data_offset == 0) { + write_header = TRUE; + } else { + write_header = FALSE; + } +loop: + if (len == 0) { + + return; + } + + next_offset = log_group_calc_lsn_offset(start_lsn, group); + + if ((next_offset % group->file_size == LOG_FILE_HDR_SIZE) + && write_header) { + /* We start to write a new log file instance in the group */ + + ut_a(next_offset / group->file_size <= ULINT_MAX); + + log_group_file_header_flush(group, (ulint) + (next_offset / group->file_size), + start_lsn); + srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE); + + srv_stats.log_writes.inc(); + } + + if ((next_offset % group->file_size) + len > group->file_size) { + + /* if the above condition holds, then the below expression + is < len which is ulint, so the typecast is ok */ + write_len = (ulint) + (group->file_size - (next_offset % group->file_size)); + } else { + write_len = len; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + + fprintf(stderr, + "Writing log file segment to group %lu" + " offset " LSN_PF " len %lu\n" + "start lsn " LSN_PF "\n" + "First block n:o %lu last block n:o %lu\n", + (ulong) group->id, next_offset, + write_len, + start_lsn, + (ulong) log_block_get_hdr_no(buf), + (ulong) log_block_get_hdr_no( + buf + write_len - OS_FILE_LOG_BLOCK_SIZE)); + ut_a(log_block_get_hdr_no(buf) + == log_block_convert_lsn_to_no(start_lsn)); + + for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) { + + ut_a(log_block_get_hdr_no(buf) + i + == log_block_get_hdr_no( + buf + i * OS_FILE_LOG_BLOCK_SIZE)); + } + } +#endif /* UNIV_DEBUG */ + /* Calculate the checksums for each log block and write them to + the trailer fields of the log blocks */ + + for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) { + log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE); + } + + if (log_do_write) { + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + srv_stats.os_log_pending_writes.inc(); + + ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX); + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0, + (ulint) (next_offset / UNIV_PAGE_SIZE), + (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, + group); + + srv_stats.os_log_pending_writes.dec(); + + srv_stats.os_log_written.add(write_len); + srv_stats.log_writes.inc(); + } + + if (write_len < len) { + start_lsn += write_len; + len -= write_len; + buf += write_len; + + write_header = TRUE; + + goto loop; + } +} + +/******************************************************//** +This function is called, e.g., when a transaction wants to commit. It checks +that the log has been written to the log file up to the last log entry written +by the transaction. If there is a flush running, it waits and checks if the +flush flushed enough. If not, starts a new flush. 
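One detail of log_group_write_buf() above worth isolating: a single write never crosses a log-file boundary, so the length is clamped to the end of the current file and the remainder is looped over, writing a fresh file header on each new file. The clamp by itself, with an illustrative name:

#include <cstdint>

std::uint64_t clamp_write_len(std::uint64_t next_offset, std::uint64_t len,
                              std::uint64_t file_size)
{
    std::uint64_t in_file = next_offset % file_size;  /* position in file */

    if (in_file + len > file_size) {
        return file_size - in_file;     /* stop at the end of this file */
    }
    return len;
}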
*/ +UNIV_INTERN +void +log_write_up_to( +/*============*/ + lsn_t lsn, /*!< in: log sequence number up to which + the log should be written, + LSN_MAX if not specified */ + ulint wait, /*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, + or LOG_WAIT_ALL_GROUPS */ + ibool flush_to_disk) + /*!< in: TRUE if we want the written log + also to be flushed to disk */ +{ + log_group_t* group; + ulint start_offset; + ulint end_offset; + ulint area_start; + ulint area_end; +#ifdef UNIV_DEBUG + ulint loop_count = 0; +#endif /* UNIV_DEBUG */ + ulint unlock; + + ut_ad(!srv_read_only_mode); + + if (recv_no_ibuf_operations) { + /* Recovery is running and no operations on the log files are + allowed yet (the variable name .._no_ibuf_.. is misleading) */ + + return; + } + +loop: +#ifdef UNIV_DEBUG + loop_count++; + + ut_ad(loop_count < 5); + +# if 0 + if (loop_count > 2) { + fprintf(stderr, "Log loop count %lu\n", loop_count); + } +# endif +#endif + + mutex_enter(&(log_sys->mutex)); + ut_ad(!recv_no_log_write); + + if (flush_to_disk + && log_sys->flushed_to_disk_lsn >= lsn) { + + mutex_exit(&(log_sys->mutex)); + + return; + } + + if (!flush_to_disk + && (log_sys->written_to_all_lsn >= lsn + || (log_sys->written_to_some_lsn >= lsn + && wait != LOG_WAIT_ALL_GROUPS))) { + + mutex_exit(&(log_sys->mutex)); + + return; + } + + if (log_sys->n_pending_writes > 0) { + /* A write (+ possibly flush to disk) is running */ + + if (flush_to_disk + && log_sys->current_flush_lsn >= lsn) { + /* The write + flush will write enough: wait for it to + complete */ + + goto do_waits; + } + + if (!flush_to_disk + && log_sys->write_lsn >= lsn) { + /* The write will write enough: wait for it to + complete */ + + goto do_waits; + } + + mutex_exit(&(log_sys->mutex)); + + /* Wait for the write to complete and try to start a new + write */ + + os_event_wait(log_sys->no_flush_event); + + goto loop; + } + + if (!flush_to_disk + && log_sys->buf_free == log_sys->buf_next_to_write) { + /* Nothing to write and no flush to disk requested */ + + mutex_exit(&(log_sys->mutex)); + + return; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Writing log from " LSN_PF " up to lsn " LSN_PF "\n", + log_sys->written_to_all_lsn, + log_sys->lsn); + } +#endif /* UNIV_DEBUG */ + log_sys->n_pending_writes++; + MONITOR_INC(MONITOR_PENDING_LOG_WRITE); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + group->n_pending_writes++; /*!< We assume here that we have only + one log group! 
*/ + + os_event_reset(log_sys->no_flush_event); + os_event_reset(log_sys->one_flushed_event); + + start_offset = log_sys->buf_next_to_write; + end_offset = log_sys->buf_free; + + area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE); + area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE); + + ut_ad(area_end - area_start > 0); + + log_sys->write_lsn = log_sys->lsn; + + if (flush_to_disk) { + log_sys->current_flush_lsn = log_sys->lsn; + } + + log_sys->one_flushed = FALSE; + + log_block_set_flush_bit(log_sys->buf + area_start, TRUE); + log_block_set_checkpoint_no( + log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE, + log_sys->next_checkpoint_no); + + /* Copy the last, incompletely written, log block a log block length + up, so that when the flush operation writes from the log buffer, the + segment to write will not be changed by writers to the log */ + + ut_memcpy(log_sys->buf + area_end, + log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE, + OS_FILE_LOG_BLOCK_SIZE); + + log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE; + log_sys->write_end_offset = log_sys->buf_free; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + /* Do the write to the log files */ + + while (group) { + log_group_write_buf( + group, log_sys->buf + area_start, + area_end - area_start, + ut_uint64_align_down(log_sys->written_to_all_lsn, + OS_FILE_LOG_BLOCK_SIZE), + start_offset - area_start); + + log_group_set_fields(group, log_sys->write_lsn); + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + mutex_exit(&(log_sys->mutex)); + + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC + || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + /* O_DSYNC or ALL_O_DIRECT means the OS did not buffer the log + file at all: so we have also flushed to disk what we have + written */ + + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; + + } else if (flush_to_disk) { + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + fil_flush(group->space_id); + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; + } + + mutex_enter(&(log_sys->mutex)); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + ut_a(group->n_pending_writes == 1); + ut_a(log_sys->n_pending_writes == 1); + + group->n_pending_writes--; + log_sys->n_pending_writes--; + MONITOR_DEC(MONITOR_PENDING_LOG_WRITE); + + unlock = log_group_check_flush_completion(group); + unlock = unlock | log_sys_check_flush_completion(); + + log_flush_do_unlocks(unlock); + + mutex_exit(&(log_sys->mutex)); + + return; + +do_waits: + mutex_exit(&(log_sys->mutex)); + + switch (wait) { + case LOG_WAIT_ONE_GROUP: + os_event_wait(log_sys->one_flushed_event); + break; + case LOG_WAIT_ALL_GROUPS: + os_event_wait(log_sys->no_flush_event); + break; +#ifdef UNIV_DEBUG + case LOG_NO_WAIT: + break; + default: + ut_error; +#endif /* UNIV_DEBUG */ + } +} + +/****************************************************************//** +Does a synchronous flush of the log buffer to disk. */ +UNIV_INTERN +void +log_buffer_flush_to_disk(void) +/*==========================*/ +{ + lsn_t lsn; + + ut_ad(!srv_read_only_mode); + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE); +} + +/****************************************************************//** +This function writes the log buffer to the log file and, if 'flush' +is set, it forces a flush of the log file as well. It is meant to be +called from the background master thread only, as it does not wait for +the write (+ possible flush) to finish.
*/ +UNIV_INTERN +void +log_buffer_sync_in_background( +/*==========================*/ + ibool flush) /*!< in: flush the logs to disk */ +{ + lsn_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + log_write_up_to(lsn, LOG_NO_WAIT, flush); +} + +/******************************************************************** + +Tries to establish a big enough margin of free space in the log buffer, such +that a new log entry can be catenated without an immediate need for a flush. */ +static +void +log_flush_margin(void) +/*==================*/ +{ + log_t* log = log_sys; + lsn_t lsn = 0; + + mutex_enter(&(log->mutex)); + + if (log->buf_free > log->max_buf_free) { + + if (log->n_pending_writes > 0) { + /* A flush is running: hope that it will provide enough + free space */ + } else { + lsn = log->lsn; + } + } + + mutex_exit(&(log->mutex)); + + if (lsn) { + log_write_up_to(lsn, LOG_NO_WAIT, FALSE); + } +} + +/****************************************************************//** +Advances the smallest lsn for which there are unflushed dirty blocks in the +buffer pool. NOTE: this function may only be called if the calling thread owns +no synchronization objects! +@return false if there was a flush batch of the same type running, +which means that we could not start this flush batch */ +static +bool +log_preflush_pool_modified_pages( +/*=============================*/ + lsn_t new_oldest) /*!< in: try to advance oldest_modified_lsn + at least to this lsn */ +{ + lsn_t current_oldest; + ulint i; + + if (recv_recovery_on) { + /* If the recovery is running, we must first apply all + log records to their respective file pages to get the + right modify lsn values to these pages: otherwise, there + might be pages on disk which are not yet recovered to the + current lsn, and even after calling this function, we could + not know how up-to-date the disk version of the database is, + and we could not make a new checkpoint on the basis of the + info on the buffer pool only. */ + + recv_apply_hashed_log_recs(TRUE); + } + + if (!buf_page_cleaner_is_active + || (srv_foreground_preflush + == SRV_FOREGROUND_PREFLUSH_SYNC_PREFLUSH) + || (new_oldest == LSN_MAX)) { + + ulint n_pages; + + bool success = buf_flush_list(ULINT_MAX, new_oldest, &n_pages); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + if (!success) { + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + } + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, + n_pages); + + return(success); + } + + ut_ad(srv_foreground_preflush == SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF); + + current_oldest = buf_pool_get_oldest_modification(); + i = 0; + + while (current_oldest < new_oldest && current_oldest) { + + while (!buf_flush_flush_list_in_progress()) { + + /* If a flush list flush by the cleaner thread is not + running, backoff until one is started. */ + os_thread_sleep(ut_rnd_interval(0, 1 << i)); + i++; + i %= 16; + } + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + current_oldest = buf_pool_get_oldest_modification(); + } + + return(current_oldest >= new_oldest || !current_oldest); +} + +/******************************************************//** +Completes a checkpoint. 
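+This runs under log_sys->mutex once the last pending checkpoint header
+write has finished: it advances the checkpoint number, records the new
+last_checkpoint_lsn, and releases the checkpoint_lock acquired by
+log_group_checkpoint().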
*/ +static +void +log_complete_checkpoint(void) +/*=========================*/ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + ut_ad(log_sys->n_pending_checkpoint_writes == 0); + + log_sys->next_checkpoint_no++; + + log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn; + MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, + log_sys->lsn - log_sys->last_checkpoint_lsn); + + rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT); +} + +/******************************************************//** +Completes an asynchronous checkpoint info write i/o to a log file. */ +static +void +log_io_complete_checkpoint(void) +/*============================*/ +{ + mutex_enter(&(log_sys->mutex)); + + ut_ad(log_sys->n_pending_checkpoint_writes > 0); + + log_sys->n_pending_checkpoint_writes--; + MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE); + + if (log_sys->n_pending_checkpoint_writes == 0) { + log_complete_checkpoint(); + } + + mutex_exit(&(log_sys->mutex)); + + /* Wake the redo log watching thread to parse the log up to this + checkpoint. */ + if (srv_track_changed_pages) { + os_event_reset(srv_redo_log_tracked_event); + os_event_set(srv_checkpoint_completed_event); + } +} + +/*******************************************************************//** +Writes info to a checkpoint about a log group. */ +static +void +log_checkpoint_set_nth_group_info( +/*==============================*/ + byte* buf, /*!< in: buffer for checkpoint info */ + ulint n, /*!< in: nth slot */ + lsn_t file_no)/*!< in: archived file number */ +{ + ut_ad(n < LOG_MAX_N_GROUPS); + + mach_write_to_8(buf + LOG_CHECKPOINT_GROUP_ARRAY + + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO, + file_no); +} + +/*******************************************************************//** +Gets info from a checkpoint about a log group. */ +UNIV_INTERN +void +log_checkpoint_get_nth_group_info( +/*==============================*/ + const byte* buf, /*!< in: buffer containing checkpoint info */ + ulint n, /*!< in: nth slot */ + lsn_t* file_no)/*!< out: archived file number */ +{ + ut_ad(n < LOG_MAX_N_GROUPS); + + *file_no = mach_read_from_8(buf + LOG_CHECKPOINT_GROUP_ARRAY + + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO); +} + +/******************************************************//** +Writes the checkpoint info to a log group header. 
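+The checkpoint block written below contains, in this order: the
+checkpoint number, the checkpoint lsn, the lsn offset within the group
+(split into low and high 32 bits), the log buffer size, the archived
+lsn (LSN_MAX when archiving is off), the per-group archived file number
+array, and two checksums: one over the fields preceding
+LOG_CHECKPOINT_CHECKSUM_1 and one over the fields starting at
+LOG_CHECKPOINT_LSN.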
*/ +static +void +log_group_checkpoint( +/*=================*/ + log_group_t* group) /*!< in: log group */ +{ + log_group_t* group2; +#ifdef UNIV_LOG_ARCHIVE + ib_uint64_t archived_lsn; +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t lsn_offset; + ulint write_offset; + ulint fold; + byte* buf; + ulint i; + + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&(log_sys->mutex))); + ut_a(LOG_CHECKPOINT_SIZE <= OS_FILE_LOG_BLOCK_SIZE); + + buf = group->checkpoint_buf; + + mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no); + mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn); + + lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn, + group); + mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32, + lsn_offset & 0xFFFFFFFFUL); + mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32, + lsn_offset >> 32); + + mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size); + +#ifdef UNIV_LOG_ARCHIVE + if (log_sys->archiving_state == LOG_ARCH_OFF) { + archived_lsn = LSN_MAX; + } else { + archived_lsn = log_sys->archived_lsn; + } + + mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn); +#else /* UNIV_LOG_ARCHIVE */ + mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, LSN_MAX); +#endif /* UNIV_LOG_ARCHIVE */ + + for (i = 0; i < LOG_MAX_N_GROUPS; i++) { + log_checkpoint_set_nth_group_info(buf, i, 0); + } + + group2 = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group2) { + log_checkpoint_set_nth_group_info(buf, group2->id, +#ifdef UNIV_LOG_ARCHIVE + group2->archived_file_no +#else /* UNIV_LOG_ARCHIVE */ + 0 +#endif /* UNIV_LOG_ARCHIVE */ + ); + + group2 = UT_LIST_GET_NEXT(log_groups, group2); + } + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold); + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold); + + /* We alternate the physical place of the checkpoint info in the first + log file */ + + if ((log_sys->next_checkpoint_no & 1) == 0) { + write_offset = LOG_CHECKPOINT_1; + } else { + write_offset = LOG_CHECKPOINT_2; + } + + if (log_do_write) { + if (log_sys->n_pending_checkpoint_writes == 0) { + + rw_lock_x_lock_gen(&(log_sys->checkpoint_lock), + LOG_CHECKPOINT); + } + + log_sys->n_pending_checkpoint_writes++; + MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE); + + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + /* We send as the last parameter the group machine address + added with 1, as we want to distinguish between a normal log + file write and a checkpoint field write */ + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, false, group->space_id, 0, + write_offset / UNIV_PAGE_SIZE, + write_offset % UNIV_PAGE_SIZE, + OS_FILE_LOG_BLOCK_SIZE, + buf, ((byte*) group + 1)); + + ut_ad(((ulint) group & 0x1UL) == 0); + } +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_HOTBACKUP +/******************************************************//** +Writes info to a buffer of a log group when log files are created in +backup restoration. 
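+The restored first log file is given a synthetic checkpoint at
+start + LOG_BLOCK_HDR_SIZE, so that a subsequent startup treats the
+restored log as cleanly checkpointed.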
*/ +UNIV_INTERN +void +log_reset_first_header_and_checkpoint( +/*==================================*/ + byte* hdr_buf,/*!< in: buffer which will be written to the + start of the first log file */ + ib_uint64_t start) /*!< in: lsn of the start of the first log file; + we pretend that there is a checkpoint at + start + LOG_BLOCK_HDR_SIZE */ +{ + ulint fold; + byte* buf; + ib_uint64_t lsn; + + mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0); + mach_write_to_8(hdr_buf + LOG_FILE_START_LSN, start); + + lsn = start + LOG_BLOCK_HDR_SIZE; + + /* Write the label of mysqlbackup --restore */ + strcpy((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + "ibbackup "); + ut_sprintf_timestamp((char*) hdr_buf + + (LOG_FILE_WAS_CREATED_BY_HOT_BACKUP + + (sizeof "ibbackup ") - 1)); + buf = hdr_buf + LOG_CHECKPOINT_1; + + mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0); + mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn); + + mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32, + LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE); + mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32, 0); + + mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024); + + mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, LSN_MAX); + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold); + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold); + + /* Starting from InnoDB-3.23.50, we should also write info on + allocated size in the tablespace, but unfortunately we do not + know it here */ +} +#endif /* UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/******************************************************//** +Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */ +UNIV_INTERN +void +log_group_read_checkpoint_info( +/*===========================*/ + log_group_t* group, /*!< in: log group */ + ulint field) /*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0, + field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); +} + +/******************************************************//** +Writes checkpoint info to groups. */ +UNIV_INTERN +void +log_groups_write_checkpoint_info(void) +/*==================================*/ +{ + log_group_t* group; + + ut_ad(mutex_own(&(log_sys->mutex))); + + if (!srv_read_only_mode) { + for (group = UT_LIST_GET_FIRST(log_sys->log_groups); + group; + group = UT_LIST_GET_NEXT(log_groups, group)) { + + log_group_checkpoint(group); + } + } +} + +/******************************************************//** +Makes a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log files. Use log_make_checkpoint_at to flush also the pool. 
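+For example, if the oldest unflushed modification in the buffer pool
+was made at lsn 5000 while the current lsn is 9000, the checkpoint is
+written at lsn 5000 and crash recovery would start parsing the log from
+there; if there are no dirty pages at all, the checkpoint is simply
+taken at the current lsn.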
+@return TRUE if success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+	ibool	sync,		/*!< in: TRUE if synchronous operation is
+				desired */
+	ibool	write_always)	/*!< in: the function normally checks if
+				the new checkpoint would have a greater
+				lsn than the previous one: if not, then no
+				physical write is done; by setting this
+				parameter TRUE, a physical write will always be
+				made to log files */
+{
+	lsn_t	oldest_lsn;
+
+	ut_ad(!srv_read_only_mode);
+
+	if (recv_recovery_is_on()) {
+		recv_apply_hashed_log_recs(TRUE);
+	}
+
+	if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC &&
+	    srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT) {
+		fil_flush_file_spaces(FIL_TABLESPACE);
+	}
+
+	mutex_enter(&(log_sys->mutex));
+
+	ut_ad(!recv_no_log_write);
+	oldest_lsn = log_buf_pool_get_oldest_modification();
+
+	mutex_exit(&(log_sys->mutex));
+
+	/* Because log also contains headers and dummy log records,
+	if the buffer pool contains no dirty buffers, oldest_lsn
+	gets the value log_sys->lsn from the previous function,
+	and we must make sure that the log is flushed up to that
+	lsn. If there are dirty buffers in the buffer pool, then our
+	write-ahead-logging algorithm ensures that the log has been flushed
+	up to oldest_lsn. */
+
+	log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+
+	mutex_enter(&(log_sys->mutex));
+
+	if (!write_always
+	    && log_sys->last_checkpoint_lsn >= oldest_lsn) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		return(TRUE);
+	}
+
+	ut_ad(log_sys->flushed_to_disk_lsn >= oldest_lsn);
+
+	if (log_sys->n_pending_checkpoint_writes > 0) {
+		/* A checkpoint write is running */
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (sync) {
+			/* Wait for the checkpoint write to complete */
+			rw_lock_s_lock(&(log_sys->checkpoint_lock));
+			rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+		}
+
+		return(FALSE);
+	}
+
+	log_sys->next_checkpoint_lsn = oldest_lsn;
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr, "Making checkpoint no "
+			LSN_PF " at lsn " LSN_PF "\n",
+			log_sys->next_checkpoint_no,
+			oldest_lsn);
+	}
+#endif /* UNIV_DEBUG */
+
+	log_groups_write_checkpoint_info();
+
+	MONITOR_INC(MONITOR_NUM_CHECKPOINT);
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (sync) {
+		/* Wait for the checkpoint write to complete */
+		rw_lock_s_lock(&(log_sys->checkpoint_lock));
+		rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+	}
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+	lsn_t	lsn,		/*!< in: make a checkpoint at this or a
+				later lsn, if LSN_MAX, makes
+				a checkpoint at the latest lsn */
+	ibool	write_always)	/*!< in: the function normally checks if
+				the new checkpoint would have a
+				greater lsn than the previous one: if
+				not, then no physical write is done;
+				by setting this parameter TRUE, a
+				physical write will always be made to
+				log files */
+{
+	/* Preflush pages synchronously */
+
+	while (!log_preflush_pool_modified_pages(lsn)) {
+		/* Flush as much as we can */
+	}
+
+	while (!log_checkpoint(TRUE, write_always)) {
+		/* Force a checkpoint */
+	}
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log groups, such
+that a new log entry can be catenated without an immediate need for a
+checkpoint.
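+Three thresholds drive the logic below: when the age of the oldest
+buffer pool modification exceeds max_modified_age_sync, a synchronous
+preflush of modified pages is requested; when the checkpoint age
+exceeds max_checkpoint_age_async, a checkpoint is made asynchronously;
+and when it exceeds max_checkpoint_age, the checkpoint is made
+synchronously.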
NOTE: this function may only be called if the calling thread +owns no synchronization objects! */ +static +void +log_checkpoint_margin(void) +/*=======================*/ +{ + log_t* log = log_sys; + lsn_t age; + lsn_t checkpoint_age; + ib_uint64_t advance; + lsn_t oldest_lsn; + ibool checkpoint_sync; + ibool do_checkpoint; + bool success; +loop: + checkpoint_sync = FALSE; + do_checkpoint = FALSE; + advance = 0; + + mutex_enter(&(log->mutex)); + ut_ad(!recv_no_log_write); + + if (log->check_flush_or_checkpoint == FALSE) { + mutex_exit(&(log->mutex)); + + return; + } + + oldest_lsn = log_buf_pool_get_oldest_modification(); + + age = log->lsn - oldest_lsn; + + if (age > log->max_modified_age_sync) { + + /* A flush is urgent: we have to do a synchronous preflush */ + advance = 2 * (age - log->max_modified_age_sync); + } + + checkpoint_age = log->lsn - log->last_checkpoint_lsn; + + if (checkpoint_age > log->max_checkpoint_age) { + /* A checkpoint is urgent: we do it synchronously */ + + checkpoint_sync = TRUE; + + do_checkpoint = TRUE; + + } else if (checkpoint_age > log->max_checkpoint_age_async) { + /* A checkpoint is not urgent: do it asynchronously */ + + do_checkpoint = TRUE; + + log->check_flush_or_checkpoint = FALSE; + } else { + log->check_flush_or_checkpoint = FALSE; + } + + mutex_exit(&(log->mutex)); + + if (advance) { + lsn_t new_oldest = oldest_lsn + advance; + + success = log_preflush_pool_modified_pages(new_oldest); + + /* If the flush succeeded, this thread has done its part + and can proceed. If it did not succeed, there was another + thread doing a flush at the same time. */ + if (!success) { + mutex_enter(&(log->mutex)); + + log->check_flush_or_checkpoint = TRUE; + + mutex_exit(&(log->mutex)); + goto loop; + } + } + + if (do_checkpoint) { + log_checkpoint(checkpoint_sync, FALSE); + + if (checkpoint_sync) { + + goto loop; + } + } +} + +/******************************************************//** +Reads a specified log segment to a buffer. Optionally releases the log mutex +before the I/O. */ +UNIV_INTERN +void +log_group_read_log_seg( +/*===================*/ + ulint type, /*!< in: LOG_ARCHIVE or LOG_RECOVER */ + byte* buf, /*!< in: buffer where to read */ + log_group_t* group, /*!< in: log group */ + lsn_t start_lsn, /*!< in: read area start */ + lsn_t end_lsn, /*!< in: read area end */ + ibool release_mutex) /*!< in: whether the log_sys->mutex + should be released before the read */ +{ + ulint len; + lsn_t source_offset; + bool sync; + + ut_ad(mutex_own(&(log_sys->mutex))); + + sync = (type == LOG_RECOVER); +loop: + source_offset = log_group_calc_lsn_offset(start_lsn, group); + + ut_a(end_lsn - start_lsn <= ULINT_MAX); + len = (ulint) (end_lsn - start_lsn); + + ut_ad(len != 0); + + if ((source_offset % group->file_size) + len > group->file_size) { + + /* If the above condition is true then len (which is ulint) + is > the expression below, so the typecast is ok */ + len = (ulint) (group->file_size - + (source_offset % group->file_size)); + } + +#ifdef UNIV_LOG_ARCHIVE + if (type == LOG_ARCHIVE) { + + log_sys->n_pending_archive_ios++; + } +#endif /* UNIV_LOG_ARCHIVE */ + + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX); + + if (release_mutex) { + mutex_exit(&(log_sys->mutex)); + } + + fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, + (ulint) (source_offset / UNIV_PAGE_SIZE), + (ulint) (source_offset % UNIV_PAGE_SIZE), + len, buf, (type == LOG_ARCHIVE) ? 
&log_archive_io : NULL); + + start_lsn += len; + buf += len; + + if (start_lsn != end_lsn) { + + if (release_mutex) { + mutex_enter(&(log_sys->mutex)); + } + goto loop; + } +} + +#ifdef UNIV_LOG_ARCHIVE +/******************************************************//** +Generates an archived log file name. */ +UNIV_INTERN +void +log_archived_file_name_gen( +/*=======================*/ + char* buf, /*!< in: buffer where to write */ + ulint buf_len,/*!< in: buffer length */ + ulint id __attribute__((unused)), + /*!< in: group id; + currently we only archive the first group */ + lsn_t file_no)/*!< in: file number */ +{ + ulint dirnamelen; + + dirnamelen = strlen(srv_arch_dir); + + ut_a(buf_len > dirnamelen + + IB_ARCHIVED_LOGS_SERIAL_LEN + + IB_ARCHIVED_LOGS_PREFIX_LEN + 2); + + strcpy(buf, srv_arch_dir); + + if (buf[dirnamelen-1] != SRV_PATH_SEPARATOR) { + buf[dirnamelen++] = SRV_PATH_SEPARATOR; + } + sprintf(buf + dirnamelen, IB_ARCHIVED_LOGS_PREFIX + "%0" IB_TO_STR(IB_ARCHIVED_LOGS_SERIAL_LEN) "llu", + (unsigned long long)file_no); +} + +/******************************************************//** +Get offset within archived log file to continue to write +with. */ +UNIV_INTERN +void +log_archived_get_offset( +/*=====================*/ + log_group_t* group, /*!< in: log group */ + lsn_t file_no, /*!< in: archive log file number */ + lsn_t archived_lsn, /*!< in: last archived LSN */ + lsn_t* offset) /*!< out: offset within archived file */ +{ + char file_name[OS_FILE_MAX_PATH]; + ibool exists; + os_file_type_t type; + + log_archived_file_name_gen(file_name, + sizeof(file_name), group->id, file_no); + + ut_a(os_file_status(file_name, &exists, &type)); + + if (!exists) { + *offset = 0; + return; + } + + *offset = archived_lsn - file_no + LOG_FILE_HDR_SIZE; + + if (archived_lsn != LSN_MAX) { + *offset = archived_lsn - file_no + LOG_FILE_HDR_SIZE; + } else { + /* Archiving was OFF prior startup */ + *offset = 0; + } + + ut_a(group->file_size >= *offset + LOG_FILE_HDR_SIZE); + + return; +} + +/******************************************************//** +Writes a log file header to a log file space. */ +static +void +log_group_archive_file_header_write( +/*================================*/ + log_group_t* group, /*!< in: log group */ + ulint nth_file, /*!< in: header to the nth file in the + archive log file space */ + lsn_t file_no, /*!< in: archived file number */ + ib_uint64_t start_lsn) /*!< in: log file data starts at this + lsn */ +{ + byte* buf; + ulint dest_offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + + ut_a(nth_file < group->n_files); + + buf = *(group->archive_file_header_bufs + nth_file); + + mach_write_to_4(buf + LOG_GROUP_ID, group->id); + mach_write_to_8(buf + LOG_FILE_START_LSN, start_lsn); + mach_write_to_4(buf + LOG_FILE_NO, file_no); + + mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, FALSE); + + dest_offset = nth_file * group->file_size; + + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->archive_space_id, + 0, + dest_offset / UNIV_PAGE_SIZE, + dest_offset % UNIV_PAGE_SIZE, + 2 * OS_FILE_LOG_BLOCK_SIZE, + buf, &log_archive_io); +} + +/******************************************************//** +Writes a log file header to a completed archived log file. 
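+Only the LOG_FILE_ARCH_COMPLETED flag and the end lsn are rewritten in
+place; the rest of the header written earlier by
+log_group_archive_file_header_write() is left untouched.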
*/ +static +void +log_group_archive_completed_header_write( +/*=====================================*/ + log_group_t* group, /*!< in: log group */ + ulint nth_file, /*!< in: header to the nth file in the + archive log file space */ + ib_uint64_t end_lsn) /*!< in: end lsn of the file */ +{ + byte* buf; + ulint dest_offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_a(nth_file < group->n_files); + + buf = *(group->archive_file_header_bufs + nth_file); + + mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, TRUE); + mach_write_to_8(buf + LOG_FILE_END_LSN, end_lsn); + + dest_offset = nth_file * group->file_size + LOG_FILE_ARCH_COMPLETED; + + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->archive_space_id, + 0, + dest_offset / UNIV_PAGE_SIZE, + dest_offset % UNIV_PAGE_SIZE, + OS_FILE_LOG_BLOCK_SIZE, + buf + LOG_FILE_ARCH_COMPLETED, + &log_archive_io); +} + +/******************************************************//** +Does the archive writes for a single log group. */ +static +void +log_group_archive( +/*==============*/ + log_group_t* group) /*!< in: log group */ +{ + os_file_t file_handle; + lsn_t start_lsn; + lsn_t end_lsn; + char name[OS_FILE_MAX_PATH]; + byte* buf; + ulint len; + ibool ret; + lsn_t next_offset; + ulint n_files; + ulint open_mode; + + ut_ad(mutex_own(&(log_sys->mutex))); + + start_lsn = log_sys->archived_lsn; + + ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + + end_lsn = log_sys->next_archived_lsn; + + ut_a(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + + buf = log_sys->archive_buf; + + n_files = 0; + + next_offset = group->archived_offset; +loop: + if ((next_offset % group->file_size == 0) + || (fil_space_get_size(group->archive_space_id) == 0)) { + + /* Add the file to the archive file space; create or open the + file */ + + if (next_offset % group->file_size == 0) { + open_mode = OS_FILE_CREATE; + if (n_files == 0) { + /* Adjust archived_file_no to match start_lsn + which is written in file header as well */ + group->archived_file_no = start_lsn; + } + } else { + open_mode = OS_FILE_OPEN; + } + + log_archived_file_name_gen(name, sizeof(name), group->id, + group->archived_file_no + + n_files * (group->file_size - + LOG_FILE_HDR_SIZE)); + + file_handle = os_file_create(innodb_file_log_key, + name, open_mode, + OS_FILE_AIO, + OS_DATA_FILE, &ret); + + if (!ret && (open_mode == OS_FILE_CREATE)) { + file_handle = os_file_create( + innodb_file_log_key, name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } + + if (!ret) { + fprintf(stderr, + "InnoDB: Cannot create or open" + " archive log file %s.\n" + "InnoDB: Cannot continue operation.\n" + "InnoDB: Check that the log archive" + " directory exists,\n" + "InnoDB: you have access rights to it, and\n" + "InnoDB: there is space available.\n", name); + exit(1); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, "Created archive file %s\n", name); + } +#endif /* UNIV_DEBUG */ + + ret = os_file_close(file_handle); + + ut_a(ret); + + /* Add the archive file as a node to the space */ + + ut_a(fil_node_create(name, group->file_size / UNIV_PAGE_SIZE, + group->archive_space_id, FALSE)); + + if (next_offset % group->file_size == 0) { + log_group_archive_file_header_write( + group, n_files, + group->archived_file_no + + n_files * (group->file_size - LOG_FILE_HDR_SIZE), + start_lsn); + + next_offset += LOG_FILE_HDR_SIZE; + } + } + + len = end_lsn - start_lsn; + + if (group->file_size < (next_offset % group->file_size) + len) { + + len = group->file_size - 
(next_offset % group->file_size); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Archiving starting at lsn " LSN_PF ", len %lu" + " to group %lu\n", + start_lsn, + (ulong) len, (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + + log_sys->n_pending_archive_ios++; + + log_sys->n_log_ios++; + + MONITOR_INC(MONITOR_LOG_IO); + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, false, group->archive_space_id, + 0, + (ulint) (next_offset / UNIV_PAGE_SIZE), + (ulint) (next_offset % UNIV_PAGE_SIZE), + ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, + &log_archive_io); + + start_lsn += len; + next_offset += len; + buf += len; + + if (next_offset % group->file_size == 0) { + n_files++; + } + + if (end_lsn != start_lsn) { + + goto loop; + } + + group->next_archived_file_no = group->archived_file_no + + n_files * (group->file_size - LOG_FILE_HDR_SIZE); + group->next_archived_offset = next_offset % group->file_size; + + ut_a(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0); +} + +/*****************************************************//** +(Writes to the archive of each log group.) Currently, only the first +group is archived. */ +static +void +log_archive_groups(void) +/*====================*/ +{ + log_group_t* group; + + ut_ad(mutex_own(&(log_sys->mutex))); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + log_group_archive(group); +} + +/*****************************************************//** +Completes the archiving write phase for (each log group), currently, +the first log group. */ +static +void +log_archive_write_complete_groups(void) +/*===================================*/ +{ + log_group_t* group; + lsn_t end_offset; + ulint trunc_files; + ulint n_files; + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + ulint i; + + ut_ad(mutex_own(&(log_sys->mutex))); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + group->archived_file_no = group->next_archived_file_no; + group->archived_offset = group->next_archived_offset; + + /* Truncate from the archive file space all but the last + file, or if it has been written full, all files */ + + n_files = (UNIV_PAGE_SIZE + * fil_space_get_size(group->archive_space_id)) + / group->file_size; + ut_ad(n_files > 0); + + end_offset = group->archived_offset; + + if (end_offset % group->file_size == 0) { + + trunc_files = n_files; + } else { + trunc_files = n_files - 1; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes && trunc_files) { + fprintf(stderr, + "Complete file(s) archived to group %lu\n", + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + + /* Calculate the archive file space start lsn */ + start_lsn = log_sys->next_archived_lsn + - (end_offset - LOG_FILE_HDR_SIZE + trunc_files + * (group->file_size - LOG_FILE_HDR_SIZE)); + end_lsn = start_lsn; + + for (i = 0; i < trunc_files; i++) { + + end_lsn += group->file_size - LOG_FILE_HDR_SIZE; + + /* Write a notice to the headers of archived log + files that the file write has been completed */ + + log_group_archive_completed_header_write(group, i, end_lsn); + } + + fil_space_truncate_start(group->archive_space_id, + trunc_files * group->file_size); + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fputs("Archiving writes completed\n", stderr); + } +#endif /* UNIV_DEBUG */ +} + +/******************************************************//** +Completes an archiving i/o. 
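+Archiving is a two-phase operation performed under archive_lock: the
+log segment is first read into the archive buffer (LOG_ARCHIVE_READ)
+and then written out to the archive files (LOG_ARCHIVE_WRITE). The
+function below advances the phase, or completes the operation, once the
+count of pending archive i/os drops to zero.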
*/ +static +void +log_archive_check_completion_low(void) +/*==================================*/ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + if (log_sys->n_pending_archive_ios == 0 + && log_sys->archiving_phase == LOG_ARCHIVE_READ) { + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fputs("Archiving read completed\n", stderr); + } +#endif /* UNIV_DEBUG */ + + /* Archive buffer has now been read in: start archive writes */ + + log_sys->archiving_phase = LOG_ARCHIVE_WRITE; + + log_archive_groups(); + } + + if (log_sys->n_pending_archive_ios == 0 + && log_sys->archiving_phase == LOG_ARCHIVE_WRITE) { + + log_archive_write_complete_groups(); + + log_sys->archived_lsn = log_sys->next_archived_lsn; + + rw_lock_x_unlock_gen(&(log_sys->archive_lock), LOG_ARCHIVE); + } +} + +/******************************************************//** +Completes an archiving i/o. */ +static +void +log_io_complete_archive(void) +/*=========================*/ +{ + log_group_t* group; + + mutex_enter(&(log_sys->mutex)); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + mutex_exit(&(log_sys->mutex)); + + fil_flush(group->archive_space_id); + + mutex_enter(&(log_sys->mutex)); + + ut_ad(log_sys->n_pending_archive_ios > 0); + + log_sys->n_pending_archive_ios--; + + log_archive_check_completion_low(); + + mutex_exit(&(log_sys->mutex)); +} + +/********************************************************************//** +Starts an archiving operation. +@return TRUE if succeed, FALSE if an archiving operation was already running */ +UNIV_INTERN +ibool +log_archive_do( +/*===========*/ + ibool sync, /*!< in: TRUE if synchronous operation is desired */ + ulint* n_bytes)/*!< out: archive log buffer size, 0 if nothing to + archive */ +{ + ibool calc_new_limit; + lsn_t start_lsn; + lsn_t limit_lsn = LSN_MAX; + + calc_new_limit = TRUE; +loop: + mutex_enter(&(log_sys->mutex)); + + switch (log_sys->archiving_state) { + case LOG_ARCH_OFF: +arch_none: + mutex_exit(&(log_sys->mutex)); + + *n_bytes = 0; + + return(TRUE); + case LOG_ARCH_STOPPED: + case LOG_ARCH_STOPPING2: + mutex_exit(&(log_sys->mutex)); + + os_event_wait(log_sys->archiving_on); + + goto loop; + } + + start_lsn = log_sys->archived_lsn; + + if (calc_new_limit) { + ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0); + limit_lsn = start_lsn + log_sys->archive_buf_size; + + *n_bytes = log_sys->archive_buf_size; + + if (limit_lsn >= log_sys->lsn) { + + limit_lsn = ut_uint64_align_down( + log_sys->lsn, OS_FILE_LOG_BLOCK_SIZE); + } + } + + if (log_sys->archived_lsn >= limit_lsn) { + + goto arch_none; + } + + if (log_sys->written_to_all_lsn < limit_lsn) { + + mutex_exit(&(log_sys->mutex)); + + log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE); + + calc_new_limit = FALSE; + + goto loop; + } + + if (log_sys->n_pending_archive_ios > 0) { + /* An archiving operation is running */ + + mutex_exit(&(log_sys->mutex)); + + if (sync) { + rw_lock_s_lock(&(log_sys->archive_lock)); + rw_lock_s_unlock(&(log_sys->archive_lock)); + } + + *n_bytes = log_sys->archive_buf_size; + + return(FALSE); + } + + rw_lock_x_lock_gen(&(log_sys->archive_lock), LOG_ARCHIVE); + + log_sys->archiving_phase = LOG_ARCHIVE_READ; + + log_sys->next_archived_lsn = limit_lsn; + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Archiving from lsn " LSN_PF " to lsn " LSN_PF "\n", + log_sys->archived_lsn, limit_lsn); + } +#endif /* UNIV_DEBUG */ + + /* Read the log segment to the archive buffer */ + + log_group_read_log_seg(LOG_ARCHIVE, log_sys->archive_buf, + 
UT_LIST_GET_FIRST(log_sys->log_groups),
+			       start_lsn, limit_lsn, FALSE);
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (sync) {
+		rw_lock_s_lock(&(log_sys->archive_lock));
+		rw_lock_s_unlock(&(log_sys->archive_lock));
+	}
+
+	*n_bytes = log_sys->archive_buf_size;
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Writes the log contents to the archive at least up to the lsn when this
+function was called. */
+static
+void
+log_archive_all(void)
+/*=================*/
+{
+	lsn_t	present_lsn;
+
+	mutex_enter(&(log_sys->mutex));
+
+	if (log_sys->archiving_state == LOG_ARCH_OFF) {
+		mutex_exit(&(log_sys->mutex));
+
+		return;
+	}
+
+	present_lsn = log_sys->lsn;
+
+	mutex_exit(&(log_sys->mutex));
+
+	log_pad_current_log_block();
+
+	for (;;) {
+
+		ulint	archived_bytes;
+
+		mutex_enter(&(log_sys->mutex));
+
+		if (present_lsn <= log_sys->archived_lsn) {
+
+			mutex_exit(&(log_sys->mutex));
+
+			return;
+		}
+
+		mutex_exit(&(log_sys->mutex));
+
+		log_archive_do(TRUE, &archived_bytes);
+
+		if (archived_bytes == 0) {
+			return;
+		}
+	}
+}
+
+/*****************************************************//**
+Closes the possibly open archive log file of the first group (currently
+only the first group is archived) and, if the file was open and it is
+desired, increments the group file count by 2. */
+static
+void
+log_archive_close_groups(
+/*=====================*/
+	ibool	increment_file_count)	/*!< in: TRUE if we want to increment
+					the file count */
+{
+	log_group_t*	group;
+	ulint		trunc_len;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	if (log_sys->archiving_state == LOG_ARCH_OFF) {
+
+		return;
+	}
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	trunc_len = UNIV_PAGE_SIZE
+		* fil_space_get_size(group->archive_space_id);
+	if (trunc_len > 0) {
+		ut_a(trunc_len == group->file_size);
+
+		/* Write a notice to the headers of archived log
+		files that the file write has been completed */
+
+		log_group_archive_completed_header_write(
+			group, 0, log_sys->archived_lsn);
+
+		fil_space_truncate_start(group->archive_space_id,
+					 trunc_len);
+		if (increment_file_count) {
+			group->archived_offset = 0;
+		}
+
+	}
+}
+
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from 2 higher, so that the archiving will not write
+again to the archived log files which exist when this function returns.
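+The archiving state below moves through LOG_ARCH_ON ->
+LOG_ARCH_STOPPING -> LOG_ARCH_STOPPING2 -> LOG_ARCH_STOPPED, and a
+final forced checkpoint records the new file numbers for any future
+recovery.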
+@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_stop(void) +/*==================*/ +{ + ibool success; + + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state != LOG_ARCH_ON) { + + mutex_exit(&(log_sys->mutex)); + + return(DB_ERROR); + } + + log_sys->archiving_state = LOG_ARCH_STOPPING; + + mutex_exit(&(log_sys->mutex)); + + log_archive_all(); + + mutex_enter(&(log_sys->mutex)); + + log_sys->archiving_state = LOG_ARCH_STOPPING2; + os_event_reset(log_sys->archiving_on); + + mutex_exit(&(log_sys->mutex)); + + /* Wait for a possible archiving operation to end */ + + rw_lock_s_lock(&(log_sys->archive_lock)); + rw_lock_s_unlock(&(log_sys->archive_lock)); + + mutex_enter(&(log_sys->mutex)); + + /* Close all archived log files, incrementing the file count by 2, + if appropriate */ + + log_archive_close_groups(TRUE); + + mutex_exit(&(log_sys->mutex)); + + /* Make a checkpoint, so that if recovery is needed, the file numbers + of new archived log files will start from the right value */ + + success = FALSE; + + while (!success) { + success = log_checkpoint(TRUE, TRUE); + } + + mutex_enter(&(log_sys->mutex)); + + log_sys->archiving_state = LOG_ARCH_STOPPED; + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); +} + +/****************************************************************//** +Starts again archiving which has been stopped. +@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_start(void) +/*===================*/ +{ + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state != LOG_ARCH_STOPPED) { + + mutex_exit(&(log_sys->mutex)); + + return(DB_ERROR); + } + + log_sys->archiving_state = LOG_ARCH_ON; + + os_event_set(log_sys->archiving_on); + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); +} + +/****************************************************************//** +Stop archiving the log so that a gap may occur in the archived log files. +@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_noarchivelog(void) +/*==========================*/ +{ +loop: + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state == LOG_ARCH_STOPPED + || log_sys->archiving_state == LOG_ARCH_OFF) { + + log_sys->archiving_state = LOG_ARCH_OFF; + + os_event_set(log_sys->archiving_on); + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); + } + + mutex_exit(&(log_sys->mutex)); + + log_archive_stop(); + + os_thread_sleep(500000); + + goto loop; +} + +/****************************************************************//** +Start archiving the log so that a gap may occur in the archived log files. +@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_archivelog(void) +/*========================*/ +{ + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state == LOG_ARCH_OFF) { + + log_sys->archiving_state = LOG_ARCH_ON; + + log_sys->archived_lsn + = ut_uint64_align_down(log_sys->lsn, + OS_FILE_LOG_BLOCK_SIZE); + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); + } + + mutex_exit(&(log_sys->mutex)); + + return(DB_ERROR); +} + +/****************************************************************//** +Tries to establish a big enough margin of free space in the log groups, such +that a new log entry can be catenated without an immediate need for +archiving. 
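+This mirrors log_checkpoint_margin(): an age above max_archived_lsn_age
+forces synchronous archiving, while an age above
+max_archived_lsn_age_async lets the archiving proceed asynchronously.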
*/ +static +void +log_archive_margin(void) +/*====================*/ +{ + log_t* log = log_sys; + ulint age; + ibool sync; + ulint dummy; +loop: + mutex_enter(&(log->mutex)); + + if (log->archiving_state == LOG_ARCH_OFF) { + mutex_exit(&(log->mutex)); + + return; + } + + age = log->lsn - log->archived_lsn; + + if (age > log->max_archived_lsn_age) { + + /* An archiving is urgent: we have to do synchronous i/o */ + + sync = TRUE; + + } else if (age > log->max_archived_lsn_age_async) { + + /* An archiving is not urgent: we do asynchronous i/o */ + + sync = FALSE; + } else { + /* No archiving required yet */ + + mutex_exit(&(log->mutex)); + + return; + } + + mutex_exit(&(log->mutex)); + + log_archive_do(sync, &dummy); + + if (sync == TRUE) { + /* Check again that enough was written to the archive */ + + goto loop; + } +} +#endif /* UNIV_LOG_ARCHIVE */ + +/********************************************************************//** +Checks that there is enough free space in the log to start a new query step. +Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this +function may only be called if the calling thread owns no synchronization +objects! */ +UNIV_INTERN +void +log_check_margins(void) +/*===================*/ +{ +loop: + log_flush_margin(); + + log_checkpoint_margin(); + + mutex_enter(&(log_sys->mutex)); + if (log_check_tracking_margin(0)) { + + mutex_exit(&(log_sys->mutex)); + os_thread_sleep(10000); + goto loop; + } + mutex_exit(&(log_sys->mutex)); + +#ifdef UNIV_LOG_ARCHIVE + log_archive_margin(); +#endif /* UNIV_LOG_ARCHIVE */ + + mutex_enter(&(log_sys->mutex)); + ut_ad(!recv_no_log_write); + + if (log_sys->check_flush_or_checkpoint) { + + mutex_exit(&(log_sys->mutex)); + + goto loop; + } + + mutex_exit(&(log_sys->mutex)); +} + +/****************************************************************//** +Makes a checkpoint at the latest lsn and writes it to first page of each +data file in the database, so that we know that the file spaces contain +all modifications up to that lsn. This can only be called at database +shutdown. This function also writes all log in log files to the log archive. */ +UNIV_INTERN +void +logs_empty_and_mark_files_at_shutdown(void) +/*=======================================*/ +{ + lsn_t lsn; + lsn_t tracked_lsn; + ulint count = 0; + ulint total_trx; + ulint pending_io; + enum srv_thread_type active_thd; + const char* thread_name; + ibool server_busy; + + ib_logf(IB_LOG_LEVEL_INFO, "Starting shutdown..."); + + /* Wait until the master thread and all other operations are idle: our + algorithm only works if the server is idle at shutdown */ + + srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; +loop: + os_thread_sleep(100000); + + count++; + + /* We need the monitor threads to stop before we proceed with + a shutdown. */ + + thread_name = srv_any_background_threads_are_active(); + + if (thread_name != NULL) { + /* Print a message every 60 seconds if we are waiting + for the monitor thread to exit. Master and worker + threads check will be done later. */ + + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %s to exit", thread_name); + count = 0; + } + + goto loop; + } + + /* Check that there are no longer transactions, except for + PREPARED ones. We need this wait even for the 'very fast' + shutdown, because the InnoDB layer may have committed or + prepared transactions and we don't want to lose them. 
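+(Transactions in XA PREPARE state are deliberately left in place: they
+survive the shutdown and are resolved after the next startup.)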
*/ + + total_trx = trx_sys_any_active_transactions(); + + if (total_trx > 0) { + + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %lu active transactions to finish", + (ulong) total_trx); + + count = 0; + } + + goto loop; + } + + /* Check that the background threads are suspended */ + + active_thd = srv_get_active_thread_type(); + + if (active_thd != SRV_NONE) { + + if (active_thd == SRV_PURGE) { + srv_purge_wakeup(); + } + + /* The srv_lock_timeout_thread, srv_error_monitor_thread + and srv_monitor_thread should already exit by now. The + only threads to be suspended are the master threads + and worker threads (purge threads). Print the thread + type if any of such threads not in suspended mode */ + if (srv_print_verbose_log && count > 600) { + const char* thread_type = "<null>"; + + switch (active_thd) { + case SRV_NONE: + /* This shouldn't happen because we've + already checked for this case before + entering the if(). We handle it here + to avoid a compiler warning. */ + ut_error; + case SRV_WORKER: + thread_type = "worker threads"; + break; + case SRV_MASTER: + thread_type = "master thread"; + break; + case SRV_PURGE: + thread_type = "purge thread"; + break; + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %s to be suspended", + thread_type); + count = 0; + } + + goto loop; + } + + /* At this point only page_cleaner should be active. We wait + here to let it complete the flushing of the buffer pools + before proceeding further. */ + srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE; + count = 0; + while (buf_page_cleaner_is_active) { + ++count; + os_thread_sleep(100000); + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for page_cleaner to " + "finish flushing of buffer pool"); + count = 0; + } + } + + mutex_enter(&log_sys->mutex); + server_busy = log_sys->n_pending_checkpoint_writes +#ifdef UNIV_LOG_ARCHIVE + || log_sys->n_pending_archive_ios +#endif /* UNIV_LOG_ARCHIVE */ + || log_sys->n_pending_writes; + mutex_exit(&log_sys->mutex); + + if (server_busy) { + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Pending checkpoint_writes: %lu. " + "Pending log flush writes: %lu", + (ulong) log_sys->n_pending_checkpoint_writes, + (ulong) log_sys->n_pending_writes); + count = 0; + } + goto loop; + } + + pending_io = buf_pool_check_no_pending_io(); + + if (pending_io) { + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for %lu buffer page I/Os to complete", + (ulong) pending_io); + count = 0; + } + + goto loop; + } + +#ifdef UNIV_LOG_ARCHIVE + log_archive_all(); +#endif /* UNIV_LOG_ARCHIVE */ + if (srv_fast_shutdown == 2) { + if (!srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_INFO, + "MySQL has requested a very fast shutdown " + "without flushing the InnoDB buffer pool to " + "data files. At the next mysqld startup " + "InnoDB will do a crash recovery!"); + + /* In this fastest shutdown we do not flush the + buffer pool: + + it is essentially a 'crash' of the InnoDB server. + Make sure that the log is all flushed to disk, so + that we can recover all committed transactions in + a crash recovery. We must not write the lsn stamps + to the data files, since at a startup InnoDB deduces + from the stamps if the previous shutdown was clean. 
*/ + + log_buffer_flush_to_disk(); + + /* Check that the background threads stay suspended */ + thread_name = srv_any_background_threads_are_active(); + + if (thread_name != NULL) { + ib_logf(IB_LOG_LEVEL_WARN, + "Background thread %s woke up " + "during shutdown", thread_name); + goto loop; + } + } + + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + + /* Wake the log tracking thread which will then immediatelly + quit because of srv_shutdown_state value */ + if (srv_track_changed_pages) { + os_event_reset(srv_redo_log_tracked_event); + os_event_set(srv_checkpoint_completed_event); + } + + fil_close_all_files(); + + thread_name = srv_any_background_threads_are_active(); + + ut_a(!thread_name); + + return; + } + + if (!srv_read_only_mode) { + log_make_checkpoint_at(LSN_MAX, TRUE); + } + + mutex_enter(&log_sys->mutex); + + tracked_lsn = log_get_tracked_lsn(); + + lsn = log_sys->lsn; + + ut_ad(srv_force_recovery != SRV_FORCE_NO_LOG_REDO + || lsn == log_sys->last_checkpoint_lsn + LOG_BLOCK_HDR_SIZE); + + + if ((srv_force_recovery != SRV_FORCE_NO_LOG_REDO + && lsn != log_sys->last_checkpoint_lsn) + || (srv_track_changed_pages + && (tracked_lsn != log_sys->last_checkpoint_lsn)) +#ifdef UNIV_LOG_ARCHIVE + || (srv_log_archive_on + && lsn != log_sys->archived_lsn + LOG_BLOCK_HDR_SIZE) +#endif /* UNIV_LOG_ARCHIVE */ + ) { + + mutex_exit(&log_sys->mutex); + + goto loop; + } + +#ifdef UNIV_LOG_ARCHIVE + + log_archive_close_groups(TRUE); +#endif /* UNIV_LOG_ARCHIVE */ + + mutex_exit(&log_sys->mutex); + + /* Check that the background threads stay suspended */ + thread_name = srv_any_background_threads_are_active(); + if (thread_name != NULL) { + ib_logf(IB_LOG_LEVEL_WARN, + "Background thread %s woke up during shutdown", + thread_name); + + goto loop; + } + + if (!srv_read_only_mode) { + fil_flush_file_spaces(FIL_TABLESPACE); + fil_flush_file_spaces(FIL_LOG); + } + + /* The call fil_write_flushed_lsn_to_data_files() will pass the buffer + pool: therefore it is essential that the buffer pool has been + completely flushed to disk! (We do not call fil_write... if the + 'very fast' shutdown is enabled.) */ + + if (!buf_all_freed()) { + + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for dirty buffer pages to be flushed"); + count = 0; + } + + goto loop; + } + + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + + /* Signal the log following thread to quit */ + if (srv_track_changed_pages) { + os_event_reset(srv_redo_log_tracked_event); + os_event_set(srv_checkpoint_completed_event); + } + + /* Make some checks that the server really is quiet */ + srv_thread_type type = srv_get_active_thread_type(); + ut_a(type == SRV_NONE); + + bool freed = buf_all_freed(); + ut_a(freed); + + ut_a(lsn == log_sys->lsn); + + if (lsn < srv_start_lsn) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Log sequence number at shutdown " LSN_PF " " + "is lower than at startup " LSN_PF "!", + lsn, srv_start_lsn); + } + + srv_shutdown_lsn = lsn; + + if (!srv_read_only_mode) { + fil_write_flushed_lsn_to_data_files(lsn, 0); + + fil_flush_file_spaces(FIL_TABLESPACE); + } + + fil_close_all_files(); + + /* Make some checks that the server really is quiet */ + type = srv_get_active_thread_type(); + ut_a(type == SRV_NONE); + + freed = buf_all_freed(); + ut_a(freed); + + ut_a(lsn == log_sys->lsn); +} + +#ifdef UNIV_LOG_DEBUG +/******************************************************//** +Checks by parsing that the catenated log segment for a single mtr is +consistent. 
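+The check below re-runs the recovery scanner, recv_scan_log_recs(),
+over an aligned copy of the segment and asserts that exactly the
+expected lsn range is recovered.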
*/ +UNIV_INTERN +ibool +log_check_log_recs( +/*===============*/ + const byte* buf, /*!< in: pointer to the start of + the log segment in the + log_sys->buf log buffer */ + ulint len, /*!< in: segment length in bytes */ + ib_uint64_t buf_start_lsn) /*!< in: buffer start lsn */ +{ + ib_uint64_t contiguous_lsn; + ib_uint64_t scanned_lsn; + const byte* start; + const byte* end; + byte* buf1; + byte* scan_buf; + + ut_ad(mutex_own(&(log_sys->mutex))); + + if (len == 0) { + + return(TRUE); + } + + start = ut_align_down(buf, OS_FILE_LOG_BLOCK_SIZE); + end = ut_align(buf + len, OS_FILE_LOG_BLOCK_SIZE); + + buf1 = mem_alloc((end - start) + OS_FILE_LOG_BLOCK_SIZE); + scan_buf = ut_align(buf1, OS_FILE_LOG_BLOCK_SIZE); + + ut_memcpy(scan_buf, start, end - start); + + recv_scan_log_recs((buf_pool_get_n_pages() + - (recv_n_pool_free_frames * srv_buf_pool_instances)) + * UNIV_PAGE_SIZE, FALSE, scan_buf, end - start, + ut_uint64_align_down(buf_start_lsn, + OS_FILE_LOG_BLOCK_SIZE), + &contiguous_lsn, &scanned_lsn); + + ut_a(scanned_lsn == buf_start_lsn + len); + ut_a(recv_sys->recovered_lsn == scanned_lsn); + + mem_free(buf1); + + return(TRUE); +} +#endif /* UNIV_LOG_DEBUG */ + +/******************************************************//** +Peeks the current lsn. +@return TRUE if success, FALSE if could not get the log system mutex */ +UNIV_INTERN +ibool +log_peek_lsn( +/*=========*/ + lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */ +{ + if (0 == mutex_enter_nowait(&(log_sys->mutex))) { + *lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + return(TRUE); + } + + return(FALSE); +} + +/******************************************************//** +Prints info of the log. */ +UNIV_INTERN +void +log_print( +/*======*/ + FILE* file) /*!< in: file where to print */ +{ + double time_elapsed; + time_t current_time; + + mutex_enter(&(log_sys->mutex)); + + fprintf(file, + "Log sequence number " LSN_PF "\n" + "Log flushed up to " LSN_PF "\n" + "Pages flushed up to " LSN_PF "\n" + "Last checkpoint at " LSN_PF "\n", + log_sys->lsn, + log_sys->flushed_to_disk_lsn, + log_buf_pool_get_oldest_modification(), + log_sys->last_checkpoint_lsn); + + fprintf(file, + "Max checkpoint age " LSN_PF "\n" + "Checkpoint age target " LSN_PF "\n" + "Modified age " LSN_PF "\n" + "Checkpoint age " LSN_PF "\n", + log_sys->max_checkpoint_age, + log_sys->max_checkpoint_age_async, + log_sys->lsn -log_buf_pool_get_oldest_modification(), + log_sys->lsn - log_sys->last_checkpoint_lsn); + + current_time = time(NULL); + + time_elapsed = difftime(current_time, + log_sys->last_printout_time); + + if (time_elapsed <= 0) { + time_elapsed = 1; + } + + fprintf(file, + "%lu pending log writes, %lu pending chkp writes\n" + "%lu log i/o's done, %.2f log i/o's/second\n", + (ulong) log_sys->n_pending_writes, + (ulong) log_sys->n_pending_checkpoint_writes, + (ulong) log_sys->n_log_ios, + ((double)(log_sys->n_log_ios - log_sys->n_log_ios_old) + / time_elapsed)); + + if (srv_track_changed_pages) { + + /* The maximum tracked LSN age is equal to the maximum + checkpoint age */ + fprintf(file, + "Log tracking enabled\n" + "Log tracked up to " LSN_PF "\n" + "Max tracked LSN age " LSN_PF "\n", + log_get_tracked_lsn(), + log_sys->max_checkpoint_age); + } + + log_sys->n_log_ios_old = log_sys->n_log_ios; + log_sys->last_printout_time = current_time; + + mutex_exit(&(log_sys->mutex)); +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. 
*/ +UNIV_INTERN +void +log_refresh_stats(void) +/*===================*/ +{ + log_sys->n_log_ios_old = log_sys->n_log_ios; + log_sys->last_printout_time = time(NULL); +} + +/********************************************************//** +Closes a log group. */ +static +void +log_group_close( +/*===========*/ + log_group_t* group) /* in,own: log group to close */ +{ + ulint i; + + for (i = 0; i < group->n_files; i++) { + mem_free(group->file_header_bufs_ptr[i]); +#ifdef UNIV_LOG_ARCHIVE + mem_free(group->archive_file_header_bufs_ptr[i]); +#endif /* UNIV_LOG_ARCHIVE */ + } + + mem_free(group->file_header_bufs_ptr); + mem_free(group->file_header_bufs); + +#ifdef UNIV_LOG_ARCHIVE + mem_free(group->archive_file_header_bufs_ptr); + mem_free(group->archive_file_header_bufs); +#endif /* UNIV_LOG_ARCHIVE */ + + mem_free(group->checkpoint_buf_ptr); + + mem_free(group); +} + +/********************************************************//** +Closes all log groups. */ +UNIV_INTERN +void +log_group_close_all(void) +/*=====================*/ +{ + log_group_t* group; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) { + log_group_t* prev_group = group; + + group = UT_LIST_GET_NEXT(log_groups, group); + UT_LIST_REMOVE(log_groups, log_sys->log_groups, prev_group); + + log_group_close(prev_group); + } +} + +/********************************************************//** +Shutdown the log system but do not release all the memory. */ +UNIV_INTERN +void +log_shutdown(void) +/*==============*/ +{ + log_group_close_all(); + + mem_free(log_sys->buf_ptr); + log_sys->buf_ptr = NULL; + log_sys->buf = NULL; + mem_free(log_sys->checkpoint_buf_ptr); + log_sys->checkpoint_buf_ptr = NULL; + log_sys->checkpoint_buf = NULL; + mem_free(log_sys->archive_buf_ptr); + log_sys->archive_buf_ptr = NULL; + log_sys->archive_buf = NULL; + + os_event_free(log_sys->no_flush_event); + os_event_free(log_sys->one_flushed_event); + + rw_lock_free(&log_sys->checkpoint_lock); + + mutex_free(&log_sys->mutex); + +#ifdef UNIV_LOG_ARCHIVE + rw_lock_free(&log_sys->archive_lock); + os_event_free(log_sys->archiving_on); +#endif /* UNIV_LOG_ARCHIVE */ + +#ifdef UNIV_LOG_DEBUG + recv_sys_debug_free(); +#endif + + recv_sys_close(); +} + +/********************************************************//** +Free the log system data structures. */ +UNIV_INTERN +void +log_mem_free(void) +/*==============*/ +{ + if (log_sys != NULL) { + recv_sys_mem_free(); + mem_free(log_sys); + + log_sys = NULL; + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc new file mode 100644 index 00000000000..00ff3bf07ed --- /dev/null +++ b/storage/xtradb/log/log0online.cc @@ -0,0 +1,1867 @@ +/***************************************************************************** + +Copyright (c) 2011-2012 Percona Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/**************************************************//** +@file log/log0online.cc +Online database log parsing for changed page tracking + +*******************************************************/ + +#include "log0online.h" + +#include "my_dbug.h" + +#include "log0recv.h" +#include "mach0data.h" +#include "mtr0log.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0sys.h" +#include "ut0rbt.h" + +#ifdef __WIN__ +/* error LNK2001: unresolved external symbol _debug_sync_C_callback_ptr */ +# define DEBUG_SYNC_C(dummy) ((void) 0) +#else +# include "m_string.h" /* for my_sys.h */ +# include "my_sys.h" /* DEBUG_SYNC_C */ +#endif + +enum { FOLLOW_SCAN_SIZE = 4 * (UNIV_PAGE_SIZE_MAX) }; + +#ifdef UNIV_PFS_MUTEX +/* Key to register log_bmp_sys->mutex with PFS */ +UNIV_INTERN mysql_pfs_key_t log_bmp_sys_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/** Log parsing and bitmap output data structure */ +struct log_bitmap_struct { + byte* read_buf_ptr; /*!< Unaligned log read buffer */ + byte* read_buf; /*!< log read buffer */ + byte parse_buf[RECV_PARSING_BUF_SIZE]; + /*!< log parse buffer */ + byte* parse_buf_end; /*!< parse buffer position where the + next read log data should be copied to. + If the previous log records were fully + parsed, it points to the start, + otherwise points immediatelly past the + end of the incomplete log record. */ + char bmp_file_home[FN_REFLEN]; + /*!< directory for bitmap files */ + log_online_bitmap_file_t out; /*!< The current bitmap file */ + ulint out_seq_num; /*!< the bitmap file sequence number */ + lsn_t start_lsn; /*!< the LSN of the next unparsed + record and the start of the next LSN + interval to be parsed. */ + lsn_t end_lsn; /*!< the end of the LSN interval to be + parsed, equal to the next checkpoint + LSN at the time of parse */ + lsn_t next_parse_lsn; /*!< the LSN of the next unparsed + record in the current parse */ + ib_rbt_t* modified_pages; /*!< the current modified page set, + organized as the RB-tree with the keys + of (space, 4KB-block-start-page-id) + pairs */ + ib_rbt_node_t* page_free_list; /*!< Singly-linked list of freed nodes + of modified_pages tree for later + reuse. Nodes are linked through + ib_rbt_node_t.left as this field has + both the correct type and the tree does + not mind its overwrite during + rbt_next() tree traversal. */ + ib_mutex_t mutex; /*!< mutex protecting all the fields.*/ +}; + +/* The log parsing and bitmap output struct instance */ +static struct log_bitmap_struct* log_bmp_sys; + +/** File name stem for bitmap files. */ +static const char* bmp_file_name_stem = "ib_modified_log_"; + +/** File name template for bitmap files. The 1st format tag is a directory +name, the 2nd tag is the stem, the 3rd tag is a file sequence number, the 4th +tag is the start LSN for the file. */ +static const char* bmp_file_name_template = "%s%s%lu_%llu.xdb"; + +/* On server startup with empty database srv_start_lsn == 0, in +which case the first LSN of actual log records will be this. */ +#define MIN_TRACKED_LSN ((LOG_START_LSN) + (LOG_BLOCK_HDR_SIZE)) + +/* Tests if num bit of bitmap is set */ +#define IS_BIT_SET(bitmap, num) \ + (*((bitmap) + ((num) >> 3)) & (1UL << ((num) & 7UL))) + +/** The bitmap file block size in bytes. 
All writes will be multiples of this.
+ */
+enum {
+	MODIFIED_PAGE_BLOCK_SIZE = 4096
+};
+
+
+/** Offsets in a file bitmap block */
+enum {
+	MODIFIED_PAGE_IS_LAST_BLOCK = 0,/* 1 if last block in the current
+					write, 0 otherwise. */
+	MODIFIED_PAGE_START_LSN = 4,	/* The starting tracked LSN of this and
+					other blocks in the same write */
+	MODIFIED_PAGE_END_LSN = 12,	/* The ending tracked LSN of this and
+					other blocks in the same write */
+	MODIFIED_PAGE_SPACE_ID = 20,	/* The space ID of tracked pages in
+					this block */
+	MODIFIED_PAGE_1ST_PAGE_ID = 24,	/* The page ID of the first tracked
+					page in this block */
+	MODIFIED_PAGE_BLOCK_UNUSED_1 = 28,/* Unused in order to align the start
+					of bitmap at 8 byte boundary */
+	MODIFIED_PAGE_BLOCK_BITMAP = 32,/* Start of the bitmap itself */
+	MODIFIED_PAGE_BLOCK_UNUSED_2 = MODIFIED_PAGE_BLOCK_SIZE - 8,
+					/* Unused in order to align the end of
+					bitmap at 8 byte boundary */
+	MODIFIED_PAGE_BLOCK_CHECKSUM = MODIFIED_PAGE_BLOCK_SIZE - 4
+					/* The checksum of the current block */
+};
+
+/** Length of the bitmap data in a block in bytes */
+enum { MODIFIED_PAGE_BLOCK_BITMAP_LEN
+       = MODIFIED_PAGE_BLOCK_UNUSED_2 - MODIFIED_PAGE_BLOCK_BITMAP };
+
+/** Length of the bitmap data in a block in page ids */
+enum { MODIFIED_PAGE_BLOCK_ID_COUNT = MODIFIED_PAGE_BLOCK_BITMAP_LEN * 8 };
+
+/****************************************************************//**
+Provide a comparison function for the RB-tree of (space,
+block_start_page) pairs. The actual implementation does not matter as
+long as the ordering is total.
+@return -1 if p1 < p2, 0 if p1 == p2, 1 if p1 > p2
+*/
+static
+int
+log_online_compare_bmp_keys(
+/*========================*/
+	const void* p1,	/*!<in: 1st key to compare */
+	const void* p2)	/*!<in: 2nd key to compare */
+{
+	const byte *k1 = (const byte *)p1;
+	const byte *k2 = (const byte *)p2;
+
+	ulint k1_space = mach_read_from_4(k1 + MODIFIED_PAGE_SPACE_ID);
+	ulint k2_space = mach_read_from_4(k2 + MODIFIED_PAGE_SPACE_ID);
+	if (k1_space == k2_space) {
+		ulint k1_start_page
+			= mach_read_from_4(k1 + MODIFIED_PAGE_1ST_PAGE_ID);
+		ulint k2_start_page
+			= mach_read_from_4(k2 + MODIFIED_PAGE_1ST_PAGE_ID);
+		return k1_start_page < k2_start_page
+			? -1 : k1_start_page > k2_start_page ? 1 : 0;
+	}
+	return k1_space < k2_space ? -1 : 1;
+}
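+
+/* Editor's illustration, not part of the original patch: with the block
+layout above, MODIFIED_PAGE_BLOCK_BITMAP_LEN = (4096 - 8) - 32 = 4056
+bytes, so one bitmap block covers MODIFIED_PAGE_BLOCK_ID_COUNT
+= 4056 * 8 = 32448 pages. A log record for (space, page_no) therefore
+maps to a tree key and bit position as follows, mirroring the arithmetic
+in log_online_set_page_bit() below:
+
+	block_start_page = page_no / 32448 * 32448;
+	byte_in_bitmap   = (page_no - block_start_page) / 8;
+	bit_in_byte      = page_no % 8;
+
+For example, page_no 70000 in any space falls into the block starting at
+page 64896, and is represented by bit 0 of bitmap byte 638 in that
+block. */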
+
+/****************************************************************//**
+Set a bit for a tracked page in the bitmap. Expand the bitmap tree as
+necessary. */
+static
+void
+log_online_set_page_bit(
+/*====================*/
+	ulint	space,	/*!<in: log record space id */
+	ulint	page_no)/*!<in: log record page id */
+{
+	ulint		block_start_page;
+	ulint		block_pos;
+	uint		bit_pos;
+	ib_rbt_bound_t	tree_search_pos;
+	byte		search_page[MODIFIED_PAGE_BLOCK_SIZE];
+	byte		*page_ptr;
+
+	ut_ad(mutex_own(&log_bmp_sys->mutex));
+
+	ut_a(space != ULINT_UNDEFINED);
+	ut_a(page_no != ULINT_UNDEFINED);
+
+	block_start_page = page_no / MODIFIED_PAGE_BLOCK_ID_COUNT
+		* MODIFIED_PAGE_BLOCK_ID_COUNT;
+	block_pos = block_start_page ? (page_no % block_start_page / 8)
+		: (page_no / 8);
+	bit_pos = page_no % 8;
+
+	mach_write_to_4(search_page + MODIFIED_PAGE_SPACE_ID, space);
+	mach_write_to_4(search_page + MODIFIED_PAGE_1ST_PAGE_ID,
+			block_start_page);
+
+	if (!rbt_search(log_bmp_sys->modified_pages, &tree_search_pos,
+			search_page)) {
+		page_ptr = rbt_value(byte, tree_search_pos.last);
+	}
+	else {
+		ib_rbt_node_t *new_node;
+
+		if (log_bmp_sys->page_free_list) {
+			new_node = log_bmp_sys->page_free_list;
+			log_bmp_sys->page_free_list = new_node->left;
+		}
+		else {
+			new_node = static_cast<ib_rbt_node_t *>
+				(ut_malloc
+				 (SIZEOF_NODE(log_bmp_sys->modified_pages)));
+		}
+		memset(new_node, 0, SIZEOF_NODE(log_bmp_sys->modified_pages));
+
+		page_ptr = rbt_value(byte, new_node);
+		mach_write_to_4(page_ptr + MODIFIED_PAGE_SPACE_ID, space);
+		mach_write_to_4(page_ptr + MODIFIED_PAGE_1ST_PAGE_ID,
+				block_start_page);
+
+		rbt_add_preallocated_node(log_bmp_sys->modified_pages,
+					  &tree_search_pos, new_node);
+	}
+	page_ptr[MODIFIED_PAGE_BLOCK_BITMAP + block_pos] |= (1U << bit_pos);
+}
+
+/****************************************************************//**
+Calculate a bitmap block checksum. Algorithm borrowed from
+log_block_calc_checksum.
+@return checksum */
+UNIV_INLINE
+ulint
+log_online_calc_checksum(
+/*=====================*/
+	const byte*	block)	/*!<in: bitmap block */
+{
+	ulint	sum;
+	ulint	sh;
+	ulint	i;
+
+	sum = 1;
+	sh = 0;
+
+	for (i = 0; i < MODIFIED_PAGE_BLOCK_CHECKSUM; i++) {
+
+		ulint	b = block[i];
+		sum &= 0x7FFFFFFFUL;
+		sum += b;
+		sum += b << sh;
+		sh++;
+		if (sh > 24) {
+			sh = 0;
+		}
+	}
+
+	return sum;
+}
+
+/****************************************************************//**
+Read one bitmap data page and check it for corruption.
+
+@return TRUE if page read OK, FALSE if I/O error */
+static
+ibool
+log_online_read_bitmap_page(
+/*========================*/
+	log_online_bitmap_file_t	*bitmap_file,	/*!<in/out: bitmap
+							file */
+	byte				*page,		/*!<out: read page.
+							Must be at least
+							MODIFIED_PAGE_BLOCK_SIZE
+							bytes long */
+	ibool				*checksum_ok)	/*!<out: TRUE if page
+							checksum OK */
+{
+	ulint	checksum;
+	ulint	actual_checksum;
+	ibool	success;
+
+	ut_a(bitmap_file->size >= MODIFIED_PAGE_BLOCK_SIZE);
+	ut_a(bitmap_file->offset
+	     <= bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE);
+	ut_a(bitmap_file->offset % MODIFIED_PAGE_BLOCK_SIZE == 0);
+
+	success = os_file_read(bitmap_file->file, page, bitmap_file->offset,
+			       MODIFIED_PAGE_BLOCK_SIZE);
+
+	if (UNIV_UNLIKELY(!success)) {
+
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"failed reading changed page bitmap file \'%s\'\n",
+			bitmap_file->name);
+		return FALSE;
+	}
+
+	bitmap_file->offset += MODIFIED_PAGE_BLOCK_SIZE;
+	ut_ad(bitmap_file->offset <= bitmap_file->size);
+
+	checksum = mach_read_from_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM);
+	actual_checksum = log_online_calc_checksum(page);
+	*checksum_ok = (checksum == actual_checksum);
+
+	return TRUE;
+}
+
+/****************************************************************//**
+Get the last fully tracked LSN from the bitmap file by reading
+backwards until a correct end page is found. Detects incomplete
+writes and corrupted data. Sets the start output position for the
+written bitmap data.
+
+Multiple bitmap files are handled using the following assumptions:
+1) Only the last file might be corrupted. If no good data was found
+in the last file, assume that the next to last file is OK.
This assumption +does not limit crash recovery capability in any way. +2) If the whole of the last file was corrupted, assume that the start LSN in +its name is correct and use it for (re-)tracking start. + +@return the last fully tracked LSN */ +static +lsn_t +log_online_read_last_tracked_lsn(void) +/*==================================*/ +{ + byte page[MODIFIED_PAGE_BLOCK_SIZE]; + ibool is_last_page = FALSE; + ibool checksum_ok = FALSE; + lsn_t result; + os_offset_t read_offset = log_bmp_sys->out.offset; + + while (!checksum_ok && read_offset > 0 && !is_last_page) + { + read_offset -= MODIFIED_PAGE_BLOCK_SIZE; + log_bmp_sys->out.offset = read_offset; + + if (!log_online_read_bitmap_page(&log_bmp_sys->out, page, + &checksum_ok)) { + checksum_ok = FALSE; + result = 0; + break; + } + + if (checksum_ok) { + is_last_page + = mach_read_from_4 + (page + MODIFIED_PAGE_IS_LAST_BLOCK); + } else { + + ib_logf(IB_LOG_LEVEL_WARN, + "corruption detected in \'%s\' at offset " + UINT64PF "\n", + log_bmp_sys->out.name, read_offset); + } + }; + + result = (checksum_ok && is_last_page) + ? mach_read_from_8(page + MODIFIED_PAGE_END_LSN) : 0; + + /* Truncate the output file to discard the corrupted bitmap data, if + any */ + if (!os_file_set_eof_at(log_bmp_sys->out.file, + log_bmp_sys->out.offset)) { + ib_logf(IB_LOG_LEVEL_WARN, + "failed truncating changed page bitmap file \'%s\' to " + UINT64PF " bytes\n", + log_bmp_sys->out.name, log_bmp_sys->out.offset); + result = 0; + } + return result; +} + +/****************************************************************//** +Safely write the log_sys->tracked_lsn value. Uses atomic operations +if available, otherwise this field is protected with the log system +mutex. The reader counterpart function is log_get_tracked_lsn() in +log0log.c. */ +UNIV_INLINE +void +log_set_tracked_lsn( +/*================*/ + lsn_t tracked_lsn) /*!<in: new value */ +{ +#ifdef HAVE_ATOMIC_BUILTINS_64 + /* Single writer, no data race here */ + lsn_t old_value = os_atomic_increment_uint64(&log_sys->tracked_lsn, 0); + (void) os_atomic_increment_uint64(&log_sys->tracked_lsn, + tracked_lsn - old_value); +#else + mutex_enter(&log_sys->mutex); + log_sys->tracked_lsn = tracked_lsn; + mutex_exit(&log_sys->mutex); +#endif +} + +/*********************************************************************//** +Check if missing, if any, LSN interval can be read and tracked using the +current LSN value, the LSN value where the tracking stopped, and the log group +capacity. + +@return TRUE if the missing interval can be tracked or if there's no missing +data. */ +static +ibool +log_online_can_track_missing( +/*=========================*/ + lsn_t last_tracked_lsn, /*!<in: last tracked LSN */ + lsn_t tracking_start_lsn) /*!<in: current LSN */ +{ + /* last_tracked_lsn might be < MIN_TRACKED_LSN in the case of empty + bitmap file, handle this too. */ + last_tracked_lsn = ut_max(last_tracked_lsn, MIN_TRACKED_LSN); + + if (last_tracked_lsn > tracking_start_lsn) { + ib_logf(IB_LOG_LEVEL_ERROR, + "last tracked LSN " LSN_PF " is ahead of tracking " + "start LSN " LSN_PF ". 
This can be caused by " + "mismatched bitmap files.\n", + last_tracked_lsn, tracking_start_lsn); + exit(1); + } + + return (last_tracked_lsn == tracking_start_lsn) + || (log_sys->lsn - last_tracked_lsn + <= log_sys->log_group_capacity); +} + + +/****************************************************************//** +Diagnose a gap in tracked LSN range on server startup due to crash or +very fast shutdown and try to close it by tracking the data +immediatelly, if possible. */ +static +void +log_online_track_missing_on_startup( +/*================================*/ + lsn_t last_tracked_lsn, /*!<in: last tracked LSN read from the + bitmap file */ + lsn_t tracking_start_lsn) /*!<in: last checkpoint LSN of the + current server startup */ +{ + ut_ad(last_tracked_lsn != tracking_start_lsn); + + ib_logf(IB_LOG_LEVEL_WARN, "last tracked LSN in \'%s\' is " LSN_PF + ", but the last checkpoint LSN is " LSN_PF ". This might be " + "due to a server crash or a very fast shutdown. ", + log_bmp_sys->out.name, last_tracked_lsn, tracking_start_lsn); + + /* See if we can fully recover the missing interval */ + if (log_online_can_track_missing(last_tracked_lsn, + tracking_start_lsn)) { + + ib_logf(IB_LOG_LEVEL_INFO, + "reading the log to advance the last tracked LSN.\n"); + + log_bmp_sys->start_lsn = ut_max(last_tracked_lsn, + MIN_TRACKED_LSN); + log_set_tracked_lsn(log_bmp_sys->start_lsn); + if (!log_online_follow_redo_log()) { + exit(1); + } + ut_ad(log_bmp_sys->end_lsn >= tracking_start_lsn); + + ib_logf(IB_LOG_LEVEL_INFO, + "continuing tracking changed pages from LSN " LSN_PF + "\n", log_bmp_sys->end_lsn); + } + else { + ib_logf(IB_LOG_LEVEL_WARN, + "the age of last tracked LSN exceeds log capacity, " + "tracking-based incremental backups will work only " + "from the higher LSN!\n"); + + log_bmp_sys->end_lsn = log_bmp_sys->start_lsn + = tracking_start_lsn; + log_set_tracked_lsn(log_bmp_sys->start_lsn); + + ib_logf(IB_LOG_LEVEL_INFO, + "starting tracking changed pages from LSN " LSN_PF + "\n", log_bmp_sys->end_lsn); + } +} + +/*********************************************************************//** +Format a bitmap output file name to log_bmp_sys->out.name. */ +static +void +log_online_make_bitmap_name( +/*=========================*/ + lsn_t start_lsn) /*!< in: the start LSN name part */ +{ + ut_snprintf(log_bmp_sys->out.name, FN_REFLEN, bmp_file_name_template, + log_bmp_sys->bmp_file_home, bmp_file_name_stem, + log_bmp_sys->out_seq_num, start_lsn); +} + +/*********************************************************************//** +Check if an old file that has the name of a new bitmap file we are about to +create should be overwritten. */ +static +ibool +log_online_should_overwrite( +/*========================*/ + const char *path) /*!< in: path to file */ +{ + dberr_t err; + os_file_stat_t file_info; + + /* Currently, it's OK to overwrite 0-sized files only */ + err = os_file_get_status(path, &file_info, false); + return err == DB_SUCCESS && file_info.type == OS_FILE_TYPE_FILE + && file_info.size == 0LL; +} + +/*********************************************************************//** +Create a new empty bitmap output file. 
+
+@return TRUE if operation succeeded, FALSE if I/O error */
+static
+ibool
+log_online_start_bitmap_file(void)
+/*==============================*/
+{
+	ibool	success	= TRUE;
+
+	/* Check for an old file that should be deleted first */
+	if (log_online_should_overwrite(log_bmp_sys->out.name)) {
+
+		success = static_cast<ibool>(
+			os_file_delete_if_exists(innodb_file_bmp_key,
+						 log_bmp_sys->out.name));
+	}
+
+	if (UNIV_LIKELY(success)) {
+		log_bmp_sys->out.file
+			= os_file_create_simple_no_error_handling(
+				innodb_file_bmp_key,
+				log_bmp_sys->out.name,
+				OS_FILE_CREATE,
+				OS_FILE_READ_WRITE,
+				&success);
+	}
+	if (UNIV_UNLIKELY(!success)) {
+
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"cannot create \'%s\'\n", log_bmp_sys->out.name);
+		return FALSE;
+	}
+
+	log_bmp_sys->out.offset = 0;
+	return TRUE;
+}
+
+/*********************************************************************//**
+Close the current bitmap output file and create the next one.
+
+@return TRUE if operation succeeded, FALSE if I/O error */
+static
+ibool
+log_online_rotate_bitmap_file(
+/*===========================*/
+	lsn_t	next_file_start_lsn)	/*!<in: the start LSN name
+					part */
+{
+	if (log_bmp_sys->out.file != os_file_invalid) {
+		os_file_close(log_bmp_sys->out.file);
+		log_bmp_sys->out.file = os_file_invalid;
+	}
+	log_bmp_sys->out_seq_num++;
+	log_online_make_bitmap_name(next_file_start_lsn);
+	return log_online_start_bitmap_file();
+}
+
+/*********************************************************************//**
+Check the name of a given file to see if it is a changed page bitmap file,
+and return its file sequence and start LSN name components if it is. If it
+is not, the values of the output parameters are undefined.
+
+@return TRUE if a given file is a changed page bitmap file. */
+static
+ibool
+log_online_is_bitmap_file(
+/*======================*/
+	const os_file_stat_t*	file_info,		/*!<in: file to
+							check */
+	ulong*			bitmap_file_seq_num,	/*!<out: bitmap file
+							sequence number */
+	lsn_t*			bitmap_file_start_lsn)	/*!<out: bitmap file
+							start LSN */
+{
+	char	stem[FN_REFLEN];
+
+	ut_ad (strlen(file_info->name) < OS_FILE_MAX_PATH);
+
+	return ((file_info->type == OS_FILE_TYPE_FILE
+		 || file_info->type == OS_FILE_TYPE_LINK)
+		&& (sscanf(file_info->name, "%[a-z_]%lu_%llu.xdb", stem,
+			   bitmap_file_seq_num,
+			   (unsigned long long *)bitmap_file_start_lsn) == 3)
+		&& (!strcmp(stem, bmp_file_name_stem)));
+}
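+
+/* Editor's illustration, not part of the original patch: given
+bmp_file_name_template above, a bitmap file created with sequence number 2
+and start LSN 1589 is named "ib_modified_log_2_1589.xdb", and
+log_online_is_bitmap_file() recovers (2, 1589) from that name via the
+sscanf() format "%[a-z_]%lu_%llu.xdb". */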
+
+/*********************************************************************//**
+Initialize the online log following subsystem. */
+UNIV_INTERN
+void
+log_online_read_init(void)
+/*======================*/
+{
+	ibool	success;
+	lsn_t	tracking_start_lsn
+		= ut_max(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN);
+	os_file_dir_t	bitmap_dir;
+	os_file_stat_t	bitmap_dir_file_info;
+	lsn_t	last_file_start_lsn	= MIN_TRACKED_LSN;
+	size_t	srv_data_home_len;
+
+	/* Bitmap data start and end in a bitmap block must be 8-byte
+	aligned. */
+	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0);
+	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0);
+
+	log_bmp_sys = static_cast<log_bitmap_struct *>
+		(ut_malloc(sizeof(*log_bmp_sys)));
+	log_bmp_sys->read_buf_ptr = static_cast<byte *>
+		(ut_malloc(FOLLOW_SCAN_SIZE + OS_FILE_LOG_BLOCK_SIZE));
+	log_bmp_sys->read_buf = static_cast<byte *>
+		(ut_align(log_bmp_sys->read_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+
+	mutex_create(log_bmp_sys_mutex_key, &log_bmp_sys->mutex,
+		     SYNC_LOG_ONLINE);
+
+	/* Initialize bitmap file directory from srv_data_home and add a path
+	separator if needed. */
+	srv_data_home_len = strlen(srv_data_home);
+	ut_a (srv_data_home_len < FN_REFLEN);
+	strcpy(log_bmp_sys->bmp_file_home, srv_data_home);
+	if (srv_data_home_len
+	    && log_bmp_sys->bmp_file_home[srv_data_home_len - 1]
+	    != SRV_PATH_SEPARATOR) {
+
+		ut_a (srv_data_home_len < FN_REFLEN - 1);
+		log_bmp_sys->bmp_file_home[srv_data_home_len]
+			= SRV_PATH_SEPARATOR;
+		log_bmp_sys->bmp_file_home[srv_data_home_len + 1] = '\0';
+	}
+
+	/* Enumerate existing bitmap files, either to open the last one to get
+	the last tracked LSN, or to find that there are none and start
+	tracking from scratch. */
+	log_bmp_sys->out.name[0] = '\0';
+	log_bmp_sys->out_seq_num = 0;
+
+	bitmap_dir = os_file_opendir(log_bmp_sys->bmp_file_home, TRUE);
+	ut_a(bitmap_dir);
+	while (!os_file_readdir_next_file(log_bmp_sys->bmp_file_home,
+					  bitmap_dir, &bitmap_dir_file_info)) {
+
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
+
+		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
+					       &file_seq_num,
+					       &file_start_lsn)) {
+			continue;
+		}
+
+		if (file_seq_num > log_bmp_sys->out_seq_num
+		    && bitmap_dir_file_info.size > 0) {
+			log_bmp_sys->out_seq_num = file_seq_num;
+			last_file_start_lsn = file_start_lsn;
+			/* No dir component (log_bmp_sys->bmp_file_home) here,
+			because that's the cwd */
+			strncpy(log_bmp_sys->out.name,
+				bitmap_dir_file_info.name, FN_REFLEN - 1);
+			log_bmp_sys->out.name[FN_REFLEN - 1] = '\0';
+		}
+	}
+
+	if (os_file_closedir(bitmap_dir)) {
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'\n",
+			log_bmp_sys->bmp_file_home);
+		exit(1);
+	}
+
+	if (!log_bmp_sys->out_seq_num) {
+		log_bmp_sys->out_seq_num = 1;
+		log_online_make_bitmap_name(0);
+	}
+
+	log_bmp_sys->modified_pages = rbt_create(MODIFIED_PAGE_BLOCK_SIZE,
+						 log_online_compare_bmp_keys);
+	log_bmp_sys->page_free_list = NULL;
+
+	log_bmp_sys->out.file
+		= os_file_create_simple_no_error_handling
+		(innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN,
+		 OS_FILE_READ_WRITE, &success);
+
+	if (!success) {
+
+		/* New file, tracking from scratch */
+		if (!log_online_start_bitmap_file()) {
+			exit(1);
+		}
+	}
+	else {
+
+		/* Read the last tracked LSN from the last file */
+		lsn_t	last_tracked_lsn;
+		lsn_t	file_start_lsn;
+
+		log_bmp_sys->out.size
+			= os_file_get_size(log_bmp_sys->out.file);
+		log_bmp_sys->out.offset	= log_bmp_sys->out.size;
+
+		if (log_bmp_sys->out.offset % MODIFIED_PAGE_BLOCK_SIZE != 0) {
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"truncated block detected in \'%s\' at offset "
+				UINT64PF "\n",
+				log_bmp_sys->out.name,
+				log_bmp_sys->out.offset);
+			log_bmp_sys->out.offset -=
+				log_bmp_sys->out.offset
+				% MODIFIED_PAGE_BLOCK_SIZE;
+		}
+
+		last_tracked_lsn = log_online_read_last_tracked_lsn();
+		if (!last_tracked_lsn) {
+			last_tracked_lsn = last_file_start_lsn;
+		}
+
+		/* Start a new file. Choose the LSN value in its name based on
+		whether we can retrack any missing data.
*/ + if (log_online_can_track_missing(last_tracked_lsn, + tracking_start_lsn)) { + file_start_lsn = last_tracked_lsn; + } else { + file_start_lsn = tracking_start_lsn; + } + if (!log_online_rotate_bitmap_file(file_start_lsn)) { + exit(1); + } + + if (last_tracked_lsn < tracking_start_lsn) { + + log_online_track_missing_on_startup + (last_tracked_lsn, tracking_start_lsn); + return; + } + + if (last_tracked_lsn > tracking_start_lsn) { + + ib_logf(IB_LOG_LEVEL_WARN, + "last tracked LSN is " LSN_PF ", but the last " + "checkpoint LSN is " LSN_PF ". The " + "tracking-based incremental backups will work " + "only from the latter LSN!\n", + last_tracked_lsn, tracking_start_lsn); + } + + } + + ib_logf(IB_LOG_LEVEL_INFO, "starting tracking changed pages from LSN " + LSN_PF "\n", tracking_start_lsn); + log_bmp_sys->start_lsn = tracking_start_lsn; + log_set_tracked_lsn(tracking_start_lsn); +} + +/*********************************************************************//** +Shut down the online log following subsystem. */ +UNIV_INTERN +void +log_online_read_shutdown(void) +/*==========================*/ +{ + ib_rbt_node_t *free_list_node = log_bmp_sys->page_free_list; + + if (log_bmp_sys->out.file != os_file_invalid) { + os_file_close(log_bmp_sys->out.file); + log_bmp_sys->out.file = os_file_invalid; + } + + rbt_free(log_bmp_sys->modified_pages); + + while (free_list_node) { + ib_rbt_node_t *next = free_list_node->left; + ut_free(free_list_node); + free_list_node = next; + } + + mutex_free(&log_bmp_sys->mutex); + + ut_free(log_bmp_sys->read_buf_ptr); + ut_free(log_bmp_sys); +} + +/*********************************************************************//** +For the given minilog record type determine if the record has (space; page) +associated with it. +@return TRUE if the record has (space; page) in it */ +static +ibool +log_online_rec_has_page( +/*====================*/ + byte type) /*!<in: the minilog record type */ +{ + return type != MLOG_MULTI_REC_END && type != MLOG_DUMMY_RECORD; +} + +/*********************************************************************//** +Check if a page field for a given log record type actually contains a page +id. It does not for file operations and MLOG_LSN. +@return TRUE if page field contains actual page id, FALSE otherwise */ +static +ibool +log_online_rec_page_means_page( +/*===========================*/ + byte type) /*!<in: log record type */ +{ + return log_online_rec_has_page(type) +#ifdef UNIV_LOG_LSN_DEBUG + && type != MLOG_LSN +#endif + && type != MLOG_FILE_CREATE + && type != MLOG_FILE_RENAME + && type != MLOG_FILE_DELETE + && type != MLOG_FILE_CREATE2; +} + +/*********************************************************************//** +Parse the log data in the parse buffer for the (space, page) pairs and add +them to the modified page set as necessary. Removes the fully-parsed records +from the buffer. If an incomplete record is found, moves it to the end of the +buffer. */ +static +void +log_online_parse_redo_log(void) +/*===========================*/ +{ + byte *ptr = log_bmp_sys->parse_buf; + byte *end = log_bmp_sys->parse_buf_end; + + ulint len = 0; + + ut_ad(mutex_own(&log_bmp_sys->mutex)); + + while (ptr != end + && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) { + + byte type; + ulint space; + ulint page_no; + byte* body; + + /* recv_sys is not initialized, so on corrupt log we will + SIGSEGV. But the log of a live database should not be + corrupt. 
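+
+	(Editor's note, not part of the original patch:
+	recv_parse_log_rec() returns the full length of the parsed record
+	in bytes, or 0 if the buffer does not yet hold a complete record;
+	the incomplete tail is then moved to the start of the parse buffer
+	in the else branch below and completed on the next read.)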
*/ + len = recv_parse_log_rec(ptr, end, &type, &space, &page_no, + &body); + if (len > 0) { + + if (log_online_rec_page_means_page(type)) { + + ut_a(len >= 3); + log_online_set_page_bit(space, page_no); + } + + ptr += len; + ut_ad(ptr <= end); + log_bmp_sys->next_parse_lsn + = recv_calc_lsn_on_data_add + (log_bmp_sys->next_parse_lsn, len); + } + else { + + /* Incomplete log record. Shift it to the + beginning of the parse buffer and leave it to be + completed on the next read. */ + ut_memmove(log_bmp_sys->parse_buf, ptr, end - ptr); + log_bmp_sys->parse_buf_end + = log_bmp_sys->parse_buf + (end - ptr); + ptr = end; + } + } + + if (len > 0) { + + log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf; + } +} + +/*********************************************************************//** +Check the log block checksum. +@return TRUE if the log block checksum is OK, FALSE otherwise. */ +static +ibool +log_online_is_valid_log_seg( +/*========================*/ + const byte* log_block) /*!< in: read log data */ +{ + ibool checksum_is_ok + = log_block_checksum_is_ok_or_old_format(log_block); + + if (!checksum_is_ok) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "log block checksum mismatch: expected " ULINTPF ", " + "calculated checksum " ULINTPF "\n", + log_block_get_checksum(log_block), + log_block_calc_checksum(log_block)); + } + + return checksum_is_ok; +} + +/*********************************************************************//** +Copy new log data to the parse buffer while skipping log block header, +trailer and already parsed data. */ +static +void +log_online_add_to_parse_buf( +/*========================*/ + const byte* log_block, /*!< in: read log data */ + ulint data_len, /*!< in: length of read log data */ + ulint skip_len) /*!< in: how much of log data to + skip */ +{ + ulint start_offset = skip_len ? skip_len : LOG_BLOCK_HDR_SIZE; + ulint end_offset + = (data_len == OS_FILE_LOG_BLOCK_SIZE) + ? data_len - LOG_BLOCK_TRL_SIZE + : data_len; + ulint actual_data_len = (end_offset >= start_offset) + ? end_offset - start_offset : 0; + + ut_ad(mutex_own(&log_bmp_sys->mutex)); + + ut_memcpy(log_bmp_sys->parse_buf_end, log_block + start_offset, + actual_data_len); + + log_bmp_sys->parse_buf_end += actual_data_len; + + ut_a(log_bmp_sys->parse_buf_end - log_bmp_sys->parse_buf + <= RECV_PARSING_BUF_SIZE); +} + +/*********************************************************************//** +Parse the log block: first copies the read log data to the parse buffer while +skipping log block header, trailer and already parsed data. Then it actually +parses the log to add to the modified page bitmap. */ +static +void +log_online_parse_redo_log_block( +/*============================*/ + const byte* log_block, /*!< in: read log data */ + ulint skip_already_parsed_len) /*!< in: how many bytes of + log data should be skipped as + they were parsed before */ +{ + ulint block_data_len; + + ut_ad(mutex_own(&log_bmp_sys->mutex)); + + block_data_len = log_block_get_data_len(log_block); + + ut_ad(block_data_len % OS_FILE_LOG_BLOCK_SIZE == 0 + || block_data_len < OS_FILE_LOG_BLOCK_SIZE); + + log_online_add_to_parse_buf(log_block, block_data_len, + skip_already_parsed_len); + log_online_parse_redo_log(); +} + +/*********************************************************************//** +Read and parse one redo log chunk and updates the modified page bitmap. 
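+
+Editor's note, not part of the original patch: each OS_FILE_LOG_BLOCK_SIZE
+(512-byte) block read here contributes at most
+OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_TRL_SIZE bytes of
+payload; log_online_add_to_parse_buf() above strips the header and, for
+full blocks, the trailer before appending to the parse buffer.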
*/ +static +void +log_online_follow_log_seg( +/*======================*/ + log_group_t* group, /*!< in: the log group to use */ + lsn_t block_start_lsn, /*!< in: the LSN to read from */ + lsn_t block_end_lsn) /*!< in: the LSN to read to */ +{ + /* Pointer to the current OS_FILE_LOG_BLOCK-sized chunk of the read log + data to parse */ + byte* log_block = log_bmp_sys->read_buf; + byte* log_block_end = log_bmp_sys->read_buf + + (block_end_lsn - block_start_lsn); + + ut_ad(mutex_own(&log_bmp_sys->mutex)); + + mutex_enter(&log_sys->mutex); + log_group_read_log_seg(LOG_RECOVER, log_bmp_sys->read_buf, + group, block_start_lsn, block_end_lsn, TRUE); + /* log_group_read_log_seg will release the log_sys->mutex for us */ + + while (log_block < log_block_end + && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) { + + /* How many bytes of log data should we skip in the current log + block. Skipping is necessary because we round down the next + parse LSN thus it is possible to read the already-processed log + data many times */ + ulint skip_already_parsed_len = 0; + + if (!log_online_is_valid_log_seg(log_block)) { + break; + } + + if ((block_start_lsn <= log_bmp_sys->next_parse_lsn) + && (block_start_lsn + OS_FILE_LOG_BLOCK_SIZE + > log_bmp_sys->next_parse_lsn)) { + + /* The next parse LSN is inside the current block, skip + data preceding it. */ + skip_already_parsed_len + = (ulint)(log_bmp_sys->next_parse_lsn + - block_start_lsn); + } + else { + + /* If the next parse LSN is not inside the current + block, then the only option is that we have processed + ahead already. */ + ut_a(block_start_lsn > log_bmp_sys->next_parse_lsn); + } + + /* TODO: merge the copying to the parse buf code with + skip_already_len calculations */ + log_online_parse_redo_log_block(log_block, + skip_already_parsed_len); + + log_block += OS_FILE_LOG_BLOCK_SIZE; + block_start_lsn += OS_FILE_LOG_BLOCK_SIZE; + } + + return; +} + +/*********************************************************************//** +Read and parse the redo log in a given group in FOLLOW_SCAN_SIZE-sized +chunks and updates the modified page bitmap. */ +static +void +log_online_follow_log_group( +/*========================*/ + log_group_t* group, /*!< in: the log group to use */ + lsn_t contiguous_lsn) /*!< in: the LSN of log block start + containing the log_parse_start_lsn */ +{ + lsn_t block_start_lsn = contiguous_lsn; + lsn_t block_end_lsn; + + ut_ad(mutex_own(&log_bmp_sys->mutex)); + + log_bmp_sys->next_parse_lsn = log_bmp_sys->start_lsn; + log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf; + + do { + block_end_lsn = block_start_lsn + FOLLOW_SCAN_SIZE; + + log_online_follow_log_seg(group, block_start_lsn, + block_end_lsn); + + /* Next parse LSN can become higher than the last read LSN + only in the case when the read LSN falls right on the block + boundary, in which case next parse lsn is bumped to the actual + data LSN on the next (not yet read) block. This assert is + slightly conservative. */ + ut_a(log_bmp_sys->next_parse_lsn + <= block_end_lsn + LOG_BLOCK_HDR_SIZE + + LOG_BLOCK_TRL_SIZE); + + block_start_lsn = block_end_lsn; + } while (block_end_lsn < log_bmp_sys->end_lsn); + + /* Assert that the last read log record is a full one */ + ut_a(log_bmp_sys->parse_buf_end == log_bmp_sys->parse_buf); +} + +/*********************************************************************//** +Write, flush one bitmap block to disk and advance the output position if +successful. 
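+
+Editor's note, not part of the original patch: on Linux the function below
+also calls posix_fadvise(POSIX_FADV_DONTNEED) on the block just written,
+so that written bitmap blocks do not accumulate in the OS page cache.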
+ +@return TRUE if page written OK, FALSE if I/O error */ +static +ibool +log_online_write_bitmap_page( +/*=========================*/ + const byte *block) /*!< in: block to write */ +{ + ibool success; + + ut_ad(mutex_own(&log_bmp_sys->mutex)); + + /* Simulate a write error */ + DBUG_EXECUTE_IF("bitmap_page_write_error", return FALSE;); + + success = os_file_write(log_bmp_sys->out.name, log_bmp_sys->out.file, + block, log_bmp_sys->out.offset, + MODIFIED_PAGE_BLOCK_SIZE); + if (UNIV_UNLIKELY(!success)) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, "failed writing changed page " + "bitmap file \'%s\'\n", log_bmp_sys->out.name); + return FALSE; + } + + success = os_file_flush(log_bmp_sys->out.file); + if (UNIV_UNLIKELY(!success)) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, "failed flushing changed page " + "bitmap file \'%s\'\n", log_bmp_sys->out.name); + return FALSE; + } + +#ifdef UNIV_LINUX + posix_fadvise(log_bmp_sys->out.file, log_bmp_sys->out.offset, + MODIFIED_PAGE_BLOCK_SIZE, POSIX_FADV_DONTNEED); +#endif + + log_bmp_sys->out.offset += MODIFIED_PAGE_BLOCK_SIZE; + return TRUE; +} + +/*********************************************************************//** +Append the current changed page bitmap to the bitmap file. Clears the +bitmap tree and recycles its nodes to the free list. + +@return TRUE if bitmap written OK, FALSE if I/O error*/ +static +ibool +log_online_write_bitmap(void) +/*=========================*/ +{ + ib_rbt_node_t *bmp_tree_node; + const ib_rbt_node_t *last_bmp_tree_node; + ibool success = TRUE; + + ut_ad(mutex_own(&log_bmp_sys->mutex)); + + if (log_bmp_sys->out.offset >= srv_max_bitmap_file_size) { + if (!log_online_rotate_bitmap_file(log_bmp_sys->start_lsn)) { + return FALSE; + } + } + + bmp_tree_node = (ib_rbt_node_t *) + rbt_first(log_bmp_sys->modified_pages); + last_bmp_tree_node = rbt_last(log_bmp_sys->modified_pages); + + while (bmp_tree_node) { + + byte *page = rbt_value(byte, bmp_tree_node); + + /* In case of a bitmap page write error keep on looping over + the tree to reclaim its memory through the free list instead of + returning immediatelly. */ + if (UNIV_LIKELY(success)) { + if (bmp_tree_node == last_bmp_tree_node) { + mach_write_to_4(page + + MODIFIED_PAGE_IS_LAST_BLOCK, + 1); + } + + mach_write_to_8(page + MODIFIED_PAGE_START_LSN, + log_bmp_sys->start_lsn); + mach_write_to_8(page + MODIFIED_PAGE_END_LSN, + log_bmp_sys->end_lsn); + mach_write_to_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM, + log_online_calc_checksum(page)); + + success = log_online_write_bitmap_page(page); + } + + bmp_tree_node->left = log_bmp_sys->page_free_list; + log_bmp_sys->page_free_list = bmp_tree_node; + + bmp_tree_node = (ib_rbt_node_t*) + rbt_next(log_bmp_sys->modified_pages, bmp_tree_node); + + DBUG_EXECUTE_IF("bitmap_page_2_write_error", + DBUG_SET("+d,bitmap_page_write_error");); + } + + rbt_reset(log_bmp_sys->modified_pages); + return success; +} + +/*********************************************************************//** +Read and parse the redo log up to last checkpoint LSN to build the changed +page bitmap which is then written to disk. 
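+
+Editor's note, not part of the original patch: every 4KB block flushed by
+log_online_write_bitmap() carries the same start/end tracked LSN pair, and
+only the last block of a write run has MODIFIED_PAGE_IS_LAST_BLOCK set;
+log_online_read_last_tracked_lsn() relies on that flag when scanning a
+bitmap file backwards on startup.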
+ +@return TRUE if log tracking succeeded, FALSE if bitmap write I/O error */ +UNIV_INTERN +ibool +log_online_follow_redo_log(void) +/*============================*/ +{ + lsn_t contiguous_start_lsn; + log_group_t* group; + ibool result; + + mutex_enter(&log_bmp_sys->mutex); + + if (!srv_track_changed_pages) { + mutex_exit(&log_bmp_sys->mutex); + return FALSE; + } + + ut_ad(!srv_read_only_mode); + + /* Grab the LSN of the last checkpoint, we will parse up to it */ + mutex_enter(&(log_sys->mutex)); + log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn; + mutex_exit(&(log_sys->mutex)); + + if (log_bmp_sys->end_lsn == log_bmp_sys->start_lsn) { + mutex_exit(&log_bmp_sys->mutex); + return TRUE; + } + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + ut_a(group); + + contiguous_start_lsn = ut_uint64_align_down(log_bmp_sys->start_lsn, + OS_FILE_LOG_BLOCK_SIZE); + + while (group) { + log_online_follow_log_group(group, contiguous_start_lsn); + group = UT_LIST_GET_NEXT(log_groups, group); + } + + /* A crash injection site that ensures last checkpoint LSN > last + tracked LSN, so that LSN tracking for this interval is tested. */ + DBUG_EXECUTE_IF("crash_before_bitmap_write", DBUG_SUICIDE();); + + result = log_online_write_bitmap(); + log_bmp_sys->start_lsn = log_bmp_sys->end_lsn; + log_set_tracked_lsn(log_bmp_sys->start_lsn); + + mutex_exit(&log_bmp_sys->mutex); + return result; +} + +/*********************************************************************//** +Diagnose a bitmap file range setup failure and free the partially-initialized +bitmap file range. */ +UNIV_COLD +static +void +log_online_diagnose_inconsistent_dir( +/*=================================*/ + log_online_bitmap_file_range_t *bitmap_files) /*!<in/out: bitmap file + range */ +{ + ib_logf(IB_LOG_LEVEL_WARN, + "InnoDB: Warning: inconsistent bitmap file " + "directory for a " + "INFORMATION_SCHEMA.INNODB_CHANGED_PAGES query" + "\n"); + free(bitmap_files->files); +} + +/*********************************************************************//** +List the bitmap files in srv_data_home and setup their range that contains the +specified LSN interval. This range, if non-empty, will start with a file that +has the greatest LSN equal to or less than the start LSN and will include all +the files up to the one with the greatest LSN less than the end LSN. Caller +must free bitmap_files->files when done if bitmap_files set to non-NULL and +this function returned TRUE. Field bitmap_files->count might be set to a +larger value than the actual count of the files, and space for the unused array +slots will be allocated but cleared to zeroes. 
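+
+For example (editor's illustration, not part of the original patch): with
+bitmap files 1_0.xdb and 2_10.xdb on disk, the range [5, 15) selects both
+files: 1_0.xdb because 0 is the greatest start LSN not exceeding 5, and
+2_10.xdb because its start LSN 10 is less than the end LSN 15.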
+ +@return TRUE if succeeded +*/ +static +ibool +log_online_setup_bitmap_file_range( +/*===============================*/ + log_online_bitmap_file_range_t *bitmap_files, /*!<in/out: bitmap file + range */ + lsn_t range_start, /*!<in: start LSN */ + lsn_t range_end) /*!<in: end LSN */ +{ + os_file_dir_t bitmap_dir; + os_file_stat_t bitmap_dir_file_info; + ulong first_file_seq_num = ULONG_MAX; + ulong last_file_seq_num = 0; + lsn_t first_file_start_lsn = LSN_MAX; + + ut_ad(range_end >= range_start); + + bitmap_files->count = 0; + bitmap_files->files = NULL; + + /* 1st pass: size the info array */ + + bitmap_dir = os_file_opendir(srv_data_home, FALSE); + if (UNIV_UNLIKELY(!bitmap_dir)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "failed to open bitmap directory \'%s\'\n", + srv_data_home); + return FALSE; + } + + while (!os_file_readdir_next_file(srv_data_home, bitmap_dir, + &bitmap_dir_file_info)) { + + ulong file_seq_num; + lsn_t file_start_lsn; + + if (!log_online_is_bitmap_file(&bitmap_dir_file_info, + &file_seq_num, + &file_start_lsn) + || file_start_lsn >= range_end) { + + continue; + } + + if (file_seq_num > last_file_seq_num) { + + last_file_seq_num = file_seq_num; + } + + if (file_start_lsn >= range_start + || file_start_lsn == first_file_start_lsn + || first_file_start_lsn > range_start) { + + /* A file that falls into the range */ + + if (file_start_lsn < first_file_start_lsn) { + + first_file_start_lsn = file_start_lsn; + } + if (file_seq_num < first_file_seq_num) { + + first_file_seq_num = file_seq_num; + } + } else if (file_start_lsn > first_file_start_lsn) { + + /* A file that has LSN closer to the range start + but smaller than it, replacing another such file */ + first_file_start_lsn = file_start_lsn; + first_file_seq_num = file_seq_num; + } + } + + if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) { + + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'\n", + srv_data_home); + return FALSE; + } + + if (first_file_seq_num == ULONG_MAX && last_file_seq_num == 0) { + + bitmap_files->count = 0; + return TRUE; + } + + bitmap_files->count = last_file_seq_num - first_file_seq_num + 1; + + DEBUG_SYNC_C("setup_bitmap_range_middle"); + + /* 2nd pass: get the file names in the file_seq_num order */ + + bitmap_dir = os_file_opendir(srv_data_home, FALSE); + if (UNIV_UNLIKELY(!bitmap_dir)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "failed to open bitmap directory \'%s\'\n", + srv_data_home); + return FALSE; + } + + bitmap_files->files + = static_cast<log_online_bitmap_file_range_struct::files_t *> + (ut_malloc(bitmap_files->count + * sizeof(bitmap_files->files[0]))); + memset(bitmap_files->files, 0, + bitmap_files->count * sizeof(bitmap_files->files[0])); + + while (!os_file_readdir_next_file(srv_data_home, bitmap_dir, + &bitmap_dir_file_info)) { + + ulong file_seq_num; + lsn_t file_start_lsn; + size_t array_pos; + + if (!log_online_is_bitmap_file(&bitmap_dir_file_info, + &file_seq_num, + &file_start_lsn) + || file_start_lsn >= range_end + || file_start_lsn < first_file_start_lsn) { + + continue; + } + + array_pos = file_seq_num - first_file_seq_num; + if (UNIV_UNLIKELY(array_pos >= bitmap_files->count)) { + + log_online_diagnose_inconsistent_dir(bitmap_files); + return FALSE; + } + + + if (file_seq_num > bitmap_files->files[array_pos].seq_num) { + + bitmap_files->files[array_pos].seq_num = file_seq_num; + strncpy(bitmap_files->files[array_pos].name, + bitmap_dir_file_info.name, FN_REFLEN); + bitmap_files->files[array_pos].name[FN_REFLEN - 1] + = '\0'; + 
bitmap_files->files[array_pos].start_lsn + = file_start_lsn; + } + } + + if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) { + + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'\n", + srv_data_home); + free(bitmap_files->files); + return FALSE; + } + + if (!bitmap_files->files[0].seq_num + || bitmap_files->files[0].seq_num != first_file_seq_num) { + + log_online_diagnose_inconsistent_dir(bitmap_files); + return FALSE; + } + + { + size_t i; + for (i = 1; i < bitmap_files->count; i++) { + if (!bitmap_files->files[i].seq_num) { + break; + } + if ((bitmap_files->files[i].seq_num + <= bitmap_files->files[i - 1].seq_num) + || (bitmap_files->files[i].start_lsn + < bitmap_files->files[i - 1].start_lsn)) { + + log_online_diagnose_inconsistent_dir( + bitmap_files); + return FALSE; + } + } + } + + return TRUE; +} + +/****************************************************************//** +Open a bitmap file for reading. + +@return TRUE if opened successfully */ +static +ibool +log_online_open_bitmap_file_read_only( +/*==================================*/ + const char* name, /*!<in: bitmap file + name without directory, + which is assumed to be + srv_data_home */ + log_online_bitmap_file_t* bitmap_file) /*!<out: opened bitmap + file */ +{ + ibool success = FALSE; + + ut_ad(name[0] != '\0'); + + ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%s", srv_data_home, name); + bitmap_file->file + = os_file_create_simple_no_error_handling(innodb_file_bmp_key, + bitmap_file->name, + OS_FILE_OPEN, + OS_FILE_READ_ONLY, + &success); + if (UNIV_UNLIKELY(!success)) { + + /* Here and below assume that bitmap file names do not + contain apostrophes, thus no need for ut_print_filename(). */ + ib_logf(IB_LOG_LEVEL_WARN, + "error opening the changed page bitmap \'%s\'\n", + bitmap_file->name); + return FALSE; + } + + bitmap_file->size = os_file_get_size(bitmap_file->file); + bitmap_file->offset = 0; + +#ifdef UNIV_LINUX + posix_fadvise(bitmap_file->file, 0, 0, POSIX_FADV_SEQUENTIAL); + posix_fadvise(bitmap_file->file, 0, 0, POSIX_FADV_NOREUSE); +#endif + + return TRUE; +} + +/****************************************************************//** +Diagnose one or both of the following situations if we read close to +the end of bitmap file: +1) Warn if the remainder of the file is less than one page. +2) Error if we cannot read any more full pages but the last read page +did not have the last-in-run flag set. + +@return FALSE for the error */ +static +ibool +log_online_diagnose_bitmap_eof( +/*===========================*/ + const log_online_bitmap_file_t* bitmap_file, /*!< in: bitmap file */ + ibool last_page_in_run)/*!< in: "last page in + run" flag value in the + last read page */ +{ + /* Check if we are too close to EOF to read a full page */ + if ((bitmap_file->size < MODIFIED_PAGE_BLOCK_SIZE) + || (bitmap_file->offset + > bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE)) { + + if (UNIV_UNLIKELY(bitmap_file->offset != bitmap_file->size)) { + + /* If we are not at EOF and we have less than one page + to read, it's junk. This error is not fatal in + itself. 
*/
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"junk at the end of changed page bitmap file "
+				"\'%s\'.\n", bitmap_file->name);
+		}
+
+		if (UNIV_UNLIKELY(!last_page_in_run)) {
+
+			/* We are at EOF but the last read page did not finish
+			a run */
+			/* It's a "Warning" here because it's not a fatal error
+			for the whole server */
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"changed page bitmap file \'%s\' does not "
+				"contain a complete run at the end.\n",
+				bitmap_file->name);
+			return FALSE;
+		}
+	}
+	return TRUE;
+}
+
+/*********************************************************************//**
+Initialize the log bitmap iterator for a given range. The records are
+processed at a bitmap block granularity, i.e. all the records in the same block
+share the same start and end LSN values, the exact LSN of each record is
+unavailable (nor is it defined for blocks that are touched more than once in
+the LSN interval contained in the block). Thus min_lsn and max_lsn should be
+set at block boundaries or bigger, otherwise the records at the 1st and the
+last blocks will not be returned. Also note that there might be returned
+records with LSN < min_lsn, as min_lsn is used to select the correct starting
+file but not block.
+
+@return TRUE if the iterator is initialized OK, FALSE otherwise. */
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_init(
+/*============================*/
+	log_bitmap_iterator_t	*i,	/*!<in/out: iterator */
+	lsn_t			min_lsn,/*!< in: start LSN */
+	lsn_t			max_lsn)/*!< in: end LSN */
+{
+	ut_a(i);
+
+	if (UNIV_UNLIKELY(min_lsn > max_lsn)) {
+
+		/* Empty range */
+		i->in_files.count = 0;
+		i->in_files.files = NULL;
+		i->in.file = os_file_invalid;
+		i->page = NULL;
+		i->failed = FALSE;
+		return TRUE;
+	}
+
+	if (!log_online_setup_bitmap_file_range(&i->in_files, min_lsn,
+						max_lsn)) {
+
+		i->failed = TRUE;
+		return FALSE;
+	}
+
+	i->in_i = 0;
+
+	if (i->in_files.count == 0) {
+
+		/* Empty range */
+		i->in.file = os_file_invalid;
+		i->page = NULL;
+		i->failed = FALSE;
+		return TRUE;
+	}
+
+	/* Open the 1st bitmap file */
+	if (UNIV_UNLIKELY(!log_online_open_bitmap_file_read_only(
+				  i->in_files.files[i->in_i].name,
+				  &i->in))) {
+
+		i->in_i = i->in_files.count;
+		free(i->in_files.files);
+		i->failed = TRUE;
+		return FALSE;
+	}
+
+	i->page = static_cast<byte *>(ut_malloc(MODIFIED_PAGE_BLOCK_SIZE));
+	i->bit_offset = MODIFIED_PAGE_BLOCK_BITMAP_LEN;
+	i->start_lsn = i->end_lsn = 0;
+	i->space_id = 0;
+	i->first_page_id = 0;
+	i->last_page_in_run = TRUE;
+	i->changed = FALSE;
+	i->failed = FALSE;
+
+	return TRUE;
+}
+
+/*********************************************************************//**
+Releases the log bitmap iterator. */
+UNIV_INTERN
+void
+log_online_bitmap_iterator_release(
+/*===============================*/
+	log_bitmap_iterator_t *i) /*!<in/out: iterator */
+{
+	ut_a(i);
+
+	if (i->in.file != os_file_invalid) {
+
+		os_file_close(i->in.file);
+		i->in.file = os_file_invalid;
+	}
+	if (i->in_files.files) {
+
+		ut_free(i->in_files.files);
+	}
+	if (i->page) {
+
+		ut_free(i->page);
+	}
+	i->failed = TRUE;
+}
+
+/*********************************************************************//**
+Iterates through the bits of saved bitmap blocks.
+Sequentially reads blocks from bitmap file(s) and iterates through
+their bits. Ignores blocks with wrong checksum.
+@return TRUE if iteration is successful, FALSE if all bits are iterated.
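+
+Typical usage (an editor's sketch, not part of the original patch; the
+process_page() callback is hypothetical):
+
+	log_bitmap_iterator_t	i;
+
+	if (log_online_bitmap_iterator_init(&i, min_lsn, max_lsn)) {
+		while (log_online_bitmap_iterator_next(&i)) {
+			if (i.changed) {
+				process_page(i.space_id,
+					     i.first_page_id + i.bit_offset);
+			}
+		}
+		log_online_bitmap_iterator_release(&i);
+	}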
*/ +UNIV_INTERN +ibool +log_online_bitmap_iterator_next( +/*============================*/ + log_bitmap_iterator_t *i) /*!<in/out: iterator */ +{ + ibool checksum_ok = FALSE; + ibool success; + + ut_a(i); + + if (UNIV_UNLIKELY(i->in_files.count == 0)) { + + return FALSE; + } + + if (UNIV_LIKELY(i->bit_offset < MODIFIED_PAGE_BLOCK_BITMAP_LEN)) + { + ++i->bit_offset; + i->changed = + IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP, + i->bit_offset); + return TRUE; + } + + while (!checksum_ok) + { + while (i->in.size < MODIFIED_PAGE_BLOCK_SIZE + || (i->in.offset + > i->in.size - MODIFIED_PAGE_BLOCK_SIZE)) { + + /* Advance file */ + i->in_i++; + success = os_file_close_no_error_handling(i->in.file); + i->in.file = os_file_invalid; + if (UNIV_UNLIKELY(!success)) { + + os_file_get_last_error(TRUE); + i->failed = TRUE; + return FALSE; + } + + success = log_online_diagnose_bitmap_eof( + &i->in, i->last_page_in_run); + if (UNIV_UNLIKELY(!success)) { + + i->failed = TRUE; + return FALSE; + + } + + if (i->in_i == i->in_files.count) { + + return FALSE; + } + + if (UNIV_UNLIKELY(i->in_files.files[i->in_i].seq_num + == 0)) { + + i->failed = TRUE; + return FALSE; + } + + success = log_online_open_bitmap_file_read_only( + i->in_files.files[i->in_i].name, + &i->in); + if (UNIV_UNLIKELY(!success)) { + + i->failed = TRUE; + return FALSE; + } + } + + success = log_online_read_bitmap_page(&i->in, i->page, + &checksum_ok); + if (UNIV_UNLIKELY(!success)) { + + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_WARN, + "failed reading changed page bitmap file " + "\'%s\'\n", i->in_files.files[i->in_i].name); + i->failed = TRUE; + return FALSE; + } + } + + i->start_lsn = mach_read_from_8(i->page + MODIFIED_PAGE_START_LSN); + i->end_lsn = mach_read_from_8(i->page + MODIFIED_PAGE_END_LSN); + i->space_id = mach_read_from_4(i->page + MODIFIED_PAGE_SPACE_ID); + i->first_page_id = mach_read_from_4(i->page + + MODIFIED_PAGE_1ST_PAGE_ID); + i->last_page_in_run = mach_read_from_4(i->page + + MODIFIED_PAGE_IS_LAST_BLOCK); + i->bit_offset = 0; + i->changed = IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP, + i->bit_offset); + + return TRUE; +} + +/************************************************************//** +Delete all the bitmap files for data less than the specified LSN. +If called with lsn == 0 (i.e. set by RESET request) or LSN_MAX, +restart the bitmap file sequence, otherwise continue it. + +@return FALSE to indicate success, TRUE for failure. */ +UNIV_INTERN +ibool +log_online_purge_changed_page_bitmaps( +/*==================================*/ + lsn_t lsn) /*!< in: LSN to purge files up to */ +{ + log_online_bitmap_file_range_t bitmap_files; + size_t i; + ibool result = FALSE; + + if (lsn == 0) { + lsn = LSN_MAX; + } + + if (srv_track_changed_pages) { + /* User requests might happen with both enabled and disabled + tracking */ + mutex_enter(&log_bmp_sys->mutex); + } + + if (!log_online_setup_bitmap_file_range(&bitmap_files, 0, LSN_MAX)) { + if (srv_track_changed_pages) { + mutex_exit(&log_bmp_sys->mutex); + } + return TRUE; + } + + if (srv_track_changed_pages && lsn > log_bmp_sys->end_lsn) { + /* If we have to delete the current output file, close it + first. */ + os_file_close(log_bmp_sys->out.file); + log_bmp_sys->out.file = os_file_invalid; + } + + for (i = 0; i < bitmap_files.count; i++) { + + /* We consider the end LSN of the current bitmap, derived from + the start LSN of the subsequent bitmap file, to determine + whether to remove the current bitmap. 
Note that bitmap_files + does not contain an entry for the bitmap past the given LSN so + we must check the boundary conditions as well. For example, + consider 1_0.xdb and 2_10.xdb and querying LSN 5. bitmap_files + will only contain 1_0.xdb and we must not delete it since it + represents LSNs 0-9. */ + if ((i + 1 == bitmap_files.count + || bitmap_files.files[i + 1].seq_num == 0 + || bitmap_files.files[i + 1].start_lsn > lsn) + && (lsn != LSN_MAX)) { + + break; + } + if (!os_file_delete_if_exists(innodb_file_bmp_key, + bitmap_files.files[i].name)) { + + os_file_get_last_error(TRUE); + result = TRUE; + break; + } + } + + if (srv_track_changed_pages) { + if (lsn > log_bmp_sys->end_lsn) { + lsn_t new_file_lsn; + if (lsn == LSN_MAX) { + /* RESET restarts the sequence */ + log_bmp_sys->out_seq_num = 0; + new_file_lsn = 0; + } else { + new_file_lsn = log_bmp_sys->end_lsn; + } + if (!log_online_rotate_bitmap_file(new_file_lsn)) { + /* If file create failed, signal the log + tracking thread to quit next time it wakes + up. */ + srv_track_changed_pages = FALSE; + } + } + + mutex_exit(&log_bmp_sys->mutex); + } + + free(bitmap_files.files); + return result; +} diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc new file mode 100644 index 00000000000..fd35fda83c7 --- /dev/null +++ b/storage/xtradb/log/log0recv.cc @@ -0,0 +1,4107 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file log/log0recv.cc +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +// First include (the generated) my_config.h, to get correct platform defines. 
+#include "my_config.h" +#include <stdio.h> // Solaris/x86 header file bug + +#include <vector> +#include "log0recv.h" + +#ifdef UNIV_NONINL +#include "log0recv.ic" +#endif + +#include "mem0mem.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "page0cur.h" +#include "page0zip.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "trx0rec.h" +#include "fil0fil.h" +#ifndef UNIV_HOTBACKUP +# include "buf0rea.h" +# include "srv0srv.h" +# include "srv0start.h" +# include "trx0roll.h" +# include "row0merge.h" +# include "sync0sync.h" +#else /* !UNIV_HOTBACKUP */ + + +/** This is set to FALSE if the backup was originally taken with the +mysqlbackup --include regexp option: then we do not want to create tables in +directories which were not included */ +UNIV_INTERN ibool recv_replay_file_ops = TRUE; +#endif /* !UNIV_HOTBACKUP */ + +/** Log records are stored in the hash table in chunks at most of this size; +this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */ +#define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t)) + +/** Read-ahead area in applying log records to file pages */ +#define RECV_READ_AHEAD_AREA 32 + +/** The recovery system */ +UNIV_INTERN recv_sys_t* recv_sys = NULL; +/** TRUE when applying redo log records during crash recovery; FALSE +otherwise. Note that this is FALSE while a background thread is +rolling back incomplete transactions. */ +UNIV_INTERN ibool recv_recovery_on; +#ifdef UNIV_LOG_ARCHIVE +/** TRUE when applying redo log records from an archived log file */ +UNIV_INTERN ibool recv_recovery_from_backup_on; +#endif /* UNIV_LOG_ARCHIVE */ + +#ifndef UNIV_HOTBACKUP +/** TRUE when recv_init_crash_recovery() has been called. */ +UNIV_INTERN ibool recv_needed_recovery; +# ifdef UNIV_DEBUG +/** TRUE if writing to the redo log (mtr_commit) is forbidden. +Protected by log_sys->mutex. */ +UNIV_INTERN ibool recv_no_log_write = FALSE; +# endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start_func(). */ +UNIV_INTERN ibool recv_lsn_checks_on; + +/** There are two conditions under which we scan the logs, the first +is normal startup and the second is when we do a recovery from an +archive. +This flag is set if we are doing a scan from the last checkpoint during +startup. If we find log entries that were written after the last checkpoint +we know that the server was not cleanly shutdown. We must then initialize +the crash recovery environment before attempting to store these entries in +the log hash table. */ +static ibool recv_log_scan_is_startup_type; + +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this becomes TRUE if +the log record hash table becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +TRUE means that recovery is running and no operations on the log files +are allowed yet: the variable name is misleading. 
*/ +UNIV_INTERN ibool recv_no_ibuf_operations; +/** TRUE when the redo log is being backed up */ +# define recv_is_making_a_backup FALSE +/** TRUE when recovering from a backed up redo log file */ +# define recv_is_from_backup FALSE +#else /* !UNIV_HOTBACKUP */ +# define recv_needed_recovery FALSE +/** TRUE when the redo log is being backed up */ +UNIV_INTERN ibool recv_is_making_a_backup = FALSE; +/** TRUE when recovering from a backed up redo log file */ +UNIV_INTERN ibool recv_is_from_backup = FALSE; +# define buf_pool_get_curr_size() (5 * 1024 * 1024) +#endif /* !UNIV_HOTBACKUP */ +/** The following counter is used to decide when to print info on +log scan */ +static ulint recv_scan_print_counter; + +/** The type of the previous parsed redo log record */ +static ulint recv_previous_parsed_rec_type; +/** The offset of the previous parsed redo log record */ +static ulint recv_previous_parsed_rec_offset; +/** The 'multi' flag of the previous parsed redo log record */ +static ulint recv_previous_parsed_rec_is_multi; + +/** Maximum page number encountered in the redo log */ +UNIV_INTERN ulint recv_max_parsed_page_no; + +/** This many frames must be left free in the buffer pool when we scan +the log and store the scanned log records in the buffer pool: we will +use these free frames to read in pages when we start applying the +log records to the database. +This is the default value. If the actual size of the buffer pool is +larger than 10 MB we'll set this value to 512. */ +UNIV_INTERN ulint recv_n_pool_free_frames; + +/** The maximum lsn we see for a page during the recovery process. If this +is bigger than the lsn we are able to scan up to, that is an indication that +the recovery failed and the database may be corrupt. */ +UNIV_INTERN lsn_t recv_max_page_lsn; + +#ifdef UNIV_PFS_THREAD +UNIV_INTERN mysql_pfs_key_t trx_rollback_clean_thread_key; +#endif /* UNIV_PFS_THREAD */ + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t recv_sys_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifndef UNIV_HOTBACKUP +# ifdef UNIV_PFS_THREAD +UNIV_INTERN mysql_pfs_key_t recv_writer_thread_key; +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t recv_writer_mutex_key; +# endif /* UNIV_PFS_MUTEX */ + +/** Flag indicating if recv_writer thread is active. */ +UNIV_INTERN bool recv_writer_thread_active = false; +UNIV_INTERN os_thread_t recv_writer_thread_handle = 0; +#endif /* !UNIV_HOTBACKUP */ + +/* prototypes */ + +#ifndef UNIV_HOTBACKUP +/*******************************************************//** +Initialize crash recovery environment. Can be called iff +recv_needed_recovery == FALSE. */ +static +void +recv_init_crash_recovery(void); +/*===========================*/ +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Creates the recovery system. */ +UNIV_INTERN +void +recv_sys_create(void) +/*=================*/ +{ + if (recv_sys != NULL) { + + return; + } + + recv_sys = static_cast<recv_sys_t*>(mem_zalloc(sizeof(*recv_sys))); + + mutex_create(recv_sys_mutex_key, &recv_sys->mutex, SYNC_RECV); + +#ifndef UNIV_HOTBACKUP + mutex_create(recv_writer_mutex_key, &recv_sys->writer_mutex, + SYNC_LEVEL_VARYING); +#endif /* !UNIV_HOTBACKUP */ + + recv_sys->heap = NULL; + recv_sys->addr_hash = NULL; +} + +/********************************************************//** +Release recovery system mutexes. 
*/ +UNIV_INTERN +void +recv_sys_close(void) +/*================*/ +{ + if (recv_sys != NULL) { + if (recv_sys->addr_hash != NULL) { + hash_table_free(recv_sys->addr_hash); + } + + if (recv_sys->heap != NULL) { + mem_heap_free(recv_sys->heap); + } + + if (recv_sys->buf != NULL) { + ut_free(recv_sys->buf); + } + + if (recv_sys->last_block_buf_start != NULL) { + mem_free(recv_sys->last_block_buf_start); + } + +#ifndef UNIV_HOTBACKUP + ut_ad(!recv_writer_thread_active); + mutex_free(&recv_sys->writer_mutex); +#endif /* !UNIV_HOTBACKUP */ + + mutex_free(&recv_sys->mutex); + + mem_free(recv_sys); + recv_sys = NULL; + } +} + +/********************************************************//** +Frees the recovery system memory. */ +UNIV_INTERN +void +recv_sys_mem_free(void) +/*===================*/ +{ + if (recv_sys != NULL) { + if (recv_sys->addr_hash != NULL) { + hash_table_free(recv_sys->addr_hash); + } + + if (recv_sys->heap != NULL) { + mem_heap_free(recv_sys->heap); + } + + if (recv_sys->buf != NULL) { + ut_free(recv_sys->buf); + } + + if (recv_sys->last_block_buf_start != NULL) { + mem_free(recv_sys->last_block_buf_start); + } + + mem_free(recv_sys); + recv_sys = NULL; + } +} + +#ifndef UNIV_HOTBACKUP +/************************************************************ +Reset the state of the recovery system variables. */ +UNIV_INTERN +void +recv_sys_var_init(void) +/*===================*/ +{ + recv_lsn_checks_on = FALSE; + + recv_n_pool_free_frames = 256; + + recv_recovery_on = FALSE; + +#ifdef UNIV_LOG_ARCHIVE + recv_recovery_from_backup_on = FALSE; +#endif /* UNIV_LOG_ARCHIVE */ + + recv_needed_recovery = FALSE; + + recv_lsn_checks_on = FALSE; + + recv_log_scan_is_startup_type = FALSE; + + recv_no_ibuf_operations = FALSE; + + recv_scan_print_counter = 0; + + recv_previous_parsed_rec_type = 999999; + + recv_previous_parsed_rec_offset = 0; + + recv_previous_parsed_rec_is_multi = 0; + + recv_max_parsed_page_no = 0; + + recv_n_pool_free_frames = 256; + + recv_max_page_lsn = 0; +} + +/******************************************************************//** +recv_writer thread tasked with flushing dirty pages from the buffer +pools. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(recv_writer_thread)( +/*===============================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + ut_ad(!srv_read_only_mode); + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(recv_writer_thread_key); +#endif /* UNIV_PFS_THREAD */ + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "InnoDB: recv_writer thread running, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + recv_writer_thread_active = true; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + + os_thread_sleep(100000); + + mutex_enter(&recv_sys->writer_mutex); + + if (!recv_recovery_on) { + mutex_exit(&recv_sys->writer_mutex); + break; + } + + /* Flush pages from end of LRU if required */ + buf_flush_LRU_tail(); + + mutex_exit(&recv_sys->writer_mutex); + } + + recv_writer_thread_active = false; + + /* We count the number of threads in os_thread_exit(). + A created thread should always use that to exit and not + use return() to exit. */ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************ +Inits the recovery system for a recovery operation. 
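recv_writer_thread above is a plain polling loop: sleep 100 ms, take writer_mutex, exit once recovery has ended, otherwise flush a batch from the LRU tail. The same control shape in portable C++11 (a sketch; the server's own threading and flushing primitives differ):

#include <atomic>
#include <chrono>
#include <mutex>
#include <thread>

static std::atomic<bool> recovery_on(true);
static std::mutex        writer_mutex;

static void writer_thread() {
    for (;;) {
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        std::lock_guard<std::mutex> guard(writer_mutex);
        if (!recovery_on.load()) {
            break;                      /* recovery ended: stop flushing */
        }
        /* ... flush one batch of dirty pages from the LRU tail ... */
    }
}

int main() {
    std::thread t(writer_thread);
    std::this_thread::sleep_for(std::chrono::milliseconds(350));
    recovery_on.store(false);           /* signal the writer to stop */
    t.join();
    return 0;
}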
*/ +UNIV_INTERN +void +recv_sys_init( +/*==========*/ + ulint available_memory) /*!< in: available memory in bytes */ +{ + if (recv_sys->heap != NULL) { + + return; + } + +#ifndef UNIV_HOTBACKUP + /* Initialize red-black tree for fast insertions into the + flush_list during recovery process. + As this initialization is done while holding the buffer pool + mutex we perform it before acquiring recv_sys->mutex. */ + buf_flush_init_flush_rbt(); + + mutex_enter(&(recv_sys->mutex)); + + recv_sys->heap = mem_heap_create_typed(256, + MEM_HEAP_FOR_RECV_SYS); +#else /* !UNIV_HOTBACKUP */ + recv_sys->heap = mem_heap_create(256); + recv_is_from_backup = TRUE; +#endif /* !UNIV_HOTBACKUP */ + + /* Set appropriate value of recv_n_pool_free_frames. */ + if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) { + /* Buffer pool of size greater than 10 MB. */ + recv_n_pool_free_frames = 512; + } + + recv_sys->buf = static_cast<byte*>(ut_malloc(RECV_PARSING_BUF_SIZE)); + recv_sys->len = 0; + recv_sys->recovered_offset = 0; + + recv_sys->addr_hash = hash_create(available_memory / 512); + recv_sys->n_addrs = 0; + + recv_sys->apply_log_recs = FALSE; + recv_sys->apply_batch_on = FALSE; + + recv_sys->last_block_buf_start = static_cast<byte*>( + mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE)); + + recv_sys->last_block = static_cast<byte*>(ut_align( + recv_sys->last_block_buf_start, OS_FILE_LOG_BLOCK_SIZE)); + + recv_sys->found_corrupt_log = FALSE; + + recv_max_page_lsn = 0; + + /* Call the constructor for recv_sys_t::dblwr member */ + new (&recv_sys->dblwr) recv_dblwr_t(); + + mutex_exit(&(recv_sys->mutex)); +} + +/********************************************************//** +Empties the hash table when it has been fully processed. */ +static +void +recv_sys_empty_hash(void) +/*=====================*/ +{ + ut_ad(mutex_own(&(recv_sys->mutex))); + + if (recv_sys->n_addrs != 0) { + fprintf(stderr, + "InnoDB: Error: %lu pages with log records" + " were left unprocessed!\n" + "InnoDB: Maximum page number with" + " log records on it %lu\n", + (ulong) recv_sys->n_addrs, + (ulong) recv_max_parsed_page_no); + ut_error; + } + + hash_table_free(recv_sys->addr_hash); + mem_heap_empty(recv_sys->heap); + + recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512); +} + +#ifndef UNIV_HOTBACKUP +# ifndef UNIV_LOG_DEBUG +/********************************************************//** +Frees the recovery system. */ +static +void +recv_sys_debug_free(void) +/*=====================*/ +{ + mutex_enter(&(recv_sys->mutex)); + + hash_table_free(recv_sys->addr_hash); + mem_heap_free(recv_sys->heap); + ut_free(recv_sys->buf); + mem_free(recv_sys->last_block_buf_start); + + recv_sys->buf = NULL; + recv_sys->heap = NULL; + recv_sys->addr_hash = NULL; + recv_sys->last_block_buf_start = NULL; + + mutex_exit(&(recv_sys->mutex)); + + /* Free up the flush_rbt. */ + buf_flush_free_flush_rbt(); +} +# endif /* UNIV_LOG_DEBUG */ + +# ifdef UNIV_LOG_ARCHIVE +/********************************************************//** +Truncates possible corrupted or extra records from a log group. 
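recv_sys_init above allocates 2 * OS_FILE_LOG_BLOCK_SIZE for last_block_buf_start and keeps a second pointer, last_block, rounded up to a block boundary: the raw pointer is what gets freed, the aligned one is what gets used for block I/O. A sketch of the same power-of-two rounding (illustrative, not InnoDB's ut_align itself):

#include <cstdint>
#include <cstdlib>

static unsigned char* align_up(unsigned char* p, uintptr_t align) {
    uintptr_t v = reinterpret_cast<uintptr_t>(p);
    return reinterpret_cast<unsigned char*>((v + align - 1) & ~(align - 1));
}

int main() {
    const uintptr_t BLOCK = 512;        /* cf. OS_FILE_LOG_BLOCK_SIZE */
    unsigned char* start = static_cast<unsigned char*>(malloc(2 * BLOCK));
    unsigned char* block = align_up(start, BLOCK);
    /* block..block + BLOCK - 1 lies inside the allocation and is aligned */
    free(start);                        /* free the raw pointer, not block */
    return 0;
}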
*/ +static +void +recv_truncate_group( +/*================*/ + log_group_t* group, /*!< in: log group */ + lsn_t recovered_lsn, /*!< in: recovery succeeded up to this + lsn */ + lsn_t limit_lsn, /*!< in: this was the limit for + recovery */ + lsn_t checkpoint_lsn, /*!< in: recovery was started from this + checkpoint */ + lsn_t archived_lsn) /*!< in: the log has been archived up to + this lsn */ +{ + lsn_t start_lsn; + lsn_t end_lsn; + lsn_t finish_lsn1; + lsn_t finish_lsn2; + lsn_t finish_lsn; + + if (archived_lsn == LSN_MAX) { + /* Checkpoint was taken in the NOARCHIVELOG mode */ + archived_lsn = checkpoint_lsn; + } + + finish_lsn1 = ut_uint64_align_down(archived_lsn, + OS_FILE_LOG_BLOCK_SIZE) + + log_group_get_capacity(group); + + finish_lsn2 = ut_uint64_align_up(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE) + + recv_sys->last_log_buf_size; + + if (limit_lsn != LSN_MAX) { + /* We do not know how far we should erase log records: erase + as much as possible */ + + finish_lsn = finish_lsn1; + } else { + /* It is enough to erase the length of the log buffer */ + finish_lsn = finish_lsn1 < finish_lsn2 + ? finish_lsn1 : finish_lsn2; + } + + ut_a(RECV_SCAN_SIZE <= log_sys->buf_size); + + memset(log_sys->buf, 0, RECV_SCAN_SIZE); + + start_lsn = ut_uint64_align_down(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); + + if (start_lsn != recovered_lsn) { + /* Copy the last incomplete log block to the log buffer and + edit its data length: */ + lsn_t diff = recovered_lsn - start_lsn; + + ut_a(diff <= 0xFFFFUL); + + ut_memcpy(log_sys->buf, recv_sys->last_block, + OS_FILE_LOG_BLOCK_SIZE); + log_block_set_data_len(log_sys->buf, (ulint) diff); + } + + if (start_lsn >= finish_lsn) { + + return; + } + + for (;;) { + ulint len; + + end_lsn = start_lsn + RECV_SCAN_SIZE; + + if (end_lsn > finish_lsn) { + + end_lsn = finish_lsn; + } + + len = (ulint) (end_lsn - start_lsn); + + log_group_write_buf(group, log_sys->buf, len, start_lsn, 0); + if (end_lsn >= finish_lsn) { + + return; + } + + memset(log_sys->buf, 0, RECV_SCAN_SIZE); + + start_lsn = end_lsn; + } +} + +/********************************************************//** +Copies the log segment between group->recovered_lsn and recovered_lsn from the +most up-to-date log group to group, so that it contains the latest log data. */ +static +void +recv_copy_group( +/*============*/ + log_group_t* up_to_date_group, /*!< in: the most up-to-date log + group */ + log_group_t* group, /*!< in: copy to this log + group */ + lsn_t recovered_lsn) /*!< in: recovery succeeded up + to this lsn */ +{ + lsn_t start_lsn; + lsn_t end_lsn; + + if (group->scanned_lsn >= recovered_lsn) { + + return; + } + + ut_a(RECV_SCAN_SIZE <= log_sys->buf_size); + + start_lsn = ut_uint64_align_down(group->scanned_lsn, + OS_FILE_LOG_BLOCK_SIZE); + for (;;) { + ulint len; + + end_lsn = start_lsn + RECV_SCAN_SIZE; + + if (end_lsn > recovered_lsn) { + end_lsn = ut_uint64_align_up(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); + } + + log_group_read_log_seg(LOG_RECOVER, log_sys->buf, + up_to_date_group, start_lsn, end_lsn, + FALSE); + + len = (ulint) (end_lsn - start_lsn); + + log_group_write_buf(group, log_sys->buf, len, start_lsn, 0); + + if (end_lsn >= recovered_lsn) { + + return; + } + + start_lsn = end_lsn; + } +} +# endif /* UNIV_LOG_ARCHIVE */ + +/********************************************************//** +Copies a log segment from the most up-to-date log group to the other log +groups, so that they all contain the latest log data. 
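recv_truncate_group and recv_copy_group above both walk an LSN range in RECV_SCAN_SIZE chunks between block-aligned endpoints. A sketch of that alignment and chunking arithmetic, with made-up sizes:

#include <cstdint>
#include <cstdio>

static uint64_t align_down(uint64_t n, uint64_t a) { return n - n % a; }
static uint64_t align_up(uint64_t n, uint64_t a) { return align_down(n + a - 1, a); }

int main() {
    const uint64_t BLOCK = 512;         /* cf. OS_FILE_LOG_BLOCK_SIZE */
    const uint64_t SCAN  = 4 * BLOCK;   /* stand-in for RECV_SCAN_SIZE */
    uint64_t start = align_down(123456, BLOCK);   /* 123392 */
    uint64_t limit = align_up(130000, BLOCK);     /* 130048 */

    while (start < limit) {
        uint64_t end = start + SCAN;
        if (end > limit) {
            end = limit;                /* last, possibly short, chunk */
        }
        printf("write back [%llu, %llu)\n",
               (unsigned long long) start, (unsigned long long) end);
        start = end;
    }
    return 0;
}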
Also writes the info +about the latest checkpoint to the groups, and inits the fields in the group +memory structs to up-to-date values. */ +static +void +recv_synchronize_groups( +/*====================*/ +#ifdef UNIV_LOG_ARCHIVE + log_group_t* up_to_date_group /*!< in: the most up-to-date + log group */ +#endif + ) +{ + lsn_t start_lsn; + lsn_t end_lsn; + lsn_t recovered_lsn; + + recovered_lsn = recv_sys->recovered_lsn; + + /* Read the last recovered log block to the recovery system buffer: + the block is always incomplete */ + + start_lsn = ut_uint64_align_down(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); + end_lsn = ut_uint64_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); + + ut_a(start_lsn != end_lsn); + + log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block, +#ifdef UNIV_LOG_ARCHIVE + up_to_date_group, +#else /* UNIV_LOG_ARCHIVE */ + UT_LIST_GET_FIRST(log_sys->log_groups), +#endif /* UNIV_LOG_ARCHIVE */ + start_lsn, end_lsn, FALSE); + + for (log_group_t* group = UT_LIST_GET_FIRST(log_sys->log_groups); + group; + group = UT_LIST_GET_NEXT(log_groups, group)) { +#ifdef UNIV_LOG_ARCHIVE + if (group != up_to_date_group) { + + /* Copy log data if needed */ + + recv_copy_group(group, up_to_date_group, + recovered_lsn); + } +#endif /* UNIV_LOG_ARCHIVE */ + /* Update the fields in the group struct to correspond to + recovered_lsn */ + + log_group_set_fields(group, recovered_lsn); + } + + /* Copy the checkpoint info to the groups; remember that we have + incremented checkpoint_no by one, and the info will not be written + over the max checkpoint info, thus making the preservation of max + checkpoint info on disk certain */ + + log_groups_write_checkpoint_info(); + + mutex_exit(&(log_sys->mutex)); + + /* Wait for the checkpoint write to complete */ + rw_lock_s_lock(&(log_sys->checkpoint_lock)); + rw_lock_s_unlock(&(log_sys->checkpoint_lock)); + + mutex_enter(&(log_sys->mutex)); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +Checks the consistency of the checkpoint info +@return TRUE if ok */ +static +ibool +recv_check_cp_is_consistent( +/*========================*/ + const byte* buf) /*!< in: buffer containing checkpoint info */ +{ + ulint fold; + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + + if ((fold & 0xFFFFFFFFUL) != mach_read_from_4( + buf + LOG_CHECKPOINT_CHECKSUM_1)) { + return(FALSE); + } + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + + if ((fold & 0xFFFFFFFFUL) != mach_read_from_4( + buf + LOG_CHECKPOINT_CHECKSUM_2)) { + return(FALSE); + } + + return(TRUE); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Looks for the maximum consistent checkpoint from the log groups. 
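recv_check_cp_is_consistent above accepts a checkpoint slot only when two independently folded checksums, computed over two separate byte ranges of the slot, both match their stored values. A sketch of the shape with an invented layout and a stand-in hash (ut_fold_binary is a different function, and the real fields are read big-endian):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t demo_fold(const unsigned char* p, size_t len) {
    uint32_t h = 5381;                  /* djb2, purely illustrative */
    while (len--) h = h * 33 + *p++;
    return h;
}

int main() {
    unsigned char cp[64] = {1, 2, 3, 4, 5, 6, 7, 8};
    /* writer: two checksums over two regions, stored outside both */
    uint32_t c1 = demo_fold(cp, 24);        /* cf. CHECKSUM_1 region */
    uint32_t c2 = demo_fold(cp + 24, 16);   /* cf. CHECKSUM_2 region */
    memcpy(cp + 40, &c1, 4);
    memcpy(cp + 44, &c2, 4);

    /* reader: the slot counts only if BOTH stored checksums match */
    uint32_t s1, s2;
    memcpy(&s1, cp + 40, 4);
    memcpy(&s2, cp + 44, 4);
    assert(demo_fold(cp, 24) == s1 && demo_fold(cp + 24, 16) == s2);
    return 0;
}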
+@return error code or DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +recv_find_max_checkpoint( +/*=====================*/ + log_group_t** max_group, /*!< out: max group */ + ulint* max_field) /*!< out: LOG_CHECKPOINT_1 or + LOG_CHECKPOINT_2 */ +{ + log_group_t* group; + ib_uint64_t max_no; + ib_uint64_t checkpoint_no; + ulint field; + byte* buf; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + max_no = 0; + *max_group = NULL; + *max_field = 0; + + buf = log_sys->checkpoint_buf; + + while (group) { + group->state = LOG_GROUP_CORRUPTED; + + for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2; + field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) { + + log_group_read_checkpoint_info(group, field); + + if (!recv_check_cp_is_consistent(buf)) { +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Checkpoint in group" + " %lu at %lu invalid, %lu\n", + (ulong) group->id, + (ulong) field, + (ulong) mach_read_from_4( + buf + + LOG_CHECKPOINT_CHECKSUM_1)); + + } +#endif /* UNIV_DEBUG */ + goto not_consistent; + } + + group->state = LOG_GROUP_OK; + + group->lsn = mach_read_from_8( + buf + LOG_CHECKPOINT_LSN); + group->lsn_offset = mach_read_from_4( + buf + LOG_CHECKPOINT_OFFSET_LOW32); + group->lsn_offset |= ((lsn_t) mach_read_from_4( + buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32; + checkpoint_no = mach_read_from_8( + buf + LOG_CHECKPOINT_NO); + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Checkpoint number %lu" + " found in group %lu\n", + (ulong) checkpoint_no, + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + + if (checkpoint_no >= max_no) { + *max_group = group; + *max_field = field; + max_no = checkpoint_no; + } + +not_consistent: + ; + } + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + if (*max_group == NULL) { + + fprintf(stderr, + "InnoDB: No valid checkpoint found.\n" + "InnoDB: If this error appears when you are" + " creating an InnoDB database,\n" + "InnoDB: the problem may be that during" + " an earlier attempt you managed\n" + "InnoDB: to create the InnoDB data files," + " but log file creation failed.\n" + "InnoDB: If that is the case, please refer to\n" + "InnoDB: " REFMAN "error-creating-innodb.html\n"); + return(DB_ERROR); + } + + return(DB_SUCCESS); +} +#else /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Reads the checkpoint info needed in hot backup. 
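recv_find_max_checkpoint above reads the two fixed checkpoint slots of each group; since checkpoints alternate between the slots, a torn write can corrupt at most one of them, and the consistent slot with the largest checkpoint number wins. It also reassembles the 64-bit file offset from two stored 32-bit words. Both ideas in isolation (slot layout invented):

#include <cassert>
#include <cstdint>

struct slot { bool valid; uint64_t checkpoint_no; };

static int pick_max_checkpoint(const slot s[2]) {
    int best = -1;
    uint64_t max_no = 0;
    for (int i = 0; i < 2; i++) {
        if (s[i].valid && s[i].checkpoint_no >= max_no) {
            best = i;                   /* later slot wins ties, as above */
            max_no = s[i].checkpoint_no;
        }
    }
    return best;                        /* -1: no valid checkpoint at all */
}

static uint64_t offset_from_words(uint32_t low32, uint32_t high32) {
    return (uint64_t) low32 | ((uint64_t) high32 << 32);
}

int main() {
    slot s[2] = { {true, 41}, {true, 42} };
    assert(pick_max_checkpoint(s) == 1);
    assert(offset_from_words(0x10u, 0x2u) == 0x200000010ULL);
    return 0;
}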
+@return TRUE if success */ +UNIV_INTERN +ibool +recv_read_checkpoint_info_for_backup( +/*=================================*/ + const byte* hdr, /*!< in: buffer containing the log group + header */ + lsn_t* lsn, /*!< out: checkpoint lsn */ + lsn_t* offset, /*!< out: checkpoint offset in the log group */ + lsn_t* cp_no, /*!< out: checkpoint number */ + lsn_t* first_header_lsn) + /*!< out: lsn of of the start of the + first log file */ +{ + ulint max_cp = 0; + ib_uint64_t max_cp_no = 0; + const byte* cp_buf; + + cp_buf = hdr + LOG_CHECKPOINT_1; + + if (recv_check_cp_is_consistent(cp_buf)) { + max_cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO); + max_cp = LOG_CHECKPOINT_1; + } + + cp_buf = hdr + LOG_CHECKPOINT_2; + + if (recv_check_cp_is_consistent(cp_buf)) { + if (mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO) > max_cp_no) { + max_cp = LOG_CHECKPOINT_2; + } + } + + if (max_cp == 0) { + return(FALSE); + } + + cp_buf = hdr + max_cp; + + *lsn = mach_read_from_8(cp_buf + LOG_CHECKPOINT_LSN); + *offset = mach_read_from_4( + cp_buf + LOG_CHECKPOINT_OFFSET_LOW32); + *offset |= ((lsn_t) mach_read_from_4( + cp_buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32; + + *cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO); + + *first_header_lsn = mach_read_from_8(hdr + LOG_FILE_START_LSN); + + return(TRUE); +} +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************//** +Checks the 4-byte checksum to the trailer checksum field of a log +block. We also accept a log block in the old format before +InnoDB-3.23.52 where the checksum field contains the log block number. +@return TRUE if ok, or if the log block may be in the format of InnoDB +version predating 3.23.52 */ +UNIV_INTERN +ibool +log_block_checksum_is_ok_or_old_format( +/*===================================*/ + const byte* block) /*!< in: pointer to a log block */ +{ +#ifdef UNIV_LOG_DEBUG + return(TRUE); +#endif /* UNIV_LOG_DEBUG */ + + ulint block_checksum = log_block_get_checksum(block); + + if (UNIV_LIKELY(srv_log_checksum_algorithm == + SRV_CHECKSUM_ALGORITHM_NONE || + log_block_calc_checksum(block) == block_checksum)) { + + return(TRUE); + } + + if (srv_log_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32 || + srv_log_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB || + srv_log_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) { + + const char* algo = NULL; + + ib_logf(IB_LOG_LEVEL_ERROR, + "log block checksum mismatch: expected " ULINTPF ", " + "calculated checksum " ULINTPF, + block_checksum, + log_block_calc_checksum(block)); + + if (block_checksum == LOG_NO_CHECKSUM_MAGIC) { + + algo = "none"; + } else if (block_checksum == + log_block_calc_checksum_crc32(block)) { + + algo = "crc32"; + } else if (block_checksum == + log_block_calc_checksum_innodb(block)) { + + algo = "innodb"; + } + + if (algo) { + + const char* current_algo; + + current_algo = buf_checksum_algorithm_name( + (srv_checksum_algorithm_t) + srv_log_checksum_algorithm); + + ib_logf(IB_LOG_LEVEL_ERROR, + "current InnoDB log checksum type: %s, " + "detected log checksum type: %s", + current_algo, + algo); + } + + ib_logf(IB_LOG_LEVEL_FATAL, + "STRICT method was specified for innodb_log_checksum, " + "so we intentionally assert here."); + } + + ut_ad(srv_log_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_CRC32 || + srv_log_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB); + + if (block_checksum == LOG_NO_CHECKSUM_MAGIC || + block_checksum == log_block_calc_checksum_crc32(block) || + block_checksum == 
log_block_calc_checksum_innodb(block)) { + + return(TRUE); + } + + if (log_block_get_hdr_no(block) == block_checksum) { + + /* We assume the log block is in the format of + InnoDB version < 3.23.52 and the block is ok */ +#if 0 + fprintf(stderr, + "InnoDB: Scanned old format < InnoDB-3.23.52" + " log block number %lu\n", + log_block_get_hdr_no(block)); +#endif + return(TRUE); + } + + return(FALSE); +} + +#ifdef UNIV_HOTBACKUP +/*******************************************************************//** +Scans the log segment and n_bytes_scanned is set to the length of valid +log scanned. */ +UNIV_INTERN +void +recv_scan_log_seg_for_backup( +/*=========================*/ + byte* buf, /*!< in: buffer containing log data */ + ulint buf_len, /*!< in: data length in that buffer */ + lsn_t* scanned_lsn, /*!< in/out: lsn of buffer start, + we return scanned lsn */ + ulint* scanned_checkpoint_no, + /*!< in/out: 4 lowest bytes of the + highest scanned checkpoint number so + far */ + ulint* n_bytes_scanned)/*!< out: how much we were able to + scan, smaller than buf_len if log + data ended here */ +{ + ulint data_len; + byte* log_block; + ulint no; + + *n_bytes_scanned = 0; + + for (log_block = buf; log_block < buf + buf_len; + log_block += OS_FILE_LOG_BLOCK_SIZE) { + + no = log_block_get_hdr_no(log_block); + +#if 0 + fprintf(stderr, "Log block header no %lu\n", no); +#endif + + if (no != log_block_convert_lsn_to_no(*scanned_lsn) + || !log_block_checksum_is_ok_or_old_format(log_block)) { +#if 0 + fprintf(stderr, + "Log block n:o %lu, scanned lsn n:o %lu\n", + no, log_block_convert_lsn_to_no(*scanned_lsn)); +#endif + /* Garbage or an incompletely written log block */ + + log_block += OS_FILE_LOG_BLOCK_SIZE; +#if 0 + fprintf(stderr, + "Next log block n:o %lu\n", + log_block_get_hdr_no(log_block)); +#endif + break; + } + + if (*scanned_checkpoint_no > 0 + && log_block_get_checkpoint_no(log_block) + < *scanned_checkpoint_no + && *scanned_checkpoint_no + - log_block_get_checkpoint_no(log_block) + > 0x80000000UL) { + + /* Garbage from a log buffer flush which was made + before the most recent database recovery */ +#if 0 + fprintf(stderr, + "Scanned cp n:o %lu, block cp n:o %lu\n", + *scanned_checkpoint_no, + log_block_get_checkpoint_no(log_block)); +#endif + break; + } + + data_len = log_block_get_data_len(log_block); + + *scanned_checkpoint_no + = log_block_get_checkpoint_no(log_block); + *scanned_lsn += data_len; + + *n_bytes_scanned += data_len; + + if (data_len < OS_FILE_LOG_BLOCK_SIZE) { + /* Log data ends here */ + +#if 0 + fprintf(stderr, "Log block data len %lu\n", + data_len); +#endif + break; + } + } +} +#endif /* UNIV_HOTBACKUP */ + +/*******************************************************************//** +Tries to parse a single log record body and also applies it to a page if +specified. File ops are parsed, but not applied in this function. 
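recv_scan_log_seg_for_backup above keeps only the low 32 bits of the checkpoint number per block, so it uses serial-number arithmetic: a block whose number is below the highest one seen by more than 2^31 is treated as stale garbage flushed before the previous recovery, while small decreases are tolerated as wraparound. The test in isolation:

#include <cassert>
#include <cstdint>

static bool block_is_stale(uint32_t scanned_cp_no, uint32_t block_cp_no) {
    return scanned_cp_no > 0
        && block_cp_no < scanned_cp_no
        && scanned_cp_no - block_cp_no > 0x80000000UL;
}

int main() {
    assert(!block_is_stale(100, 99));         /* small step back: fine */
    assert( block_is_stale(0xF0000000u, 5));  /* huge gap: older epoch */
    return 0;
}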
+@return log record end, NULL if not a complete record */ +static +byte* +recv_parse_or_apply_log_rec_body( +/*=============================*/ + byte type, /*!< in: type */ + byte* ptr, /*!< in: pointer to a buffer */ + byte* end_ptr,/*!< in: pointer to the buffer end */ + buf_block_t* block, /*!< in/out: buffer block or NULL; if + not NULL, then the log record is + applied to the page, and the log + record should be complete then */ + mtr_t* mtr, /*!< in: mtr or NULL; should be non-NULL + if and only if block is non-NULL */ + ulint space_id) + /*!< in: tablespace id obtained by + parsing initial log record */ +{ + dict_index_t* index = NULL; + page_t* page; + page_zip_des_t* page_zip; +#ifdef UNIV_DEBUG + ulint page_type; +#endif /* UNIV_DEBUG */ + + ut_ad(!block == !mtr); + + if (block) { + page = block->frame; + page_zip = buf_block_get_page_zip(block); + ut_d(page_type = fil_page_get_type(page)); + } else { + page = NULL; + page_zip = NULL; + ut_d(page_type = FIL_PAGE_TYPE_ALLOCATED); + } + + switch (type) { +#ifdef UNIV_LOG_LSN_DEBUG + case MLOG_LSN: + /* The LSN is checked in recv_parse_log_rec(). */ + break; +#endif /* UNIV_LOG_LSN_DEBUG */ + case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES: +#ifdef UNIV_DEBUG + if (page && page_type == FIL_PAGE_TYPE_ALLOCATED + && end_ptr >= ptr + 2) { + /* It is OK to set FIL_PAGE_TYPE and certain + list node fields on an empty page. Any other + write is not OK. */ + + /* NOTE: There may be bogus assertion failures for + dict_hdr_create(), trx_rseg_header_create(), + trx_sys_create_doublewrite_buf(), and + trx_sysf_create(). + These are only called during database creation. */ + ulint offs = mach_read_from_2(ptr); + + switch (type) { + default: + ut_error; + case MLOG_2BYTES: + /* Note that this can fail when the + redo log been written with something + older than InnoDB Plugin 1.0.4. */ + ut_ad(offs == FIL_PAGE_TYPE + || offs == IBUF_TREE_SEG_HEADER + + IBUF_HEADER + FSEG_HDR_OFFSET + || offs == PAGE_BTR_IBUF_FREE_LIST + + PAGE_HEADER + FIL_ADDR_BYTE + || offs == PAGE_BTR_IBUF_FREE_LIST + + PAGE_HEADER + FIL_ADDR_BYTE + + FIL_ADDR_SIZE + || offs == PAGE_BTR_SEG_LEAF + + PAGE_HEADER + FSEG_HDR_OFFSET + || offs == PAGE_BTR_SEG_TOP + + PAGE_HEADER + FSEG_HDR_OFFSET + || offs == PAGE_BTR_IBUF_FREE_LIST_NODE + + PAGE_HEADER + FIL_ADDR_BYTE + + 0 /*FLST_PREV*/ + || offs == PAGE_BTR_IBUF_FREE_LIST_NODE + + PAGE_HEADER + FIL_ADDR_BYTE + + FIL_ADDR_SIZE /*FLST_NEXT*/); + break; + case MLOG_4BYTES: + /* Note that this can fail when the + redo log been written with something + older than InnoDB Plugin 1.0.4. 
*/ + ut_ad(0 + || offs == IBUF_TREE_SEG_HEADER + + IBUF_HEADER + FSEG_HDR_SPACE + || offs == IBUF_TREE_SEG_HEADER + + IBUF_HEADER + FSEG_HDR_PAGE_NO + || offs == PAGE_BTR_IBUF_FREE_LIST + + PAGE_HEADER/* flst_init */ + || offs == PAGE_BTR_IBUF_FREE_LIST + + PAGE_HEADER + FIL_ADDR_PAGE + || offs == PAGE_BTR_IBUF_FREE_LIST + + PAGE_HEADER + FIL_ADDR_PAGE + + FIL_ADDR_SIZE + || offs == PAGE_BTR_SEG_LEAF + + PAGE_HEADER + FSEG_HDR_PAGE_NO + || offs == PAGE_BTR_SEG_LEAF + + PAGE_HEADER + FSEG_HDR_SPACE + || offs == PAGE_BTR_SEG_TOP + + PAGE_HEADER + FSEG_HDR_PAGE_NO + || offs == PAGE_BTR_SEG_TOP + + PAGE_HEADER + FSEG_HDR_SPACE + || offs == PAGE_BTR_IBUF_FREE_LIST_NODE + + PAGE_HEADER + FIL_ADDR_PAGE + + 0 /*FLST_PREV*/ + || offs == PAGE_BTR_IBUF_FREE_LIST_NODE + + PAGE_HEADER + FIL_ADDR_PAGE + + FIL_ADDR_SIZE /*FLST_NEXT*/); + break; + } + } +#endif /* UNIV_DEBUG */ + ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip); + break; + case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_INSERT, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, + block, index, mtr); + } + break; + case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_CLUST_DELETE_MARK, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = btr_cur_parse_del_mark_set_clust_rec( + ptr, end_ptr, page, page_zip, index); + } + break; + case MLOG_COMP_REC_SEC_DELETE_MARK: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + /* This log record type is obsolete, but we process it for + backward compatibility with MySQL 5.0.3 and 5.0.4. 
*/ + ut_a(!page || page_is_comp(page)); + ut_a(!page_zip); + ptr = mlog_parse_index(ptr, end_ptr, TRUE, &index); + if (!ptr) { + break; + } + /* Fall through */ + case MLOG_REC_SEC_DELETE_MARK: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, + page, page_zip); + break; + case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_UPDATE_IN_PLACE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page, + page_zip, index); + } + break; + case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE: + case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_parse_delete_rec_list(type, ptr, end_ptr, + block, index, mtr); + } + break; + case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_LIST_END_COPY_CREATED, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_parse_copy_rec_list_to_created_page( + ptr, end_ptr, block, index, mtr); + } + break; + case MLOG_PAGE_REORGANIZE: + case MLOG_COMP_PAGE_REORGANIZE: + case MLOG_ZIP_PAGE_REORGANIZE: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type != MLOG_PAGE_REORGANIZE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = btr_parse_page_reorganize( + ptr, end_ptr, index, + type == MLOG_ZIP_PAGE_REORGANIZE, + block, mtr); + } + break; + case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE: + /* Allow anything in page_type when creating a page. */ + ut_a(!page_zip); + ptr = page_parse_create(ptr, end_ptr, + type == MLOG_COMP_PAGE_CREATE, + block, mtr); + break; + case MLOG_UNDO_INSERT: + ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG); + ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); + break; + case MLOG_UNDO_ERASE_END: + ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG); + ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_INIT: + /* Allow anything in page_type when creating a page. */ + ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_DISCARD: + ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG); + ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_CREATE: + case MLOG_UNDO_HDR_REUSE: + ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG); + ptr = trx_undo_parse_page_header(type, ptr, end_ptr, + page, mtr); + break; + case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + /* On a compressed page, MLOG_COMP_REC_MIN_MARK + will be followed by MLOG_COMP_REC_DELETE + or MLOG_ZIP_WRITE_HEADER(FIL_PAGE_PREV, FIL_NULL) + in the same mini-transaction. 
*/ + ut_a(type == MLOG_COMP_REC_MIN_MARK || !page_zip); + ptr = btr_parse_set_min_rec_mark( + ptr, end_ptr, type == MLOG_COMP_REC_MIN_MARK, + page, mtr); + break; + case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_DELETE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_cur_parse_delete_rec(ptr, end_ptr, + block, index, mtr); + } + break; + case MLOG_IBUF_BITMAP_INIT: + /* Allow anything in page_type when creating a page. */ + ptr = ibuf_parse_bitmap_init(ptr, end_ptr, block, mtr); + break; + case MLOG_INIT_FILE_PAGE: + /* Allow anything in page_type when creating a page. */ + ptr = fsp_parse_init_file_page(ptr, end_ptr, block); + break; + case MLOG_WRITE_STRING: + ut_ad(!page || page_type != FIL_PAGE_TYPE_ALLOCATED); + ptr = mlog_parse_string(ptr, end_ptr, page, page_zip); + break; + case MLOG_FILE_RENAME: + /* Do not rerun file-based log entries if this is + IO completion from a page read. */ + if (page == NULL) { + ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, + (recv_recovery_is_on() + ? space_id : 0), 0); + } + break; + case MLOG_FILE_CREATE: + case MLOG_FILE_DELETE: + case MLOG_FILE_CREATE2: + /* Do not rerun file-based log entries if this is + IO completion from a page read. */ + if (page == NULL) { + ptr = fil_op_log_parse_or_replay(ptr, end_ptr, + type, 0, 0); + } + break; + case MLOG_ZIP_WRITE_NODE_PTR: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + ptr = page_zip_parse_write_node_ptr(ptr, end_ptr, + page, page_zip); + break; + case MLOG_ZIP_WRITE_BLOB_PTR: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + ptr = page_zip_parse_write_blob_ptr(ptr, end_ptr, + page, page_zip); + break; + case MLOG_ZIP_WRITE_HEADER: + ut_ad(!page || page_type == FIL_PAGE_INDEX); + ptr = page_zip_parse_write_header(ptr, end_ptr, + page, page_zip); + break; + case MLOG_ZIP_PAGE_COMPRESS: + /* Allow anything in page_type when creating a page. */ + ptr = page_zip_parse_compress(ptr, end_ptr, + page, page_zip); + break; + case MLOG_ZIP_PAGE_COMPRESS_NO_DATA: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, TRUE, &index))) { + + ut_a(!page || ((ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table))); + ptr = page_zip_parse_compress_no_data( + ptr, end_ptr, page, page_zip, index); + } + break; + default: + ptr = NULL; + recv_sys->found_corrupt_log = TRUE; + } + + if (index) { + dict_table_t* table = index->table; + + dict_mem_index_free(index); + dict_mem_table_free(table); + } + + return(ptr); +} + +/*********************************************************************//** +Calculates the fold value of a page file address: used in inserting or +searching for a log record in the hash table. +@return folded value */ +UNIV_INLINE +ulint +recv_fold( +/*======*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + return(ut_fold_ulint_pair(space, page_no)); +} + +/*********************************************************************//** +Calculates the hash value of a page file address: used in inserting or +searching for a log record in the hash table. 
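recv_fold above keys the hash table by the (space id, page number) pair: fold the pair into a single integer, then recv_hash below reduces it to a cell index. A sketch with a generic mixing step (ut_fold_ulint_pair uses its own mixing, this constant is illustrative):

#include <cstddef>

static size_t fold_pair(size_t space, size_t page_no) {
    return space * 1000003u + page_no;  /* any decent mixing step works */
}

static size_t hash_cell(size_t space, size_t page_no, size_t n_cells) {
    return fold_pair(space, page_no) % n_cells;
}

int main() {
    return hash_cell(5 /* space */, 73 /* page */, 1024) < 1024 ? 0 : 1;
}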
+@return folded value */ +UNIV_INLINE +ulint +recv_hash( +/*======*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + return(hash_calc_hash(recv_fold(space, page_no), recv_sys->addr_hash)); +} + +/*********************************************************************//** +Gets the hashed file address struct for a page. +@return file address struct, NULL if not found from the hash table */ +static +recv_addr_t* +recv_get_fil_addr_struct( +/*=====================*/ + ulint space, /*!< in: space id */ + ulint page_no)/*!< in: page number */ +{ + recv_addr_t* recv_addr; + + for (recv_addr = static_cast<recv_addr_t*>( + HASH_GET_FIRST(recv_sys->addr_hash, + recv_hash(space, page_no))); + recv_addr != 0; + recv_addr = static_cast<recv_addr_t*>( + HASH_GET_NEXT(addr_hash, recv_addr))) { + + if (recv_addr->space == space + && recv_addr->page_no == page_no) { + + return(recv_addr); + } + } + + return(NULL); +} + +/*******************************************************************//** +Adds a new log record to the hash table of log records. */ +static +void +recv_add_to_hash_table( +/*===================*/ + byte type, /*!< in: log record type */ + ulint space, /*!< in: space id */ + ulint page_no, /*!< in: page number */ + byte* body, /*!< in: log record body */ + byte* rec_end, /*!< in: log record end */ + lsn_t start_lsn, /*!< in: start lsn of the mtr */ + lsn_t end_lsn) /*!< in: end lsn of the mtr */ +{ + recv_t* recv; + ulint len; + recv_data_t* recv_data; + recv_data_t** prev_field; + recv_addr_t* recv_addr; + + if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) { + /* The tablespace does not exist any more: do not store the + log record */ + + return; + } + + len = rec_end - body; + + recv = static_cast<recv_t*>( + mem_heap_alloc(recv_sys->heap, sizeof(recv_t))); + + recv->type = type; + recv->len = rec_end - body; + recv->start_lsn = start_lsn; + recv->end_lsn = end_lsn; + + recv_addr = recv_get_fil_addr_struct(space, page_no); + + if (recv_addr == NULL) { + recv_addr = static_cast<recv_addr_t*>( + mem_heap_alloc(recv_sys->heap, sizeof(recv_addr_t))); + + recv_addr->space = space; + recv_addr->page_no = page_no; + recv_addr->state = RECV_NOT_PROCESSED; + + UT_LIST_INIT(recv_addr->rec_list); + + HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash, + recv_fold(space, page_no), recv_addr); + recv_sys->n_addrs++; +#if 0 + fprintf(stderr, "Inserting log rec for space %lu, page %lu\n", + space, page_no); +#endif + } + + UT_LIST_ADD_LAST(rec_list, recv_addr->rec_list, recv); + + prev_field = &(recv->data); + + /* Store the log record body in chunks of less than UNIV_PAGE_SIZE: + recv_sys->heap grows into the buffer pool, and bigger chunks could not + be allocated */ + + while (rec_end > body) { + + len = rec_end - body; + + if (len > RECV_DATA_BLOCK_SIZE) { + len = RECV_DATA_BLOCK_SIZE; + } + + recv_data = static_cast<recv_data_t*>( + mem_heap_alloc(recv_sys->heap, + sizeof(recv_data_t) + len)); + + *prev_field = recv_data; + + memcpy(recv_data + 1, body, len); + + prev_field = &(recv_data->next); + + body += len; + } + + *prev_field = NULL; +} + +/*********************************************************************//** +Copies the log record body from recv to buf. 
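recv_add_to_hash_table above stores a record body as a singly linked list of chunks of at most RECV_DATA_BLOCK_SIZE bytes, appended through a pointer-to-pointer, and recv_data_copy_to_buf below reassembles the list into a flat buffer. Both halves as a standalone sketch with plain malloc (chunk size invented):

#include <cstdlib>
#include <cstring>

enum { CHUNK = 64 };                    /* stand-in for RECV_DATA_BLOCK_SIZE */

struct chunk_t { chunk_t* next; size_t len; unsigned char data[CHUNK]; };

static chunk_t* store_chunked(const unsigned char* body, size_t len) {
    chunk_t*  head = NULL;
    chunk_t** prev = &head;             /* append via pointer-to-pointer */

    while (len > 0) {
        size_t n = len > CHUNK ? (size_t) CHUNK : len;
        chunk_t* c = static_cast<chunk_t*>(malloc(sizeof(chunk_t)));
        c->len = n;
        memcpy(c->data, body, n);
        *prev = c;
        prev  = &c->next;
        body += n;
        len  -= n;
    }
    *prev = NULL;                       /* terminate the list */
    return head;
}

static void copy_to_buf(unsigned char* buf, const chunk_t* c) {
    for (; c != NULL; c = c->next) {
        memcpy(buf, c->data, c->len);
        buf += c->len;
    }
}

int main() {
    unsigned char body[150], out[150];
    for (int i = 0; i < 150; i++) body[i] = (unsigned char) i;

    chunk_t* list = store_chunked(body, sizeof(body));
    copy_to_buf(out, list);

    while (list != NULL) {              /* free the chunk list */
        chunk_t* next = list->next;
        free(list);
        list = next;
    }
    return memcmp(body, out, sizeof(body)) ? 1 : 0;   /* 0: round trip ok */
}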
*/ +static +void +recv_data_copy_to_buf( +/*==================*/ + byte* buf, /*!< in: buffer of length at least recv->len */ + recv_t* recv) /*!< in: log record */ +{ + recv_data_t* recv_data; + ulint part_len; + ulint len; + + len = recv->len; + recv_data = recv->data; + + while (len > 0) { + if (len > RECV_DATA_BLOCK_SIZE) { + part_len = RECV_DATA_BLOCK_SIZE; + } else { + part_len = len; + } + + ut_memcpy(buf, ((byte*) recv_data) + sizeof(recv_data_t), + part_len); + buf += part_len; + len -= part_len; + + recv_data = recv_data->next; + } +} + +/************************************************************************//** +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. */ +UNIV_INTERN +void +recv_recover_page_func( +/*===================*/ +#ifndef UNIV_HOTBACKUP + ibool just_read_in, + /*!< in: TRUE if the i/o handler calls + this for a freshly read page */ +#endif /* !UNIV_HOTBACKUP */ + buf_block_t* block) /*!< in/out: buffer block */ +{ + page_t* page; + page_zip_des_t* page_zip; + recv_addr_t* recv_addr; + recv_t* recv; + byte* buf; + lsn_t start_lsn; + lsn_t end_lsn; + lsn_t page_lsn; + lsn_t page_newest_lsn; + ibool modification_to_page; +#ifndef UNIV_HOTBACKUP + ibool success; +#endif /* !UNIV_HOTBACKUP */ + mtr_t mtr; + + mutex_enter(&(recv_sys->mutex)); + + if (recv_sys->apply_log_recs == FALSE) { + + /* Log records should not be applied now */ + + mutex_exit(&(recv_sys->mutex)); + + return; + } + + recv_addr = recv_get_fil_addr_struct(buf_block_get_space(block), + buf_block_get_page_no(block)); + + if ((recv_addr == NULL) + /* bugfix: http://bugs.mysql.com/bug.php?id=44140 */ + || (recv_addr->state == RECV_BEING_READ && !just_read_in) + || (recv_addr->state == RECV_BEING_PROCESSED) + || (recv_addr->state == RECV_PROCESSED)) { + + mutex_exit(&(recv_sys->mutex)); + + return; + } + +#if 0 + fprintf(stderr, "Recovering space %lu, page %lu\n", + buf_block_get_space(block), buf_block_get_page_no(block)); +#endif + + recv_addr->state = RECV_BEING_PROCESSED; + + mutex_exit(&(recv_sys->mutex)); + + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NONE); + + page = block->frame; + page_zip = buf_block_get_page_zip(block); + +#ifndef UNIV_HOTBACKUP + if (just_read_in) { + /* Move the ownership of the x-latch on the page to + this OS thread, so that we can acquire a second + x-latch on it. This is needed for the operations to + the page to pass the debug checks. 
*/ + + rw_lock_x_lock_move_ownership(&block->lock); + } + + success = buf_page_get_known_nowait(RW_X_LATCH, block, + BUF_KEEP_OLD, + __FILE__, __LINE__, + &mtr); + ut_a(success); + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); +#endif /* !UNIV_HOTBACKUP */ + + /* Read the newest modification lsn from the page */ + page_lsn = mach_read_from_8(page + FIL_PAGE_LSN); + +#ifndef UNIV_HOTBACKUP + /* It may be that the page has been modified in the buffer + pool: read the newest modification lsn there */ + + page_newest_lsn = buf_page_get_newest_modification(&block->page); + + if (page_newest_lsn) { + + page_lsn = page_newest_lsn; + } +#else /* !UNIV_HOTBACKUP */ + /* In recovery from a backup we do not really use the buffer pool */ + page_newest_lsn = 0; +#endif /* !UNIV_HOTBACKUP */ + + modification_to_page = FALSE; + start_lsn = end_lsn = 0; + + recv = UT_LIST_GET_FIRST(recv_addr->rec_list); + + while (recv) { + end_lsn = recv->end_lsn; + + if (recv->len > RECV_DATA_BLOCK_SIZE) { + /* We have to copy the record body to a separate + buffer */ + + buf = static_cast<byte*>(mem_alloc(recv->len)); + + recv_data_copy_to_buf(buf, recv); + } else { + buf = ((byte*)(recv->data)) + sizeof(recv_data_t); + } + + if (recv->type == MLOG_INIT_FILE_PAGE) { + page_lsn = page_newest_lsn; + + memset(FIL_PAGE_LSN + page, 0, 8); + memset(UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + + page, 0, 8); + + if (page_zip) { + memset(FIL_PAGE_LSN + page_zip->data, 0, 8); + } + } + + if (recv->start_lsn >= page_lsn) { + + lsn_t end_lsn; + + if (!modification_to_page) { + + modification_to_page = TRUE; + start_lsn = recv->start_lsn; + } + + DBUG_PRINT("ib_log", + ("apply " DBUG_LSN_PF ": %u len %u " + "page %u:%u", recv->start_lsn, + (unsigned) recv->type, + (unsigned) recv->len, + (unsigned) recv_addr->space, + (unsigned) recv_addr->page_no)); + + recv_parse_or_apply_log_rec_body(recv->type, buf, + buf + recv->len, + block, &mtr, + recv_addr->space); + + end_lsn = recv->start_lsn + recv->len; + mach_write_to_8(FIL_PAGE_LSN + page, end_lsn); + mach_write_to_8(UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + + page, end_lsn); + + if (page_zip) { + mach_write_to_8(FIL_PAGE_LSN + + page_zip->data, end_lsn); + } + } + + if (recv->len > RECV_DATA_BLOCK_SIZE) { + mem_free(buf); + } + + recv = UT_LIST_GET_NEXT(rec_list, recv); + } + +#ifdef UNIV_ZIP_DEBUG + if (fil_page_get_type(page) == FIL_PAGE_INDEX) { + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + ut_a(!page_zip + || page_zip_validate_low(page_zip, page, NULL, FALSE)); + } +#endif /* UNIV_ZIP_DEBUG */ + +#ifndef UNIV_HOTBACKUP + if (modification_to_page) { + ut_a(block); + + log_flush_order_mutex_enter(); + buf_flush_recv_note_modification(block, start_lsn, end_lsn); + log_flush_order_mutex_exit(); + } +#endif /* !UNIV_HOTBACKUP */ + + /* Make sure that committing mtr does not change the modification + lsn values of page */ + + mtr.modifications = FALSE; + + mtr_commit(&mtr); + + mutex_enter(&(recv_sys->mutex)); + + if (recv_max_page_lsn < page_lsn) { + recv_max_page_lsn = page_lsn; + } + + recv_addr->state = RECV_PROCESSED; + + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + + mutex_exit(&(recv_sys->mutex)); + +} + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Reads in pages which have hashed log records, from an area around a given +page number. 
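The core replay rule in recv_recover_page_func above: a record is applied only when its start LSN is at or past the page's current LSN, and applying it stamps the page with the record's end LSN, so replaying the same record list twice is harmless. A rough sketch of that idempotent loop (the real code also consults the buffer-pool's newest-modification LSN):

#include <cstdint>
#include <vector>

struct rec { uint64_t start_lsn; uint64_t end_lsn; /* + body */ };

static void replay(uint64_t& page_lsn, const std::vector<rec>& recs) {
    for (size_t i = 0; i < recs.size(); i++) {
        if (recs[i].start_lsn >= page_lsn) {
            /* ... apply the logged change to the page image here ... */
            page_lsn = recs[i].end_lsn; /* stamping makes replay idempotent */
        }
    }
}

int main() {
    std::vector<rec> recs;
    rec r1 = {100, 140};
    rec r2 = {140, 200};
    recs.push_back(r1);
    recs.push_back(r2);

    uint64_t page_lsn = 120;            /* page already has r1's change */
    replay(page_lsn, recs);             /* skips r1, applies r2 */
    replay(page_lsn, recs);             /* second run is a no-op */
    return page_lsn == 200 ? 0 : 1;
}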
+@return number of pages found */ +static +ulint +recv_read_in_area( +/*==============*/ + ulint space, /*!< in: space */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint page_no)/*!< in: page number */ +{ + recv_addr_t* recv_addr; + ulint page_nos[RECV_READ_AHEAD_AREA]; + ulint low_limit; + ulint n; + + low_limit = page_no - (page_no % RECV_READ_AHEAD_AREA); + + n = 0; + + for (page_no = low_limit; page_no < low_limit + RECV_READ_AHEAD_AREA; + page_no++) { + recv_addr = recv_get_fil_addr_struct(space, page_no); + + if (recv_addr && !buf_page_peek(space, page_no)) { + + mutex_enter(&(recv_sys->mutex)); + + if (recv_addr->state == RECV_NOT_PROCESSED) { + recv_addr->state = RECV_BEING_READ; + + page_nos[n] = page_no; + + n++; + } + + mutex_exit(&(recv_sys->mutex)); + } + } + + buf_read_recv_pages(FALSE, space, zip_size, page_nos, n); + /* + fprintf(stderr, "Recv pages at %lu n %lu\n", page_nos[0], n); + */ + return(n); +} + +/*******************************************************************//** +Empties the hash table of stored log records, applying them to appropriate +pages. */ +UNIV_INTERN +void +recv_apply_hashed_log_recs( +/*=======================*/ + ibool allow_ibuf) /*!< in: if TRUE, also ibuf operations are + allowed during the application; if FALSE, + no ibuf operations are allowed, and after + the application all file pages are flushed to + disk and invalidated in buffer pool: this + alternative means that no new log records + can be generated during the application; + the caller must in this case own the log + mutex */ +{ + recv_addr_t* recv_addr; + ulint i; + ibool has_printed = FALSE; + mtr_t mtr; +loop: + mutex_enter(&(recv_sys->mutex)); + + if (recv_sys->apply_batch_on) { + + mutex_exit(&(recv_sys->mutex)); + + os_thread_sleep(500000); + + goto loop; + } + + ut_ad(!allow_ibuf == mutex_own(&log_sys->mutex)); + + if (!allow_ibuf) { + recv_no_ibuf_operations = TRUE; + } + + recv_sys->apply_log_recs = TRUE; + recv_sys->apply_batch_on = TRUE; + + for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) { + + for (recv_addr = static_cast<recv_addr_t*>( + HASH_GET_FIRST(recv_sys->addr_hash, i)); + recv_addr != 0; + recv_addr = static_cast<recv_addr_t*>( + HASH_GET_NEXT(addr_hash, recv_addr))) { + + ulint space = recv_addr->space; + ulint zip_size = fil_space_get_zip_size(space); + ulint page_no = recv_addr->page_no; + + if (recv_addr->state == RECV_NOT_PROCESSED) { + if (!has_printed) { + ib_logf(IB_LOG_LEVEL_INFO, + "Starting an apply batch" + " of log records" + " to the database..."); + fputs("InnoDB: Progress in percent: ", + stderr); + has_printed = TRUE; + } + + mutex_exit(&(recv_sys->mutex)); + + if (buf_page_peek(space, page_no)) { + buf_block_t* block; + + mtr_start(&mtr); + + block = buf_page_get( + space, zip_size, page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + block, SYNC_NO_ORDER_CHECK); + + recv_recover_page(FALSE, block); + mtr_commit(&mtr); + } else { + recv_read_in_area(space, zip_size, + page_no); + } + + mutex_enter(&(recv_sys->mutex)); + } + } + + if (has_printed + && (i * 100) / hash_get_n_cells(recv_sys->addr_hash) + != ((i + 1) * 100) + / hash_get_n_cells(recv_sys->addr_hash)) { + + fprintf(stderr, "%lu ", (ulong) + ((i * 100) + / hash_get_n_cells(recv_sys->addr_hash))); + } + } + + /* Wait until all the pages have been processed */ + + while (recv_sys->n_addrs != 0) { + + mutex_exit(&(recv_sys->mutex)); + + os_thread_sleep(500000); + + mutex_enter(&(recv_sys->mutex)); + } + + if (has_printed) { + + fprintf(stderr, 
"\n"); + } + + if (!allow_ibuf) { + bool success; + + /* Flush all the file pages to disk and invalidate them in + the buffer pool */ + + ut_d(recv_no_log_write = TRUE); + mutex_exit(&(recv_sys->mutex)); + mutex_exit(&(log_sys->mutex)); + + /* Stop the recv_writer thread from issuing any LRU + flush batches. */ + mutex_enter(&recv_sys->writer_mutex); + + /* Wait for any currently run batch to end. */ + buf_flush_wait_LRU_batch_end(); + + success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL); + + ut_a(success); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + buf_pool_invalidate(); + + /* Allow batches from recv_writer thread. */ + mutex_exit(&recv_sys->writer_mutex); + + mutex_enter(&(log_sys->mutex)); + mutex_enter(&(recv_sys->mutex)); + ut_d(recv_no_log_write = FALSE); + + recv_no_ibuf_operations = FALSE; + } + + recv_sys->apply_log_recs = FALSE; + recv_sys->apply_batch_on = FALSE; + + recv_sys_empty_hash(); + + if (has_printed) { + fprintf(stderr, "InnoDB: Apply batch completed\n"); + } + + mutex_exit(&(recv_sys->mutex)); +} +#else /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Applies log records in the hash table to a backup. */ +UNIV_INTERN +void +recv_apply_log_recs_for_backup(void) +/*================================*/ +{ + recv_addr_t* recv_addr; + ulint n_hash_cells; + buf_block_t* block; + ulint actual_size; + ibool success; + ulint error; + ulint i; + + recv_sys->apply_log_recs = TRUE; + recv_sys->apply_batch_on = TRUE; + + block = back_block1; + + ib_logf(IB_LOG_LEVEL_INFO, + "Starting an apply batch of log records to the database..."); + + fputs("InnoDB: Progress in percent: ", stderr); + + n_hash_cells = hash_get_n_cells(recv_sys->addr_hash); + + for (i = 0; i < n_hash_cells; i++) { + /* The address hash table is externally chained */ + recv_addr = hash_get_nth_cell(recv_sys->addr_hash, i)->node; + + while (recv_addr != NULL) { + + ulint zip_size + = fil_space_get_zip_size(recv_addr->space); + + if (zip_size == ULINT_UNDEFINED) { +#if 0 + fprintf(stderr, + "InnoDB: Warning: cannot apply" + " log record to" + " tablespace %lu page %lu,\n" + "InnoDB: because tablespace with" + " that id does not exist.\n", + recv_addr->space, recv_addr->page_no); +#endif + recv_addr->state = RECV_PROCESSED; + + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + + goto skip_this_recv_addr; + } + + /* We simulate a page read made by the buffer pool, to + make sure the recovery apparatus works ok. We must init + the block. 
*/ + + buf_page_init_for_backup_restore( + recv_addr->space, recv_addr->page_no, + zip_size, block); + + /* Extend the tablespace's last file if the page_no + does not fall inside its bounds; we assume the last + file is auto-extending, and mysqlbackup copied the file + when it still was smaller */ + + success = fil_extend_space_to_desired_size( + &actual_size, + recv_addr->space, recv_addr->page_no + 1); + if (!success) { + fprintf(stderr, + "InnoDB: Fatal error: cannot extend" + " tablespace %u to hold %u pages\n", + recv_addr->space, recv_addr->page_no); + + exit(1); + } + + /* Read the page from the tablespace file using the + fil0fil.cc routines */ + + if (zip_size) { + error = fil_io(OS_FILE_READ, true, + recv_addr->space, zip_size, + recv_addr->page_no, 0, zip_size, + block->page.zip.data, NULL); + if (error == DB_SUCCESS + && !buf_zip_decompress(block, TRUE)) { + exit(1); + } + } else { + error = fil_io(OS_FILE_READ, true, + recv_addr->space, 0, + recv_addr->page_no, 0, + UNIV_PAGE_SIZE, + block->frame, NULL); + } + + if (error != DB_SUCCESS) { + fprintf(stderr, + "InnoDB: Fatal error: cannot read" + " from tablespace" + " %lu page number %lu\n", + (ulong) recv_addr->space, + (ulong) recv_addr->page_no); + + exit(1); + } + + /* Apply the log records to this page */ + recv_recover_page(FALSE, block); + + /* Write the page back to the tablespace file using the + fil0fil.cc routines */ + + buf_flush_init_for_writing( + block->frame, buf_block_get_page_zip(block), + mach_read_from_8(block->frame + FIL_PAGE_LSN)); + + if (zip_size) { + error = fil_io(OS_FILE_WRITE, true, + recv_addr->space, zip_size, + recv_addr->page_no, 0, + zip_size, + block->page.zip.data, NULL); + } else { + error = fil_io(OS_FILE_WRITE, true, + recv_addr->space, 0, + recv_addr->page_no, 0, + UNIV_PAGE_SIZE, + block->frame, NULL); + } +skip_this_recv_addr: + recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); + } + + if ((100 * i) / n_hash_cells + != (100 * (i + 1)) / n_hash_cells) { + fprintf(stderr, "%lu ", + (ulong) ((100 * i) / n_hash_cells)); + fflush(stderr); + } + } + + recv_sys_empty_hash(); +} +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************************//** +Tries to parse a single log record and returns its length. 
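In recv_parse_log_rec below, the first byte of a record carries both the record type and, in its top bit, the MLOG_SINGLE_REC_FLAG that recv_parse_log_recs later tests to distinguish single-page mtrs from multi-record groups. A sketch of the header probe (cf. MLOG_SINGLE_REC_FLAG, whose value is 0x80):

#include <cstdint>

enum { SINGLE_REC_FLAG = 0x80 };        /* cf. MLOG_SINGLE_REC_FLAG */

struct rec_hdr { uint8_t type; bool single_rec; };

static rec_hdr probe_header(const unsigned char* ptr) {
    rec_hdr h;
    h.single_rec = (*ptr & SINGLE_REC_FLAG) != 0;
    h.type = (uint8_t) (*ptr & ~SINGLE_REC_FLAG);  /* strip flag for type */
    return h;
}

int main() {
    unsigned char first_byte = 38 | SINGLE_REC_FLAG;   /* made-up type 38 */
    rec_hdr h = probe_header(&first_byte);
    return (h.single_rec && h.type == 38) ? 0 : 1;
}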
+@return length of the record, or 0 if the record was not complete */ +UNIV_INTERN +ulint +recv_parse_log_rec( +/*===============*/ + byte* ptr, /*!< in: pointer to a buffer */ + byte* end_ptr,/*!< in: pointer to the buffer end */ + byte* type, /*!< out: type */ + ulint* space, /*!< out: space id */ + ulint* page_no,/*!< out: page number */ + byte** body) /*!< out: log record body start */ +{ + byte* new_ptr; + + *body = NULL; + + if (ptr == end_ptr) { + + return(0); + } + + if (*ptr == MLOG_MULTI_REC_END) { + + *type = *ptr; + + return(1); + } + + if (*ptr == MLOG_DUMMY_RECORD) { + *type = *ptr; + + *space = ULINT_UNDEFINED - 1; /* For debugging */ + + return(1); + } + + new_ptr = mlog_parse_initial_log_record(ptr, end_ptr, type, space, + page_no); + *body = new_ptr; + + if (UNIV_UNLIKELY(!new_ptr)) { + + return(0); + } + +#ifdef UNIV_LOG_LSN_DEBUG + if (*type == MLOG_LSN) { + lsn_t lsn = (lsn_t) *space << 32 | *page_no; +# ifdef UNIV_LOG_DEBUG + ut_a(lsn == log_sys->old_lsn); +# else /* UNIV_LOG_DEBUG */ + ut_a(lsn == recv_sys->recovered_lsn); +# endif /* UNIV_LOG_DEBUG */ + } +#endif /* UNIV_LOG_LSN_DEBUG */ + + new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr, + NULL, NULL, *space); + if (UNIV_UNLIKELY(new_ptr == NULL)) { + + return(0); + } + + if (*page_no > recv_max_parsed_page_no) { + recv_max_parsed_page_no = *page_no; + } + + return(new_ptr - ptr); +} + +/*******************************************************//** +Calculates the new value for lsn when more data is added to the log. */ +UNIV_INTERN +lsn_t +recv_calc_lsn_on_data_add( +/*======================*/ + lsn_t lsn, /*!< in: old lsn */ + ib_uint64_t len) /*!< in: this many bytes of data is + added, log block headers not included */ +{ + ulint frag_len; + ib_uint64_t lsn_len; + + frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE; + ut_ad(frag_len < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE + - LOG_BLOCK_TRL_SIZE); + lsn_len = len; + lsn_len += (lsn_len + frag_len) + / (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE + - LOG_BLOCK_TRL_SIZE) + * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); + + return(lsn + lsn_len); +} + +#ifdef UNIV_LOG_DEBUG +/*******************************************************//** +Checks that the parser recognizes incomplete initial segments of a log +record as incomplete. */ +static +void +recv_check_incomplete_log_recs( +/*===========================*/ + byte* ptr, /*!< in: pointer to a complete log record */ + ulint len) /*!< in: length of the log record */ +{ + ulint i; + byte type; + ulint space; + ulint page_no; + byte* body; + + for (i = 0; i < len; i++) { + ut_a(0 == recv_parse_log_rec(ptr, ptr + i, &type, &space, + &page_no, &body)); + } +} +#endif /* UNIV_LOG_DEBUG */ + +/*******************************************************//** +Prints diagnostic info of corrupt log. 
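recv_calc_lsn_on_data_add above advances an LSN by len payload bytes while charging LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE for every block boundary the payload crosses, since an LSN counts all log bytes, framing included. The same formula with concrete numbers (512-byte blocks, 12-byte header, 4-byte trailer):

#include <cassert>
#include <cstdint>

int main() {
    const uint64_t BLOCK = 512;         /* OS_FILE_LOG_BLOCK_SIZE */
    const uint64_t HDR = 12, TRL = 4;   /* block header / trailer bytes */
    const uint64_t PAYLOAD = BLOCK - HDR - TRL;   /* 496 per block */

    uint64_t lsn  = 1000;               /* current position, mid-block */
    uint64_t frag = lsn % BLOCK - HDR;  /* payload already in this block */
    uint64_t len  = 600;                /* payload bytes being added */

    /* every payload-span boundary crossed costs HDR + TRL extra bytes */
    uint64_t lsn_len = len + (len + frag) / PAYLOAD * (HDR + TRL);

    assert(frag == 476);
    assert(lsn_len == 632);             /* 600 payload + 2 crossings * 16 */
    assert(lsn + lsn_len == 1632);      /* the new LSN */
    return 0;
}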
*/ +static +void +recv_report_corrupt_log( +/*====================*/ + byte* ptr, /*!< in: pointer to corrupt log record */ + byte type, /*!< in: type of the record */ + ulint space, /*!< in: space id, this may also be garbage */ + ulint page_no)/*!< in: page number, this may also be garbage */ +{ + fprintf(stderr, + "InnoDB: ############### CORRUPT LOG RECORD FOUND\n" + "InnoDB: Log record type %lu, space id %lu, page number %lu\n" + "InnoDB: Log parsing proceeded successfully up to " LSN_PF "\n" + "InnoDB: Previous log record type %lu, is multi %lu\n" + "InnoDB: Recv offset %lu, prev %lu\n", + (ulong) type, (ulong) space, (ulong) page_no, + recv_sys->recovered_lsn, + (ulong) recv_previous_parsed_rec_type, + (ulong) recv_previous_parsed_rec_is_multi, + (ulong) (ptr - recv_sys->buf), + (ulong) recv_previous_parsed_rec_offset); + + if ((ulint)(ptr - recv_sys->buf + 100) + > recv_previous_parsed_rec_offset + && (ulint)(ptr - recv_sys->buf + 100 + - recv_previous_parsed_rec_offset) + < 200000) { + fputs("InnoDB: Hex dump of corrupt log starting" + " 100 bytes before the start\n" + "InnoDB: of the previous log rec,\n" + "InnoDB: and ending 100 bytes after the start" + " of the corrupt rec:\n", + stderr); + + ut_print_buf(stderr, + recv_sys->buf + + recv_previous_parsed_rec_offset - 100, + ptr - recv_sys->buf + 200 + - recv_previous_parsed_rec_offset); + putc('\n', stderr); + } + +#ifndef UNIV_HOTBACKUP + if (!srv_force_recovery) { + fputs("InnoDB: Set innodb_force_recovery" + " to ignore this error.\n", stderr); + ut_error; + } +#endif /* !UNIV_HOTBACKUP */ + + fputs("InnoDB: WARNING: the log file may have been corrupt and it\n" + "InnoDB: is possible that the log scan did not proceed\n" + "InnoDB: far enough in recovery! Please run CHECK TABLE\n" + "InnoDB: on your InnoDB tables to check that they are ok!\n" + "InnoDB: If mysqld crashes after this recovery, look at\n" + "InnoDB: " REFMAN "forcing-innodb-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); + + fflush(stderr); +} + +/*******************************************************//** +Parses log records from a buffer and stores them to a hash table to wait +merging to file pages. 
+@return currently always returns FALSE */ +static +ibool +recv_parse_log_recs( +/*================*/ + ibool store_to_hash) /*!< in: TRUE if the records should be stored + to the hash table; this is set to FALSE if just + debug checking is needed */ +{ + byte* ptr; + byte* end_ptr; + ulint single_rec; + ulint len; + ulint total_len; + lsn_t new_recovered_lsn; + lsn_t old_lsn; + byte type; + ulint space; + ulint page_no; + byte* body; + ulint n_recs; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_ad(recv_sys->parse_start_lsn != 0); +loop: + ptr = recv_sys->buf + recv_sys->recovered_offset; + + end_ptr = recv_sys->buf + recv_sys->len; + + if (ptr == end_ptr) { + + return(FALSE); + } + + single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG; + + if (single_rec || *ptr == MLOG_DUMMY_RECORD) { + /* The mtr only modified a single page, or this is a file op */ + + old_lsn = recv_sys->recovered_lsn; + + /* Try to parse a log record, fetching its type, space id, + page no, and a pointer to the body of the log record */ + + len = recv_parse_log_rec(ptr, end_ptr, &type, &space, + &page_no, &body); + + if (len == 0 || recv_sys->found_corrupt_log) { + if (recv_sys->found_corrupt_log) { + + recv_report_corrupt_log(ptr, + type, space, page_no); + } + + return(FALSE); + } + + new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len); + + if (new_recovered_lsn > recv_sys->scanned_lsn) { + /* The log record filled a log block, and we require + that also the next log block should have been scanned + in */ + + return(FALSE); + } + + recv_previous_parsed_rec_type = (ulint) type; + recv_previous_parsed_rec_offset = recv_sys->recovered_offset; + recv_previous_parsed_rec_is_multi = 0; + + recv_sys->recovered_offset += len; + recv_sys->recovered_lsn = new_recovered_lsn; + + DBUG_PRINT("ib_log", + ("scan " DBUG_LSN_PF ": log rec %u len %u " + "page %u:%u", old_lsn, + (unsigned) type, (unsigned) len, + (unsigned) space, (unsigned) page_no)); + + if (type == MLOG_DUMMY_RECORD) { + /* Do nothing */ + + } else if (!store_to_hash) { + /* In debug checking, update a replicate page + according to the log record, and check that it + becomes identical with the original page */ +#ifdef UNIV_LOG_DEBUG + recv_check_incomplete_log_recs(ptr, len); +#endif/* UNIV_LOG_DEBUG */ + + } else if (type == MLOG_FILE_CREATE + || type == MLOG_FILE_CREATE2 + || type == MLOG_FILE_RENAME + || type == MLOG_FILE_DELETE) { + ut_a(space); +#ifdef UNIV_HOTBACKUP + if (recv_replay_file_ops) { + + /* In mysqlbackup --apply-log, replay an .ibd + file operation, if possible; note that + fil_path_to_mysql_datadir is set in mysqlbackup + to point to the datadir we should use there */ + + if (NULL == fil_op_log_parse_or_replay( + body, end_ptr, type, + space, page_no)) { + fprintf(stderr, + "InnoDB: Error: file op" + " log record of type %lu" + " space %lu not complete in\n" + "InnoDB: the replay phase." + " Path %s\n", + (ulint) type, space, + (char*)(body + 2)); + + ut_error; + } + } +#endif + /* In normal mysqld crash recovery we do not try to + replay file operations */ +#ifdef UNIV_LOG_LSN_DEBUG + } else if (type == MLOG_LSN) { + /* Do not add these records to the hash table. + The page number and space id fields are misused + for something else. 
*/ +#endif /* UNIV_LOG_LSN_DEBUG */ + } else { + recv_add_to_hash_table(type, space, page_no, body, + ptr + len, old_lsn, + recv_sys->recovered_lsn); + } + } else { + /* Check that all the records associated with the single mtr + are included within the buffer */ + + total_len = 0; + n_recs = 0; + + for (;;) { + len = recv_parse_log_rec(ptr, end_ptr, &type, &space, + &page_no, &body); + if (len == 0 || recv_sys->found_corrupt_log) { + + if (recv_sys->found_corrupt_log) { + + recv_report_corrupt_log( + ptr, type, space, page_no); + } + + return(FALSE); + } + + recv_previous_parsed_rec_type = (ulint) type; + recv_previous_parsed_rec_offset + = recv_sys->recovered_offset + total_len; + recv_previous_parsed_rec_is_multi = 1; + +#ifdef UNIV_LOG_DEBUG + if ((!store_to_hash) && (type != MLOG_MULTI_REC_END)) { + recv_check_incomplete_log_recs(ptr, len); + } +#endif /* UNIV_LOG_DEBUG */ + + DBUG_PRINT("ib_log", + ("scan " DBUG_LSN_PF ": multi-log rec %u " + "len %u page %u:%u", + recv_sys->recovered_lsn, + (unsigned) type, (unsigned) len, + (unsigned) space, (unsigned) page_no)); + + total_len += len; + n_recs++; + + ptr += len; + + if (type == MLOG_MULTI_REC_END) { + + /* Found the end mark for the records */ + + break; + } + } + + new_recovered_lsn = recv_calc_lsn_on_data_add( + recv_sys->recovered_lsn, total_len); + + if (new_recovered_lsn > recv_sys->scanned_lsn) { + /* The log record filled a log block, and we require + that also the next log block should have been scanned + in */ + + return(FALSE); + } + + /* Add all the records to the hash table */ + + ptr = recv_sys->buf + recv_sys->recovered_offset; + + for (;;) { + old_lsn = recv_sys->recovered_lsn; + len = recv_parse_log_rec(ptr, end_ptr, &type, &space, + &page_no, &body); + if (recv_sys->found_corrupt_log) { + + recv_report_corrupt_log(ptr, + type, space, page_no); + } + + ut_a(len != 0); + ut_a(0 == ((ulint)*ptr & MLOG_SINGLE_REC_FLAG)); + + recv_sys->recovered_offset += len; + recv_sys->recovered_lsn + = recv_calc_lsn_on_data_add(old_lsn, len); + if (type == MLOG_MULTI_REC_END) { + + /* Found the end mark for the records */ + + break; + } + + if (store_to_hash +#ifdef UNIV_LOG_LSN_DEBUG + && type != MLOG_LSN +#endif /* UNIV_LOG_LSN_DEBUG */ + ) { + recv_add_to_hash_table(type, space, page_no, + body, ptr + len, + old_lsn, + new_recovered_lsn); + } + + ptr += len; + } + } + + goto loop; +} + +/*******************************************************//** +Adds data from a new log block to the parsing buffer of recv_sys if +recv_sys->parse_start_lsn is non-zero. 
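
/* [EDITORIAL SKETCH -- not part of this commit.] recv_parse_log_recs()
above branches on the high bit of the first record byte: an mtr that
modified only a single page sets MLOG_SINGLE_REC_FLAG in its type byte,
while a multi-page mtr emits a run of records terminated by
MLOG_MULTI_REC_END. A minimal illustration of that type-byte test; the
0x80 flag value matches mtr0mtr.h at the time of this commit, but verify
it against your tree: */

#include <assert.h>

#define EX_SINGLE_REC_FLAG	0x80U	/* stand-in for MLOG_SINGLE_REC_FLAG */

static unsigned ex_rec_type(unsigned char first_byte)
{
	return(first_byte & ~EX_SINGLE_REC_FLAG);	/* strip the flag */
}

static int ex_is_single_rec(unsigned char first_byte)
{
	return((first_byte & EX_SINGLE_REC_FLAG) != 0);
}

int main(void)
{
	unsigned char b = (unsigned char) (EX_SINGLE_REC_FLAG | 9);
	assert(ex_is_single_rec(b) && ex_rec_type(b) == 9);
	return(0);
}
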
+@return TRUE if more data added */ +static +ibool +recv_sys_add_to_parsing_buf( +/*========================*/ + const byte* log_block, /*!< in: log block */ + lsn_t scanned_lsn) /*!< in: lsn of how far we were able + to find data in this log block */ +{ + ulint more_len; + ulint data_len; + ulint start_offset; + ulint end_offset; + + ut_ad(scanned_lsn >= recv_sys->scanned_lsn); + + if (!recv_sys->parse_start_lsn) { + /* Cannot start parsing yet because no start point for + it found */ + + return(FALSE); + } + + data_len = log_block_get_data_len(log_block); + + if (recv_sys->parse_start_lsn >= scanned_lsn) { + + return(FALSE); + + } else if (recv_sys->scanned_lsn >= scanned_lsn) { + + return(FALSE); + + } else if (recv_sys->parse_start_lsn > recv_sys->scanned_lsn) { + more_len = (ulint) (scanned_lsn - recv_sys->parse_start_lsn); + } else { + more_len = (ulint) (scanned_lsn - recv_sys->scanned_lsn); + } + + if (more_len == 0) { + + return(FALSE); + } + + ut_ad(data_len >= more_len); + + start_offset = data_len - more_len; + + if (start_offset < LOG_BLOCK_HDR_SIZE) { + start_offset = LOG_BLOCK_HDR_SIZE; + } + + end_offset = data_len; + + if (end_offset > OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + end_offset = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; + } + + ut_ad(start_offset <= end_offset); + + if (start_offset < end_offset) { + ut_memcpy(recv_sys->buf + recv_sys->len, + log_block + start_offset, end_offset - start_offset); + + recv_sys->len += end_offset - start_offset; + + ut_a(recv_sys->len <= RECV_PARSING_BUF_SIZE); + } + + return(TRUE); +} + +/*******************************************************//** +Moves the parsing buffer data left to the buffer start. */ +static +void +recv_sys_justify_left_parsing_buf(void) +/*===================================*/ +{ + ut_memmove(recv_sys->buf, recv_sys->buf + recv_sys->recovered_offset, + recv_sys->len - recv_sys->recovered_offset); + + recv_sys->len -= recv_sys->recovered_offset; + + recv_sys->recovered_offset = 0; +} + +/*******************************************************//** +Scans log from a buffer and stores new log data to the parsing buffer. +Parses and hashes the log records if new data found. Unless +UNIV_HOTBACKUP is defined, this function will apply log records +automatically when the hash table becomes full. 
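
/* [EDITORIAL SKETCH -- not part of this commit.] The copy window computed
by recv_sys_add_to_parsing_buf() above strips the block header and the
checksum trailer of each log block before appending payload bytes to the
parsing buffer. A minimal restatement of the same arithmetic; the sizes
(512-byte block, 12-byte header, 4-byte trailer) match
OS_FILE_LOG_BLOCK_SIZE, LOG_BLOCK_HDR_SIZE and LOG_BLOCK_TRL_SIZE as of
this commit, but verify them in your tree: */

#include <assert.h>

enum { EX_BLK = 512, EX_HDR = 12, EX_TRL = 4 };	/* stand-in constants */

/* Compute [start, end) of the payload bytes to append, given the block's
data_len field and how many new bytes (more_len) the scan found. */
static void
parse_buf_window(unsigned data_len, unsigned more_len,
		 unsigned* start, unsigned* end)
{
	*start = data_len - more_len;		/* skip already-copied bytes */
	if (*start < EX_HDR) {
		*start = EX_HDR;		/* never copy the header */
	}
	*end = data_len;
	if (*end > EX_BLK - EX_TRL) {
		*end = EX_BLK - EX_TRL;		/* never copy the trailer */
	}
}

int main(void)
{
	unsigned s, e;
	/* A completely new, full block: copy bytes [12, 508). */
	parse_buf_window(EX_BLK, EX_BLK, &s, &e);
	assert(s == EX_HDR && e == EX_BLK - EX_TRL);
	return(0);
}
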
+@return TRUE if limit_lsn has been reached, or not able to scan any +more in this log group */ +UNIV_INTERN +ibool +recv_scan_log_recs( +/*===============*/ + ulint available_memory,/*!< in: we let the hash table of recs + to grow to this size, at the maximum */ + ibool store_to_hash, /*!< in: TRUE if the records should be + stored to the hash table; this is set + to FALSE if just debug checking is + needed */ + const byte* buf, /*!< in: buffer containing a log + segment or garbage */ + ulint len, /*!< in: buffer length */ + lsn_t start_lsn, /*!< in: buffer start lsn */ + lsn_t* contiguous_lsn, /*!< in/out: it is known that all log + groups contain contiguous log data up + to this lsn */ + lsn_t* group_scanned_lsn)/*!< out: scanning succeeded up to + this lsn */ +{ + const byte* log_block; + ulint no; + lsn_t scanned_lsn; + ibool finished; + ulint data_len; + ibool more_data; + + ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE); + ut_a(store_to_hash <= TRUE); + + finished = FALSE; + + log_block = buf; + scanned_lsn = start_lsn; + more_data = FALSE; + + do { + no = log_block_get_hdr_no(log_block); + /* + fprintf(stderr, "Log block header no %lu\n", no); + + fprintf(stderr, "Scanned lsn no %lu\n", + log_block_convert_lsn_to_no(scanned_lsn)); + */ + if (no != log_block_convert_lsn_to_no(scanned_lsn) + || !log_block_checksum_is_ok_or_old_format(log_block)) { + + if (no == log_block_convert_lsn_to_no(scanned_lsn) + && !log_block_checksum_is_ok_or_old_format( + log_block)) { + fprintf(stderr, + "InnoDB: Log block no %lu at" + " lsn " LSN_PF " has\n" + "InnoDB: ok header, but checksum field" + " contains %lu, should be %lu\n", + (ulong) no, + scanned_lsn, + (ulong) log_block_get_checksum( + log_block), + (ulong) log_block_calc_checksum( + log_block)); + } + + /* Garbage or an incompletely written log block */ + + finished = TRUE; + + break; + } + + if (log_block_get_flush_bit(log_block)) { + /* This block was a start of a log flush operation: + we know that the previous flush operation must have + been completed for all log groups before this block + can have been flushed to any of the groups. Therefore, + we know that log data is contiguous up to scanned_lsn + in all non-corrupt log groups. */ + + if (scanned_lsn > *contiguous_lsn) { + *contiguous_lsn = scanned_lsn; + } + } + + data_len = log_block_get_data_len(log_block); + + if ((store_to_hash || (data_len == OS_FILE_LOG_BLOCK_SIZE)) + && scanned_lsn + data_len > recv_sys->scanned_lsn + && (recv_sys->scanned_checkpoint_no > 0) + && (log_block_get_checkpoint_no(log_block) + < recv_sys->scanned_checkpoint_no) + && (recv_sys->scanned_checkpoint_no + - log_block_get_checkpoint_no(log_block) + > 0x80000000UL)) { + + /* Garbage from a log buffer flush which was made + before the most recent database recovery */ + + finished = TRUE; +#ifdef UNIV_LOG_DEBUG + /* This is not really an error, but currently + we stop here in the debug version: */ + + ut_error; +#endif + break; + } + + if (!recv_sys->parse_start_lsn + && (log_block_get_first_rec_group(log_block) > 0)) { + + /* We found a point from which to start the parsing + of log records */ + + recv_sys->parse_start_lsn = scanned_lsn + + log_block_get_first_rec_group(log_block); + recv_sys->scanned_lsn = recv_sys->parse_start_lsn; + recv_sys->recovered_lsn = recv_sys->parse_start_lsn; + } + + scanned_lsn += data_len; + + if (scanned_lsn > recv_sys->scanned_lsn) { + + /* We have found more entries. 
If this scan is + of startup type, we must initiate crash recovery + environment before parsing these log records. */ + +#ifndef UNIV_HOTBACKUP + if (recv_log_scan_is_startup_type + && !recv_needed_recovery) { + + if (!srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_INFO, + "Log scan progressed past the " + "checkpoint lsn " LSN_PF "", + recv_sys->scanned_lsn); + + recv_init_crash_recovery(); + } else { + + ib_logf(IB_LOG_LEVEL_WARN, + "Recovery skipped, " + "--innodb-read-only set!"); + + return(TRUE); + } + } +#endif /* !UNIV_HOTBACKUP */ + + /* We were able to find more log data: add it to the + parsing buffer if parse_start_lsn is already + non-zero */ + + if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE + >= RECV_PARSING_BUF_SIZE) { + fprintf(stderr, + "InnoDB: Error: log parsing" + " buffer overflow." + " Recovery may have failed!\n"); + + recv_sys->found_corrupt_log = TRUE; + +#ifndef UNIV_HOTBACKUP + if (!srv_force_recovery) { + fputs("InnoDB: Set" + " innodb_force_recovery" + " to ignore this error.\n", + stderr); + ut_error; + } +#endif /* !UNIV_HOTBACKUP */ + + } else if (!recv_sys->found_corrupt_log) { + more_data = recv_sys_add_to_parsing_buf( + log_block, scanned_lsn); + } + + recv_sys->scanned_lsn = scanned_lsn; + recv_sys->scanned_checkpoint_no + = log_block_get_checkpoint_no(log_block); + } + + if (data_len < OS_FILE_LOG_BLOCK_SIZE) { + /* Log data for this group ends here */ + + finished = TRUE; + break; + } else { + log_block += OS_FILE_LOG_BLOCK_SIZE; + } + } while (log_block < buf + len && !finished); + + *group_scanned_lsn = scanned_lsn; + + if (recv_needed_recovery + || (recv_is_from_backup && !recv_is_making_a_backup)) { + recv_scan_print_counter++; + + if (finished || (recv_scan_print_counter % 80 == 0)) { + + fprintf(stderr, + "InnoDB: Doing recovery: scanned up to" + " log sequence number " LSN_PF "\n", + *group_scanned_lsn); + } + } + + if (more_data && !recv_sys->found_corrupt_log) { + /* Try to parse more log records */ + + recv_parse_log_recs(store_to_hash); + +#ifndef UNIV_HOTBACKUP + if (store_to_hash + && mem_heap_get_size(recv_sys->heap) > available_memory) { + + /* Hash table of log records has grown too big: + empty it; FALSE means no ibuf operations + allowed, as we cannot add new records to the + log yet: they would be produced by ibuf + operations */ + + recv_apply_hashed_log_recs(FALSE); + } +#endif /* !UNIV_HOTBACKUP */ + + if (recv_sys->recovered_offset > RECV_PARSING_BUF_SIZE / 4) { + /* Move parsing buffer data to the buffer start */ + + recv_sys_justify_left_parsing_buf(); + } + } + + return(finished); +} + +#ifndef UNIV_HOTBACKUP +/*******************************************************//** +Scans log from a buffer and stores new log data to the parsing buffer. Parses +and hashes the log records if new data found. 
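
/* [EDITORIAL SKETCH -- not part of this commit.] The staleness test in
recv_scan_log_recs() above compares 32-bit checkpoint numbers with
wraparound in mind: a block is treated as pre-recovery garbage only when
its stored number is more than 2^31 behind the running number. A minimal
restatement of that serial-number style comparison: */

#include <assert.h>
#include <stdint.h>

/* Nonzero if blk_no is so far behind cur_no that the block must predate
the current sequence of checkpoints; the real code also requires a nonzero
running number, mirrored here. */
static int
checkpoint_no_is_stale(uint32_t cur_no, uint32_t blk_no)
{
	return(cur_no != 0
	       && blk_no < cur_no
	       && cur_no - blk_no > 0x80000000UL);
}

int main(void)
{
	assert(!checkpoint_no_is_stale(100, 99));	/* slightly behind: ok */
	assert(checkpoint_no_is_stale(0xF0000000U, 1));	/* far behind: stale */
	return(0);
}
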
*/ +static +void +recv_group_scan_log_recs( +/*=====================*/ + log_group_t* group, /*!< in: log group */ + lsn_t* contiguous_lsn, /*!< in/out: it is known that all log + groups contain contiguous log data up + to this lsn */ + lsn_t* group_scanned_lsn)/*!< out: scanning succeeded up to + this lsn */ +{ + ibool finished; + lsn_t start_lsn; + lsn_t end_lsn; + + finished = FALSE; + + start_lsn = *contiguous_lsn; + + while (!finished) { + end_lsn = start_lsn + RECV_SCAN_SIZE; + + log_group_read_log_seg(LOG_RECOVER, log_sys->buf, + group, start_lsn, end_lsn, FALSE); + + finished = recv_scan_log_recs( + (buf_pool_get_n_pages() + - (recv_n_pool_free_frames * srv_buf_pool_instances)) + * UNIV_PAGE_SIZE, + TRUE, log_sys->buf, RECV_SCAN_SIZE, + start_lsn, contiguous_lsn, group_scanned_lsn); + start_lsn = end_lsn; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Scanned group %lu up to" + " log sequence number " LSN_PF "\n", + (ulong) group->id, + *group_scanned_lsn); + } +#endif /* UNIV_DEBUG */ +} + +/*******************************************************//** +Initialize crash recovery environment. Can be called iff +recv_needed_recovery == FALSE. */ +static +void +recv_init_crash_recovery(void) +/*==========================*/ +{ + ut_ad(!srv_read_only_mode); + ut_a(!recv_needed_recovery); + + recv_needed_recovery = TRUE; + + ib_logf(IB_LOG_LEVEL_INFO, "Database was not shutdown normally!"); + ib_logf(IB_LOG_LEVEL_INFO, "Starting crash recovery."); + ib_logf(IB_LOG_LEVEL_INFO, + "Reading tablespace information from the .ibd files..."); + + fil_load_single_table_tablespaces(); + + /* If we are using the doublewrite method, we will + check if there are half-written pages in data files, + and restore them from the doublewrite buffer if + possible */ + + if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { + + ib_logf(IB_LOG_LEVEL_INFO, + "Restoring possible half-written data pages "); + + ib_logf(IB_LOG_LEVEL_INFO, + "from the doublewrite buffer..."); + + buf_dblwr_process(); + + /* Spawn the background thread to flush dirty pages + from the buffer pools. */ + recv_writer_thread_handle = os_thread_create( + recv_writer_thread, 0, 0); + } +} + +/********************************************************//** +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. 
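
/* [EDITORIAL SKETCH -- not part of this commit.] The memory budget that
recv_group_scan_log_recs() above passes to recv_scan_log_recs() reserves
recv_n_pool_free_frames pages per buffer pool instance and lets the
recovery hash table grow into the rest. A worked example; the 16 KiB page
size and the 256 reserved frames are illustrative values only, so verify
UNIV_PAGE_SIZE and recv_n_pool_free_frames in your tree: */

#include <stdio.h>

int main(void)
{
	unsigned long n_pages   = 8192;	  /* buf_pool_get_n_pages(): 128 MiB pool */
	unsigned long reserved  = 256;	  /* recv_n_pool_free_frames (assumed) */
	unsigned long instances = 1;	  /* srv_buf_pool_instances */
	unsigned long page_size = 16384;  /* UNIV_PAGE_SIZE (assumed) */

	unsigned long budget = (n_pages - reserved * instances) * page_size;

	printf("recovery hash table budget: %lu bytes\n", budget);
	/* prints 130023424, i.e. (8192 - 256) * 16384 */
	return(0);
}
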
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +recv_recovery_from_checkpoint_start_func( +/*=====================================*/ +#ifdef UNIV_LOG_ARCHIVE + ulint type, /*!< in: LOG_CHECKPOINT or LOG_ARCHIVE */ + lsn_t limit_lsn, /*!< in: recover up to this lsn if possible */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t min_flushed_lsn,/*!< in: min flushed lsn from data files */ + lsn_t max_flushed_lsn)/*!< in: max flushed lsn from data files */ +{ + log_group_t* group; + log_group_t* max_cp_group; + ulint max_cp_field; + ulint log_hdr_log_block_size; + lsn_t checkpoint_lsn; + ib_uint64_t checkpoint_no; + lsn_t group_scanned_lsn = 0; + lsn_t contiguous_lsn; +#ifdef UNIV_LOG_ARCHIVE + log_group_t* up_to_date_group; + lsn_t archived_lsn; +#endif /* UNIV_LOG_ARCHIVE */ + byte* buf; + byte* log_hdr_buf; + byte* log_hdr_buf_base = static_cast<byte *> + (alloca(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE)); + dberr_t err; + ut_when_dtor<recv_dblwr_t> tmp(recv_sys->dblwr); + + log_hdr_buf = static_cast<byte *> + (ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE)); + +#ifdef UNIV_LOG_ARCHIVE + ut_ad(type != LOG_CHECKPOINT || limit_lsn == LSN_MAX); +/** TRUE when recovering from a checkpoint */ +# define TYPE_CHECKPOINT (type == LOG_CHECKPOINT) +/** Recover up to this log sequence number */ +# define LIMIT_LSN limit_lsn +#else /* UNIV_LOG_ARCHIVE */ +/** TRUE when recovering from a checkpoint */ +# define TYPE_CHECKPOINT 1 +/** Recover up to this log sequence number */ +# define LIMIT_LSN LSN_MAX +#endif /* UNIV_LOG_ARCHIVE */ + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + + ib_logf(IB_LOG_LEVEL_INFO, + "The user has set SRV_FORCE_NO_LOG_REDO on, " + "skipping log redo"); + + return(DB_SUCCESS); + } + + recv_recovery_on = TRUE; + + recv_sys->limit_lsn = LIMIT_LSN; + + mutex_enter(&(log_sys->mutex)); + + /* Look for the latest checkpoint from any of the log groups */ + + err = recv_find_max_checkpoint(&max_cp_group, &max_cp_field); + + if (err != DB_SUCCESS) { + + mutex_exit(&(log_sys->mutex)); + + return(err); + } + + log_group_read_checkpoint_info(max_cp_group, max_cp_field); + + buf = log_sys->checkpoint_buf; + + checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN); + checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO); +#ifdef UNIV_LOG_ARCHIVE + archived_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN); +#endif /* UNIV_LOG_ARCHIVE */ + + /* Read the first log file header to print a note if this is + a recovery from a restored InnoDB Hot Backup */ + + fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0, + 0, 0, LOG_FILE_HDR_SIZE, + log_hdr_buf, max_cp_group); + + if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { + + if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot restore from mysqlbackup, InnoDB " + "running in read-only mode!"); + + return(DB_ERROR); + } + + /* This log file was created by mysqlbackup --restore: print + a note to the user about it */ + + ib_logf(IB_LOG_LEVEL_INFO, + "The log file was created by mysqlbackup --apply-log " + "at %s. 
The following crash recovery is part of a " + "normal restore.", + log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP); + + /* Wipe over the label now */ + + memset(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + ' ', 4); + /* Write to the log file to wipe over the label */ + fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, + max_cp_group->space_id, 0, + 0, 0, OS_FILE_LOG_BLOCK_SIZE, + log_hdr_buf, max_cp_group); + } + + log_hdr_log_block_size + = mach_read_from_4(log_hdr_buf + LOG_FILE_OS_FILE_LOG_BLOCK_SIZE); + if (log_hdr_log_block_size == 0) { + /* 0 means default value */ + log_hdr_log_block_size = 512; + } + if (UNIV_UNLIKELY(log_hdr_log_block_size != srv_log_block_size)) { + fprintf(stderr, + "InnoDB: Error: The block size of ib_logfile (" ULINTPF + ") is not equal to innodb_log_block_size.\n" + "InnoDB: Error: Suggestion - Recreate log files.\n", + log_hdr_log_block_size); + return(DB_ERROR); + } + +#ifdef UNIV_LOG_ARCHIVE + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + log_checkpoint_get_nth_group_info(buf, group->id, + &(group->archived_file_no)); + + log_archived_get_offset(group, group->archived_file_no, + archived_lsn, &(group->archived_offset)); + + group = UT_LIST_GET_NEXT(log_groups, group); + } +#endif /* UNIV_LOG_ARCHIVE */ + + if (TYPE_CHECKPOINT) { + /* Start reading the log groups from the checkpoint lsn up. The + variable contiguous_lsn contains an lsn up to which the log is + known to be contiguously written to all log groups. */ + + recv_sys->parse_start_lsn = checkpoint_lsn; + recv_sys->scanned_lsn = checkpoint_lsn; + recv_sys->scanned_checkpoint_no = 0; + recv_sys->recovered_lsn = checkpoint_lsn; + + srv_start_lsn = checkpoint_lsn; + } + + contiguous_lsn = ut_uint64_align_down(recv_sys->scanned_lsn, + OS_FILE_LOG_BLOCK_SIZE); +#ifdef UNIV_LOG_ARCHIVE + if (TYPE_CHECKPOINT) { + up_to_date_group = max_cp_group; + } else { + ulint capacity; + + /* Try to recover the remaining part from logs: first from + the logs of the archived group */ + + group = recv_sys->archive_group; + capacity = log_group_get_capacity(group); + + if (recv_sys->scanned_lsn > checkpoint_lsn + capacity + || checkpoint_lsn > recv_sys->scanned_lsn + capacity) { + + mutex_exit(&(log_sys->mutex)); + + /* The group does not contain enough log: probably + an archived log file was missing or corrupt */ + + return(DB_ERROR); + } + + recv_group_scan_log_recs(group, &contiguous_lsn, + &group_scanned_lsn); + if (recv_sys->scanned_lsn < checkpoint_lsn) { + + mutex_exit(&(log_sys->mutex)); + + /* The group did not contain enough log: an archived + log file was missing or invalid, or the log group + was corrupt */ + + return(DB_ERROR); + } + + group->scanned_lsn = group_scanned_lsn; + up_to_date_group = group; + } +#endif /* UNIV_LOG_ARCHIVE */ + + ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + +#ifdef UNIV_LOG_ARCHIVE + if ((type == LOG_ARCHIVE) && (group == recv_sys->archive_group)) { + group = UT_LIST_GET_NEXT(log_groups, group); + } +#endif /* UNIV_LOG_ARCHIVE */ + + /* Set the flag to publish that we are doing startup scan. 
*/
+	recv_log_scan_is_startup_type = TYPE_CHECKPOINT;
+	while (group) {
+#ifdef UNIV_LOG_ARCHIVE
+		lsn_t	old_scanned_lsn	= recv_sys->scanned_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+		recv_group_scan_log_recs(group, &contiguous_lsn,
+					 &group_scanned_lsn);
+		group->scanned_lsn = group_scanned_lsn;
+
+#ifdef UNIV_LOG_ARCHIVE
+		if (old_scanned_lsn < group_scanned_lsn) {
+			/* We found a more up-to-date group */
+
+			up_to_date_group = group;
+		}
+
+		if ((type == LOG_ARCHIVE)
+		    && (group == recv_sys->archive_group)) {
+			group = UT_LIST_GET_NEXT(log_groups, group);
+		}
+#endif /* UNIV_LOG_ARCHIVE */
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	/* Done with startup scan. Clear the flag. */
+	recv_log_scan_is_startup_type = FALSE;
+	if (TYPE_CHECKPOINT) {
+		/* NOTE: we always do a 'recovery' at startup, but only if
+		there is something wrong we will print a message to the
+		user about recovery: */
+
+		if (checkpoint_lsn != max_flushed_lsn
+		    || checkpoint_lsn != min_flushed_lsn) {
+
+			if (checkpoint_lsn < max_flushed_lsn) {
+
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"The log sequence number "
+					"in the ibdata files is higher "
+					"than the log sequence number "
+					"in the ib_logfiles! Are you sure "
+					"you are using the right "
+					"ib_logfiles to start up the database? "
+					"Log sequence number in the "
+					"ib_logfiles is " LSN_PF ", log "
+					"sequence numbers stamped "
+					"to ibdata file headers are between "
+					"" LSN_PF " and " LSN_PF ".",
+					checkpoint_lsn,
+					min_flushed_lsn,
+					max_flushed_lsn);
+			}
+
+			if (!recv_needed_recovery) {
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"The log sequence numbers "
+					LSN_PF " and " LSN_PF
+					" in ibdata files do not match"
+					" the log sequence number "
+					LSN_PF
+					" in the ib_logfiles!",
+					min_flushed_lsn,
+					max_flushed_lsn,
+					checkpoint_lsn);
+
+				if (!srv_read_only_mode) {
+					recv_init_crash_recovery();
+				} else {
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"Can't initiate database "
+						"recovery, running "
+						"in read-only mode.");
+					return(DB_READ_ONLY);
+				}
+			}
+		}
+	}
+
+	/* We currently have only one log group */
+	if (group_scanned_lsn < checkpoint_lsn
+	    || group_scanned_lsn < recv_max_page_lsn) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"We scanned the log up to "
+			LSN_PF ". A checkpoint was at " LSN_PF
+			" and the maximum LSN on a database page was " LSN_PF
+			". It is possible that the database is now corrupt!",
+			group_scanned_lsn, checkpoint_lsn, recv_max_page_lsn);
+	}
+
+	if (recv_sys->recovered_lsn < checkpoint_lsn) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (recv_sys->recovered_lsn >= LIMIT_LSN) {
+
+			return(DB_SUCCESS);
+		}
+
+		/* No harm in trying to do RO access.
*/
+		if (!srv_read_only_mode) {
+			ut_error;
+		}
+
+		return(DB_ERROR);
+	}
+
+	/* Synchronize the uncorrupted log groups to the most up-to-date log
+	group; we also copy checkpoint info to groups */
+
+	log_sys->next_checkpoint_lsn = checkpoint_lsn;
+	log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+	log_sys->archived_lsn = archived_lsn;
+
+	recv_synchronize_groups(up_to_date_group);
+#else /* UNIV_LOG_ARCHIVE */
+	recv_synchronize_groups();
+#endif /* UNIV_LOG_ARCHIVE */
+
+	if (!recv_needed_recovery) {
+		ut_a(checkpoint_lsn == recv_sys->recovered_lsn);
+	} else {
+		srv_start_lsn = recv_sys->recovered_lsn;
+	}
+
+	log_sys->lsn = recv_sys->recovered_lsn;
+
+	ut_memcpy(log_sys->buf, recv_sys->last_block, OS_FILE_LOG_BLOCK_SIZE);
+
+	log_sys->buf_free = (ulint) log_sys->lsn % OS_FILE_LOG_BLOCK_SIZE;
+	log_sys->buf_next_to_write = log_sys->buf_free;
+	log_sys->written_to_some_lsn = log_sys->lsn;
+	log_sys->written_to_all_lsn = log_sys->lsn;
+
+	log_sys->last_checkpoint_lsn = checkpoint_lsn;
+
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
+	log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+	if (archived_lsn == LSN_MAX) {
+
+		log_sys->archiving_state = LOG_ARCH_OFF;
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	mutex_enter(&recv_sys->mutex);
+
+	recv_sys->apply_log_recs = TRUE;
+
+	mutex_exit(&recv_sys->mutex);
+
+	mutex_exit(&log_sys->mutex);
+
+	recv_lsn_checks_on = TRUE;
+
+	/* The database is now ready to start almost normal processing of user
+	transactions: transaction rollbacks and the application of the log
+	records in the hash table can be run in background. */
+
+	return(DB_SUCCESS);
+
+#undef TYPE_CHECKPOINT
+#undef LIMIT_LSN
+}
+
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void)
+/*======================================*/
+{
+	/* Apply the hashed log records to the respective file pages */
+
+	if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+
+		recv_apply_hashed_log_recs(TRUE);
+	}
+
+	DBUG_PRINT("ib_log", ("apply completed"));
+
+	if (recv_needed_recovery) {
+		trx_sys_print_mysql_master_log_pos();
+		trx_sys_print_mysql_binlog_offset();
+	}
+
+	if (recv_sys->found_corrupt_log) {
+
+		fprintf(stderr,
+			"InnoDB: WARNING: the log file may have been"
+			" corrupt and it\n"
+			"InnoDB: is possible that the log scan or parsing"
+			" did not proceed\n"
+			"InnoDB: far enough in recovery. Please run"
+			" CHECK TABLE\n"
+			"InnoDB: on your InnoDB tables to check that"
+			" they are ok!\n"
+			"InnoDB: It may be safest to recover your"
+			" InnoDB database from\n"
+			"InnoDB: a backup!\n");
+	}
+
+	/* Make sure that the recv_writer thread is done. This is
+	required because it grabs various mutexes and we want to
+	ensure that when we enable sync_order_checks there is no
+	mutex currently held by any thread. */
+	mutex_enter(&recv_sys->writer_mutex);
+
+	/* Free the resources of the recovery system */
+	recv_recovery_on = FALSE;
+
+	/* By acquiring the mutex we ensure that the recv_writer thread
+	won't trigger any more LRU batches. Now wait for currently
+	in progress batches to finish.
*/ + buf_flush_wait_LRU_batch_end(); + + mutex_exit(&recv_sys->writer_mutex); + + ulint count = 0; + while (recv_writer_thread_active) { + ++count; + os_thread_sleep(100000); + if (srv_print_verbose_log && count > 600) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for recv_writer to " + "finish flushing of buffer pool"); + count = 0; + } + } + +#ifdef __WIN__ + if (recv_writer_thread_handle) { + CloseHandle(recv_writer_thread_handle); + } +#endif /* __WIN__ */ + +#ifndef UNIV_LOG_DEBUG + recv_sys_debug_free(); +#endif + /* Roll back any recovered data dictionary transactions, so + that the data dictionary tables will be free of any locks. + The data dictionary latch should guarantee that there is at + most one data dictionary transaction active at a time. */ + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { + trx_rollback_or_clean_recovered(FALSE); + } +} + +/********************************************************//** +Initiates the rollback of active transactions. */ +UNIV_INTERN +void +recv_recovery_rollback_active(void) +/*===============================*/ +{ +#ifdef UNIV_SYNC_DEBUG + /* Wait for a while so that created threads have time to suspend + themselves before we switch the latching order checks on */ + os_thread_sleep(1000000); + + ut_ad(!recv_writer_thread_active); + + /* Switch latching order checks on in sync0sync.cc */ + sync_order_checks_on = TRUE; +#endif + /* We can't start any (DDL) transactions if UNDO logging + has been disabled, additionally disable ROLLBACK of recovered + user transactions. */ + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO + && !srv_read_only_mode) { + + /* Drop partially created indexes. */ + row_merge_drop_temp_indexes(); + /* Drop temporary tables. */ + row_mysql_drop_temp_tables(); + + /* Drop any auxiliary tables that were not dropped when the + parent table was dropped. This can happen if the parent table + was dropped but the server crashed before the auxiliary tables + were dropped. */ + fts_drop_orphaned_tables(); + + /* Rollback the uncommitted transactions which have no user + session */ + + os_thread_create(trx_rollback_or_clean_all_recovered, 0, 0); + } +} + +/******************************************************//** +Resets the logs. The contents of log files will be lost! 
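
/* [EDITORIAL SKETCH -- not part of this commit.] recv_reset_logs() below
first rounds the LSN up to a log block boundary and then advances it past
the block header, so the first record starts right after a fresh header.
A minimal restatement of that alignment arithmetic; the 512-byte block and
12-byte header match OS_FILE_LOG_BLOCK_SIZE and LOG_BLOCK_HDR_SIZE as of
this commit, but verify them in your tree: */

#include <assert.h>
#include <stdint.h>

enum { EX_BLK = 512, EX_HDR = 12 };	/* stand-in constants */

static uint64_t ex_align_up(uint64_t n, uint64_t align)
{
	return((n + align - 1) & ~(align - 1));	/* like ut_uint64_align_up */
}

int main(void)
{
	uint64_t lsn = ex_align_up(8204, EX_BLK);	/* -> 8704 */
	assert(lsn == 8704 && lsn % EX_BLK == 0);
	lsn += EX_HDR;			/* first usable byte of the block */
	assert(lsn % EX_BLK == EX_HDR);
	return(0);
}
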
*/ +UNIV_INTERN +void +recv_reset_logs( +/*============*/ +#ifdef UNIV_LOG_ARCHIVE + ulint arch_log_no, /*!< in: next archived log file number */ + ibool new_logs_created,/*!< in: TRUE if resetting logs + is done at the log creation; + FALSE if it is done after + archive recovery */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t lsn) /*!< in: reset to this lsn + rounded up to be divisible by + OS_FILE_LOG_BLOCK_SIZE, after + which we add + LOG_BLOCK_HDR_SIZE */ +{ + log_group_t* group; + + ut_ad(mutex_own(&(log_sys->mutex))); + + log_sys->lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + group->lsn = log_sys->lsn; + group->lsn_offset = LOG_FILE_HDR_SIZE; +#ifdef UNIV_LOG_ARCHIVE + group->archived_file_no = arch_log_no; + group->archived_offset = 0; + + if (!new_logs_created) { + recv_truncate_group(group, group->lsn, group->lsn, + group->lsn, group->lsn); + } +#endif /* UNIV_LOG_ARCHIVE */ + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + log_sys->buf_next_to_write = 0; + log_sys->written_to_some_lsn = log_sys->lsn; + log_sys->written_to_all_lsn = log_sys->lsn; + + log_sys->next_checkpoint_no = 0; + log_sys->last_checkpoint_lsn = 0; + +#ifdef UNIV_LOG_ARCHIVE + log_sys->archived_lsn = log_sys->lsn; +#endif /* UNIV_LOG_ARCHIVE */ + + log_sys->tracked_lsn = log_sys->lsn; + + log_block_init(log_sys->buf, log_sys->lsn); + log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); + + log_sys->buf_free = LOG_BLOCK_HDR_SIZE; + log_sys->lsn += LOG_BLOCK_HDR_SIZE; + + MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, + (log_sys->lsn - log_sys->last_checkpoint_lsn)); + + mutex_exit(&(log_sys->mutex)); + + /* Reset the checkpoint fields in logs */ + + log_make_checkpoint_at(LSN_MAX, TRUE); + + mutex_enter(&(log_sys->mutex)); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_HOTBACKUP +/******************************************************//** +Creates new log files after a backup has been restored. */ +UNIV_INTERN +void +recv_reset_log_files_for_backup( +/*============================*/ + const char* log_dir, /*!< in: log file directory path */ + ulint n_log_files, /*!< in: number of log files */ + lsn_t log_file_size, /*!< in: log file size */ + lsn_t lsn) /*!< in: new start lsn, must be + divisible by OS_FILE_LOG_BLOCK_SIZE */ +{ + os_file_t log_file; + ibool success; + byte* buf; + ulint i; + ulint log_dir_len; + char name[5000]; + static const char ib_logfile_basename[] = "ib_logfile"; + + log_dir_len = strlen(log_dir); + /* full path name of ib_logfile consists of log dir path + basename + + number. This must fit in the name buffer. + */ + ut_a(log_dir_len + strlen(ib_logfile_basename) + 11 < sizeof(name)); + + buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + memset(buf, '\0', LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + + for (i = 0; i < n_log_files; i++) { + + sprintf(name, "%s%s%lu", log_dir, + ib_logfile_basename, (ulong) i); + + log_file = os_file_create_simple(innodb_file_log_key, + name, OS_FILE_CREATE, + OS_FILE_READ_WRITE, + &success); + if (!success) { + fprintf(stderr, + "InnoDB: Cannot create %s. 
Check that" + " the file does not exist yet.\n", name); + + exit(1); + } + + fprintf(stderr, + "Setting log file size to %llu\n", + log_file_size); + + success = os_file_set_size(name, log_file, log_file_size); + + if (!success) { + fprintf(stderr, + "InnoDB: Cannot set %s size to %llu\n", + name, log_file_size); + exit(1); + } + + os_file_flush(log_file); + os_file_close(log_file); + } + + /* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */ + + log_reset_first_header_and_checkpoint(buf, lsn); + + log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn); + log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE, + LOG_BLOCK_HDR_SIZE); + sprintf(name, "%s%s%lu", log_dir, ib_logfile_basename, (ulong)0); + + log_file = os_file_create_simple(innodb_file_log_key, + name, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &success); + if (!success) { + fprintf(stderr, "InnoDB: Cannot open %s.\n", name); + + exit(1); + } + + os_file_write(name, log_file, buf, 0, + LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + os_file_flush(log_file); + os_file_close(log_file); + + ut_free(buf); +} +#endif /* UNIV_HOTBACKUP */ + +#ifdef UNIV_LOG_ARCHIVE +/******************************************************//** +Reads from the archive of a log group and performs recovery. +@return TRUE if no more complete consistent archive files */ +static +ibool +log_group_recover_from_archive_file( +/*================================*/ + log_group_t* group) /*!< in: log group */ +{ + os_file_t file_handle; + ib_uint64_t start_lsn; + ib_uint64_t file_end_lsn; + ib_uint64_t dummy_lsn; + ib_uint64_t scanned_lsn; + ulint len; + ibool ret; + byte* buf; + os_offset_t read_offset; + os_offset_t file_size; + int input_char; + char name[OS_FILE_MAX_PATH]; + + ut_a(0); + +try_open_again: + buf = log_sys->buf; + + /* Add the file to the archive file space; open the file */ + + log_archived_file_name_gen(name, sizeof(name), + group->id, group->archived_file_no); + + file_handle = os_file_create(innodb_file_log_key, + name, OS_FILE_OPEN, + OS_FILE_LOG, OS_FILE_AIO, &ret); + + if (ret == FALSE) { +ask_again: + fprintf(stderr, + "InnoDB: Do you want to copy additional" + " archived log files\n" + "InnoDB: to the directory\n"); + fprintf(stderr, + "InnoDB: or were these all the files needed" + " in recovery?\n"); + fprintf(stderr, + "InnoDB: (Y == copy more files; N == this is all)?"); + + input_char = getchar(); + + if (input_char == (int) 'N') { + + return(TRUE); + } else if (input_char == (int) 'Y') { + + goto try_open_again; + } else { + goto ask_again; + } + } + + file_size = os_file_get_size(file_handle); + ut_a(file_size != (os_offset_t) -1); + + fprintf(stderr, "InnoDB: Opened archived log file %s\n", name); + + ret = os_file_close(file_handle); + + if (file_size < LOG_FILE_HDR_SIZE) { + fprintf(stderr, + "InnoDB: Archive file header incomplete %s\n", name); + + return(TRUE); + } + + ut_a(ret); + + /* Add the archive file as a node to the space */ + + ut_a(fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE, + group->archive_space_id, FALSE)); + ut_a(RECV_SCAN_SIZE >= LOG_FILE_HDR_SIZE); + + /* Read the archive file header */ + fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, + 0, 0, + LOG_FILE_HDR_SIZE, buf, NULL); + + /* Check if the archive file header is consistent */ + + if (mach_read_from_4(buf + LOG_GROUP_ID) != group->id + || mach_read_from_8(buf + LOG_FILE_START_LSN) + != group->archived_file_no) { + fprintf(stderr, + "InnoDB: Archive file header inconsistent %s\n", name); + + return(TRUE); + } + + if 
(!mach_read_from_4(buf + LOG_FILE_ARCH_COMPLETED)) { + fprintf(stderr, + "InnoDB: Archive file not completely written %s\n", + name); + + return(TRUE); + } + + start_lsn = mach_read_from_8(buf + LOG_FILE_START_LSN); + file_end_lsn = mach_read_from_8(buf + LOG_FILE_END_LSN); + + if (!recv_sys->scanned_lsn) { + + if (recv_sys->parse_start_lsn < start_lsn) { + fprintf(stderr, + "InnoDB: Archive log file %s" + " starts from too big a lsn\n", + name); + return(TRUE); + } + + recv_sys->scanned_lsn = start_lsn; + } + + if (recv_sys->scanned_lsn != start_lsn) { + + fprintf(stderr, + "InnoDB: Archive log file %s starts from" + " a wrong lsn\n", + name); + return(TRUE); + } + + read_offset = LOG_FILE_HDR_SIZE; + + for (;;) { + len = RECV_SCAN_SIZE; + + if (read_offset + len > file_size) { + len = ut_calc_align_down(file_size - read_offset, + OS_FILE_LOG_BLOCK_SIZE); + } + + if (len == 0) { + + break; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Archive read starting at" + " lsn " LSN_PF ", len %lu from file %s\n", + start_lsn, + (ulong) len, name); + } +#endif /* UNIV_DEBUG */ + + fil_io(OS_FILE_READ | OS_FILE_LOG, true, + group->archive_space_id, 0, + read_offset / UNIV_PAGE_SIZE, + read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + + ret = recv_scan_log_recs( + (buf_pool_get_n_pages() + - (recv_n_pool_free_frames * srv_buf_pool_instances)) + * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn, + &dummy_lsn, &scanned_lsn); + + if (scanned_lsn == file_end_lsn) { + + return(FALSE); + } + + if (ret) { + fprintf(stderr, + "InnoDB: Archive log file %s" + " does not scan right\n", + name); + return(TRUE); + } + + read_offset += len; + start_lsn += len; + + ut_ad(start_lsn == scanned_lsn); + } + + return(FALSE); +} + +/********************************************************//** +Recovers from archived log files, and also from log files, if they exist. 
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +recv_recovery_from_archive_start( +/*=============================*/ + ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn field from the + data files */ + ib_uint64_t limit_lsn, /*!< in: recover up to this lsn if + possible */ + lsn_t first_log_no) /*!< in: number of the first archived + log file to use in the recovery; the + file will be searched from + INNOBASE_LOG_ARCH_DIR specified in + server config file */ +{ + log_group_t* group; + ulint group_id; + ulint trunc_len; + ibool ret; + dberr_t err; + + ut_a(0); + + recv_sys_create(); + recv_sys_init(buf_pool_get_curr_size()); + + recv_recovery_on = TRUE; + recv_recovery_from_backup_on = TRUE; + + recv_sys->limit_lsn = limit_lsn; + + group_id = 0; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + if (group->id == group_id) { + + break; + } + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + if (!group) { + fprintf(stderr, + "InnoDB: There is no log group defined with id %lu!\n", + (ulong) group_id); + return(DB_ERROR); + } + + group->archived_file_no = first_log_no; + + recv_sys->parse_start_lsn = min_flushed_lsn; + + recv_sys->scanned_lsn = 0; + recv_sys->scanned_checkpoint_no = 0; + recv_sys->recovered_lsn = recv_sys->parse_start_lsn; + + recv_sys->archive_group = group; + + ret = FALSE; + + mutex_enter(&(log_sys->mutex)); + + while (!ret) { + ret = log_group_recover_from_archive_file(group); + + /* Close and truncate a possible processed archive file + from the file space */ + + trunc_len = UNIV_PAGE_SIZE + * fil_space_get_size(group->archive_space_id); + if (trunc_len > 0) { + fil_space_truncate_start(group->archive_space_id, + trunc_len); + } + + group->archived_file_no += group->file_size - LOG_FILE_HDR_SIZE; + } + + if (recv_sys->recovered_lsn < limit_lsn) { + + if (!recv_sys->scanned_lsn) { + + recv_sys->scanned_lsn = recv_sys->parse_start_lsn; + } + + mutex_exit(&(log_sys->mutex)); + + err = recv_recovery_from_checkpoint_start(LOG_ARCHIVE, + limit_lsn, + LSN_MAX, + LSN_MAX); + if (err != DB_SUCCESS) { + + return(err); + } + + mutex_enter(&(log_sys->mutex)); + } + + if (limit_lsn != LSN_MAX) { + + recv_apply_hashed_log_recs(FALSE); + + recv_reset_logs(0, FALSE, recv_sys->recovered_lsn); + } + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); +} + +/********************************************************//** +Completes recovery from archive. 
*/ +UNIV_INTERN +void +recv_recovery_from_archive_finish(void) +/*===================================*/ +{ + recv_recovery_from_checkpoint_finish(); + + recv_recovery_from_backup_on = FALSE; +} +#endif /* UNIV_LOG_ARCHIVE */ + + +void recv_dblwr_t::add(byte* page) +{ + pages.push_back(page); +} + +byte* recv_dblwr_t::find_page(ulint space_id, ulint page_no) +{ + std::vector<byte*> matches; + byte* result = 0; + + for (std::list<byte*>::iterator i = pages.begin(); + i != pages.end(); ++i) { + + if ((page_get_space_id(*i) == space_id) + && (page_get_page_no(*i) == page_no)) { + matches.push_back(*i); + } + } + + if (matches.size() == 1) { + result = matches[0]; + } else if (matches.size() > 1) { + + lsn_t max_lsn = 0; + lsn_t page_lsn = 0; + + for (std::vector<byte*>::iterator i = matches.begin(); + i != matches.end(); ++i) { + + page_lsn = mach_read_from_8(*i + FIL_PAGE_LSN); + + if (page_lsn > max_lsn) { + max_lsn = page_lsn; + result = *i; + } + } + } + + return(result); +} + diff --git a/storage/xtradb/mach/mach0data.cc b/storage/xtradb/mach/mach0data.cc new file mode 100644 index 00000000000..df68aab8a18 --- /dev/null +++ b/storage/xtradb/mach/mach0data.cc @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file mach/mach0data.cc +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "mach0data.h" + +#ifdef UNIV_NONINL +#include "mach0data.ic" +#endif + +/*********************************************************//** +Reads a ulint in a compressed form if the log record fully contains it. 
+@return pointer to end of the stored field, NULL if not complete */ +UNIV_INTERN +byte* +mach_parse_compressed( +/*==================*/ + byte* ptr, /*!< in: pointer to buffer from where to read */ + byte* end_ptr,/*!< in: pointer to end of the buffer */ + ulint* val) /*!< out: read value (< 2^32) */ +{ + ulint flag; + + ut_ad(ptr && end_ptr && val); + + if (ptr >= end_ptr) { + + return(NULL); + } + + flag = mach_read_from_1(ptr); + + if (flag < 0x80UL) { + *val = flag; + return(ptr + 1); + + } else if (flag < 0xC0UL) { + if (end_ptr < ptr + 2) { + return(NULL); + } + + *val = mach_read_from_2(ptr) & 0x7FFFUL; + + return(ptr + 2); + + } else if (flag < 0xE0UL) { + if (end_ptr < ptr + 3) { + return(NULL); + } + + *val = mach_read_from_3(ptr) & 0x3FFFFFUL; + + return(ptr + 3); + } else if (flag < 0xF0UL) { + if (end_ptr < ptr + 4) { + return(NULL); + } + + *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL; + + return(ptr + 4); + } else { + ut_ad(flag == 0xF0UL); + + if (end_ptr < ptr + 5) { + return(NULL); + } + + *val = mach_read_from_4(ptr + 1); + return(ptr + 5); + } +} diff --git a/storage/xtradb/mem/mem0dbg.cc b/storage/xtradb/mem/mem0dbg.cc new file mode 100644 index 00000000000..308c2979551 --- /dev/null +++ b/storage/xtradb/mem/mem0dbg.cc @@ -0,0 +1,1050 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file mem/mem0dbg.cc +The memory management: the debug code. This is not a compilation module, +but is included in mem0mem.* ! + +Created 6/9/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef UNIV_MEM_DEBUG +# ifndef UNIV_HOTBACKUP +# include "ha_prototypes.h" +/* The mutex which protects in the debug version the hash table +containing the list of live memory heaps, and also the global +variables below. */ +UNIV_INTERN ib_mutex_t mem_hash_mutex; + +#ifdef UNIV_PFS_MUTEX +/* Key to register mem_hash_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t mem_hash_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +# endif /* !UNIV_HOTBACKUP */ + +/* The following variables contain information about the +extent of memory allocations. Only used in the debug version. +Protected by mem_hash_mutex above. 
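
/* [EDITORIAL SKETCH -- not part of this commit.] mach_parse_compressed()
at the end of the mach0data.cc hunk above decodes InnoDB's variable-length
integer format: the top bits of the first byte select a 1..5 byte encoding
(< 0x80: 1 byte, < 0xC0: 2, < 0xE0: 3, < 0xF0: 4, 0xF0: 5). A sketch of
the matching encoder, written from the decoder's thresholds; the real
writer is mach_write_compressed(): */

#include <assert.h>
#include <stdint.h>

/* Encode val (< 2^32) into buf, returning the number of bytes written. */
static unsigned
ex_write_compressed(unsigned char* buf, uint32_t val)
{
	if (val < 0x80) {			/* 7 bits */
		buf[0] = (unsigned char) val;
		return(1);
	} else if (val < 0x4000) {		/* 15 bits, top flag 10 */
		buf[0] = (unsigned char) (0x80 | (val >> 8));
		buf[1] = (unsigned char) val;
		return(2);
	} else if (val < 0x200000) {		/* 22 bits, top flag 110 */
		buf[0] = (unsigned char) (0xC0 | (val >> 16));
		buf[1] = (unsigned char) (val >> 8);
		buf[2] = (unsigned char) val;
		return(3);
	} else if (val < 0x10000000) {		/* 29 bits, top flag 1110 */
		buf[0] = (unsigned char) (0xE0 | (val >> 24));
		buf[1] = (unsigned char) (val >> 16);
		buf[2] = (unsigned char) (val >> 8);
		buf[3] = (unsigned char) val;
		return(4);
	}
	buf[0] = 0xF0;				/* full 32 bits follow */
	buf[1] = (unsigned char) (val >> 24);
	buf[2] = (unsigned char) (val >> 16);
	buf[3] = (unsigned char) (val >> 8);
	buf[4] = (unsigned char) val;
	return(5);
}

int main(void)
{
	unsigned char b[5];
	assert(ex_write_compressed(b, 0x7F) == 1 && b[0] == 0x7F);
	assert(ex_write_compressed(b, 0x123) == 2
	       && b[0] == 0x81 && b[1] == 0x23);
	return(0);
}
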
*/ + +static ulint mem_n_created_heaps = 0; +static ulint mem_n_allocations = 0; +static ulint mem_total_allocated_memory = 0; +UNIV_INTERN ulint mem_current_allocated_memory = 0; +static ulint mem_max_allocated_memory = 0; +# ifndef UNIV_HOTBACKUP +static ulint mem_last_print_info = 0; +static ibool mem_hash_initialized = FALSE; +# endif /* !UNIV_HOTBACKUP */ + +/* Size of the hash table for memory management tracking */ +#define MEM_HASH_SIZE 997 + +/* The node of the list containing currently allocated memory heaps */ + +struct mem_hash_node_t { + UT_LIST_NODE_T(mem_hash_node_t) + list; /*!< hash list node */ + mem_heap_t* heap; /*!< memory heap */ + const char* file_name;/* file where heap was created*/ + ulint line; /*!< file line of creation */ + ulint nth_heap;/* this is the nth heap created */ + UT_LIST_NODE_T(mem_hash_node_t) + all_list;/* list of all created heaps */ +}; + +typedef UT_LIST_BASE_NODE_T(mem_hash_node_t) mem_hash_cell_t; + +/* The hash table of allocated heaps */ +static mem_hash_cell_t mem_hash_table[MEM_HASH_SIZE]; + +/* The base node of the list of all allocated heaps */ +static mem_hash_cell_t mem_all_list_base; + + + +UNIV_INLINE +mem_hash_cell_t* +mem_hash_get_nth_cell(ulint i); + +/* Accessor function for the hash table. Returns a pointer to the +table cell. */ +UNIV_INLINE +mem_hash_cell_t* +mem_hash_get_nth_cell(ulint i) +{ + ut_a(i < MEM_HASH_SIZE); + + return(&(mem_hash_table[i])); +} + +/* Accessor functions for a memory field in the debug version */ +UNIV_INTERN +void +mem_field_header_set_len(byte* field, ulint len) +{ + mach_write_to_4(field - 2 * sizeof(ulint), len); +} + +UNIV_INTERN +ulint +mem_field_header_get_len(byte* field) +{ + return(mach_read_from_4(field - 2 * sizeof(ulint))); +} + +UNIV_INTERN +void +mem_field_header_set_check(byte* field, ulint check) +{ + mach_write_to_4(field - sizeof(ulint), check); +} + +UNIV_INTERN +ulint +mem_field_header_get_check(byte* field) +{ + return(mach_read_from_4(field - sizeof(ulint))); +} + +UNIV_INTERN +void +mem_field_trailer_set_check(byte* field, ulint check) +{ + mach_write_to_4(field + mem_field_header_get_len(field), check); +} + +UNIV_INTERN +ulint +mem_field_trailer_get_check(byte* field) +{ + return(mach_read_from_4(field + + mem_field_header_get_len(field))); +} +#endif /* UNIV_MEM_DEBUG */ + +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Initializes the memory system. */ +UNIV_INTERN +void +mem_init( +/*=====*/ + ulint size) /*!< in: common pool size in bytes */ +{ +#ifdef UNIV_MEM_DEBUG + + ulint i; + + /* Initialize the hash table */ + ut_a(FALSE == mem_hash_initialized); + + mutex_create(mem_hash_mutex_key, &mem_hash_mutex, SYNC_MEM_HASH); + + for (i = 0; i < MEM_HASH_SIZE; i++) { + UT_LIST_INIT(*mem_hash_get_nth_cell(i)); + } + + UT_LIST_INIT(mem_all_list_base); + + mem_hash_initialized = TRUE; +#endif + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + /* When innodb_use_sys_malloc is set, the + mem_comm_pool won't be used for any allocations. We + create a dummy mem_comm_pool, because some statistics + and debugging code relies on it being initialized. */ + size = 1; + } + + mem_comm_pool = mem_pool_create(size); +} + +/******************************************************************//** +Closes the memory system. 
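
/* [EDITORIAL SKETCH -- not part of this commit.] The accessor functions
above implement a canary scheme: each user field is framed by a header
holding {length, random check value} and a trailer repeating the check
value, so validation can detect buffer overruns. The same idea in a
self-contained miniature; the layout here is illustrative, not the exact
MEM_FIELD_HEADER_SIZE layout used above: */

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct { uint32_t len; uint32_t check; } guard_hdr;

static void* guarded_alloc(size_t n, uint32_t check)
{
	unsigned char*	p = (unsigned char*) malloc(sizeof(guard_hdr) + n + 4);
	guard_hdr	h = { (uint32_t) n, check };

	memcpy(p, &h, sizeof(h));		/* header before the field */
	memcpy(p + sizeof(h) + n, &check, 4);	/* trailer after it */
	return(p + sizeof(h));
}

static int guarded_ok(void* field)
{
	guard_hdr	h;
	uint32_t	trailer;
	unsigned char*	p = (unsigned char*) field;

	memcpy(&h, p - sizeof(h), sizeof(h));
	memcpy(&trailer, p + h.len, 4);
	return(h.check == trailer);		/* overrun clobbers trailer */
}

int main(void)
{
	char*	s = (char*) guarded_alloc(8, 0xBADCAFEu);

	memset(s, 'x', 8);
	assert(guarded_ok(s));
	s[8] = '!';				/* simulate a one-byte overrun */
	assert(!guarded_ok(s));
	free(s - sizeof(guard_hdr));
	return(0);
}
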
*/ +UNIV_INTERN +void +mem_close(void) +/*===========*/ +{ + mem_pool_free(mem_comm_pool); + mem_comm_pool = NULL; +#ifdef UNIV_MEM_DEBUG + mutex_free(&mem_hash_mutex); + mem_hash_initialized = FALSE; +#endif /* UNIV_MEM_DEBUG */ +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_MEM_DEBUG +/******************************************************************//** +Initializes an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_init( +/*===========*/ + byte* buf, /*!< in: memory field */ + ulint n) /*!< in: how many bytes the user requested */ +{ + ulint rnd; + byte* usr_buf; + + usr_buf = buf + MEM_FIELD_HEADER_SIZE; + + /* In the debug version write the length field and the + check fields to the start and the end of the allocated storage. + The field header consists of a length field and + a random number field, in this order. The field trailer contains + the same random number as a check field. */ + + mem_field_header_set_len(usr_buf, n); + + rnd = ut_rnd_gen_ulint(); + + mem_field_header_set_check(usr_buf, rnd); + mem_field_trailer_set_check(usr_buf, rnd); + + /* Update the memory allocation information */ + + mutex_enter(&mem_hash_mutex); + + mem_total_allocated_memory += n; + mem_current_allocated_memory += n; + mem_n_allocations++; + + if (mem_current_allocated_memory > mem_max_allocated_memory) { + mem_max_allocated_memory = mem_current_allocated_memory; + } + + mutex_exit(&mem_hash_mutex); + + /* In the debug version set the buffer to a random + combination of 0xBA and 0xBE */ + + mem_init_buf(usr_buf, n); +} + +/******************************************************************//** +Erases an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_erase( +/*============*/ + byte* buf, /*!< in: memory field */ + ulint n __attribute__((unused))) + /*!< in: how many bytes the user requested */ +{ + byte* usr_buf; + + usr_buf = buf + MEM_FIELD_HEADER_SIZE; + + mutex_enter(&mem_hash_mutex); + mem_current_allocated_memory -= n; + mutex_exit(&mem_hash_mutex); + + /* Check that the field lengths agree */ + ut_ad(n == (ulint) mem_field_header_get_len(usr_buf)); + + /* In the debug version, set the freed space to a random + combination of 0xDE and 0xAD */ + + mem_erase_buf(buf, MEM_SPACE_NEEDED(n)); +} + +/***************************************************************//** +Initializes a buffer to a random combination of hex BA and BE. +Used to initialize allocated memory. */ +UNIV_INTERN +void +mem_init_buf( +/*=========*/ + byte* buf, /*!< in: pointer to buffer */ + ulint n) /*!< in: length of buffer */ +{ + byte* ptr; + + UNIV_MEM_ASSERT_W(buf, n); + + for (ptr = buf; ptr < buf + n; ptr++) { + + if (ut_rnd_gen_ibool()) { + *ptr = 0xBA; + } else { + *ptr = 0xBE; + } + } + + UNIV_MEM_INVALID(buf, n); +} + +/***************************************************************//** +Initializes a buffer to a random combination of hex DE and AD. +Used to erase freed memory. */ +UNIV_INTERN +void +mem_erase_buf( +/*==========*/ + byte* buf, /*!< in: pointer to buffer */ + ulint n) /*!< in: length of buffer */ +{ + byte* ptr; + + UNIV_MEM_ASSERT_W(buf, n); + + for (ptr = buf; ptr < buf + n; ptr++) { + if (ut_rnd_gen_ibool()) { + *ptr = 0xDE; + } else { + *ptr = 0xAD; + } + } + + UNIV_MEM_FREE(buf, n); +} + +/***************************************************************//** +Inserts a created memory heap to the hash table of current allocated +memory heaps. 
*/ +UNIV_INTERN +void +mem_hash_insert( +/*============*/ + mem_heap_t* heap, /*!< in: the created heap */ + const char* file_name, /*!< in: file name of creation */ + ulint line) /*!< in: line where created */ +{ + mem_hash_node_t* new_node; + ulint cell_no ; + + ut_ad(mem_heap_check(heap)); + + mutex_enter(&mem_hash_mutex); + + cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE); + + /* Allocate a new node to the list */ + new_node = static_cast<mem_hash_node_t*>(ut_malloc(sizeof(*new_node))); + + new_node->heap = heap; + new_node->file_name = file_name; + new_node->line = line; + new_node->nth_heap = mem_n_created_heaps; + + /* Insert into lists */ + UT_LIST_ADD_FIRST(list, *mem_hash_get_nth_cell(cell_no), new_node); + + UT_LIST_ADD_LAST(all_list, mem_all_list_base, new_node); + + mem_n_created_heaps++; + + mutex_exit(&mem_hash_mutex); +} + +/***************************************************************//** +Removes a memory heap (which is going to be freed by the caller) +from the list of live memory heaps. Returns the size of the heap +in terms of how much memory in bytes was allocated for the user of +the heap (not the total space occupied by the heap). +Also validates the heap. +NOTE: This function does not free the storage occupied by the +heap itself, only the node in the list of heaps. */ +UNIV_INTERN +void +mem_hash_remove( +/*============*/ + mem_heap_t* heap, /*!< in: the heap to be freed */ + const char* file_name, /*!< in: file name of freeing */ + ulint line) /*!< in: line where freed */ +{ + mem_hash_node_t* node; + ulint cell_no; + ibool error; + ulint size; + + ut_ad(mem_heap_check(heap)); + + mutex_enter(&mem_hash_mutex); + + cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE); + + /* Look for the heap in the hash table list */ + node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell_no)); + + while (node != NULL) { + if (node->heap == heap) { + + break; + } + + node = UT_LIST_GET_NEXT(list, node); + } + + if (node == NULL) { + fprintf(stderr, + "Memory heap or buffer freed in %s line %lu" + " did not exist.\n", + innobase_basename(file_name), (ulong) line); + ut_error; + } + + /* Remove from lists */ + UT_LIST_REMOVE(list, *mem_hash_get_nth_cell(cell_no), node); + + UT_LIST_REMOVE(all_list, mem_all_list_base, node); + + /* Validate the heap which will be freed */ + mem_heap_validate_or_print(node->heap, NULL, FALSE, &error, &size, + NULL, NULL); + if (error) { + fprintf(stderr, + "Inconsistency in memory heap or" + " buffer n:o %lu created\n" + "in %s line %lu and tried to free in %s line %lu.\n" + "Hex dump of 400 bytes around memory heap" + " first block start:\n", + node->nth_heap, + innobase_basename(node->file_name), (ulong) node->line, + innobase_basename(file_name), (ulong) line); + ut_print_buf(stderr, (byte*) node->heap - 200, 400); + fputs("\nDump of the mem heap:\n", stderr); + mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, + &size, NULL, NULL); + ut_error; + } + + /* Free the memory occupied by the node struct */ + ut_free(node); + + mem_current_allocated_memory -= size; + + mutex_exit(&mem_hash_mutex); +} +#endif /* UNIV_MEM_DEBUG */ + +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG +/***************************************************************//** +Checks a memory heap for consistency and prints the contents if requested. +Outputs the sum of sizes of buffers given to the user (only in +the debug version), the physical size of the heap and the number of +blocks in the heap. In case of error returns 0 as sizes and number +of blocks. 
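
/* [EDITORIAL SKETCH -- not part of this commit.] The validator below walks
each block's fields by repeatedly advancing the cursor by
MEM_SPACE_NEEDED(len) and requires the walk to land exactly on the block's
free pointer. The same walk-and-land-exactly pattern in miniature; the
header/trailer sizes are stand-ins, and the real macro also rounds up for
alignment: */

#include <assert.h>
#include <stddef.h>

enum { EX_FHDR = 8, EX_FTRL = 4 };	/* stand-in field header/trailer sizes */

/* Nonzero if fields with the given lengths tile [0, free_off) exactly. */
static int
fields_tile_exactly(const size_t* lens, size_t n_fields, size_t free_off)
{
	size_t	off = 0;
	size_t	i;

	for (i = 0; i < n_fields; i++) {
		off += EX_FHDR + lens[i] + EX_FTRL; /* MEM_SPACE_NEEDED(len) */
		if (off > free_off) {
			return(0);	/* walked past the free pointer */
		}
	}
	return(off == free_off);	/* must land exactly on it */
}

int main(void)
{
	size_t	lens[] = { 16, 3 };

	assert(fields_tile_exactly(lens, 2, (8 + 16 + 4) + (8 + 3 + 4)));
	assert(!fields_tile_exactly(lens, 2, 100));
	return(0);
}
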
*/ +UNIV_INTERN +void +mem_heap_validate_or_print( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap */ + byte* top __attribute__((unused)), + /*!< in: calculate and validate only until + this top pointer in the heap is reached, + if this pointer is NULL, ignored */ + ibool print, /*!< in: if TRUE, prints the contents + of the heap; works only in + the debug version */ + ibool* error, /*!< out: TRUE if error */ + ulint* us_size,/*!< out: allocated memory + (for the user) in the heap, + if a NULL pointer is passed as this + argument, it is ignored; in the + non-debug version this is always -1 */ + ulint* ph_size,/*!< out: physical size of the heap, + if a NULL pointer is passed as this + argument, it is ignored */ + ulint* n_blocks) /*!< out: number of blocks in the heap, + if a NULL pointer is passed as this + argument, it is ignored */ +{ + mem_block_t* block; + ulint total_len = 0; + ulint block_count = 0; + ulint phys_len = 0; +#ifdef UNIV_MEM_DEBUG + ulint len; + byte* field; + byte* user_field; + ulint check_field; +#endif + + /* Pessimistically, we set the parameters to error values */ + if (us_size != NULL) { + *us_size = 0; + } + if (ph_size != NULL) { + *ph_size = 0; + } + if (n_blocks != NULL) { + *n_blocks = 0; + } + *error = TRUE; + + block = heap; + + if (block->magic_n != MEM_BLOCK_MAGIC_N) { + return; + } + + if (print) { + fputs("Memory heap:", stderr); + } + + while (block != NULL) { + phys_len += mem_block_get_len(block); + + if ((block->type == MEM_HEAP_BUFFER) + && (mem_block_get_len(block) > UNIV_PAGE_SIZE)) { + + fprintf(stderr, + "InnoDB: Error: mem block %p" + " length %lu > UNIV_PAGE_SIZE\n", + (void*) block, + (ulong) mem_block_get_len(block)); + /* error */ + + return; + } + +#ifdef UNIV_MEM_DEBUG + /* We can trace the fields of the block only in the debug + version */ + if (print) { + fprintf(stderr, " Block %ld:", block_count); + } + + field = (byte*) block + mem_block_get_start(block); + + if (top && (field == top)) { + + goto completed; + } + + while (field < (byte*) block + mem_block_get_free(block)) { + + /* Calculate the pointer to the storage + which was given to the user */ + + user_field = field + MEM_FIELD_HEADER_SIZE; + + len = mem_field_header_get_len(user_field); + + if (print) { + ut_print_buf(stderr, user_field, len); + putc('\n', stderr); + } + + total_len += len; + check_field = mem_field_header_get_check(user_field); + + if (check_field + != mem_field_trailer_get_check(user_field)) { + /* error */ + + fprintf(stderr, + "InnoDB: Error: block %lx mem" + " field %lx len %lu\n" + "InnoDB: header check field is" + " %lx but trailer %lx\n", + (ulint) block, + (ulint) field, len, check_field, + mem_field_trailer_get_check( + user_field)); + + return; + } + + /* Move to next field */ + field = field + MEM_SPACE_NEEDED(len); + + if (top && (field == top)) { + + goto completed; + } + + } + + /* At the end check that we have arrived to the first free + position */ + + if (field != (byte*) block + mem_block_get_free(block)) { + /* error */ + + fprintf(stderr, + "InnoDB: Error: block %lx end of" + " mem fields %lx\n" + "InnoDB: but block free at %lx\n", + (ulint) block, (ulint) field, + (ulint)((byte*) block + + mem_block_get_free(block))); + + return; + } + +#endif + + block = UT_LIST_GET_NEXT(list, block); + block_count++; + } +#ifdef UNIV_MEM_DEBUG +completed: +#endif + if (us_size != NULL) { + *us_size = total_len; + } + if (ph_size != NULL) { + *ph_size = phys_len; + } + if (n_blocks != NULL) { + *n_blocks = block_count; + } + 
*error = FALSE; +} + +/**************************************************************//** +Prints the contents of a memory heap. */ +static +void +mem_heap_print( +/*===========*/ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ibool error; + ulint us_size; + ulint phys_size; + ulint n_blocks; + + ut_ad(mem_heap_check(heap)); + + mem_heap_validate_or_print(heap, NULL, TRUE, &error, + &us_size, &phys_size, &n_blocks); + fprintf(stderr, + "\nheap type: %lu; size: user size %lu;" + " physical size %lu; blocks %lu.\n", + (ulong) heap->type, (ulong) us_size, + (ulong) phys_size, (ulong) n_blocks); + ut_a(!error); +} + +/**************************************************************//** +Validates the contents of a memory heap. +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_heap_validate( +/*==============*/ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ibool error; + ulint us_size; + ulint phys_size; + ulint n_blocks; + + ut_ad(mem_heap_check(heap)); + + mem_heap_validate_or_print(heap, NULL, FALSE, &error, &us_size, + &phys_size, &n_blocks); + if (error) { + mem_heap_print(heap); + } + + ut_a(!error); + + return(TRUE); +} +#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/**************************************************************//** +Checks that an object is a memory heap (or a block of it). +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_heap_check( +/*===========*/ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ut_a(heap->magic_n == MEM_BLOCK_MAGIC_N); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_MEM_DEBUG +/*****************************************************************//** +TRUE if no memory is currently allocated. +@return TRUE if no heaps exist */ +UNIV_INTERN +ibool +mem_all_freed(void) +/*===============*/ +{ + mem_hash_node_t* node; + ulint heap_count = 0; + ulint i; + + mem_validate(); + + mutex_enter(&mem_hash_mutex); + + for (i = 0; i < MEM_HASH_SIZE; i++) { + + node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i)); + while (node != NULL) { + heap_count++; + node = UT_LIST_GET_NEXT(list, node); + } + } + + mutex_exit(&mem_hash_mutex); + + if (heap_count == 0) { +# ifndef UNIV_HOTBACKUP + ut_a(mem_pool_get_reserved(mem_comm_pool) == 0); +# endif /* !UNIV_HOTBACKUP */ + + return(TRUE); + } else { + return(FALSE); + } +} + +/*****************************************************************//** +Validates the dynamic memory allocation system. +@return TRUE if error */ +UNIV_INTERN +ibool +mem_validate_no_assert(void) +/*========================*/ +{ + mem_hash_node_t* node; + ulint n_heaps = 0; + ulint allocated_mem; + ulint ph_size; + ulint total_allocated_mem = 0; + ibool error = FALSE; + ulint n_blocks; + ulint i; + +# ifndef UNIV_HOTBACKUP + mem_pool_validate(mem_comm_pool); +# endif /* !UNIV_HOTBACKUP */ + + mutex_enter(&mem_hash_mutex); + + for (i = 0; i < MEM_HASH_SIZE; i++) { + + node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i)); + + while (node != NULL) { + n_heaps++; + + mem_heap_validate_or_print(node->heap, NULL, + FALSE, &error, + &allocated_mem, + &ph_size, &n_blocks); + + if (error) { + fprintf(stderr, + "\nERROR!!!!!!!!!!!!!!!!!!!" 
+ "!!!!!!!!!!!!!!!!!!!!!!!\n\n" + "Inconsistency in memory heap" + " or buffer created\n" + "in %s line %lu.\n", + innobase_basename(node->file_name), + node->line); + + mutex_exit(&mem_hash_mutex); + + return(TRUE); + } + + total_allocated_mem += allocated_mem; + node = UT_LIST_GET_NEXT(list, node); + } + } + + if ((n_heaps == 0) && (mem_current_allocated_memory != 0)) { + error = TRUE; + } + + if (mem_total_allocated_memory < mem_current_allocated_memory) { + error = TRUE; + } + + if (mem_max_allocated_memory > mem_total_allocated_memory) { + error = TRUE; + } + + if (mem_n_created_heaps < n_heaps) { + error = TRUE; + } + + mutex_exit(&mem_hash_mutex); + + return(error); +} + +/************************************************************//** +Validates the dynamic memory +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_validate(void) +/*==============*/ +{ + ut_a(!mem_validate_no_assert()); + + return(TRUE); +} +#endif /* UNIV_MEM_DEBUG */ + +/************************************************************//** +Tries to find neigboring memory allocation blocks and dumps to stderr +the neighborhood of a given pointer. */ +UNIV_INTERN +void +mem_analyze_corruption( +/*===================*/ + void* ptr) /*!< in: pointer to place of possible corruption */ +{ + byte* p; + ulint i; + ulint dist; + + fputs("InnoDB: Apparent memory corruption: mem dump ", stderr); + ut_print_buf(stderr, (byte*) ptr - 250, 500); + + fputs("\nInnoDB: Scanning backward trying to find" + " previous allocated mem blocks\n", stderr); + + p = (byte*) ptr; + dist = 0; + + for (i = 0; i < 10; i++) { + for (;;) { + if (((ulint) p) % 4 == 0) { + + if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) { + fprintf(stderr, + "Mem block at - %lu," + " file %s, line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + + if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) { + fprintf(stderr, + "Freed mem block at - %lu," + " file %s, line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + } + + p--; + dist++; + } + + p--; + dist++; + } + + fprintf(stderr, + "InnoDB: Scanning forward trying to find next" + " allocated mem blocks\n"); + + p = (byte*) ptr; + dist = 0; + + for (i = 0; i < 10; i++) { + for (;;) { + if (((ulint) p) % 4 == 0) { + + if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) { + fprintf(stderr, + "Mem block at + %lu, file %s," + " line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + + if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) { + fprintf(stderr, + "Freed mem block at + %lu," + " file %s, line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + } + + p++; + dist++; + } + + p++; + dist++; + } +} + +#ifndef UNIV_HOTBACKUP +/*****************************************************************//** +Prints information of dynamic memory usage and currently allocated +memory heaps or buffers. Can only be used in the debug version. 
*/ +static +void +mem_print_info_low( +/*===============*/ + ibool print_all) /*!< in: if TRUE, all heaps are printed, + else only the heaps allocated after the + previous call of this function */ +{ +#ifdef UNIV_MEM_DEBUG + mem_hash_node_t* node; + ulint n_heaps = 0; + ulint allocated_mem; + ulint ph_size; + ulint total_allocated_mem = 0; + ibool error; + ulint n_blocks; +#endif + FILE* outfile; + + /* outfile = fopen("ibdebug", "a"); */ + + outfile = stdout; + + fprintf(outfile, "\n"); + fprintf(outfile, + "________________________________________________________\n"); + fprintf(outfile, "MEMORY ALLOCATION INFORMATION\n\n"); + +#ifndef UNIV_MEM_DEBUG + + UT_NOT_USED(print_all); + + mem_pool_print_info(outfile, mem_comm_pool); + + fprintf(outfile, + "Sorry, non-debug version cannot give more memory info\n"); + + /* fclose(outfile); */ + + return; +#else + mutex_enter(&mem_hash_mutex); + + fprintf(outfile, "LIST OF CREATED HEAPS AND ALLOCATED BUFFERS: \n\n"); + + if (!print_all) { + fprintf(outfile, "AFTER THE LAST PRINT INFO\n"); + } + + node = UT_LIST_GET_FIRST(mem_all_list_base); + + while (node != NULL) { + n_heaps++; + + if (!print_all && node->nth_heap < mem_last_print_info) { + + goto next_heap; + } + + mem_heap_validate_or_print(node->heap, NULL, + FALSE, &error, &allocated_mem, + &ph_size, &n_blocks); + total_allocated_mem += allocated_mem; + + fprintf(outfile, + "%lu: file %s line %lu of size %lu phys.size %lu" + " with %lu blocks, type %lu\n", + node->nth_heap, + innobase_basename(node->file_name), node->line, + allocated_mem, ph_size, n_blocks, + (node->heap)->type); +next_heap: + node = UT_LIST_GET_NEXT(all_list, node); + } + + fprintf(outfile, "\n"); + + fprintf(outfile, "Current allocated memory : %lu\n", + mem_current_allocated_memory); + fprintf(outfile, "Current allocated heaps and buffers : %lu\n", + n_heaps); + fprintf(outfile, "Cumulative allocated memory : %lu\n", + mem_total_allocated_memory); + fprintf(outfile, "Maximum allocated memory : %lu\n", + mem_max_allocated_memory); + fprintf(outfile, "Cumulative created heaps and buffers : %lu\n", + mem_n_created_heaps); + fprintf(outfile, "Cumulative number of allocations : %lu\n", + mem_n_allocations); + + mem_last_print_info = mem_n_created_heaps; + + mutex_exit(&mem_hash_mutex); + + mem_pool_print_info(outfile, mem_comm_pool); + + /* mem_validate(); */ + + /* fclose(outfile); */ +#endif +} + +/*****************************************************************//** +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers. Can only be used in the debug version. */ +UNIV_INTERN +void +mem_print_info(void) +/*================*/ +{ + mem_print_info_low(TRUE); +} + +/*****************************************************************//** +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers since the last ..._print_info or..._print_new_info. */ +UNIV_INTERN +void +mem_print_new_info(void) +/*====================*/ +{ + mem_print_info_low(FALSE); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/mem/mem0mem.cc b/storage/xtradb/mem/mem0mem.cc new file mode 100644 index 00000000000..e066aff5b30 --- /dev/null +++ b/storage/xtradb/mem/mem0mem.cc @@ -0,0 +1,583 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0mem.cc
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#ifdef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "mem0dbg.cc"
+#include <stdarg.h>
+
+/*
+	THE MEMORY MANAGEMENT
+	=====================
+
+The basic element of the memory management is called a memory heap.
+A memory heap is conceptually a stack from which memory can be
+allocated. The stack may grow infinitely. The top element of the stack
+may be freed, or the whole stack can be freed at one time. The
+advantage of the memory heap concept is that we can avoid using the
+malloc and free functions of C, which are quite expensive: for example,
+on the Solaris + GCC system (50 MHz Sparc, 1993) the pair takes
+3 microseconds, on Win NT + 100 MHz Pentium, 2.5 microseconds.
+When we use a memory heap, we can allocate larger blocks of memory at
+a time and thus reduce overhead. The method is slightly more efficient
+when we allocate the memory from the index page buffer pool, as we can
+claim a new page fast. This is called buffer allocation. When we
+allocate the memory from the dynamic memory of the C environment,
+that is called dynamic allocation.
+
+The default way of operation of the memory heap is the following.
+First, when the heap is created, an initial block of memory is
+allocated. In dynamic allocation this may be about 50 bytes. If more
+space is needed, additional blocks are allocated and they are put into
+a linked list. After the initial block, each allocated block is twice
+the size of the previous, until a threshold is attained, after which
+the sizes of the blocks stay the same. An exception is, of course, the
+case where the caller requests a memory buffer whose size is bigger
+than the threshold. In that case a block big enough must be allocated.
+
+The heap is physically arranged so that if the current block becomes
+full, a new block is allocated and always inserted in the chain of
+blocks as the last block.
+
+In the debug version of the memory management, all the allocated heaps
+are kept in a list (which is implemented as a hash table). Thus we can
+notice if the caller tries to free an already freed heap. In addition,
+each buffer given to the caller contains a start field at the start
+and a trailer field at the end of the buffer.
+
+The start field has the following content:
+A. sizeof(ulint) bytes of field length (in the standard byte order)
+B. sizeof(ulint) bytes of check field (a random number)
+
+The trailer field contains:
+A. 
sizeof(ulint) bytes of check field (the same random number as at the start) + +Thus we can notice if something has been copied over the +borders of the buffer, which is illegal. +The memory in the buffers is initialized to a random byte sequence. +After freeing, all the blocks in the heap are set to random bytes +to help us discover errors which result from the use of +buffers in an already freed heap. */ + +#ifdef MEM_PERIODIC_CHECK + +ibool mem_block_list_inited; +/* List of all mem blocks allocated; protected by the mem_comm_pool mutex */ +UT_LIST_BASE_NODE_T(mem_block_t) mem_block_list; + +#endif + +/**********************************************************************//** +Duplicates a NUL-terminated string, allocated from a memory heap. +@return own: a copy of the string */ +UNIV_INTERN +char* +mem_heap_strdup( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* str) /*!< in: string to be copied */ +{ + return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1))); +} + +/**********************************************************************//** +Duplicate a block of data, allocated from a memory heap. +@return own: a copy of the data */ +UNIV_INTERN +void* +mem_heap_dup( +/*=========*/ + mem_heap_t* heap, /*!< in: memory heap where copy is allocated */ + const void* data, /*!< in: data to be copied */ + ulint len) /*!< in: length of data, in bytes */ +{ + return(memcpy(mem_heap_alloc(heap, len), data, len)); +} + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +UNIV_INTERN +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2) /*!< in: string 2 */ +{ + char* s; + ulint s1_len = strlen(s1); + ulint s2_len = strlen(s2); + + s = static_cast<char*>(mem_heap_alloc(heap, s1_len + s2_len + 1)); + + memcpy(s, s1, s1_len); + memcpy(s + s1_len, s2, s2_len); + + s[s1_len + s2_len] = '\0'; + + return(s); +} + + +/****************************************************************//** +Helper function for mem_heap_printf. +@return length of formatted string, including terminating NUL */ +static +ulint +mem_heap_printf_low( +/*================*/ + char* buf, /*!< in/out: buffer to store formatted string + in, or NULL to just calculate length */ + const char* format, /*!< in: format string */ + va_list ap) /*!< in: arguments */ +{ + ulint len = 0; + + while (*format) { + + /* Does this format specifier have the 'l' length modifier. */ + ibool is_long = FALSE; + + /* Length of one parameter. */ + size_t plen; + + if (*format++ != '%') { + /* Non-format character. */ + + len++; + + if (buf) { + *buf++ = *(format - 1); + } + + continue; + } + + if (*format == 'l') { + is_long = TRUE; + format++; + } + + switch (*format++) { + case 's': + /* string */ + { + char* s = va_arg(ap, char*); + + /* "%ls" is a non-sensical format specifier. */ + ut_a(!is_long); + + plen = strlen(s); + len += plen; + + if (buf) { + memcpy(buf, s, plen); + buf += plen; + } + } + + break; + + case 'u': + /* unsigned int */ + { + char tmp[32]; + unsigned long val; + + /* We only support 'long' values for now. 
*/ + ut_a(is_long); + + val = va_arg(ap, unsigned long); + + plen = sprintf(tmp, "%lu", val); + len += plen; + + if (buf) { + memcpy(buf, tmp, plen); + buf += plen; + } + } + + break; + + case '%': + + /* "%l%" is a non-sensical format specifier. */ + ut_a(!is_long); + + len++; + + if (buf) { + *buf++ = '%'; + } + + break; + + default: + ut_error; + } + } + + /* For the NUL character. */ + len++; + + if (buf) { + *buf = '\0'; + } + + return(len); +} + +/****************************************************************//** +A simple sprintf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). +@return heap-allocated formatted string */ +UNIV_INTERN +char* +mem_heap_printf( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* format, /*!< in: format string */ + ...) +{ + va_list ap; + char* str; + ulint len; + + /* Calculate length of string */ + len = 0; + va_start(ap, format); + len = mem_heap_printf_low(NULL, format, ap); + va_end(ap); + + /* Now create it for real. */ + str = static_cast<char*>(mem_heap_alloc(heap, len)); + va_start(ap, format); + mem_heap_printf_low(str, format, ap); + va_end(ap); + + return(str); +} + +/***************************************************************//** +Creates a memory heap block where data can be allocated. +@return own: memory heap block, NULL if did not succeed (only possible +for MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INTERN +mem_block_t* +mem_heap_create_block_func( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap or NULL if first block + should be created */ + ulint n, /*!< in: number of bytes needed for user data */ +#ifdef UNIV_DEBUG + const char* file_name,/*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type) /*!< in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ +{ +#ifndef UNIV_HOTBACKUP + buf_block_t* buf_block = NULL; +#endif /* !UNIV_HOTBACKUP */ + mem_block_t* block; + ulint len; + + ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER) + || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH)); + + if (heap && heap->magic_n != MEM_BLOCK_MAGIC_N) { + mem_analyze_corruption(heap); + } + + /* In dynamic allocation, calculate the size: block header + data. 
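The statement that follows this comment computes len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n), i.e. the user request rounded up to the allocator alignment plus the block header. A sketch of that arithmetic, with assumed constants standing in for UNIV_MEM_ALIGNMENT and MEM_BLOCK_HEADER_SIZE:

#include <cstddef>

const size_t ALIGNMENT   = 8;   // assumed stand-in for UNIV_MEM_ALIGNMENT
const size_t HEADER_SIZE = 64;  // assumed stand-in for MEM_BLOCK_HEADER_SIZE

// Round a user request up to the allocator alignment; ALIGNMENT must be
// a power of two for the mask trick to work.
size_t space_needed(size_t n)
{
    return (n + ALIGNMENT - 1) & ~(ALIGNMENT - 1);
}

// Total block length: header plus aligned payload.
size_t block_len(size_t n)
{
    return HEADER_SIZE + space_needed(n);
}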
*/ + len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n); + +#ifndef UNIV_HOTBACKUP + if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) { + + ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF); + + block = static_cast<mem_block_t*>( + mem_area_alloc(&len, mem_comm_pool)); + } else { + len = UNIV_PAGE_SIZE; + + if ((type & MEM_HEAP_BTR_SEARCH) && heap) { + /* We cannot allocate the block from the + buffer pool, but must get the free block from + the heap header free block field */ + + buf_block = static_cast<buf_block_t*>(heap->free_block); + heap->free_block = NULL; + + if (UNIV_UNLIKELY(!buf_block)) { + + return(NULL); + } + } else { + buf_block = buf_block_alloc(NULL); + } + + block = (mem_block_t*) buf_block->frame; + } + + if(!block) { + ib_logf(IB_LOG_LEVEL_FATAL, + " InnoDB: Unable to allocate memory of size %lu.\n", + len); + } + block->buf_block = buf_block; + block->free_block = NULL; +#else /* !UNIV_HOTBACKUP */ + len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n); + block = ut_malloc(len); + ut_ad(block); +#endif /* !UNIV_HOTBACKUP */ + + block->magic_n = MEM_BLOCK_MAGIC_N; + ut_d(ut_strlcpy_rev(block->file_name, file_name, + sizeof(block->file_name))); + ut_d(block->line = line); + +#ifdef MEM_PERIODIC_CHECK + mutex_enter(&(mem_comm_pool->mutex)); + + if (!mem_block_list_inited) { + mem_block_list_inited = TRUE; + UT_LIST_INIT(mem_block_list); + } + + UT_LIST_ADD_LAST(mem_block_list, mem_block_list, block); + + mutex_exit(&(mem_comm_pool->mutex)); +#endif + mem_block_set_len(block, len); + mem_block_set_type(block, type); + mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE); + mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE); + + if (UNIV_UNLIKELY(heap == NULL)) { + /* This is the first block of the heap. The field + total_size should be initialized here */ + block->total_size = len; + } else { + /* Not the first allocation for the heap. This block's + total_length field should be set to undefined. */ + ut_d(block->total_size = ULINT_UNDEFINED); + UNIV_MEM_INVALID(&block->total_size, + sizeof block->total_size); + + heap->total_size += len; + } + + ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len); + + return(block); +} + +/***************************************************************//** +Adds a new block to a memory heap. +@return created block, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INTERN +mem_block_t* +mem_heap_add_block( +/*===============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: number of bytes user needs */ +{ + mem_block_t* block; + mem_block_t* new_block; + ulint new_size; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + /* We have to allocate a new block. The size is always at least + doubled until the standard size is reached. After that the size + stays the same, except in cases where the caller needs more space. 
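A sketch of the sizing policy this comment describes, as mem_heap_add_block applies it below: double the last block, clamp to a standard size (or to the buffer-frame limit for buffer-pool heaps), and never return less than the caller asked for. The constants are illustrative, not the real MEM_BLOCK_STANDARD_SIZE or MEM_MAX_ALLOC_IN_BUF:

#include <cstddef>

const size_t STANDARD_SIZE = 8 * 1024;   // illustrative caps
const size_t MAX_IN_BUF    = 16 * 1024;

size_t next_block_size(size_t last_len, size_t n, bool buffer_heap)
{
    size_t new_size = 2 * last_len;            // at least double the last block
    size_t cap = buffer_heap ? MAX_IN_BUF : STANDARD_SIZE;

    if (new_size > cap) {
        new_size = cap;                        // growth stops at the cap
    }
    if (new_size < n) {
        new_size = n;                          // oversized requests get their own size
    }
    return new_size;
}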
*/ + + new_size = 2 * mem_block_get_len(block); + + if (heap->type != MEM_HEAP_DYNAMIC) { + /* From the buffer pool we allocate buffer frames */ + ut_a(n <= MEM_MAX_ALLOC_IN_BUF); + + if (new_size > MEM_MAX_ALLOC_IN_BUF) { + new_size = MEM_MAX_ALLOC_IN_BUF; + } + } else if (new_size > MEM_BLOCK_STANDARD_SIZE) { + + new_size = MEM_BLOCK_STANDARD_SIZE; + } + + if (new_size < n) { + new_size = n; + } + + new_block = mem_heap_create_block(heap, new_size, heap->type, + heap->file_name, heap->line); + if (new_block == NULL) { + + return(NULL); + } + + /* Add the new block as the last block */ + + UT_LIST_INSERT_AFTER(list, heap->base, block, new_block); + + return(new_block); +} + +/******************************************************************//** +Frees a block from a memory heap. */ +UNIV_INTERN +void +mem_heap_block_free( +/*================*/ + mem_heap_t* heap, /*!< in: heap */ + mem_block_t* block) /*!< in: block to free */ +{ + ulint type; + ulint len; +#ifndef UNIV_HOTBACKUP + buf_block_t* buf_block; + + buf_block = static_cast<buf_block_t*>(block->buf_block); +#endif /* !UNIV_HOTBACKUP */ + + if (block->magic_n != MEM_BLOCK_MAGIC_N) { + mem_analyze_corruption(block); + } + + UT_LIST_REMOVE(list, heap->base, block); + +#ifdef MEM_PERIODIC_CHECK + mutex_enter(&(mem_comm_pool->mutex)); + + UT_LIST_REMOVE(mem_block_list, mem_block_list, block); + + mutex_exit(&(mem_comm_pool->mutex)); +#endif + + ut_ad(heap->total_size >= block->len); + heap->total_size -= block->len; + + type = heap->type; + len = block->len; + block->magic_n = MEM_FREED_BLOCK_MAGIC_N; + +#ifndef UNIV_HOTBACKUP + if (!srv_use_sys_malloc) { +#ifdef UNIV_MEM_DEBUG + /* In the debug version we set the memory to a random + combination of hex 0xDE and 0xAD. */ + + mem_erase_buf((byte*) block, len); +#else /* UNIV_MEM_DEBUG */ + UNIV_MEM_ASSERT_AND_FREE(block, len); +#endif /* UNIV_MEM_DEBUG */ + + } + if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) { + + ut_ad(!buf_block); + mem_area_free(block, mem_comm_pool); + } else { + ut_ad(type & MEM_HEAP_BUFFER); + + buf_block_free(buf_block); + } +#else /* !UNIV_HOTBACKUP */ +#ifdef UNIV_MEM_DEBUG + /* In the debug version we set the memory to a random + combination of hex 0xDE and 0xAD. */ + + mem_erase_buf((byte*) block, len); +#else /* UNIV_MEM_DEBUG */ + UNIV_MEM_ASSERT_AND_FREE(block, len); +#endif /* UNIV_MEM_DEBUG */ + ut_free(block); +#endif /* !UNIV_HOTBACKUP */ +} + +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Frees the free_block field from a memory heap. */ +UNIV_INTERN +void +mem_heap_free_block_free( +/*=====================*/ + mem_heap_t* heap) /*!< in: heap */ +{ + if (UNIV_LIKELY_NULL(heap->free_block)) { + + buf_block_free(static_cast<buf_block_t*>(heap->free_block)); + + heap->free_block = NULL; + } +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef MEM_PERIODIC_CHECK +/******************************************************************//** +Goes through the list of all allocated mem blocks, checks their magic +numbers, and reports possible corruption. 
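mem_heap_block_free above relies on mem_erase_buf to fill freed blocks with a recognizable 0xDE/0xAD pattern, so stale pointers into a freed heap show an unmistakable signature in hex dumps instead of quietly reading old data. One simple way to realize that poisoning scheme (a sketch, not the actual mem_erase_buf implementation):

#include <cstddef>

// Fill freed storage with alternating 0xDE 0xAD bytes so reads through
// a stale pointer stand out in a hex dump.
void erase_buf(unsigned char* buf, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        buf[i] = (i % 2 == 0) ? 0xDE : 0xAD;
    }
}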
*/ +UNIV_INTERN +void +mem_validate_all_blocks(void) +/*=========================*/ +{ + mem_block_t* block; + + mutex_enter(&(mem_comm_pool->mutex)); + + block = UT_LIST_GET_FIRST(mem_block_list); + + while (block) { + if (block->magic_n != MEM_BLOCK_MAGIC_N) { + mem_analyze_corruption(block); + } + + block = UT_LIST_GET_NEXT(mem_block_list, block); + } + + mutex_exit(&(mem_comm_pool->mutex)); +} +#endif diff --git a/storage/xtradb/mem/mem0pool.cc b/storage/xtradb/mem/mem0pool.cc new file mode 100644 index 00000000000..fe9a84d21fa --- /dev/null +++ b/storage/xtradb/mem/mem0pool.cc @@ -0,0 +1,727 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file mem/mem0pool.cc +The lowest-level memory management + +Created 5/12/1997 Heikki Tuuri +*************************************************************************/ + +#include "mem0pool.h" +#ifdef UNIV_NONINL +#include "mem0pool.ic" +#endif + +#include "srv0srv.h" +#include "sync0sync.h" +#include "ut0mem.h" +#include "ut0lst.h" +#include "ut0byte.h" +#include "mem0mem.h" +#include "srv0start.h" + +/* We would like to use also the buffer frames to allocate memory. This +would be desirable, because then the memory consumption of the database +would be fixed, and we might even lock the buffer pool to the main memory. +The problem here is that the buffer management routines can themselves call +memory allocation, while the buffer pool mutex is reserved. + +The main components of the memory consumption are: + +1. buffer pool, +2. parsed and optimized SQL statements, +3. data dictionary cache, +4. log buffer, +5. locks for each transaction, +6. hash table for the adaptive index, +7. state and buffers for each SQL query currently being executed, +8. session for each user, and +9. stack for each OS thread. + +Items 1 and 2 are managed by an LRU algorithm. Items 5 and 6 can potentially +consume very much memory. Items 7 and 8 should consume quite little memory, +and the OS should take care of item 9, which too should consume little memory. + +A solution to the memory management: + +1. the buffer pool size is set separately; +2. log buffer size is set separately; +3. the common pool size for all the other entries, except 8, is set separately. + +Problems: we may waste memory if the common pool is set too big. Another +problem is the locks, which may take very much space in big transactions. +Then the shared pool size should be set very big. We can allow locks to take +space from the buffer pool, but the SQL optimizer is then unaware of the +usable size of the buffer pool. 
We could also combine the objects in the +common pool and the buffers in the buffer pool into a single LRU list and +manage it uniformly, but this approach does not take into account the parsing +and other costs unique to SQL statements. + +The locks for a transaction can be seen as a part of the state of the +transaction. Hence, they should be stored in the common pool. We still +have the problem of a very big update transaction, for example, which +will set very many x-locks on rows, and the locks will consume a lot +of memory, say, half of the buffer pool size. + +Another problem is what to do if we are not able to malloc a requested +block of memory from the common pool. Then we can request memory from +the operating system. If it does not help, a system error results. + +Because 5 and 6 may potentially consume very much memory, we let them grow +into the buffer pool. We may let the locks of a transaction take frames +from the buffer pool, when the corresponding memory heap block has grown to +the size of a buffer frame. Similarly for the hash node cells of the locks, +and for the adaptive index. Thus, for each individual transaction, its locks +can occupy at most about the size of the buffer frame of memory in the common +pool, and after that its locks will grow into the buffer pool. */ + +/** Mask used to extract the free bit from area->size */ +#define MEM_AREA_FREE 1 + +/** The smallest memory area total size */ +#define MEM_AREA_MIN_SIZE (2 * MEM_AREA_EXTRA_SIZE) + + +/** Data structure for a memory pool. The space is allocated using the buddy +algorithm, where free list i contains areas of size 2 to power i. */ +struct mem_pool_t{ + byte* buf; /*!< memory pool */ + ulint size; /*!< memory common pool size */ + ulint reserved; /*!< amount of currently allocated + memory */ + ib_mutex_t mutex; /*!< mutex protecting this struct */ + UT_LIST_BASE_NODE_T(mem_area_t) + free_list[64]; /*!< lists of free memory areas: an + area is put to the list whose number + is the 2-logarithm of the area size */ +}; + +/** The common memory pool */ +UNIV_INTERN mem_pool_t* mem_comm_pool = NULL; + +#ifdef UNIV_PFS_MUTEX +/* Key to register mutex in mem_pool_t with performance schema */ +UNIV_INTERN mysql_pfs_key_t mem_pool_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/* We use this counter to check that the mem pool mutex does not leak; +this is to track a strange assertion failure reported at +mysql@lists.mysql.com */ + +UNIV_INTERN ulint mem_n_threads_inside = 0; + +/********************************************************************//** +Reserves the mem pool mutex if we are not in server shutdown. Use +this function only in memory free functions, since only memory +free functions are used during server shutdown. */ +UNIV_INLINE +void +mem_pool_mutex_enter( +/*=================*/ + mem_pool_t* pool) /*!< in: memory pool */ +{ + if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) { + mutex_enter(&(pool->mutex)); + } +} + +/********************************************************************//** +Releases the mem pool mutex if we are not in server shutdown. As +its corresponding mem_pool_mutex_enter() function, use it only +in memory free functions */ +UNIV_INLINE +void +mem_pool_mutex_exit( +/*================*/ + mem_pool_t* pool) /*!< in: memory pool */ +{ + if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) { + mutex_exit(&(pool->mutex)); + } +} + +/********************************************************************//** +Returns memory area size. 
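The accessors that follow pack two facts into the single word mem_area_t::size_and_free: since every area size is a power of two no smaller than MEM_AREA_MIN_SIZE, bit 0 can never be part of a valid size, so it is reused as the free flag. A standalone sketch of that encoding (names are illustrative):

#include <cstddef>

const size_t AREA_FREE = 1;  // same role as MEM_AREA_FREE

size_t get_size(size_t size_and_free)
{
    return size_and_free & ~AREA_FREE;           // mask off the flag bit
}

bool get_free(size_t size_and_free)
{
    return (size_and_free & AREA_FREE) != 0;
}

size_t set_size(size_t size_and_free, size_t size)
{
    return (size_and_free & AREA_FREE) | size;   // keep flag, replace size
}

size_t set_free(size_t size_and_free, bool free_bit)
{
    return (size_and_free & ~AREA_FREE) | (free_bit ? AREA_FREE : 0);
}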
+@return size */ +UNIV_INLINE +ulint +mem_area_get_size( +/*==============*/ + mem_area_t* area) /*!< in: area */ +{ + return(area->size_and_free & ~MEM_AREA_FREE); +} + +/********************************************************************//** +Sets memory area size. */ +UNIV_INLINE +void +mem_area_set_size( +/*==============*/ + mem_area_t* area, /*!< in: area */ + ulint size) /*!< in: size */ +{ + area->size_and_free = (area->size_and_free & MEM_AREA_FREE) + | size; +} + +/********************************************************************//** +Returns memory area free bit. +@return TRUE if free */ +UNIV_INLINE +ibool +mem_area_get_free( +/*==============*/ + mem_area_t* area) /*!< in: area */ +{ +#if TRUE != MEM_AREA_FREE +# error "TRUE != MEM_AREA_FREE" +#endif + return(area->size_and_free & MEM_AREA_FREE); +} + +/********************************************************************//** +Sets memory area free bit. */ +UNIV_INLINE +void +mem_area_set_free( +/*==============*/ + mem_area_t* area, /*!< in: area */ + ibool free) /*!< in: free bit value */ +{ +#if TRUE != MEM_AREA_FREE +# error "TRUE != MEM_AREA_FREE" +#endif + area->size_and_free = (area->size_and_free & ~MEM_AREA_FREE) + | free; +} + +/********************************************************************//** +Creates a memory pool. +@return memory pool */ +UNIV_INTERN +mem_pool_t* +mem_pool_create( +/*============*/ + ulint size) /*!< in: pool size in bytes */ +{ + mem_pool_t* pool; + mem_area_t* area; + ulint i; + ulint used; + + pool = static_cast<mem_pool_t*>(ut_malloc(sizeof(mem_pool_t))); + + pool->buf = static_cast<byte*>(ut_malloc_low(size, TRUE)); + pool->size = size; + + mutex_create(mem_pool_mutex_key, &pool->mutex, SYNC_MEM_POOL); + + /* Initialize the free lists */ + + for (i = 0; i < 64; i++) { + + UT_LIST_INIT(pool->free_list[i]); + } + + used = 0; + + while (size - used >= MEM_AREA_MIN_SIZE) { + + i = ut_2_log(size - used); + + if (ut_2_exp(i) > size - used) { + + /* ut_2_log rounds upward */ + + i--; + } + + area = (mem_area_t*)(pool->buf + used); + + mem_area_set_size(area, ut_2_exp(i)); + mem_area_set_free(area, TRUE); + UNIV_MEM_FREE(MEM_AREA_EXTRA_SIZE + (byte*) area, + ut_2_exp(i) - MEM_AREA_EXTRA_SIZE); + + UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area); + + used = used + ut_2_exp(i); + } + + ut_ad(size >= used); + + pool->reserved = 0; + + return(pool); +} + +/********************************************************************//** +Frees a memory pool. */ +UNIV_INTERN +void +mem_pool_free( +/*==========*/ + mem_pool_t* pool) /*!< in, own: memory pool */ +{ + ut_free(pool->buf); + ut_free(pool); +} + +/********************************************************************//** +Fills the specified free list. 
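mem_pool_create above carves the pool into descending powers of two: it repeatedly takes the largest 2^i that still fits in the remaining space and pushes that area onto free_list[i]. A sketch of the carving loop, using a direct floor-log2 in place of the original's ut_2_log-then-decrement (all names illustrative):

#include <cstddef>
#include <cstdio>

// Largest i with 2^i <= x; the original gets the same result from
// ut_2_log (which rounds up) followed by a decrement.
size_t floor_log2(size_t x)
{
    size_t i = 0;
    while ((size_t(1) << (i + 1)) <= x) {
        i++;
    }
    return i;
}

void carve(size_t pool_size, size_t min_area)
{
    size_t used = 0;
    while (pool_size - used >= min_area) {
        size_t i = floor_log2(pool_size - used);
        std::printf("area of %zu bytes -> free_list[%zu]\n",
                    size_t(1) << i, i);
        used += size_t(1) << i;
    }
}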
+@return TRUE if we were able to insert a block to the free list */ +static +ibool +mem_pool_fill_free_list( +/*====================*/ + ulint i, /*!< in: free list index */ + mem_pool_t* pool) /*!< in: memory pool */ +{ + mem_area_t* area; + mem_area_t* area2; + ibool ret; + + ut_ad(mutex_own(&(pool->mutex))); + + if (UNIV_UNLIKELY(i >= 63)) { + /* We come here when we have run out of space in the + memory pool: */ + + return(FALSE); + } + + area = UT_LIST_GET_FIRST(pool->free_list[i + 1]); + + if (area == NULL) { + if (UT_LIST_GET_LEN(pool->free_list[i + 1]) > 0) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: mem pool free list %lu" + " length is %lu\n" + "InnoDB: though the list is empty!\n", + (ulong) i + 1, + (ulong) + UT_LIST_GET_LEN(pool->free_list[i + 1])); + } + + ret = mem_pool_fill_free_list(i + 1, pool); + + if (ret == FALSE) { + + return(FALSE); + } + + area = UT_LIST_GET_FIRST(pool->free_list[i + 1]); + } + + if (UNIV_UNLIKELY(UT_LIST_GET_LEN(pool->free_list[i + 1]) == 0)) { + mem_analyze_corruption(area); + + ut_error; + } + + UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area); + + area2 = (mem_area_t*)(((byte*) area) + ut_2_exp(i)); + UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE); + + mem_area_set_size(area2, ut_2_exp(i)); + mem_area_set_free(area2, TRUE); + + UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area2); + + mem_area_set_size(area, ut_2_exp(i)); + + UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area); + + return(TRUE); +} + +/********************************************************************//** +Allocates memory from a pool. NOTE: This low-level function should only be +used in mem0mem.*! +@return own: allocated memory buffer */ +UNIV_INTERN +void* +mem_area_alloc( +/*===========*/ + ulint* psize, /*!< in: requested size in bytes; for optimum + space usage, the size should be a power of 2 + minus MEM_AREA_EXTRA_SIZE; + out: allocated size in bytes (greater than + or equal to the requested size) */ + mem_pool_t* pool) /*!< in: memory pool */ +{ + mem_area_t* area; + ulint size; + ulint n; + ibool ret; + + /* If we are using os allocator just make a simple call + to malloc */ + if (UNIV_LIKELY(srv_use_sys_malloc)) { + return(malloc(*psize)); + } + + size = *psize; + n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE)); + + mutex_enter(&(pool->mutex)); + mem_n_threads_inside++; + + ut_a(mem_n_threads_inside == 1); + + area = UT_LIST_GET_FIRST(pool->free_list[n]); + + if (area == NULL) { + ret = mem_pool_fill_free_list(n, pool); + + if (ret == FALSE) { + /* Out of memory in memory pool: we try to allocate + from the operating system with the regular malloc: */ + + mem_n_threads_inside--; + mutex_exit(&(pool->mutex)); + + return(ut_malloc(size)); + } + + area = UT_LIST_GET_FIRST(pool->free_list[n]); + } + + if (!mem_area_get_free(area)) { + fprintf(stderr, + "InnoDB: Error: Removing element from mem pool" + " free list %lu though the\n" + "InnoDB: element is not marked free!\n", + (ulong) n); + + mem_analyze_corruption(area); + + /* Try to analyze a strange assertion failure reported at + mysql@lists.mysql.com where the free bit IS 1 in the + hex dump above */ + + if (mem_area_get_free(area)) { + fprintf(stderr, + "InnoDB: Probably a race condition" + " because now the area is marked free!\n"); + } + + ut_error; + } + + if (UT_LIST_GET_LEN(pool->free_list[n]) == 0) { + fprintf(stderr, + "InnoDB: Error: Removing element from mem pool" + " free list %lu\n" + "InnoDB: though the list length is 0!\n", + (ulong) n); + 
mem_analyze_corruption(area); + + ut_error; + } + + ut_ad(mem_area_get_size(area) == ut_2_exp(n)); + + mem_area_set_free(area, FALSE); + + UT_LIST_REMOVE(free_list, pool->free_list[n], area); + + pool->reserved += mem_area_get_size(area); + + mem_n_threads_inside--; + mutex_exit(&(pool->mutex)); + + ut_ad(mem_pool_validate(pool)); + + *psize = ut_2_exp(n) - MEM_AREA_EXTRA_SIZE; + UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*) area, *psize); + + return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*) area))); +} + +/********************************************************************//** +Gets the buddy of an area, if it exists in pool. +@return the buddy, NULL if no buddy in pool */ +UNIV_INLINE +mem_area_t* +mem_area_get_buddy( +/*===============*/ + mem_area_t* area, /*!< in: memory area */ + ulint size, /*!< in: memory area size */ + mem_pool_t* pool) /*!< in: memory pool */ +{ + mem_area_t* buddy; + + ut_ad(size != 0); + + if (((((byte*) area) - pool->buf) % (2 * size)) == 0) { + + /* The buddy is in a higher address */ + + buddy = (mem_area_t*)(((byte*) area) + size); + + if ((((byte*) buddy) - pool->buf) + size > pool->size) { + + /* The buddy is not wholly contained in the pool: + there is no buddy */ + + buddy = NULL; + } + } else { + /* The buddy is in a lower address; NOTE that area cannot + be at the pool lower end, because then we would end up to + the upper branch in this if-clause: the remainder would be + 0 */ + + buddy = (mem_area_t*)(((byte*) area) - size); + } + + return(buddy); +} + +/********************************************************************//** +Frees memory to a pool. */ +UNIV_INTERN +void +mem_area_free( +/*==========*/ + void* ptr, /*!< in, own: pointer to allocated memory + buffer */ + mem_pool_t* pool) /*!< in: memory pool */ +{ + mem_area_t* area; + mem_area_t* buddy; + void* new_ptr; + ulint size; + ulint n; + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + free(ptr); + + return; + } + + /* It may be that the area was really allocated from the OS with + regular malloc: check if ptr points within our memory pool */ + + if ((byte*) ptr < pool->buf || (byte*) ptr >= pool->buf + pool->size) { + ut_free(ptr); + + return; + } + + area = (mem_area_t*) (((byte*) ptr) - MEM_AREA_EXTRA_SIZE); + + if (mem_area_get_free(area)) { + fprintf(stderr, + "InnoDB: Error: Freeing element to mem pool" + " free list though the\n" + "InnoDB: element is marked free!\n"); + + mem_analyze_corruption(area); + ut_error; + } + + size = mem_area_get_size(area); + UNIV_MEM_FREE(ptr, size - MEM_AREA_EXTRA_SIZE); + + if (size == 0) { + fprintf(stderr, + "InnoDB: Error: Mem area size is 0. 
Possibly a" + " memory overrun of the\n" + "InnoDB: previous allocated area!\n"); + + mem_analyze_corruption(area); + ut_error; + } + +#ifdef UNIV_LIGHT_MEM_DEBUG + if (((byte*) area) + size < pool->buf + pool->size) { + + ulint next_size; + + next_size = mem_area_get_size( + (mem_area_t*)(((byte*) area) + size)); + if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) { + fprintf(stderr, + "InnoDB: Error: Memory area size %lu," + " next area size %lu not a power of 2!\n" + "InnoDB: Possibly a memory overrun of" + " the buffer being freed here.\n", + (ulong) size, (ulong) next_size); + mem_analyze_corruption(area); + + ut_error; + } + } +#endif + buddy = mem_area_get_buddy(area, size, pool); + + n = ut_2_log(size); + + mem_pool_mutex_enter(pool); + mem_n_threads_inside++; + + ut_a(mem_n_threads_inside == 1); + + if (buddy && mem_area_get_free(buddy) + && (size == mem_area_get_size(buddy))) { + + /* The buddy is in a free list */ + + if ((byte*) buddy < (byte*) area) { + new_ptr = ((byte*) buddy) + MEM_AREA_EXTRA_SIZE; + + mem_area_set_size(buddy, 2 * size); + mem_area_set_free(buddy, FALSE); + } else { + new_ptr = ptr; + + mem_area_set_size(area, 2 * size); + } + + /* Remove the buddy from its free list and merge it to area */ + + UT_LIST_REMOVE(free_list, pool->free_list[n], buddy); + + pool->reserved += ut_2_exp(n); + + mem_n_threads_inside--; + mem_pool_mutex_exit(pool); + + mem_area_free(new_ptr, pool); + + return; + } else { + UT_LIST_ADD_FIRST(free_list, pool->free_list[n], area); + + mem_area_set_free(area, TRUE); + + ut_ad(pool->reserved >= size); + + pool->reserved -= size; + } + + mem_n_threads_inside--; + mem_pool_mutex_exit(pool); + + ut_ad(mem_pool_validate(pool)); +} + +/********************************************************************//** +Validates a memory pool. +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_pool_validate( +/*==============*/ + mem_pool_t* pool) /*!< in: memory pool */ +{ + mem_area_t* area; + mem_area_t* buddy; + ulint free; + ulint i; + + mem_pool_mutex_enter(pool); + + free = 0; + + for (i = 0; i < 64; i++) { + + UT_LIST_CHECK(free_list, mem_area_t, pool->free_list[i]); + + for (area = UT_LIST_GET_FIRST(pool->free_list[i]); + area != 0; + area = UT_LIST_GET_NEXT(free_list, area)) { + + ut_a(mem_area_get_free(area)); + ut_a(mem_area_get_size(area) == ut_2_exp(i)); + + buddy = mem_area_get_buddy(area, ut_2_exp(i), pool); + + ut_a(!buddy || !mem_area_get_free(buddy) + || (ut_2_exp(i) != mem_area_get_size(buddy))); + + free += ut_2_exp(i); + } + } + + ut_a(free + pool->reserved == pool->size); + + mem_pool_mutex_exit(pool); + + return(TRUE); +} + +/********************************************************************//** +Prints info of a memory pool. */ +UNIV_INTERN +void +mem_pool_print_info( +/*================*/ + FILE* outfile,/*!< in: output file to write to */ + mem_pool_t* pool) /*!< in: memory pool */ +{ + ulint i; + + mem_pool_validate(pool); + + fprintf(outfile, "INFO OF A MEMORY POOL\n"); + + mutex_enter(&(pool->mutex)); + + for (i = 0; i < 64; i++) { + if (UT_LIST_GET_LEN(pool->free_list[i]) > 0) { + + fprintf(outfile, + "Free list length %lu for" + " blocks of size %lu\n", + (ulong) UT_LIST_GET_LEN(pool->free_list[i]), + (ulong) ut_2_exp(i)); + } + } + + fprintf(outfile, "Pool size %lu, reserved %lu.\n", (ulong) pool->size, + (ulong) pool->reserved); + mutex_exit(&(pool->mutex)); +} + +/********************************************************************//** +Returns the amount of reserved memory. 
+@return reserved memory in bytes */ +UNIV_INTERN +ulint +mem_pool_get_reserved( +/*==================*/ + mem_pool_t* pool) /*!< in: memory pool */ +{ + ulint reserved; + + mutex_enter(&(pool->mutex)); + + reserved = pool->reserved; + + mutex_exit(&(pool->mutex)); + + return(reserved); +} diff --git a/storage/xtradb/mtr/mtr0log.cc b/storage/xtradb/mtr/mtr0log.cc new file mode 100644 index 00000000000..0660c819240 --- /dev/null +++ b/storage/xtradb/mtr/mtr0log.cc @@ -0,0 +1,609 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file mtr/mtr0log.cc +Mini-transaction log routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#include "mtr0log.h" + +#ifdef UNIV_NONINL +#include "mtr0log.ic" +#endif + +#include "buf0buf.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "page0page.h" + +#ifndef UNIV_HOTBACKUP +# include "dict0boot.h" + +/********************************************************//** +Catenates n bytes to the mtr log. */ +UNIV_INTERN +void +mlog_catenate_string( +/*=================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* str, /*!< in: string to write */ + ulint len) /*!< in: string length */ +{ + dyn_array_t* mlog; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return; + } + + mlog = &(mtr->log); + + dyn_push_string(mlog, str, len); +} + +/********************************************************//** +Writes the initial part of a log record consisting of one-byte item +type and four-byte space and page numbers. Also pushes info +to the mtr memo that a buffer page has been modified. */ +UNIV_INTERN +void +mlog_write_initial_log_record( +/*==========================*/ + const byte* ptr, /*!< in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /*!< in: log item type: MLOG_1BYTE, ... */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + byte* log_ptr; + + ut_ad(type <= MLOG_BIGGEST_TYPE); + ut_ad(type > MLOG_8BYTES); + + log_ptr = mlog_open(mtr, 11); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr); + + mlog_close(mtr, log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Parses an initial log record written by mlog_write_initial_log_record. 
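mlog_open(mtr, 11) above reserves the worst case for an initial record: one type byte plus two compressed 32-bit values (space id and page number) of at most five bytes each. A sketch of that budget with a simplified 7-bits-per-byte varint; this illustrates the idea only and is not the exact byte layout of mach_write_compressed:

#include <cstdint>

// Simplified varint: 1..5 bytes for a 32-bit value.
int write_compressed(unsigned char* b, uint32_t val)
{
    int len = 0;
    do {
        unsigned char byte = (unsigned char)(val & 0x7F);
        val >>= 7;
        if (val) {
            byte |= 0x80;               // high bit: more bytes follow
        }
        b[len++] = byte;
    } while (val);
    return len;
}

int write_initial_record(unsigned char* b, unsigned char type,
                         uint32_t space, uint32_t page_no)
{
    int len = 0;
    b[len++] = type;                    // 1 byte of record type
    len += write_compressed(b + len, space);
    len += write_compressed(b + len, page_no);
    return len;                         // at most 1 + 5 + 5 = 11 bytes
}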
+@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_initial_log_record( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* type, /*!< out: log record type: MLOG_1BYTE, ... */ + ulint* space, /*!< out: space id */ + ulint* page_no)/*!< out: page number */ +{ + if (end_ptr < ptr + 1) { + + return(NULL); + } + + *type = (byte)((ulint)*ptr & ~MLOG_SINGLE_REC_FLAG); + ut_ad(*type <= MLOG_BIGGEST_TYPE); + + ptr++; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + ptr = mach_parse_compressed(ptr, end_ptr, space); + + if (ptr == NULL) { + + return(NULL); + } + + ptr = mach_parse_compressed(ptr, end_ptr, page_no); + + return(ptr); +} + +/********************************************************//** +Parses a log record written by mlog_write_ulint or mlog_write_ull. +@return parsed record end, NULL if not a complete record or a corrupt record */ +UNIV_INTERN +byte* +mlog_parse_nbytes( +/*==============*/ + ulint type, /*!< in: log record type: MLOG_1BYTE, ... */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* page, /*!< in: page where to apply the log record, or NULL */ + void* page_zip)/*!< in/out: compressed page, or NULL */ +{ + ulint offset; + ulint val; + ib_uint64_t dval; + + ut_a(type <= MLOG_8BYTES); + ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX); + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + if (offset >= UNIV_PAGE_SIZE) { + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (type == MLOG_8BYTES) { + ptr = mach_ull_parse_compressed(ptr, end_ptr, &dval); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + if (page_zip) { + mach_write_to_8 + (((page_zip_des_t*) page_zip)->data + + offset, dval); + } + mach_write_to_8(page + offset, dval); + } + + return(ptr); + } + + ptr = mach_parse_compressed(ptr, end_ptr, &val); + + if (ptr == NULL) { + + return(NULL); + } + + switch (type) { + case MLOG_1BYTE: + if (UNIV_UNLIKELY(val > 0xFFUL)) { + goto corrupt; + } + if (page) { + if (page_zip) { + mach_write_to_1 + (((page_zip_des_t*) page_zip)->data + + offset, val); + } + mach_write_to_1(page + offset, val); + } + break; + case MLOG_2BYTES: + if (UNIV_UNLIKELY(val > 0xFFFFUL)) { + goto corrupt; + } + if (page) { + if (page_zip) { + mach_write_to_2 + (((page_zip_des_t*) page_zip)->data + + offset, val); + } + mach_write_to_2(page + offset, val); + } + break; + case MLOG_4BYTES: + if (page) { + if (page_zip) { + mach_write_to_4 + (((page_zip_des_t*) page_zip)->data + + offset, val); + } + mach_write_to_4(page + offset, val); + } + break; + default: + corrupt: + recv_sys->found_corrupt_log = TRUE; + ptr = NULL; + } + + return(ptr); +} + +/********************************************************//** +Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log +record to the mini-transaction log if mtr is not NULL. 
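mlog_parse_nbytes above shows the parsing contract shared by all of these parse functions: return NULL when the buffer might merely be incomplete (recovery will retry once more log has been read), and set recv_sys->found_corrupt_log only for values that can never be valid, such as an offset beyond the page. A sketch of the pattern (an assumed 16 KB page size stands in for UNIV_PAGE_SIZE):

#include <cstddef>
#include <cstdint>

const size_t PAGE_SIZE = 16 * 1024;  // assumed stand-in for UNIV_PAGE_SIZE

// Returns the advanced pointer, or NULL if the record is incomplete;
// *corrupt is set only for values that can never be valid.
const unsigned char* parse_offset(const unsigned char* ptr,
                                  const unsigned char* end_ptr,
                                  uint16_t* out, bool* corrupt)
{
    if (end_ptr < ptr + 2) {
        return NULL;                            // incomplete: retry later
    }
    *out = (uint16_t)((ptr[0] << 8) | ptr[1]);  // big-endian, like mach_read_from_2
    if (*out >= PAGE_SIZE) {
        *corrupt = true;                        // impossible offset: corrupt log
        return NULL;
    }
    return ptr + 2;
}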
*/ +UNIV_INTERN +void +mlog_write_ulint( +/*=============*/ + byte* ptr, /*!< in: pointer where to write */ + ulint val, /*!< in: value to write */ + byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + switch (type) { + case MLOG_1BYTE: + mach_write_to_1(ptr, val); + break; + case MLOG_2BYTES: + mach_write_to_2(ptr, val); + break; + case MLOG_4BYTES: + mach_write_to_4(ptr, val); + break; + default: + ut_error; + } + + if (mtr != 0) { + byte* log_ptr = mlog_open(mtr, 11 + 2 + 5); + + /* If no logging is requested, we may return now */ + + if (log_ptr != 0) { + + log_ptr = mlog_write_initial_log_record_fast( + ptr, type, log_ptr, mtr); + + mach_write_to_2(log_ptr, page_offset(ptr)); + log_ptr += 2; + + log_ptr += mach_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); + } + } +} + +/********************************************************//** +Writes 8 bytes to a file page. Writes the corresponding log +record to the mini-transaction log, only if mtr is not NULL */ +UNIV_INTERN +void +mlog_write_ull( +/*===========*/ + byte* ptr, /*!< in: pointer where to write */ + ib_uint64_t val, /*!< in: value to write */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + mach_write_to_8(ptr, val); + + if (mtr != 0) { + byte* log_ptr = mlog_open(mtr, 11 + 2 + 9); + + /* If no logging is requested, we may return now */ + if (log_ptr != 0) { + + log_ptr = mlog_write_initial_log_record_fast( + ptr, MLOG_8BYTES, log_ptr, mtr); + + mach_write_to_2(log_ptr, page_offset(ptr)); + log_ptr += 2; + + log_ptr += mach_ull_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); + } + } +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Writes a string to a file page buffered in the buffer pool. Writes the +corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_write_string( +/*==============*/ + byte* ptr, /*!< in: pointer where to write */ + const byte* str, /*!< in: string to write */ + ulint len, /*!< in: string length */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(ptr && mtr); + ut_a(len < UNIV_PAGE_SIZE); + + memcpy(ptr, str, len); + + mlog_log_string(ptr, len, mtr); +} + +/********************************************************//** +Logs a write of a string to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_log_string( +/*============*/ + byte* ptr, /*!< in: pointer written to */ + ulint len, /*!< in: string length */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + byte* log_ptr; + + ut_ad(ptr && mtr); + ut_ad(len <= UNIV_PAGE_SIZE); + + log_ptr = mlog_open(mtr, 30); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_WRITE_STRING, + log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(ptr)); + log_ptr += 2; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, ptr, len); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Parses a log record written by mlog_write_string. 
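The writers above (mlog_write_ulint, mlog_write_ull, mlog_write_string) all follow the same order: change the page first, then append a record of the form <type, page offset, new value> to the mini-transaction log so the change can be redone at recovery. A much-simplified sketch of that write-then-log shape; the real records also carry the space id and page number and use compressed encodings:

#include <cstdint>
#include <vector>

struct mini_log {
    std::vector<unsigned char> bytes;   // stands in for the mtr log dyn array
};

// Apply a two-byte change to the page, then append a redo record so
// recovery can repeat the same write.
void log_write_2bytes(mini_log& log, unsigned char type,
                      uint16_t offset, uint16_t val, unsigned char* page)
{
    page[offset]     = (unsigned char)(val >> 8);   // change the page first
    page[offset + 1] = (unsigned char)(val & 0xFF); // (big-endian)

    log.bytes.push_back(type);                      // then log the change
    log.bytes.push_back((unsigned char)(offset >> 8));
    log.bytes.push_back((unsigned char)(offset & 0xFF));
    log.bytes.push_back((unsigned char)(val >> 8));
    log.bytes.push_back((unsigned char)(val & 0xFF));
}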
+@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_string( +/*==============*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* page, /*!< in: page where to apply the log record, or NULL */ + void* page_zip)/*!< in/out: compressed page, or NULL */ +{ + ulint offset; + ulint len; + + ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX); + + if (end_ptr < ptr + 4) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + len = mach_read_from_2(ptr); + ptr += 2; + + if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) { + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (end_ptr < ptr + len) { + + return(NULL); + } + + if (page) { + if (page_zip) { + memcpy(((page_zip_des_t*) page_zip)->data + + offset, ptr, len); + } + memcpy(page + offset, ptr, len); + } + + return(ptr + len); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. +@return buffer, NULL if log mode MTR_LOG_NONE */ +UNIV_INTERN +byte* +mlog_open_and_write_index( +/*======================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* rec, /*!< in: index record or page */ + const dict_index_t* index, /*!< in: record descriptor */ + byte type, /*!< in: log item type */ + ulint size) /*!< in: requested buffer size in bytes + (if 0, calls mlog_close() and + returns NULL) */ +{ + byte* log_ptr; + const byte* log_start; + const byte* log_end; + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + if (!page_rec_is_comp(rec)) { + log_start = log_ptr = mlog_open(mtr, 11 + size); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + log_end = log_ptr + 11 + size; + } else { + ulint i; + ulint n = dict_index_get_n_fields(index); + /* total size needed */ + ulint total = 11 + size + (n + 2) * 2; + ulint alloc = total; + /* allocate at most DYN_ARRAY_DATA_SIZE at a time */ + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + mach_write_to_2(log_ptr, n); + log_ptr += 2; + mach_write_to_2(log_ptr, + dict_index_get_n_unique_in_tree(index)); + log_ptr += 2; + for (i = 0; i < n; i++) { + dict_field_t* field; + const dict_col_t* col; + ulint len; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + len = field->fixed_len; + ut_ad(len < 0x7fff); + if (len == 0 + && (col->len > 255 || col->mtype == DATA_BLOB)) { + /* variable-length field + with maximum length > 255 */ + len = 0x7fff; + } + if (col->prtype & DATA_NOT_NULL) { + len |= 0x8000; + } + if (log_ptr + 2 > log_end) { + mlog_close(mtr, log_ptr); + ut_a(total > (ulint) (log_ptr - log_start)); + total -= log_ptr - log_start; + alloc = total; + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + } + mach_write_to_2(log_ptr, len); + log_ptr += 2; + } + } + if (size == 0) { + mlog_close(mtr, log_ptr); + log_ptr = NULL; + } else if (log_ptr + size > log_end) { + 
mlog_close(mtr, log_ptr); + log_ptr = mlog_open(mtr, size); + } + return(log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Parses a log record written by mlog_open_and_write_index. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_index( +/*=============*/ + byte* ptr, /*!< in: buffer */ + const byte* end_ptr,/*!< in: buffer end */ + ibool comp, /*!< in: TRUE=compact row format */ + dict_index_t** index) /*!< out, own: dummy index */ +{ + ulint i, n, n_uniq; + dict_table_t* table; + dict_index_t* ind; + + ut_ad(comp == FALSE || comp == TRUE); + + if (comp) { + if (end_ptr < ptr + 4) { + return(NULL); + } + n = mach_read_from_2(ptr); + ptr += 2; + n_uniq = mach_read_from_2(ptr); + ptr += 2; + ut_ad(n_uniq <= n); + if (end_ptr < ptr + n * 2) { + return(NULL); + } + } else { + n = n_uniq = 1; + } + table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n, + comp ? DICT_TF_COMPACT : 0, 0, true); + ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY", + DICT_HDR_SPACE, 0, n); + ind->table = table; + ind->n_uniq = (unsigned int) n_uniq; + if (n_uniq != n) { + ut_a(n_uniq + DATA_ROLL_PTR <= n); + ind->type = DICT_CLUSTERED; + } + if (comp) { + for (i = 0; i < n; i++) { + ulint len = mach_read_from_2(ptr); + ptr += 2; + /* The high-order bit of len is the NOT NULL flag; + the rest is 0 or 0x7fff for variable-length fields, + and 1..0x7ffe for fixed-length fields. */ + dict_mem_table_add_col( + table, NULL, NULL, + ((len + 1) & 0x7fff) <= 1 + ? DATA_BINARY : DATA_FIXBINARY, + len & 0x8000 ? DATA_NOT_NULL : 0, + len & 0x7fff); + + dict_index_add_col(ind, table, + dict_table_get_nth_col(table, i), + 0); + } + dict_table_add_system_columns(table, table->heap); + if (n_uniq != n) { + /* Identify DB_TRX_ID and DB_ROLL_PTR in the index. */ + ut_a(DATA_TRX_ID_LEN + == dict_index_get_nth_col(ind, DATA_TRX_ID - 1 + + n_uniq)->len); + ut_a(DATA_ROLL_PTR_LEN + == dict_index_get_nth_col(ind, DATA_ROLL_PTR - 1 + + n_uniq)->len); + ind->fields[DATA_TRX_ID - 1 + n_uniq].col + = &table->cols[n + DATA_TRX_ID]; + ind->fields[DATA_ROLL_PTR - 1 + n_uniq].col + = &table->cols[n + DATA_ROLL_PTR]; + } + } + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + ind->cached = TRUE; + *index = ind; + return(ptr); +} diff --git a/storage/xtradb/mtr/mtr0mtr.cc b/storage/xtradb/mtr/mtr0mtr.cc new file mode 100644 index 00000000000..d17b5c5259d --- /dev/null +++ b/storage/xtradb/mtr/mtr0mtr.cc @@ -0,0 +1,441 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file mtr/mtr0mtr.cc +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#include "mtr0mtr.h" + +#ifdef UNIV_NONINL +#include "mtr0mtr.ic" +#endif + +#include "buf0buf.h" +#include "buf0flu.h" +#include "page0types.h" +#include "mtr0log.h" +#include "log0log.h" + +#ifndef UNIV_HOTBACKUP +# include "log0recv.h" + +/***************************************************//** +Checks if a mini-transaction is dirtying a clean page. +@return TRUE if the mtr is dirtying a clean page. */ +UNIV_INTERN +ibool +mtr_block_dirtied( +/*==============*/ + const buf_block_t* block) /*!< in: block being x-fixed */ +{ + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); + + /* It is OK to read oldest_modification because no + other thread can be performing a write of it and it + is only during write that the value is reset to 0. */ + return(block->page.oldest_modification == 0); +} + +/*****************************************************************//** +Releases the item in the slot given. */ +static __attribute__((nonnull)) +void +mtr_memo_slot_release_func( +/*=======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in/out: mini-transaction */ +#endif /* UNIV_DEBUG */ + mtr_memo_slot_t* slot) /*!< in: memo slot */ +{ + void* object = slot->object; + slot->object = NULL; + + /* slot release is a local operation for the current mtr. + We must not be holding the flush_order mutex while + doing this. */ + ut_ad(!log_flush_order_mutex_own()); + + switch (slot->type) { + case MTR_MEMO_PAGE_S_FIX: + case MTR_MEMO_PAGE_X_FIX: + case MTR_MEMO_BUF_FIX: + buf_page_release((buf_block_t*) object, slot->type); + break; + case MTR_MEMO_S_LOCK: + rw_lock_s_unlock((prio_rw_lock_t*) object); + break; + case MTR_MEMO_X_LOCK: + rw_lock_x_unlock((prio_rw_lock_t*) object); + break; +#ifdef UNIV_DEBUG + default: + ut_ad(slot->type == MTR_MEMO_MODIFY); + ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX)); +#endif /* UNIV_DEBUG */ + } +} + +#ifdef UNIV_DEBUG +# define mtr_memo_slot_release(mtr, slot) mtr_memo_slot_release_func(mtr, slot) +#else /* UNIV_DEBUG */ +# define mtr_memo_slot_release(mtr, slot) mtr_memo_slot_release_func(slot) +#endif /* UNIV_DEBUG */ + +/**********************************************************//** +Releases the mlocks and other objects stored in an mtr memo. +They are released in the order opposite to which they were pushed +to the memo. 
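+Since the memo is a dyn_array of fixed-size mtr_memo_slot_t entries
+packed back-to-back in blocks, the reverse walk in the function body
+can be sketched as follows (using the dyn_block accessors the code
+itself relies on):
+
+    for (block = last block; block != NULL; block = previous block) {
+        slot = (mtr_memo_slot_t*) (block data + block used length);
+        while (slot-- != (mtr_memo_slot_t*) block data) {
+            if (slot->object != NULL) release slot;
+        }
+    }
+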
*/ +static __attribute__((nonnull)) +void +mtr_memo_pop_all( +/*=============*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in + commit */ + + for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo); + block; + block = dyn_array_get_prev_block(&mtr->memo, block)) { + const mtr_memo_slot_t* start + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block)); + mtr_memo_slot_t* slot + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block) + + dyn_block_get_used(block)); + + ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t))); + + while (slot-- != start) { + if (slot->object != NULL) { + mtr_memo_slot_release(mtr, slot); + } + } + } +} + +/*****************************************************************//** +Releases the item in the slot given. */ +static +void +mtr_memo_slot_note_modification( +/*============================*/ + mtr_t* mtr, /*!< in: mtr */ + mtr_memo_slot_t* slot) /*!< in: memo slot */ +{ + ut_ad(mtr->modifications); + ut_ad(!srv_read_only_mode); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + + if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_block_t* block = (buf_block_t*) slot->object; + + ut_ad(!mtr->made_dirty || log_flush_order_mutex_own()); + buf_flush_note_modification(block, mtr); + } +} + +/**********************************************************//** +Add the modified pages to the buffer flush list. They are released +in the order opposite to which they were pushed to the memo. NOTE! It is +essential that the x-rw-lock on a modified buffer page is not released +before buf_page_note_modification is called for that page! Otherwise, +some thread might race to modify it, and the flush list sort order on +lsn would be destroyed. */ +static +void +mtr_memo_note_modifications( +/*========================*/ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(!srv_read_only_mode); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in + commit */ + + for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo); + block; + block = dyn_array_get_prev_block(&mtr->memo, block)) { + const mtr_memo_slot_t* start + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block)); + mtr_memo_slot_t* slot + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block) + + dyn_block_get_used(block)); + + ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t))); + + while (slot-- != start) { + if (slot->object != NULL) { + mtr_memo_slot_note_modification(mtr, slot); + } + } + } +} + +/************************************************************//** +Append the dirty pages to the flush list. */ +static +void +mtr_add_dirtied_pages_to_flush_list( +/*================================*/ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ut_ad(!srv_read_only_mode); + + /* No need to acquire log_flush_order_mutex if this mtr has + not dirtied a clean page. log_flush_order_mutex is used to + ensure ordered insertions in the flush_list. We need to + insert in the flush_list iff the page in question was clean + before modifications. */ + if (mtr->made_dirty) { + log_flush_order_mutex_enter(); + } + + /* It is now safe to release the log mutex because the + flush_order mutex will ensure that we are the first one + to insert into the flush list. 
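+Condensing the protocol of this function (the flush_order mutex is
+taken above only when mtr->made_dirty, i.e. only when a previously
+clean page must be inserted into the flush list):
+
+    hold log mutex, start/end LSNs already assigned
+    if (made_dirty)  enter flush_order mutex
+    release log mutex                  <- the call right below
+    note modifications                 <- adds pages in LSN order
+    if (made_dirty)  exit flush_order mutex
+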
*/ + log_release(); + + if (mtr->modifications) { + mtr_memo_note_modifications(mtr); + } + + if (mtr->made_dirty) { + log_flush_order_mutex_exit(); + } +} + +/************************************************************//** +Writes the contents of a mini-transaction log, if any, to the database log. */ +static +void +mtr_log_reserve_and_write( +/*======================*/ + mtr_t* mtr) /*!< in/out: mtr */ +{ + dyn_array_t* mlog; + ulint data_size; + byte* first_data; + + ut_ad(!srv_read_only_mode); + + mlog = &(mtr->log); + + first_data = dyn_block_get_data(mlog); + + if (mtr->n_log_recs > 1) { + mlog_catenate_ulint(mtr, MLOG_MULTI_REC_END, MLOG_1BYTE); + } else { + *first_data = (byte)((ulint)*first_data + | MLOG_SINGLE_REC_FLAG); + } + + if (mlog->heap == NULL) { + ulint len; + + len = mtr->log_mode != MTR_LOG_NO_REDO + ? dyn_block_get_used(mlog) : 0; + + mtr->end_lsn = log_reserve_and_write_fast( + first_data, len, &mtr->start_lsn); + + if (mtr->end_lsn) { + + /* Success. We have the log mutex. + Add pages to flush list and exit */ + mtr_add_dirtied_pages_to_flush_list(mtr); + + return; + } + } else { + mutex_enter(&log_sys->mutex); + } + + data_size = dyn_array_get_data_size(mlog); + + /* Open the database log for log_write_low */ + mtr->start_lsn = log_open(data_size); + + if (mtr->log_mode == MTR_LOG_ALL) { + + for (dyn_block_t* block = mlog; + block != 0; + block = dyn_array_get_next_block(mlog, block)) { + + log_write_low( + dyn_block_get_data(block), + dyn_block_get_used(block)); + } + + } else { + ut_ad(mtr->log_mode == MTR_LOG_NONE + || mtr->log_mode == MTR_LOG_NO_REDO); + /* Do nothing */ + } + + mtr->end_lsn = log_close(); + + mtr_add_dirtied_pages_to_flush_list(mtr); +} +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +Commits a mini-transaction. */ +UNIV_INTERN +void +mtr_commit( +/*=======*/ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad(!mtr->inside_ibuf); + ut_d(mtr->state = MTR_COMMITTING); + +#ifndef UNIV_HOTBACKUP + /* This is a dirty read, for debugging. */ + ut_ad(!recv_no_log_write); + + if (mtr->modifications && mtr->n_log_recs) { + ut_ad(!srv_read_only_mode); + mtr_log_reserve_and_write(mtr); + } + + mtr_memo_pop_all(mtr); +#endif /* !UNIV_HOTBACKUP */ + + dyn_array_free(&(mtr->memo)); + dyn_array_free(&(mtr->log)); +#ifdef UNIV_DEBUG_VALGRIND + /* Declare everything uninitialized except + mtr->start_lsn, mtr->end_lsn and mtr->state. */ + { + lsn_t start_lsn = mtr->start_lsn; + lsn_t end_lsn = mtr->end_lsn; + UNIV_MEM_INVALID(mtr, sizeof *mtr); + mtr->start_lsn = start_lsn; + mtr->end_lsn = end_lsn; + } +#endif /* UNIV_DEBUG_VALGRIND */ + ut_d(mtr->state = MTR_COMMITTED); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************//** +Releases an object in the memo stack. +@return true if released */ +UNIV_INTERN +bool +mtr_memo_release( +/*=============*/ + mtr_t* mtr, /*!< in/out: mini-transaction */ + void* object, /*!< in: object */ + ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */ +{ + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + /* We cannot release a page that has been written to in the + middle of a mini-transaction. 
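+(A hypothetical call site, for illustration only: a caller that
+s-latched a page it turns out not to need could drop it early with
+
+    mtr_memo_release(mtr, block, MTR_MEMO_PAGE_S_FIX);
+
+whereas an x-fixed page that may have been modified has to stay
+latched until mtr_commit(); the assertion just below enforces this.)
+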
*/ + ut_ad(!mtr->modifications || type != MTR_MEMO_PAGE_X_FIX); + + for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo); + block; + block = dyn_array_get_prev_block(&mtr->memo, block)) { + const mtr_memo_slot_t* start + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block)); + mtr_memo_slot_t* slot + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block) + + dyn_block_get_used(block)); + + ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t))); + + while (slot-- != start) { + if (object == slot->object && type == slot->type) { + mtr_memo_slot_release(mtr, slot); + return(true); + } + } + } + + return(false); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. +@return value read */ +UNIV_INTERN +ulint +mtr_read_ulint( +/*===========*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr __attribute__((unused))) + /*!< in: mini-transaction handle */ +{ + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX)); + + return(mach_read_ulint(ptr, type)); +} + +#ifdef UNIV_DEBUG +# ifndef UNIV_HOTBACKUP +/**********************************************************//** +Checks if memo contains the given page. +@return TRUE if contains */ +UNIV_INTERN +ibool +mtr_memo_contains_page( +/*===================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* ptr, /*!< in: pointer to buffer frame */ + ulint type) /*!< in: type of object */ +{ + return(mtr_memo_contains(mtr, buf_block_align(ptr), type)); +} + +/*********************************************************//** +Prints info of an mtr handle. */ +UNIV_INTERN +void +mtr_print( +/*======*/ + mtr_t* mtr) /*!< in: mtr */ +{ + fprintf(stderr, + "Mini-transaction handle: memo size %lu bytes" + " log size %lu bytes\n", + (ulong) dyn_array_get_data_size(&(mtr->memo)), + (ulong) dyn_array_get_data_size(&(mtr->log))); +} +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_DEBUG */ diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc new file mode 100644 index 00000000000..b72939564d8 --- /dev/null +++ b/storage/xtradb/os/os0file.cc @@ -0,0 +1,6004 @@ +/*********************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file os/os0file.cc +The interface to the operating system file i/o primitives + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" + +#ifdef UNIV_NONINL +#include "os0file.ic" +#endif + +#include "ut0mem.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "btr0types.h" +#include "trx0trx.h" +#include "srv0mon.h" +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" +# include "os0thread.h" +#else /* !UNIV_HOTBACKUP */ +# ifdef __WIN__ +/* Add includes for the _stat() call to compile on Windows */ +# include <sys/types.h> +# include <sys/stat.h> +# include <errno.h> +# endif /* __WIN__ */ +#endif /* !UNIV_HOTBACKUP */ + +#if defined(LINUX_NATIVE_AIO) +#include <libaio.h> +#endif + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include <sys/ioctl.h> +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + +/** Insert buffer segment id */ +static const ulint IO_IBUF_SEGMENT = 0; + +/** Log segment id */ +static const ulint IO_LOG_SEGMENT = 1; + +/* This specifies the file permissions InnoDB uses when it creates files in +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to +my_umask */ + +#ifndef __WIN__ +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +#else +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = 0; +#endif /* __WIN__ */ + +#ifndef UNIV_HOTBACKUP +/* We use these mutexes to protect lseek + file i/o operation, if the +OS does not provide an atomic pread or pwrite, or similar */ +#define OS_FILE_N_SEEK_MUTEXES 16 +UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; + +/* In simulated aio, merge at most this many consecutive i/os */ +#define OS_AIO_MERGE_N_CONSECUTIVE 64 + +/********************************************************************** + +InnoDB AIO Implementation: +========================= + +We support native AIO for windows and linux. For rest of the platforms +we simulate AIO by special io-threads servicing the IO-requests. + +Simulated AIO: +============== + +In platforms where we 'simulate' AIO following is a rough explanation +of the high level design. +There are four io-threads (for ibuf, log, read, write). +All synchronous IO requests are serviced by the calling thread using +os_file_write/os_file_read. The Asynchronous requests are queued up +in an array (there are four such arrays) by the calling thread. +Later these requests are picked up by the io-thread and are serviced +synchronously. + +Windows native AIO: +================== + +If srv_use_native_aio is not set then windows follow the same +code as simulated AIO. If the flag is set then native AIO interface +is used. On windows, one of the limitation is that if a file is opened +for AIO no synchronous IO can be done on it. Therefore we have an +extra fifth array to queue up synchronous IO requests. +There are innodb_file_io_threads helper threads. These threads work +on the four arrays mentioned above in Simulated AIO. No thread is +required for the sync array. 
+If a synchronous IO request is made, it is first queued in the sync +array. Then the calling thread itself waits on the request, thus +making the call synchronous. +If an AIO request is made the calling thread not only queues it in the +array but also submits the requests. The helper thread then collects +the completed IO request and calls completion routine on it. + +Linux native AIO: +================= + +If we have libaio installed on the system and innodb_use_native_aio +is set to TRUE we follow the code path of native AIO, otherwise we +do simulated AIO. +There are innodb_file_io_threads helper threads. These threads work +on the four arrays mentioned above in Simulated AIO. +If a synchronous IO request is made, it is handled by calling +os_file_write/os_file_read. +If an AIO request is made the calling thread not only queues it in the +array but also submits the requests. The helper thread then collects +the completed IO request and calls completion routine on it. + +**********************************************************************/ + +/** Flag: enable debug printout for asynchronous i/o */ +UNIV_INTERN ibool os_aio_print_debug = FALSE; + +#ifdef UNIV_PFS_IO +/* Keys to register InnoDB I/O with performance schema */ +UNIV_INTERN mysql_pfs_key_t innodb_file_data_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_log_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_bmp_key; +#endif /* UNIV_PFS_IO */ + +/** The asynchronous i/o array slot structure */ +struct os_aio_slot_t{ + ibool is_read; /*!< TRUE if a read operation */ + ulint pos; /*!< index of the slot in the aio + array */ + ibool reserved; /*!< TRUE if this slot is reserved */ + time_t reservation_time;/*!< time when reserved */ + ulint len; /*!< length of the block to read or + write */ + byte* buf; /*!< buffer used in i/o */ + ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */ + os_offset_t offset; /*!< file offset in bytes */ + os_file_t file; /*!< file where to read or write */ + const char* name; /*!< file name or path */ + ibool io_already_done;/*!< used only in simulated aio: + TRUE if the physical i/o already + made and only the slot message + needs to be passed to the caller + of os_aio_simulated_handle */ + ulint space_id; + fil_node_t* message1; /*!< message which is given by the */ + void* message2; /*!< the requester of an aio operation + and which can be used to identify + which pending aio operation was + completed */ +#ifdef WIN_ASYNC_IO + HANDLE handle; /*!< handle object we need in the + OVERLAPPED struct */ + OVERLAPPED control; /*!< Windows control block for the + aio request */ +#elif defined(LINUX_NATIVE_AIO) + struct iocb control; /* Linux control block for aio */ + int n_bytes; /* bytes written/read. */ + int ret; /* AIO return code */ +#endif /* WIN_ASYNC_IO */ +}; + +/** The asynchronous i/o array structure */ +struct os_aio_array_t{ + os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */ + os_event_t not_full; + /*!< The event which is set to the + signaled state when there is space in + the aio outside the ibuf segment */ + os_event_t is_empty; + /*!< The event which is set to the + signaled state when there are no + pending i/os in this array */ + ulint n_slots;/*!< Total number of slots in the aio + array. This must be divisible by + n_threads. */ + ulint n_segments; + /*!< Number of segments in the aio + array of pending aio requests. A + thread can wait separately for any one + of the segments. 
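+As a sketch of the global segment numbering the i/o handler threads
+use (inferred from IO_IBUF_SEGMENT and IO_LOG_SEGMENT above together
+with the array set-up code; treat it as illustrative):
+
+    global segment   0      1     2 .. 1+R       2+R .. 1+R+W
+    serviced array   ibuf   log   read array     write array
+
+where R and W are the numbers of read and write i/o threads.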
*/ + ulint cur_seg;/*!< We reserve IO requests in round + robin fashion to different segments. + This points to the segment that is to + be used to service next IO request. */ + ulint n_reserved; + /*!< Number of reserved slots in the + aio array outside the ibuf segment */ + os_aio_slot_t* slots; /*!< Pointer to the slots in the array */ +#ifdef __WIN__ + HANDLE* handles; + /*!< Pointer to an array of OS native + event handles where we copied the + handles from slots, in the same + order. This can be used in + WaitForMultipleObjects; used only in + Windows */ +#endif /* __WIN__ */ + +#if defined(LINUX_NATIVE_AIO) + io_context_t* aio_ctx; + /* completion queue for IO. There is + one such queue per segment. Each thread + will work on one ctx exclusively. */ + struct io_event* aio_events; + /* The array to collect completed IOs. + There is one such event for each + possible pending IO. The size of the + array is equal to n_slots. */ +#endif /* LINUX_NATIV_AIO */ +}; + +#if defined(LINUX_NATIVE_AIO) +/** timeout for each io_getevents() call = 500ms. */ +#define OS_AIO_REAP_TIMEOUT (500000000UL) + +/** time to sleep, in microseconds if io_setup() returns EAGAIN. */ +#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL) + +/** number of attempts before giving up on io_setup(). */ +#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5 +#endif + +/** Array of events used in simulated aio */ +static os_event_t* os_aio_segment_wait_events = NULL; + +/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These +are NULL when the module has not yet been initialized. @{ */ +static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */ +static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */ +static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */ +static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */ +static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */ +/* @} */ + +/** Number of asynchronous I/O segments. Set by os_aio_init(). */ +static ulint os_aio_n_segments = ULINT_UNDEFINED; + +/** If the following is TRUE, read i/o handler threads try to +wait until a batch of new read requests have been posted */ +static ibool os_aio_recommend_sleep_for_read_threads = FALSE; +#endif /* !UNIV_HOTBACKUP */ + +UNIV_INTERN ulint os_n_file_reads = 0; +UNIV_INTERN ulint os_bytes_read_since_printout = 0; +UNIV_INTERN ulint os_n_file_writes = 0; +UNIV_INTERN ulint os_n_fsyncs = 0; +UNIV_INTERN ulint os_n_file_reads_old = 0; +UNIV_INTERN ulint os_n_file_writes_old = 0; +UNIV_INTERN ulint os_n_fsyncs_old = 0; +UNIV_INTERN time_t os_last_printout; + +UNIV_INTERN ibool os_has_said_disk_full = FALSE; + +#if !defined(UNIV_HOTBACKUP) \ + && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8) +/** The mutex protecting the following counts of pending I/O operations */ +static os_ib_mutex_t os_file_count_mutex; +#endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */ + +/** Number of pending os_file_pread() operations */ +UNIV_INTERN ulint os_file_n_pending_preads = 0; +/** Number of pending os_file_pwrite() operations */ +UNIV_INTERN ulint os_file_n_pending_pwrites = 0; +/** Number of pending write operations */ +UNIV_INTERN ulint os_n_pending_writes = 0; +/** Number of pending read operations */ +UNIV_INTERN ulint os_n_pending_reads = 0; + +#ifdef UNIV_DEBUG +# ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Validates the consistency the aio system some of the time. 
+@return TRUE if ok or the check was skipped */ +UNIV_INTERN +ibool +os_aio_validate_skip(void) +/*======================*/ +{ +/** Try os_aio_validate() every this many times */ +# define OS_AIO_VALIDATE_SKIP 13 + + /** The os_aio_validate() call skip counter. + Use a signed type because of the race condition below. */ + static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly os_aio_validate() + check in debug builds. */ + if (--os_aio_validate_count > 0) { + return(TRUE); + } + + os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + return(os_aio_validate()); +} +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_DEBUG */ + +#ifdef __WIN__ +/***********************************************************************//** +Gets the operating system version. Currently works only on Windows. +@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, +OS_WIN7. */ +UNIV_INTERN +ulint +os_get_os_version(void) +/*===================*/ +{ + OSVERSIONINFO os_info; + + os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + + ut_a(GetVersionEx(&os_info)); + + if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) { + return(OS_WIN31); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { + return(OS_WIN95); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { + switch (os_info.dwMajorVersion) { + case 3: + case 4: + return(OS_WINNT); + case 5: + return (os_info.dwMinorVersion == 0) + ? OS_WIN2000 : OS_WINXP; + case 6: + return (os_info.dwMinorVersion == 0) + ? OS_WINVISTA : OS_WIN7; + default: + return(OS_WIN7); + } + } else { + ut_error; + return(0); + } +} +#endif /* __WIN__ */ + +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@return error number, or OS error number + 100 */ +static +ulint +os_file_get_last_error_low( +/*=======================*/ + bool report_all_errors, /*!< in: TRUE if we want an error + message printed of all errors */ + bool on_error_silent) /*!< in: TRUE then don't print any + diagnostic to the log */ +{ +#ifdef __WIN__ + + ulint err = (ulint) GetLastError(); + if (err == ERROR_SUCCESS) { + return(0); + } + + if (report_all_errors + || (!on_error_silent + && err != ERROR_DISK_FULL + && err != ERROR_FILE_EXISTS)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %lu" + " in a file operation.\n", (ulong) err); + + if (err == ERROR_PATH_NOT_FOUND) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == ERROR_ACCESS_DENIED) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory. 
It may also be" + " you have created a subdirectory\n" + "InnoDB: of the same name as a data file.\n"); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + fprintf(stderr, + "InnoDB: The error means that another program" + " is using InnoDB's files.\n" + "InnoDB: This might be a backup or antivirus" + " software or another instance\n" + "InnoDB: of MySQL." + " Please close it to get rid of this error.\n"); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + fprintf(stderr, + "InnoDB: The error means that there are no" + " sufficient system resources or quota to" + " complete the operation.\n"); + } else if (err == ERROR_OPERATION_ABORTED) { + fprintf(stderr, + "InnoDB: The error means that the I/O" + " operation has been aborted\n" + "InnoDB: because of either a thread exit" + " or an application request.\n" + "InnoDB: Retry attempt is made.\n"); + } else { + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + if (err == ERROR_FILE_NOT_FOUND) { + return(OS_FILE_NOT_FOUND); + } else if (err == ERROR_DISK_FULL) { + return(OS_FILE_DISK_FULL); + } else if (err == ERROR_FILE_EXISTS) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + return(OS_FILE_SHARING_VIOLATION); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + return(OS_FILE_INSUFFICIENT_RESOURCE); + } else if (err == ERROR_OPERATION_ABORTED) { + return(OS_FILE_OPERATION_ABORTED); + } else if (err == ERROR_ACCESS_DENIED) { + return(OS_FILE_ACCESS_VIOLATION); + } else { + return(OS_FILE_ERROR_MAX + err); + } +#else + int err = errno; + if (err == 0) { + return(0); + } + + if (report_all_errors + || (err != ENOSPC && err != EEXIST && !on_error_silent)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %d" + " in a file operation.\n", err); + + if (err == ENOENT) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == EACCES) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory.\n"); + } else { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + + fprintf(stderr, + "InnoDB: Some operating system" + " error numbers are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + switch (err) { + case ENOSPC: + return(OS_FILE_DISK_FULL); + case ENOENT: + return(OS_FILE_NOT_FOUND); + case EEXIST: + return(OS_FILE_ALREADY_EXISTS); + case EXDEV: + case ENOTDIR: + case EISDIR: + return(OS_FILE_PATH_ERROR); + case EAGAIN: + if (srv_use_native_aio) { + return(OS_FILE_AIO_RESOURCES_RESERVED); + } + break; + case EINTR: + if (srv_use_native_aio) { + return(OS_FILE_AIO_INTERRUPTED); + } + break; + case EACCES: + return(OS_FILE_ACCESS_VIOLATION); + } + return(OS_FILE_ERROR_MAX + err); +#endif +} + +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. 
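+Note that in this version the bias added to an unrecognized native
+error is the constant OS_FILE_ERROR_MAX rather than the literal 100
+mentioned below (see the fall-through return paths of
+os_file_get_last_error_low() above), so a caller could recover the
+native code with, for example:
+
+    if (err > OS_FILE_ERROR_MAX) {
+        native_err = err - OS_FILE_ERROR_MAX;
+    }
+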
+The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@return error number, or OS error number + 100 */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + bool report_all_errors) /*!< in: TRUE if we want an error + message printed of all errors */ +{ + return(os_file_get_last_error_low(report_all_errors, false)); +} + +/****************************************************************//** +Does error handling when a file operation fails. +Conditionally exits (calling exit(3)) based on should_exit value and the +error type, if should_exit is TRUE then on_error_silent is ignored. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_cond_exit( +/*===========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool should_exit, /*!< in: call exit(3) if unknown error + and this parameter is TRUE */ + ibool on_error_silent)/*!< in: if TRUE then don't print + any message to the log iff it is + an unknown non-fatal error */ +{ + ulint err; + + err = os_file_get_last_error_low(false, on_error_silent); + + switch (err) { + case OS_FILE_DISK_FULL: + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(FALSE); + } + + /* Disk full error is reported irrespective of the + on_error_silent setting. */ + + if (name) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Encountered a problem with" + " file %s\n", name); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Disk is full. Try to clean the disk" + " to free space.\n"); + + os_has_said_disk_full = TRUE; + + fflush(stderr); + + return(FALSE); + + case OS_FILE_AIO_RESOURCES_RESERVED: + case OS_FILE_AIO_INTERRUPTED: + + return(TRUE); + + case OS_FILE_PATH_ERROR: + case OS_FILE_ALREADY_EXISTS: + case OS_FILE_ACCESS_VIOLATION: + + return(FALSE); + + case OS_FILE_SHARING_VIOLATION: + + os_thread_sleep(10000000); /* 10 sec */ + return(TRUE); + + case OS_FILE_OPERATION_ABORTED: + case OS_FILE_INSUFFICIENT_RESOURCE: + + os_thread_sleep(100000); /* 100 ms */ + return(TRUE); + + default: + + /* If it is an operation that can crash on error then it + is better to ignore on_error_silent and print an error message + to the log. */ + + if (should_exit || !on_error_silent) { + ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " + "error " ULINTPF ".%s", name ? name : "(unknown)", + operation, err, should_exit + ? " Cannot continue operation" : ""); + } + + if (should_exit) { + exit(1); + } + } + + return(FALSE); +} + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error( +/*=================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation) /*!< in: operation */ +{ + /* exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); +} + +/****************************************************************//** +Does error handling when a file operation fails. 
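+The intended calling pattern, as used by the create/open functions
+later in this file, is a retry loop driven by the return value; an
+illustrative condensation (not a verbatim excerpt):
+
+    do {
+        file = ::open(name, create_flag, os_innodb_umask);
+        retry = (file == -1)
+            && os_file_handle_error_no_exit(name, "open", FALSE);
+    } while (retry);
+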
+@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent)/*!< in: if TRUE then don't print + any message to the log. */ +{ + /* don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit( + name, operation, FALSE, on_error_silent)); +} + +#undef USE_FILE_LOCK +#define USE_FILE_LOCK +#if defined(UNIV_HOTBACKUP) || defined(__WIN__) +/* InnoDB Hot Backup does not lock the data files. + * On Windows, mandatory locking is used. + */ +# undef USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/****************************************************************//** +Obtain an exclusive lock on a file. +@return 0 on success */ +static +int +os_file_lock( +/*=========*/ + int fd, /*!< in: file descriptor */ + const char* name) /*!< in: file name */ +{ + struct flock lk; + + ut_ad(!srv_read_only_mode); + + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + + if (fcntl(fd, F_SETLK, &lk) == -1) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to lock %s, error: %d", name, errno); + + if (errno == EAGAIN || errno == EACCES) { + ib_logf(IB_LOG_LEVEL_INFO, + "Check that you do not already have " + "another mysqld process using the " + "same InnoDB data or log files."); + } + + return(-1); + } + + return(0); +} +#endif /* USE_FILE_LOCK */ + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Creates the seek mutexes used in positioned reads and writes. */ +UNIV_INTERN +void +os_io_init_simple(void) +/*===================*/ +{ +#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8 + os_file_count_mutex = os_mutex_create(); +#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */ + + for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_file_seek_mutexes[i] = os_mutex_create(); + } +} + +/***********************************************************************//** +Creates a temporary file. This function is like tmpfile(3), but +the temporary file is created in the MySQL temporary directory. +@return temporary file handle, or NULL on error */ +UNIV_INTERN +FILE* +os_file_create_tmpfile(void) +/*========================*/ +{ + FILE* file = NULL; + int fd = innobase_mysql_tmpfile(); + + ut_ad(!srv_read_only_mode); + + if (fd >= 0) { + file = fdopen(fd, "w+b"); + } + + if (!file) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: unable to create temporary file;" + " errno: %d\n", errno); + if (fd >= 0) { + close(fd); + } + } + + return(file); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. 
+@return directory stream, NULL if error */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + const char* dirname, /*!< in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal) /*!< in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. */ + + lpFindFileData = static_cast<LPWIN32_FIND_DATA>( + ut_malloc(sizeof(WIN32_FIND_DATA))); + + dir = FindFirstFile((LPCTSTR) path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(dir); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Closes a directory stream. +@return 0 if success, -1 if failure */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + os_file_dir_t dir) /*!< in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE); + } + + return(ret); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
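+A full directory scan with this trio of functions therefore looks
+roughly like the following (a sketch; error reporting abbreviated):
+
+    os_file_dir_t   dir = os_file_opendir(dirname, TRUE);
+    os_file_stat_t  info;
+
+    while (dir != NULL
+           && os_file_readdir_next_file(dirname, dir, &info) == 0) {
+        handle info.name, info.size, info.type here
+    }
+
+    if (dir != NULL) os_file_closedir(dir);
+
+where return value 1 (end of directory) and -1 (error) both terminate
+the loop.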
+@return 0 if ok, -1 if error, 1 if at the end of the directory */ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info) /*!< in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = static_cast<LPWIN32_FIND_DATA>( + ut_malloc(sizeof(WIN32_FIND_DATA))); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen((char*) lpFindFileData->cFileName) + < OS_FILE_MAX_PATH); + + if (strcmp((char*) lpFindFileData->cFileName, ".") == 0 + || strcmp((char*) lpFindFileData->cFileName, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, (char*) lpFindFileData->cFileName); + + info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow) + + (((ib_int64_t)(lpFindFileData->nFileSizeHigh)) + << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { + /* TODO: test Windows symlinks */ + /* TODO: MySQL has apparently its own symlink + implementation in Windows, dbname.sym can + redirect a database directory: + REFMAN "windows-symbolic-links.html" */ + info->type = OS_FILE_TYPE_LINK; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else { + /* It is probably safest to assume that all other + file types are normal. Better to check them rather + than blindly skip them. */ + + info->type = OS_FILE_TYPE_FILE; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; +#ifdef HAVE_READDIR_R + char dirent_buf[sizeof(struct dirent) + + _POSIX_PATH_MAX + 100]; + /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as + the max file name len; but in most standards, the + length is NAME_MAX; we add 100 to be even safer */ +#endif + +next_file: + +#ifdef HAVE_READDIR_R + ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent); + + if (ret != 0 +#ifdef UNIV_AIX + /* On AIX, only if we got non-NULL 'ent' (result) value and + a non-zero 'ret' (return) value, it indicates a failed + readdir_r() call. An NULL 'ent' with an non-zero 'ret' + would indicate the "end of the directory" is reached. */ + && ent != NULL +#endif + ) { + fprintf(stderr, + "InnoDB: cannot read directory %s, error %lu\n", + dirname, (ulong) ret); + + return(-1); + } + + if (ent == NULL) { + /* End of directory */ + + return(1); + } + + ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1); +#else + ent = readdir(dir); + + if (ent == NULL) { + + return(1); + } +#endif + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = static_cast<char*>( + ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10)); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + + if (errno == ENOENT) { + /* readdir() returned a file that does not exist, + it must have been deleted in the meantime. Do what + would have happened if the file was deleted before + readdir() - ignore and go to the next entry. 
+ If this is the last entry then info->name will still + contain the name of the deleted file when this + function returns, but this is not an issue since the + caller shouldn't be looking at info when end of + directory is returned. */ + + ut_free(full_path); + + goto next_file; + } + + os_file_handle_error_no_exit(full_path, "stat", FALSE); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_int64_t) statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + +/*****************************************************************//** +This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns FALSE. +@return TRUE if call succeeds, FALSE on error */ +UNIV_INTERN +ibool +os_file_create_directory( +/*=====================*/ + const char* pathname, /*!< in: directory name as + null-terminated string */ + ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory + is treated as an error. */ +{ +#ifdef __WIN__ + BOOL rcode; + + rcode = CreateDirectory((LPCTSTR) pathname, NULL); + if (!(rcode != 0 + || (GetLastError() == ERROR_ALREADY_EXISTS + && !fail_if_exists))) { + + os_file_handle_error_no_exit( + pathname, "CreateDirectory", FALSE); + + return(FALSE); + } + + return(TRUE); +#else + int rcode; + + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + + return(FALSE); + } + + return (TRUE); +#endif /* __WIN__ */ +} + +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. 
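+A minimal call through the documented macro might look as follows
+(hypothetical call site; note that the wrapper macro declared in
+os0file.h takes a performance-schema key as its first argument, which
+is ignored in non-UNIV_PFS_IO builds; that signature is an assumption
+here, not shown in this file):
+
+    ibool     success;
+    os_file_t file = os_file_create_simple(
+        innodb_file_data_key, "ibdata1", OS_FILE_OPEN,
+        OS_FILE_READ_ONLY, &success);
+
+    if (!success) {
+        report os_file_get_last_error(false) and bail out
+    }
+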
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_func( +/*=======================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + os_file_t file; + ibool retry; + + *success = FALSE; +#ifdef __WIN__ + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + ut_a(!srv_read_only_mode); + + /* Create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + name); + + return((os_file_t) -1); + } + + create_flag = CREATE_NEW; + create_mode = OS_FILE_CREATE; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_INFO, + "read only mode set. Unable to " + "open file '%s' in RW mode, trying RO mode", name); + + access = GENERIC_READ; + + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + + return((os_file_t) -1); + } + + do { + /* Use default security attributes and no template file. */ + + file = CreateFile( + (LPCTSTR) name, access, FILE_SHARE_READ, NULL, + create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + + *success = FALSE; + + retry = os_file_handle_error( + name, create_mode == OS_FILE_OPEN ? 
+ "open" : "create"); + + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + +#else /* __WIN__ */ + int create_flag; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else if (srv_read_only_mode) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + /* Create subdirs along the path if needed */ + + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + name); + + return((os_file_t) -1); + } + + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; + } else { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + do { + file = ::open(name, create_flag, os_innodb_umask); + + if (file == -1) { + *success = FALSE; + + retry = os_file_handle_error( + name, + create_mode == OS_FILE_OPEN + ? "open" : "create"); + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + + *success = FALSE; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + +#endif /* __WIN__ */ + + return(file); +} + +/****************************************************************//** +NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A simple function to open or create a file. 
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_no_error_handling_func( +/*=========================================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + os_file_t file; + + *success = FALSE; +#ifdef __WIN__ + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + DWORD share_mode = FILE_SHARE_READ; + + ut_a(name); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (srv_read_only_mode) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (srv_read_only_mode) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + + ut_a(!srv_read_only_mode); + + access = GENERIC_READ; + + /*!< A backup program has to give mysqld the maximum + freedom to do what it likes with the file */ + + share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + + return((os_file_t) -1); + } + + file = CreateFile((LPCTSTR) name, + access, + share_mode, + NULL, // Security attributes + create_flag, + attributes, + NULL); // No template file + + *success = (file != INVALID_HANDLE_VALUE); +#else /* __WIN__ */ + int create_flag; + + ut_a(name); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else { + + ut_a(access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_ALLOW_DELETE); + + create_flag = O_RDWR; + } + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + file = ::open(name, create_flag, os_innodb_umask); + + *success = file == -1 ? FALSE : TRUE; + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + + *success = FALSE; + close(file); + file = -1; + + } +#endif /* USE_FILE_LOCK */ + +#endif /* __WIN__ */ + + return(file); +} + +/****************************************************************//** +Tries to disable OS caching on an opened file descriptor. 
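+Two mechanisms are attempted in the body: directio(fd, DIRECTIO_ON) on
+Solaris builds that define it, and fcntl(fd, F_SETFL, O_DIRECT) where
+O_DIRECT exists. As a general O_DIRECT caveat (not specific to this
+function), once the flag is set, buffers, offsets and transfer sizes
+usually must be aligned to the storage block size, e.g. buffers
+allocated along the lines of
+
+    void* buf;
+    posix_memalign(&buf, 4096, UNIV_PAGE_SIZE);
+
+which is consistent with the aligned, page-sized buffers used by the
+i/o routines in this file.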
*/ +UNIV_INTERN +void +os_file_set_nocache( +/*================*/ + int fd /*!< in: file descriptor to alter */ + __attribute__((unused)), + const char* file_name /*!< in: used in the diagnostic + message */ + __attribute__((unused)), + const char* operation_name __attribute__((unused))) + /*!< in: "open" or "create"; used + in the diagnostic message */ +{ + /* some versions of Solaris may not have DIRECTIO_ON */ +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) + if (directio(fd, DIRECTIO_ON) == -1) { + int errno_save = errno; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Failed to set DIRECTIO_ON on file %s: %s: %s, " + "continuing anyway.", + file_name, operation_name, strerror(errno_save)); + } +#elif defined(O_DIRECT) + if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { + int errno_save = errno; + static bool warning_message_printed = false; + if (errno_save == EINVAL) { + if (!warning_message_printed) { + warning_message_printed = true; +# ifdef UNIV_LINUX + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set O_DIRECT on file " + "%s: %s: %s, continuing anyway. " + "O_DIRECT is known to result " + "in 'Invalid argument' on Linux on " + "tmpfs, see MySQL Bug#26662.", + file_name, operation_name, + strerror(errno_save)); +# else /* UNIV_LINUX */ + goto short_warning; +# endif /* UNIV_LINUX */ + } + } else { +# ifndef UNIV_LINUX +short_warning: +# endif + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set O_DIRECT on file %s: %s: %s, " + "continuing anyway.", + file_name, operation_name, strerror(errno_save)); + } + } +#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ +} + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. +@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name /*!< in: name of the file */ + __attribute__((unused)), + os_file_t file /*!< in: handle to the file */ + __attribute__((unused))) + +{ +#ifdef DFS_IOCTL_ATOMIC_WRITE_SET + int atomic_option = 1; + + if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { + + os_file_handle_error_no_exit(name, "ioctl", FALSE); + return(FALSE); + } + + return(TRUE); +#else + ib_logf(IB_LOG_LEVEL_ERROR, + "trying to enable atomic writes on non-supported platform! " + "Please restart with innodb_use_atomic_writes disabled.\n"); + return(FALSE); +#endif +} + +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_func( +/*================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. 
variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + os_file_t file; + ibool retry; + ibool on_error_no_exit; + ibool on_error_silent; + +#ifdef __WIN__ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + SetLastError(ERROR_DISK_FULL); + return((os_file_t) -1); + ); +#else /* __WIN__ */ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + errno = ENOSPC; + return((os_file_t) -1); + ); +#endif /* __WIN__ */ + +#ifdef __WIN__ + DWORD create_flag; + DWORD share_mode = FILE_SHARE_READ; + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN_RAW) { + + ut_a(!srv_read_only_mode); + + create_flag = OPEN_EXISTING; + + /* On Windows Physical devices require admin privileges and + have to have the write-share mode set. See the remarks + section for the CreateFile() function documentation in MSDN. */ + + share_mode |= FILE_SHARE_WRITE; + + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RETRY) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + create_flag = CREATE_ALWAYS; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + DWORD attributes = 0; + +#ifdef UNIV_HOTBACKUP + attributes |= FILE_FLAG_NO_BUFFERING; +#else + if (purpose == OS_FILE_AIO) { + +#ifdef WIN_ASYNC_IO + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ + + if (srv_use_native_aio) { + attributes |= FILE_FLAG_OVERLAPPED; + } +#endif /* WIN_ASYNC_IO */ + + } else if (purpose == OS_FILE_NORMAL) { + /* Use default setting. */ + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown purpose flag (%lu) while opening file '%s'", + purpose, name); + + return((os_file_t)(-1)); + } + +#ifdef UNIV_NON_BUFFERED_IO + // TODO: Create a bug, this looks wrong. The flush log + // parameter is dynamic. + if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) { + + /* Do not use unbuffered i/o for the log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + + } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) { + + attributes |= FILE_FLAG_NO_BUFFERING; + } +#endif /* UNIV_NON_BUFFERED_IO */ + +#endif /* UNIV_HOTBACKUP */ + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + + do { + /* Use default security attributes and no template file. */ + file = CreateFile( + (LPCTSTR) name, access, share_mode, NULL, + create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? 
"create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = TRUE; + retry = FALSE; + } + + } while (retry); + +#else /* __WIN__ */ + int create_flag; + const char* mode_str = NULL; + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + + mode_str = "OPEN"; + + create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR; + + } else if (srv_read_only_mode) { + + mode_str = "OPEN"; + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE); + ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); + +#ifdef O_SYNC + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + + if (!srv_read_only_mode + && type == OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + + create_flag |= O_SYNC; + } +#endif /* O_SYNC */ + + do { + file = ::open(name, create_flag, os_innodb_umask); + + if (file == -1) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? "create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + + if (!srv_read_only_mode + && *success + && type != OS_LOG_FILE + && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT + || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { + + os_file_set_nocache(file, name, mode_str); + } else if (!srv_read_only_mode + && *success + && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + os_file_set_nocache(file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && create_mode != OS_FILE_OPEN_RAW + && os_file_lock(file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + + ut_a(!srv_read_only_mode); + + ib_logf(IB_LOG_LEVEL_INFO, + "Retrying to lock the first data file"); + + for (int i = 0; i < 100; i++) { + os_thread_sleep(1000000); + + if (!os_file_lock(file, name)) { + *success = TRUE; + return(file); + } + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Unable to open the first data file"); + } + + *success = FALSE; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + + if (srv_use_atomic_writes && type == OS_DATA_FILE + && file != -1 && !os_file_set_atomic_writes(name, file)) { + + *success = FALSE; + close(file); + file = -1; + } + +#endif /* __WIN__ */ + + return(file); +} + +/***********************************************************************//** +Deletes a file if it exists. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_if_exists_func( +/*==========================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + bool ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + DWORD lasterr = GetLastError(); + if (lasterr == ERROR_FILE_NOT_FOUND + || lasterr == ERROR_PATH_NOT_FOUND) { + /* the file does not exist, this not an error */ + + return(true); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name); + } + + os_thread_sleep(500000); /* sleep for 0.5 second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + + ret = unlink(name); + + if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Deletes a file. The file has to be closed before calling this. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_func( +/*================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* If the file does not exist, we classify this as a 'mild' + error and return */ + + return(false); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running mysqlbackup" + " to back up the file?\n", name); + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +#endif +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_rename(), not directly this function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_rename_func( +/*================*/ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath)/*!< in: new file path */ +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + ibool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. 
*/ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + +#ifdef __WIN__ + BOOL ret; + + ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath); + + if (ret) { + return(TRUE); + } + + os_file_handle_error_no_exit(oldpath, "rename", FALSE); + + return(FALSE); +#else + int ret; + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_error_no_exit(oldpath, "rename", FALSE); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_close(), not directly this function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(NULL, "close"); + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + os_file_handle_error(NULL, "close"); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Closes a file handle. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_no_error_handling( +/*============================*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Gets a file size. +@return file size, or (os_offset_t) -1 on failure */ +UNIV_INTERN +os_offset_t +os_file_get_size( +/*=============*/ + os_file_t file) /*!< in: handle to a file */ +{ +#ifdef __WIN__ + os_offset_t offset; + DWORD high; + DWORD low; + + low = GetFileSize(file, &high); + + if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) { + return((os_offset_t) -1); + } + + offset = (os_offset_t) low | ((os_offset_t) high << 32); + + return(offset); +#else + return((os_offset_t) lseek(file, 0, SEEK_END)); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Write the specified number of zeros to a newly created file. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + os_offset_t size) /*!< in: file size */ +{ + os_offset_t current_size; + ibool ret; + byte* buf; + byte* buf2; + ulint buf_size; + + current_size = 0; + +#ifdef HAVE_POSIX_FALLOCATE + if (srv_use_posix_fallocate) { + + if (posix_fallocate(file, current_size, size) == -1) { + + ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file " + "space for file \'%s\' failed. Current size " + INT64PF ", desired size " INT64PF "\n", + name, current_size, size); + os_file_handle_error_no_exit (name, "posix_fallocate", + FALSE); + return(FALSE); + } + return(TRUE); + } +#endif + + /* Write up to 1 megabyte at a time. 
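+	(With the default 16 KiB page size, ut_min(64, size / UNIV_PAGE_SIZE)
+	pages caps the buffer at 64 * 16 KiB = 1 MiB; a file smaller than
+	64 pages gets a correspondingly smaller buffer instead.)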
*/ + buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) + * UNIV_PAGE_SIZE; + buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE)); + + /* Align the buffer for possible raw i/o */ + buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + + /* Write buffer full of zeros */ + memset(buf, 0, buf_size); + + if (size >= (os_offset_t) 100 << 20) { + + fprintf(stderr, "InnoDB: Progress in MB:"); + } + + while (current_size < size) { + ulint n_bytes; + + if (size - current_size < (os_offset_t) buf_size) { + n_bytes = (ulint) (size - current_size); + } else { + n_bytes = buf_size; + } + + ret = os_file_write(name, file, buf, current_size, n_bytes); + if (!ret) { + ut_free(buf2); + goto error_handling; + } + + /* Print about progress for each 100 MB written */ + if ((current_size + n_bytes) / (100 << 20) + != current_size / (100 << 20)) { + + fprintf(stderr, " %lu00", + (ulong) ((current_size + n_bytes) + / (100 << 20))); + } + + current_size += n_bytes; + } + + if (size >= (os_offset_t) 100 << 20) { + + fprintf(stderr, "\n"); + } + + ut_free(buf2); + + ret = os_file_flush(file); + + if (ret) { + return(TRUE); + } + +error_handling: + return(FALSE); +} + +/***********************************************************************//** +Truncates a file at its current position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + FILE* file) /*!< in: file to be truncated */ +{ +#ifdef __WIN__ + HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); + return(SetEndOfFile(h)); +#else /* __WIN__ */ + return(!ftruncate(fileno(file), ftell(file))); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Truncates a file at the specified position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof_at( + os_file_t file, /*!< in: handle to a file */ + ib_uint64_t new_len)/*!< in: new file length */ +{ +#ifdef __WIN__ + LARGE_INTEGER li, li2; + li.QuadPart = new_len; + return(SetFilePointerEx(file, li, &li2,FILE_BEGIN) + && SetEndOfFile(file)); +#else + /* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */ + return(!ftruncate(file, new_len)); +#endif +} + + +#ifndef __WIN__ +/***********************************************************************//** +Wrapper to fsync(2) that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. +@return 0 if success, -1 otherwise */ + +static +int +os_file_fsync( +/*==========*/ + os_file_t file) /*!< in: handle to a file */ +{ + int ret; + int failures; + ibool retry; + + failures = 0; + + do { + ret = fsync(file); + + os_n_fsyncs++; + + if (ret == -1 && errno == ENOLCK) { + + if (failures % 100 == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: fsync(): " + "No locks available; retrying\n"); + } + + os_thread_sleep(200000 /* 0.2 sec */); + + failures++; + + retry = TRUE; + } else if (ret == -1 && errno == EINTR) { + /* Handle signal interruptions correctly */ + retry = TRUE; + } else { + + retry = FALSE; + } + } while (retry); + + return(ret); +} +#endif /* !__WIN__ */ + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_flush(), not directly this function! +Flushes the write buffers of a given file to the disk. 
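+
+A successful os_file_write() only guarantees that the data reached the
+OS cache, so durability-critical callers pair the write with a flush.
+An illustrative (hypothetical) caller of the two macros:
+
+	ibool	ok = os_file_write(name, file, buf, offset, n);
+
+	if (ok) {
+		ok = os_file_flush(file);
+	}
+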
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_flush_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + os_n_fsyncs++; + + ret = FlushFileBuffers(file); + + if (ret) { + return(TRUE); + } + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#else + int ret; + +#if defined(HAVE_DARWIN_THREADS) +# ifndef F_FULLFSYNC + /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */ +# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */ +# elif F_FULLFSYNC != 51 +# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3" +# endif + /* Apple has disabled fsync() for internal disk drives in OS X. That + caused corruption for a user when he tested a power outage. Let us in + OS X use a nonstandard flush method recommended by an Apple + engineer. */ + + if (!srv_have_fullfsync) { + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + + ret = os_file_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. */ + ret = os_file_fsync(file); + } + } +#else + ret = os_file_fsync(file); +#endif + + if (ret == 0) { + return(TRUE); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(TRUE); + } + + ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#endif +} + +#ifndef __WIN__ +/*******************************************************************//** +Does a synchronous read operation in Posix. 
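+
+The heart of the function is the canonical full-read loop around
+pread(): a short read advances the buffer and offset, EINTR retries the
+call, and anything else (end of file or a hard error) breaks out. A
+minimal sketch of that loop, with hypothetical local names:
+
+	ssize_t	done = 0;
+
+	while (done < (ssize_t) n) {
+		ssize_t	r = pread(fd, (char*) buf + done,
+				  n - done, offs + done);
+		if (r > 0) {
+			done += r;
+		} else if (r == -1 && errno == EINTR) {
+			continue;
+		} else {
+			break;
+		}
+	}
+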
+@return number of bytes read, -1 if error */ +static __attribute__((nonnull(2), warn_unused_result)) +ssize_t +os_file_pread( +/*==========*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + ulint n, /*!< in: number of bytes to read */ + os_offset_t offset, /*!< in: file offset from where to read */ + trx_t* trx) +{ + off_t offs; +#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD) + ssize_t n_bytes; + ssize_t n_read; +#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */ + ulint sec; + ulint ms; + ib_uint64_t start_time; + ib_uint64_t finish_time; + + ut_ad(n); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + offs = (off_t) offset; + + if (sizeof(off_t) <= 4) { + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File read at offset > 4 GB"); + } + } + + os_n_file_reads++; + + if (UNIV_UNLIKELY(trx && trx->take_stats)) + { + trx->io_reads++; + trx->io_read += n; + ut_usectime(&sec, &ms); + start_time = (ib_uint64_t)sec * 1000000 + ms; + } else { + start_time = 0; + } +#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD) +#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8 + (void) os_atomic_increment_ulint(&os_n_pending_reads, 1); + (void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1); + MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS); +#else + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads++; + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); +#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */ + + /* Handle partial reads and signal interruptions correctly */ + for (n_bytes = 0; n_bytes < (ssize_t) n; ) { + n_read = pread(file, buf, (ssize_t)n - n_bytes, offs); + if (n_read > 0) { + n_bytes += n_read; + offs += n_read; + buf = (char *)buf + n_read; + } else if (n_read == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + +#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8 + (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1); + (void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1); + MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS); +#else + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads--; + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */ + + if (UNIV_UNLIKELY(start_time != 0)) + { + ut_usectime(&sec, &ms); + finish_time = (ib_uint64_t)sec * 1000000 + ms; + trx->io_reads_wait_timer += (ulint)(finish_time - start_time); + } + + return(n_bytes); +#else + { + off_t ret_offset; + ssize_t ret; + ssize_t n_read; +#ifndef UNIV_HOTBACKUP + ulint i; +#endif /* !UNIV_HOTBACKUP */ + +#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8 + (void) os_atomic_increment_ulint(&os_n_pending_reads, 1); + MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS); +#else + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); +#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */ +#ifndef UNIV_HOTBACKUP + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + } else { + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_read = read(file, buf, (ssize_t)n); + if 
(n_read > 0) {
+					ret += n_read;
+				} else if (n_read == -1 && errno == EINTR) {
+					continue;
+				} else {
+					break;
+				}
+			}
+		}
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+		(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
+		MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
+#else
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_reads--;
+		MONITOR_DEC(MONITOR_OS_PENDING_READS);
+		os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
+
+		if (UNIV_UNLIKELY(start_time != 0))
+		{
+			ut_usectime(&sec, &ms);
+			finish_time = (ib_uint64_t)sec * 1000000 + ms;
+			trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+		}
+
+		return(ret);
+	}
+#endif
+}
+
+/*******************************************************************//**
+Does a synchronous write operation in Posix.
+@return number of bytes written, -1 if error */
+static __attribute__((nonnull, warn_unused_result))
+ssize_t
+os_file_pwrite(
+/*===========*/
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from where to write */
+	ulint		n,	/*!< in: number of bytes to write */
+	os_offset_t	offset)	/*!< in: file offset where to write */
+{
+	ssize_t	ret;
+	ssize_t	n_written;
+	off_t	offs;
+
+	ut_ad(n);
+	ut_ad(!srv_read_only_mode);
+
+	/* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit address */
+	offs = (off_t) offset;
+
+	if (sizeof(off_t) <= 4) {
+		if (offset != (os_offset_t) offs) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"File write at offset > 4 GB.");
+		}
+	}
+
+	os_n_file_writes++;
+
+#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
+	os_mutex_enter(os_file_count_mutex);
+	os_file_n_pending_pwrites++;
+	os_n_pending_writes++;
+	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
+	os_mutex_exit(os_file_count_mutex);
+#else
+	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
+	(void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
+	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
+
+	/* Handle partial writes and signal interruptions correctly */
+	for (ret = 0; ret < (ssize_t) n; ) {
+		n_written = pwrite(file, buf, (ssize_t)n - ret, offs);
+		if (n_written >= 0) {
+			ret += n_written;
+			offs += n_written;
+			buf = (char *)buf + n_written;
+		} else if (n_written == -1 && errno == EINTR) {
+			continue;
+		} else {
+			break;
+		}
+	}
+
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
+	os_mutex_enter(os_file_count_mutex);
+	os_file_n_pending_pwrites--;
+	os_n_pending_writes--;
+	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
+	os_mutex_exit(os_file_count_mutex);
+#else
+	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
+	(void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
+	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
+
+	return(ret);
+#else
+	{
+		off_t	ret_offset;
+# ifndef UNIV_HOTBACKUP
+		ulint	i;
+# endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_writes++;
+		MONITOR_INC(MONITOR_OS_PENDING_WRITES);
+		os_mutex_exit(os_file_count_mutex);
+
+# ifndef UNIV_HOTBACKUP
+		/* Protect the seek / write operation with a mutex */
+		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+		os_mutex_enter(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+		ret_offset = lseek(file, offs, SEEK_SET);
+
+		if (ret_offset < 
0) { + ret = -1; + + goto func_exit; + } + + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_written = write(file, buf, (ssize_t)n); + if (n_written > 0) { + ret += n_written; + } else if (n_written == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + +func_exit: +# ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +# endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + MONITOR_DEC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + + return(ret); + } +#endif /* !UNIV_HOTBACKUP */ +} +#endif + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read(), not directly this +function! +Requests a synchronous positioned read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_func( +/*==============*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + trx_t* trx) +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ibool retry; +#ifndef UNIV_HOTBACKUP + ulint i; +#endif /* !UNIV_HOTBACKUP */ + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. */ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + low = (DWORD) offset & 0xFFFFFFFF; + high = (DWORD) (offset >> 32); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + +#ifndef UNIV_HOTBACKUP + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + ret2 = SetFilePointer( + file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + goto error_handling; + } + + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + return(TRUE); + } +#else /* __WIN__ */ + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, trx); + + if ((ulint) ret == n) { + + return(TRUE); + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "Tried to read " ULINTPF " bytes at offset " UINT64PF ". " + "Was only able to read %ld.", n, offset, (lint) ret); +#endif /* __WIN__ */ +#ifdef __WIN__ +error_handling: +#endif + retry = os_file_handle_error(NULL, "read"); + + if (retry) { + goto try_again; + } + + fprintf(stderr, + "InnoDB: Fatal error: cannot read from file." 
+ " OS error number %lu.\n", +#ifdef __WIN__ + (ulong) GetLastError() +#else + (ulong) errno +#endif /* __WIN__ */ + ); + fflush(stderr); + + ut_error; + + return(FALSE); +} + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_no_error_handling_func( +/*================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n) /*!< in: number of bytes to read */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ibool retry; +#ifndef UNIV_HOTBACKUP + ulint i; +#endif /* !UNIV_HOTBACKUP */ + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. */ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + low = (DWORD) offset & 0xFFFFFFFF; + high = (DWORD) (offset >> 32); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + +#ifndef UNIV_HOTBACKUP + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + ret2 = SetFilePointer( + file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + goto error_handling; + } + + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + return(TRUE); + } +#else /* __WIN__ */ + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, NULL); + + if ((ulint) ret == n) { + + return(TRUE); + } +#endif /* __WIN__ */ +#ifdef __WIN__ +error_handling: +#endif + retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + + if (retry) { + goto try_again; + } + + return(FALSE); +} + +/*******************************************************************//** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. */ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /*!< in: file to read from */ + char* str, /*!< in: buffer where to read */ + ulint size) /*!< in: size of buffer */ +{ + size_t flen; + + if (size == 0) { + return; + } + + rewind(file); + flen = fread(str, 1, size - 1, file); + str[flen] = '\0'; +} + +/*******************************************************************//** +NOTE! 
Use the corresponding macro os_file_write(), not directly +this function! +Requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_write_func( +/*===============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n) /*!< in: number of bytes to write */ +{ + ut_ad(!srv_read_only_mode); + +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ulint n_retries = 0; + ulint err; +#ifndef UNIV_HOTBACKUP + ulint i; +#endif /* !UNIV_HOTBACKUP */ + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. */ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_writes++; + + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); +retry: + low = (DWORD) offset & 0xFFFFFFFF; + high = (DWORD) (offset >> 32); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes++; + MONITOR_INC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + +#ifndef UNIV_HOTBACKUP + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + ret2 = SetFilePointer( + file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + MONITOR_DEC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: File pointer positioning to" + " file %s failed at\n" + "InnoDB: offset %llu. Operating system" + " error number %lu.\n" + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n", + name, offset, (ulong) GetLastError()); + + return(FALSE); + } + + ret = WriteFile(file, buf, (DWORD) n, &len, NULL); + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + MONITOR_DEC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + + return(TRUE); + } + + /* If some background file system backup tool is running, then, at + least in Windows 2000, we may get here a specific error. Let us + retry the operation 100 times, with 1 second waits. 
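+	(100 retries with 1 second sleeps bound the total wait at roughly
+	100 seconds before the write is finally reported as failed.)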
*/ + + if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) { + + os_thread_sleep(1000000); + + n_retries++; + + goto retry; + } + + if (!os_has_said_disk_full) { + + err = (ulint) GetLastError(); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset %llu.\n" + "InnoDB: %lu bytes should have been written," + " only %lu were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset, + (ulong) n, (ulong) len, (ulong) err); + + if (strerror((int) err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu means '%s'.\n", + (ulong) err, strerror((int) err)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#else + ssize_t ret; + + ret = os_file_pwrite(file, buf, n, offset); + + if ((ulint) ret == n) { + + return(TRUE); + } + + if (!os_has_said_disk_full) { + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset " UINT64PF ".\n" + "InnoDB: %lu bytes should have been written," + " only %ld were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset, n, (lint) ret, + (ulint) errno); + if (strerror(errno) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d means '%s'.\n", + errno, strerror(errno)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#endif +} + +/*******************************************************************//** +Check the existence and type of the given file. 
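+
+A typical use is an existence probe before creating or renaming files.
+An illustrative call, where path and handle_regular_file() are
+hypothetical:
+
+	os_file_type_t	type;
+	ibool		exists;
+
+	if (os_file_status(path, &exists, &type)
+	    && exists
+	    && type == OS_FILE_TYPE_FILE) {
+		handle_regular_file(path);
+	}
+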
+@return TRUE if call succeeded */ +UNIV_INTERN +ibool +os_file_status( +/*===========*/ + const char* path, /*!< in: pathname of the file */ + ibool* exists, /*!< out: TRUE if file exists */ + os_file_type_t* type) /*!< out: type of the file (if it exists) */ +{ +#ifdef __WIN__ + int ret; + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(FALSE); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + *type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#endif +} + +/*******************************************************************//** +This function returns information about the specified file +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +os_file_get_status( +/*===============*/ + const char* path, /*!< in: pathname of the file */ + os_file_stat_t* stat_info, /*!< information of a file in a + directory */ + bool check_rw_perm) /*!< in: for testing whether the + file can be opened in RW mode */ +{ + int ret; + +#ifdef __WIN__ + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(DB_FAIL); + + } else if (_S_IFDIR & statinfo.st_mode) { + stat_info->type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + + stat_info->type = OS_FILE_TYPE_FILE; + + /* Check if we can open it in read-only mode. */ + + if (check_rw_perm) { + HANDLE fh; + + fh = CreateFile( + (LPCTSTR) path, // File to open + access, + 0, // No sharing + NULL, // Default security + OPEN_EXISTING, // Existing file only + FILE_ATTRIBUTE_NORMAL, // Normal file + NULL); // No attr. 
template + + if (fh == INVALID_HANDLE_VALUE) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + CloseHandle(fh); + } + } + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } +#else + struct stat statinfo; + + ret = stat(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(DB_FAIL); + + } + + switch (statinfo.st_mode & S_IFMT) { + case S_IFDIR: + stat_info->type = OS_FILE_TYPE_DIR; + break; + case S_IFLNK: + stat_info->type = OS_FILE_TYPE_LINK; + break; + case S_IFBLK: + stat_info->type = OS_FILE_TYPE_BLOCK; + break; + case S_IFREG: + stat_info->type = OS_FILE_TYPE_FILE; + break; + default: + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + + if (check_rw_perm && (stat_info->type == OS_FILE_TYPE_FILE + || stat_info->type == OS_FILE_TYPE_BLOCK)) { + int fh; + int access; + + access = !srv_read_only_mode ? O_RDWR : O_RDONLY; + + fh = ::open(path, access, os_innodb_umask); + + if (fh == -1) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + close(fh); + } + } + +#endif /* _WIN_ */ + + stat_info->ctime = statinfo.st_ctime; + stat_info->atime = statinfo.st_atime; + stat_info->mtime = statinfo.st_mtime; + stat_info->size = statinfo.st_size; + + return(DB_SUCCESS); +} + +/* path name separator character */ +#ifdef __WIN__ +# define OS_FILE_PATH_SEPARATOR '\\' +#else +# define OS_FILE_PATH_SEPARATOR '/' +#endif + +/****************************************************************//** +This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: new full pathname */ +UNIV_INTERN +char* +os_file_make_new_pathname( +/*======================*/ + const char* old_path, /*!< in: pathname */ + const char* tablename) /*!< in: contains new base name */ +{ + ulint dir_len; + char* last_slash; + char* base_name; + char* new_path; + ulint new_path_len; + + /* Split the tablename into its database and table name components. + They are separated by a '/'. */ + last_slash = strrchr((char*) tablename, '/'); + base_name = last_slash ? last_slash + 1 : (char*) tablename; + + /* Find the offset of the last slash. We will strip off the + old basename.ibd which starts after that slash. */ + last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR); + dir_len = last_slash ? last_slash - old_path : strlen(old_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, old_path, dir_len); + + ut_snprintf(new_path + dir_len, + new_path_len - dir_len, + "%c%s.ibd", + OS_FILE_PATH_SEPARATOR, + base_name); + + return(new_path); +} + +/****************************************************************//** +This function returns a remote path name by combining a data directory +path provided in a DATA DIRECTORY clause with the tablename which is +in the form 'database/tablename'. 
It strips the file basename (which +is the tablename) found after the last directory in the path provided. +The full filepath created will include the database name as a directory +under the path provided. The filename is the tablename with the '.ibd' +extension. All input and output strings are null-terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: A full pathname; data_dir_path/databasename/tablename.ibd */ +UNIV_INTERN +char* +os_file_make_remote_pathname( +/*=========================*/ + const char* data_dir_path, /*!< in: pathname */ + const char* tablename, /*!< in: tablename */ + const char* extention) /*!< in: file extention; ibd,cfg */ +{ + ulint data_dir_len; + char* last_slash; + char* new_path; + ulint new_path_len; + + ut_ad(extention && strlen(extention) == 3); + + /* Find the offset of the last slash. We will strip off the + old basename or tablename which starts after that slash. */ + last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = data_dir_len + strlen(tablename) + + sizeof "/." + strlen(extention); + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, data_dir_path, data_dir_len); + ut_snprintf(new_path + data_dir_len, + new_path_len - data_dir_len, + "%c%s.%s", + OS_FILE_PATH_SEPARATOR, + tablename, + extention); + + srv_normalize_path_for_win(new_path); + + return(new_path); +} + +/****************************************************************//** +This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. */ +UNIV_INTERN +void +os_file_make_data_dir_path( +/*========================*/ + char* data_dir_path) /*!< in/out: full path/data_dir_path */ +{ + char* ptr; + char* tablename; + ulint tablename_len; + + /* Replace the period before the extension with a null byte. */ + ptr = strrchr((char*) data_dir_path, '.'); + if (!ptr) { + return; + } + ptr[0] = '\0'; + + /* The tablename starts after the last slash. */ + ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + if (!ptr) { + return; + } + ptr[0] = '\0'; + tablename = ptr + 1; + + /* The databasename starts after the next to last slash. */ + ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + if (!ptr) { + return; + } + tablename_len = ut_strlen(tablename); + + ut_memmove(++ptr, tablename, tablename_len); + + ptr[tablename_len] = '\0'; +} + +/****************************************************************//** +The function os_file_dirname returns a directory component of a +null-terminated pathname string. In the usual case, dirname returns +the string up to, but not including, the final '/', and basename +is the component following the final '/'. Trailing '/' characters +are not counted as part of the pathname. + +If path does not contain a slash, dirname returns the string ".". 
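+
+An illustrative call; the returned copy must be released with
+mem_free() once it is no longer needed, and here dir would hold the
+string "/usr/lib":
+
+	char*	dir = os_file_dirname("/usr/lib/mysql");
+
+	mem_free(dir);
+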
+ +Concatenating the string returned by dirname, a "/", and the basename +yields a complete pathname. + +The return value is a copy of the directory component of the pathname. +The copy is allocated from heap. It is the caller responsibility +to free it after it is no longer needed. + +The following list of examples (taken from SUSv2) shows the strings +returned by dirname and basename for different paths: + + path dirname basename + "/usr/lib" "/usr" "lib" + "/usr/" "/" "usr" + "usr" "." "usr" + "/" "/" "/" + "." "." "." + ".." "." ".." + +@return own: directory component of the pathname */ +UNIV_INTERN +char* +os_file_dirname( +/*============*/ + const char* path) /*!< in: pathname */ +{ + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR); + if (!last_slash) { + /* No slash in the path, return "." */ + + return(mem_strdup(".")); + } + + /* Ok, there is a slash */ + + if (last_slash == path) { + /* last slash is the first char of the path */ + + return(mem_strdup("/")); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, last_slash - path)); +} + +/****************************************************************//** +Creates all missing subdirectories along the given path. +@return TRUE if call succeeded FALSE otherwise */ +UNIV_INTERN +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + const char* path) /*!< in: path name */ +{ + if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "read only mode set. Can't create subdirectories '%s'", + path); + + return(FALSE); + + } + + char* subdir = os_file_dirname(path); + + if (strlen(subdir) == 1 + && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) { + /* subdir is root or cwd, nothing to do */ + mem_free(subdir); + + return(TRUE); + } + + /* Test if subdir exists */ + os_file_type_t type; + ibool subdir_exists; + ibool success = os_file_status(subdir, &subdir_exists, &type); + + if (success && !subdir_exists) { + + /* subdir does not exist, create it */ + success = os_file_create_subdirs_if_needed(subdir); + + if (!success) { + mem_free(subdir); + + return(FALSE); + } + + success = os_file_create_directory(subdir, FALSE); + } + + mem_free(subdir); + + return(success); +} + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Returns a pointer to the nth slot in the aio array. +@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_get_nth_slot( +/*======================*/ + os_aio_array_t* array, /*!< in: aio array */ + ulint index) /*!< in: index of the slot */ +{ + ut_a(index < array->n_slots); + + return(&array->slots[index]); +} + +#if defined(LINUX_NATIVE_AIO) +/******************************************************************//** +Creates an io_context for native linux AIO. +@return TRUE on success. */ +static +ibool +os_aio_linux_create_io_ctx( +/*=======================*/ + ulint max_events, /*!< in: number of events. */ + io_context_t* io_ctx) /*!< out: io_ctx to initialize. */ +{ + int ret; + ulint retries = 0; + +retry: + memset(io_ctx, 0x0, sizeof(*io_ctx)); + + /* Initialize the io_ctx. Tell it how many pending + IO requests this context will handle. */ + + ret = io_setup(max_events, io_ctx); + if (ret == 0) { +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "InnoDB: Linux native AIO:" + " initialized io_ctx for segment\n"); +#endif + /* Success. Return now. */ + return(TRUE); + } + + /* If we hit EAGAIN we'll make a few attempts before failing. 
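+	(On Linux, io_setup() fails with EAGAIN when the requested number
+	of events would exceed the system-wide limit in
+	/proc/sys/fs/aio-max-nr, so the retries below only help if other
+	io contexts are destroyed in the meantime; raising fs.aio-max-nr
+	is the durable fix.)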
*/ + + switch (ret) { + case -EAGAIN: + if (retries == 0) { + /* First time around. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: io_setup() failed" + " with EAGAIN. Will make %d attempts" + " before giving up.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + } + + if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) { + ++retries; + fprintf(stderr, + "InnoDB: Warning: io_setup() attempt" + " %lu failed.\n", + retries); + os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP); + goto retry; + } + + /* Have tried enough. Better call it a day. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: io_setup() failed" + " with EAGAIN after %d attempts.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + break; + + case -ENOSYS: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO interface" + " is not supported on this platform. Please" + " check your OS documentation and install" + " appropriate binary of InnoDB.\n"); + + break; + + default: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO setup" + " returned following error[%d]\n", -ret); + break; + } + + fprintf(stderr, + "InnoDB: You can disable Linux Native AIO by" + " setting innodb_use_native_aio = 0 in my.cnf\n"); + return(FALSE); +} + +/******************************************************************//** +Checks if the system supports native linux aio. On some kernel +versions where native aio is supported it won't work on tmpfs. In such +cases we can't use native aio as it is not possible to mix simulated +and native aio. +@return: TRUE if supported, FALSE otherwise. */ +static +ibool +os_aio_native_aio_supported(void) +/*=============================*/ +{ + int fd; + io_context_t io_ctx; + char name[1000]; + + if (!os_aio_linux_create_io_ctx(1, &io_ctx)) { + /* The platform does not support native aio. */ + return(FALSE); + } else if (!srv_read_only_mode) { + /* Now check if tmpdir supports native aio ops. */ + fd = innobase_mysql_tmpfile(); + + if (fd < 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to create temp file to check " + "native AIO support."); + + return(FALSE); + } + } else { + + srv_normalize_path_for_win(srv_log_group_home_dir); + + ulint dirnamelen = strlen(srv_log_group_home_dir); + ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile"); + memcpy(name, srv_log_group_home_dir, dirnamelen); + + /* Add a path separator if needed. */ + if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + name[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + strcpy(name + dirnamelen, "ib_logfile0"); + + fd = ::open(name, O_RDONLY); + + if (fd == -1) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to open \"%s\" to check " + "native AIO read support.", name); + + return(FALSE); + } + } + + struct io_event io_event; + + memset(&io_event, 0x0, sizeof(io_event)); + + byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2)); + byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + + struct iocb iocb; + + /* Suppress valgrind warning. */ + memset(buf, 0x00, UNIV_PAGE_SIZE * 2); + memset(&iocb, 0x0, sizeof(iocb)); + + struct iocb* p_iocb = &iocb; + + if (!srv_read_only_mode) { + io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0); + } else { + ut_a(UNIV_PAGE_SIZE >= 512); + io_prep_pread(p_iocb, fd, ptr, 512, 0); + } + + int err = io_submit(io_ctx, 1, &p_iocb); + + if (err >= 1) { + /* Now collect the submitted IO request. 
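+	With min_nr == nr == 1 and a NULL timeout, io_getevents() blocks
+	until the single probe request completes, so a return value of 1
+	means the kernel accepted and finished a real native AIO round
+	trip.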
*/
+		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
+	}
+
+	ut_free(buf);
+	close(fd);
+
+	switch (err) {
+	case 1:
+		return(TRUE);
+
+	case -EINVAL:
+	case -ENOSYS:
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Linux Native AIO not supported. You can either "
+			"move %s to a file system that supports native "
+			"AIO or you can set innodb_use_native_aio to "
+			"FALSE to avoid this message.",
+			srv_read_only_mode ? name : "tmpdir");
+
+		/* fall through. */
+	default:
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Linux Native AIO check on %s returned error[%d]",
+			srv_read_only_mode ? name : "tmpdir", -err);
+	}
+
+	return(FALSE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/******************************************************************//**
+Creates an aio wait array. Note that we return NULL in case of failure.
+We don't care about freeing memory here because we assume that a
+failure will result in the server refusing to start up.
+@return own: aio array, NULL on failure */
+static
+os_aio_array_t*
+os_aio_array_create(
+/*================*/
+	ulint	n,		/*!< in: maximum number of pending aio
+				operations allowed; n must be
+				divisible by n_segments */
+	ulint	n_segments)	/*!< in: number of segments in the aio array */
+{
+	os_aio_array_t*	array;
+#ifdef WIN_ASYNC_IO
+	OVERLAPPED*	over;
+#elif defined(LINUX_NATIVE_AIO)
+	struct io_event*	io_event = NULL;
+#endif /* WIN_ASYNC_IO */
+	ut_a(n > 0);
+	ut_a(n_segments > 0);
+
+	array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
+	memset(array, 0x0, sizeof(*array));
+
+	array->mutex = os_mutex_create();
+	array->not_full = os_event_create();
+	array->is_empty = os_event_create();
+
+	os_event_set(array->is_empty);
+
+	array->n_slots = n;
+	array->n_segments = n_segments;
+
+	array->slots = static_cast<os_aio_slot_t*>(
+		ut_malloc(n * sizeof(*array->slots)));
+
+	memset(array->slots, 0x0, n * sizeof(*array->slots));
+#ifdef __WIN__
+	array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
+#endif /* __WIN__ */
+
+#if defined(LINUX_NATIVE_AIO)
+	array->aio_ctx = NULL;
+	array->aio_events = NULL;
+
+	/* If we are not using native aio interface then skip this
+	part of initialization. */
+	if (!srv_use_native_aio) {
+		goto skip_native_aio;
+	}
+
+	/* Initialize the io_context array. One io_context
+	per segment in the array. */
+
+	array->aio_ctx = static_cast<io_context**>(
+		ut_malloc(n_segments * sizeof(*array->aio_ctx)));
+
+	for (ulint i = 0; i < n_segments; ++i) {
+		if (!os_aio_linux_create_io_ctx(n/n_segments,
+						&array->aio_ctx[i])) {
+			/* If something bad happened during aio setup
+			we should call it a day and return right away.
+			We don't care about any leaks because a failure
+			to initialize the io subsystem means that the
+			server (or at least the innodb storage engine)
+			is not going to start up. */
+			return(NULL);
+		}
+	}
+
+	/* Initialize the event array. One event per slot. 
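+	These io_event structs are the completion records that
+	io_getevents() later fills in for the handler threads; one event
+	per slot means a single reap can, at most, drain every request in
+	the array.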
*/
+	io_event = static_cast<struct io_event*>(
+		ut_malloc(n * sizeof(*io_event)));
+
+	memset(io_event, 0x0, sizeof(*io_event) * n);
+	array->aio_events = io_event;
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
+	for (ulint i = 0; i < n; i++) {
+		os_aio_slot_t*	slot;
+
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		slot->pos = i;
+		slot->reserved = FALSE;
+#ifdef WIN_ASYNC_IO
+		slot->handle = CreateEvent(NULL, TRUE, FALSE, NULL);
+
+		over = &slot->control;
+
+		over->hEvent = slot->handle;
+
+		array->handles[i] = over->hEvent;
+
+#elif defined(LINUX_NATIVE_AIO)
+		memset(&slot->control, 0x0, sizeof(slot->control));
+		slot->n_bytes = 0;
+		slot->ret = 0;
+#endif /* WIN_ASYNC_IO */
+	}
+
+	return(array);
+}
+
+/************************************************************************//**
+Frees an aio wait array. */
+static
+void
+os_aio_array_free(
+/*==============*/
+	os_aio_array_t*& array)	/*!< in, own: array to free */
+{
+#ifdef WIN_ASYNC_IO
+	ulint	i;
+
+	for (i = 0; i < array->n_slots; i++) {
+		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(array, i);
+		CloseHandle(slot->handle);
+	}
+#endif /* WIN_ASYNC_IO */
+
+#ifdef __WIN__
+	ut_free(array->handles);
+#endif /* __WIN__ */
+	os_mutex_free(array->mutex);
+	os_event_free(array->not_full);
+	os_event_free(array->is_empty);
+
+#if defined(LINUX_NATIVE_AIO)
+	if (srv_use_native_aio) {
+		ut_free(array->aio_events);
+		ut_free(array->aio_ctx);
+	}
+#endif /* LINUX_NATIVE_AIO */
+
+	ut_free(array->slots);
+	ut_free(array);
+
+	array = 0;
+}
+
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that. */
+UNIV_INTERN
+ibool
+os_aio_init(
+/*========*/
+	ulint	n_per_seg,	/*!< in: maximum number of pending aio
+				operations allowed per segment */
+	ulint	n_read_segs,	/*!< in: number of reader threads */
+	ulint	n_write_segs,	/*!< in: number of writer threads */
+	ulint	n_slots_sync)	/*!< in: number of slots in the sync aio
+				array */
+{
+	os_io_init_simple();
+
+#if defined(LINUX_NATIVE_AIO)
+	/* Check if native aio is supported on this system and tmpfs */
+	if (srv_use_native_aio && !os_aio_native_aio_supported()) {
+
+		ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
+
+		srv_use_native_aio = FALSE;
+	}
+#endif /* LINUX_NATIVE_AIO */
+
+	srv_reset_io_thread_op_info();
+
+	os_aio_read_array = os_aio_array_create(
+		n_read_segs * n_per_seg, n_read_segs);
+
+	if (os_aio_read_array == NULL) {
+		return(FALSE);
+	}
+
+	ulint	start = (srv_read_only_mode) ? 0 : 2;
+	ulint	n_segs = n_read_segs + start;
+
+	/* 0 is the ibuf segment and 1 is the log segment.
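+	As a concrete layout (assuming the common configuration of four
+	read and four write i/o threads, i.e. n_read_segs =
+	n_write_segs = 4, and srv_read_only_mode off):
+
+	    global segment 0        insert buffer (ibuf) thread
+	    global segment 1        log thread
+	    global segments 2..5    read threads
+	    global segments 6..9    write threads
+
+	In read-only mode the ibuf, log and write arrays are not
+	created at all, so the read segments start at 0.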
*/ + for (ulint i = start; i < n_segs; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "read thread"; + } + + ulint n_segments = n_read_segs; + + if (!srv_read_only_mode) { + + os_aio_log_array = os_aio_array_create(n_per_seg, 1); + + if (os_aio_log_array == NULL) { + return(FALSE); + } + + ++n_segments; + + srv_io_thread_function[1] = "log thread"; + + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + if (os_aio_ibuf_array == NULL) { + return(FALSE); + } + + ++n_segments; + + srv_io_thread_function[0] = "insert buffer thread"; + + os_aio_write_array = os_aio_array_create( + n_write_segs * n_per_seg, n_write_segs); + + if (os_aio_write_array == NULL) { + return(FALSE); + } + + n_segments += n_write_segs; + + for (ulint i = start + n_read_segs; i < n_segments; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "write thread"; + } + + ut_ad(n_segments >= 4); + } else { + ut_ad(n_segments > 0); + } + + os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); + + if (os_aio_sync_array == NULL) { + return(FALSE); + } + + os_aio_n_segments = n_segments; + + os_aio_validate(); + + os_aio_segment_wait_events = static_cast<os_event_t*>( + ut_malloc(n_segments * sizeof *os_aio_segment_wait_events)); + + for (ulint i = 0; i < n_segments; ++i) { + os_aio_segment_wait_events[i] = os_event_create(); + } + + os_last_printout = ut_time(); + + return(TRUE); + +} + +/*********************************************************************** +Frees the asynchronous io system. */ +UNIV_INTERN +void +os_aio_free(void) +/*=============*/ +{ + if (os_aio_ibuf_array != 0) { + os_aio_array_free(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_free(os_aio_log_array); + } + + if (os_aio_write_array != 0) { + os_aio_array_free(os_aio_write_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_free(os_aio_sync_array); + } + + os_aio_array_free(os_aio_read_array); + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_event_free(os_aio_segment_wait_events[i]); + } + + ut_free(os_aio_segment_wait_events); + os_aio_segment_wait_events = 0; + os_aio_n_segments = 0; +} + +#ifdef WIN_ASYNC_IO +/************************************************************************//** +Wakes up all async i/o threads in the array in Windows async i/o at +shutdown. */ +static +void +os_aio_array_wake_win_aio_at_shutdown( +/*==================================*/ + os_aio_array_t* array) /*!< in: aio array */ +{ + ulint i; + + for (i = 0; i < array->n_slots; i++) { + + SetEvent((array->slots + i)->handle); + } +} +#endif + +/************************************************************************//** +Wakes up all async i/o threads so that they know to exit themselves in +shutdown. */ +UNIV_INTERN +void +os_aio_wake_all_threads_at_shutdown(void) +/*=====================================*/ +{ +#ifdef WIN_ASYNC_IO + /* This code wakes up all ai/o threads in Windows native aio */ + os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array); + if (os_aio_write_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); + } + +#elif defined(LINUX_NATIVE_AIO) + + /* When using native AIO interface the io helper threads + wait on io_getevents with a timeout value of 500ms. At + each wake up these threads check the server status. 
+ No need to do anything to wake them up. */ + + if (srv_use_native_aio) { + return; + } + + /* Fall through to simulated AIO handler wakeup if we are + not using native AIO. */ +#endif /* !WIN_ASYNC_AIO */ + + /* This loop wakes up all simulated ai/o threads */ + + for (ulint i = 0; i < os_aio_n_segments; i++) { + + os_event_set(os_aio_segment_wait_events[i]); + } +} + +/************************************************************************//** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. */ +UNIV_INTERN +void +os_aio_wait_until_no_pending_writes(void) +/*=====================================*/ +{ + ut_ad(!srv_read_only_mode); + os_event_wait(os_aio_write_array->is_empty); +} + +/**********************************************************************//** +Calculates segment number for a slot. +@return segment number (which is the number used by, for example, +i/o-handler threads) */ +static +ulint +os_aio_get_segment_no_from_slot( +/*============================*/ + os_aio_array_t* array, /*!< in: aio wait array */ + os_aio_slot_t* slot) /*!< in: slot in this array */ +{ + ulint segment; + ulint seg_len; + + if (array == os_aio_ibuf_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_IBUF_SEGMENT; + + } else if (array == os_aio_log_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_LOG_SEGMENT; + + } else if (array == os_aio_read_array) { + seg_len = os_aio_read_array->n_slots + / os_aio_read_array->n_segments; + + segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; + } else { + ut_ad(!srv_read_only_mode); + ut_a(array == os_aio_write_array); + + seg_len = os_aio_write_array->n_slots + / os_aio_write_array->n_segments; + + segment = os_aio_read_array->n_segments + 2 + + slot->pos / seg_len; + } + + return(segment); +} + +/**********************************************************************//** +Calculates local segment number and aio array from global segment number. +@return local segment number within the aio array */ +static +ulint +os_aio_get_array_and_local_segment( +/*===============================*/ + os_aio_array_t** array, /*!< out: aio wait array */ + ulint global_segment)/*!< in: global segment number */ +{ + ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (srv_read_only_mode) { + *array = os_aio_read_array; + + return(global_segment); + } else if (global_segment == IO_IBUF_SEGMENT) { + *array = os_aio_ibuf_array; + segment = 0; + + } else if (global_segment == IO_LOG_SEGMENT) { + *array = os_aio_log_array; + segment = 0; + + } else if (global_segment < os_aio_read_array->n_segments + 2) { + *array = os_aio_read_array; + + segment = global_segment - 2; + } else { + *array = os_aio_write_array; + + segment = global_segment - (os_aio_read_array->n_segments + 2); + } + + return(segment); +} + +/*******************************************************************//** +Requests for a slot in the aio array. If no slot is available, waits until +not_full-event becomes signaled. 
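+The preferred local segment is computed from the file offset as
+(offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % n_segments, i.e. the offset in
+units of 64 pages. As a worked example (assuming the usual 16KiB page,
+so UNIV_PAGE_SIZE_SHIFT = 14): every 2^(14+6) bytes = 1MiB window of a
+file maps to one local segment, so requests against neighbouring
+extents queue up in the same segment, where the simulated aio handler
+can merge them.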
+@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_reserve_slot( +/*======================*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + os_aio_array_t* array, /*!< in: aio array */ + fil_node_t* message1,/*!< in: message to be passed along with + the aio operation */ + void* message2,/*!< in: message to be passed along with + the aio operation */ + os_file_t file, /*!< in: file handle */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset */ + ulint len, /*!< in: length of the block to read or write */ + ulint space_id) +{ + os_aio_slot_t* slot = NULL; +#ifdef WIN_ASYNC_IO + OVERLAPPED* control; + +#elif defined(LINUX_NATIVE_AIO) + + struct iocb* iocb; + off_t aio_offset; + +#endif /* WIN_ASYNC_IO */ + ulint i; + ulint counter; + ulint slots_per_seg; + ulint local_seg; + +#ifdef WIN_ASYNC_IO + ut_a((len & 0xFFFFFFFFUL) == len); +#endif /* WIN_ASYNC_IO */ + + /* No need of a mutex. Only reading constant fields */ + slots_per_seg = array->n_slots / array->n_segments; + + /* We attempt to keep adjacent blocks in the same local + segment. This can help in merging IO requests when we are + doing simulated AIO */ + local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) + % array->n_segments; + +loop: + os_mutex_enter(array->mutex); + + if (array->n_reserved == array->n_slots) { + os_mutex_exit(array->mutex); + + if (!srv_use_native_aio) { + /* If the handler threads are suspended, wake them + so that we get more slots */ + + os_aio_simulated_wake_handler_threads(); + } + + os_event_wait(array->not_full); + + goto loop; + } + + /* We start our search for an available slot from our preferred + local segment and do a full scan of the array. We are + guaranteed to find a slot in full scan. */ + for (i = local_seg * slots_per_seg, counter = 0; + counter < array->n_slots; + i++, counter++) { + + i %= array->n_slots; + + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved == FALSE) { + goto found; + } + } + + /* We MUST always be able to get hold of a reserved slot. */ + ut_error; + +found: + ut_a(slot->reserved == FALSE); + array->n_reserved++; + + if (array->n_reserved == 1) { + os_event_reset(array->is_empty); + } + + if (array->n_reserved == array->n_slots) { + os_event_reset(array->not_full); + } + + slot->reserved = TRUE; + slot->reservation_time = ut_time(); + slot->message1 = message1; + slot->message2 = message2; + slot->file = file; + slot->name = name; + slot->len = len; + slot->type = type; + slot->buf = static_cast<byte*>(buf); + slot->offset = offset; + slot->io_already_done = FALSE; + slot->space_id = space_id; + +#ifdef WIN_ASYNC_IO + control = &slot->control; + control->Offset = (DWORD) offset & 0xFFFFFFFF; + control->OffsetHigh = (DWORD) (offset >> 32); + ResetEvent(slot->handle); + +#elif defined(LINUX_NATIVE_AIO) + + /* If we are not using native AIO skip this part. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Check if we are dealing with 64 bit arch. + If not then make sure that offset fits in 32 bits. 
*/ + aio_offset = (off_t) offset; + + ut_a(sizeof(aio_offset) >= sizeof(offset) + || ((os_offset_t) aio_offset) == offset); + + iocb = &slot->control; + + if (type == OS_FILE_READ) { + io_prep_pread(iocb, file, buf, len, aio_offset); + } else { + ut_a(type == OS_FILE_WRITE); + io_prep_pwrite(iocb, file, buf, len, aio_offset); + } + + iocb->data = (void*) slot; + slot->n_bytes = 0; + slot->ret = 0; + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ + os_mutex_exit(array->mutex); + + return(slot); +} + +/*******************************************************************//** +Frees a slot in the aio array. */ +static +void +os_aio_array_free_slot( +/*===================*/ + os_aio_array_t* array, /*!< in: aio array */ + os_aio_slot_t* slot) /*!< in: pointer to slot */ +{ + os_mutex_enter(array->mutex); + + ut_ad(slot->reserved); + + slot->reserved = FALSE; + + array->n_reserved--; + + if (array->n_reserved == array->n_slots - 1) { + os_event_set(array->not_full); + } + + if (array->n_reserved == 0) { + os_event_set(array->is_empty); + } + +#ifdef WIN_ASYNC_IO + + ResetEvent(slot->handle); + +#elif defined(LINUX_NATIVE_AIO) + + if (srv_use_native_aio) { + memset(&slot->control, 0x0, sizeof(slot->control)); + slot->n_bytes = 0; + slot->ret = 0; + /*fprintf(stderr, "Freed up Linux native slot.\n");*/ + } else { + /* These fields should not be used if we are not + using native AIO. */ + ut_ad(slot->n_bytes == 0); + ut_ad(slot->ret == 0); + } + +#endif + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Wakes up a simulated aio i/o-handler thread if it has something to do. */ +static +void +os_aio_simulated_wake_handler_thread( +/*=================================*/ + ulint global_segment) /*!< in: the number of the segment in the aio + arrays */ +{ + os_aio_array_t* array; + ulint segment; + + ut_ad(!srv_use_native_aio); + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + + ulint n = array->n_slots / array->n_segments; + + segment *= n; + + /* Look through n slots after the segment * n'th slot */ + + os_mutex_enter(array->mutex); + + for (ulint i = 0; i < n; ++i) { + const os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, segment + i); + + if (slot->reserved) { + + /* Found an i/o request */ + + os_mutex_exit(array->mutex); + + os_event_t event; + + event = os_aio_segment_wait_events[global_segment]; + + os_event_set(event); + + return; + } + } + + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Wakes up simulated aio i/o-handler threads if they have something to do. */ +UNIV_INTERN +void +os_aio_simulated_wake_handler_threads(void) +/*=======================================*/ +{ + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = FALSE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_simulated_wake_handler_thread(i); + } +} + +/**********************************************************************//** +This function can be called if one wants to post a batch of reads and +prefers an i/o-handler thread to handle them all at once later. You must +call os_aio_simulated_wake_handler_threads later to ensure the threads +are not left sleeping! 
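+The intended calling pattern is therefore (a sketch; the middle call
+stands for whatever batch of reads the caller actually posts):
+
+    os_aio_simulated_put_read_threads_to_sleep();
+    for (each page in the read-ahead batch) {
+        os_aio(OS_FILE_READ,
+               OS_AIO_NORMAL | OS_AIO_SIMULATED_WAKE_LATER, ...);
+    }
+    os_aio_simulated_wake_handler_threads();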
*/ +UNIV_INTERN +void +os_aio_simulated_put_read_threads_to_sleep(void) +/*============================================*/ +{ + +/* The idea of putting background IO threads to sleep is only for +Windows when using simulated AIO. Windows XP seems to schedule +background threads too eagerly to allow for coalescing during +readahead requests. */ +#ifdef __WIN__ + os_aio_array_t* array; + + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = TRUE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_get_array_and_local_segment(&array, i); + + if (array == os_aio_read_array) { + + os_event_reset(os_aio_segment_wait_events[i]); + } + } +#endif /* __WIN__ */ +} + +#if defined(LINUX_NATIVE_AIO) +/*******************************************************************//** +Dispatch an AIO request to the kernel. +@return TRUE on success. */ +static +ibool +os_aio_linux_dispatch( +/*==================*/ + os_aio_array_t* array, /*!< in: io request array. */ + os_aio_slot_t* slot) /*!< in: an already reserved slot. */ +{ + int ret; + ulint io_ctx_index; + struct iocb* iocb; + + ut_ad(slot != NULL); + ut_ad(array); + + ut_a(slot->reserved); + + /* Find out what we are going to work with. + The iocb struct is directly in the slot. + The io_context is one per segment. */ + + iocb = &slot->control; + io_ctx_index = (slot->pos * array->n_segments) / array->n_slots; + + ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb); + +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot, + array->aio_ctx[io_ctx_index], (ulong) io_ctx_index); +#endif + + /* io_submit returns number of successfully + queued requests or -errno. */ + if (UNIV_UNLIKELY(ret != 1)) { + errno = -ret; + return(FALSE); + } + + return(TRUE); +} +#endif /* LINUX_NATIVE_AIO */ + + +/*******************************************************************//** +NOTE! Use the corresponding macro os_aio(), not directly this function! +Requests an asynchronous i/o operation. +@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INTERN +ibool +os_aio_func( +/*========*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed + to OS_AIO_SIMULATED_WAKE_LATER: the + last flag advises this function not to wake + i/o-handler threads, but the caller will + do the waking explicitly later, in this + way the caller can post several requests in + a batch; NOTE that the batch must not be + so big that it exhausts the slots in aio + arrays! NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! 
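+				To see why: if, say, the read array has
+				256 slots, posting a 257th read with
+				OS_AIO_SIMULATED_WAKE_LATER blocks on
+				the not_full event, and since no
+				handler thread has been woken to drain
+				the 256 already posted, nothing ever
+				signals it (the slot count here is
+				illustrative only)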
*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + ulint space_id, + trx_t* trx) +{ + os_aio_array_t* array; + os_aio_slot_t* slot; +#ifdef WIN_ASYNC_IO + ibool retval; + BOOL ret = TRUE; + DWORD len = (DWORD) n; + struct fil_node_t* dummy_mess1; + void* dummy_mess2; + ulint dummy_type; +#endif /* WIN_ASYNC_IO */ + ulint wake_later; + + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + ut_ad(n % OS_MIN_LOG_BLOCK_SIZE == 0); + ut_ad(offset % OS_MIN_LOG_BLOCK_SIZE == 0); + ut_ad(os_aio_validate_skip()); +#ifdef WIN_ASYNC_IO + ut_ad((n & 0xFFFFFFFFUL) == n); +#endif + + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; + mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); + + if (mode == OS_AIO_SYNC +#ifdef WIN_ASYNC_IO + && !srv_use_native_aio +#endif /* WIN_ASYNC_IO */ + ) { + /* This is actually an ordinary synchronous read or write: + no need to use an i/o-handler thread. NOTE that if we use + Windows async i/o, Windows does not allow us to use + ordinary synchronous os_file_read etc. on the same file, + therefore we have built a special mechanism for synchronous + wait in the Windows case. + Also note that the Performance Schema instrumentation has + been performed by current os_aio_func()'s wrapper function + pfs_os_aio_func(). So we would no longer need to call + Performance Schema instrumented os_file_read() and + os_file_write(). Instead, we should use os_file_read_func() + and os_file_write_func() */ + + if (type == OS_FILE_READ) { + return(os_file_read_func(file, buf, offset, n, trx)); + } + + ut_ad(!srv_read_only_mode); + ut_a(type == OS_FILE_WRITE); + + return(os_file_write_func(name, file, buf, offset, n)); + } + +try_again: + switch (mode) { + case OS_AIO_NORMAL: + if (type == OS_FILE_READ) { + array = os_aio_read_array; + } else { + ut_ad(!srv_read_only_mode); + array = os_aio_write_array; + } + break; + case OS_AIO_IBUF: + ut_ad(type == OS_FILE_READ); + /* Reduce probability of deadlock bugs in connection with ibuf: + do not let the ibuf i/o handler sleep */ + + wake_later = FALSE; + + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_ibuf_array; + } + break; + case OS_AIO_LOG: + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_log_array; + } + break; + case OS_AIO_SYNC: + array = os_aio_sync_array; +#if defined(LINUX_NATIVE_AIO) + /* In Linux native AIO we don't use sync IO array. 
*/ + ut_a(!srv_use_native_aio); +#endif /* LINUX_NATIVE_AIO */ + break; + default: + ut_error; + array = NULL; /* Eliminate compiler warning */ + } + + if (trx && type == OS_FILE_READ) + { + trx->io_reads++; + trx->io_read += n; + } + slot = os_aio_array_reserve_slot(type, array, message1, message2, file, + name, buf, offset, n, space_id); + if (type == OS_FILE_READ) { + if (srv_use_native_aio) { + os_n_file_reads++; + os_bytes_read_since_printout += n; +#ifdef WIN_ASYNC_IO + ret = ReadFile(file, buf, (DWORD) n, &len, + &(slot->control)); + +#elif defined(LINUX_NATIVE_AIO) + if (!os_aio_linux_dispatch(array, slot)) { + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + if (srv_use_native_aio) { + os_n_file_writes++; +#ifdef WIN_ASYNC_IO + ret = WriteFile(file, buf, (DWORD) n, &len, + &(slot->control)); + +#elif defined(LINUX_NATIVE_AIO) + if (!os_aio_linux_dispatch(array, slot)) { + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else { + ut_error; + } + +#ifdef WIN_ASYNC_IO + if (srv_use_native_aio) { + if ((ret && len == n) + || (!ret && GetLastError() == ERROR_IO_PENDING)) { + /* aio was queued successfully! */ + + if (mode == OS_AIO_SYNC) { + /* We want a synchronous i/o operation on a + file where we also use async i/o: in Windows + we must use the same wait mechanism as for + async i/o */ + + retval = os_aio_windows_handle( + ULINT_UNDEFINED, slot->pos, + &dummy_mess1, &dummy_mess2, + &dummy_type); + + return(retval); + } + + return(TRUE); + } + + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + /* aio was queued successfully! */ + return(TRUE); + +#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO +err_exit: +#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */ + os_aio_array_free_slot(array, slot); + + if (os_file_handle_error( + name,type == OS_FILE_READ ? "aio read" : "aio write")) { + + goto try_again; + } + + return(FALSE); +} + +#ifdef WIN_ASYNC_IO +/**********************************************************************//** +This function is only used in Windows asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait the +for completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! 
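+How the finished slot is located (a restatement of the logic below, not
+extra behaviour): the n event handles of a segment are contiguous in
+array->handles, so the thread waits with
+
+    i = WaitForMultipleObjects((DWORD) n, array->handles + segment * n,
+                               FALSE, INFINITE);
+
+and i is then the slot index relative to the start of the segment
+(WAIT_OBJECT_0 is 0, so no offset needs to be subtracted).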
+@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_windows_handle( +/*==================*/ + ulint segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads; if + this is ULINT_UNDEFINED, then it means that + sync aio is used, and this parameter is + ignored */ + ulint pos, /*!< this parameter is used only in sync aio: + wait for the aio slot at this position */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* space_id) +{ + ulint orig_seg = segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; + ulint i; + ibool ret_val; + BOOL ret; + DWORD len; + BOOL retry = FALSE; + + if (segment == ULINT_UNDEFINED) { + segment = 0; + array = os_aio_sync_array; + } else { + segment = os_aio_get_array_and_local_segment(&array, segment); + } + + /* NOTE! We only access constant fields in os_aio_array. Therefore + we do not have to acquire the protecting mutex yet */ + + ut_ad(os_aio_validate_skip()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + if (array == os_aio_sync_array) { + + WaitForSingleObject( + os_aio_array_get_nth_slot(array, pos)->handle, + INFINITE); + + i = pos; + + } else { + if (orig_seg != ULINT_UNDEFINED) { + srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); + } + + i = WaitForMultipleObjects( + (DWORD) n, array->handles + segment * n, + FALSE, INFINITE); + } + + os_mutex_enter(array->mutex); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS + && array->n_reserved == 0) { + *message1 = NULL; + *message2 = NULL; + os_mutex_exit(array->mutex); + return(TRUE); + } + + ut_a(i >= WAIT_OBJECT_0 && i <= WAIT_OBJECT_0 + n); + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + ut_a(slot->reserved); + + if (orig_seg != ULINT_UNDEFINED) { + srv_set_io_thread_op_info( + orig_seg, "get windows aio return value"); + } + + ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE); + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + *space_id = slot->space_id; + + if (ret && len == slot->len) { + + ret_val = TRUE; + } else if (os_file_handle_error(slot->name, "Windows aio")) { + + retry = TRUE; + } else { + + ret_val = FALSE; + } + + os_mutex_exit(array->mutex); + + if (retry) { + /* retry failed read/write operation synchronously. + No need to hold array->mutex. */ + +#ifdef UNIV_PFS_IO + /* This read/write does not go through os_file_read + and os_file_write APIs, need to register with + performance schema explicitly here. */ + struct PSI_file_locker* locker = NULL; + register_pfs_file_io_begin(locker, slot->file, slot->len, + (slot->type == OS_FILE_WRITE) + ? 
PSI_FILE_WRITE + : PSI_FILE_READ, + __FILE__, __LINE__); +#endif + + ut_a((slot->len & 0xFFFFFFFFUL) == slot->len); + + switch (slot->type) { + case OS_FILE_WRITE: + ret = WriteFile(slot->file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + + break; + case OS_FILE_READ: + ret = ReadFile(slot->file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + + break; + default: + ut_error; + } + +#ifdef UNIV_PFS_IO + register_pfs_file_io_end(locker, len); +#endif + + if (!ret && GetLastError() == ERROR_IO_PENDING) { + /* aio was queued successfully! + We want a synchronous i/o operation on a + file where we also use async i/o: in Windows + we must use the same wait mechanism as for + async i/o */ + + ret = GetOverlappedResult(slot->file, + &(slot->control), + &len, TRUE); + } + + ret_val = ret && len == slot->len; + } + + os_aio_array_free_slot(array, slot); + + return(ret_val); +} +#endif + +#if defined(LINUX_NATIVE_AIO) +/******************************************************************//** +This function is only used in Linux native asynchronous i/o. This is +called from within the io-thread. If there are no completed IO requests +in the slot array, the thread calls this function to collect more +requests from the kernel. +The io-thread waits on io_getevents(), which is a blocking call, with +a timeout value. Unless the system is very heavy loaded, keeping the +io-thread very busy, the io-thread will spend most of its time waiting +in this function. +The io-thread also exits in this function. It checks server status at +each wakeup and that is why we use timed wait in io_getevents(). */ +static +void +os_aio_linux_collect( +/*=================*/ + os_aio_array_t* array, /*!< in/out: slot array. */ + ulint segment, /*!< in: local segment no. */ + ulint seg_size) /*!< in: segment size. */ +{ + int i; + int ret; + ulint start_pos; + ulint end_pos; + struct timespec timeout; + struct io_event* events; + struct io_context* io_ctx; + + /* sanity checks. */ + ut_ad(array != NULL); + ut_ad(seg_size > 0); + ut_ad(segment < array->n_segments); + + /* Which part of event array we are going to work on. */ + events = &array->aio_events[segment * seg_size]; + + /* Which io_context we are going to use. */ + io_ctx = array->aio_ctx[segment]; + + /* Starting point of the segment we will be working on. */ + start_pos = segment * seg_size; + + /* End point. */ + end_pos = start_pos + seg_size; + +retry: + + /* Initialize the events. The timeout value is arbitrary. + We probably need to experiment with it a little. */ + memset(events, 0, sizeof(*events) * seg_size); + timeout.tv_sec = 0; + timeout.tv_nsec = OS_AIO_REAP_TIMEOUT; + + ret = io_getevents(io_ctx, 1, seg_size, events, &timeout); + + if (ret > 0) { + for (i = 0; i < ret; i++) { + os_aio_slot_t* slot; + struct iocb* control; + + control = (struct iocb*) events[i].obj; + ut_a(control != NULL); + + slot = (os_aio_slot_t*) control->data; + + /* Some sanity checks. */ + ut_a(slot != NULL); + ut_a(slot->reserved); + +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_getevents[%c]: slot[%p] ctx[%p]" + " seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', + slot, io_ctx, segment); +#endif + + /* We are not scribbling previous segment. */ + ut_a(slot->pos >= start_pos); + + /* We have not overstepped to next segment. */ + ut_a(slot->pos < end_pos); + + /* Mark this request as completed. The error handling + will be done in the calling function. 
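+			Broadly, in the io_event that io_getevents()
+			returns, res holds the byte count on success
+			(or a negative errno), and res2 is a secondary
+			status word that is zero on success; the
+			handler later compares n_bytes against len and
+			turns a negative ret back into errno.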
*/ + os_mutex_enter(array->mutex); + slot->n_bytes = events[i].res; + slot->ret = events[i].res2; + slot->io_already_done = TRUE; + os_mutex_exit(array->mutex); + } + return; + } + + if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + return; + } + + /* This error handling is for any error in collecting the + IO requests. The errors, if any, for any particular IO + request are simply passed on to the calling routine. */ + + switch (ret) { + case -EAGAIN: + /* Not enough resources! Try again. */ + case -EINTR: + /* Interrupted! I have tested the behaviour in case of an + interrupt. If we have some completed IOs available then + the return code will be the number of IOs. We get EINTR only + if there are no completed IOs and we have been interrupted. */ + case 0: + /* No pending request! Go back and check again. */ + goto retry; + } + + /* All other errors should cause a trap for now. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unexpected ret_code[%d] from io_getevents()!\n", + ret); + ut_error; +} + +/**********************************************************************//** +This function is only used in Linux native asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait for +the completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! +@return TRUE if the IO was successful */ +UNIV_INTERN +ibool +os_aio_linux_handle( +/*================*/ + ulint global_seg, /*!< in: segment number in the aio array + to wait for; segment 0 is the ibuf + i/o thread, segment 1 is log i/o thread, + then follow the non-ibuf read threads, + and the last are the non-ibuf write + threads. */ + fil_node_t**message1, /*!< out: the messages passed with the */ + void** message2, /*!< aio request; note that in case the + aio operation failed, these output + parameters are valid and can be used to + restart the operation. */ + ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* space_id) +{ + ulint segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; + ulint i; + ibool ret = FALSE; + + /* Should never be doing Sync IO here. */ + ut_a(global_seg != ULINT_UNDEFINED); + + /* Find the array and the local segment. */ + segment = os_aio_get_array_and_local_segment(&array, global_seg); + n = array->n_slots / array->n_segments; + + wait_for_event: + /* Loop until we have found a completed request. */ + for (;;) { + ibool any_reserved = FALSE; + os_mutex_enter(array->mutex); + for (i = 0; i < n; ++i) { + slot = os_aio_array_get_nth_slot( + array, i + segment * n); + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { + /* Something for us to work on. */ + goto found; + } else { + any_reserved = TRUE; + } + } + + os_mutex_exit(array->mutex); + + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (UNIV_UNLIKELY + (!any_reserved + && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + + /* Wait for some request. Note that we return + from wait iff we have found a request. 
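+		As a worked example of the partial i/o case handled
+		further below (the sizes are illustrative): if a 64KiB
+		read at offset 0 completes with n_bytes = 16384, the
+		slot is advanced past those bytes (buf += 16384,
+		offset = 16384, len = 49152) and the remainder is
+		resubmitted to io_submit() as a fresh request.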
*/ + + srv_set_io_thread_op_info(global_seg, + "waiting for completed aio requests"); + os_aio_linux_collect(array, segment, n); + } + +found: + /* Note that it may be that there are more then one completed + IO requests. We process them one at a time. We may have a case + here to improve the performance slightly by dealing with all + requests in one sweep. */ + srv_set_io_thread_op_info(global_seg, + "processing completed aio requests"); + + /* Ensure that we are scribbling only our segment. */ + ut_a(i < n); + + ut_ad(slot != NULL); + ut_ad(slot->reserved); + ut_ad(slot->io_already_done); + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + *space_id = slot->space_id; + + if (slot->ret == 0 && slot->n_bytes == (long) slot->len) { + + ret = TRUE; + } else if ((slot->ret == 0) && (slot->n_bytes > 0) + && (slot->n_bytes < (long) slot->len)) { + /* Partial read or write scenario */ + int submit_ret; + struct iocb* iocb; + slot->buf = (byte*)slot->buf + slot->n_bytes; + slot->offset = slot->offset + slot->n_bytes; + slot->len = slot->len - slot->n_bytes; + /* Resetting the bytes read/written */ + slot->n_bytes = 0; + slot->io_already_done = FALSE; + iocb = &(slot->control); + + if (slot->type == OS_FILE_READ) { + io_prep_pread(&slot->control, slot->file, slot->buf, + slot->len, (off_t) slot->offset); + } else { + ut_a(slot->type == OS_FILE_WRITE); + io_prep_pwrite(&slot->control, slot->file, slot->buf, + slot->len, (off_t) slot->offset); + } + /* Resubmit an I/O request */ + submit_ret = io_submit(array->aio_ctx[segment], 1, &iocb); + if (submit_ret < 0 ) { + /* Aborting in case of submit failure */ + ib_logf(IB_LOG_LEVEL_FATAL, + "Native Linux AIO interface. io_submit()" + " call failed when resubmitting a partial" + " I/O request on the file %s.", + slot->name); + } else { + ret = FALSE; + os_mutex_exit(array->mutex); + goto wait_for_event; + } + } else { + errno = -slot->ret; + + /* os_file_handle_error does tell us if we should retry + this IO. As it stands now, we don't do this retry when + reaping requests from a different context than + the dispatcher. This non-retry logic is the same for + windows and linux native AIO. + We should probably look into this to transparently + re-submit the IO. */ + os_file_handle_error(slot->name, "Linux aio"); + + ret = FALSE; + } + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); + + return(ret); +} +#endif /* LINUX_NATIVE_AIO */ + +/**********************************************************************//** +Does simulated aio. This function should be called by an i/o-handler +thread. 
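+In outline (a summary of the code below, not additional behaviour): the
+handler first returns any slot whose i/o is already done; failing that,
+it picks a request that has waited at least 2 seconds, oldest first
+with the lowest offset as a tie-breaker, or else simply the request at
+the lowest offset; it then merges up to OS_AIO_MERGE_N_CONSECUTIVE
+adjacent requests on the same file into a single synchronous read or
+write.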
+@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_simulated_handle( +/*====================*/ + ulint global_segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* space_id) +{ + os_aio_array_t* array; + ulint segment; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; + ulint n_consecutive; + ulint total_len; + ulint offs; + os_offset_t lowest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; + byte* combined_buf2; + ibool ret; + ibool any_reserved; + ulint n; + os_aio_slot_t* aio_slot; + + /* Fix compiler warning */ + *consecutive_ios = NULL; + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + +restart: + /* NOTE! We only access constant fields in os_aio_array. Therefore + we do not have to acquire the protecting mutex yet */ + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (a)"); + ut_ad(os_aio_validate_skip()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + /* Look through n slots after the segment * n'th slot */ + + if (array == os_aio_read_array + && os_aio_recommend_sleep_for_read_threads) { + + /* Give other threads chance to add several i/os to the array + at once. */ + + goto recommended_sleep; + } + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (b)"); + + /* Check if there is a slot for which the i/o has already been + done */ + any_reserved = FALSE; + + os_mutex_enter(array->mutex); + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: i/o for slot %lu" + " already done, returning\n", + (ulong) i); + } + + aio_slot = slot; + ret = TRUE; + goto slot_io_done; + } else { + any_reserved = TRUE; + } + } + + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_mutex_exit(array->mutex); + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + + n_consecutive = 0; + + /* If there are at least 2 seconds old requests, then pick the oldest + one to prevent starvation. If several requests have the same age, + then pick the one at the lowest offset. */ + + biggest_age = 0; + lowest_offset = IB_UINT64_MAX; + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved) { + + age = (ulint) difftime( + ut_time(), slot->reservation_time); + + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age + && slot->offset < lowest_offset)) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + biggest_age = age; + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + /* There were no old requests. 
Look for an i/o request at the + lowest offset in the array (we ignore the high 32 bits of the + offset in these heuristics) */ + + lowest_offset = IB_UINT64_MAX; + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot( + array, i + segment * n); + + if (slot->reserved && slot->offset < lowest_offset) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + + /* if n_consecutive != 0, then we have assigned + something valid to consecutive_ios[0] */ + ut_ad(n_consecutive != 0); + ut_ad(consecutive_ios[0] != NULL); + + aio_slot = consecutive_ios[0]; + + /* Check if there are several consecutive blocks to read or write */ + +consecutive_loop: + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved + && slot != aio_slot + && slot->offset == aio_slot->offset + aio_slot->len + && slot->type == aio_slot->type + && slot->file == aio_slot->file) { + + /* Found a consecutive i/o request */ + + consecutive_ios[n_consecutive] = slot; + n_consecutive++; + + aio_slot = slot; + + if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { + + goto consecutive_loop; + } else { + break; + } + } + } + + srv_set_io_thread_op_info(global_segment, "consecutive i/o requests"); + + /* We have now collected n_consecutive i/o requests in the array; + allocate a single buffer which can hold all data, and perform the + i/o */ + + total_len = 0; + aio_slot = consecutive_ios[0]; + + for (ulint i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; + } + + if (n_consecutive == 1) { + /* We can use the buffer of the i/o request */ + combined_buf = aio_slot->buf; + combined_buf2 = NULL; + } else { + combined_buf2 = static_cast<byte*>( + ut_malloc(total_len + UNIV_PAGE_SIZE)); + + ut_a(combined_buf2); + + combined_buf = static_cast<byte*>( + ut_align(combined_buf2, UNIV_PAGE_SIZE)); + } + + /* We release the array mutex for the time of the i/o: NOTE that + this assumes that there is just one i/o-handler thread serving + a single segment of slots! 
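+	Because no other thread consumes slots from this segment, the
+	reserved slots and the combined buffer cannot change under us
+	while the synchronous call below runs outside the mutex.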
*/ + + os_mutex_exit(array->mutex); + + if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) { + /* Copy the buffers to the combined buffer */ + offs = 0; + + for (ulint i = 0; i < n_consecutive; i++) { + + ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf, + consecutive_ios[i]->len); + + offs += consecutive_ios[i]->len; + } + } + + srv_set_io_thread_op_info(global_segment, "doing file i/o"); + + /* Do the i/o with ordinary, synchronous i/o functions: */ + if (aio_slot->type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + ret = os_file_write( + aio_slot->name, aio_slot->file, combined_buf, + aio_slot->offset, total_len); + } else { + ret = os_file_read( + aio_slot->file, combined_buf, + aio_slot->offset, total_len); + } + + ut_a(ret); + srv_set_io_thread_op_info(global_segment, "file i/o done"); + + if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) { + /* Copy the combined buffer to individual buffers */ + offs = 0; + + for (ulint i = 0; i < n_consecutive; i++) { + + ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs, + consecutive_ios[i]->len); + offs += consecutive_ios[i]->len; + } + } + + if (combined_buf2) { + ut_free(combined_buf2); + } + + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (ulint i = 0; i < n_consecutive; i++) { + consecutive_ios[i]->io_already_done = TRUE; + } + + /* We return the messages for the first slot now, and if there were + several slots, the messages will be returned with subsequent calls + of this function */ + +slot_io_done: + + ut_a(aio_slot->reserved); + + *message1 = aio_slot->message1; + *message2 = aio_slot->message2; + + *type = aio_slot->type; + *space_id = aio_slot->space_id; + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, aio_slot); + + return(ret); + +wait_for_io: + srv_set_io_thread_op_info(global_segment, "resetting wait event"); + + /* We wait here until there again can be i/os in the segment + of this thread */ + + os_event_reset(os_aio_segment_wait_events[global_segment]); + + os_mutex_exit(array->mutex); + +recommended_sleep: + srv_set_io_thread_op_info(global_segment, "waiting for i/o request"); + + os_event_wait(os_aio_segment_wait_events[global_segment]); + + goto restart; +} + +/**********************************************************************//** +Validates the consistency of an aio array. +@return true if ok */ +static +bool +os_aio_array_validate( +/*==================*/ + os_aio_array_t* array) /*!< in: aio wait array */ +{ + ulint i; + ulint n_reserved = 0; + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { + n_reserved++; + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + os_mutex_exit(array->mutex); + + return(true); +} + +/**********************************************************************//** +Validates the consistency the aio system. 
+@return TRUE if ok */ +UNIV_INTERN +ibool +os_aio_validate(void) +/*=================*/ +{ + os_aio_array_validate(os_aio_read_array); + + if (os_aio_write_array != 0) { + os_aio_array_validate(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_validate(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_validate(os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_validate(os_aio_sync_array); + } + + return(TRUE); +} + +/**********************************************************************//** +Prints pending IO requests per segment of an aio array. +We probably don't need per segment statistics but they can help us +during development phase to see if the IO requests are being +distributed as expected. */ +static +void +os_aio_print_segment_info( +/*======================*/ + FILE* file, /*!< in: file where to print */ + ulint* n_seg, /*!< in: pending IO array */ + os_aio_array_t* array) /*!< in: array to process */ +{ + ulint i; + + ut_ad(array); + ut_ad(n_seg); + ut_ad(array->n_segments > 0); + + if (array->n_segments == 1) { + return; + } + + fprintf(file, " ["); + for (i = 0; i < array->n_segments; i++) { + if (i != 0) { + fprintf(file, ", "); + } + + fprintf(file, "%lu", n_seg[i]); + } + fprintf(file, "] "); +} + +/**********************************************************************//** +Prints info about the aio array. */ +UNIV_INTERN +void +os_aio_print_array( +/*==============*/ + FILE* file, /*!< in: file where to print */ + os_aio_array_t* array) /*!< in: aio array to print */ +{ + ulint n_reserved = 0; + ulint n_res_seg[SRV_MAX_N_IO_THREADS]; + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + memset(n_res_seg, 0x0, sizeof(n_res_seg)); + + for (ulint i = 0; i < array->n_slots; ++i) { + os_aio_slot_t* slot; + ulint seg_no; + + slot = os_aio_array_get_nth_slot(array, i); + + seg_no = (i * array->n_segments) / array->n_slots; + + if (slot->reserved) { + ++n_reserved; + ++n_res_seg[seg_no]; + + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + fprintf(file, " %lu", (ulong) n_reserved); + + os_aio_print_segment_info(file, n_res_seg, array); + + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Prints info of the aio arrays. 
*/ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file) /*!< in: file where to print */ +{ + time_t current_time; + double time_elapsed; + double avg_bytes_read; + + for (ulint i = 0; i < srv_n_file_io_threads; ++i) { + fprintf(file, "I/O thread %lu state: %s (%s)", + (ulong) i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); + +#ifndef __WIN__ + if (os_aio_segment_wait_events[i]->is_set) { + fprintf(file, " ev set"); + } +#endif /* __WIN__ */ + + fprintf(file, "\n"); + } + + fputs("Pending normal aio reads:", file); + + os_aio_print_array(file, os_aio_read_array); + + if (os_aio_write_array != 0) { + fputs(", aio writes:", file); + os_aio_print_array(file, os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + fputs(",\n ibuf aio reads:", file); + os_aio_print_array(file, os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + fputs(", log i/o's:", file); + os_aio_print_array(file, os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + fputs(", sync i/o's:", file); + os_aio_print_array(file, os_aio_sync_array); + } + + putc('\n', file); + current_time = ut_time(); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" + "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, + (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); + + if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { + fprintf(file, + "%lu pending preads, %lu pending pwrites\n", + (ulong) os_file_n_pending_preads, + (ulong) os_file_n_pending_pwrites); + } + + if (os_n_file_reads == os_n_file_reads_old) { + avg_bytes_read = 0.0; + } else { + avg_bytes_read = (double) os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + } + + fprintf(file, + "%.2f reads/s, %lu avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + (os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + (ulong) avg_bytes_read, + (os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +os_aio_refresh_stats(void) +/*======================*/ +{ + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. 
+@return TRUE if all free */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void)
+/*=======================*/
+{
+	os_aio_array_t*	array;
+	ulint		n_res	= 0;
+
+	array = os_aio_read_array;
+
+	os_mutex_enter(array->mutex);
+
+	n_res += array->n_reserved;
+
+	os_mutex_exit(array->mutex);
+
+	if (!srv_read_only_mode) {
+		ut_a(os_aio_write_array != 0);
+
+		array = os_aio_write_array;
+
+		os_mutex_enter(array->mutex);
+
+		n_res += array->n_reserved;
+
+		os_mutex_exit(array->mutex);
+
+		ut_a(os_aio_ibuf_array != 0);
+
+		array = os_aio_ibuf_array;
+
+		os_mutex_enter(array->mutex);
+
+		n_res += array->n_reserved;
+
+		os_mutex_exit(array->mutex);
+	}
+
+	ut_a(os_aio_log_array != 0);
+
+	array = os_aio_log_array;
+
+	os_mutex_enter(array->mutex);
+
+	n_res += array->n_reserved;
+
+	os_mutex_exit(array->mutex);
+
+	array = os_aio_sync_array;
+
+	os_mutex_enter(array->mutex);
+
+	n_res += array->n_reserved;
+
+	os_mutex_exit(array->mutex);
+
+	if (n_res == 0) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/os/os0proc.cc b/storage/xtradb/os/os0proc.cc
new file mode 100644
index 00000000000..ec629430baf
--- /dev/null
+++ b/storage/xtradb/os/os0proc.cc
@@ -0,0 +1,284 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0proc.cc
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0proc.h"
+#ifdef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#include "ut0mem.h"
+#include "ut0byte.h"
+
+/* Linux release version */
+#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
+#include <string.h>		/* strverscmp() */
+#include <sys/utsname.h>	/* uname() */
+#endif
+
+/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and
+MAP_ANON but MAP_ANON is marked as deprecated */
+#if defined(MAP_ANONYMOUS)
+#define OS_MAP_ANON	MAP_ANONYMOUS
+#elif defined(MAP_ANON)
+#define OS_MAP_ANON	MAP_ANON
+#endif
+
+/* Linux's MAP_POPULATE */
+#if defined(MAP_POPULATE)
+#define OS_MAP_POPULATE	MAP_POPULATE
+#else
+#define OS_MAP_POPULATE	0
+#endif
+
+UNIV_INTERN ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+UNIV_INTERN ulint os_large_page_size;
+
+/****************************************************************//**
+Converts the current process id to a number. It is not guaranteed that the
+number is unique. In Linux returns the 'process number' of the current
+thread. That number is the same as one sees in 'top', for example.
In Linux +the thread id is not the same as one sees in 'top'. +@return process id as a number */ +UNIV_INTERN +ulint +os_proc_get_number(void) +/*====================*/ +{ +#ifdef __WIN__ + return((ulint)GetCurrentProcessId()); +#else + return((ulint) getpid()); +#endif +} + +/****************************************************************//** +Retrieve and compare operating system release. +@return TRUE if the OS release is equal to, or later than release. */ +UNIV_INTERN +ibool +os_compare_release( +/*===============*/ + const char* release /*!< in: OS release */ + __attribute__((unused))) +{ +#if defined(UNIV_LINUX) && defined(_GNU_SOURCE) + struct utsname name; + return uname(&name) == 0 && strverscmp(name.release, release) >= 0; +#else + return 0; +#endif +} + +/****************************************************************//** +Allocates large pages memory. +@return allocated memory */ +UNIV_INTERN +void* +os_mem_alloc_large( +/*===============*/ + ulint* n, /*!< in/out: number of bytes */ + ibool populate) /*!< in: virtual page preallocation */ +{ + void* ptr; + ulint size; +#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX + int shmid; + struct shmid_ds buf; + + if (!os_use_large_pages || !os_large_page_size) { + goto skip; + } + + /* Align block size to os_large_page_size */ + ut_ad(ut_is_2pow(os_large_page_size)); + size = ut_2pow_round(*n + (os_large_page_size - 1), + os_large_page_size); + + shmid = shmget(IPC_PRIVATE, (size_t) size, SHM_HUGETLB | SHM_R | SHM_W); + if (shmid < 0) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate" + " %lu bytes. errno %d\n", size, errno); + ptr = NULL; + } else { + ptr = shmat(shmid, NULL, 0); + if (ptr == (void*)-1) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to" + " attach shared memory segment, errno %d\n", + errno); + ptr = NULL; + } + + /* Remove the shared memory segment so that it will be + automatically freed after memory is detached or + process exits */ + shmctl(shmid, IPC_RMID, &buf); + } + + if (ptr) { + *n = size; + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_ALLOC(ptr, size); + return(ptr); + } + + fprintf(stderr, "InnoDB HugeTLB: Warning: Using conventional" + " memory pool\n"); +skip: +#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */ + +#ifdef __WIN__ + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + + /* Align block size to system page size */ + ut_ad(ut_is_2pow(system_info.dwPageSize)); + /* system_info.dwPageSize is only 32-bit. Casting to ulint is required + on 64-bit Windows. */ + size = *n = ut_2pow_round(*n + (system_info.dwPageSize - 1), + (ulint) system_info.dwPageSize); + ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, + PAGE_READWRITE); + if (!ptr) { + fprintf(stderr, "InnoDB: VirtualAlloc(%lu bytes) failed;" + " Windows error %lu\n", + (ulong) size, (ulong) GetLastError()); + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_ALLOC(ptr, size); + } +#elif !defined OS_MAP_ANON + size = *n; + ptr = ut_malloc_low(size, TRUE, FALSE); +#else +# ifdef HAVE_GETPAGESIZE + size = getpagesize(); +# else + size = UNIV_PAGE_SIZE; +# endif + /* Align block size to system page size */ + ut_ad(ut_is_2pow(size)); + size = *n = ut_2pow_round(*n + (size - 1), size); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | OS_MAP_ANON | + (populate ? 
OS_MAP_POPULATE : 0), -1, 0); + if (UNIV_UNLIKELY(ptr == (void*) -1)) { + fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;" + " errno %lu\n", + (ulong) size, (ulong) errno); + return(NULL); + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_ALLOC(ptr, size); + } +#endif + +#if OS_MAP_ANON && OS_MAP_POPULATE + /* MAP_POPULATE is only supported for private mappings + since Linux 2.6.23. */ + populate = populate && !os_compare_release("2.6.23"); + + if (populate) { + fprintf(stderr, "InnoDB: Warning: mmap(MAP_POPULATE) " + "is not supported for private mappings. " + "Forcing preallocation by faulting in pages.\n"); + } +#endif + + /* Initialize the entire buffer to force the allocation + of physical memory page frames. */ + if (populate) { + memset(ptr, '\0', size); + } + + return(ptr); +} + +/****************************************************************//** +Frees large pages memory. */ +UNIV_INTERN +void +os_mem_free_large( +/*==============*/ + void *ptr, /*!< in: pointer returned by + os_mem_alloc_large() */ + ulint size) /*!< in: size returned by + os_mem_alloc_large() */ +{ + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + os_fast_mutex_unlock(&ut_list_mutex); + +#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX + if (os_use_large_pages && os_large_page_size && !shmdt(ptr)) { + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + ut_total_allocated_memory -= size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_FREE(ptr, size); + return; + } +#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */ +#ifdef __WIN__ + /* When RELEASE memory, the size parameter must be 0. + Do not use MEM_RELEASE with MEM_DECOMMIT. */ + if (!VirtualFree(ptr, 0, MEM_RELEASE)) { + fprintf(stderr, "InnoDB: VirtualFree(%p, %lu) failed;" + " Windows error %lu\n", + ptr, (ulong) size, (ulong) GetLastError()); + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + ut_total_allocated_memory -= size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_FREE(ptr, size); + } +#elif !defined OS_MAP_ANON + ut_free(ptr); +#else +# if defined(UNIV_SOLARIS) + if (munmap(static_cast<caddr_t>(ptr), size)) { +# else + if (munmap(ptr, size)) { +# endif /* UNIV_SOLARIS */ + fprintf(stderr, "InnoDB: munmap(%p, %lu) failed;" + " errno %lu\n", + ptr, (ulong) size, (ulong) errno); + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + ut_total_allocated_memory -= size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_FREE(ptr, size); + } +#endif +} diff --git a/storage/xtradb/os/os0sync.cc b/storage/xtradb/os/os0sync.cc new file mode 100644 index 00000000000..e42c5900c0c --- /dev/null +++ b/storage/xtradb/os/os0sync.cc @@ -0,0 +1,934 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file os/os0sync.cc +The interface to the operating system +synchronization primitives. + +Created 9/6/1995 Heikki Tuuri +*******************************************************/ + +#include "os0sync.h" +#ifdef UNIV_NONINL +#include "os0sync.ic" +#endif + +#ifdef __WIN__ +#include <windows.h> +#endif + +#include "ut0mem.h" +#include "srv0start.h" +#include "srv0srv.h" + +/* Type definition for an operating system mutex struct */ +struct os_mutex_t{ + os_event_t event; /*!< Used by sync0arr.cc for queing threads */ + void* handle; /*!< OS handle to mutex */ + ulint count; /*!< we use this counter to check + that the same thread does not + recursively lock the mutex: we + do not assume that the OS mutex + supports recursive locking, though + NT seems to do that */ + UT_LIST_NODE_T(os_mutex_t) os_mutex_list; + /* list of all 'slow' OS mutexes created */ +}; + +/** Mutex protecting counts and the lists of OS mutexes and events */ +UNIV_INTERN os_ib_mutex_t os_sync_mutex; +/** TRUE if os_sync_mutex has been initialized */ +static ibool os_sync_mutex_inited = FALSE; +/** TRUE when os_sync_free() is being executed */ +static ibool os_sync_free_called = FALSE; + +/** This is incremented by 1 in os_thread_create and decremented by 1 in +os_thread_exit */ +UNIV_INTERN ulint os_thread_count = 0; + +/** The list of all events created */ +static UT_LIST_BASE_NODE_T(os_event) os_event_list; + +/** The list of all OS 'slow' mutexes */ +static UT_LIST_BASE_NODE_T(os_mutex_t) os_mutex_list; + +UNIV_INTERN ulint os_event_count = 0; +UNIV_INTERN ulint os_mutex_count = 0; +UNIV_INTERN ulint os_fast_mutex_count = 0; + +/* The number of microsecnds in a second. */ +static const ulint MICROSECS_IN_A_SECOND = 1000000; + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t event_os_mutex_key; +UNIV_INTERN mysql_pfs_key_t os_mutex_key; +#endif + +/* Because a mutex is embedded inside an event and there is an +event embedded inside a mutex, on free, this generates a recursive call. +This version of the free event function doesn't acquire the global lock */ +static void os_event_free_internal(os_event_t event); + +/* On Windows (Vista and later), load function pointers for condition +variable handling. Those functions are not available in prior versions, +so we have to use them via runtime loading, as long as we support XP. 
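+(Editor's note, not part of the original source: the actual loading is
+done by os_cond_module_init() below, which resolves the four function
+pointers from kernel32 via GetProcAddress() when
+srv_use_native_conditions is set.)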
*/ +static void os_cond_module_init(void); + +#ifdef __WIN__ +/* Prototypes and function pointers for condition variable functions */ +typedef VOID (WINAPI* InitializeConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static InitializeConditionVariableProc initialize_condition_variable; + +typedef BOOL (WINAPI* SleepConditionVariableCSProc) + (PCONDITION_VARIABLE ConditionVariable, + PCRITICAL_SECTION CriticalSection, + DWORD dwMilliseconds); +static SleepConditionVariableCSProc sleep_condition_variable; + +typedef VOID (WINAPI* WakeAllConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static WakeAllConditionVariableProc wake_all_condition_variable; + +typedef VOID (WINAPI* WakeConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static WakeConditionVariableProc wake_condition_variable; +#endif + +/*********************************************************//** +Initialitze condition variable */ +UNIV_INLINE +void +os_cond_init( +/*=========*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(initialize_condition_variable != NULL); + initialize_condition_variable(cond); +#else + ut_a(pthread_cond_init(cond, NULL) == 0); +#endif +} + +/*********************************************************//** +Do a timed wait on condition variable. +@return TRUE if timed out, FALSE otherwise */ +UNIV_INLINE +ibool +os_cond_wait_timed( +/*===============*/ + os_cond_t* cond, /*!< in: condition variable. */ + os_fast_mutex_t* fast_mutex, /*!< in: fast mutex */ +#ifndef __WIN__ + const struct timespec* abstime /*!< in: timeout */ +#else + DWORD time_in_ms /*!< in: timeout in + milliseconds*/ +#endif /* !__WIN__ */ +) +{ + fast_mutex_t* mutex = &fast_mutex->mutex; +#ifdef __WIN__ + BOOL ret; + DWORD err; + + ut_a(sleep_condition_variable != NULL); + + ret = sleep_condition_variable(cond, mutex, time_in_ms); + + if (!ret) { + err = GetLastError(); + /* From http://msdn.microsoft.com/en-us/library/ms686301%28VS.85%29.aspx, + "Condition variables are subject to spurious wakeups + (those not associated with an explicit wake) and stolen wakeups + (another thread manages to run before the woken thread)." + Check for both types of timeouts. + Conditions are checked by the caller.*/ + if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) { + return(TRUE); + } + } + + ut_a(ret); + + return(FALSE); +#else + int ret; + + ret = pthread_cond_timedwait(cond, mutex, abstime); + + switch (ret) { + case 0: + case ETIMEDOUT: + /* We play it safe by checking for EINTR even though + according to the POSIX documentation it can't return EINTR. */ + case EINTR: + break; + + default: + fprintf(stderr, " InnoDB: pthread_cond_timedwait() returned: " + "%d: abstime={%lu,%lu}\n", + ret, (ulong) abstime->tv_sec, (ulong) abstime->tv_nsec); + ut_error; + } + + return(ret == ETIMEDOUT); +#endif +} +/*********************************************************//** +Wait on condition variable */ +UNIV_INLINE +void +os_cond_wait( +/*=========*/ + os_cond_t* cond, /*!< in: condition variable. 
*/ + os_fast_mutex_t* fast_mutex)/*!< in: fast mutex */ +{ + fast_mutex_t* mutex = &fast_mutex->mutex; + ut_a(cond); + ut_a(mutex); + +#ifdef __WIN__ + ut_a(sleep_condition_variable != NULL); + ut_a(sleep_condition_variable(cond, mutex, INFINITE)); +#else + ut_a(pthread_cond_wait(cond, mutex) == 0); +#endif +} + +/*********************************************************//** +Wakes all threads waiting for condition variable */ +UNIV_INLINE +void +os_cond_broadcast( +/*==============*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(wake_all_condition_variable != NULL); + wake_all_condition_variable(cond); +#else + ut_a(pthread_cond_broadcast(cond) == 0); +#endif +} + +/*********************************************************//** +Wakes one thread waiting for condition variable */ +UNIV_INLINE +void +os_cond_signal( +/*==========*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(wake_condition_variable != NULL); + wake_condition_variable(cond); +#else + ut_a(pthread_cond_signal(cond) == 0); +#endif +} + +/*********************************************************//** +Destroys condition variable */ +UNIV_INLINE +void +os_cond_destroy( +/*============*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ +#ifdef __WIN__ + /* Do nothing */ +#else + ut_a(pthread_cond_destroy(cond) == 0); +#endif +} + +/*********************************************************//** +On Windows (Vista and later), load function pointers for condition variable +handling. Those functions are not available in prior versions, so we have to +use them via runtime loading, as long as we support XP. */ +static +void +os_cond_module_init(void) +/*=====================*/ +{ +#ifdef __WIN__ + HMODULE h_dll; + + if (!srv_use_native_conditions) + return; + + h_dll = GetModuleHandle("kernel32"); + + initialize_condition_variable = (InitializeConditionVariableProc) + GetProcAddress(h_dll, "InitializeConditionVariable"); + sleep_condition_variable = (SleepConditionVariableCSProc) + GetProcAddress(h_dll, "SleepConditionVariableCS"); + wake_all_condition_variable = (WakeAllConditionVariableProc) + GetProcAddress(h_dll, "WakeAllConditionVariable"); + wake_condition_variable = (WakeConditionVariableProc) + GetProcAddress(h_dll, "WakeConditionVariable"); + + /* When using native condition variables, check function pointers */ + ut_a(initialize_condition_variable); + ut_a(sleep_condition_variable); + ut_a(wake_all_condition_variable); + ut_a(wake_condition_variable); +#endif +} + +/*********************************************************//** +Initializes global event and OS 'slow' mutex lists. */ +UNIV_INTERN +void +os_sync_init(void) +/*==============*/ +{ + UT_LIST_INIT(os_event_list); + UT_LIST_INIT(os_mutex_list); + + os_sync_mutex = NULL; + os_sync_mutex_inited = FALSE; + + /* Now for Windows only */ + os_cond_module_init(); + + os_sync_mutex = os_mutex_create(); + + os_sync_mutex_inited = TRUE; +} + +/*********************************************************//** +Frees created events and OS 'slow' mutexes. 
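+(Editor's note, not part of the original source: this is the shutdown
+counterpart of os_sync_init(); it drains the global os_event_list and
+os_mutex_list, clearing os_sync_mutex_inited when os_sync_mutex's own
+turn comes so that no further locking is attempted.)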
*/ +UNIV_INTERN +void +os_sync_free(void) +/*==============*/ +{ + os_event_t event; + os_ib_mutex_t mutex; + + os_sync_free_called = TRUE; + event = UT_LIST_GET_FIRST(os_event_list); + + while (event) { + + os_event_free(event); + + event = UT_LIST_GET_FIRST(os_event_list); + } + + mutex = UT_LIST_GET_FIRST(os_mutex_list); + + while (mutex) { + if (mutex == os_sync_mutex) { + /* Set the flag to FALSE so that we do not try to + reserve os_sync_mutex any more in remaining freeing + operations in shutdown */ + os_sync_mutex_inited = FALSE; + } + + os_mutex_free(mutex); + + mutex = UT_LIST_GET_FIRST(os_mutex_list); + } + os_sync_free_called = FALSE; +} + +/*********************************************************//** +Creates an event semaphore, i.e., a semaphore which may just have two +states: signaled and nonsignaled. The created event is manual reset: it +must be reset explicitly by calling sync_os_reset_event. +@return the event handle */ +UNIV_INTERN +os_event_t +os_event_create(void) +/*==================*/ +{ + os_event_t event; + +#ifdef __WIN__ + if(!srv_use_native_conditions) { + + event = static_cast<os_event_t>(ut_malloc(sizeof(*event))); + + event->handle = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!event->handle) { + fprintf(stderr, + "InnoDB: Could not create a Windows event" + " semaphore; Windows error %lu\n", + (ulong) GetLastError()); + } + } else /* Windows with condition variables */ +#endif + { + event = static_cast<os_event_t>(ut_malloc(sizeof *event)); + +#ifndef PFS_SKIP_EVENT_MUTEX + os_fast_mutex_init(event_os_mutex_key, &event->os_mutex); +#else + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &event->os_mutex); +#endif + + os_cond_init(&(event->cond_var)); + + event->is_set = FALSE; + + /* We return this value in os_event_reset(), which can then be + be used to pass to the os_event_wait_low(). The value of zero + is reserved in os_event_wait_low() for the case when the + caller does not want to pass any signal_count value. To + distinguish between the two cases we initialize signal_count + to 1 here. */ + event->signal_count = 1; + } + + /* The os_sync_mutex can be NULL because during startup an event + can be created [ because it's embedded in the mutex/rwlock ] before + this module has been initialized */ + if (os_sync_mutex != NULL) { + os_mutex_enter(os_sync_mutex); + } + + /* Put to the list of events */ + UT_LIST_ADD_FIRST(os_event_list, os_event_list, event); + + os_event_count++; + + if (os_sync_mutex != NULL) { + os_mutex_exit(os_sync_mutex); + } + + return(event); +} + +/**********************************************************//** +Sets an event semaphore to the signaled state: lets waiting threads +proceed. */ +UNIV_INTERN +void +os_event_set( +/*=========*/ + os_event_t event) /*!< in: event to set */ +{ + ut_a(event); + +#ifdef __WIN__ + if (!srv_use_native_conditions) { + ut_a(SetEvent(event->handle)); + return; + } +#endif + + os_fast_mutex_lock(&(event->os_mutex)); + + if (event->is_set) { + /* Do nothing */ + } else { + event->is_set = TRUE; + event->signal_count += 1; + os_cond_broadcast(&(event->cond_var)); + } + + os_fast_mutex_unlock(&(event->os_mutex)); +} + +/**********************************************************//** +Resets an event semaphore to the nonsignaled state. Waiting threads will +stop to wait for the event. +The return value should be passed to os_even_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. 
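+A minimal illustrative pattern (an editor's sketch, not part of the
+original source; event is an os_event_t from os_event_create() and
+condition_holds() is a hypothetical predicate):
+
+	ib_int64_t sig = os_event_reset(event);
+	if (!condition_holds()) {
+		os_event_wait_low(event, sig);
+	}
+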
See comments for os_event_wait_low(). +@return current signal_count. */ +UNIV_INTERN +ib_int64_t +os_event_reset( +/*===========*/ + os_event_t event) /*!< in: event to reset */ +{ + ib_int64_t ret = 0; + + ut_a(event); + +#ifdef __WIN__ + if(!srv_use_native_conditions) { + ut_a(ResetEvent(event->handle)); + return(0); + } +#endif + + os_fast_mutex_lock(&(event->os_mutex)); + + if (!event->is_set) { + /* Do nothing */ + } else { + event->is_set = FALSE; + } + ret = event->signal_count; + + os_fast_mutex_unlock(&(event->os_mutex)); + return(ret); +} + +/**********************************************************//** +Frees an event object, without acquiring the global lock. */ +static +void +os_event_free_internal( +/*===================*/ + os_event_t event) /*!< in: event to free */ +{ +#ifdef __WIN__ + if(!srv_use_native_conditions) { + ut_a(event); + ut_a(CloseHandle(event->handle)); + } else +#endif + { + ut_a(event); + + /* This is to avoid freeing the mutex twice */ + os_fast_mutex_free(&(event->os_mutex)); + + os_cond_destroy(&(event->cond_var)); + } + + /* Remove from the list of events */ + UT_LIST_REMOVE(os_event_list, os_event_list, event); + + os_event_count--; + + ut_free(event); +} + +/**********************************************************//** +Frees an event object. */ +UNIV_INTERN +void +os_event_free( +/*==========*/ + os_event_t event) /*!< in: event to free */ + +{ + ut_a(event); +#ifdef __WIN__ + if(!srv_use_native_conditions){ + ut_a(CloseHandle(event->handle)); + } else /*Windows with condition variables */ +#endif + { + os_fast_mutex_free(&(event->os_mutex)); + + os_cond_destroy(&(event->cond_var)); + } + + /* Remove from the list of events */ + os_mutex_enter(os_sync_mutex); + + UT_LIST_REMOVE(os_event_list, os_event_list, event); + + os_event_count--; + + os_mutex_exit(os_sync_mutex); + + ut_free(event); +} + +/**********************************************************//** +Waits for an event object until it is in the signaled state. + +Typically, if the event has been signalled after the os_event_reset() +we'll return immediately because event->is_set == TRUE. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->is_set == TRUE] +thread C calls os_event_reset() [event->is_set == FALSE] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. */ +UNIV_INTERN +void +os_event_wait_low( +/*==============*/ + os_event_t event, /*!< in: event to wait */ + ib_int64_t reset_sig_count)/*!< in: zero or the value + returned by previous call of + os_event_reset(). 
*/ +{ +#ifdef __WIN__ + if(!srv_use_native_conditions) { + DWORD err; + + ut_a(event); + + UT_NOT_USED(reset_sig_count); + + /* Specify an infinite wait */ + err = WaitForSingleObject(event->handle, INFINITE); + + ut_a(err == WAIT_OBJECT_0); + return; + } +#endif + + os_fast_mutex_lock(&event->os_mutex); + + if (!reset_sig_count) { + reset_sig_count = event->signal_count; + } + + while (!event->is_set && event->signal_count == reset_sig_count) { + os_cond_wait(&(event->cond_var), &(event->os_mutex)); + + /* Solaris manual said that spurious wakeups may occur: we + have to check if the event really has been signaled after + we came here to wait */ + } + + os_fast_mutex_unlock(&event->os_mutex); +} + +/**********************************************************//** +Waits for an event object until it is in the signaled state or +a timeout is exceeded. +@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ +UNIV_INTERN +ulint +os_event_wait_time_low( +/*===================*/ + os_event_t event, /*!< in: event to wait */ + ulint time_in_usec, /*!< in: timeout in + microseconds, or + OS_SYNC_INFINITE_TIME */ + ib_int64_t reset_sig_count) /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ +{ + ibool timed_out = FALSE; + +#ifdef __WIN__ + DWORD time_in_ms; + + if (!srv_use_native_conditions) { + DWORD err; + + ut_a(event); + + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + time_in_ms = static_cast<DWORD>(time_in_usec / 1000); + err = WaitForSingleObject(event->handle, time_in_ms); + } else { + err = WaitForSingleObject(event->handle, INFINITE); + } + + if (err == WAIT_OBJECT_0) { + return(0); + } else if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) { + return(OS_SYNC_TIME_EXCEEDED); + } + + ut_error; + /* Dummy value to eliminate compiler warning. */ + return(42); + } else { + ut_a(sleep_condition_variable != NULL); + + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + time_in_ms = static_cast<DWORD>(time_in_usec / 1000); + } else { + time_in_ms = INFINITE; + } + } +#else + struct timespec abstime; + + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + struct timeval tv; + int ret; + ulint sec; + ulint usec; + + ret = ut_usectime(&sec, &usec); + ut_a(ret == 0); + + tv.tv_sec = sec; + tv.tv_usec = usec; + + tv.tv_usec += time_in_usec; + + if ((ulint) tv.tv_usec >= MICROSECS_IN_A_SECOND) { + tv.tv_sec += time_in_usec / MICROSECS_IN_A_SECOND; + tv.tv_usec %= MICROSECS_IN_A_SECOND; + } + + abstime.tv_sec = tv.tv_sec; + abstime.tv_nsec = tv.tv_usec * 1000; + } else { + abstime.tv_nsec = 999999999; + abstime.tv_sec = (time_t) ULINT_MAX; + } + + ut_a(abstime.tv_nsec <= 999999999); + +#endif /* __WIN__ */ + + os_fast_mutex_lock(&event->os_mutex); + + if (!reset_sig_count) { + reset_sig_count = event->signal_count; + } + + do { + if (event->is_set || event->signal_count != reset_sig_count) { + + break; + } + + timed_out = os_cond_wait_timed( + &event->cond_var, &event->os_mutex, +#ifndef __WIN__ + &abstime +#else + time_in_ms +#endif /* !__WIN__ */ + ); + + } while (!timed_out); + + os_fast_mutex_unlock(&event->os_mutex); + + return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0); +} + +/*********************************************************//** +Creates an operating system mutex semaphore. Because these are slow, the +mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible. 
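+An illustrative lifecycle (editor's sketch, not part of the original
+source), using only functions defined in this file:
+
+	os_ib_mutex_t m = os_mutex_create();
+	os_mutex_enter(m);	/* not recursive: count is asserted == 1 */
+	os_mutex_exit(m);
+	os_mutex_free(m);
+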
+@return the mutex handle */ +UNIV_INTERN +os_ib_mutex_t +os_mutex_create(void) +/*=================*/ +{ + os_fast_mutex_t* mutex; + os_ib_mutex_t mutex_str; + + mutex = static_cast<os_fast_mutex_t*>( + ut_malloc(sizeof(os_fast_mutex_t))); + + os_fast_mutex_init(os_mutex_key, mutex); + + mutex_str = static_cast<os_ib_mutex_t>(ut_malloc(sizeof *mutex_str)); + + mutex_str->handle = mutex; + mutex_str->count = 0; + mutex_str->event = os_event_create(); + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + /* When creating os_sync_mutex itself we cannot reserve it */ + os_mutex_enter(os_sync_mutex); + } + + UT_LIST_ADD_FIRST(os_mutex_list, os_mutex_list, mutex_str); + + os_mutex_count++; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } + + return(mutex_str); +} + +/**********************************************************//** +Acquires ownership of a mutex semaphore. */ +UNIV_INTERN +void +os_mutex_enter( +/*===========*/ + os_ib_mutex_t mutex) /*!< in: mutex to acquire */ +{ + os_fast_mutex_lock(static_cast<os_fast_mutex_t*>(mutex->handle)); + + (mutex->count)++; + + ut_a(mutex->count == 1); +} + +/**********************************************************//** +Releases ownership of a mutex. */ +UNIV_INTERN +void +os_mutex_exit( +/*==========*/ + os_ib_mutex_t mutex) /*!< in: mutex to release */ +{ + ut_a(mutex); + + ut_a(mutex->count == 1); + + (mutex->count)--; + os_fast_mutex_unlock(static_cast<os_fast_mutex_t*>(mutex->handle)); +} + +/**********************************************************//** +Frees a mutex object. */ +UNIV_INTERN +void +os_mutex_free( +/*==========*/ + os_ib_mutex_t mutex) /*!< in: mutex to free */ +{ + ut_a(mutex); + + if (UNIV_LIKELY(!os_sync_free_called)) { + os_event_free_internal(mutex->event); + } + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_enter(os_sync_mutex); + } + + UT_LIST_REMOVE(os_mutex_list, os_mutex_list, mutex); + + os_mutex_count--; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } + + os_fast_mutex_free(static_cast<os_fast_mutex_t*>(mutex->handle)); + ut_free(mutex->handle); + ut_free(mutex); +} + +/*********************************************************//** +Initializes an operating system fast mutex semaphore. */ +UNIV_INTERN +void +os_fast_mutex_init_func( +/*====================*/ + fast_mutex_t* fast_mutex) /*!< in: fast mutex */ +{ +#ifdef __WIN__ + ut_a(fast_mutex); + + InitializeCriticalSection((LPCRITICAL_SECTION) fast_mutex); +#else + ut_a(0 == pthread_mutex_init(fast_mutex, MY_MUTEX_INIT_FAST)); +#endif + if (UNIV_LIKELY(os_sync_mutex_inited)) { + /* When creating os_sync_mutex itself (in Unix) we cannot + reserve it */ + + os_mutex_enter(os_sync_mutex); + } + + os_fast_mutex_count++; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } +} + +/**********************************************************//** +Acquires ownership of a fast mutex. */ +UNIV_INTERN +void +os_fast_mutex_lock_func( +/*====================*/ + fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */ +{ +#ifdef __WIN__ + EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex); +#else + pthread_mutex_lock(fast_mutex); +#endif +} + +/**********************************************************//** +Releases ownership of a fast mutex. 
*/ +UNIV_INTERN +void +os_fast_mutex_unlock_func( +/*======================*/ + fast_mutex_t* fast_mutex) /*!< in: mutex to release */ +{ +#ifdef __WIN__ + LeaveCriticalSection(fast_mutex); +#else + pthread_mutex_unlock(fast_mutex); +#endif +} + +/**********************************************************//** +Frees a mutex object. */ +UNIV_INTERN +void +os_fast_mutex_free_func( +/*====================*/ + fast_mutex_t* fast_mutex) /*!< in: mutex to free */ +{ +#ifdef __WIN__ + ut_a(fast_mutex); + + DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex); +#else + int ret; + + ret = pthread_mutex_destroy(fast_mutex); + + if (UNIV_UNLIKELY(ret != 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: error: return value %lu when calling\n" + "InnoDB: pthread_mutex_destroy().\n", (ulint) ret); + fprintf(stderr, + "InnoDB: Byte contents of the pthread mutex at %p:\n", + (void*) fast_mutex); + ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t)); + putc('\n', stderr); + } +#endif + if (UNIV_LIKELY(os_sync_mutex_inited)) { + /* When freeing the last mutexes, we have + already freed os_sync_mutex */ + + os_mutex_enter(os_sync_mutex); + } + + ut_ad(os_fast_mutex_count > 0); + os_fast_mutex_count--; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } +} diff --git a/storage/xtradb/os/os0thread.cc b/storage/xtradb/os/os0thread.cc new file mode 100644 index 00000000000..a862022693c --- /dev/null +++ b/storage/xtradb/os/os0thread.cc @@ -0,0 +1,315 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file os/os0thread.cc +The interface to the operating system thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ + +#include "os0thread.h" +#ifdef UNIV_NONINL +#include "os0thread.ic" +#endif + +#ifdef __WIN__ +#include <windows.h> +#elif UNIV_LINUX +#include <sys/time.h> +#include <sys/resource.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/types.h> +#endif + +#ifndef UNIV_HOTBACKUP +#include "srv0srv.h" +#include "os0sync.h" + +/***************************************************************//** +Compares two thread ids for equality. +@return TRUE if equal */ +UNIV_INTERN +ibool +os_thread_eq( +/*=========*/ + os_thread_id_t a, /*!< in: OS thread or thread id */ + os_thread_id_t b) /*!< in: OS thread or thread id */ +{ +#ifdef __WIN__ + if (a == b) { + return(TRUE); + } + + return(FALSE); +#else + if (pthread_equal(a, b)) { + return(TRUE); + } + + return(FALSE); +#endif +} + +/****************************************************************//** +Converts an OS thread id to a ulint. 
It is NOT guaranteed that the ulint is +unique for the thread though! +@return thread identifier as a number */ +UNIV_INTERN +ulint +os_thread_pf( +/*=========*/ + os_thread_id_t a) /*!< in: OS thread identifier */ +{ +#ifdef UNIV_HPUX10 + /* In HP-UX-10.20 a pthread_t is a struct of 3 fields: field1, field2, + field3. We do not know if field1 determines the thread uniquely. */ + + return((ulint)(a.field1)); +#else + return((ulint) a); +#endif +} + +/*****************************************************************//** +Returns the thread identifier of current thread. Currently the thread +identifier in Unix is the thread handle itself. Note that in HP-UX +pthread_t is a struct of 3 fields. +@return current thread identifier */ +UNIV_INTERN +os_thread_id_t +os_thread_get_curr_id(void) +/*=======================*/ +{ +#ifdef __WIN__ + return(GetCurrentThreadId()); +#else + return(pthread_self()); +#endif +} + +/*****************************************************************//** +Returns the system-specific thread identifier of current thread. On Linux, +returns tid. On other systems currently returns os_thread_get_curr_id(). + +@return current thread identifier */ +UNIV_INTERN +os_tid_t +os_thread_get_tid(void) +/*===================*/ +{ +#ifdef UNIV_LINUX + return((os_tid_t)syscall(SYS_gettid)); +#else + return(os_thread_get_curr_id()); +#endif +} + + +/****************************************************************//** +Creates a new thread of execution. The execution starts from +the function given. The start function takes a void* parameter +and returns an ulint. +@return handle to the thread */ +UNIV_INTERN +os_thread_t +os_thread_create_func( +/*==================*/ + os_thread_func_t func, /*!< in: pointer to function + from which to start */ + void* arg, /*!< in: argument to start + function */ + os_thread_id_t* thread_id) /*!< out: id of the created + thread, or NULL */ +{ + /* the new thread should look recent changes up here so far. */ + os_wmb; + +#ifdef __WIN__ + os_thread_t thread; + DWORD win_thread_id; + + os_mutex_enter(os_sync_mutex); + os_thread_count++; + os_mutex_exit(os_sync_mutex); + + thread = CreateThread(NULL, /* no security attributes */ + 0, /* default size stack */ + func, + arg, + 0, /* thread runs immediately */ + &win_thread_id); + + if (thread_id) { + *thread_id = win_thread_id; + } + + return(thread); +#else + int ret; + os_thread_t pthread; + pthread_attr_t attr; + +#ifndef UNIV_HPUX10 + pthread_attr_init(&attr); +#endif + +#ifdef UNIV_AIX + /* We must make sure a thread stack is at least 32 kB, otherwise + InnoDB might crash; we do not know if the default stack size on + AIX is always big enough. An empirical test on AIX-4.3 suggested + the size was 96 kB, though. 
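+(Editor's note, not part of the original source: the call below
+requests PTHREAD_STACK_MIN plus 32 kB, i.e. 32 kB on top of the
+platform minimum, not a flat 32 kB stack.)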
*/ + + ret = pthread_attr_setstacksize(&attr, + (size_t)(PTHREAD_STACK_MIN + + 32 * 1024)); + if (ret) { + fprintf(stderr, + "InnoDB: Error: pthread_attr_setstacksize" + " returned %d\n", ret); + exit(1); + } +#endif + os_mutex_enter(os_sync_mutex); + os_thread_count++; + os_mutex_exit(os_sync_mutex); + +#ifdef UNIV_HPUX10 + ret = pthread_create(&pthread, pthread_attr_default, func, arg); +#else + ret = pthread_create(&pthread, &attr, func, arg); +#endif + if (ret) { + fprintf(stderr, + "InnoDB: Error: pthread_create returned %d\n", ret); + exit(1); + } + +#ifndef UNIV_HPUX10 + pthread_attr_destroy(&attr); +#endif + + ut_a(os_thread_count <= OS_THREAD_MAX_N); + + if (thread_id) { + *thread_id = pthread; + } + + return(pthread); +#endif +} + +/*****************************************************************//** +Exits the current thread. */ +UNIV_INTERN +void +os_thread_exit( +/*===========*/ + void* exit_value) /*!< in: exit value; in Windows this void* + is cast as a DWORD */ +{ +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Thread exits, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif + +#ifdef UNIV_PFS_THREAD + pfs_delete_thread(); +#endif + + os_mutex_enter(os_sync_mutex); + os_thread_count--; + os_mutex_exit(os_sync_mutex); + +#ifdef __WIN__ + ExitThread((DWORD) exit_value); +#else + pthread_detach(pthread_self()); + pthread_exit(exit_value); +#endif +} + +/*****************************************************************//** +Advises the os to give up remainder of the thread's time slice. */ +UNIV_INTERN +void +os_thread_yield(void) +/*=================*/ +{ +#if defined(__WIN__) + SwitchToThread(); +#elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H)) + sched_yield(); +#elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG) + pthread_yield(); +#elif defined(HAVE_PTHREAD_YIELD_ONE_ARG) + pthread_yield(0); +#else + os_thread_sleep(0); +#endif +} +#endif /* !UNIV_HOTBACKUP */ + +/*****************************************************************//** +The thread sleeps at least the time given in microseconds. */ +UNIV_INTERN +void +os_thread_sleep( +/*============*/ + ulint tm) /*!< in: time in microseconds */ +{ +#ifdef __WIN__ + Sleep((DWORD) tm / 1000); +#else + struct timeval t; + + t.tv_sec = tm / 1000000; + t.tv_usec = tm % 1000000; + + select(0, NULL, NULL, NULL, &t); +#endif +} + +/*****************************************************************//** +Set relative scheduling priority for a given thread on Linux. Currently a +no-op on other systems. + +@return An actual thread priority after the update */ +UNIV_INTERN +ulint +os_thread_set_priority( +/*===================*/ + os_tid_t thread_id, /*!< in: thread id */ + ulint relative_priority) /*!< in: system-specific + priority value */ +{ +#ifdef UNIV_LINUX + lint thread_nice = 19 - relative_priority; + if (setpriority(PRIO_PROCESS, thread_id, thread_nice) == -1) { + ib_logf(IB_LOG_LEVEL_WARN, + "Setting thread %lu nice to %ld failed, " + "current nice %d, errno %d", + os_thread_pf(thread_id), thread_nice, + getpriority(PRIO_PROCESS, thread_id), errno); + } + return(19 - getpriority(PRIO_PROCESS, thread_id)); +#else + return(relative_priority); +#endif +} diff --git a/storage/xtradb/page/page0cur.cc b/storage/xtradb/page/page0cur.cc new file mode 100644 index 00000000000..f5f7e1299ce --- /dev/null +++ b/storage/xtradb/page/page0cur.cc @@ -0,0 +1,2145 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file page/page0cur.cc +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0cur.h" +#ifdef UNIV_NONINL +#include "page0cur.ic" +#endif + +#include "page0zip.h" +#include "btr0btr.h" +#include "mtr0log.h" +#include "log0recv.h" +#include "ut0ut.h" +#ifndef UNIV_HOTBACKUP +#include "rem0cmp.h" + +#ifdef PAGE_CUR_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT +static ulint page_cur_short_succ = 0; +# endif /* UNIV_SEARCH_PERF_STAT */ + +/*******************************************************************//** +This is a linear congruential generator PRNG. Returns a pseudo random +number between 0 and 2^64-1 inclusive. The formula and the constants +being used are: +X[n+1] = (a * X[n] + c) mod m +where: +X[0] = ut_time_us(NULL) +a = 1103515245 (3^5 * 5 * 7 * 129749) +c = 12345 (3 * 5 * 823) +m = 18446744073709551616 (2^64) + +@return number between 0 and 2^64-1 */ +static +ib_uint64_t +page_cur_lcg_prng(void) +/*===================*/ +{ +#define LCG_a 1103515245 +#define LCG_c 12345 + static ib_uint64_t lcg_current = 0; + static ibool initialized = FALSE; + + if (!initialized) { + lcg_current = (ib_uint64_t) ut_time_us(NULL); + initialized = TRUE; + } + + /* no need to "% 2^64" explicitly because lcg_current is + 64 bit and this will be done anyway */ + lcg_current = LCG_a * lcg_current + LCG_c; + + return(lcg_current); +} + +/****************************************************************//** +Tries a search shortcut based on the last insert. 
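+(Editor's note, not part of the original source: the shortcut reads the
+record pointed to by the PAGE_LAST_INSERT header field and succeeds only
+if the search tuple compares >= that record and < its successor; the
+cursor is then positioned there and the binary search over the page
+directory is skipped.)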
+@return TRUE on success */ +UNIV_INLINE +ibool +page_cur_try_search_shortcut( +/*=========================*/ + const buf_block_t* block, /*!< in: index page */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint* iup_matched_fields, + /*!< in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /*!< in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /*!< out: page cursor */ +{ + const rec_t* rec; + const rec_t* next_rec; + ulint low_match; + ulint low_bytes; + ulint up_match; + ulint up_bytes; +#ifdef UNIV_SEARCH_DEBUG + page_cur_t cursor2; +#endif + ibool success = FALSE; + const page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_check_typed(tuple)); + + rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + ut_ad(rec); + ut_ad(page_rec_is_user_rec(rec)); + + ut_pair_min(&low_match, &low_bytes, + *ilow_matched_fields, *ilow_matched_bytes, + *iup_matched_fields, *iup_matched_bytes); + + up_match = low_match; + up_bytes = low_bytes; + + if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets, + &low_match, &low_bytes) < 0) { + goto exit_func; + } + + next_rec = page_rec_get_next_const(rec); + offsets = rec_get_offsets(next_rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets, + &up_match, &up_bytes) >= 0) { + goto exit_func; + } + + page_cur_position(rec, block, cursor); + +#ifdef UNIV_SEARCH_DEBUG + page_cur_search_with_match(block, index, tuple, PAGE_CUR_DBG, + iup_matched_fields, + iup_matched_bytes, + ilow_matched_fields, + ilow_matched_bytes, + &cursor2); + ut_a(cursor2.rec == cursor->rec); + + if (!page_rec_is_supremum(next_rec)) { + + ut_a(*iup_matched_fields == up_match); + ut_a(*iup_matched_bytes == up_bytes); + } + + ut_a(*ilow_matched_fields == low_match); + ut_a(*ilow_matched_bytes == low_bytes); +#endif + if (!page_rec_is_supremum(next_rec)) { + + *iup_matched_fields = up_match; + *iup_matched_bytes = up_bytes; + } + + *ilow_matched_fields = low_match; + *ilow_matched_bytes = low_bytes; + +#ifdef UNIV_SEARCH_PERF_STAT + page_cur_short_succ++; +#endif + success = TRUE; +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +#endif + +#ifdef PAGE_CUR_LE_OR_EXTENDS +/****************************************************************//** +Checks if the nth field in a record is a character type field which extends +the nth field in tuple, i.e., the field is longer or equal in length and has +common first characters. 
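+For example (editor's illustration, not in the original source), a
+record field 'abcde' extends a tuple field 'abc': the record field is at
+least as long and the two compare equal over the tuple field's length.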
+@return TRUE if rec field extends tuple field */ +static +ibool +page_cur_rec_field_extends( +/*=======================*/ + const dtuple_t* tuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: compare nth field */ +{ + const dtype_t* type; + const dfield_t* dfield; + const byte* rec_f; + ulint rec_f_len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + dfield = dtuple_get_nth_field(tuple, n); + + type = dfield_get_type(dfield); + + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); + + if (type->mtype == DATA_VARCHAR + || type->mtype == DATA_CHAR + || type->mtype == DATA_FIXBINARY + || type->mtype == DATA_BINARY + || type->mtype == DATA_BLOB + || type->mtype == DATA_VARMYSQL + || type->mtype == DATA_MYSQL) { + + if (dfield_get_len(dfield) != UNIV_SQL_NULL + && rec_f_len != UNIV_SQL_NULL + && rec_f_len >= dfield_get_len(dfield) + && !cmp_data_data_slow(type->mtype, type->prtype, + dfield_get_data(dfield), + dfield_get_len(dfield), + rec_f, dfield_get_len(dfield))) { + + return(TRUE); + } + } + + return(FALSE); +} +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + +/****************************************************************//** +Searches the right position for a page cursor. */ +UNIV_INTERN +void +page_cur_search_with_match( +/*=======================*/ + const buf_block_t* block, /*!< in: buffer block */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /*!< in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /*!< in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /*!< out: page cursor */ +{ + ulint up; + ulint low; + ulint mid; + const page_t* page; + const page_dir_slot_t* slot; + const rec_t* up_rec; + const rec_t* low_rec; + const rec_t* mid_rec; + ulint up_matched_fields; + ulint up_matched_bytes; + ulint low_matched_fields; + ulint low_matched_bytes; + ulint cur_matched_fields; + ulint cur_matched_bytes; + int cmp; +#ifdef UNIV_SEARCH_DEBUG + int dbg_cmp; + ulint dbg_matched_fields; + ulint dbg_matched_bytes; +#endif +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip = buf_block_get_page_zip(block); +#endif /* UNIV_ZIP_DEBUG */ + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(block && tuple && iup_matched_fields && iup_matched_bytes + && ilow_matched_fields && ilow_matched_bytes && cursor); + ut_ad(dtuple_validate(tuple)); +#ifdef UNIV_DEBUG +# ifdef PAGE_CUR_DBG + if (mode != PAGE_CUR_DBG) +# endif /* PAGE_CUR_DBG */ +# ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode != PAGE_CUR_LE_OR_EXTENDS) +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_G || mode == PAGE_CUR_GE); +#endif /* UNIV_DEBUG */ + page = buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_check_dir(page); + +#ifdef PAGE_CUR_ADAPT + if (page_is_leaf(page) + && (mode == PAGE_CUR_LE) + && (page_header_get_field(page, 
PAGE_N_DIRECTION) > 3) + && (page_header_get_ptr(page, PAGE_LAST_INSERT)) + && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { + + if (page_cur_try_search_shortcut( + block, index, tuple, + iup_matched_fields, iup_matched_bytes, + ilow_matched_fields, ilow_matched_bytes, + cursor)) { + return; + } + } +# ifdef PAGE_CUR_DBG + if (mode == PAGE_CUR_DBG) { + mode = PAGE_CUR_LE; + } +# endif +#endif + + /* The following flag does not work for non-latin1 char sets because + cmp_full_field does not tell how many bytes matched */ +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + + /* If mode PAGE_CUR_G is specified, we are trying to position the + cursor to answer a query of the form "tuple < X", where tuple is + the input parameter, and X denotes an arbitrary physical record on + the page. We want to position the cursor on the first X which + satisfies the condition. */ + + up_matched_fields = *iup_matched_fields; + up_matched_bytes = *iup_matched_bytes; + low_matched_fields = *ilow_matched_fields; + low_matched_bytes = *ilow_matched_bytes; + + /* Perform binary search. First the search is done through the page + directory, after that as a linear search in the list of records + owned by the upper limit directory slot. */ + + low = 0; + up = page_dir_get_n_slots(page) - 1; + + /* Perform binary search until the lower and upper limit directory + slots come to the distance 1 of each other */ + + while (up - low > 1) { + mid = (low + up) / 2; + slot = page_dir_get_nth_slot(page, mid); + mid_rec = page_dir_slot_get_rec(slot); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_slot_match: + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_slot_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_slot_match: + up = mid; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_slot_match; + } else { + + goto up_slot_match; + } + } + + slot = page_dir_get_nth_slot(page, low); + low_rec = page_dir_slot_get_rec(slot); + slot = page_dir_get_nth_slot(page, up); + up_rec = page_dir_slot_get_rec(slot); + + /* Perform linear search until the upper and lower records come to + distance 1 of each other. 
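+(Editor's note, not part of the original source: this is the second
+phase of the two-phase search; the binary search above has narrowed the
+range to adjacent directory slots, and this loop walks the records
+owned by the upper slot. The matched-fields/matched-bytes pairs let
+each comparison skip the prefix already known to be equal.)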
*/ + + while (page_rec_get_next_const(low_rec) != up_rec) { + + mid_rec = page_rec_get_next_const(low_rec); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_rec_match: + low_rec = mid_rec; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_rec_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_rec_match: + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_rec_match; + } else { + + goto up_rec_match; + } + } + +#ifdef UNIV_SEARCH_DEBUG + + /* Check that the lower and upper limit records have the + right alphabetical order compared to tuple. */ + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(low_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp >= 0); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp >= 0); + } + + if (!page_rec_is_infimum(low_rec)) { + + ut_a(low_matched_fields == dbg_matched_fields); + ut_a(low_matched_bytes == dbg_matched_bytes); + } + + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(up_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp == -1); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp == -1); + } + + if (!page_rec_is_supremum(up_rec)) { + + ut_a(up_matched_fields == dbg_matched_fields); + ut_a(up_matched_bytes == dbg_matched_bytes); + } +#endif + if (mode <= PAGE_CUR_GE) { + page_cur_position(up_rec, block, cursor); + } else { + page_cur_position(low_rec, block, cursor); + } + + *iup_matched_fields = up_matched_fields; + *iup_matched_bytes = up_matched_bytes; + *ilow_matched_fields = low_matched_fields; + *ilow_matched_bytes = low_matched_bytes; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***********************************************************//** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. 
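+(Editor's note, not part of the original source: the implementation
+draws rnd = page_cur_lcg_prng() % n_recs and then advances rnd + 1
+positions from before-first, so every user record can be selected.)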
*/ +UNIV_INTERN +void +page_cur_open_on_rnd_user_rec( +/*==========================*/ + buf_block_t* block, /*!< in: page */ + page_cur_t* cursor) /*!< out: page cursor */ +{ + ulint rnd; + ulint n_recs = page_get_n_recs(buf_block_get_frame(block)); + + page_cur_set_before_first(block, cursor); + + if (UNIV_UNLIKELY(n_recs == 0)) { + + return; + } + + rnd = (ulint) (page_cur_lcg_prng() % n_recs); + + do { + page_cur_move_to_next(cursor); + } while (rnd--); +} + +/***********************************************************//** +Writes the log record of a record insert on a page. */ +static +void +page_cur_insert_rec_write_log( +/*==========================*/ + rec_t* insert_rec, /*!< in: inserted physical record */ + ulint rec_size, /*!< in: insert_rec size */ + rec_t* cursor_rec, /*!< in: record the + cursor is pointing to */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint cur_rec_size; + ulint extra_size; + ulint cur_extra_size; + const byte* ins_ptr; + byte* log_ptr; + const byte* log_end; + ulint i; + + ut_a(rec_size < UNIV_PAGE_SIZE); + ut_ad(page_align(insert_rec) == page_align(cursor_rec)); + ut_ad(!page_rec_is_comp(insert_rec) + == !dict_table_is_comp(index->table)); + + { + mem_heap_t* heap = NULL; + ulint cur_offs_[REC_OFFS_NORMAL_SIZE]; + ulint ins_offs_[REC_OFFS_NORMAL_SIZE]; + + ulint* cur_offs; + ulint* ins_offs; + + rec_offs_init(cur_offs_); + rec_offs_init(ins_offs_); + + cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_, + ULINT_UNDEFINED, &heap); + ins_offs = rec_get_offsets(insert_rec, index, ins_offs_, + ULINT_UNDEFINED, &heap); + + extra_size = rec_offs_extra_size(ins_offs); + cur_extra_size = rec_offs_extra_size(cur_offs); + ut_ad(rec_size == rec_offs_size(ins_offs)); + cur_rec_size = rec_offs_size(cur_offs); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ins_ptr = insert_rec - extra_size; + + i = 0; + + if (cur_extra_size == extra_size) { + ulint min_rec_size = ut_min(cur_rec_size, rec_size); + + const byte* cur_ptr = cursor_rec - cur_extra_size; + + /* Find out the first byte in insert_rec which differs from + cursor_rec; skip the bytes in the record info */ + + do { + if (*ins_ptr == *cur_ptr) { + i++; + ins_ptr++; + cur_ptr++; + } else if ((i < extra_size) + && (i >= extra_size + - page_rec_get_base_extra_size + (insert_rec))) { + i = extra_size; + ins_ptr = insert_rec; + cur_ptr = cursor_rec; + } else { + break; + } + } while (i < min_rec_size); + } + + if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { + + if (page_rec_is_comp(insert_rec)) { + log_ptr = mlog_open_and_write_index( + mtr, insert_rec, index, MLOG_COMP_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + } else { + log_ptr = mlog_open(mtr, 11 + + 2 + 5 + 1 + 5 + 5 + + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + insert_rec, MLOG_REC_INSERT, log_ptr, mtr); + } + + log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(cursor_rec)); + log_ptr += 2; + } else { + log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: 
in that case mlog_open returns NULL */ + return; + } + log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + } + + if (page_rec_is_comp(insert_rec)) { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, TRUE) + != rec_get_info_and_status_bits(cursor_rec, TRUE))) { + + goto need_extra_info; + } + } else { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, FALSE) + != rec_get_info_and_status_bits(cursor_rec, FALSE))) { + + goto need_extra_info; + } + } + + if (extra_size != cur_extra_size || rec_size != cur_rec_size) { +need_extra_info: + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, + 2 * (rec_size - i) + 1); + + /* Write the info bits */ + mach_write_to_1(log_ptr, + rec_get_info_and_status_bits( + insert_rec, + page_rec_is_comp(insert_rec))); + log_ptr++; + + /* Write the record origin offset */ + log_ptr += mach_write_compressed(log_ptr, extra_size); + + /* Write the mismatch index */ + log_ptr += mach_write_compressed(log_ptr, i); + + ut_a(i < UNIV_PAGE_SIZE); + ut_a(extra_size < UNIV_PAGE_SIZE); + } else { + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i)); + } + + /* Write to the log the inserted index record end segment which + differs from the cursor record */ + + rec_size -= i; + + if (log_ptr + rec_size <= log_end) { + memcpy(log_ptr, ins_ptr, rec_size); + mlog_close(mtr, log_ptr + rec_size); + } else { + mlog_close(mtr, log_ptr); + ut_a(rec_size < UNIV_PAGE_SIZE); + mlog_catenate_string(mtr, ins_ptr, rec_size); + } +} +#else /* !UNIV_HOTBACKUP */ +# define page_cur_insert_rec_write_log(ins_rec,size,cur,index,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a log record of a record insert on a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_cur_parse_insert_rec( +/*======================*/ + ibool is_short,/*!< in: TRUE if short inserts */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ulint origin_offset; + ulint end_seg_len; + ulint mismatch_index; + page_t* page; + rec_t* cursor_rec; + byte buf1[1024]; + byte* buf; + byte* ptr2 = ptr; + ulint info_and_status_bits = 0; /* remove warning */ + page_cur_t cursor; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page = block ? 
buf_block_get_frame(block) : NULL; + + if (is_short) { + cursor_rec = page_rec_get_prev(page_get_supremum_rec(page)); + } else { + ulint offset; + + /* Read the cursor rec offset as a 2-byte ulint */ + + if (UNIV_UNLIKELY(end_ptr < ptr + 2)) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + cursor_rec = page + offset; + + if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)) { + + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + } + + ptr = mach_parse_compressed(ptr, end_ptr, &end_seg_len); + + if (ptr == NULL) { + + return(NULL); + } + + if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) { + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (end_seg_len & 0x1UL) { + /* Read the info bits */ + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_and_status_bits = mach_read_from_1(ptr); + ptr++; + + ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(origin_offset < UNIV_PAGE_SIZE); + + ptr = mach_parse_compressed(ptr, end_ptr, &mismatch_index); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(mismatch_index < UNIV_PAGE_SIZE); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + (end_seg_len >> 1))) { + + return(NULL); + } + + if (!block) { + + return(ptr + (end_seg_len >> 1)); + } + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + /* Read from the log the inserted index record end segment which + differs from the cursor record */ + + offsets = rec_get_offsets(cursor_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!(end_seg_len & 0x1UL)) { + info_and_status_bits = rec_get_info_and_status_bits( + cursor_rec, page_is_comp(page)); + origin_offset = rec_offs_extra_size(offsets); + mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1); + } + + end_seg_len >>= 1; + + if (mismatch_index + end_seg_len < sizeof buf1) { + buf = buf1; + } else { + buf = static_cast<byte*>( + mem_alloc(mismatch_index + end_seg_len)); + } + + /* Build the inserted record to buf */ + + if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "Is short %lu, info_and_status_bits %lu, offset %lu, " + "o_offset %lu\n" + "mismatch index %lu, end_seg_len %lu\n" + "parsed len %lu\n", + (ulong) is_short, (ulong) info_and_status_bits, + (ulong) page_offset(cursor_rec), + (ulong) origin_offset, + (ulong) mismatch_index, (ulong) end_seg_len, + (ulong) (ptr - ptr2)); + + fputs("Dump of 300 bytes of log:\n", stderr); + ut_print_buf(stderr, ptr2, 300); + putc('\n', stderr); + + buf_page_print(page, 0, 0); + + ut_error; + } + + ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); + ut_memcpy(buf + mismatch_index, ptr, end_seg_len); + + if (page_is_comp(page)) { + rec_set_info_and_status_bits(buf + origin_offset, + info_and_status_bits); + } else { + rec_set_info_bits_old(buf + origin_offset, + info_and_status_bits); + } + + page_cur_position(cursor_rec, block, &cursor); + + offsets = rec_get_offsets(buf + origin_offset, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor, + buf + origin_offset, + index, offsets, mtr))) { + /* The redo log record should only have been written + after the write was successful. 
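+Reaching this point therefore means the log and the page state
+disagree, so recovery fails hard here instead of continuing.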
*/ + ut_error; + } + + if (buf != buf1) { + + mem_free(buf); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(ptr + end_seg_len); +} + +/***********************************************************//** +Inserts a record next to page cursor on an uncompressed page. +Returns pointer to inserted record if succeed, i.e., enough +space available, NULL otherwise. The cursor stays at the same position. +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_low( +/*====================*/ + rec_t* current_rec,/*!< in: pointer to current record after + which the new record is inserted */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /*!< the relevant page */ + rec_t* last_insert; /*!< cursor position at previous + insert */ + rec_t* free_rec; /*!< a free record that was reused, + or NULL */ + rec_t* insert_rec; /*!< inserted record */ + ulint heap_no; /*!< heap number of the inserted + record */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_align(current_rec); + ut_ad(dict_table_is_comp(index->table) + == (ibool) !!page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() + || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index))); + + ut_ad(!page_rec_is_supremum(current_rec)); + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + /* 2. Try to find suitable space from page memory management */ + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets( + free_rec, index, foffsets, ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + } else { + heap_no = rec_get_heap_no_old(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, FALSE), + rec_size); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, NULL, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + } + + /* 3. Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. 
Insert the record in the linked list of records */ + ut_ad(current_rec != insert_rec); + + { + /* next record after current before the insertion */ + rec_t* next_rec = page_rec_get_next(current_rec); +#ifdef UNIV_DEBUG + if (page_is_comp(page)) { + ut_ad(rec_get_status(current_rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + } +#endif + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(current_rec, insert_rec); + } + + page_header_set_field(page, NULL, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + if (page_is_comp(page)) { + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + } else { + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, heap_no); + } + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + /* 6. Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert || !page_is_comp(page) + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == current_rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, NULL, n_owned + 1); + } else { + n_owned = rec_get_n_owned_old(owner_rec); + rec_set_n_owned_old(owner_rec, n_owned + 1); + } + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, NULL, + page_dir_find_owner_slot(owner_rec)); + } + } + + /* 9. Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + current_rec, index, mtr); + } + + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert"); + + return(insert_rec); +} + +/***********************************************************//** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. 
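The direction bookkeeping in step 6 of page_cur_insert_rec_low() above is what lets the B-tree split code recognize sequential inserts. A minimal standalone sketch of the update rule, using a simplified stand-in for the PAGE_LAST_INSERT, PAGE_DIRECTION and PAGE_N_DIRECTION header fields (illustrative only, not the InnoDB API):

    #include <cstdint>

    enum Direction { NO_DIRECTION, LEFT, RIGHT };

    /* Hypothetical, simplified stand-in for the three page header
    fields used by the heuristic. */
    struct Page {
        const void* last_insert = nullptr;
        Direction   direction   = NO_DIRECTION;
        uint32_t    n_direction = 0;
    };

    /* prev_rec is the record the new one was inserted after;
    next_rec is the record now following insert_rec. */
    void update_direction(Page& p, const void* prev_rec,
                          const void* next_rec, const void* insert_rec)
    {
        if (p.last_insert == nullptr) {
            p.direction   = NO_DIRECTION;
            p.n_direction = 0;
        } else if (p.last_insert == prev_rec && p.direction != LEFT) {
            p.direction = RIGHT;          /* extending a rightward run */
            p.n_direction++;
        } else if (p.last_insert == next_rec && p.direction != RIGHT) {
            p.direction = LEFT;           /* extending a leftward run */
            p.n_direction++;
        } else {
            p.direction   = NO_DIRECTION; /* random insert: reset run */
            p.n_direction = 0;
        }
        p.last_insert = insert_rec;
    }

With three ascending inserts, for example, direction ends up RIGHT with n_direction == 2; the split code uses such runs to split at the insertion point rather than the page middle, which keeps sequentially filled pages nearly full.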
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /*!< the relevant page */ + rec_t* last_insert; /*!< cursor position at previous + insert */ + rec_t* free_rec; /*!< a free record that was reused, + or NULL */ + rec_t* insert_rec; /*!< inserted record */ + ulint heap_no; /*!< heap number of the inserted + record */ + page_zip_des_t* page_zip; + + page_zip = page_cur_get_page_zip(cursor); + ut_ad(page_zip); + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_cur_get_page(cursor); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() + || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index))); + + ut_ad(!page_cur_is_after_last(cursor)); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + const bool reorg_before_insert = page_has_garbage(page) + && rec_size > page_get_max_insert_size(page, 1) + && rec_size <= page_get_max_insert_size_after_reorganize( + page, 1); + + /* 2. Try to find suitable space from page memory management */ + if (!page_zip_available(page_zip, dict_index_is_clust(index), + rec_size, 1) + || reorg_before_insert) { + /* The values can change dynamically. */ + bool log_compressed = page_zip_log_pages; + ulint level = page_zip_level; +#ifdef UNIV_DEBUG + rec_t* cursor_rec = page_cur_get_rec(cursor); +#endif /* UNIV_DEBUG */ + + /* If we are not writing compressed page images, we + must reorganize the page before attempting the + insert. */ + if (recv_recovery_is_on()) { + /* Insert into the uncompressed page only. + The page reorganization or creation that we + would attempt outside crash recovery would + have been covered by a previous redo log record. */ + } else if (page_is_empty(page)) { + ut_ad(page_cur_is_before_first(cursor)); + + /* This is an empty page. Recreate it to + get rid of the modification log. */ + page_create_zip(page_cur_get_block(cursor), index, + page_header_get_field(page, PAGE_LEVEL), + 0, mtr); + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available( + page_zip, dict_index_is_clust(index), + rec_size, 1)) { + goto use_heap; + } + + /* The cursor should remain on the page infimum. 
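The page was just re-created empty, so the infimum is the only place the cursor can point; the caller sees the NULL return as an ordinary failed insert and typically responds by splitting the page.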
*/ + return(NULL); + } else if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + } else if (log_compressed && !reorg_before_insert) { + /* Insert into uncompressed page only, and + try page_zip_reorganize() afterwards. */ + } else if (btr_page_reorganize_low( + recv_recovery_is_on(), level, + cursor, index, mtr)) { + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available( + page_zip, dict_index_is_clust(index), + rec_size, 1)) { + /* After reorganizing, there is space + available. */ + goto use_heap; + } + } else { + ut_ad(cursor->rec == cursor_rec); + return(NULL); + } + + /* Try compressing the whole page afterwards. */ + insert_rec = page_cur_insert_rec_low( + cursor->rec, index, rec, offsets, NULL); + + /* If recovery is on, this implies that the compression + of the page was successful during runtime. Had that not + been the case or had the redo logging of compressed + pages been enabled during runtime then we'd have seen + a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we + know that we don't need to reorganize the page. We, + however, do need to recompress the page. That will + happen when the next redo record is read which must + be of type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and it must + contain a valid compression level value. + This implies that during recovery from this point till + the next redo is applied the uncompressed and + compressed versions are not identical and + page_zip_validate will fail but that is OK because + we call page_zip_validate only after processing + all changes to a page under a single mtr during + recovery. */ + if (insert_rec == NULL) { + /* Out of space. + This should never occur during crash recovery, + because the MLOG_COMP_REC_INSERT should only + be logged after a successful operation. */ + ut_ad(!recv_recovery_is_on()); + } else if (recv_recovery_is_on()) { + /* This should be followed by + MLOG_ZIP_PAGE_COMPRESS_NO_DATA, + which should succeed. */ + rec_offs_make_valid(insert_rec, index, offsets); + } else { + ulint pos = page_rec_get_n_recs_before(insert_rec); + ut_ad(pos > 0); + + if (!log_compressed) { + if (page_zip_compress( + page_zip, page, index, + level, NULL)) { + page_cur_insert_rec_write_log( + insert_rec, rec_size, + cursor->rec, index, mtr); + page_zip_compress_write_log_no_data( + level, page, index, mtr); + + rec_offs_make_valid( + insert_rec, index, offsets); + return(insert_rec); + } + + ut_ad(cursor->rec + == (pos > 1 + ? page_rec_get_nth( + page, pos - 1) + : page + PAGE_NEW_INFIMUM)); + } else { + /* We are writing entire page images + to the log. Reduce the redo log volume + by reorganizing the page at the same time. */ + if (page_zip_reorganize( + cursor->block, index, mtr)) { + /* The page was reorganized: + Seek to pos. */ + if (pos > 1) { + cursor->rec = page_rec_get_nth( + page, pos - 1); + } else { + cursor->rec = page + + PAGE_NEW_INFIMUM; + } + + insert_rec = page + rec_get_next_offs( + cursor->rec, TRUE); + rec_offs_make_valid( + insert_rec, index, offsets); + return(insert_rec); + } + + /* Theoretically, we could try one + last resort of btr_page_reorganize_low() + followed by page_zip_available(), but + that would be very unlikely to + succeed. (If the full reorganized page + failed to compress, why would it + succeed to compress the page, plus log + the insert of this record? 
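) Instead we fall through to the out-of-space path below, which decompresses the old page image to undo the uncompressed insert and returns NULL.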
*/ + } + + /* Out of space: restore the page */ + btr_blob_dbg_remove(page, index, "insert_zip_fail"); + if (!page_zip_decompress(page_zip, page, FALSE)) { + ut_error; /* Memory corrupted? */ + } + ut_ad(page_validate(page, index)); + btr_blob_dbg_add(page, index, "insert_zip_fail"); + insert_rec = NULL; + } + + return(insert_rec); + } + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + lint extra_size_diff; + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets(free_rec, index, foffsets, + ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { +too_small: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + /* On compressed pages, do not relocate records from + the free list. If extra_size would grow, use the heap. */ + extra_size_diff + = rec_offs_extra_size(offsets) + - rec_offs_extra_size(foffsets); + + if (UNIV_UNLIKELY(extra_size_diff < 0)) { + /* Add an offset to the extra_size. */ + if (rec_offs_size(foffsets) + < rec_size - extra_size_diff) { + + goto too_small; + } + + insert_buf -= extra_size_diff; + } else if (UNIV_UNLIKELY(extra_size_diff)) { + /* Do not allow extra_size to grow */ + + goto too_small; + } + + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, page_zip, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + + if (!page_is_leaf(page)) { + /* Zero out the node pointer of free_rec, + in case it will not be overwritten by + insert_rec. */ + + ut_ad(rec_size > REC_NODE_PTR_SIZE); + + if (rec_offs_extra_size(foffsets) + + rec_offs_data_size(foffsets) > rec_size) { + + memset(rec_get_end(free_rec, foffsets) + - REC_NODE_PTR_SIZE, 0, + REC_NODE_PTR_SIZE); + } + } else if (dict_index_is_clust(index)) { + /* Zero out the DB_TRX_ID and DB_ROLL_PTR + columns of free_rec, in case it will not be + overwritten by insert_rec. */ + + ulint trx_id_col; + ulint trx_id_offs; + ulint len; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + trx_id_offs = rec_get_nth_field_offs(foffsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs + + rec_offs_extra_size(foffsets) > rec_size) { + /* We will have to zero out the + DB_TRX_ID and DB_ROLL_PTR, because + they will not be fully overwritten by + insert_rec. */ + + memset(free_rec + trx_id_offs, 0, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + + ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN + == rec_get_nth_field(free_rec, foffsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, page_zip, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + + page_zip_dir_add_slot(page_zip, dict_index_is_clust(index)); + } + + /* 3. Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. 
Insert the record in the linked list of records */ + ut_ad(cursor->rec != insert_rec); + + { + /* next record after current before the insertion */ + const rec_t* next_rec = page_rec_get_next_low( + cursor->rec, TRUE); + ut_ad(rec_get_status(cursor->rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(cursor->rec, insert_rec); + } + + page_header_set_field(page, page_zip, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + + page_zip_dir_insert(page_zip, cursor->rec, free_rec, insert_rec); + + /* 6. Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == cursor->rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1); + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, page_zip, + page_dir_find_owner_slot(owner_rec)); + } + } + + page_zip_write_rec(page_zip, insert_rec, index, offsets, 1); + + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok"); + + /* 9. Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + cursor->rec, index, mtr); + } + + return(insert_rec); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Writes a log record of copying a record list end to a new created page. 
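Only a 4-byte length placeholder is reserved here; the record bodies follow as individual short-form insert records, and page_copy_rec_list_end_to_created_page() patches the total length in afterwards.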
+@return 4-byte field where to write the log data length, or NULL if +logging is disabled */ +UNIV_INLINE +byte* +page_copy_rec_list_to_created_page_write_log( +/*=========================================*/ + page_t* page, /*!< in: index page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, page, index, + page_is_comp(page) + ? MLOG_COMP_LIST_END_COPY_CREATED + : MLOG_LIST_END_COPY_CREATED, 4); + if (UNIV_LIKELY(log_ptr != NULL)) { + mlog_close(mtr, log_ptr + 4); + } + + return(log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************//** +Parses a log record of copying a record list end to a new created page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_copy_rec_list_to_created_page( +/*=====================================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + byte* rec_end; + ulint log_data_len; + page_t* page; + page_zip_des_t* page_zip; + + if (ptr + 4 > end_ptr) { + + return(NULL); + } + + log_data_len = mach_read_from_4(ptr); + ptr += 4; + + rec_end = ptr + log_data_len; + + if (rec_end > end_ptr) { + + return(NULL); + } + + if (!block) { + + return(rec_end); + } + + while (ptr < rec_end) { + ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, + block, index, mtr); + } + + ut_a(ptr == rec_end); + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + + return(rec_end); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Copies records from page to a newly created page, from a given record onward, +including that record. Infimum and supremum records are not copied. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
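Otherwise the bitmap could overstate the free space on the page, and a later change buffer merge could attempt an insert that no longer fits.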
*/ +UNIV_INTERN +void +page_copy_rec_list_end_to_created_page( +/*===================================*/ + page_t* new_page, /*!< in/out: index page to copy to */ + rec_t* rec, /*!< in: first record to copy */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_dir_slot_t* slot = 0; /* remove warning */ + byte* heap_top; + rec_t* insert_rec = 0; /* remove warning */ + rec_t* prev_rec; + ulint count; + ulint n_recs; + ulint slot_index; + ulint rec_size; + ulint log_mode; + byte* log_ptr; + ulint log_data_len; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + ut_ad(page_align(rec) != new_page); + ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page)); + + if (page_rec_is_infimum(rec)) { + + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return; + } + +#ifdef UNIV_DEBUG + /* To pass the debug tests we have to set these dummy values + in the debug version */ + page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, + new_page + UNIV_PAGE_SIZE - 1); +#endif + + log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, + index, mtr); + + log_data_len = dyn_array_get_data_size(&(mtr->log)); + + /* Individual inserts are logged in a shorter form */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); + + prev_rec = page_get_infimum_rec(new_page); + if (page_is_comp(new_page)) { + heap_top = new_page + PAGE_NEW_SUPREMUM_END; + } else { + heap_top = new_page + PAGE_OLD_SUPREMUM_END; + } + count = 0; + slot_index = 0; + n_recs = 0; + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + insert_rec = rec_copy(heap_top, rec, offsets); + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } else { + rec_set_next_offs_old(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } + + count++; + n_recs++; + + if (UNIV_UNLIKELY + (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) { + + slot_index++; + + slot = page_dir_get_nth_slot(new_page, slot_index); + + page_dir_slot_set_rec(slot, insert_rec); + page_dir_slot_set_n_owned(slot, NULL, count); + + count = 0; + } + + rec_size = rec_offs_size(offsets); + + ut_ad(heap_top < new_page + UNIV_PAGE_SIZE); + + heap_top += rec_size; + + rec_offs_make_valid(insert_rec, index, offsets); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end"); + + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, + index, mtr); + prev_rec = insert_rec; + rec = page_rec_get_next(rec); + } while (!page_rec_is_supremum(rec)); + + if ((slot_index > 0) && (count + 1 + + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 + <= PAGE_DIR_SLOT_MAX_N_OWNED)) { + /* We can merge the two last dir slots. This operation is + here to make this function imitate exactly the equivalent + task made using page_cur_insert_rec, which we use in database + recovery to reproduce the task performed by this function. + To be able to check the correctness of recovery, it is good + that it imitates exactly. 
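Any divergence would make a page rebuilt during recovery differ byte for byte from the page built at runtime, which would defeat that check.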
*/ + + count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + page_dir_slot_set_n_owned(slot, NULL, 0); + + slot_index--; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len; + + ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); + + if (UNIV_LIKELY(log_ptr != NULL)) { + mach_write_to_4(log_ptr, log_data_len); + } + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM); + } + + slot = page_dir_get_nth_slot(new_page, 1 + slot_index); + + page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); + page_dir_slot_set_n_owned(slot, NULL, count + 1); + + page_dir_set_n_slots(new_page, NULL, 2 + slot_index); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top); + page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs); + page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs); + + page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(new_page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0); + + /* Restore the log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +/***********************************************************//** +Writes log record of a record delete on a page. */ +UNIV_INLINE +void +page_cur_delete_rec_write_log( +/*==========================*/ + rec_t* rec, /*!< in: record to be deleted */ + const dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + byte* log_ptr; + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? MLOG_COMP_REC_DELETE + : MLOG_REC_DELETE, 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + + mlog_close(mtr, log_ptr + 2); +} +#else /* !UNIV_HOTBACKUP */ +# define page_cur_delete_rec_write_log(rec,index,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses log record of a record delete on a page. +@return pointer to record end or NULL */ +UNIV_INTERN +byte* +page_cur_parse_delete_rec( +/*======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ulint offset; + page_cur_t cursor; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + /* Read the cursor rec offset as a 2-byte ulint */ + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (block) { + page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec = page + offset; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cursor); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + page_cur_delete_rec(&cursor, index, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + return(ptr); +} + +/***********************************************************//** +Deletes a record at the page cursor. 
The cursor is moved to the next +record after the deleted one. */ +UNIV_INTERN +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle + or NULL */ +{ + page_dir_slot_t* cur_dir_slot; + page_dir_slot_t* prev_slot; + page_t* page; + page_zip_des_t* page_zip; + rec_t* current_rec; + rec_t* prev_rec = NULL; + rec_t* next_rec; + ulint cur_slot_no; + ulint cur_n_owned; + rec_t* rec; + + page = page_cur_get_page(cursor); + page_zip = page_cur_get_page_zip(cursor); + + /* page_zip_validate() will fail here when + btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark(). + Then, both "page_zip" and "page" would have the min-rec-mark + set on the smallest user record, but "page" would additionally + have it set on the smallest-but-one record. Because sloppy + page_zip_validate_low() only ignores min-rec-flag differences + in the smallest user record, it cannot be used here either. */ + + current_rec = cursor->rec; + ut_ad(rec_offs_validate(current_rec, index, offsets)); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() + || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index))); + + /* The record must not be the supremum or infimum record. */ + ut_ad(page_rec_is_user_rec(current_rec)); + + if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()) { + /* Empty the page, unless we are applying the redo log + during crash recovery. During normal operation, the + page_create_empty() gets logged as one of MLOG_PAGE_CREATE, + MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */ + ut_ad(page_is_leaf(page)); + /* Usually, this should be the root page, + and the whole index tree should become empty. + However, this could also be a call in + btr_cur_pessimistic_update() to delete the only + record in the page and to insert another one. */ + page_cur_move_to_next(cursor); + ut_ad(page_cur_is_after_last(cursor)); + page_create_empty(page_cur_get_block(cursor), + const_cast<dict_index_t*>(index), mtr); + return; + } + + /* Save to local variables some data associated with current_rec */ + cur_slot_no = page_dir_find_owner_slot(current_rec); + ut_ad(cur_slot_no > 0); + cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no); + cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); + + /* 0. Write the log record */ + if (mtr != 0) { + page_cur_delete_rec_write_log(current_rec, index, mtr); + } + + /* 1. Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock only if there is an mini-transaction covering + the change. During IMPORT we allocate local blocks that are not + part of the buffer pool. */ + + if (mtr != 0) { + buf_block_modify_clock_inc(page_cur_get_block(cursor)); + } + + /* 2. Find the next and the previous record. Note that the cursor is + left at the next record. */ + + ut_ad(cur_slot_no > 0); + prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1); + + rec = (rec_t*) page_dir_slot_get_rec(prev_slot); + + /* rec now points to the record of the previous directory slot. 
Look + for the immediate predecessor of current_rec in a loop. */ + + while(current_rec != rec) { + prev_rec = rec; + rec = page_rec_get_next(rec); + } + + page_cur_move_to_next(cursor); + next_rec = cursor->rec; + + /* 3. Remove the record from the linked list of records */ + + page_rec_set_next(prev_rec, next_rec); + + /* 4. If the deleted record is pointed to by a dir slot, update the + record pointer in slot. In the following if-clause we assume that + prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED + >= 2. */ + +#if PAGE_DIR_SLOT_MIN_N_OWNED < 2 +# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2" +#endif + ut_ad(cur_n_owned > 1); + + if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) { + page_dir_slot_set_rec(cur_dir_slot, prev_rec); + } + + /* 5. Update the number of owned records of the slot */ + + page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); + + /* 6. Free the memory occupied by the record */ + btr_blob_dbg_remove_rec(current_rec, const_cast<dict_index_t*>(index), + offsets, "delete"); + page_mem_free(page, page_zip, current_rec, index, offsets); + + /* 7. Now we have decremented the number of owned records of the slot. + If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the + slots. */ + + if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { + page_dir_balance_slot(page, page_zip, cur_slot_no); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +/*******************************************************************//** +Print the first n numbers, generated by page_cur_lcg_prng() to make sure +(visually) that it works properly. */ +void +test_page_cur_lcg_prng( +/*===================*/ + int n) /*!< in: print first n numbers */ +{ + int i; + unsigned long long rnd; + + for (i = 0; i < n; i++) { + rnd = page_cur_lcg_prng(); + printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n", + rnd, + rnd % 2, + rnd % 3, + rnd % 5, + rnd % 7, + rnd % 11); + } +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/page/page0page.cc b/storage/xtradb/page/page0page.cc new file mode 100644 index 00000000000..bd5fb36af8f --- /dev/null +++ b/storage/xtradb/page/page0page.cc @@ -0,0 +1,2813 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file page/page0page.cc +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#define THIS_MODULE +#include "page0page.h" +#ifdef UNIV_NONINL +#include "page0page.ic" +#endif +#undef THIS_MODULE + +#include "page0cur.h" +#include "page0zip.h" +#include "buf0buf.h" +#include "btr0btr.h" +#ifndef UNIV_HOTBACKUP +# include "srv0srv.h" +# include "lock0lock.h" +# include "fut0lst.h" +# include "btr0sea.h" +#endif /* !UNIV_HOTBACKUP */ + +/* THE INDEX PAGE + ============== + +The index page consists of a page header which contains the page's +id and other information. On top of it are the index records +in a heap linked into a one way linear list according to alphabetic order. + +Just below page end is an array of pointers which we call page directory, +to about every sixth record in the list. The pointers are placed in +the directory in the alphabetical order of the records pointed to, +enabling us to make binary search using the array. Each slot n:o I +in the directory points to a record, where a 4-bit field contains a count +of those records which are in the linear list between pointer I and +the pointer I - 1 in the directory, including the record +pointed to by pointer I and not including the record pointed to by I - 1. +We say that the record pointed to by slot I, or that slot I, owns +these records. The count is always kept in the range 4 to 8, with +the exception that it is 1 for the first slot, and 1--8 for the second slot. + +An essentially binary search can be performed in the list of index +records, like we could do if we had pointer to every record in the +page directory. The data structure is, however, more efficient when +we are doing inserts, because most inserts are just pushed on a heap. +Only every 8th insert requires block move in the directory pointer +table, which itself is quite small. A record is deleted from the page +by just taking it off the linear list and updating the number of owned +records-field of the record which owns it, and updating the page directory, +if necessary. A special case is the one when the record owns itself. +Because the overhead of inserts is so small, we may also increase the +page size from the projected default of 8 kB to 64 kB without too +much loss of efficiency in inserts. Bigger page becomes actual +when the disk transfer rate compared to seek and latency time rises. +On the present system, the page size is set so that the page transfer +time (3 ms) is 20 % of the disk random access time (15 ms). + +When the page is split, merged, or becomes full but contains deleted +records, we have to reorganize the page. + +Assuming a page size of 8 kB, a typical index page of a secondary +index contains 300 index entries, and the size of the page directory +is 50 x 4 bytes = 200 bytes. */ + +/***************************************************************//** +Looks for the directory slot which owns the given record. 
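The record list is followed forward to the owning record, i.e. the next record with a nonzero n_owned field, and the sparse directory is then scanned from the last slot towards slot 0 for the entry pointing to it.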
+@return the directory slot number */ +UNIV_INTERN +ulint +page_dir_find_owner_slot( +/*=====================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + const page_t* page; + register uint16 rec_offs_bytes; + register const page_dir_slot_t* slot; + register const page_dir_slot_t* first_slot; + register const rec_t* r = rec; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + first_slot = page_dir_get_nth_slot(page, 0); + slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1); + + if (page_is_comp(page)) { + while (rec_get_n_owned_new(r) == 0) { + r = rec_get_next_ptr_const(r, TRUE); + ut_ad(r >= page + PAGE_NEW_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } else { + while (rec_get_n_owned_old(r) == 0) { + r = rec_get_next_ptr_const(r, FALSE); + ut_ad(r >= page + PAGE_OLD_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } + + rec_offs_bytes = mach_encode_2(r - page); + + while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) { + + if (UNIV_UNLIKELY(slot == first_slot)) { + fprintf(stderr, + "InnoDB: Probable data corruption on" + " page %lu\n" + "InnoDB: Original record ", + (ulong) page_get_page_no(page)); + + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, rec); + } + + fputs("\n" + "InnoDB: on that page.\n" + "InnoDB: Cannot find the dir slot for record ", + stderr); + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, page + + mach_decode_2(rec_offs_bytes)); + } + fputs("\n" + "InnoDB: on that page!\n", stderr); + + buf_page_print(page, 0, 0); + + ut_error; + } + + slot += PAGE_DIR_SLOT_SIZE; + } + + return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE); +} + +/**************************************************************//** +Used to check the consistency of a directory slot. +@return TRUE if succeed */ +static +ibool +page_dir_slot_check( +/*================*/ + const page_dir_slot_t* slot) /*!< in: slot */ +{ + const page_t* page; + ulint n_slots; + ulint n_owned; + + ut_a(slot); + + page = page_align(slot); + + n_slots = page_dir_get_n_slots(page); + + ut_a(slot <= page_dir_get_nth_slot(page, 0)); + ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); + + ut_a(page_rec_check(page_dir_slot_get_rec(slot))); + + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot)); + } else { + n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot)); + } + + if (slot == page_dir_get_nth_slot(page, 0)) { + ut_a(n_owned == 1); + } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) { + ut_a(n_owned >= 1); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } else { + ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } + + return(TRUE); +} + +/*************************************************************//** +Sets the max trx id field value. 
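PAGE_MAX_TRX_ID records the highest transaction id that has modified the page; on secondary index leaf pages it lets readers decide whether the clustered index must be consulted to judge visibility.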
*/ +UNIV_INTERN +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */ +{ + page_t* page = buf_block_get_frame(block); +#ifndef UNIV_HOTBACKUP + ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); +#endif /* !UNIV_HOTBACKUP */ + + /* It is not necessary to write this change to the redo log, as + during a database recovery we assume that the max trx id of every + page is the maximum trx id assigned before the crash. */ + + if (page_zip) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + 8, mtr); +#ifndef UNIV_HOTBACKUP + } else if (mtr) { + mlog_write_ull(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + trx_id, mtr); +#endif /* !UNIV_HOTBACKUP */ + } else { + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); + } +} + +/************************************************************//** +Allocates a block of memory from the heap of an index page. +@return pointer to start of allocated buffer, or NULL if allocation fails */ +UNIV_INTERN +byte* +page_mem_alloc_heap( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page with enough + space available for inserting the record, + or NULL */ + ulint need, /*!< in: total number of bytes needed */ + ulint* heap_no)/*!< out: this contains the heap number + of the allocated record + if allocation succeeds */ +{ + byte* block; + ulint avl_space; + + ut_ad(page && heap_no); + + avl_space = page_get_max_insert_size(page, 1); + + if (avl_space >= need) { + block = page_header_get_ptr(page, PAGE_HEAP_TOP); + + page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP, + block + need); + *heap_no = page_dir_get_n_heap(page); + + page_dir_set_n_heap(page, page_zip, 1 + *heap_no); + + return(block); + } + + return(NULL); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Writes a log record of page creation. */ +UNIV_INLINE +void +page_create_write_log( +/*==================*/ + buf_frame_t* frame, /*!< in: a buffer frame where the page is + created */ + mtr_t* mtr, /*!< in: mini-transaction handle */ + ibool comp) /*!< in: TRUE=compact page format */ +{ + mlog_write_initial_log_record(frame, comp + ? MLOG_COMP_PAGE_CREATE + : MLOG_PAGE_CREATE, mtr); +} +#else /* !UNIV_HOTBACKUP */ +# define page_create_write_log(frame,mtr,comp) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of creating a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_create( +/*==============*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), /*!< in: buffer end */ + ulint comp, /*!< in: nonzero=compact page format */ + buf_block_t* block, /*!< in: block or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + /* The record is empty, except for the record initial part */ + + if (block) { + page_create(block, mtr, comp); + } + + return(ptr); +} + +/**********************************************************//** +The index page creation function. 
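It builds the infimum and supremum records from dummy data tuples, initializes the page header fields, and points the two initial directory slots at those two records.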
+@return pointer to the page */ +static +page_t* +page_create_low( +/*============*/ + buf_block_t* block, /*!< in: a buffer block where the + page is created */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + page_dir_slot_t* slot; + mem_heap_t* heap; + dtuple_t* tuple; + dfield_t* field; + byte* heap_top; + rec_t* infimum_rec; + rec_t* supremum_rec; + page_t* page; + dict_index_t* index; + ulint* offsets; + + ut_ad(block); +#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA" +#endif +#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA" +#endif + + /* The infimum and supremum records use a dummy index. */ + if (UNIV_LIKELY(comp)) { + index = dict_ind_compact; + } else { + index = dict_ind_redundant; + } + + /* 1. INCREMENT MODIFY CLOCK */ + buf_block_modify_clock_inc(block); + + page = buf_block_get_frame(block); + + fil_page_set_type(page, FIL_PAGE_INDEX); + + heap = mem_heap_create(200); + + /* 3. CREATE THE INFIMUM AND SUPREMUM RECORDS */ + + /* Create first a data tuple for infimum record */ + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "infimum", 8); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8); + /* Set the corresponding physical record to its place in the page + record heap */ + + heap_top = page + PAGE_DATA; + + infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(infimum_rec == page + PAGE_NEW_INFIMUM); + + rec_set_n_owned_new(infimum_rec, NULL, 1); + rec_set_heap_no_new(infimum_rec, 0); + } else { + ut_a(infimum_rec == page + PAGE_OLD_INFIMUM); + + rec_set_n_owned_old(infimum_rec, 1); + rec_set_heap_no_old(infimum_rec, 0); + } + + offsets = rec_get_offsets(infimum_rec, index, NULL, + ULINT_UNDEFINED, &heap); + + heap_top = rec_get_end(infimum_rec, offsets); + + /* Create then a tuple for supremum */ + + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "supremum", comp ? 8 : 9); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9); + + supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM); + + rec_set_n_owned_new(supremum_rec, NULL, 1); + rec_set_heap_no_new(supremum_rec, 1); + } else { + ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM); + + rec_set_n_owned_old(supremum_rec, 1); + rec_set_heap_no_old(supremum_rec, 1); + } + + offsets = rec_get_offsets(supremum_rec, index, offsets, + ULINT_UNDEFINED, &heap); + heap_top = rec_get_end(supremum_rec, offsets); + + ut_ad(heap_top == page + + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)); + + mem_heap_free(heap); + + /* 4. INITIALIZE THE PAGE */ + + page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2); + page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top); + page_header_set_field(page, NULL, PAGE_N_HEAP, comp + ? 
0x8000 | PAGE_HEAP_NO_USER_LOW + : PAGE_HEAP_NO_USER_LOW); + page_header_set_ptr(page, NULL, PAGE_FREE, NULL); + page_header_set_field(page, NULL, PAGE_GARBAGE, 0); + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + page_header_set_field(page, NULL, PAGE_N_RECS, 0); + page_set_max_trx_id(block, NULL, 0, NULL); + memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START + - page_offset(heap_top)); + + /* 5. SET POINTERS IN RECORDS AND DIR SLOTS */ + + /* Set the slots to point to infimum and supremum. */ + + slot = page_dir_get_nth_slot(page, 0); + page_dir_slot_set_rec(slot, infimum_rec); + + slot = page_dir_get_nth_slot(page, 1); + page_dir_slot_set_rec(slot, supremum_rec); + + /* Set the next pointers in infimum and supremum */ + + if (UNIV_LIKELY(comp)) { + rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM); + rec_set_next_offs_new(supremum_rec, 0); + } else { + rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM); + rec_set_next_offs_old(supremum_rec, 0); + } + + return(page); +} + +/**********************************************************//** +Create an uncompressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create( +/*========*/ + buf_block_t* block, /*!< in: a buffer block where the + page is created */ + mtr_t* mtr, /*!< in: mini-transaction handle */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + page_create_write_log(buf_block_get_frame(block), mtr, comp); + return(page_create_low(block, comp)); +} + +/**********************************************************//** +Create a compressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame where the + page is created */ + dict_index_t* index, /*!< in: the index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* page; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + ut_ad(block); + ut_ad(page_zip); + ut_ad(index); + ut_ad(dict_table_is_comp(index->table)); + + page = page_create_low(block, TRUE); + mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level); + mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id); + + if (!page_zip_compress(page_zip, page, index, + page_zip_level, mtr)) { + /* The compression of a newly created page + should always succeed. */ + ut_error; + } + + return(page); +} + +/**********************************************************//** +Empty a previously created B-tree index page. 
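The page level is carried over on compressed pages, and PAGE_MAX_TRX_ID is preserved on leaf pages of secondary indexes and the insert buffer.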
*/ +UNIV_INTERN +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + trx_id_t max_trx_id = 0; + const page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) { + max_trx_id = page_get_max_trx_id(page); + ut_ad(max_trx_id); + } + + if (page_zip) { + page_create_zip(block, index, + page_header_get_field(page, PAGE_LEVEL), + max_trx_id, mtr); + } else { + page_create(block, mtr, page_is_comp(page)); + + if (max_trx_id) { + page_update_max_trx_id( + block, page_zip, max_trx_id, mtr); + } + } +} + +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ +UNIV_INTERN +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_cur_t cur1; + rec_t* cur2; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1)) { + + page_cur_move_to_next(&cur1); + } + + btr_assert_not_corrupted(new_block, index); + ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + cur2 = page_get_infimum_rec(buf_block_get_frame(new_block)); + + /* Copy records from the original page to the new page */ + + while (!page_cur_is_after_last(&cur1)) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ins_rec = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + if (UNIV_UNLIKELY(!ins_rec)) { + /* Track an assertion failure reported on the mailing + list on June 18th, 2003 */ + + buf_page_print(new_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(page_align(rec), 0, + BUF_PAGE_PRINT_NO_CRASH); + ut_print_timestamp(stderr); + + fprintf(stderr, + "InnoDB: rec offset %lu, cur1 offset %lu," + " cur2 offset %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(page_cur_get_rec(&cur1)), + (ulong) page_offset(cur2)); + ut_error; + } + + page_cur_move_to_next(&cur1); + cur2 = ins_rec; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Copies records from page to new_page, from a given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. 
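If new_page is still empty, the bulk path page_copy_rec_list_end_to_created_page() is taken; otherwise the records are inserted one at a time by page_copy_rec_list_end_no_locks(), and the lock table and adaptive hash index are updated afterwards.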
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original successor of the infimum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_t* page = page_align(rec); + rec_t* ret = page_rec_get_next( + page_get_infimum_rec(new_page)); + ulint log_mode = 0; /* remove warning */ + +#ifdef UNIV_ZIP_DEBUG + if (new_page_zip) { + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + ut_a(page_zip); + + /* Strict page_zip_validate() may fail here. + Furthermore, btr_compress() may set FIL_PAGE_PREV to + FIL_NULL on new_page while leaving it intact on + new_page_zip. So, we cannot validate new_page_zip. */ + ut_a(page_zip_validate_low(page_zip, page, index, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(buf_block_get_frame(block) == page); + ut_ad(page_is_leaf(page) == page_is_leaf(new_page)); + ut_ad(page_is_comp(page) == page_is_comp(new_page)); + /* Here, "ret" may be pointing to a user record or the + predefined supremum record. */ + + if (new_page_zip) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) { + page_copy_rec_list_end_to_created_page(new_page, rec, + index, mtr); + } else { + page_copy_rec_list_end_no_locks(new_block, block, rec, + index, mtr); + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. */ + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) { + page_update_max_trx_id(new_block, NULL, + page_get_max_trx_id(page), mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(new_page_zip, new_page, + index, page_zip_level, mtr)) { + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ulint ret_pos + = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the successor of + the predefined infimum record. It must still + have at least one predecessor (the predefined + infimum record, or a freshly copied record + that is smaller than "ret"). */ + ut_a(ret_pos > 0); + + if (!page_zip_reorganize(new_block, index, mtr)) { + + btr_blob_dbg_remove(new_page, index, + "copy_end_reorg_fail"); + if (!page_zip_decompress(new_page_zip, + new_page, FALSE)) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_end_reorg_fail"); + return(NULL); + } else { + /* The page was reorganized: + Seek to ret_pos. 
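Reorganization can relocate every record within the page, so the old pointer is stale; the same logical record is re-found by walking ret_pos next-pointers from the infimum.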
*/ + ret = new_page + PAGE_NEW_INFIMUM; + + do { + ret = rec_get_next_ptr(ret, TRUE); + } while (--ret_pos); + } + } + } + + /* Update the lock table and possible hash index */ + + lock_move_rec_list_end(new_block, block, rec); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/*************************************************************//** +Copies records from page to new_page, up to the given record, +NOT including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_cur_t cur1; + rec_t* cur2; + ulint log_mode = 0 /* remove warning */; + mem_heap_t* heap = NULL; + rec_t* ret + = page_rec_get_prev(page_get_supremum_rec(new_page)); + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + /* Here, "ret" may be pointing to a user record or the + predefined infimum record. */ + + if (page_rec_is_infimum(rec)) { + + return(ret); + } + + if (new_page_zip) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + cur2 = ret; + + /* Copy records from the original page to the new page */ + + while (page_cur_get_rec(&cur1) != rec) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + cur2 = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + ut_a(cur2); + + page_cur_move_to_next(&cur1); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. */ + if (dict_index_is_sec_or_ibuf(index) + && page_is_leaf(page_align(rec))) { + page_update_max_trx_id(new_block, NULL, + page_get_max_trx_id(page_align(rec)), + mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail", + goto zip_reorganize;); + + if (!page_zip_compress(new_page_zip, new_page, index, + page_zip_level, mtr)) { + + ulint ret_pos; +#ifndef DBUG_OFF +zip_reorganize: +#endif /* DBUG_OFF */ + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ret_pos = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the predecessor + of the predefined supremum record. If it was + the predefined infimum record, then it would + still be the infimum, and we would have + ret_pos == 0. 
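Hence, unlike in page_copy_rec_list_end() above, ret_pos == 0 is
legal here, and there is no ut_a(ret_pos > 0) assertion.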
*/ + + if (UNIV_UNLIKELY + (!page_zip_reorganize(new_block, index, mtr))) { + + btr_blob_dbg_remove(new_page, index, + "copy_start_reorg_fail"); + if (UNIV_UNLIKELY + (!page_zip_decompress(new_page_zip, + new_page, FALSE))) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_start_reorg_fail"); + return(NULL); + } + + /* The page was reorganized: Seek to ret_pos. */ + ret = page_rec_get_nth(new_page, ret_pos); + } + } + + /* Update the lock table and possible hash index */ + + lock_move_rec_list_start(new_block, block, rec, ret); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/**********************************************************//** +Writes a log record of a record list end or start deletion. */ +UNIV_INLINE +void +page_delete_rec_list_write_log( +/*===========================*/ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + byte type, /*!< in: operation type: + MLOG_LIST_END_DELETE, ... */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2); + if (log_ptr) { + /* Write the parameter as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + mlog_close(mtr, log_ptr + 2); + } +} +#else /* !UNIV_HOTBACKUP */ +# define page_delete_rec_list_write_log(rec,index,type,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************//** +Parses a log record of a record list end or start deletion. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_delete_rec_list( +/*=======================*/ + byte type, /*!< in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in/out: buffer block or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + page_t* page; + ulint offset; + + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + /* Read the record offset as a 2-byte ulint */ + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + if (!block) { + + return(ptr); + } + + page = buf_block_get_frame(block); + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + if (type == MLOG_LIST_END_DELETE + || type == MLOG_COMP_LIST_END_DELETE) { + page_delete_rec_list_end(page + offset, block, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, + mtr); + } else { + page_delete_rec_list_start(page + offset, block, index, mtr); + } + + return(ptr); +} + +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. 
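The operation is logged as a single MLOG_LIST_END_DELETE or
MLOG_COMP_LIST_END_DELETE record carrying only the 2-byte page
offset of "rec"; the physical removal below is not logged record by
record (on a compressed page the per-record deletes run under
MTR_LOG_NONE, on an uncompressed page the chain segment is spliced
onto the PAGE_FREE list directly). Recovery replays the whole range
deletion through page_parse_delete_rec_list() above.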
*/ +UNIV_INTERN +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_dir_slot_t*slot; + ulint slot_index; + rec_t* last_rec; + rec_t* prev_rec; + ulint n_owned; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = page_align(rec); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); + ut_ad(!page_zip || page_rec_is_comp(rec)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_supremum(rec)) { + ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED); + /* Nothing to do, there are no records bigger than the + page supremum. */ + return; + } + + if (recv_recovery_is_on()) { + /* If we are replaying a redo log record, we must + replay it exactly. Since MySQL 5.6.11, we should be + generating a redo log record for page creation if + the page would become empty. Thus, this branch should + only be executed when applying redo log that was + generated by an older version of MySQL. */ + } else if (page_rec_is_infimum(rec) + || n_recs == page_get_n_recs(page)) { +delete_all: + /* We are deleting all records. */ + page_create_empty(block, index, mtr); + return; + } else if (page_is_comp(page)) { + if (page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) == rec) { + /* We are deleting everything from the first + user record onwards. */ + goto delete_all; + } + } else { + if (page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0) == rec) { + /* We are deleting everything from the first + user record onwards. */ + goto delete_all; + } + } + + /* Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock */ + + buf_block_modify_clock_inc(block); + + page_delete_rec_list_write_log(rec, index, page_is_comp(page) + ? 
MLOG_COMP_LIST_END_DELETE + : MLOG_LIST_END_DELETE, mtr); + + if (page_zip) { + ulint log_mode; + + ut_a(page_is_comp(page)); + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + do { + page_cur_t cur; + page_cur_position(rec, block, &cur); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + rec = rec_get_next_ptr(rec, TRUE); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&cur, index, offsets, mtr); + } while (page_offset(rec) != PAGE_NEW_SUPREMUM); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); + return; + } + + prev_rec = page_rec_get_prev(rec); + + last_rec = page_rec_get_prev(page_get_supremum_rec(page)); + + if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { + rec_t* rec2 = rec; + /* Calculate the sum of sizes and the number of records */ + size = 0; + n_recs = 0; + + do { + ulint s; + offsets = rec_get_offsets(rec2, index, offsets, + ULINT_UNDEFINED, &heap); + s = rec_offs_size(offsets); + ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) + < UNIV_PAGE_SIZE); + ut_ad(size + s < UNIV_PAGE_SIZE); + size += s; + n_recs++; + + rec2 = page_rec_get_next(rec2); + } while (!page_rec_is_supremum(rec2)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ut_ad(size < UNIV_PAGE_SIZE); + + /* Update the page directory; there is no need to balance the number + of the records owned by the supremum record, as it is allowed to be + less than PAGE_DIR_SLOT_MIN_N_OWNED */ + + if (page_is_comp(page)) { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_new(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, TRUE); + } + + ut_ad(rec_get_n_owned_new(rec2) > count); + + n_owned = rec_get_n_owned_new(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + ut_ad(slot_index > 0); + slot = page_dir_get_nth_slot(page, slot_index); + } else { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_old(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, FALSE); + } + + ut_ad(rec_get_n_owned_old(rec2) > count); + + n_owned = rec_get_n_owned_old(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + ut_ad(slot_index > 0); + slot = page_dir_get_nth_slot(page, slot_index); + } + + page_dir_slot_set_rec(slot, page_get_supremum_rec(page)); + page_dir_slot_set_n_owned(slot, NULL, n_owned); + + page_dir_set_n_slots(page, NULL, slot_index + 1); + + /* Remove the record chain segment from the record chain */ + page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + + btr_blob_dbg_op(page, rec, index, "delete_end", + btr_blob_dbg_remove_rec); + + /* Catenate the deleted chain segment to the page free list */ + + page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); + page_header_set_ptr(page, NULL, PAGE_FREE, rec); + + page_header_set_field(page, NULL, PAGE_GARBAGE, size + + page_header_get_field(page, PAGE_GARBAGE)); + + page_header_set_field(page, NULL, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - n_recs)); +} + +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. 
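Unlike page_delete_rec_list_end() above, there is no free-list
splice fast path here: after writing a single MLOG_LIST_START_DELETE
or MLOG_COMP_LIST_START_DELETE record, the records are removed one
by one with page_cur_delete_rec() under MTR_LOG_NONE.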
*/ +UNIV_INTERN +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t cur1; + ulint log_mode; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + byte type; + + rec_offs_init(offsets_); + + ut_ad((ibool) !!page_rec_is_comp(rec) + == dict_table_is_comp(index->table)); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + + /* page_zip_validate() would detect a min_rec_mark mismatch + in btr_page_split_and_insert() + between btr_attach_half_pages() and insert_page = ... + when btr_page_get_split_rec_to_left() holds + (direction == FSP_DOWN). */ + ut_a(!page_zip + || page_zip_validate_low(page_zip, page, index, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_infimum(rec)) { + return; + } + + if (page_rec_is_supremum(rec)) { + /* We are deleting all records. */ + page_create_empty(block, index, mtr); + return; + } + + if (page_rec_is_comp(rec)) { + type = MLOG_COMP_LIST_START_DELETE; + } else { + type = MLOG_LIST_START_DELETE; + } + + page_delete_rec_list_write_log(rec, index, type, mtr); + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + while (page_cur_get_rec(&cur1) != rec) { + offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, + offsets, ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&cur1, index, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
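A sketch of the caller-side contract stated above (hypothetical
code, shown only to illustrate the IBUF_BITMAP_FREE rule):

	if (page_move_rec_list_end(new_block, block,
				   split_rec, index, mtr)
	    && buf_block_get_page_zip(new_block)
	    && page_is_leaf(buf_block_get_frame(new_block))
	    && !dict_index_is_clust(index)) {
		ibuf_reset_free_bits(new_block);
	}
	mtr_commit(mtr);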
+ +@return TRUE on success; FALSE on compression failure (new_block will +be decompressed) */ +UNIV_INTERN +ibool +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + ulint old_data_size; + ulint new_data_size; + ulint old_n_recs; + ulint new_n_recs; + + old_data_size = page_get_data_size(new_page); + old_n_recs = page_get_n_recs(new_page); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!new_page_zip == !page_zip); + ut_a(!new_page_zip + || page_zip_validate(new_page_zip, new_page, index)); + ut_a(!page_zip + || page_zip_validate(page_zip, page_align(split_rec), + index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + new_data_size = page_get_data_size(new_page); + new_n_recs = page_get_n_recs(new_page); + + ut_ad(new_data_size >= old_data_size); + + page_delete_rec_list_end(split_rec, block, index, + new_n_recs - old_n_recs, + new_data_size - old_data_size, mtr); + + return(TRUE); +} + +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure */ +UNIV_INTERN +ibool +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + page_delete_rec_list_start(split_rec, block, index, mtr); + + return(TRUE); +} +#endif /* !UNIV_HOTBACKUP */ + +/**************************************************************//** +Used to delete n slots from the directory. This function updates +also n_owned fields in the records, so that the first slot after +the deleted ones inherits the records of the deleted slots. */ +UNIV_INLINE +void +page_dir_delete_slot( +/*=================*/ + page_t* page, /*!< in/out: the index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint slot_no)/*!< in: slot to be deleted */ +{ + page_dir_slot_t* slot; + ulint n_owned; + ulint i; + ulint n_slots; + + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + ut_ad(slot_no + 1 < page_dir_get_n_slots(page)); + + n_slots = page_dir_get_n_slots(page); + + /* 1. Reset the n_owned fields of the slots to be + deleted */ + slot = page_dir_get_nth_slot(page, slot_no); + n_owned = page_dir_slot_get_n_owned(slot); + page_dir_slot_set_n_owned(slot, page_zip, 0); + + /* 2. 
Update the n_owned value of the first non-deleted slot */
+
+	slot = page_dir_get_nth_slot(page, slot_no + 1);
+	page_dir_slot_set_n_owned(slot, page_zip,
+				  n_owned + page_dir_slot_get_n_owned(slot));
+
+	/* 3. Destroy the slot by copying slots */
+	for (i = slot_no + 1; i < n_slots; i++) {
+		rec_t*	rec = (rec_t*)
+			page_dir_slot_get_rec(page_dir_get_nth_slot(page, i));
+		page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec);
+	}
+
+	/* 4. Zero out the last slot, which will be removed */
+	mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0);
+
+	/* 5. Update the page header */
+	page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1);
+}
+
+/**************************************************************//**
+Used to add n slots to the directory. Does not set the record pointers
+in the added slots or update n_owned values: this is the responsibility
+of the caller. */
+UNIV_INLINE
+void
+page_dir_add_slot(
+/*==============*/
+	page_t*		page,	/*!< in/out: the index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		start)	/*!< in: the slot above which the new slots
+				are added */
+{
+	page_dir_slot_t*	slot;
+	ulint			n_slots;
+
+	n_slots = page_dir_get_n_slots(page);
+
+	ut_ad(start < n_slots - 1);
+
+	/* Update the page header */
+	page_dir_set_n_slots(page, page_zip, n_slots + 1);
+
+	/* Move slots up */
+	slot = page_dir_get_nth_slot(page, n_slots);
+	memmove(slot, slot + PAGE_DIR_SLOT_SIZE,
+		(n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE);
+}
+
+/****************************************************************//**
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be written, or NULL */
+	ulint		slot_no)/*!< in: the directory slot */
+{
+	rec_t*			rec;
+	page_dir_slot_t*	new_slot;
+	page_dir_slot_t*	prev_slot;
+	page_dir_slot_t*	slot;
+	ulint			i;
+	ulint			n_owned;
+
+	ut_ad(page);
+	ut_ad(!page_zip || page_is_comp(page));
+	ut_ad(slot_no > 0);
+
+	slot = page_dir_get_nth_slot(page, slot_no);
+
+	n_owned = page_dir_slot_get_n_owned(slot);
+	ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1);
+
+	/* 1. We loop to find a record approximately in the middle of the
+	records owned by the slot. */
+
+	prev_slot = page_dir_get_nth_slot(page, slot_no - 1);
+	rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+
+	for (i = 0; i < n_owned / 2; i++) {
+		rec = page_rec_get_next(rec);
+	}
+
+	ut_ad(n_owned / 2 >= PAGE_DIR_SLOT_MIN_N_OWNED);
+
+	/* 2. We add one directory slot immediately below the slot to be
+	split. */
+
+	page_dir_add_slot(page, page_zip, slot_no - 1);
+
+	/* The added slot is now number slot_no, and the old slot is
+	now number slot_no + 1 */
+
+	new_slot = page_dir_get_nth_slot(page, slot_no);
+	slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+	/* 3. We store the appropriate values to the new slot. */
+
+	page_dir_slot_set_rec(new_slot, rec);
+	page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2);
+
+	/* 4. Finally, we update the number of records field of the
+	original slot */
+
+	page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2));
+}
+
+/*************************************************************//**
+Tries to balance the given directory slot with too few records with the upper
+neighbor, so that there are at least the minimum number of records owned by
+the slot; this may result in the merging of two slots.
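For example, with PAGE_DIR_SLOT_MIN_N_OWNED == 4 and
PAGE_DIR_SLOT_MAX_N_OWNED == 8 (their values in page0page.h):
page_dir_split_slot() above fires when a slot comes to own 9 records
and splits it 4 + 5, while this function fires when a slot drops to
3 records, and either borrows one record from an upper neighbour
owning more than 4, or merges with it into a slot owning
3 + 4 = 7 <= 8 records, which is why the assertion
2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED
below must hold.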
*/ +UNIV_INTERN +void +page_dir_balance_slot( +/*==================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint slot_no)/*!< in: the directory slot */ +{ + page_dir_slot_t* slot; + page_dir_slot_t* up_slot; + ulint n_owned; + ulint up_n_owned; + rec_t* old_rec; + rec_t* new_rec; + + ut_ad(page); + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + + slot = page_dir_get_nth_slot(page, slot_no); + + /* The last directory slot cannot be balanced with the upper + neighbor, as there is none. */ + + if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) { + + return; + } + + up_slot = page_dir_get_nth_slot(page, slot_no + 1); + + n_owned = page_dir_slot_get_n_owned(slot); + up_n_owned = page_dir_slot_get_n_owned(up_slot); + + ut_ad(n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1); + + /* If the upper slot has the minimum value of n_owned, we will merge + the two slots, therefore we assert: */ + ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED); + + if (up_n_owned > PAGE_DIR_SLOT_MIN_N_OWNED) { + + /* In this case we can just transfer one record owned + by the upper slot to the property of the lower slot */ + old_rec = (rec_t*) page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + new_rec = rec_get_next_ptr(old_rec, TRUE); + + rec_set_n_owned_new(old_rec, page_zip, 0); + rec_set_n_owned_new(new_rec, page_zip, n_owned + 1); + } else { + new_rec = rec_get_next_ptr(old_rec, FALSE); + + rec_set_n_owned_old(old_rec, 0); + rec_set_n_owned_old(new_rec, n_owned + 1); + } + + page_dir_slot_set_rec(slot, new_rec); + + page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned -1); + } else { + /* In this case we may merge the two slots */ + page_dir_delete_slot(page, page_zip, slot_no); + } +} + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INTERN +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + const page_dir_slot_t* slot; + ulint i; + ulint n_owned; + const rec_t* rec; + + if (nth == 0) { + return(page_get_infimum_rec(page)); + } + + ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); + + for (i = 0;; i++) { + + slot = page_dir_get_nth_slot(page, i); + n_owned = page_dir_slot_get_n_owned(slot); + + if (n_owned > nth) { + break; + } else { + nth -= n_owned; + } + } + + ut_ad(i > 0); + slot = page_dir_get_nth_slot(page, i - 1); + rec = page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + do { + rec = page_rec_get_next_low(rec, TRUE); + ut_ad(rec); + } while (nth--); + } else { + do { + rec = page_rec_get_next_low(rec, FALSE); + ut_ad(rec); + } while (nth--); + } + + return(rec); +} + +/***************************************************************//** +Returns the number of records before the given record in chain. +The number includes infimum and supremum records. 
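This is the inverse of page_rec_get_nth() above: for any record on
the page, page_rec_get_nth(page, page_rec_get_n_recs_before(rec))
== rec, and the infimum itself yields 0.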
+@return number of records */ +UNIV_INTERN +ulint +page_rec_get_n_recs_before( +/*=======================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + const page_dir_slot_t* slot; + const rec_t* slot_rec; + const page_t* page; + ulint i; + lint n = 0; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + if (page_is_comp(page)) { + while (rec_get_n_owned_new(rec) == 0) { + + rec = rec_get_next_ptr_const(rec, TRUE); + n--; + } + + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); + + n += rec_get_n_owned_new(slot_rec); + + if (rec == slot_rec) { + + break; + } + } + } else { + while (rec_get_n_owned_old(rec) == 0) { + + rec = rec_get_next_ptr_const(rec, FALSE); + n--; + } + + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); + + n += rec_get_n_owned_old(slot_rec); + + if (rec == slot_rec) { + + break; + } + } + } + + n--; + + ut_ad(n >= 0); + ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); + + return((ulint) n); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +UNIV_INTERN +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: record descriptor */ +{ + ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + rec_print_new(stderr, rec, offsets); + if (page_rec_is_comp(rec)) { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) rec_get_heap_no_new(rec), + (ulong) rec_get_next_offs(rec, TRUE)); + } else { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) rec_get_heap_no_old(rec), + (ulong) rec_get_next_offs(rec, FALSE)); + } + + page_rec_check(rec); + rec_validate(rec, offsets); +} + +# ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +UNIV_INTERN +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n) /*!< in: print n first and n last entries */ +{ + ulint n; + ulint i; + page_dir_slot_t* slot; + + n = page_dir_get_n_slots(page); + + fprintf(stderr, "--------------------------------\n" + "PAGE DIRECTORY\n" + "Page address %p\n" + "Directory stack top at offs: %lu; number of slots: %lu\n", + page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)), + (ulong) n); + for (i = 0; i < n; i++) { + slot = page_dir_get_nth_slot(page, i); + if ((i == pr_n) && (i < n - pr_n)) { + fputs(" ... \n", stderr); + } + if ((i < pr_n) || (i >= n - pr_n)) { + fprintf(stderr, + "Contents of slot: %lu: n_owned: %lu," + " rec offs: %lu\n", + (ulong) i, + (ulong) page_dir_slot_get_n_owned(slot), + (ulong) + page_offset(page_dir_slot_get_rec(slot))); + } + } + fprintf(stderr, "Total of %lu records\n" + "--------------------------------\n", + (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page))); +} + +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. 
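For example, page_print_list(block, index, 10) prints the first ten
and the last ten records, eliding the middle with "...". Like
page_dir_print() above, this is compiled only under UNIV_BTR_PRINT.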
*/ +UNIV_INTERN +void +page_print_list( +/*============*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint pr_n) /*!< in: print n first and n last entries */ +{ + page_t* page = block->frame; + page_cur_t cur; + ulint count; + ulint n_recs; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); + + fprintf(stderr, + "--------------------------------\n" + "PAGE RECORD LIST\n" + "Page address %p\n", page); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cur); + count = 0; + for (;;) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + + if (count == pr_n) { + break; + } + if (page_cur_is_after_last(&cur)) { + break; + } + page_cur_move_to_next(&cur); + count++; + } + + if (n_recs > 2 * pr_n) { + fputs(" ... \n", stderr); + } + + while (!page_cur_is_after_last(&cur)) { + page_cur_move_to_next(&cur); + + if (count + pr_n >= n_recs) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + } + count++; + } + + fprintf(stderr, + "Total of %lu records \n" + "--------------------------------\n", + (ulong) (count + 1)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************//** +Prints the info in a page header. */ +UNIV_INTERN +void +page_header_print( +/*==============*/ + const page_t* page) +{ + fprintf(stderr, + "--------------------------------\n" + "PAGE HEADER INFO\n" + "Page address %p, n records %lu (%s)\n" + "n dir slots %lu, heap top %lu\n" + "Page n heap %lu, free %lu, garbage %lu\n" + "Page last insert %lu, direction %lu, n direction %lu\n", + page, (ulong) page_header_get_field(page, PAGE_N_RECS), + page_is_comp(page) ? "compact format" : "original format", + (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) page_dir_get_n_heap(page), + (ulong) page_header_get_field(page, PAGE_FREE), + (ulong) page_header_get_field(page, PAGE_GARBAGE), + (ulong) page_header_get_field(page, PAGE_LAST_INSERT), + (ulong) page_header_get_field(page, PAGE_DIRECTION), + (ulong) page_header_get_field(page, PAGE_N_DIRECTION)); +} + +/***************************************************************//** +This is used to print the contents of the page for +debugging purposes. */ +UNIV_INTERN +void +page_print( +/*=======*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint dn, /*!< in: print dn first and last entries + in directory */ + ulint rn) /*!< in: print rn first and last records + in directory */ +{ + page_t* page = block->frame; + + page_header_print(page); + page_dir_print(page, dn); + page_print_list(block, index, rn); +} +# endif /* UNIV_BTR_PRINT */ +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. 
+@return TRUE if ok */ +UNIV_INTERN +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n_owned; + ulint heap_no; + const page_t* page; + + page = page_align(rec); + ut_a(!page_is_comp(page) == !rec_offs_comp(offsets)); + + page_rec_check(rec); + rec_validate(rec, offsets); + + if (page_rec_is_comp(rec)) { + n_owned = rec_get_n_owned_new(rec); + heap_no = rec_get_heap_no_new(rec); + } else { + n_owned = rec_get_n_owned_old(rec); + heap_no = rec_get_heap_no_old(rec); + } + + if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) { + fprintf(stderr, + "InnoDB: Dir slot of rec %lu, n owned too big %lu\n", + (ulong) page_offset(rec), (ulong) n_owned); + return(FALSE); + } + + if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) { + fprintf(stderr, + "InnoDB: Heap no of rec %lu too big %lu %lu\n", + (ulong) page_offset(rec), (ulong) heap_no, + (ulong) page_dir_get_n_heap(page)); + return(FALSE); + } + + return(TRUE); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +UNIV_INTERN +void +page_check_dir( +/*===========*/ + const page_t* page) /*!< in: index page */ +{ + ulint n_slots; + ulint infimum_offs; + ulint supremum_offs; + + n_slots = page_dir_get_n_slots(page); + infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0)); + supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page, + n_slots - 1)); + + if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " infimum not pointed to\n"); + buf_page_print(page, 0, 0); + } + + if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " supremum not pointed to\n"); + buf_page_print(page, 0, 0); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page) /*!< in: index page in ROW_FORMAT=REDUNDANT */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + const rec_t* rec; + const byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(!page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu of page dir slots\n", + (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. 
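Loop invariant: own_count counts the records traversed since the
previous slot-owned record (including the current one), so whenever
a record with a nonzero n_owned field is reached, n_owned must equal
own_count, and the current directory slot must point exactly at that
record.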
*/ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above" + " rec heap top %lu\n", + (ulong)(rec - page), + (ulong)(rec_heap_top - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) own_count, + (ulong)(rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong)(rec - page)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset" + " nonsensical %lu for rec %lu\n", + (ulong) rec_get_next_offs(rec, FALSE), + (ulong) (rec - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) (rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) (rec - page), + (ulong) (rec_heap_top - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. 
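This is the ROW_FORMAT=COMPACT twin of page_simple_validate_old()
above; the two differ only in the record accessors used (the
rec_get_n_owned_new()/rec_get_next_offs(rec, TRUE) variants instead
of the _old/FALSE ones) and in reporting offsets via page_offset().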
+@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page) /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + const rec_t* rec; + const byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu" + " of page dir slots\n", (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. */ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above rec" + " heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) own_count, + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset nonsensical %lu" + " for rec %lu\n", + (ulong) rec_get_next_offs(rec, TRUE), + (ulong) page_offset(rec)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero" + " in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + 
FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/***************************************************************//** +This function checks the consistency of an index page. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_validate( +/*==========*/ + const page_t* page, /*!< in: index page */ + dict_index_t* index) /*!< in: data dictionary index containing + the page record type definition */ +{ + const page_dir_slot_t* slot; + mem_heap_t* heap; + byte* buf; + ulint count; + ulint own_count; + ulint rec_own_count; + ulint slot_no; + ulint data_size; + const rec_t* rec; + const rec_t* old_rec = NULL; + ulint offs; + ulint n_slots; + ibool ret = FALSE; + ulint i; + ulint* offsets = NULL; + ulint* old_offsets = NULL; + + if (UNIV_UNLIKELY((ibool) !!page_is_comp(page) + != dict_table_is_comp(index->table))) { + fputs("InnoDB: 'compact format' flag mismatch\n", stderr); + goto func_exit2; + } + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(!page_simple_validate_new(page))) { + goto func_exit2; + } + } else { + if (UNIV_UNLIKELY(!page_simple_validate_old(page))) { + goto func_exit2; + } + } + + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page) + && !page_is_empty(page)) { + trx_id_t max_trx_id = page_get_max_trx_id(page); + trx_id_t sys_max_trx_id = trx_sys_get_max_trx_id(); + + if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) { + ib_logf(IB_LOG_LEVEL_ERROR, + "PAGE_MAX_TRX_ID out of bounds: " + TRX_ID_FMT ", " TRX_ID_FMT, + max_trx_id, sys_max_trx_id); + goto func_exit2; + } + } + + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); + + /* The following buffer is used to check that the + records in the page record heap do not overlap */ + + buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP) + <= page_dir_get_nth_slot(page, n_slots - 1)))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap" + " on space %lu page %lu index %s, %p, %p\n", + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page), index->name, + page_header_get_ptr(page, PAGE_HEAP_TOP), + page_dir_get_nth_slot(page, n_slots - 1)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that + it is consistent with the directory. 
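The zero-filled shadow buffer buf[] allocated above is used to
detect physically overlapping records: every byte offset that a
record occupies is marked in buf, both here and in the free-list
check further below, and a second mark on the same byte is reported
as corruption.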
*/ + count = 0; + data_size = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (page_is_comp(page) && page_rec_is_user_rec(rec) + && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec) + == page_is_leaf(page))) { + fputs("InnoDB: node_ptr flag mismatch\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + goto func_exit; + } + +#ifndef UNIV_HOTBACKUP + /* Check that the records are in the ascending order */ + if (UNIV_LIKELY(count >= PAGE_HEAP_NO_USER_LOW) + && !page_rec_is_supremum(rec)) { + if (UNIV_UNLIKELY + (1 != cmp_rec_rec(rec, old_rec, + offsets, old_offsets, index))) { + fprintf(stderr, + "InnoDB: Records in wrong order" + " on space %lu page %lu index %s\n", + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page), + index->name); + fputs("\nInnoDB: previous record ", stderr); + rec_print_new(stderr, old_rec, old_offsets); + fputs("\nInnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + + goto func_exit; + } + } +#endif /* !UNIV_HOTBACKUP */ + + if (page_rec_is_user_rec(rec)) { + + data_size += rec_offs_size(offsets); + } + + offs = page_offset(rec_get_start(rec, offsets)); + i = rec_offs_size(offsets); + if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) { + fputs("InnoDB: record offset out of bounds\n", stderr); + goto func_exit; + } + + while (i--) { + if (UNIV_UNLIKELY(buf[offs + i])) { + /* No other record may overlap this */ + + fputs("InnoDB: Record overlaps another\n", + stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + if (page_is_comp(page)) { + rec_own_count = rec_get_n_owned_new(rec); + } else { + rec_own_count = rec_get_n_owned_old(rec); + } + + if (UNIV_UNLIKELY(rec_own_count)) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_own_count != own_count)) { + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu\n", + (ulong) rec_own_count, + (ulong) own_count); + goto func_exit; + } + + if (page_dir_slot_get_rec(slot) != rec) { + fputs("InnoDB: Dir slot does not" + " point to right rec\n", + stderr); + goto func_exit; + } + + page_dir_slot_check(slot); + + own_count = 0; + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + break; + } + + count++; + own_count++; + old_rec = rec; + rec = page_rec_get_next_const(rec); + + /* set old_offsets to offsets; recycle offsets */ + { + ulint* offs = old_offsets; + old_offsets = offsets; + offsets = offs; + } + } + + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + + goto n_owned_zero; + } + } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { +n_owned_zero: + fputs("InnoDB: n owned is zero\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) { + fprintf(stderr, + "InnoDB: Summed data size %lu, returned by func %lu\n", + (ulong) data_size, 
(ulong) page_get_data_size(page)); + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + + goto func_exit; + } + + count++; + offs = page_offset(rec_get_start(rec, offsets)); + i = rec_offs_size(offsets); + if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) { + fputs("InnoDB: record offset out of bounds\n", stderr); + goto func_exit; + } + + while (i--) { + + if (UNIV_UNLIKELY(buf[offs + i])) { + fputs("InnoDB: Record overlaps another" + " in free list\n", stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + rec = page_rec_get_next_const(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) count + 1); + goto func_exit; + } + + ret = TRUE; + +func_exit: + mem_heap_free(heap); + + if (UNIV_UNLIKELY(ret == FALSE)) { +func_exit2: + fprintf(stderr, + "InnoDB: Apparent corruption" + " in space %lu page %lu index %s\n", + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page), + index->name); + buf_page_print(page, 0, 0); + } + + return(ret); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Looks in the page record list for a record with the given heap number. +@return record, NULL if not found */ +UNIV_INTERN +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no)/*!< in: heap number */ +{ + const rec_t* rec; + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + for(;;) { + ulint rec_heap_no = rec_get_heap_no_new(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, TRUE); + } + } else { + rec = page + PAGE_OLD_INFIMUM; + + for (;;) { + ulint rec_heap_no = rec_get_heap_no_old(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, FALSE); + } + } +} +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************//** +Removes the record from a leaf page. This function does not log +any changes. It is used by the IMPORT tablespace functions. +The cursor is moved to the next record after the deleted one. 
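Note the sense of the flag computed below: no_compress_needed is
set to true for the index root page even when the page is about to
become too empty, since the root has no sibling to merge with. When
false is returned, the record is left in place and it is up to the
caller to compress (merge) the page first.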
+@return true if success, i.e., the page did not become too empty */ +UNIV_INTERN +bool +page_delete_rec( +/*============*/ + const dict_index_t* index, /*!< in: The index that the record + belongs to */ + page_cur_t* pcur, /*!< in/out: page cursor on record + to delete */ + page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + const ulint* offsets)/*!< in: offsets for record */ +{ + bool no_compress_needed; + buf_block_t* block = pcur->block; + page_t* page = buf_block_get_frame(block); + + ut_ad(page_is_leaf(page)); + + if (!rec_offs_any_extern(offsets) + && ((page_get_data_size(page) - rec_offs_size(offsets) + < BTR_CUR_PAGE_COMPRESS_LIMIT) + || (mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL + && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL) + || (page_get_n_recs(page) < 2))) { + + ulint root_page_no = dict_index_get_page(index); + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + no_compress_needed = page_get_page_no(page) == root_page_no; + } else { + no_compress_needed = true; + } + + if (no_compress_needed) { +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_cur_delete_rec(pcur, index, offsets, 0); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(no_compress_needed); +} + +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ + +const rec_t* +page_find_rec_max_not_deleted( + const page_t* page) +{ + const rec_t* rec = page_get_infimum_rec(page); + const rec_t* prev_rec = NULL; // remove warning + + /* Because the page infimum is never delete-marked, + prev_rec will always be assigned to it first. */ + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); + if (page_is_comp(page)) { + do { + if (!rec_get_deleted_flag(rec, true)) { + prev_rec = rec; + } + rec = page_rec_get_next_low(rec, true); + } while (rec != page + PAGE_NEW_SUPREMUM); + } else { + do { + if (!rec_get_deleted_flag(rec, false)) { + prev_rec = rec; + } + rec = page_rec_get_next_low(rec, false); + } while (rec != page + PAGE_OLD_SUPREMUM); + } + return(prev_rec); +} diff --git a/storage/xtradb/page/page0zip.cc b/storage/xtradb/page/page0zip.cc new file mode 100644 index 00000000000..67dca183c6d --- /dev/null +++ b/storage/xtradb/page/page0zip.cc @@ -0,0 +1,4952 @@ +/***************************************************************************** + +Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file page/page0zip.cc +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +// First include (the generated) my_config.h, to get correct platform defines. +#include "my_config.h" + +#include <map> +using namespace std; + +#define THIS_MODULE +#include "page0zip.h" +#ifdef UNIV_NONINL +# include "page0zip.ic" +#endif +#undef THIS_MODULE +#include "page0page.h" +#include "mtr0log.h" +#include "ut0sort.h" +#include "dict0dict.h" +#include "btr0cur.h" +#include "page0types.h" +#include "log0recv.h" +#include "zlib.h" +#ifndef UNIV_HOTBACKUP +# include "buf0buf.h" +# include "buf0lru.h" +# include "btr0sea.h" +# include "dict0boot.h" +# include "lock0lock.h" +# include "srv0mon.h" +# include "srv0srv.h" +# include "ut0crc32.h" +#else /* !UNIV_HOTBACKUP */ +# include "buf0checksum.h" +# define lock_move_reorganize_page(block, temp_block) ((void) 0) +# define buf_LRU_stat_inc_unzip() ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by index->id */ +UNIV_INTERN page_zip_stat_per_index_t page_zip_stat_per_index; +/** Mutex protecting page_zip_stat_per_index */ +UNIV_INTERN ib_mutex_t page_zip_stat_per_index_mutex; +#ifdef HAVE_PSI_INTERFACE +UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ +#endif /* !UNIV_HOTBACKUP */ + +/* Compression level to be used by zlib. Settable by user. */ +UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL; + +/* Whether or not to log compressed page images to avoid possible +compression algorithm changes in zlib. */ +UNIV_INTERN my_bool page_zip_log_pages = true; + +/* Please refer to ../include/page0zip.ic for a description of the +compressed page format. */ + +/* The infimum and supremum records are omitted from the compressed page. +On compress, we compare that the records are there, and on uncompress we +restore the records. */ +/** Extra bytes of an infimum record */ +static const byte infimum_extra[] = { + 0x01, /* info_bits=0, n_owned=1 */ + 0x00, 0x02 /* heap_no=0, status=2 */ + /* ?, ? */ /* next=(first user rec, or supremum) */ +}; +/** Data bytes of an infimum record */ +static const byte infimum_data[] = { + 0x69, 0x6e, 0x66, 0x69, + 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */ +}; +/** Extra bytes and data bytes of a supremum record */ +static const byte supremum_extra_data[] = { + /* 0x0?, */ /* info_bits=0, n_owned=1..8 */ + 0x00, 0x0b, /* heap_no=1, status=3 */ + 0x00, 0x00, /* next=0 */ + 0x73, 0x75, 0x70, 0x72, + 0x65, 0x6d, 0x75, 0x6d /* "supremum" */ +}; + +/** Assert that a block of memory is filled with zero bytes. +Compare at most sizeof(field_ref_zero) bytes. +@param b in: memory block +@param s in: size of the memory block, in bytes */ +#define ASSERT_ZERO(b, s) \ + ut_ad(!memcmp(b, field_ref_zero, ut_min(s, sizeof field_ref_zero))) +/** Assert that a BLOB pointer is filled with zero bytes. 
+@param b in: BLOB pointer */ +#define ASSERT_ZERO_BLOB(b) \ + ut_ad(!memcmp(b, field_ref_zero, sizeof field_ref_zero)) + +/* Enable some extra debugging output. This code can be enabled +independently of any UNIV_ debugging conditions. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# include <stdarg.h> +__attribute__((format (printf, 1, 2))) +/**********************************************************************//** +Report a failure to decompress or compress. +@return number of characters printed */ +static +int +page_zip_fail_func( +/*===============*/ + const char* fmt, /*!< in: printf(3) format string */ + ...) /*!< in: arguments corresponding to fmt */ +{ + int res; + va_list ap; + + ut_print_timestamp(stderr); + fputs(" InnoDB: ", stderr); + va_start(ap, fmt); + res = vfprintf(stderr, fmt, ap); + va_end(ap); + + return(res); +} +/** Wrapper for page_zip_fail_func() +@param fmt_args in: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args +#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +/** Dummy wrapper for page_zip_fail_func() +@param fmt_args ignored: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) /* empty */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Determine the guaranteed free space on an empty page. +@return minimum payload size on the page */ +UNIV_INTERN +ulint +page_zip_empty_size( +/*================*/ + ulint n_fields, /*!< in: number of columns in the index */ + ulint zip_size) /*!< in: compressed page size in bytes */ +{ + lint size = zip_size + /* subtract the page header and the longest + uncompressed data needed for one record */ + - (PAGE_DATA + + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + + 1/* encoded heap_no==2 in page_zip_write_rec() */ + + 1/* end of modification log */ + - REC_N_NEW_EXTRA_BYTES/* omitted bytes */) + /* subtract the space for page_zip_fields_encode() */ + - compressBound(static_cast<uLong>(2 * (n_fields + 1))); + return(size > 0 ? (ulint) size : 0); +} +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Gets the number of elements in the dense page directory, +including deleted records (the free list). +@return number of elements in the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_elems( +/*===============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + return(page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW); +} + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list). +@return length of dense page directory, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip)); +} + +/*************************************************************//** +Gets an offset to the compressed page trailer (the dense page directory), +including deleted records (the free list). 
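To make the trailer layout concrete: the dense directory grows downwards from the very end of the compressed page, one slot per user record. A minimal standalone sketch of the offset computation that page_zip_dir_start_offs() performs below (the 2-byte slot size is an assumption matching PAGE_ZIP_DIR_SLOT_SIZE; dir_start_offs() is a hypothetical helper, not part of the source):

	#include <assert.h>

	enum { SLOT_SIZE = 2 };	/* assumed PAGE_ZIP_DIR_SLOT_SIZE */

	/* Offset where the dense directory begins: the directory occupies
	the last n_dense * SLOT_SIZE bytes of the compressed page. */
	static unsigned long
	dir_start_offs(unsigned long zip_size, unsigned long n_dense)
	{
		assert(n_dense * SLOT_SIZE < zip_size);
		return(zip_size - n_dense * SLOT_SIZE);
	}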
+@return offset of the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_start_offs( +/*====================*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint n_dense) /*!< in: directory size */ +{ + ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip)); + + return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE); +} + +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@param[in] n_dense number of entries in the directory +@return pointer to the dense page directory */ +#define page_zip_dir_start_low(page_zip, n_dense) \ + ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense)) +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@return pointer to the dense page directory */ +#define page_zip_dir_start(page_zip) \ + page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip)) + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list). +@return length of dense page directory comprising existing records, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page_zip->data); + ut_ad(size <= page_zip_dir_size(page_zip)); + return(size); +} + +/*************************************************************//** +Find the slot of the given record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find_low( +/*==================*/ + byte* slot, /*!< in: start of records */ + byte* end, /*!< in: end of records */ + ulint offset) /*!< in: offset of user record */ +{ + ut_ad(slot <= end); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + +/*************************************************************//** +Find the slot of the given non-free record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint offset) /*!< in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip), + end, + offset)); +} + +/*************************************************************//** +Find the slot of the given free record in the dense page directory. 
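The slot lookups defined above, and the free-record variant that follows, are plain linear scans over 2-byte big-endian slots. A standalone sketch under stated assumptions: read2() stands in for mach_read_from_2(), and SLOT_MASK is assumed to equal PAGE_ZIP_DIR_SLOT_MASK (0x3fff, per the compile-time checks later in this file):

	#include <stddef.h>
	#include <stdint.h>

	#define SLOT_MASK 0x3fffU	/* assumed PAGE_ZIP_DIR_SLOT_MASK */

	static uint16_t
	read2(const uint8_t* p)		/* big-endian, like mach_read_from_2() */
	{
		return((uint16_t) (p[0] << 8 | p[1]));
	}

	/* Scan [slot, end) for a slot whose masked offset matches. */
	static uint8_t*
	find_slot(uint8_t* slot, uint8_t* end, unsigned offset)
	{
		for (; slot < end; slot += 2) {
			if ((read2(slot) & SLOT_MASK) == offset) {
				return(slot);
			}
		}

		return(NULL);
	}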
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_free(
+/*===================*/
+	page_zip_des_t*	page_zip,	/*!< in: compressed page */
+	ulint		offset)		/*!< in: offset of user record */
+{
+	byte*	end	= page_zip->data + page_zip_get_size(page_zip);
+
+	ut_ad(page_zip_simple_validate(page_zip));
+
+	return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
+				     end - page_zip_dir_user_size(page_zip),
+				     offset));
+}
+
+/*************************************************************//**
+Read a given slot in the dense page directory.
+@return record offset on the uncompressed page, possibly ORed with
+PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
+UNIV_INLINE
+ulint
+page_zip_dir_get(
+/*=============*/
+	const page_zip_des_t*	page_zip,	/*!< in: compressed page */
+	ulint			slot)		/*!< in: slot
+						(0=first user record) */
+{
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
+	return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
+				- PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Write a log record of compressing an index page. */
+static
+void
+page_zip_compress_write_log(
+/*========================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	dict_index_t*		index,	/*!< in: index of the B-tree node */
+	mtr_t*			mtr)	/*!< in: mini-transaction */
+{
+	byte*	log_ptr;
+	ulint	trailer_size;
+
+	ut_ad(!dict_index_is_ibuf(index));
+
+	log_ptr = mlog_open(mtr, 11 + 2 + 2);
+
+	if (!log_ptr) {
+
+		return;
+	}
+
+	/* Read the number of user records. */
+	trailer_size = page_dir_get_n_heap(page_zip->data)
+		- PAGE_HEAP_NO_USER_LOW;
+	/* Multiply by the size of the uncompressed data
+	stored per record. */
+	if (!page_is_leaf(page)) {
+		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+	} else if (dict_index_is_clust(index)) {
+		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE
+			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	} else {
+		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE;
+	}
+	/* Add the space occupied by BLOB pointers. */
+	trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+	ut_a(page_zip->m_end > PAGE_DATA);
+#if FIL_PAGE_DATA > PAGE_DATA
+# error "FIL_PAGE_DATA > PAGE_DATA"
+#endif
+	ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+	log_ptr = mlog_write_initial_log_record_fast((page_t*) page,
+						     MLOG_ZIP_PAGE_COMPRESS,
+						     log_ptr, mtr);
+	mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE);
+	log_ptr += 2;
+	mach_write_to_2(log_ptr, trailer_size);
+	log_ptr += 2;
+	mlog_close(mtr, log_ptr);
+
+	/* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */
+	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4);
+	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4);
+	/* Write most of the page header, the compressed stream and
+	the modification log. */
+	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE,
+			     page_zip->m_end - FIL_PAGE_TYPE);
+	/* Write the uncompressed trailer of the compressed page. */
+	mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip)
+			     - trailer_size, trailer_size);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+Determine how many externally stored columns are contained
+in existing records with smaller heap_no than rec.
*/ +static +ulint +page_zip_get_n_prev_extern( +/*=======================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + const rec_t* rec, /*!< in: compact physical record + on a B-tree leaf page */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + const page_t* page = page_align(rec); + ulint n_ext = 0; + ulint i; + ulint left; + ulint heap_no; + ulint n_recs = page_get_n_recs(page_zip->data); + + ut_ad(page_is_leaf(page)); + ut_ad(page_is_comp(page)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_ibuf(index)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + left = heap_no - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(!left)) { + return(0); + } + + for (i = 0; i < n_recs; i++) { + const rec_t* r = page + (page_zip_dir_get(page_zip, i) + & PAGE_ZIP_DIR_SLOT_MASK); + + if (rec_get_heap_no_new(r) < heap_no) { + n_ext += rec_get_n_extern_new(r, index, + ULINT_UNDEFINED); + if (!--left) { + break; + } + } + } + + return(n_ext); +} + +/**********************************************************************//** +Encode the length of a fixed-length column. +@return buf + length of encoded val */ +static +byte* +page_zip_fixed_field_encode( +/*========================*/ + byte* buf, /*!< in: pointer to buffer where to write */ + ulint val) /*!< in: value to write */ +{ + ut_ad(val >= 2); + + if (UNIV_LIKELY(val < 126)) { + /* + 0 = nullable variable field of at most 255 bytes length; + 1 = not null variable field of at most 255 bytes length; + 126 = nullable variable field with maximum length >255; + 127 = not null variable field with maximum length >255 + */ + *buf++ = (byte) val; + } else { + *buf++ = (byte) (0x80 | val >> 8); + *buf++ = (byte) val; + } + + return(buf); +} + +/**********************************************************************//** +Write the index information for the compressed page. 
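As a worked instance of the encoding documented above and emitted by the function that follows (the column layout is hypothetical, chosen for illustration):

	/* Assume a non-leaf page (trx_id_pos == ULINT_UNDEFINED) of an
	index whose unique prefix is (CHAR(4) NOT NULL, VARCHAR(200) NULL,
	BIGINT NULL). page_zip_fields_encode() would then write

		0x09	run of 4 fixed not-null bytes	(4 << 1 | 1)
		0x00	nullable variable field, max <= 255 bytes
		0x10	8-byte fixed nullable field	(8 << 1)
		0x02	n_nullable, written because
			trx_id_pos == ULINT_UNDEFINED

	for a total of 4 bytes, within the (n + 1) * 2 = 8 byte bound. */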
+@return used size of buf */ +static +ulint +page_zip_fields_encode( +/*===================*/ + ulint n, /*!< in: number of fields to compress */ + dict_index_t* index, /*!< in: index comprising at least n fields */ + ulint trx_id_pos,/*!< in: position of the trx_id column + in the index, or ULINT_UNDEFINED if + this is a non-leaf page */ + byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */ +{ + const byte* buf_start = buf; + ulint i; + ulint col; + ulint trx_id_col = 0; + /* sum of lengths of preceding non-nullable fixed fields, or 0 */ + ulint fixed_sum = 0; + + ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n); + + for (i = col = 0; i < n; i++) { + dict_field_t* field = dict_index_get_nth_field(index, i); + ulint val; + + if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) { + val = 1; /* set the "not nullable" flag */ + } else { + val = 0; /* nullable field */ + } + + if (!field->fixed_len) { + /* variable-length field */ + const dict_col_t* column + = dict_field_get_col(field); + + if (UNIV_UNLIKELY(column->len > 255) + || UNIV_UNLIKELY(column->mtype == DATA_BLOB)) { + val |= 0x7e; /* max > 255 bytes */ + } + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + *buf++ = (byte) val; + col++; + } else if (val) { + /* fixed-length non-nullable field */ + + if (fixed_sum && UNIV_UNLIKELY + (fixed_sum + field->fixed_len + > DICT_MAX_FIXED_COL_LEN)) { + /* Write out the length of the + preceding non-nullable fields, + to avoid exceeding the maximum + length of a fixed-length column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + if (i && UNIV_UNLIKELY(i == trx_id_pos)) { + if (fixed_sum) { + /* Write out the length of any + preceding non-nullable fields, + and start a new trx_id column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + col++; + } + + trx_id_col = col; + fixed_sum = field->fixed_len; + } else { + /* add to the sum */ + fixed_sum += field->fixed_len; + } + } else { + /* fixed-length nullable field */ + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + buf = page_zip_fixed_field_encode( + buf, field->fixed_len << 1); + col++; + } + } + + if (fixed_sum) { + /* Write out the lengths of last fixed-length columns. */ + buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1); + } + + if (trx_id_pos != ULINT_UNDEFINED) { + /* Write out the position of the trx_id column */ + i = trx_id_col; + } else { + /* Write out the number of nullable fields */ + i = index->n_nullable; + } + + if (i < 128) { + *buf++ = (byte) i; + } else { + *buf++ = (byte) (0x80 | i >> 8); + *buf++ = (byte) i; + } + + ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2); + return((ulint) (buf - buf_start)); +} + +/**********************************************************************//** +Populate the dense page directory from the sparse directory. 
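Each dense slot written by the function that follows packs a record offset together with two flag bits. A sketch of the packing, with the PAGE_ZIP_DIR_SLOT_* values assumed to be 0x3fff, 0x4000 and 0x8000 as implied by the compile-time checks in this function:

	#include <stdint.h>

	enum {
		DIR_SLOT_MASK  = 0x3fff, /* record offset within the page */
		DIR_SLOT_OWNED = 0x4000, /* rec owns a sparse dir slot */
		DIR_SLOT_DEL   = 0x8000	 /* rec is delete-marked */
	};

	static uint16_t
	make_slot(unsigned offs, int owned, int deleted)
	{
		uint16_t slot = (uint16_t) (offs & DIR_SLOT_MASK);

		if (owned) {
			slot |= DIR_SLOT_OWNED;
		}
		if (deleted) {
			slot |= DIR_SLOT_DEL;
		}

		return(slot);
	}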
*/ +static +void +page_zip_dir_encode( +/*================*/ + const page_t* page, /*!< in: compact page */ + byte* buf, /*!< in: pointer to dense page directory[-1]; + out: dense directory on compressed page */ + const rec_t** recs) /*!< in: pointer to an array of 0, or NULL; + out: dense page directory sorted by ascending + address (and heap_no) */ +{ + const byte* rec; + ulint status; + ulint min_mark; + ulint heap_no; + ulint i; + ulint n_heap; + ulint offs; + + min_mark = 0; + + if (page_is_leaf(page)) { + status = REC_STATUS_ORDINARY; + } else { + status = REC_STATUS_NODE_PTR; + if (UNIV_UNLIKELY + (mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)) { + min_mark = REC_INFO_MIN_REC_FLAG; + } + } + + n_heap = page_dir_get_n_heap(page); + + /* Traverse the list of stored records in the collation order, + starting from the first user record. */ + + rec = page + PAGE_NEW_INFIMUM; + + i = 0; + + for (;;) { + ulint info_bits; + offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) { + break; + } + rec = page + offs; + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR); + ut_a(offs >= PAGE_ZIP_START); +#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) +# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2" +#endif +#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1 +# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1" +#endif + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + offs |= PAGE_ZIP_DIR_SLOT_OWNED; + } + + info_bits = rec_get_info_bits(rec, TRUE); + if (info_bits & REC_INFO_DELETED_FLAG) { + info_bits &= ~REC_INFO_DELETED_FLAG; + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + ut_a(info_bits == min_mark); + /* Only the smallest user record can have + REC_INFO_MIN_REC_FLAG set. */ + min_mark = 0; + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + ut_a(rec_get_status(rec) == status); + } + + offs = page_header_get_field(page, PAGE_FREE); + + /* Traverse the free list (of deleted records). */ + while (offs) { + ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK)); + rec = page + offs; + + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + + ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */ + ut_a(rec_get_status(rec) == status); + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + offs = rec_get_next_offs(rec, TRUE); + } + + /* Ensure that each heap no occurs at least once. */ + ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap); +} + +extern "C" { + +/**********************************************************************//** +Allocate memory for zlib. 
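zlib invokes zalloc(opaque, items, size) to obtain items * size bytes and zfree() to release them. The callbacks below satisfy that contract by bump-allocating from a mem_heap_t arena and making free a no-op, which is valid because the whole heap is discarded after deflateEnd()/inflateEnd(). A standalone analogue using the C runtime (my_zalloc/my_zfree are hypothetical names):

	#include <stdlib.h>
	#include <zlib.h>

	static voidpf
	my_zalloc(voidpf opaque, uInt items, uInt size)
	{
		(void) opaque;
		return(calloc(items, size)); /* zeroed, like mem_heap_zalloc() */
	}

	static void
	my_zfree(voidpf opaque, voidpf address)
	{
		(void) opaque;
		free(address);
	}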
*/ +static +void* +page_zip_zalloc( +/*============*/ + void* opaque, /*!< in/out: memory heap */ + uInt items, /*!< in: number of items to allocate */ + uInt size) /*!< in: size of an item in bytes */ +{ + return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size)); +} + +/**********************************************************************//** +Deallocate memory for zlib. */ +static +void +page_zip_free( +/*==========*/ + void* opaque __attribute__((unused)), /*!< in: memory heap */ + void* address __attribute__((unused)))/*!< in: object to free */ +{ +} + +} /* extern "C" */ + +/**********************************************************************//** +Configure the zlib allocator to use the given memory heap. */ +UNIV_INTERN +void +page_zip_set_alloc( +/*===============*/ + void* stream, /*!< in/out: zlib stream */ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + z_stream* strm = static_cast<z_stream*>(stream); + + strm->zalloc = page_zip_zalloc; + strm->zfree = page_zip_free; + strm->opaque = heap; +} + +#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/** Symbol for enabling compression and decompression diagnostics */ +# define PAGE_ZIP_COMPRESS_DBG +#endif + +#ifdef PAGE_ZIP_COMPRESS_DBG +/** Set this variable in a debugger to enable +excessive logging in page_zip_compress(). */ +UNIV_INTERN ibool page_zip_compress_dbg; +/** Set this variable in a debugger to enable +binary logging of the data passed to deflate(). +When this variable is nonzero, it will act +as a log file name generator. */ +UNIV_INTERN unsigned page_zip_compress_log; + +/**********************************************************************//** +Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set. +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +static +int +page_zip_compress_deflate( +/*======================*/ + FILE* logfile,/*!< in: log file, or NULL */ + z_streamp strm, /*!< in/out: compressed stream for deflate() */ + int flush) /*!< in: deflate() flushing method */ +{ + int status; + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ut_print_buf(stderr, strm->next_in, strm->avail_in); + } + if (UNIV_LIKELY_NULL(logfile)) { + fwrite(strm->next_in, 1, strm->avail_in, logfile); + } + status = deflate(strm, flush); + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, " -> %d\n", status); + } + return(status); +} + +/* Redefine deflate(). */ +# undef deflate +/** Debug wrapper for the zlib compression routine deflate(). +Log the operation if page_zip_compress_dbg is set. +@param strm in/out: compressed stream +@param flush in: flushing method +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush) +/** Declaration of the logfile parameter */ +# define FILE_LOGFILE FILE* logfile, +/** The logfile parameter */ +# define LOGFILE logfile, +#else /* PAGE_ZIP_COMPRESS_DBG */ +/** Empty declaration of the logfile parameter */ +# define FILE_LOGFILE +/** Missing logfile parameter */ +# define LOGFILE +#endif /* PAGE_ZIP_COMPRESS_DBG */ + +/**********************************************************************//** +Compress the records of a node pointer page. 
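The three record compressors that follow all share one zlib idiom: aim next_in at the next byte range, call deflate(Z_NO_FLUSH), and rely on a sufficiently large avail_out so that the input is fully consumed. A minimal sketch of that single step (compress_chunk() is a hypothetical helper, not part of the source):

	#include <zlib.h>

	static int
	compress_chunk(z_stream* strm, const Bytef* chunk_end)
	{
		strm->avail_in = (uInt) (chunk_end - strm->next_in);

		if (strm->avail_in > 0) {
			int err = deflate(strm, Z_NO_FLUSH);
			if (err != Z_OK) {
				return(err);
			}
		}

		/* With enough output space, Z_NO_FLUSH consumes all input;
		the callers below assert exactly this. */
		return(strm->avail_in == 0 ? Z_OK : Z_BUF_ERROR);
	}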
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_node_ptrs( +/*========================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + ulint* offsets = NULL; + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + /* Only leaf nodes may contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + ut_ad(!c_stream->avail_in); + + /* Compress the data bytes, except node_ptr. */ + c_stream->next_in = (byte*) rec; + c_stream->avail_in = static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + + memcpy(storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, REC_NODE_PTR_SIZE); + c_stream->next_in += REC_NODE_PTR_SIZE; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress the records of a leaf node of a secondary index. +@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_sec( +/*==================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense) /*!< in: size of recs[] */ +{ + int err = Z_OK; + + ut_ad(n_dense > 0); + + do { + const rec_t* rec = *recs++; + + /* Compress everything up to this record. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (UNIV_LIKELY(c_stream->avail_in)) { + UNIV_MEM_ASSERT_RW(c_stream->next_in, + c_stream->avail_in); + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + + c_stream->next_in = (byte*) rec; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress a record of a leaf node of a clustered index that contains +externally stored columns. 
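The uncompressed data that this function maintains lives at the tail of the compressed page: the dense directory at the very end, the per-record DB_TRX_ID/DB_ROLL_PTR array directly below it, and the BLOB pointer region growing further down. A sketch of where the BLOB pointers begin, with sizes assumed to be 2-byte slots, 6-byte DB_TRX_ID and 7-byte DB_ROLL_PTR:

	static unsigned long
	externs_start(unsigned long zip_size, unsigned long n_dense)
	{
		return(zip_size
		       - n_dense * 2		/* dense directory slots */
		       - n_dense * (6 + 7));	/* trx_id + roll_ptr array */
	}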
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust_ext(
+/*========================*/
+	FILE_LOGFILE
+	z_stream*	c_stream,	/*!< in/out: compressed page stream */
+	const rec_t*	rec,		/*!< in: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+	byte*		deleted,	/*!< in: dense directory entry pointing
+					to the head of the free list */
+	byte*		storage,	/*!< in: end of dense page directory */
+	byte**		externs,	/*!< in/out: pointer to the next
+					available BLOB pointer */
+	ulint*		n_blobs)	/*!< in/out: number of
+					externally stored columns */
+{
+	int	err;
+	ulint	i;
+
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		ulint		len;
+		const byte*	src;
+
+		if (UNIV_UNLIKELY(i == trx_id_col)) {
+			ut_ad(!rec_offs_nth_extern(offsets, i));
+			/* Store trx_id and roll_ptr
+			in uncompressed form. */
+			src = rec_get_nth_field(rec, offsets, i, &len);
+			ut_ad(src + DATA_TRX_ID_LEN
+			      == rec_get_nth_field(rec, offsets,
+						   i + 1, &len));
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+
+			/* Compress any preceding bytes. */
+			c_stream->avail_in = static_cast<uInt>(
+				src - c_stream->next_in);
+
+			if (c_stream->avail_in) {
+				err = deflate(c_stream, Z_NO_FLUSH);
+				if (UNIV_UNLIKELY(err != Z_OK)) {
+
+					return(err);
+				}
+			}
+
+			ut_ad(!c_stream->avail_in);
+			ut_ad(c_stream->next_in == src);
+
+			memcpy(storage
+			       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+			       * (rec_get_heap_no_new(rec) - 1),
+			       c_stream->next_in,
+			       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+			c_stream->next_in
+				+= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+			/* Skip also roll_ptr */
+			i++;
+		} else if (rec_offs_nth_extern(offsets, i)) {
+			src = rec_get_nth_field(rec, offsets, i, &len);
+			ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+			src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+			c_stream->avail_in = static_cast<uInt>(
+				src - c_stream->next_in);
+			if (UNIV_LIKELY(c_stream->avail_in)) {
+				err = deflate(c_stream, Z_NO_FLUSH);
+				if (UNIV_UNLIKELY(err != Z_OK)) {
+
+					return(err);
+				}
+			}
+
+			ut_ad(!c_stream->avail_in);
+			ut_ad(c_stream->next_in == src);
+
+			/* Reserve space for the data at
+			the end of the space reserved for
+			the compressed data and the page
+			modification log. */
+
+			if (UNIV_UNLIKELY
+			    (c_stream->avail_out
+			     <= BTR_EXTERN_FIELD_REF_SIZE)) {
+				/* out of space */
+				return(Z_BUF_ERROR);
+			}
+
+			ut_ad(*externs == c_stream->next_out
+			      + c_stream->avail_out
+			      + 1/* end of modif. log */);
+
+			c_stream->next_in
+				+= BTR_EXTERN_FIELD_REF_SIZE;
+
+			/* Skip deleted records. */
+			if (UNIV_LIKELY_NULL
+			    (page_zip_dir_find_low(
+				    storage, deleted,
+				    page_offset(rec)))) {
+				continue;
+			}
+
+			(*n_blobs)++;
+			c_stream->avail_out
+				-= BTR_EXTERN_FIELD_REF_SIZE;
+			*externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+			/* Copy the BLOB pointer */
+			memcpy(*externs, c_stream->next_in
+			       - BTR_EXTERN_FIELD_REF_SIZE,
+			       BTR_EXTERN_FIELD_REF_SIZE);
+		}
+	}
+
+	return(Z_OK);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
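Within that uncompressed trailer, each record's DB_TRX_ID and DB_ROLL_PTR are filed by heap number, 13 bytes per record below the storage pointer. A sketch of the address computation used by the function below (sizes assumed as above; the first user record has heap_no 2):

	static unsigned char*
	trx_rbp_slot(unsigned char* storage, unsigned heap_no)
	{
		/* 6-byte DB_TRX_ID + 7-byte DB_ROLL_PTR per record */
		return(storage - (6 + 7) * (heap_no - 1));
	}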
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_clust( +/*====================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* n_blobs, /*!< in: 0; out: number of + externally stored columns */ + ulint trx_id_col, /*!< index of the trx_id column */ + byte* deleted, /*!< in: dense directory entry pointing + to the head of the free list */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + ulint* offsets = NULL; + /* BTR_EXTERN_FIELD_REF storage */ + byte* externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(*n_blobs == 0); + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) + == dict_index_get_n_fields(index)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Compress the data bytes. */ + + c_stream->next_in = (byte*) rec; + + /* Check if there are any externally stored columns. + For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately. */ + if (rec_offs_any_extern(offsets)) { + ut_ad(dict_index_is_clust(index)); + + err = page_zip_compress_clust_ext( + LOGFILE + c_stream, rec, offsets, trx_id_col, + deleted, storage, &externs, n_blobs); + + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } else { + ulint len; + const byte* src; + + /* Store trx_id and roll_ptr in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress any preceding bytes. */ + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets)); + } + + /* Compress the last bytes of the record. 
*/ + c_stream->avail_in = static_cast<uInt>( + rec + rec_offs_data_size(offsets) - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + } while (--n_dense); + +func_exit: + return(err); +} + +/**********************************************************************//** +Compress a page. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure. */ +UNIV_INTERN +ibool +page_zip_compress( +/*==============*/ + page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs, + m_start, m_end, m_nonempty */ + const page_t* page, /*!< in: uncompressed page */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: compression level */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ +{ + z_stream c_stream; + int err; + ulint n_fields;/* number of index fields needed */ + byte* fields; /*!< index field information */ + byte* buf; /*!< compressed payload of the page */ + byte* buf_end;/* end of buf */ + ulint n_dense; + ulint slot_size;/* amount of uncompressed bytes per record */ + const rec_t** recs; /*!< dense page directory, sorted by address */ + mem_heap_t* heap; + ulint trx_id_col; + ulint n_blobs = 0; + byte* storage;/* storage of uncompressed columns */ +#ifndef UNIV_HOTBACKUP + ullint usec = ut_time_us(NULL); +#endif /* !UNIV_HOTBACKUP */ +#ifdef PAGE_ZIP_COMPRESS_DBG + FILE* logfile = NULL; +#endif + /* A local copy of srv_cmp_per_index_enabled to avoid reading that + variable multiple times in this function since it can be changed at + anytime. */ + my_bool cmp_per_index_enabled = srv_cmp_per_index_enabled; + + if (!page) { + return(FALSE); + } + + ut_a(page_is_comp(page)); + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(!dict_index_is_ibuf(index)); + + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + /* Check the data that will be omitted. */ + ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra)); + ut_a(!memcmp(page + PAGE_NEW_INFIMUM, + infimum_data, sizeof infimum_data)); + ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] + /* info_bits == 0, n_owned <= max */ + <= PAGE_DIR_SLOT_MAX_N_OWNED); + ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data)); + + if (page_is_empty(page)) { + ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE) + == PAGE_NEW_SUPREMUM); + } + + if (page_is_leaf(page)) { + n_fields = dict_index_get_n_fields(index); + } else { + n_fields = dict_index_get_n_unique_in_tree(index); + } + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; +#ifdef PAGE_ZIP_COMPRESS_DBG + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, "compress %p %p %lu %lu %lu\n", + (void*) page_zip, (void*) page, + (ibool) page_is_leaf(page), + n_fields, n_dense); + } + if (UNIV_UNLIKELY(page_zip_compress_log)) { + /* Create a log file for every compression attempt. */ + char logfilename[9]; + ut_snprintf(logfilename, sizeof logfilename, + "%08x", page_zip_compress_log++); + logfile = fopen(logfilename, "wb"); + + if (logfile) { + /* Write the uncompressed page to the log. 
*/ + fwrite(page, 1, UNIV_PAGE_SIZE, logfile); + /* Record the compressed size as zero. + This will be overwritten at successful exit. */ + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + } + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ +#ifndef UNIV_HOTBACKUP + page_zip_stat[page_zip->ssize - 1].compressed++; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed++; + mutex_exit(&page_zip_stat_per_index_mutex); + } +#endif /* !UNIV_HOTBACKUP */ + + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + + goto err_exit; + } + + MONITOR_INC(MONITOR_PAGE_COMPRESS); + + heap = mem_heap_create(page_zip_get_size(page_zip) + + n_fields * (2 + sizeof(ulint)) + + REC_OFFS_HEADER_SIZE + + n_dense * ((sizeof *recs) + - PAGE_ZIP_DIR_SLOT_SIZE) + + UNIV_PAGE_SIZE * 4 + + (512 << MAX_MEM_LEVEL)); + + recs = static_cast<const rec_t**>( + mem_heap_zalloc(heap, n_dense * sizeof *recs)); + + fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2)); + + buf = static_cast<byte*>( + mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA)); + + buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA; + + /* Compress the data payload. */ + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, static_cast<int>(level), + Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT, + MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + + c_stream.next_out = buf; + /* Subtract the space reserved for uncompressed data. */ + /* Page header and the end marker of the modification log */ + c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1); + + /* Dense page directory and uncompressed columns, if any */ + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + trx_id_col = dict_index_get_sys_col_pos( + index, DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + /* Signal the absence of trx_id + in page_zip_fields_encode() */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + trx_id_col = 0; + slot_size = PAGE_ZIP_DIR_SLOT_SIZE; + } + } else { + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + trx_id_col = ULINT_UNDEFINED; + } + + if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size + + 6/* sizeof(zlib header and footer) */)) { + goto zlib_error; + } + + c_stream.avail_out -= static_cast<uInt>(n_dense * slot_size); + c_stream.avail_in = static_cast<uInt>( + page_zip_fields_encode(n_fields, index, trx_id_col, fields)); + c_stream.next_in = fields; + if (UNIV_LIKELY(!trx_id_col)) { + trx_id_col = ULINT_UNDEFINED; + } + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FULL_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + ut_ad(!c_stream.avail_in); + + page_zip_dir_encode(page, buf_end, recs); + + c_stream.next_in = (byte*) page + PAGE_ZIP_START; + + storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + /* Compress the records in heap_no order. */ + if (UNIV_UNLIKELY(!n_dense)) { + } else if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + err = page_zip_compress_node_ptrs(LOGFILE + &c_stream, recs, n_dense, + index, storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. 
*/ + err = page_zip_compress_sec(LOGFILE + &c_stream, recs, n_dense); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else { + /* This is a leaf page in a clustered index. */ + err = page_zip_compress_clust(LOGFILE + &c_stream, recs, n_dense, + index, &n_blobs, trx_id_col, + buf_end - PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page), + storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } + + /* Finish the compression. */ + ut_ad(!c_stream.avail_in); + /* Compress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list, + or the data of the last record from page_zip_compress_sec(). */ + c_stream.avail_in = static_cast<uInt>( + page_header_get_field(page, PAGE_HEAP_TOP) + - (c_stream.next_in - page)); + ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR); + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FINISH); + + if (UNIV_UNLIKELY(err != Z_STREAM_END)) { +zlib_error: + deflateEnd(&c_stream); + mem_heap_free(heap); +err_exit: +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ +#ifndef UNIV_HOTBACKUP + if (page_is_leaf(page)) { + dict_index_zip_failure(index); + } + + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].compressed_usec + += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_usec + += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } +#endif /* !UNIV_HOTBACKUP */ + return(FALSE); + } + + err = deflateEnd(&c_stream); + ut_a(err == Z_OK); + + ut_ad(buf + c_stream.total_out == c_stream.next_out); + ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out); + + /* Valgrind believes that zlib does not initialize some bits + in the last 7 or 8 bytes of the stream. Make Valgrind happy. */ + UNIV_MEM_VALID(buf, c_stream.total_out); + + /* Zero out the area reserved for the modification log. + Space for the end marker of the modification log is not + included in avail_out. */ + memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */); + +#ifdef UNIV_DEBUG + page_zip->m_start = +#endif /* UNIV_DEBUG */ + page_zip->m_end = PAGE_DATA + c_stream.total_out; + page_zip->m_nonempty = FALSE; + page_zip->n_blobs = n_blobs; + /* Copy those header fields that will not be written + in buf_flush_init_for_writing() */ + memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV); + memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2); + memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA); + /* Copy the rest of the compressed page */ + memcpy(page_zip->data + PAGE_DATA, buf, + page_zip_get_size(page_zip) - PAGE_DATA); + mem_heap_free(heap); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (mtr) { +#ifndef UNIV_HOTBACKUP + page_zip_compress_write_log(page_zip, page, index, mtr); +#endif /* !UNIV_HOTBACKUP */ + } + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + /* Record the compressed size of the block. 
*/ + byte sz[4]; + mach_write_to_4(sz, c_stream.total_out); + fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET); + fwrite(sz, 1, sizeof sz, logfile); + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ +#ifndef UNIV_HOTBACKUP + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].compressed_ok++; + page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_ok++; + page_zip_stat_per_index[index->id].compressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } + + if (page_is_leaf(page)) { + dict_index_zip_success(index); + } +#endif /* !UNIV_HOTBACKUP */ + + return(TRUE); +} + +/**********************************************************************//** +Compare two page directory entries. +@return positive if rec1 > rec2 */ +UNIV_INLINE +ibool +page_zip_dir_cmp( +/*=============*/ + const rec_t* rec1, /*!< in: rec1 */ + const rec_t* rec2) /*!< in: rec2 */ +{ + return(rec1 > rec2); +} + +/**********************************************************************//** +Sort the dense page directory by address (heap_no). */ +static +void +page_zip_dir_sort( +/*==============*/ + rec_t** arr, /*!< in/out: dense page directory */ + rec_t** aux_arr,/*!< in/out: work area */ + ulint low, /*!< in: lower bound of the sorting area, inclusive */ + ulint high) /*!< in: upper bound of the sorting area, exclusive */ +{ + UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high, + page_zip_dir_cmp); +} + +/**********************************************************************//** +Deallocate the index information initialized by page_zip_fields_decode(). */ +static +void +page_zip_fields_free( +/*=================*/ + dict_index_t* index) /*!< in: dummy index to be freed */ +{ + if (index) { + dict_table_t* table = index->table; + os_fast_mutex_free(&index->zip_pad.mutex); + mem_heap_free(index->heap); + + dict_mem_table_free(table); + } +} + +/**********************************************************************//** +Read the index information for the compressed page. +@return own: dummy index describing the page, or NULL on error */ +static +dict_index_t* +page_zip_fields_decode( +/*===================*/ + const byte* buf, /*!< in: index information */ + const byte* end, /*!< in: end of buf */ + ulint* trx_id_col)/*!< in: NULL for non-leaf pages; + for leaf pages, pointer to where to store + the position of the trx_id column */ +{ + const byte* b; + ulint n; + ulint i; + ulint val; + dict_table_t* table; + dict_index_t* index; + + /* Determine the number of fields. */ + for (b = buf, n = 0; b < end; n++) { + if (*b++ & 0x80) { + b++; /* skip the second byte */ + } + } + + n--; /* n_nullable or trx_id */ + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) { + + page_zip_fail(("page_zip_fields_decode: n = %lu\n", + (ulong) n)); + return(NULL); + } + + if (UNIV_UNLIKELY(b > end)) { + + page_zip_fail(("page_zip_fields_decode: %p > %p\n", + (const void*) b, (const void*) end)); + return(NULL); + } + + table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, + DICT_TF_COMPACT, 0, true); + index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + index->n_uniq = n; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + /* Initialize the fields. 
*/ + for (b = buf, i = 0; i < n; i++) { + ulint mtype; + ulint len; + + val = *b++; + + if (UNIV_UNLIKELY(val & 0x80)) { + /* fixed length > 62 bytes */ + val = (val & 0x7f) << 8 | *b++; + len = val >> 1; + mtype = DATA_FIXBINARY; + } else if (UNIV_UNLIKELY(val >= 126)) { + /* variable length with max > 255 bytes */ + len = 0x7fff; + mtype = DATA_BINARY; + } else if (val <= 1) { + /* variable length with max <= 255 bytes */ + len = 0; + mtype = DATA_BINARY; + } else { + /* fixed length < 62 bytes */ + len = val >> 1; + mtype = DATA_FIXBINARY; + } + + dict_mem_table_add_col(table, NULL, NULL, mtype, + val & 1 ? DATA_NOT_NULL : 0, len); + dict_index_add_col(index, table, + dict_table_get_nth_col(table, i), 0); + } + + val = *b++; + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 8 | *b++; + } + + /* Decode the position of the trx_id column. */ + if (trx_id_col) { + if (!val) { + val = ULINT_UNDEFINED; + } else if (UNIV_UNLIKELY(val >= n)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->type = DICT_CLUSTERED; + } + + *trx_id_col = val; + } else { + /* Decode the number of nullable fields. */ + if (UNIV_UNLIKELY(index->n_nullable > val)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->n_nullable = val; + } + } + + ut_ad(b == end); + + return(index); +} + +/**********************************************************************//** +Populate the sparse page directory from the dense directory. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_dir_decode( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + page_t* page, /*!< in: compact page with valid header; + out: trailer and sparse page directory + filled in */ + rec_t** recs, /*!< out: dense page directory sorted by + ascending address (and heap_no) */ + rec_t** recs_aux,/*!< in/out: scratch area */ + ulint n_dense)/*!< in: number of user records, and + size of recs[] and recs_aux[] */ +{ + ulint i; + ulint n_recs; + byte* slot; + + n_recs = page_get_n_recs(page); + + if (UNIV_UNLIKELY(n_recs > n_dense)) { + page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n", + (ulong) n_recs, (ulong) n_dense)); + return(FALSE); + } + + /* Traverse the list of stored records in the sorting order, + starting from the first user record. */ + + slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE); + UNIV_PREFETCH_RW(slot); + + /* Zero out the page trailer. */ + memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR); + + mach_write_to_2(slot, PAGE_NEW_INFIMUM); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + + /* Initialize the sparse directory and copy the dense directory. 
*/ + for (i = 0; i < n_recs; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_OWNED) { + mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + } + + if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK) + < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n", + (unsigned) i, (unsigned) n_recs, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK); + } + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + { + const page_dir_slot_t* last_slot = page_dir_get_nth_slot( + page, page_dir_get_n_slots(page) - 1); + + if (UNIV_UNLIKELY(slot != last_slot)) { + page_zip_fail(("page_zip_dir_decode 3: %p != %p\n", + (const void*) slot, + (const void*) last_slot)); + return(FALSE); + } + } + + /* Copy the rest of the dense directory. */ + for (; i < n_dense; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n", + (unsigned) i, (unsigned) n_dense, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + offs; + } + + if (UNIV_LIKELY(n_dense > 1)) { + page_zip_dir_sort(recs, recs_aux, 0, n_dense); + } + return(TRUE); +} + +/**********************************************************************//** +Initialize the REC_N_NEW_EXTRA_BYTES of each record. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_set_extra_bytes( +/*=====================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + page_t* page, /*!< in/out: uncompressed page */ + ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */ +{ + ulint n; + ulint i; + ulint n_owned = 1; + ulint offs; + rec_t* rec; + + n = page_get_n_recs(page); + rec = page + PAGE_NEW_INFIMUM; + + for (i = 0; i < n; i++) { + offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_DEL) { + info_bits |= REC_INFO_DELETED_FLAG; + } + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) { + info_bits |= n_owned; + n_owned = 1; + } else { + n_owned++; + } + offs &= PAGE_ZIP_DIR_SLOT_MASK; + if (UNIV_UNLIKELY(offs < PAGE_ZIP_START + + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_set_extra_bytes 1:" + " %u %u %lx\n", + (unsigned) i, (unsigned) n, + (ulong) offs)); + return(FALSE); + } + + rec_set_next_offs_new(rec, offs); + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits; + info_bits = 0; + } + + /* Set the next pointer of the last user record. */ + rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM); + + /* Set n_owned of the supremum record. */ + page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned; + + /* The dense directory excludes the infimum and supremum records. */ + n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; + + if (i >= n) { + if (UNIV_LIKELY(i == n)) { + return(TRUE); + } + + page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n", + (unsigned) i, (unsigned) n)); + return(FALSE); + } + + offs = page_zip_dir_get(page_zip, i); + + /* Set the extra bytes of deleted records on the free list. 
*/
+	for (;;) {
+		if (UNIV_UNLIKELY(!offs)
+		    || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+			page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+				       (ulong) offs));
+			return(FALSE);
+		}
+
+		rec = page + offs;
+		rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+		if (++i == n) {
+			break;
+		}
+
+		offs = page_zip_dir_get(page_zip, i);
+		rec_set_next_offs_new(rec, offs);
+	}
+
+	/* Terminate the free list. */
+	rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+	rec_set_next_offs_new(rec, 0);
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns. Do not copy the fields that are stored separately.
+@return pointer to modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+	rec_t*		rec,		/*!< in/out: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+	const byte*	data,		/*!< in: modification log */
+	const byte*	end)		/*!< in: end of modification log */
+{
+	ulint	i;
+	ulint	len;
+	byte*	next_out = rec;
+
+	/* Check if there are any externally stored columns.
+	For each externally stored column, skip the
+	BTR_EXTERN_FIELD_REF. */
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		byte*	dst;
+
+		if (UNIV_UNLIKELY(i == trx_id_col)) {
+			/* Skip trx_id and roll_ptr */
+			dst = rec_get_nth_field(rec, offsets,
+						i, &len);
+			if (UNIV_UNLIKELY(dst - next_out >= end - data)
+			    || UNIV_UNLIKELY
+			    (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+			    || rec_offs_nth_extern(offsets, i)) {
+				page_zip_fail(("page_zip_apply_log_ext:"
+					       " trx_id len %lu,"
+					       " %p - %p >= %p - %p\n",
+					       (ulong) len,
+					       (const void*) dst,
+					       (const void*) next_out,
+					       (const void*) end,
+					       (const void*) data));
+				return(NULL);
+			}
+
+			memcpy(next_out, data, dst - next_out);
+			data += dst - next_out;
+			next_out = dst + (DATA_TRX_ID_LEN
+					  + DATA_ROLL_PTR_LEN);
+		} else if (rec_offs_nth_extern(offsets, i)) {
+			dst = rec_get_nth_field(rec, offsets,
+						i, &len);
+			ut_ad(len
+			      >= BTR_EXTERN_FIELD_REF_SIZE);
+
+			len += dst - next_out
+				- BTR_EXTERN_FIELD_REF_SIZE;
+
+			if (UNIV_UNLIKELY(data + len >= end)) {
+				page_zip_fail(("page_zip_apply_log_ext: "
+					       "ext %p+%lu >= %p\n",
+					       (const void*) data,
+					       (ulong) len,
+					       (const void*) end));
+				return(NULL);
+			}
+
+			memcpy(next_out, data, len);
+			data += len;
+			next_out += len
+				+ BTR_EXTERN_FIELD_REF_SIZE;
+		}
+	}
+
+	/* Copy the last bytes of the record. */
+	len = rec_get_end(rec, offsets) - next_out;
+	if (UNIV_UNLIKELY(data + len >= end)) {
+		page_zip_fail(("page_zip_apply_log_ext: "
+			       "last %p+%lu >= %p\n",
+			       (const void*) data,
+			       (ulong) len,
+			       (const void*) end));
+		return(NULL);
+	}
+	memcpy(next_out, data, len);
+	data += len;
+
+	return(data);
+}
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
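Each entry of that log begins with a one- or two-byte header encoding (heap_no - 1) << 1 | delete_flag; a single zero byte terminates the log, and the two-byte form is flagged by 0x80 in the first byte. A standalone sketch of the header parse performed by the function below (read_log_head() is a hypothetical helper):

	#include <stdint.h>

	static const uint8_t*
	read_log_head(const uint8_t* data, unsigned* heap_no, int* is_delete)
	{
		unsigned val = *data++;

		if (val == 0) {
			return(NULL);	/* end of the modification log */
		}

		if (val & 0x80) {	/* two-byte form */
			val = (val & 0x7f) << 8 | *data++;
		}

		*is_delete = (int) (val & 1);
		*heap_no = (val >> 1) + 1; /* recs[] index is (val >> 1) - 1 */

		return(data);
	}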
+@return pointer to end of modification log, or NULL on failure */ +static +const byte* +page_zip_apply_log( +/*===============*/ + const byte* data, /*!< in: modification log */ + ulint size, /*!< in: maximum length of the log, in bytes */ + rec_t** recs, /*!< in: dense page directory, + sorted by address (indexed by + heap_no - PAGE_HEAP_NO_USER_LOW) */ + ulint n_dense,/*!< in: size of recs[] */ + ulint trx_id_col,/*!< in: column number of trx_id in the index, + or ULINT_UNDEFINED if none */ + ulint heap_status, + /*!< in: heap_no and status bits for + the next record to uncompress */ + dict_index_t* index, /*!< in: index of the page */ + ulint* offsets)/*!< in/out: work area for + rec_get_offsets_reverse() */ +{ + const byte* const end = data + size; + + for (;;) { + ulint val; + rec_t* rec; + ulint len; + ulint hs; + + val = *data++; + if (UNIV_UNLIKELY(!val)) { + return(data - 1); + } + if (val & 0x80) { + val = (val & 0x7f) << 8 | *data++; + if (UNIV_UNLIKELY(!val)) { + page_zip_fail(("page_zip_apply_log:" + " invalid val %x%x\n", + data[-2], data[-1])); + return(NULL); + } + } + if (UNIV_UNLIKELY(data >= end)) { + page_zip_fail(("page_zip_apply_log: %p >= %p\n", + (const void*) data, + (const void*) end)); + return(NULL); + } + if (UNIV_UNLIKELY((val >> 1) > n_dense)) { + page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n", + (ulong) val, (ulong) n_dense)); + return(NULL); + } + + /* Determine the heap number and status bits of the record. */ + rec = recs[(val >> 1) - 1]; + + hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT; + hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1); + + /* This may either be an old record that is being + overwritten (updated in place, or allocated from + the free list), or a new record, with the next + available_heap_no. */ + if (UNIV_UNLIKELY(hs > heap_status)) { + page_zip_fail(("page_zip_apply_log: %lu > %lu\n", + (ulong) hs, (ulong) heap_status)); + return(NULL); + } else if (hs == heap_status) { + /* A new record was allocated from the heap. */ + if (UNIV_UNLIKELY(val & 1)) { + /* Only existing records may be cleared. */ + page_zip_fail(("page_zip_apply_log:" + " attempting to create" + " deleted rec %lu\n", + (ulong) hs)); + return(NULL); + } + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + mach_write_to_2(rec - REC_NEW_HEAP_NO, hs); + + if (val & 1) { + /* Clear the data bytes of the record. */ + mem_heap_t* heap = NULL; + ulint* offs; + offs = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + memset(rec, 0, rec_offs_data_size(offs)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + continue; + } + +#if REC_STATUS_NODE_PTR != TRUE +# error "REC_STATUS_NODE_PTR != TRUE" +#endif + rec_get_offsets_reverse(data, index, + hs & REC_STATUS_NODE_PTR, + offsets); + rec_offs_make_valid(rec, index, offsets); + + /* Copy the extra bytes (backwards). */ + { + byte* start = rec_get_start(rec, offsets); + byte* b = rec - REC_N_NEW_EXTRA_BYTES; + while (b != start) { + *--b = *data++; + } + } + + /* Copy the data bytes. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + /* Non-leaf nodes should not contain any + externally stored columns. 
*/ + if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + page_zip_fail(("page_zip_apply_log: " + "%lu&REC_STATUS_NODE_PTR\n", + (ulong) hs)); + return(NULL); + } + + data = page_zip_apply_log_ext( + rec, offsets, trx_id_col, data, end); + + if (UNIV_UNLIKELY(!data)) { + return(NULL); + } + } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + len = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + /* Copy the data bytes, except node_ptr. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "node_ptr %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(rec, data, len); + data += len; + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + len = rec_offs_data_size(offsets); + + /* Copy all data bytes of + a record in a secondary index. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "sec %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(rec, data, len); + data += len; + } else { + /* Skip DB_TRX_ID and DB_ROLL_PTR. */ + ulint l = rec_get_nth_field_offs(offsets, + trx_id_col, &len); + byte* b; + + if (UNIV_UNLIKELY(data + l >= end) + || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN))) { + page_zip_fail(("page_zip_apply_log: " + "trx_id %p+%lu >= %p\n", + (const void*) data, + (ulong) l, + (const void*) end)); + return(NULL); + } + + /* Copy any preceding data bytes. */ + memcpy(rec, data, l); + data += l; + + /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */ + b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + len = rec_get_end(rec, offsets) - b; + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "clust %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(b, data, len); + data += len; + } + } +} + +/**********************************************************************//** +Set the heap_no in a record, and skip the fixed-size record header +that is not included in the d_stream. +@return TRUE on success, FALSE if d_stream does not end at rec */ +static +ibool +page_zip_decompress_heap_no( +/*========================*/ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t* rec, /*!< in/out: record */ + ulint& heap_status) /*!< in/out: heap_no and status bits */ +{ + if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) { + /* n_dense has grown since the page was last compressed. */ + return(FALSE); + } + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + d_stream->next_out = rec; + + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a node pointer page. 
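Decompression mirrors the deflate idiom sketched earlier: next_out is aimed just past each record's header and inflate(Z_SYNC_FLUSH) is called until the record's bytes have been produced, with Z_STREAM_END signalling that the compressed stream is exhausted. A minimal sketch of that step (decompress_chunk() is a hypothetical helper):

	#include <zlib.h>

	static int
	decompress_chunk(z_stream* strm, Bytef* chunk_end)
	{
		int	err;

		strm->avail_out = (uInt) (chunk_end - strm->next_out);

		err = inflate(strm, Z_SYNC_FLUSH);

		if (err == Z_STREAM_END) {
			return(err);	/* compressed stream exhausted */
		}

		if ((err == Z_OK || err == Z_BUF_ERROR)
		    && strm->avail_out == 0) {
			return(Z_OK);	/* the whole chunk was produced */
		}

		return(err == Z_OK ? Z_DATA_ERROR : err);
	}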
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_node_ptrs( +/*==========================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* offsets, /*!< in/out: temporary offsets */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + ulint heap_status = REC_STATUS_NODE_PTR + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + const byte* storage; + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uInt>( + n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE)); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Decompress the data bytes, except node_ptr. */ + d_stream->avail_out =static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + /* Clear the node pointer in case the record + will be deleted and the space will be reallocated + to a smaller record. */ + memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE); + d_stream->next_out += REC_NODE_PTR_SIZE; + + ut_ad(d_stream->next_out == rec_get_end(rec, offsets)); + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_node_ptrs:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. 
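+
+	Editor's note (not part of the original patch): once inflate()
+	has returned Z_STREAM_END, zlib has consumed the entire
+	compressed stream, so d_stream->next_in points just past it,
+	at the first header byte of the modification log, and
+	d_stream->avail_in counts the bytes left before the reserved
+	end marker; this is exactly what page_zip_apply_log() below
+	is handed.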
*/ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY + (page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index)) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " %lu + %lu >= %lu, %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, dict_index_is_clust(index)), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip), + (ulong) dict_index_is_clust(index))); + return(FALSE); + } + + /* Restore the uncompressed columns in heap_no order. */ + storage = page_zip_dir_start_low(page_zip, n_dense); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + storage -= REC_NODE_PTR_SIZE; + + memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE, + storage, REC_NODE_PTR_SIZE); + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a leaf node of a secondary index. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_sec( +/*====================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* offsets) /*!< in/out: temporary offsets */ +{ + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + + ut_a(!dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uint>( + n_dense * PAGE_ZIP_DIR_SLOT_SIZE); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + /* Decompress everything up to this record. */ + d_stream->avail_out = static_cast<uint>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + if (UNIV_LIKELY(d_stream->avail_out)) { + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + } + + /* Decompress the data of the last record and any trailing garbage, + in case the last record was allocated from an originally longer space + on the free list. 
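+
+	Editor's note (illustrative, not part of the original patch):
+	the uncompressed trailer reserved per user record differs by
+	page type, matching the avail_in adjustments made here and in
+	the sibling functions:
+
+		secondary index leaf:	2 (directory slot)
+		node pointer page:	2 + 4 (node pointer)
+		clustered index leaf:	2 + 6 + 7 (slot, DB_TRX_ID,
+					DB_ROLL_PTR)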
*/ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_sec:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, FALSE), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + /* There are no uncompressed columns on leaf pages of + secondary indexes. */ + + return(TRUE); +} + +/**********************************************************************//** +Decompress a record of a leaf node of a clustered index that contains +externally stored columns. +@return TRUE on success */ +static +ibool +page_zip_decompress_clust_ext( +/*==========================*/ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t* rec, /*!< in/out: record */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + ulint trx_id_col) /*!< in: position of of DB_TRX_ID */ +{ + ulint i; + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, i, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " len[%lu] = %lu\n", + (ulong) i, (ulong) len)); + return(FALSE); + } + + if (rec_offs_nth_extern(offsets, i)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " DB_TRX_ID at %lu is ext\n", + (ulong) i)); + return(FALSE); + } + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). 
*/ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear the BLOB pointer in case + the record will be deleted and the + space will not be reused. Note that + the final initialization of the BLOB + pointers (copying from "externs" + or clearing) will have to take place + only after the page modification log + has been applied. Otherwise, we + could end up with an uninitialized + BLOB pointer when a record is deleted, + reallocated and deleted. */ + memset(d_stream->next_out, 0, + BTR_EXTERN_FIELD_REF_SIZE); + d_stream->next_out + += BTR_EXTERN_FIELD_REF_SIZE; + } + } + + return(TRUE); +} + +/**********************************************************************//** +Compress the records of a leaf node of a clustered index. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_clust( +/*======================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint trx_id_col, /*!< index of the trx_id column */ + ulint* offsets, /*!< in/out: temporary offsets */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err; + ulint slot; + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + const byte* storage; + const byte* externs; + + ut_a(dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uInt>(n_dense) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out =static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR); + err = inflate(d_stream, Z_SYNC_FLUSH); + switch (err) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (UNIV_LIKELY(!d_stream->avail_out)) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* This is a leaf page in a clustered index. */ + + /* Check if there are any externally stored columns. + For each externally stored column, restore the + BTR_EXTERN_FIELD_REF separately. 
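+
+		Editor's note: BTR_EXTERN_FIELD_REF_SIZE is 20 bytes.
+		Assuming the usual InnoDB layout, which this hunk does
+		not restate, the reference holds the BLOB's space id
+		(4 bytes), the page number of its first page (4 bytes),
+		the byte offset within that page (4 bytes), and the
+		externally stored length together with flag bits
+		(8 bytes).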
*/ + + if (rec_offs_any_extern(offsets)) { + if (UNIV_UNLIKELY + (!page_zip_decompress_clust_ext( + d_stream, rec, offsets, trx_id_col))) { + + goto zlib_error; + } + } else { + /* Skip trx_id and roll_ptr */ + ulint len; + byte* dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust:" + " len = %lu\n", (ulong) len)); + goto zlib_error; + } + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } + + /* Decompress the last bytes of the record. */ + d_stream->avail_out = static_cast<uInt>( + rec_get_end(rec, offsets) - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 3 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_clust:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_clust:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. 
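+
+	Editor's note (reasoning spelled out; not in the original
+	patch): d_stream->avail_in was initialized to the page size
+	minus (PAGE_DATA + 1), deliberately excluding one byte for the
+	0x00 end marker of the modification log.  Passing avail_in + 1
+	below therefore lets page_zip_apply_log() read up to and
+	including that terminator without overrunning the compressed
+	page.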
*/ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + trx_id_col, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, TRUE), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + storage = page_zip_dir_start_low(page_zip, n_dense); + + externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Restore the uncompressed columns in heap_no order. */ + + for (slot = 0; slot < n_dense; slot++) { + ulint i; + ulint len; + byte* dst; + rec_t* rec = recs[slot]; + ibool exists = !page_zip_dir_find_free( + page_zip, page_offset(rec)); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + memcpy(dst, storage, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Check if there are any externally stored + columns in this record. For each externally + stored column, restore or clear the + BTR_EXTERN_FIELD_REF. */ + if (!rec_offs_any_extern(offsets)) { + continue; + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + dst = rec_get_nth_field(rec, offsets, i, &len); + + if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) { + page_zip_fail(("page_zip_decompress_clust:" + " %lu < 20\n", + (ulong) len)); + return(FALSE); + } + + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_LIKELY(exists)) { + /* Existing record: + restore the BLOB pointer */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY + (externs < page_zip->data + + page_zip->m_end)) { + page_zip_fail(("page_zip_" + "decompress_clust: " + "%p < %p + %lu\n", + (const void*) externs, + (const void*) + page_zip->data, + (ulong) + page_zip->m_end)); + return(FALSE); + } + + memcpy(dst, externs, + BTR_EXTERN_FIELD_REF_SIZE); + + page_zip->n_blobs++; + } else { + /* Deleted record: + clear the BLOB pointer */ + memset(dst, 0, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. 
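+
+Editor's note: an illustrative call sequence with a hypothetical
+buffer block variable; not part of the original patch:
+
+	page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
+	page_t*		page	 = buf_block_get_frame(block);
+
+	if (!page_zip_decompress(page_zip, page, TRUE)) {
+		(treat the compressed page as corrupted)
+	}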
+@return TRUE on success, FALSE on failure */ +UNIV_INTERN +ibool +page_zip_decompress( +/*================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ +{ + z_stream d_stream; + dict_index_t* index = NULL; + rec_t** recs; /*!< dense page directory, sorted by address */ + ulint n_dense;/* number of user records on the page */ + ulint trx_id_col = ULINT_UNDEFINED; + mem_heap_t* heap; + ulint* offsets; +#ifndef UNIV_HOTBACKUP + ullint usec = ut_time_us(NULL); +#endif /* !UNIV_HOTBACKUP */ + + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress 1: %lu %lu\n", + (ulong) n_dense, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE); + + recs = static_cast<rec_t**>( + mem_heap_alloc(heap, n_dense * (2 * sizeof *recs))); + + if (all) { + /* Copy the page header. */ + memcpy(page, page_zip->data, PAGE_DATA); + } else { + /* Check that the bytes that we skip are identical. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(FIL_PAGE_TYPE + page, + FIL_PAGE_TYPE + page_zip->data, + PAGE_HEADER - FIL_PAGE_TYPE)); + ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page, + PAGE_HEADER + PAGE_LEVEL + page_zip->data, + PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL))); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + /* Copy the mutable parts of the page header. */ + memcpy(page, page_zip->data, FIL_PAGE_TYPE); + memcpy(PAGE_HEADER + page, PAGE_HEADER + page_zip->data, + PAGE_LEVEL - PAGE_N_DIR_SLOTS); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the page headers match after copying. */ + ut_a(!memcmp(page, page_zip->data, PAGE_DATA)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + } + +#ifdef UNIV_ZIP_DEBUG + /* Clear the uncompressed page, except the header. */ + memset(PAGE_DATA + page, 0x55, UNIV_PAGE_SIZE - PAGE_DATA); +#endif /* UNIV_ZIP_DEBUG */ + UNIV_MEM_INVALID(PAGE_DATA + page, UNIV_PAGE_SIZE - PAGE_DATA); + + /* Copy the page directory. */ + if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs, + recs + n_dense, n_dense))) { +zlib_error: + mem_heap_free(heap); + return(FALSE); + } + + /* Copy the infimum and supremum records. */ + memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra); + if (page_is_empty(page)) { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + page_zip_dir_get(page_zip, 0) + & PAGE_ZIP_DIR_SLOT_MASK); + } + memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data); + memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data); + + page_zip_set_alloc(&d_stream, heap); + + d_stream.next_in = page_zip->data + PAGE_DATA; + /* Subtract the space reserved for + the page header and the end marker of the modification log. 
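+
+	Editor's note (not part of the original patch): the window-bits
+	argument of inflateInit2() below is UNIV_PAGE_SIZE_SHIFT, the
+	base-2 logarithm of the uncompressed page size (14 for the
+	default 16KiB page), so zlib allocates a history window exactly
+	as large as the page being rebuilt.  The two inflate(Z_BLOCK)
+	calls that follow stop at deflate block boundaries, which lets
+	the index information be decoded before any records.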
*/ + d_stream.avail_in = static_cast<uInt>( + page_zip_get_size(page_zip) - (PAGE_DATA + 1)); + d_stream.next_out = page + PAGE_ZIP_START; + d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; + + if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + != Z_OK)) { + ut_error; + } + + /* Decode the zlib header and the index information. */ + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + index = page_zip_fields_decode( + page + PAGE_ZIP_START, d_stream.next_out, + page_is_leaf(page) ? &trx_id_col : NULL); + + if (UNIV_UNLIKELY(!index)) { + + goto zlib_error; + } + + /* Decompress the user records. */ + page_zip->n_blobs = 0; + d_stream.next_out = page + PAGE_ZIP_START; + + { + /* Pre-allocate the offsets for rec_get_offsets_reverse(). */ + ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + offsets = static_cast<ulint*>( + mem_heap_alloc(heap, n * sizeof(ulint))); + + *offsets = n; + } + + /* Decompress the records in heap_no order. */ + if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + ulint info_bits; + + if (UNIV_UNLIKELY + (!page_zip_decompress_node_ptrs(page_zip, &d_stream, + recs, n_dense, index, + offsets, heap))) { + goto err_exit; + } + + info_bits = mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL + ? REC_INFO_MIN_REC_FLAG : 0; + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page, + info_bits))) { + goto err_exit; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream, + recs, n_dense, + index, offsets))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { +err_exit: + page_zip_fields_free(index); + mem_heap_free(heap); + return(FALSE); + } + } else { + /* This is a leaf page in a clustered index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip, + &d_stream, recs, + n_dense, index, + trx_id_col, + offsets, heap))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { + goto err_exit; + } + } + + ut_a(page_is_comp(page)); + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + page_zip_fields_free(index); + mem_heap_free(heap); +#ifndef UNIV_HOTBACKUP + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].decompressed++; + page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff; + + index_id_t index_id = btr_page_get_index_id(page); + + if (srv_cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index_id].decompressed++; + page_zip_stat_per_index[index_id].decompressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } +#endif /* !UNIV_HOTBACKUP */ + + /* Update the stat counter for LRU policy. */ + buf_LRU_stat_inc_unzip(); + + MONITOR_INC(MONITOR_PAGE_DECOMPRESS); + + return(TRUE); +} + +#ifdef UNIV_ZIP_DEBUG +/**********************************************************************//** +Dump a block of memory on the standard error stream. 
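+Editor's note (illustrative, not part of the original patch): each
+output line is a 4-digit hexadecimal offset followed by up to 32
+bytes printed as bare hex digit pairs, for example
+
+	0040 696e66696d756d00...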
*/ +static +void +page_zip_hexdump_func( +/*==================*/ + const char* name, /*!< in: name of the data structure */ + const void* buf, /*!< in: data */ + ulint size) /*!< in: length of the data, in bytes */ +{ + const byte* s = static_cast<const byte*>(buf); + ulint addr; + const ulint width = 32; /* bytes per line */ + + fprintf(stderr, "%s:\n", name); + + for (addr = 0; addr < size; addr += width) { + ulint i; + + fprintf(stderr, "%04lx ", (ulong) addr); + + i = ut_min(width, size - addr); + + while (i--) { + fprintf(stderr, "%02x", *s++); + } + + putc('\n', stderr); + } +} + +/** Dump a block of memory on the standard error stream. +@param buf in: data +@param size in: length of the data, in bytes */ +#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size) + +/** Flag: make page_zip_validate() compare page headers only */ +UNIV_INTERN ibool page_zip_validate_header_only = FALSE; + +/**********************************************************************//** +Check that the compressed and decompressed pages match. +@return TRUE if valid, FALSE if not */ +UNIV_INTERN +ibool +page_zip_validate_low( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index, /*!< in: index of the page, if known */ + ibool sloppy) /*!< in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ +{ + page_zip_des_t temp_page_zip; + byte* temp_page_buf; + page_t* temp_page; + ibool valid; + + if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV) + || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2) + || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)) { + page_zip_fail(("page_zip_validate: page header\n")); + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + return(FALSE); + } + + ut_a(page_is_comp(page)); + + if (page_zip_validate_header_only) { + return(TRUE); + } + + /* page_zip_decompress() expects the uncompressed page to be + UNIV_PAGE_SIZE aligned. 
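+	Editor's note: ut_align() implements the usual
+	overallocate-and-round-up trick; a minimal sketch of the same
+	idea in plain C (illustrative, not the actual definition):
+
+		void*	buf	= malloc(2 * align);
+		byte*	aligned = (byte*) (((uintptr_t) buf + align - 1)
+					   & ~((uintptr_t) align - 1));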
*/ + temp_page_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE)); + + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + temp_page_zip = *page_zip; + valid = page_zip_decompress(&temp_page_zip, temp_page, TRUE); + if (!valid) { + fputs("page_zip_validate(): failed to decompress\n", stderr); + goto func_exit; + } + if (page_zip->n_blobs != temp_page_zip.n_blobs) { + page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n", + page_zip->n_blobs, temp_page_zip.n_blobs)); + valid = FALSE; + } +#ifdef UNIV_DEBUG + if (page_zip->m_start != temp_page_zip.m_start) { + page_zip_fail(("page_zip_validate: m_start: %u!=%u\n", + page_zip->m_start, temp_page_zip.m_start)); + valid = FALSE; + } +#endif /* UNIV_DEBUG */ + if (page_zip->m_end != temp_page_zip.m_end) { + page_zip_fail(("page_zip_validate: m_end: %u!=%u\n", + page_zip->m_end, temp_page_zip.m_end)); + valid = FALSE; + } + if (page_zip->m_nonempty != temp_page_zip.m_nonempty) { + page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n", + page_zip->m_nonempty, + temp_page_zip.m_nonempty)); + valid = FALSE; + } + if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) { + + /* In crash recovery, the "minimum record" flag may be + set incorrectly until the mini-transaction is + committed. Let us tolerate that difference when we + are performing a sloppy validation. */ + + ulint* offsets; + mem_heap_t* heap; + const rec_t* rec; + const rec_t* trec; + byte info_bits_diff; + ulint offset + = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE); + ut_a(offset >= PAGE_NEW_SUPREMUM); + offset -= 5/*REC_NEW_INFO_BITS*/; + + info_bits_diff = page[offset] ^ temp_page[offset]; + + if (info_bits_diff == REC_INFO_MIN_REC_FLAG) { + temp_page[offset] = page[offset]; + + if (!memcmp(page + PAGE_HEADER, + temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER + - FIL_PAGE_DATA_END)) { + + /* Only the minimum record flag + differed. Let us ignore it. */ + page_zip_fail(("page_zip_validate: " + "min_rec_flag " + "(%s" + "%lu,%lu,0x%02lx)\n", + sloppy ? "ignored, " : "", + page_get_space_id(page), + page_get_page_no(page), + (ulong) page[offset])); + valid = sloppy; + goto func_exit; + } + } + + /* Compare the pointers in the PAGE_FREE list. */ + rec = page_header_get_ptr(page, PAGE_FREE); + trec = page_header_get_ptr(temp_page, PAGE_FREE); + + while (rec || trec) { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate: " + "PAGE_FREE list: %u!=%u\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + goto func_exit; + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } + + /* Compare the records. */ + heap = NULL; + offsets = NULL; + rec = page_rec_get_next_low( + page + PAGE_NEW_INFIMUM, TRUE); + trec = page_rec_get_next_low( + temp_page + PAGE_NEW_INFIMUM, TRUE); + + do { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate: " + "record list: 0x%02x!=0x%02x\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + break; + } + + if (index) { + /* Compare the data. 
*/ + offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (memcmp(rec - rec_offs_extra_size(offsets), + trec - rec_offs_extra_size(offsets), + rec_offs_size(offsets))) { + page_zip_fail( + ("page_zip_validate: " + "record content: 0x%02x", + (unsigned) page_offset(rec))); + valid = FALSE; + break; + } + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } while (rec || trec); + + if (heap) { + mem_heap_free(heap); + } + } + +func_exit: + if (!valid) { + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + page_zip_hexdump(temp_page, UNIV_PAGE_SIZE); + } + ut_free(temp_page_buf); + return(valid); +} + +/**********************************************************************//** +Check that the compressed and decompressed pages match. +@return TRUE if valid, FALSE if not */ +UNIV_INTERN +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index) /*!< in: index of the page, if known */ +{ + return(page_zip_validate_low(page_zip, page, index, + recv_recovery_is_on())); +} +#endif /* UNIV_ZIP_DEBUG */ + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Assert that the compressed and decompressed page headers match. +@return TRUE */ +static +ibool +page_zip_header_cmp( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const byte* page) /*!< in: uncompressed page */ +{ + ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Write a record on the compressed page that contains externally stored +columns. The data must already have been written to the uncompressed page. +@return end of modification log */ +static +byte* +page_zip_write_rec_ext( +/*===================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + const page_t* page, /*!< in: page containing rec */ + const byte* rec, /*!< in: record being written */ + dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ + ulint create, /*!< in: nonzero=insert, zero=update */ + ulint trx_id_col, /*!< in: position of DB_TRX_ID */ + ulint heap_no, /*!< in: heap number of rec */ + byte* storage, /*!< in: end of dense page directory */ + byte* data) /*!< in: end of modification log */ +{ + const byte* start = rec; + ulint i; + ulint len; + byte* externs = storage; + ulint n_ext = rec_offs_n_extern(offsets); + + ut_ad(rec_offs_validate(rec, index, offsets)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW); + + /* Note that this will not take into account + the BLOB columns of rec if create==TRUE. 
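+
+	Editor's note (reasoning spelled out; not in the original
+	patch): the assertion below checks that the modification log,
+	which grows forwards from "data", cannot collide with the BLOB
+	pointer array, which grows backwards from "externs": even after
+	this record's data bytes are appended, the two regions must
+	stay apart.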
*/ + ut_ad(data + rec_offs_data_size(offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - n_ext * BTR_EXTERN_FIELD_REF_SIZE + < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs); + + { + ulint blob_no = page_zip_get_n_prev_extern( + page_zip, rec, index); + byte* ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(blob_no <= page_zip->n_blobs); + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + if (create) { + page_zip->n_blobs += static_cast<unsigned>(n_ext); + ASSERT_ZERO_BLOB(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE); + memmove(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE, + ext_end, + externs - ext_end); + } + + ut_a(blob_no + n_ext <= page_zip->n_blobs); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, + i)); + ut_ad(!rec_offs_nth_extern(offsets, + i + 1)); + /* Locate trx_id and roll_ptr. */ + src = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Store trx_id and roll_ptr. */ + memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + i++; /* skip also roll_ptr */ + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, + i, &len); + + ut_ad(dict_index_is_clust(index)); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + BTR_EXTERN_FIELD_REF_SIZE; + + /* Store the BLOB pointer. */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(data < externs); + memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE); + } + } + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) - (start - rec); + + ASSERT_ZERO(data, len); + memcpy(data, start, len); + data += len; + + return(data); +} + +/**********************************************************************//** +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. 
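+
+Editor's note (illustrative, not part of the original patch): the
+heap number prefix appended to the modification log here mirrors the
+decoder in page_zip_apply_log().  For instance, a record with
+heap_no = 70 gives heap_no - 1 = 69 >= 64, so two header bytes are
+written, 0x80 | (69 >> 7) = 0x80 and then (69 << 1) & 0xff = 0x8a,
+while a record with heap_no = 10 needs only the single byte
+(9 << 1) = 0x12.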
*/ +UNIV_INTERN +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record being written */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint create) /*!< in: nonzero=insert, zero=update */ +{ + const page_t* page; + byte* data; + byte* storage; + ulint heap_no; + byte* slot; + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + + page = page_align(rec); + + ut_ad(page_zip_header_cmp(page_zip, page)); + ut_ad(page_simple_validate_new((page_t*) page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + /* Copy the delete mark. */ + if (rec_get_deleted_flag(rec, TRUE)) { + *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8; + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } + + ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE + - PAGE_DIR - PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots(page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */ + ut_ad(heap_no < page_dir_get_n_heap(page)); + + /* Append to the modification log. */ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + + /* Identify the record by writing its heap number - 1. + 0 is reserved to indicate the end of the modification log. */ + + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1); + ut_ad(!*data); + + { + const byte* start = rec - rec_offs_extra_size(offsets); + const byte* b = rec - REC_N_NEW_EXTRA_BYTES; + + /* Write the extra bytes backwards, so that + rec_offs_extra_size() can be easily computed in + page_zip_apply_log() by invoking + rec_get_offsets_reverse(). */ + + while (b != start) { + *data++ = *--b; + ut_ad(!*data); + } + } + + /* Write the data bytes. Store the uncompressed bytes separately. */ + storage = page_zip_dir_start(page_zip); + + if (page_is_leaf(page)) { + ulint len; + + if (dict_index_is_clust(index)) { + ulint trx_id_col; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + /* Store separately trx_id, roll_ptr and + the BTR_EXTERN_FIELD_REF of each BLOB column. */ + if (rec_offs_any_extern(offsets)) { + data = page_zip_write_rec_ext( + page_zip, page, + rec, index, offsets, create, + trx_id_col, heap_no, storage, data); + } else { + /* Locate trx_id and roll_ptr. */ + const byte* src + = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - rec); + memcpy(data, rec, src - rec); + data += src - rec; + + /* Store trx_id and roll_ptr. 
*/ + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) + - (src - rec); + + ASSERT_ZERO(data, len); + memcpy(data, src, len); + data += len; + } + } else { + /* Leaf page of a secondary index: + no externally stored columns */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + ut_ad(!rec_offs_any_extern(offsets)); + + /* Log the entire record. */ + len = rec_offs_data_size(offsets); + + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + } + } else { + /* This is a node pointer page. */ + ulint len; + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Copy the data bytes, except node_ptr. */ + len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE; + ut_ad(data + len < storage - REC_NODE_PTR_SIZE + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)); + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + + /* Copy the node pointer to the uncompressed area. */ + memcpy(storage - REC_NODE_PTR_SIZE + * (heap_no - 1), + rec + len, + REC_NODE_PTR_SIZE); + } + + ut_a(!*data); + ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip)); + page_zip->m_end = data - page_zip->data; + page_zip->m_nonempty = TRUE; + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec), index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/***********************************************************//** +Parses a log record of writing a BLOB pointer of a record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_blob_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip)/*!< in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY + (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(!page_is_leaf(page))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + memcpy(page_zip->data + z_offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE)); +} + +/**********************************************************************//** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. 
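+
+Editor's note (summarizing the logging code below; not part of the
+original patch): when mtr is given, the body of the
+MLOG_ZIP_WRITE_BLOB_PTR redo record is
+
+	2 bytes		offset of the field reference on the
+			uncompressed page
+	2 bytes		offset of its copy within the compressed page
+	20 bytes	the BTR_EXTERN_FIELD_REF itself
+
+preceded by the initial log record header (at most 11 bytes) written
+by mlog_write_initial_log_record_fast(); this is exactly what
+page_zip_parse_write_blob_ptr() above consumes during recovery.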
*/ +UNIV_INTERN +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in/out: record whose data is being + written */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint n, /*!< in: column index */ + mtr_t* mtr) /*!< in: mini-transaction handle, + or NULL if no logging is needed */ +{ + const byte* field; + byte* externs; + const page_t* page = page_align(rec); + ulint blob_no; + ulint len; + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(rec_offs_nth_extern(offsets, n)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index) + + rec_get_n_extern_new(rec, index, n); + ut_a(blob_no < page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + field = rec_get_nth_field(rec, offsets, n, &len); + + externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE; + field += len - BTR_EXTERN_FIELD_REF_SIZE; + + memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (mtr) { +#ifndef UNIV_HOTBACKUP + byte* log_ptr = mlog_open( + mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, externs - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE); + log_ptr += BTR_EXTERN_FIELD_REF_SIZE; + mlog_close(mtr, log_ptr); +#endif /* !UNIV_HOTBACKUP */ + } +} + +/***********************************************************//** +Parses a log record of writing the node pointer of a record. 
+@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_node_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip)/*!< in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + byte* storage_end; + byte* field; + byte* storage; + ulint heap_no; + + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_is_leaf(page))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + + field = page + offset; + storage = page_zip->data + z_offset; + + storage_end = page_zip_dir_start(page_zip); + + heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE; + + if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE) + || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW) + || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) { + + goto corrupt; + } + + memcpy(field, ptr + 4, REC_NODE_PTR_SIZE); + memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + REC_NODE_PTR_SIZE)); +} + +/**********************************************************************//** +Write the node pointer of a record on a non-leaf compressed page. 
*/ +UNIV_INTERN +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + ulint size, /*!< in: data size of rec */ + ulint ptr, /*!< in: node pointer */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ +{ + byte* field; + byte* storage; +#ifdef UNIV_DEBUG + page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(!page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, size); + + storage = page_zip_dir_start(page_zip) + - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; + field = rec + size - REC_NODE_PTR_SIZE; + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if REC_NODE_PTR_SIZE != 4 +# error "REC_NODE_PTR_SIZE != 4" +#endif + mach_write_to_4(field, ptr); + memcpy(storage, field, REC_NODE_PTR_SIZE); + + if (mtr) { +#ifndef UNIV_HOTBACKUP + byte* log_ptr = mlog_open(mtr, + 11 + 2 + 2 + REC_NODE_PTR_SIZE); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, storage - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, field, REC_NODE_PTR_SIZE); + log_ptr += REC_NODE_PTR_SIZE; + mlog_close(mtr, log_ptr); +#endif /* !UNIV_HOTBACKUP */ + } +} + +/**********************************************************************//** +Write the trx_id and roll_ptr of a record on a B-tree leaf node page. 
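+
+Editor's note (not part of the original patch): mach_write_to_6()
+and mach_write_to_7() store the 48-bit transaction id and the 56-bit
+roll pointer in big-endian byte order, in line with the compile-time
+checks below that DATA_TRX_ID_LEN is 6 and DATA_ROLL_PTR_LEN is 7;
+the same 13 bytes are then mirrored into the uncompressed "storage"
+area at the end of the compressed page.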
*/ +UNIV_INTERN +void +page_zip_write_trx_id_and_roll_ptr( +/*===============================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint trx_id_col,/*!< in: column number of TRX_ID in rec */ + trx_id_t trx_id, /*!< in: transaction identifier */ + roll_ptr_t roll_ptr)/*!< in: roll_ptr */ +{ + byte* field; + byte* storage; +#ifdef UNIV_DEBUG + page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ + ulint len; + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_comp(offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + storage = page_zip_dir_start(page_zip) + - (rec_get_heap_no_new(rec) - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + field = rec_get_nth_field(rec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(field + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(field, trx_id); +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); + memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); +} + +/**********************************************************************//** +Clear an area on the uncompressed and compressed page. +Do not clear the data payload, as that would grow the modification log. */ +static +void +page_zip_clear_rec( +/*===============*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: record to clear */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ +{ + ulint heap_no; + page_t* page = page_align(rec); + byte* storage; + byte* field; + ulint len; + /* page_zip_validate() would fail here if a record + containing externally stored columns is being deleted. */ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_zip_dir_find(page_zip, page_offset(rec))); + ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec))); + ut_ad(page_zip_header_cmp(page_zip, page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + if (!page_is_leaf(page)) { + /* Clear node_ptr. On the compressed page, + there is an array of node_ptr immediately before the + dense page directory, at the very end of the page. 
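+
+		Editor's note: an illustrative map of the tail of a
+		compressed page, low addresses first (not part of the
+		original patch):
+
+			..., modification log, free space,
+			BLOB pointer array (clustered leaf pages only),
+			node_ptr or (DB_TRX_ID, DB_ROLL_PTR) array
+			indexed downwards by heap_no - 1,
+			dense page directory, end of page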
*/ + storage = page_zip_dir_start(page_zip); + ut_ad(dict_index_get_n_unique_in_tree(index) == + rec_offs_n_fields(offsets) - 1); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, + &len); + ut_ad(len == REC_NODE_PTR_SIZE); + + ut_ad(!rec_offs_any_extern(offsets)); + memset(field, 0, REC_NODE_PTR_SIZE); + memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, + 0, REC_NODE_PTR_SIZE); + } else if (dict_index_is_clust(index)) { + /* Clear trx_id and roll_ptr. On the compressed page, + there is an array of these fields immediately before the + dense page directory, at the very end of the page. */ + const ulint trx_id_pos + = dict_col_get_clust_pos( + dict_table_get_sys_col( + index->table, DATA_TRX_ID), index); + storage = page_zip_dir_start(page_zip); + field = rec_get_nth_field(rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memset(storage - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + /* Clear all BLOB pointers in order to make + page_zip_validate() pass. */ + if (rec_offs_nth_extern(offsets, i)) { + field = rec_get_nth_field( + rec, offsets, i, &len); + ut_ad(len + == BTR_EXTERN_FIELD_REF_SIZE); + memset(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + 0, BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + } else { + ut_ad(!rec_offs_any_extern(offsets)); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec), NULL)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the owned flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } +} + +/**********************************************************************//** +Insert a record to the dense page directory. 
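+
+Editor's note (assuming the usual slot encoding from page0zip.h,
+which this hunk does not restate): each dense directory slot is
+2 bytes; the low 14 bits (PAGE_ZIP_DIR_SLOT_MASK) hold the record
+offset within the page, and the top two bits carry the "deleted"
+(PAGE_ZIP_DIR_SLOT_DEL) and "owned" (PAGE_ZIP_DIR_SLOT_OWNED) flags,
+which is why page_zip_rec_set_deleted() and page_zip_rec_set_owned()
+above set and clear them through the slot's first byte with a ">> 8"
+shift.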
*/ +UNIV_INTERN +void +page_zip_dir_insert( +/*================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* prev_rec,/*!< in: record after which to insert */ + const byte* free_rec,/*!< in: record from which rec was + allocated, or NULL */ + byte* rec) /*!< in: record to insert */ +{ + ulint n_dense; + byte* slot_rec; + byte* slot_free; + + ut_ad(prev_rec != rec); + ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec); + ut_ad(page_zip_simple_validate(page_zip)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + if (page_rec_is_infimum(prev_rec)) { + /* Use the first slot. */ + slot_rec = page_zip->data + page_zip_get_size(page_zip); + } else { + byte* end = page_zip->data + page_zip_get_size(page_zip); + byte* start = end - page_zip_dir_user_size(page_zip); + + if (UNIV_LIKELY(!free_rec)) { + /* PAGE_N_RECS was already incremented + in page_cur_insert_rec_zip(), but the + dense directory slot at that position + contains garbage. Skip it. */ + start += PAGE_ZIP_DIR_SLOT_SIZE; + } + + slot_rec = page_zip_dir_find_low(start, end, + page_offset(prev_rec)); + ut_a(slot_rec); + } + + /* Read the old n_dense (n_heap may have been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + if (UNIV_LIKELY_NULL(free_rec)) { + /* The record was allocated from the free list. + Shift the dense directory only up to that slot. + Note that in this case, n_dense is actually + off by one, because page_cur_insert_rec_zip() + did not increment n_heap. */ + ut_ad(rec_get_heap_no_new(rec) < n_dense + 1 + + PAGE_HEAP_NO_USER_LOW); + ut_ad(rec >= free_rec); + slot_free = page_zip_dir_find(page_zip, page_offset(free_rec)); + ut_ad(slot_free); + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } else { + /* The record was allocated from the heap. + Shift the entire dense directory. */ + ut_ad(rec_get_heap_no_new(rec) == n_dense + + PAGE_HEAP_NO_USER_LOW); + + /* Shift to the end of the dense page directory. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + } + + /* Shift the dense directory to allocate place for rec. */ + memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free, + slot_rec - slot_free); + + /* Write the entry for the inserted record. + The "owned" and "deleted" flags must be zero. */ + mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec)); +} + +/**********************************************************************//** +Shift the dense page directory and the array of BLOB pointers +when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of + the free list */ +{ + byte* slot_rec; + byte* slot_free; + ulint n_ext; + page_t* page = page_align(rec); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_comp(offsets)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot_rec = page_zip_dir_find(page_zip, page_offset(rec)); + + ut_a(slot_rec); + + /* This could not be done before page_zip_dir_find(). 
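The dense directory grows from the end of the compressed page toward lower addresses, so page_zip_dir_insert() opens a gap by shifting everything between the free-list start and the insertion point one slot downward with a single memmove(), then writes the new entry into the gap. A toy model of that shift, under the simplifying assumption of plain uint16_t slots in an ordinary buffer with no page layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* The directory occupies the end of the buffer; slots fill from
	   buf + 8 downward, so the oldest slot sits at the highest address. */
	uint16_t buf[8] = {0};
	uint16_t* end = buf + 8;	/* one past the oldest slot */
	uint16_t* low = end - 3;	/* lowest (newest) used slot */

	end[-1] = 100; end[-2] = 200; end[-3] = 300;

	/* Insert a slot logically between 100 and 200: shift [low, gap)
	   down one slot, then write the new entry into the opened gap. */
	uint16_t* gap = end - 2;	/* where the new slot belongs */
	memmove(low - 1, low, (size_t) ((char*) gap - (char*) low));
	gap[-1] = 150;
	low--;

	for (uint16_t* p = end - 1; p >= low; p--) {
		printf("%u ", *p);	/* prints: 100 150 200 300 */
	}
	printf("\n");
	return 0;
}

The single memmove() is why the free-rec case above can stop the shift early at slot_free: only the slots below the insertion point ever move.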
*/ + page_header_set_field(page, page_zip, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - 1)); + + if (UNIV_UNLIKELY(!free)) { + /* Make the last slot the start of the free list. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW); + } else { + slot_free = page_zip_dir_find_free(page_zip, + page_offset(free)); + ut_a(slot_free < slot_rec); + /* Grow the free list by one slot by moving the start. */ + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } + + if (UNIV_LIKELY(slot_rec > slot_free)) { + memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, + slot_rec - slot_free); + } + + /* Write the entry for the deleted record. + The "owned" and "deleted" flags will be cleared. */ + mach_write_to_2(slot_free, page_offset(rec)); + + if (!page_is_leaf(page) || !dict_index_is_clust(index)) { + ut_ad(!rec_offs_any_extern(offsets)); + goto skip_blobs; + } + + n_ext = rec_offs_n_extern(offsets); + if (UNIV_UNLIKELY(n_ext)) { + /* Shift and zero fill the array of BLOB pointers. */ + ulint blob_no; + byte* externs; + byte* ext_end; + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index); + ut_a(blob_no + n_ext <= page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + page_zip->n_blobs -= static_cast<unsigned>(n_ext); + /* Shift and zero fill the array. */ + memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end, + (page_zip->n_blobs - blob_no) + * BTR_EXTERN_FIELD_REF_SIZE); + memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE); + } + +skip_blobs: + /* The compression algorithm expects info_bits and n_owned + to be 0 for deleted records. */ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + page_zip_clear_rec(page_zip, rec, index, offsets); +} + +/**********************************************************************//** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint is_clustered) /*!< in: nonzero for clustered index, + zero for others */ +{ + ulint n_dense; + byte* dir; + byte* stored; + + ut_ad(page_is_comp(page_zip->data)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Read the old n_dense (n_heap has already been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + dir = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + + if (!page_is_leaf(page_zip->data)) { + ut_ad(!page_zip->n_blobs); + stored = dir - n_dense * REC_NODE_PTR_SIZE; + } else if (is_clustered) { + /* Move the BLOB pointer array backwards to make space for the + roll_ptr and trx_id columns and the dense directory slot. 
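The BLOB-pointer handling above is the delete-side mirror of the insert shift: the surviving references between ext_end and the deleted ones move up by n_ext slots, and the vacated bottom of the array is zero-filled so the modification log stays compact. A toy model of that memmove()/memset() pair (4-byte refs stand in for the 20-byte BTR_EXTERN_FIELD_REF_SIZE entries; the layout is a simplifying assumption):

#include <stdio.h>
#include <string.h>

enum { REF = 4, N = 5 };	/* REF stands in for the 20-byte field ref */

int main(void)
{
	/* a[0] plays ext_end (the low end of the array). Slot 0 holds the
	   newest ref "1", slots 1-2 hold the refs "2" and "3" being
	   deleted, slots 3-4 hold older survivors "4" and "5". */
	unsigned char a[N * REF];
	size_t i;
	for (i = 0; i < sizeof a; i++) {
		a[i] = (unsigned char) ('1' + i / REF);
	}

	size_t n_ext = 2;	/* refs being deleted */
	size_t below = 1;	/* survivors between ext_end and the deleted */

	memmove(a + n_ext * REF, a, below * REF);	/* shift survivors up */
	memset(a, 0, n_ext * REF);			/* zero the freed end */

	for (i = 0; i < N; i++) {
		printf("%c", a[i * REF] ? a[i * REF] : '0');
	}
	printf("\n");	/* prints: 00145 */
	return 0;
}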
*/ + byte* externs; + + stored = dir - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + externs = stored + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(externs + - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memmove(externs - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + externs, stored - externs); + } else { + stored = dir + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE, + PAGE_ZIP_DIR_SLOT_SIZE); + } + + /* Move the uncompressed area backwards to make space + for one directory slot. */ + memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored); +} + +/***********************************************************//** +Parses a log record of writing to the header of a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_header( +/*========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip)/*!< in/out: compressed page */ +{ + ulint offset; + ulint len; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) { + + return(NULL); + } + + offset = (ulint) *ptr++; + len = (ulint) *ptr++; + + if (UNIV_UNLIKELY(!len) || UNIV_UNLIKELY(offset + len >= PAGE_DATA)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + len)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip)) { + + goto corrupt; + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, ptr, len); + memcpy(page_zip->data + offset, ptr, len); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + len); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Write a log record of writing to the uncompressed header portion of a page. */ +UNIV_INTERN +void +page_zip_write_header_log( +/*======================*/ + const byte* data, /*!< in: data on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + byte* log_ptr = mlog_open(mtr, 11 + 1 + 1); + ulint offset = page_offset(data); + + ut_ad(offset < PAGE_DATA); + ut_ad(offset + length < PAGE_DATA); +#if PAGE_DATA > 255 +# error "PAGE_DATA > 255" +#endif + ut_ad(length < 256); + + /* If no logging is requested, we may return now */ + if (UNIV_UNLIKELY(!log_ptr)) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr); + *log_ptr++ = (byte) offset; + *log_ptr++ = (byte) length; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, data, length); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. +The function btr_page_reorganize() should be preferred whenever possible. 
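page_zip_parse_write_header() above consumes a record whose layout is one offset byte, one length byte, then the payload, and it rejects records that would write outside the fixed header area. A sketch of that parse with the same bounds checks; the PAGE_DATA value below is an assumed stand-in for the real constant, and the function name is illustrative:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define PAGE_DATA 38	/* assumed stand-in for InnoDB's PAGE_DATA bound */

/* Parse one "write header" record: 1-byte offset, 1-byte length, payload.
   Returns a pointer past the record, or NULL when the buffer is too short
   or the record is corrupt, mirroring the checks above. */
static const unsigned char*
parse_write_header(const unsigned char* ptr, const unsigned char* end,
		   unsigned char* page)	/* may be NULL during a scan pass */
{
	if (end < ptr + 2) return NULL;		/* need offset + length */

	size_t offset = *ptr++;
	size_t len = *ptr++;

	if (!len || offset + len >= PAGE_DATA) return NULL;	/* corrupt */
	if (end < ptr + len) return NULL;	/* payload not fully buffered */

	if (page) memcpy(page + offset, ptr, len);
	return ptr + len;
}

int main(void)
{
	unsigned char page[PAGE_DATA] = {0};
	const unsigned char rec[] = { 4, 2, 0xab, 0xcd };   /* offset 4, len 2 */
	const unsigned char* next
		= parse_write_header(rec, rec + sizeof rec, page);

	printf("%s %02x%02x\n", next ? "ok" : "corrupt", page[4], page[5]);
	return 0;	/* prints: ok abcd */
}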
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure, but page will be overwritten. */ +UNIV_INTERN +ibool +page_zip_reorganize( +/*================*/ + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ +#ifndef UNIV_HOTBACKUP + buf_pool_t* buf_pool = buf_pool_from_block(block); +#endif /* !UNIV_HOTBACKUP */ + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + buf_block_t* temp_block; + page_t* temp_page; + ulint log_mode; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_is_comp(page)); + ut_ad(!dict_index_is_ibuf(index)); + /* Note that page_zip_validate(page_zip, page, index) may fail here. */ + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Disable logging */ + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + +#ifndef UNIV_HOTBACKUP + temp_block = buf_block_alloc(buf_pool); + btr_search_drop_page_hash_index(block); + block->check_index_page_at_flush = TRUE; +#else /* !UNIV_HOTBACKUP */ + ut_ad(block == back_block1); + temp_block = back_block2; +#endif /* !UNIV_HOTBACKUP */ + temp_page = temp_block->frame; + + /* Copy the old page to temporary space */ + buf_frame_copy(temp_page, page); + + btr_blob_dbg_remove(page, index, "zip_reorg"); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, TRUE); + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, temp_block, + page_get_infimum_rec(temp_page), + index, mtr); + + if (!dict_index_is_clust(index) && page_is_leaf(temp_page)) { + /* Copy max trx id to recreated page */ + trx_id_t max_trx_id = page_get_max_trx_id(temp_page); + page_set_max_trx_id(block, NULL, max_trx_id, NULL); + ut_ad(max_trx_id != 0); + } + + /* Restore logging. */ + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) { + +#ifndef UNIV_HOTBACKUP + buf_block_free(temp_block); +#endif /* !UNIV_HOTBACKUP */ + return(FALSE); + } + + lock_move_reorganize_page(block, temp_block); + +#ifndef UNIV_HOTBACKUP + buf_block_free(temp_block); +#endif /* !UNIV_HOTBACKUP */ + return(TRUE); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. 
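The contract documented above is "page_zip is left intact on failure": the rebuild happens against a scratch copy, and the compressed image is only replaced once compression succeeds. A minimal sketch of that commit-on-success pattern, using zlib's compress2() as a stand-in for page_zip_compress() (an assumption purely for illustration; build with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

/* Compress src into a scratch buffer and only overwrite the fixed-size
   slot when the result fits, so the old image survives any failure. */
static int
recompress_into_slot(unsigned char* slot, size_t slot_size,
		     const unsigned char* src, size_t src_len)
{
	unsigned char scratch[1 << 16];
	uLongf out_len = sizeof scratch;

	if (compress2(scratch, &out_len, src, src_len, 6) != Z_OK
	    || out_len > slot_size) {
		return 0;	/* failure: slot untouched */
	}
	memcpy(slot, scratch, out_len);
	memset(slot + out_len, 0, slot_size - out_len);
	return 1;
}

int main(void)
{
	unsigned char slot[256];
	unsigned char page[1024];

	memset(slot, 0, sizeof slot);
	memset(page, 'x', sizeof page);		/* highly compressible */

	printf("fit: %d\n",
	       recompress_into_slot(slot, sizeof slot, page, sizeof page));
	return 0;	/* prints: fit: 1 */
}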
*/ +UNIV_INTERN +void +page_zip_copy_recs( +/*===============*/ + page_zip_des_t* page_zip, /*!< out: copy of src_zip + (n_blobs, m_start, m_end, + m_nonempty, data[0..size-1]) */ + page_t* page, /*!< out: copy of src */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_ibuf(index)); +#ifdef UNIV_ZIP_DEBUG + /* The B-tree operations that call this function may set + FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag + mismatch. A strict page_zip_validate() will be executed later + during the B-tree operations. */ + ut_a(page_zip_validate_low(src_zip, src, index, TRUE)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip)); + if (UNIV_UNLIKELY(src_zip->n_blobs)) { + ut_a(page_is_leaf(src)); + ut_a(dict_index_is_clust(index)); + } + + /* The PAGE_MAX_TRX_ID must be set on leaf pages of secondary + indexes. It does not matter on other pages. */ + ut_a(dict_index_is_clust(index) || !page_is_leaf(src) + || page_get_max_trx_id(src)); + + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(src_zip->data, page_zip_get_size(page_zip)); + + /* Copy those B-tree page header fields that are related to + the records stored in the page. Also copy the field + PAGE_MAX_TRX_ID. Skip the rest of the page header and + trailer. On the compressed page, there is no trailer. */ +#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END +# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END" +#endif + memcpy(PAGE_HEADER + page, PAGE_HEADER + src, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page, PAGE_DATA + src, + UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data, + page_zip_get_size(page_zip) - PAGE_DATA); + + /* Copy all fields of src_zip to page_zip, except the pointer + to the compressed data page. */ + { + page_zip_t* data = page_zip->data; + memcpy(page_zip, src_zip, sizeof *page_zip); + page_zip->data = data; + } + ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index)) + + page_zip->m_end < page_zip_get_size(page_zip)); + + if (!page_is_leaf(src) + && UNIV_UNLIKELY(mach_read_from_4(src + FIL_PAGE_PREV) == FIL_NULL) + && UNIV_LIKELY(mach_read_from_4(page + + FIL_PAGE_PREV) != FIL_NULL)) { + /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */ + ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) { + rec_t* rec = page + offs; + ut_a(rec[-REC_N_NEW_EXTRA_BYTES] + & REC_INFO_MIN_REC_FLAG); + rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG; + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + btr_blob_dbg_add(page, index, "page_zip_copy_recs"); + + page_zip_compress_write_log(page_zip, page, index, mtr); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Parses a log record of compressing an index page. 
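The block near the end of page_zip_copy_recs() copies every field of the descriptor struct but must not clobber the destination's own data pointer, so it saves the pointer, does the memcpy(), and puts it back. That save/restore idiom in isolation (the struct below is a cut-down illustration, not the real page_zip_des_t):

#include <stdio.h>
#include <string.h>

struct zip_desc {
	unsigned char*	data;		/* owned buffer -- must survive */
	unsigned	n_blobs;
	unsigned	m_end;
};

int main(void)
{
	unsigned char dst_buf[4], src_buf[4];
	struct zip_desc dst = { dst_buf, 0, 0 };
	struct zip_desc src = { src_buf, 7, 42 };

	/* Copy every field of src, then restore the destination's own
	   buffer pointer -- the idiom used above. */
	unsigned char* keep = dst.data;
	memcpy(&dst, &src, sizeof dst);
	dst.data = keep;

	printf("%d %u %u\n", dst.data == dst_buf, dst.n_blobs, dst.m_end);
	return 0;	/* prints: 1 7 42 */
}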
+@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_compress( +/*====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< out: uncompressed page */ + page_zip_des_t* page_zip)/*!< out: compressed page */ +{ + ulint size; + ulint trailer_size; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) { + + return(NULL); + } + + size = mach_read_from_2(ptr); + ptr += 2; + trailer_size = mach_read_from_2(ptr); + ptr += 2; + + if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_zip_get_size(page_zip) < size)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4); + memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4); + memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size); + memset(page_zip->data + FIL_PAGE_TYPE + size, 0, + page_zip_get_size(page_zip) - trailer_size + - (FIL_PAGE_TYPE + size)); + memcpy(page_zip->data + page_zip_get_size(page_zip) + - trailer_size, ptr + 8 + size, trailer_size); + + if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, page, + TRUE))) { + + goto corrupt; + } + } + + return(ptr + 8 + size + trailer_size); +} + +/**********************************************************************//** +Calculate the compressed page checksum. +@return page checksum */ +UNIV_INTERN +ulint +page_zip_calc_checksum( +/*===================*/ + const void* data, /*!< in: compressed page */ + ulint size, /*!< in: size of compressed page */ + srv_checksum_algorithm_t algo) /*!< in: algorithm to use */ +{ + uLong adler; + ib_uint32_t crc32; + const Bytef* s = static_cast<const byte*>(data); + + /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN, + and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */ + + switch (algo) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + crc32 = ut_crc32(s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET) + ^ ut_crc32(s + FIL_PAGE_TYPE, 2) + ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) crc32); + case SRV_CHECKSUM_ALGORITHM_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + adler = adler32(0L, s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET); + adler = adler32(adler, s + FIL_PAGE_TYPE, 2); + adler = adler32( + adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + static_cast<uInt>(size) + - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) adler); + case SRV_CHECKSUM_ALGORITHM_NONE: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + return(BUF_NO_CHECKSUM_MAGIC); + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } + + ut_error; + return(0); +} + +/**********************************************************************//** +Verify a compressed page's checksum. 
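page_zip_calc_checksum() above hashes three byte ranges so that the stored checksum, the LSN, and the flush-LSN fields never influence the result. A runnable sketch of the INNODB branch using zlib's adler32(), with the same three updates; the numeric offsets are assumptions standing in for the FIL_PAGE_* constants (build with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

enum {
	OFFSET	= 4,	/* assumed FIL_PAGE_OFFSET */
	LSN	= 16,	/* assumed FIL_PAGE_LSN */
	TYPE	= 24,	/* assumed FIL_PAGE_TYPE */
	SPACE	= 34	/* assumed FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID */
};

/* Checksum a compressed page, skipping the stored checksum, the LSN and
   the flush-LSN fields, exactly as the INNODB case above does. */
static unsigned long
zip_adler(const unsigned char* data, size_t size)
{
	uLong a = adler32(0L, data + OFFSET, (uInt) (LSN - OFFSET));
	a = adler32(a, data + TYPE, 2);
	a = adler32(a, data + SPACE, (uInt) (size - SPACE));
	return a;
}

int main(void)
{
	unsigned char page[1024];
	memset(page, 0, sizeof page);

	printf("%08lx\n", zip_adler(page, sizeof page));
	memset(page, 0xff, 4);	/* changing the stored checksum field... */
	printf("%08lx\n", zip_adler(page, sizeof page)); /* ...same result */
	return 0;
}

The two printed values are equal, which is the property the verification code relies on: the checksum can be recomputed without first blanking its own storage field.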
+@return TRUE if the stored checksum is valid according to the value of +innodb_checksum_algorithm */ +UNIV_INTERN +ibool +page_zip_verify_checksum( +/*=====================*/ + const void* data, /*!< in: compressed page */ + ulint size) /*!< in: size of compressed page */ +{ + ib_uint32_t stored; + ib_uint32_t calc; + ib_uint32_t crc32 = 0 /* silence bogus warning */; + ib_uint32_t innodb = 0 /* silence bogus warning */; + + stored = static_cast<ib_uint32_t>(mach_read_from_4( + static_cast<const unsigned char*>(data) + FIL_PAGE_SPACE_OR_CHKSUM)); + +#if FIL_PAGE_LSN % 8 +#error "FIL_PAGE_LSN must be 64 bit aligned" +#endif + + /* Check if page is empty */ + if (stored == 0 + && *reinterpret_cast<const ib_uint64_t*>(static_cast<const char*>( + data) + + FIL_PAGE_LSN) == 0) { + /* make sure that the page is really empty */ + ulint i; + for (i = 0; i < size; i++) { + if (*((const char*) data + i) != 0) { + return(FALSE); + } + } + /* Empty page */ + return(TRUE); + } + + calc = static_cast<ib_uint32_t>(page_zip_calc_checksum( + data, size, static_cast<srv_checksum_algorithm_t>( + srv_checksum_algorithm))); + + if (stored == calc) { + return(TRUE); + } + + switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + return(stored == calc); + case SRV_CHECKSUM_ALGORITHM_CRC32: + if (stored == BUF_NO_CHECKSUM_MAGIC) { + return(TRUE); + } + crc32 = calc; + innodb = static_cast<ib_uint32_t>(page_zip_calc_checksum( + data, size, SRV_CHECKSUM_ALGORITHM_INNODB)); + break; + case SRV_CHECKSUM_ALGORITHM_INNODB: + if (stored == BUF_NO_CHECKSUM_MAGIC) { + return(TRUE); + } + crc32 = static_cast<ib_uint32_t>(page_zip_calc_checksum( + data, size, SRV_CHECKSUM_ALGORITHM_CRC32)); + innodb = calc; + break; + case SRV_CHECKSUM_ALGORITHM_NONE: + return(TRUE); + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } + + return(stored == crc32 || stored == innodb); +} diff --git a/storage/xtradb/pars/lexyy.cc b/storage/xtradb/pars/lexyy.cc new file mode 100644 index 00000000000..1c01becd9ed --- /dev/null +++ b/storage/xtradb/pars/lexyy.cc @@ -0,0 +1,3130 @@ +#include "univ.i" +#line 2 "lexyy.cc" + +#line 4 "lexyy.cc" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. 
+ */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN (yy_start) = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart(yyin ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +extern yy_size_t yyleng; + +extern FILE *yyin, *yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. 
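The YY_SC_TO_UI double cast defined in this skeleton exists because a plain char may be signed: a high-bit byte would otherwise become a negative table index. A small demonstration of why the inner cast to unsigned char matters (behavior shown is for signed-char ABIs; on unsigned-char platforms both lines print 233):

#include <stdio.h>

#define SC_TO_UI(c) ((unsigned int) (unsigned char) (c))

int main(void)
{
	char c = (char) 0xE9;	/* e.g. Latin-1 'e' with acute accent */

	printf("as int:      %d\n", (int) c);	/* -23 where char is signed */
	printf("double-cast: %u\n", SC_TO_UI(c));	/* always 233 */
	return 0;
}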
*/ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + yy_size_t yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when yytext is formed. */ +static char yy_hold_char; +static yy_size_t yy_n_chars; /* number of characters read into yy_ch_buf */ +yy_size_t yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow yywrap()'s to do buffer switches + * instead of setting up a fresh yyin. A bit of a hack ... 
+ */ +static int yy_did_buffer_switch_on_eof; + +void yyrestart (FILE *input_file ); +__attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +static YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ); +void yy_delete_buffer (YY_BUFFER_STATE b ); +void yy_flush_buffer (YY_BUFFER_STATE b ); +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +void yypop_buffer_state (void ); + +static void yyensure_buffer_stack (void ); +static void yy_load_buffer_state (void ); +static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,yy_size_t len ); + +void *yyalloc (yy_size_t ); +void *yyrealloc (void *,yy_size_t ); +void yyfree (void * ); + +#define yy_new_buffer yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define yywrap(n) 1 +#define YY_SKIP_YYWRAP + +typedef unsigned char YY_CHAR; + +FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; + +typedef int yy_state_type; + +extern int yylineno; + +int yylineno = 1; + +extern char *yytext; +#define yytext_ptr yytext + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + yyleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 124 +#define YY_END_OF_BUFFER 125 +/* This struct is not used in this scanner, + but its presence is necessary. 
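YY_DO_BEFORE_ACTION above makes yytext a valid C string without copying: it NUL-terminates the token in place and saves the overwritten character in yy_hold_char so scanning can resume, and yyless() restores it. That trick in isolation:

#include <stdio.h>

int main(void)
{
	char buf[] = "select*from";
	char* token_start = buf;
	char* token_end = buf + 6;	/* one past "select" */

	/* Terminate the token in place, remembering the clobbered char. */
	char hold = *token_end;
	*token_end = '\0';
	printf("token:  %s\n", token_start);	/* token:  select */

	*token_end = hold;	/* restore, as yyless() does */
	printf("buffer: %s\n", buf);		/* buffer: select*from */
	return 0;
}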
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[425] = + { 0, + 0, 0, 119, 119, 0, 0, 0, 0, 125, 123, + 122, 122, 8, 123, 114, 5, 103, 109, 112, 110, + 107, 111, 123, 113, 1, 123, 108, 106, 104, 105, + 117, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 115, 116, 119, 120, 6, 7, 9, 10, 122, 4, + 98, 118, 2, 1, 3, 99, 100, 102, 101, 0, + 96, 0, 96, 96, 96, 96, 96, 44, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 28, 17, 25, 96, 96, 96, + + 96, 96, 96, 54, 63, 96, 14, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 119, 120, 120, 121, 6, + 7, 9, 10, 2, 0, 97, 13, 45, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 27, 96, 96, + 96, 41, 96, 96, 96, 96, 21, 96, 96, 96, + 96, 96, 15, 96, 96, 96, 18, 96, 96, 96, + 96, 96, 82, 96, 96, 96, 51, 96, 12, 96, + 36, 96, 96, 96, 96, 96, 96, 96, 96, 96, + + 96, 96, 0, 97, 96, 96, 96, 96, 20, 96, + 24, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 46, 96, 96, 30, 96, 89, 96, 96, + 39, 96, 96, 96, 96, 96, 48, 96, 94, 91, + 32, 93, 96, 11, 66, 96, 96, 96, 42, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 29, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 87, + 0, 96, 26, 96, 96, 96, 68, 96, 96, 96, + 96, 37, 96, 96, 96, 96, 96, 96, 96, 31, + 67, 23, 96, 59, 96, 77, 96, 96, 96, 43, + + 96, 96, 96, 96, 96, 96, 96, 96, 92, 96, + 96, 56, 96, 96, 96, 96, 96, 96, 96, 40, + 33, 0, 81, 95, 19, 96, 96, 85, 96, 76, + 55, 96, 65, 96, 52, 96, 96, 96, 47, 96, + 78, 96, 80, 96, 96, 34, 96, 96, 96, 35, + 74, 96, 96, 96, 96, 60, 96, 50, 49, 96, + 96, 96, 57, 53, 64, 96, 96, 96, 22, 96, + 96, 75, 83, 96, 96, 79, 96, 70, 96, 96, + 96, 96, 96, 38, 96, 90, 69, 96, 86, 96, + 96, 96, 88, 96, 96, 61, 96, 16, 96, 72, + + 71, 96, 58, 96, 84, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 73, 96, 96, 96, 96, + 96, 96, 62, 0 + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 4, 5, 6, 7, 1, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 1, 1, 1, 1, 51, 1, 34, 34, 34, 34, + + 34, 34, 34, 34, 34, 34, 34, 52, 34, 34, + 34, 34, 53, 34, 54, 34, 34, 34, 34, 34, + 34, 34, 55, 1, 56, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[57] = + { 0, + 1, 1, 1, 2, 3, 1, 1, 4, 1, 1, + 5, 1, 1, 1, 1, 6, 7, 1, 1, 1, + 8, 1, 1, 6, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 1, 1 + } ; + +static yyconst flex_int16_t yy_base[438] = + { 0, + 0, 0, 293, 287, 284, 281, 272, 256, 254, 1357, + 55, 57, 1357, 0, 1357, 1357, 1357, 1357, 1357, 1357, + 1357, 1357, 238, 227, 46, 205, 1357, 43, 1357, 203, + 1357, 46, 50, 56, 52, 66, 64, 51, 81, 92, + 91, 94, 96, 111, 113, 116, 130, 134, 53, 143, + 1357, 1357, 0, 106, 0, 212, 0, 210, 141, 0, + 1357, 1357, 192, 56, 173, 1357, 1357, 1357, 1357, 168, + 140, 150, 152, 
154, 155, 161, 167, 171, 177, 172, + 184, 174, 188, 189, 191, 194, 203, 212, 215, 217, + 219, 221, 226, 228, 231, 240, 233, 235, 246, 251, + + 258, 253, 255, 256, 269, 271, 278, 272, 285, 283, + 287, 289, 296, 305, 298, 315, 319, 321, 322, 326, + 332, 333, 342, 339, 343, 0, 112, 173, 1357, 0, + 155, 0, 156, 132, 93, 0, 355, 357, 358, 360, + 364, 367, 374, 370, 379, 380, 389, 383, 390, 392, + 395, 408, 411, 409, 415, 418, 425, 427, 429, 436, + 431, 441, 446, 448, 450, 452, 453, 462, 471, 464, + 473, 474, 478, 485, 488, 490, 491, 494, 500, 501, + 504, 506, 507, 517, 518, 519, 520, 521, 522, 523, + 533, 536, 538, 543, 549, 554, 555, 561, 556, 566, + + 567, 576, 60, 0, 573, 578, 580, 582, 583, 593, + 589, 596, 598, 603, 605, 607, 610, 617, 619, 621, + 622, 628, 633, 634, 635, 639, 640, 649, 650, 652, + 653, 655, 659, 664, 668, 669, 665, 671, 674, 678, + 681, 685, 687, 688, 692, 697, 698, 701, 703, 704, + 707, 708, 717, 713, 728, 730, 724, 740, 734, 745, + 746, 750, 751, 756, 757, 760, 761, 762, 771, 773, + 42, 778, 782, 783, 787, 789, 792, 794, 793, 804, + 805, 808, 809, 810, 819, 823, 826, 828, 829, 830, + 835, 840, 844, 846, 847, 856, 857, 858, 859, 860, + + 863, 872, 873, 878, 879, 882, 885, 889, 894, 895, + 896, 898, 905, 910, 908, 912, 914, 915, 926, 930, + 931, 73, 932, 933, 935, 937, 942, 944, 946, 947, + 948, 949, 951, 958, 961, 965, 967, 972, 978, 979, + 981, 984, 983, 985, 994, 988, 999, 1000, 1001, 1004, + 1013, 1015, 1022, 1016, 1019, 1026, 1032, 1033, 1035, 1036, + 1038, 1039, 1048, 1049, 1050, 1051, 1053, 1054, 1060, 1063, + 1065, 1066, 1069, 1070, 1072, 1082, 1084, 1085, 1087, 1096, + 1097, 1098, 1099, 1101, 1113, 1114, 1115, 1116, 1117, 1118, + 1119, 1128, 1130, 1131, 1134, 1133, 1135, 1137, 1150, 1151, + + 1153, 1155, 1157, 1162, 1160, 1167, 1172, 1173, 1174, 1176, + 1185, 1190, 1183, 1187, 1189, 1199, 1204, 1206, 1208, 1210, + 1215, 1220, 1222, 1357, 1269, 1278, 1287, 1290, 1293, 1297, + 1306, 1315, 1324, 1333, 1340, 1344, 1347 + } ; + +static yyconst flex_int16_t yy_def[438] = + { 0, + 424, 1, 425, 425, 426, 426, 427, 427, 424, 424, + 424, 424, 424, 428, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 429, 424, 424, 424, 424, + 424, 430, 430, 430, 430, 430, 34, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 424, 424, 431, 432, 433, 424, 434, 424, 424, 428, + 424, 424, 424, 424, 429, 424, 424, 424, 424, 435, + 430, 436, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 431, 432, 432, 424, 433, + 424, 434, 424, 424, 424, 437, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 424, 437, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 424, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 
430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 424, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 430, 430, 430, 430, 430, 430, 430, + 430, 430, 430, 0, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424 + } ; + +static yyconst flex_int16_t yy_nxt[1414] = + { 0, + 10, 11, 12, 13, 10, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 10, 32, 33, 34, 35, 36, 37, + 38, 38, 39, 38, 38, 40, 41, 42, 43, 44, + 38, 45, 46, 47, 48, 49, 50, 38, 38, 38, + 38, 38, 38, 38, 51, 52, 59, 59, 59, 59, + 63, 70, 64, 67, 68, 70, 70, 70, 70, 72, + 63, 70, 64, 72, 72, 72, 72, 123, 75, 72, + 84, 70, 76, 73, 85, 77, 136, 79, 74, 72, + 86, 80, 90, 322, 81, 71, 70, 82, 78, 91, + + 83, 87, 92, 88, 72, 93, 70, 70, 94, 70, + 95, 70, 271, 89, 72, 72, 128, 72, 96, 72, + 98, 129, 424, 97, 99, 104, 70, 424, 70, 101, + 100, 70, 102, 105, 72, 106, 72, 107, 103, 72, + 108, 110, 59, 59, 113, 70, 203, 114, 134, 70, + 111, 112, 109, 72, 118, 70, 115, 72, 70, 133, + 116, 119, 131, 72, 117, 70, 72, 70, 120, 70, + 70, 121, 135, 122, 124, 72, 70, 72, 72, 137, + 138, 125, 70, 128, 72, 140, 70, 70, 129, 70, + 72, 141, 70, 424, 72, 72, 139, 72, 142, 70, + + 72, 144, 150, 70, 70, 143, 70, 72, 134, 70, + 145, 72, 72, 133, 72, 152, 146, 72, 70, 131, + 147, 148, 156, 69, 153, 66, 72, 70, 149, 151, + 70, 154, 70, 155, 70, 72, 70, 62, 72, 158, + 72, 70, 72, 70, 72, 157, 70, 159, 70, 72, + 70, 72, 61, 424, 72, 70, 72, 161, 72, 58, + 160, 70, 162, 72, 163, 164, 70, 165, 70, 72, + 70, 70, 168, 70, 72, 58, 72, 170, 72, 72, + 169, 72, 166, 167, 70, 172, 70, 70, 56, 171, + 174, 56, 72, 70, 72, 72, 173, 54, 70, 175, + + 70, 72, 70, 54, 70, 176, 72, 180, 72, 424, + 72, 70, 72, 70, 183, 177, 424, 178, 424, 72, + 70, 72, 181, 179, 184, 424, 182, 424, 72, 188, + 70, 186, 424, 189, 70, 185, 70, 70, 72, 187, + 190, 70, 72, 424, 72, 72, 193, 70, 70, 72, + 194, 191, 424, 424, 70, 72, 72, 70, 70, 424, + 198, 192, 72, 424, 196, 72, 72, 200, 424, 424, + 70, 201, 70, 70, 197, 70, 195, 199, 72, 70, + 72, 72, 70, 72, 202, 70, 205, 72, 424, 70, + 72, 208, 206, 72, 70, 70, 207, 72, 70, 209, + + 210, 424, 72, 72, 70, 70, 72, 70, 424, 216, + 70, 211, 72, 72, 424, 72, 218, 424, 72, 424, + 424, 212, 213, 70, 70, 214, 70, 217, 215, 424, + 70, 72, 72, 70, 72, 223, 219, 220, 72, 222, + 70, 72, 70, 221, 70, 424, 70, 424, 72, 424, + 72, 70, 72, 226, 72, 230, 70, 227, 224, 72, + 225, 70, 229, 70, 72, 70, 424, 70, 70, 72, + 424, 72, 228, 72, 232, 72, 72, 70, 233, 70, + 234, 236, 231, 424, 424, 72, 70, 72, 70, 70, + 424, 237, 238, 70, 72, 235, 72, 72, 240, 239, + + 70, 72, 242, 70, 424, 70, 70, 243, 72, 70, + 424, 72, 241, 72, 72, 70, 70, 72, 246, 70, + 244, 70, 70, 72, 72, 245, 248, 72, 249, 72, + 72, 247, 70, 70, 70, 70, 70, 70, 70, 250, + 72, 72, 72, 72, 72, 72, 72, 255, 70, 424, + 251, 70, 253, 70, 424, 424, 72, 252, 70, 72, + 424, 72, 256, 258, 70, 257, 72, 424, 254, 70, + 70, 70, 72, 259, 261, 262, 70, 72, 72, 72, + 260, 70, 70, 424, 72, 266, 263, 265, 70, 72, + 72, 70, 424, 70, 264, 70, 72, 70, 70, 72, 
+ + 267, 72, 269, 72, 70, 72, 72, 268, 70, 424, + 270, 70, 72, 70, 272, 273, 72, 274, 70, 72, + 70, 72, 70, 275, 277, 70, 72, 276, 72, 280, + 72, 281, 70, 72, 70, 279, 70, 70, 424, 424, + 72, 278, 72, 70, 72, 72, 286, 284, 70, 70, + 70, 72, 424, 282, 70, 70, 72, 72, 72, 285, + 283, 424, 72, 72, 70, 70, 288, 70, 70, 290, + 70, 287, 72, 72, 70, 72, 72, 424, 72, 70, + 70, 291, 72, 70, 70, 289, 70, 72, 72, 70, + 424, 72, 72, 70, 72, 292, 70, 72, 293, 297, + + 70, 72, 70, 70, 72, 295, 294, 70, 72, 296, + 72, 72, 70, 70, 298, 72, 70, 424, 70, 70, + 72, 72, 70, 70, 72, 299, 72, 72, 70, 302, + 72, 72, 70, 424, 424, 424, 72, 424, 300, 70, + 72, 301, 306, 70, 424, 70, 303, 72, 304, 70, + 305, 72, 307, 72, 308, 70, 424, 72, 309, 424, + 70, 70, 312, 72, 311, 70, 70, 310, 72, 72, + 424, 70, 70, 72, 72, 70, 70, 70, 313, 72, + 72, 314, 424, 72, 72, 72, 70, 317, 70, 319, + 320, 424, 424, 70, 72, 315, 72, 70, 70, 321, + + 316, 72, 70, 318, 70, 72, 72, 70, 70, 70, + 72, 424, 72, 424, 424, 72, 72, 72, 424, 70, + 70, 323, 327, 70, 70, 70, 324, 72, 72, 424, + 329, 72, 72, 72, 70, 325, 328, 331, 70, 326, + 424, 70, 72, 70, 70, 70, 72, 332, 330, 72, + 70, 72, 72, 72, 335, 70, 424, 424, 72, 70, + 333, 70, 70, 72, 334, 336, 337, 72, 424, 72, + 72, 70, 70, 70, 70, 70, 338, 424, 70, 72, + 72, 72, 72, 72, 424, 340, 72, 70, 70, 341, + 339, 424, 343, 70, 70, 72, 72, 70, 424, 344, + + 70, 72, 72, 342, 70, 72, 348, 424, 72, 70, + 70, 70, 72, 70, 424, 346, 345, 72, 72, 72, + 70, 72, 347, 70, 424, 70, 349, 70, 72, 70, + 70, 72, 350, 72, 354, 72, 351, 72, 72, 352, + 356, 70, 353, 358, 355, 70, 70, 70, 70, 72, + 70, 357, 70, 72, 72, 72, 72, 70, 72, 70, + 72, 70, 70, 70, 70, 72, 70, 72, 359, 72, + 72, 72, 72, 70, 72, 424, 70, 424, 424, 361, + 70, 72, 70, 362, 72, 360, 365, 70, 72, 363, + 72, 366, 364, 70, 70, 72, 70, 424, 70, 70, + + 70, 72, 72, 70, 72, 367, 72, 72, 72, 70, + 368, 72, 424, 424, 70, 70, 70, 72, 424, 70, + 369, 370, 72, 72, 72, 424, 374, 72, 70, 371, + 70, 70, 424, 375, 70, 372, 72, 70, 72, 72, + 373, 70, 72, 376, 379, 72, 377, 70, 70, 72, + 70, 70, 424, 70, 70, 72, 72, 378, 72, 72, + 380, 72, 72, 70, 70, 70, 70, 383, 70, 70, + 382, 72, 72, 72, 72, 70, 72, 72, 70, 381, + 70, 70, 424, 72, 70, 70, 72, 70, 72, 72, + 387, 386, 72, 72, 384, 72, 385, 70, 424, 70, + + 70, 424, 70, 424, 389, 72, 388, 72, 72, 390, + 72, 70, 70, 70, 70, 392, 70, 424, 424, 72, + 72, 72, 72, 393, 72, 391, 396, 424, 70, 70, + 70, 70, 70, 70, 70, 394, 72, 72, 72, 72, + 72, 72, 72, 70, 398, 70, 70, 395, 70, 70, + 70, 72, 70, 72, 72, 424, 72, 72, 72, 424, + 72, 399, 403, 397, 404, 70, 70, 400, 70, 401, + 70, 424, 70, 72, 72, 70, 72, 70, 72, 405, + 72, 402, 70, 72, 424, 72, 424, 70, 70, 70, + 72, 70, 406, 424, 407, 72, 72, 72, 70, 72, + + 70, 412, 70, 424, 70, 70, 72, 424, 72, 410, + 72, 408, 72, 72, 70, 409, 424, 413, 414, 70, + 415, 70, 72, 70, 411, 70, 424, 72, 416, 72, + 70, 72, 424, 72, 419, 70, 424, 70, 72, 417, + 418, 424, 424, 72, 420, 72, 424, 424, 421, 424, + 424, 424, 424, 424, 424, 424, 422, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 423, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 60, 424, 60, 65, + + 65, 65, 71, 71, 424, 71, 126, 126, 126, 126, + 424, 126, 126, 126, 126, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 130, 130, 130, 424, 130, 130, + 130, 130, 130, 132, 424, 132, 132, 132, 132, 132, + 132, 132, 136, 424, 424, 424, 424, 424, 136, 72, + 72, 424, 72, 204, 424, 204, 9, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 
424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424 + } ; + +static yyconst flex_int16_t yy_chk[1414] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 11, 11, 12, 12, + 25, 32, 25, 28, 28, 33, 38, 35, 49, 32, + 64, 34, 64, 33, 38, 35, 49, 49, 33, 34, + 35, 36, 33, 32, 35, 33, 322, 34, 32, 36, + 35, 34, 37, 271, 34, 37, 39, 34, 33, 37, + + 34, 36, 37, 36, 39, 37, 41, 40, 37, 42, + 39, 43, 203, 36, 41, 40, 54, 42, 39, 43, + 40, 54, 127, 39, 40, 43, 44, 127, 45, 41, + 40, 46, 42, 43, 44, 43, 45, 43, 42, 46, + 43, 45, 59, 59, 46, 47, 135, 46, 134, 48, + 45, 45, 44, 47, 47, 71, 46, 48, 50, 133, + 46, 47, 131, 71, 46, 72, 50, 73, 47, 74, + 75, 48, 70, 48, 50, 73, 76, 74, 75, 73, + 74, 50, 77, 128, 76, 75, 78, 80, 128, 82, + 77, 76, 79, 65, 78, 80, 74, 82, 76, 81, + + 79, 79, 82, 83, 84, 77, 85, 81, 63, 86, + 80, 83, 84, 58, 85, 84, 80, 86, 87, 56, + 81, 81, 86, 30, 84, 26, 87, 88, 81, 83, + 89, 84, 90, 85, 91, 88, 92, 24, 89, 88, + 90, 93, 91, 94, 92, 87, 95, 89, 97, 93, + 98, 94, 23, 9, 95, 96, 97, 91, 98, 8, + 90, 99, 92, 96, 93, 94, 100, 96, 102, 99, + 103, 104, 98, 101, 100, 7, 102, 100, 103, 104, + 99, 101, 96, 96, 105, 101, 106, 108, 6, 100, + 103, 5, 105, 107, 106, 108, 102, 4, 110, 106, + + 109, 107, 111, 3, 112, 107, 110, 110, 109, 0, + 111, 113, 112, 115, 111, 108, 0, 109, 0, 113, + 114, 115, 110, 109, 112, 0, 110, 0, 114, 114, + 116, 113, 0, 115, 117, 112, 118, 119, 116, 113, + 116, 120, 117, 0, 118, 119, 118, 121, 122, 120, + 119, 116, 0, 0, 124, 121, 122, 123, 125, 0, + 122, 117, 124, 0, 121, 123, 125, 124, 0, 0, + 137, 124, 138, 139, 121, 140, 120, 123, 137, 141, + 138, 139, 142, 140, 125, 144, 139, 141, 0, 143, + 142, 142, 140, 144, 145, 146, 141, 143, 148, 143, + + 143, 0, 145, 146, 147, 149, 148, 150, 0, 148, + 151, 144, 147, 149, 0, 150, 150, 0, 151, 0, + 0, 145, 146, 152, 154, 147, 153, 149, 147, 0, + 155, 152, 154, 156, 153, 154, 151, 151, 155, 153, + 157, 156, 158, 152, 159, 0, 161, 0, 157, 0, + 158, 160, 159, 157, 161, 161, 162, 157, 155, 160, + 156, 163, 160, 164, 162, 165, 0, 166, 167, 163, + 0, 164, 159, 165, 164, 166, 167, 168, 165, 170, + 166, 167, 163, 0, 0, 168, 169, 170, 171, 172, + 0, 167, 168, 173, 169, 166, 171, 172, 170, 169, + + 174, 173, 172, 175, 0, 176, 177, 173, 174, 178, + 0, 175, 171, 176, 177, 179, 180, 178, 176, 181, + 174, 182, 183, 179, 180, 175, 179, 181, 180, 182, + 183, 178, 184, 185, 186, 187, 188, 189, 190, 181, + 184, 185, 186, 187, 188, 189, 190, 186, 191, 0, + 182, 192, 184, 193, 0, 0, 191, 183, 194, 192, + 0, 193, 188, 192, 195, 190, 194, 0, 185, 196, + 197, 199, 195, 193, 195, 195, 198, 196, 197, 199, + 194, 200, 201, 0, 198, 198, 195, 197, 205, 200, + 201, 202, 0, 206, 196, 207, 205, 208, 209, 202, + + 199, 206, 201, 207, 211, 208, 209, 200, 210, 0, + 202, 212, 211, 213, 205, 206, 210, 207, 214, 212, + 215, 213, 216, 208, 212, 217, 214, 210, 215, 215, + 216, 216, 218, 217, 219, 214, 220, 221, 0, 0, + 218, 213, 219, 222, 220, 221, 221, 219, 223, 224, + 225, 222, 0, 217, 226, 227, 223, 224, 225, 220, + 218, 0, 226, 227, 228, 229, 224, 230, 231, 227, + 232, 222, 228, 229, 233, 230, 231, 0, 232, 234, + 237, 229, 233, 235, 236, 225, 238, 234, 237, 239, + 0, 235, 236, 240, 238, 230, 241, 239, 232, 236, + + 242, 240, 243, 244, 241, 234, 233, 
245, 242, 235, + 243, 244, 246, 247, 238, 245, 248, 0, 249, 250, + 246, 247, 251, 252, 248, 243, 249, 250, 254, 248, + 251, 252, 253, 0, 0, 0, 254, 0, 246, 257, + 253, 247, 253, 255, 0, 256, 250, 257, 251, 259, + 252, 255, 254, 256, 255, 258, 0, 259, 256, 0, + 260, 261, 259, 258, 258, 262, 263, 257, 260, 261, + 0, 264, 265, 262, 263, 266, 267, 268, 261, 264, + 265, 262, 0, 266, 267, 268, 269, 265, 270, 267, + 268, 0, 0, 272, 269, 263, 270, 273, 274, 269, + + 264, 272, 275, 266, 276, 273, 274, 277, 279, 278, + 275, 0, 276, 0, 0, 277, 279, 278, 0, 280, + 281, 272, 278, 282, 283, 284, 274, 280, 281, 0, + 280, 282, 283, 284, 285, 275, 279, 283, 286, 276, + 0, 287, 285, 288, 289, 290, 286, 284, 281, 287, + 291, 288, 289, 290, 287, 292, 0, 0, 291, 293, + 285, 294, 295, 292, 286, 288, 289, 293, 0, 294, + 295, 296, 297, 298, 299, 300, 293, 0, 301, 296, + 297, 298, 299, 300, 0, 297, 301, 302, 303, 298, + 295, 0, 301, 304, 305, 302, 303, 306, 0, 302, + + 307, 304, 305, 299, 308, 306, 306, 0, 307, 309, + 310, 311, 308, 312, 0, 304, 303, 309, 310, 311, + 313, 312, 305, 315, 0, 314, 307, 316, 313, 317, + 318, 315, 308, 314, 314, 316, 310, 317, 318, 311, + 316, 319, 313, 318, 315, 320, 321, 323, 324, 319, + 325, 317, 326, 320, 321, 323, 324, 327, 325, 328, + 326, 329, 330, 331, 332, 327, 333, 328, 319, 329, + 330, 331, 332, 334, 333, 0, 335, 0, 0, 326, + 336, 334, 337, 327, 335, 325, 334, 338, 336, 329, + 337, 336, 332, 339, 340, 338, 341, 0, 343, 342, + + 344, 339, 340, 346, 341, 337, 343, 342, 344, 345, + 338, 346, 0, 0, 347, 348, 349, 345, 0, 350, + 340, 342, 347, 348, 349, 0, 348, 350, 351, 344, + 352, 354, 0, 349, 355, 345, 351, 353, 352, 354, + 347, 356, 355, 352, 355, 353, 353, 357, 358, 356, + 359, 360, 0, 361, 362, 357, 358, 354, 359, 360, + 357, 361, 362, 363, 364, 365, 366, 362, 367, 368, + 361, 363, 364, 365, 366, 369, 367, 368, 370, 360, + 371, 372, 0, 369, 373, 374, 370, 375, 371, 372, + 370, 368, 373, 374, 366, 375, 367, 376, 0, 377, + + 378, 0, 379, 0, 374, 376, 371, 377, 378, 375, + 379, 380, 381, 382, 383, 379, 384, 0, 0, 380, + 381, 382, 383, 380, 384, 377, 383, 0, 385, 386, + 387, 388, 389, 390, 391, 381, 385, 386, 387, 388, + 389, 390, 391, 392, 388, 393, 394, 382, 396, 395, + 397, 392, 398, 393, 394, 0, 396, 395, 397, 0, + 398, 390, 395, 385, 397, 399, 400, 391, 401, 392, + 402, 0, 403, 399, 400, 405, 401, 404, 402, 399, + 403, 394, 406, 405, 0, 404, 0, 407, 408, 409, + 406, 410, 402, 0, 404, 407, 408, 409, 413, 410, + + 411, 410, 414, 0, 415, 412, 413, 0, 411, 408, + 414, 406, 415, 412, 416, 407, 0, 411, 412, 417, + 413, 418, 416, 419, 409, 420, 0, 417, 414, 418, + 421, 419, 0, 420, 418, 422, 0, 423, 421, 415, + 417, 0, 0, 422, 419, 423, 0, 0, 420, 0, + 0, 0, 0, 0, 0, 0, 421, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 422, 425, + 425, 425, 425, 425, 425, 425, 425, 425, 426, 426, + 426, 426, 426, 426, 426, 426, 426, 427, 427, 427, + 427, 427, 427, 427, 427, 427, 428, 0, 428, 429, + + 429, 429, 430, 430, 0, 430, 431, 431, 431, 431, + 0, 431, 431, 431, 431, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 433, 433, 433, 0, 433, 433, + 433, 433, 433, 434, 0, 434, 434, 434, 434, 434, + 434, 434, 435, 0, 0, 0, 0, 0, 435, 436, + 436, 0, 436, 437, 0, 437, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + + 424, 424, 424, 424, 424, 424, 424, 424, 424, 424, + 424, 424, 424 + } ; + +static yy_state_type 
yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +extern int yy_flex_debug; +int yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *yytext; +#line 1 "pars0lex.l" +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/****************************************************** +SQL parser lexical analyzer: input file for the GNU Flex lexer generator + +The InnoDB parser is frozen because MySQL takes care of SQL parsing. +Therefore we normally keep the InnoDB parser C files as they are, and do +not automatically generate them from pars0grm.y and pars0lex.l. + +How to make the InnoDB parser and lexer C files: + +1. Run ./make_flex.sh to generate lexer files. + +2. Run ./make_bison.sh to generate parser files. + +These instructions seem to work at least with bison-1.875d and flex-2.5.31 on +Linux. + +Created 12/14/1997 Heikki Tuuri +*******************************************************/ +#define YY_NO_INPUT 1 +#define YY_NO_UNISTD_H 1 +#line 53 "pars0lex.l" +#define YYSTYPE que_node_t* + +#include "univ.i" +#include "pars0pars.h" +#include "pars0grm.h" +#include "pars0sym.h" +#include "mem0mem.h" +#include "os0proc.h" + +#define malloc(A) ut_malloc(A) +#define free(A) ut_free(A) +#define realloc(P, A) ut_realloc(P, A) +#define exit(A) ut_error + +/* Note: We cast &result to int* from yysize_t* */ +#define YY_INPUT(buf, result, max_size) \ + (result = pars_get_lex_chars(buf, max_size)) + +/* String buffer for removing quotes */ +static ulint stringbuf_len_alloc = 0; /* Allocated length */ +static ulint stringbuf_len = 0; /* Current length */ +static char* stringbuf; /* Start of buffer */ +/** Appends a string to the buffer. */ +static +void +string_append( +/*==========*/ + const char* str, /*!< in: string to be appended */ + ulint len) /*!< in: length of the string */ +{ + if (stringbuf == NULL) { + stringbuf = static_cast<char*>(malloc(1)); + stringbuf_len_alloc = 1; + } + + if (stringbuf_len + len > stringbuf_len_alloc) { + while (stringbuf_len + len > stringbuf_len_alloc) { + stringbuf_len_alloc <<= 1; + } + + stringbuf = static_cast<char*>( + realloc(stringbuf, stringbuf_len_alloc)); + } + + memcpy(stringbuf + stringbuf_len, str, len); + stringbuf_len += len; +} + + + + +#line 1006 "lexyy.cc" + +#define INITIAL 0 +#define comment 1 +#define quoted 2 +#define id 3 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. 
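string_append() above grows its buffer by repeated doubling, which makes a long sequence of appends amortized linear. A standalone, runnable version of the same scheme with a usage example (plain C names; the original casts through static_cast because lexyy.cc is compiled as C++):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char*	buf;
static size_t	buf_len;
static size_t	buf_alloc;

/* Append len bytes, doubling the capacity until the data fits. */
static void
append(const char* str, size_t len)
{
	if (buf == NULL) {
		buf = malloc(1);
		buf_alloc = 1;
	}
	if (buf_len + len > buf_alloc) {
		while (buf_len + len > buf_alloc) {
			buf_alloc <<= 1;
		}
		buf = realloc(buf, buf_alloc);
	}
	memcpy(buf + buf_len, str, len);
	buf_len += len;
}

int main(void)
{
	append("it's", 4);
	append(" quoted", 7);
	printf("%.*s (len %zu, cap %zu)\n",
	       (int) buf_len, buf, buf_len, buf_alloc);
	free(buf);
	return 0;	/* prints: it's quoted (len 11, cap 16) */
}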
We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +static int yy_init_globals (void ); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +__attribute__((unused)) static int yylex_destroy (void ); + +int yyget_debug (void ); + +void yyset_debug (int debug_flag ); + +YY_EXTRA_TYPE yyget_extra (void ); + +void yyset_extra (YY_EXTRA_TYPE user_defined ); + +FILE *yyget_in (void ); + +void yyset_in (FILE * in_str ); + +FILE *yyget_out (void ); + +void yyset_out (FILE * out_str ); + +yy_size_t yyget_leng (void ); + +char *yyget_text (void ); + +int yyget_lineno (void ); + +void yyset_lineno (int line_number ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap (void ); +#else +extern int yywrap (void ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (void ); +#else +static int input (void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + size_t n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. 
+ */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (void); + +#define YY_DECL int yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + +#line 112 "pars0lex.l" + + +#line 1197 "lexyy.cc" + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_load_buffer_state( ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 425 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_current_state != 424 ); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 114 "pars0lex.l" +{ + yylval = sym_tab_add_int_lit(pars_sym_tab_global, + atoi(yytext)); + return(PARS_INT_LIT); +} + YY_BREAK +case 2: +YY_RULE_SETUP +#line 120 "pars0lex.l" +{ + ut_error; /* not implemented */ + + return(PARS_FLOAT_LIT); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 126 "pars0lex.l" +{ + ulint type; + + yylval = sym_tab_add_bound_lit(pars_sym_tab_global, + yytext + 1, &type); + + return((int) type); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 135 "pars0lex.l" +{ + yylval = sym_tab_add_bound_id(pars_sym_tab_global, + yytext + 1); + + return(PARS_ID_TOKEN); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 142 "pars0lex.l" +{ +/* Quoted character string literals are handled in an explicit +start state 'quoted'. This state is entered and the buffer for +the scanned string is emptied upon encountering a starting quote. + +In the state 'quoted', only two actions are possible (defined below). 
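+
+For example, the input 'don''t' enters the 'quoted' state at its first
+quote; rule 6 below appends the plain characters, rule 7 sees the
+doubled '' and appends a single ', and the final lone quote ends the
+literal, so the scanned string is: don't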
*/ + BEGIN(quoted); + stringbuf_len = 0; +} + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 151 "pars0lex.l" +{ + /* Got a sequence of characters other than "'": + append to string buffer */ + string_append(yytext, yyleng); +} + YY_BREAK +case 7: +YY_RULE_SETUP +#line 156 "pars0lex.l" +{ + /* Got a sequence of "'" characters: + append half of them to string buffer, + as "''" represents a single "'". + We apply truncating division, + so that "'''" will result in "'". */ + + string_append(yytext, yyleng / 2); + + /* If we got an odd number of quotes, then the + last quote we got is the terminating quote. + At the end of the string, we return to the + initial start state and report the scanned + string literal. */ + + if (yyleng % 2) { + BEGIN(INITIAL); + yylval = sym_tab_add_str_lit( + pars_sym_tab_global, + (byte*) stringbuf, stringbuf_len); + return(PARS_STR_LIT); + } +} + YY_BREAK +case 8: +YY_RULE_SETUP +#line 180 "pars0lex.l" +{ +/* Quoted identifiers are handled in an explicit start state 'id'. +This state is entered and the buffer for the scanned string is emptied +upon encountering a starting quote. + +In the state 'id', only two actions are possible (defined below). */ + BEGIN(id); + stringbuf_len = 0; +} + YY_BREAK +case 9: +/* rule 9 can match eol */ +YY_RULE_SETUP +#line 189 "pars0lex.l" +{ + /* Got a sequence of characters other than '"': + append to string buffer */ + string_append(yytext, yyleng); +} + YY_BREAK +case 10: +YY_RULE_SETUP +#line 194 "pars0lex.l" +{ + /* Got a sequence of '"' characters: + append half of them to string buffer, + as '""' represents a single '"'. + We apply truncating division, + so that '"""' will result in '"'. */ + + string_append(yytext, yyleng / 2); + + /* If we got an odd number of quotes, then the + last quote we got is the terminating quote. + At the end of the string, we return to the + initial start state and report the scanned + identifier. 
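+
+	   For example, the quoted identifier "my""table" is reported
+	   as the identifier: my"table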
*/ + + if (yyleng % 2) { + BEGIN(INITIAL); + yylval = sym_tab_add_id( + pars_sym_tab_global, + (byte*) stringbuf, stringbuf_len); + + return(PARS_ID_TOKEN); + } +} + YY_BREAK +case 11: +YY_RULE_SETUP +#line 219 "pars0lex.l" +{ + yylval = sym_tab_add_null_lit(pars_sym_tab_global); + + return(PARS_NULL_LIT); +} + YY_BREAK +case 12: +YY_RULE_SETUP +#line 225 "pars0lex.l" +{ + /* Implicit cursor name */ + yylval = sym_tab_add_str_lit(pars_sym_tab_global, + (byte*) yytext, yyleng); + return(PARS_SQL_TOKEN); +} + YY_BREAK +case 13: +YY_RULE_SETUP +#line 232 "pars0lex.l" +{ + return(PARS_AND_TOKEN); +} + YY_BREAK +case 14: +YY_RULE_SETUP +#line 236 "pars0lex.l" +{ + return(PARS_OR_TOKEN); +} + YY_BREAK +case 15: +YY_RULE_SETUP +#line 240 "pars0lex.l" +{ + return(PARS_NOT_TOKEN); +} + YY_BREAK +case 16: +YY_RULE_SETUP +#line 244 "pars0lex.l" +{ + return(PARS_PROCEDURE_TOKEN); +} + YY_BREAK +case 17: +YY_RULE_SETUP +#line 248 "pars0lex.l" +{ + return(PARS_IN_TOKEN); +} + YY_BREAK +case 18: +YY_RULE_SETUP +#line 252 "pars0lex.l" +{ + return(PARS_OUT_TOKEN); +} + YY_BREAK +case 19: +YY_RULE_SETUP +#line 256 "pars0lex.l" +{ + return(PARS_BINARY_TOKEN); +} + YY_BREAK +case 20: +YY_RULE_SETUP +#line 260 "pars0lex.l" +{ + return(PARS_BLOB_TOKEN); +} + YY_BREAK +case 21: +YY_RULE_SETUP +#line 264 "pars0lex.l" +{ + return(PARS_INT_TOKEN); +} + YY_BREAK +case 22: +YY_RULE_SETUP +#line 268 "pars0lex.l" +{ + return(PARS_INT_TOKEN); +} + YY_BREAK +case 23: +YY_RULE_SETUP +#line 272 "pars0lex.l" +{ + return(PARS_FLOAT_TOKEN); +} + YY_BREAK +case 24: +YY_RULE_SETUP +#line 276 "pars0lex.l" +{ + return(PARS_CHAR_TOKEN); +} + YY_BREAK +case 25: +YY_RULE_SETUP +#line 280 "pars0lex.l" +{ + return(PARS_IS_TOKEN); +} + YY_BREAK +case 26: +YY_RULE_SETUP +#line 284 "pars0lex.l" +{ + return(PARS_BEGIN_TOKEN); +} + YY_BREAK +case 27: +YY_RULE_SETUP +#line 288 "pars0lex.l" +{ + return(PARS_END_TOKEN); +} + YY_BREAK +case 28: +YY_RULE_SETUP +#line 292 "pars0lex.l" +{ + return(PARS_IF_TOKEN); +} + YY_BREAK +case 29: +YY_RULE_SETUP +#line 296 "pars0lex.l" +{ + return(PARS_THEN_TOKEN); +} + YY_BREAK +case 30: +YY_RULE_SETUP +#line 300 "pars0lex.l" +{ + return(PARS_ELSE_TOKEN); +} + YY_BREAK +case 31: +YY_RULE_SETUP +#line 304 "pars0lex.l" +{ + return(PARS_ELSIF_TOKEN); +} + YY_BREAK +case 32: +YY_RULE_SETUP +#line 308 "pars0lex.l" +{ + return(PARS_LOOP_TOKEN); +} + YY_BREAK +case 33: +YY_RULE_SETUP +#line 312 "pars0lex.l" +{ + return(PARS_WHILE_TOKEN); +} + YY_BREAK +case 34: +YY_RULE_SETUP +#line 316 "pars0lex.l" +{ + return(PARS_RETURN_TOKEN); +} + YY_BREAK +case 35: +YY_RULE_SETUP +#line 320 "pars0lex.l" +{ + return(PARS_SELECT_TOKEN); +} + YY_BREAK +case 36: +YY_RULE_SETUP +#line 324 "pars0lex.l" +{ + return(PARS_SUM_TOKEN); +} + YY_BREAK +case 37: +YY_RULE_SETUP +#line 328 "pars0lex.l" +{ + return(PARS_COUNT_TOKEN); +} + YY_BREAK +case 38: +YY_RULE_SETUP +#line 332 "pars0lex.l" +{ + return(PARS_DISTINCT_TOKEN); +} + YY_BREAK +case 39: +YY_RULE_SETUP +#line 336 "pars0lex.l" +{ + return(PARS_FROM_TOKEN); +} + YY_BREAK +case 40: +YY_RULE_SETUP +#line 340 "pars0lex.l" +{ + return(PARS_WHERE_TOKEN); +} + YY_BREAK +case 41: +YY_RULE_SETUP +#line 344 "pars0lex.l" +{ + return(PARS_FOR_TOKEN); +} + YY_BREAK +case 42: +YY_RULE_SETUP +#line 348 "pars0lex.l" +{ + return(PARS_READ_TOKEN); +} + YY_BREAK +case 43: +YY_RULE_SETUP +#line 352 "pars0lex.l" +{ + return(PARS_ORDER_TOKEN); +} + YY_BREAK +case 44: +YY_RULE_SETUP +#line 356 "pars0lex.l" +{ + return(PARS_BY_TOKEN); +} + YY_BREAK +case 45: +YY_RULE_SETUP +#line 360 "pars0lex.l" +{ 
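+	/* keyword "ASC" (per pars0lex.l line 360) */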
+ return(PARS_ASC_TOKEN); +} + YY_BREAK +case 46: +YY_RULE_SETUP +#line 364 "pars0lex.l" +{ + return(PARS_DESC_TOKEN); +} + YY_BREAK +case 47: +YY_RULE_SETUP +#line 368 "pars0lex.l" +{ + return(PARS_INSERT_TOKEN); +} + YY_BREAK +case 48: +YY_RULE_SETUP +#line 372 "pars0lex.l" +{ + return(PARS_INTO_TOKEN); +} + YY_BREAK +case 49: +YY_RULE_SETUP +#line 376 "pars0lex.l" +{ + return(PARS_VALUES_TOKEN); +} + YY_BREAK +case 50: +YY_RULE_SETUP +#line 380 "pars0lex.l" +{ + return(PARS_UPDATE_TOKEN); +} + YY_BREAK +case 51: +YY_RULE_SETUP +#line 384 "pars0lex.l" +{ + return(PARS_SET_TOKEN); +} + YY_BREAK +case 52: +YY_RULE_SETUP +#line 388 "pars0lex.l" +{ + return(PARS_DELETE_TOKEN); +} + YY_BREAK +case 53: +YY_RULE_SETUP +#line 392 "pars0lex.l" +{ + return(PARS_CURRENT_TOKEN); +} + YY_BREAK +case 54: +YY_RULE_SETUP +#line 396 "pars0lex.l" +{ + return(PARS_OF_TOKEN); +} + YY_BREAK +case 55: +YY_RULE_SETUP +#line 400 "pars0lex.l" +{ + return(PARS_CREATE_TOKEN); +} + YY_BREAK +case 56: +YY_RULE_SETUP +#line 404 "pars0lex.l" +{ + return(PARS_TABLE_TOKEN); +} + YY_BREAK +case 57: +YY_RULE_SETUP +#line 408 "pars0lex.l" +{ + return(PARS_COMPACT_TOKEN); +} + YY_BREAK +case 58: +YY_RULE_SETUP +#line 412 "pars0lex.l" +{ + return(PARS_BLOCK_SIZE_TOKEN); +} + YY_BREAK +case 59: +YY_RULE_SETUP +#line 416 "pars0lex.l" +{ + return(PARS_INDEX_TOKEN); +} + YY_BREAK +case 60: +YY_RULE_SETUP +#line 420 "pars0lex.l" +{ + return(PARS_UNIQUE_TOKEN); +} + YY_BREAK +case 61: +YY_RULE_SETUP +#line 424 "pars0lex.l" +{ + return(PARS_CLUSTERED_TOKEN); +} + YY_BREAK +case 62: +YY_RULE_SETUP +#line 428 "pars0lex.l" +{ + return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN); +} + YY_BREAK +case 63: +YY_RULE_SETUP +#line 432 "pars0lex.l" +{ + return(PARS_ON_TOKEN); +} + YY_BREAK +case 64: +YY_RULE_SETUP +#line 436 "pars0lex.l" +{ + return(PARS_DECLARE_TOKEN); +} + YY_BREAK +case 65: +YY_RULE_SETUP +#line 440 "pars0lex.l" +{ + return(PARS_CURSOR_TOKEN); +} + YY_BREAK +case 66: +YY_RULE_SETUP +#line 444 "pars0lex.l" +{ + return(PARS_OPEN_TOKEN); +} + YY_BREAK +case 67: +YY_RULE_SETUP +#line 448 "pars0lex.l" +{ + return(PARS_FETCH_TOKEN); +} + YY_BREAK +case 68: +YY_RULE_SETUP +#line 452 "pars0lex.l" +{ + return(PARS_CLOSE_TOKEN); +} + YY_BREAK +case 69: +YY_RULE_SETUP +#line 456 "pars0lex.l" +{ + return(PARS_NOTFOUND_TOKEN); +} + YY_BREAK +case 70: +YY_RULE_SETUP +#line 460 "pars0lex.l" +{ + return(PARS_TO_CHAR_TOKEN); +} + YY_BREAK +case 71: +YY_RULE_SETUP +#line 464 "pars0lex.l" +{ + return(PARS_TO_NUMBER_TOKEN); +} + YY_BREAK +case 72: +YY_RULE_SETUP +#line 468 "pars0lex.l" +{ + return(PARS_TO_BINARY_TOKEN); +} + YY_BREAK +case 73: +YY_RULE_SETUP +#line 472 "pars0lex.l" +{ + return(PARS_BINARY_TO_NUMBER_TOKEN); +} + YY_BREAK +case 74: +YY_RULE_SETUP +#line 476 "pars0lex.l" +{ + return(PARS_SUBSTR_TOKEN); +} + YY_BREAK +case 75: +YY_RULE_SETUP +#line 480 "pars0lex.l" +{ + return(PARS_REPLSTR_TOKEN); +} + YY_BREAK +case 76: +YY_RULE_SETUP +#line 484 "pars0lex.l" +{ + return(PARS_CONCAT_TOKEN); +} + YY_BREAK +case 77: +YY_RULE_SETUP +#line 488 "pars0lex.l" +{ + return(PARS_INSTR_TOKEN); +} + YY_BREAK +case 78: +YY_RULE_SETUP +#line 492 "pars0lex.l" +{ + return(PARS_LENGTH_TOKEN); +} + YY_BREAK +case 79: +YY_RULE_SETUP +#line 496 "pars0lex.l" +{ + return(PARS_SYSDATE_TOKEN); +} + YY_BREAK +case 80: +YY_RULE_SETUP +#line 500 "pars0lex.l" +{ + return(PARS_PRINTF_TOKEN); +} + YY_BREAK +case 81: +YY_RULE_SETUP +#line 504 "pars0lex.l" +{ + return(PARS_ASSERT_TOKEN); +} + YY_BREAK +case 82: +YY_RULE_SETUP +#line 508 "pars0lex.l" +{ + 
return(PARS_RND_TOKEN); +} + YY_BREAK +case 83: +YY_RULE_SETUP +#line 512 "pars0lex.l" +{ + return(PARS_RND_STR_TOKEN); +} + YY_BREAK +case 84: +YY_RULE_SETUP +#line 516 "pars0lex.l" +{ + return(PARS_ROW_PRINTF_TOKEN); +} + YY_BREAK +case 85: +YY_RULE_SETUP +#line 520 "pars0lex.l" +{ + return(PARS_COMMIT_TOKEN); +} + YY_BREAK +case 86: +YY_RULE_SETUP +#line 524 "pars0lex.l" +{ + return(PARS_ROLLBACK_TOKEN); +} + YY_BREAK +case 87: +YY_RULE_SETUP +#line 528 "pars0lex.l" +{ + return(PARS_WORK_TOKEN); +} + YY_BREAK +case 88: +YY_RULE_SETUP +#line 532 "pars0lex.l" +{ + return(PARS_UNSIGNED_TOKEN); +} + YY_BREAK +case 89: +YY_RULE_SETUP +#line 536 "pars0lex.l" +{ + return(PARS_EXIT_TOKEN); +} + YY_BREAK +case 90: +YY_RULE_SETUP +#line 540 "pars0lex.l" +{ + return(PARS_FUNCTION_TOKEN); +} + YY_BREAK +case 91: +YY_RULE_SETUP +#line 544 "pars0lex.l" +{ + return(PARS_LOCK_TOKEN); +} + YY_BREAK +case 92: +YY_RULE_SETUP +#line 548 "pars0lex.l" +{ + return(PARS_SHARE_TOKEN); +} + YY_BREAK +case 93: +YY_RULE_SETUP +#line 552 "pars0lex.l" +{ + return(PARS_MODE_TOKEN); +} + YY_BREAK +case 94: +YY_RULE_SETUP +#line 556 "pars0lex.l" +{ + return(PARS_LIKE_TOKEN); +} + YY_BREAK +case 95: +YY_RULE_SETUP +#line 560 "pars0lex.l" +{ + return(PARS_BIGINT_TOKEN); +} + YY_BREAK +case 96: +YY_RULE_SETUP +#line 564 "pars0lex.l" +{ + yylval = sym_tab_add_id(pars_sym_tab_global, + (byte*) yytext, + ut_strlen(yytext)); + return(PARS_ID_TOKEN); +} + YY_BREAK +case 97: +YY_RULE_SETUP +#line 571 "pars0lex.l" +{ + yylval = sym_tab_add_id(pars_sym_tab_global, + (byte*) yytext, + ut_strlen(yytext)); + return(PARS_TABLE_NAME_TOKEN); +} + YY_BREAK +case 98: +YY_RULE_SETUP +#line 578 "pars0lex.l" +{ + return(PARS_DDOT_TOKEN); +} + YY_BREAK +case 99: +YY_RULE_SETUP +#line 582 "pars0lex.l" +{ + return(PARS_ASSIGN_TOKEN); +} + YY_BREAK +case 100: +YY_RULE_SETUP +#line 586 "pars0lex.l" +{ + return(PARS_LE_TOKEN); +} + YY_BREAK +case 101: +YY_RULE_SETUP +#line 590 "pars0lex.l" +{ + return(PARS_GE_TOKEN); +} + YY_BREAK +case 102: +YY_RULE_SETUP +#line 594 "pars0lex.l" +{ + return(PARS_NE_TOKEN); +} + YY_BREAK +case 103: +YY_RULE_SETUP +#line 598 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 104: +YY_RULE_SETUP +#line 603 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 105: +YY_RULE_SETUP +#line 608 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 106: +YY_RULE_SETUP +#line 613 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 107: +YY_RULE_SETUP +#line 618 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 108: +YY_RULE_SETUP +#line 623 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 109: +YY_RULE_SETUP +#line 628 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 110: +YY_RULE_SETUP +#line 633 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 111: +YY_RULE_SETUP +#line 638 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 112: +YY_RULE_SETUP +#line 643 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 113: +YY_RULE_SETUP +#line 648 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 114: +YY_RULE_SETUP +#line 653 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 115: +YY_RULE_SETUP +#line 658 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 116: +YY_RULE_SETUP +#line 663 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 117: +YY_RULE_SETUP +#line 668 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 118: +YY_RULE_SETUP 
+#line 673 "pars0lex.l" +BEGIN(comment); /* eat up comment */ + YY_BREAK +case 119: +/* rule 119 can match eol */ +YY_RULE_SETUP +#line 675 "pars0lex.l" + + YY_BREAK +case 120: +/* rule 120 can match eol */ +YY_RULE_SETUP +#line 676 "pars0lex.l" + + YY_BREAK +case 121: +YY_RULE_SETUP +#line 677 "pars0lex.l" +BEGIN(INITIAL); + YY_BREAK +case 122: +/* rule 122 can match eol */ +YY_RULE_SETUP +#line 679 "pars0lex.l" +/* eat up whitespace */ + YY_BREAK +case 123: +YY_RULE_SETUP +#line 682 "pars0lex.l" +{ + fprintf(stderr,"Unrecognized character: %02x\n", + *yytext); + + ut_error; + + return(0); +} + YY_BREAK +case 124: +YY_RULE_SETUP +#line 691 "pars0lex.l" +YY_FATAL_ERROR( "flex scanner jammed" ); + YY_BREAK +#line 2237 "lexyy.cc" +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(comment): +case YY_STATE_EOF(quoted): +case YY_STATE_EOF(id): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. 
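+					 * In other words, end-of-input
+					 * is sticky: each further call
+					 * just returns YY_NULL again
+					 * instead of reading past EOF.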
+ */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + int num_to_read = static_cast<int>( + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1); + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = static_cast<int>(b->yy_buf_size * 2); + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = static_cast<int>( + YY_CURRENT_BUFFER_LVALUE->yy_buf_size + - number_to_move - 1); + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), (size_t) num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart(yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 425 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + register char *yy_cp = (yy_c_buf_p); + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 425 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 424); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. 
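+			 * (an embedded NUL byte in the input itself,
+			 * not the end-of-buffer sentinel)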
*/ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (int)((yy_c_buf_p) - (yytext_ptr)); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart(yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( ) ) + return EOF; + + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_init_buffer(YY_CURRENT_BUFFER,input_file ); + yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + static YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ); + if ( ! 
b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * + */ + void yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree((void *) b->yy_ch_buf ); + + yyfree((void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + yy_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +void yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (void) +{ + int num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. 
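+	 * (As generated, num_to_alloc below is actually 1, so the first
+	 * yypush_buffer_state() still triggers the grow path; behaviour
+	 * is correct, only the sentence above is stale.)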
+ */ + num_to_alloc = 1; + (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = static_cast<int>( + (yy_buffer_stack_max) + grow_size); + (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg ) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int yyget_lineno (void) +{ + + return yylineno; +} + +/** Get the input stream. + * + */ +FILE *yyget_in (void) +{ + return yyin; +} + +/** Get the output stream. + * + */ +FILE *yyget_out (void) +{ + return yyout; +} + +/** Get the length of the current token. + * + */ +yy_size_t yyget_leng (void) +{ + return yyleng; +} + +/** Get the current token. + * + */ + +char *yyget_text (void) +{ + return yytext; +} + +/** Set the current line number. + * @param line_number + * + */ +void yyset_lineno (int line_number ) +{ + + yylineno = line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. + * + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * in_str ) +{ + yyin = in_str ; +} + +void yyset_out (FILE * out_str ) +{ + yyout = out_str ; +} + +int yyget_debug (void) +{ + return yy_flex_debug; +} + +void yyset_debug (int bdebug ) +{ + yy_flex_debug = bdebug ; +} + +static int yy_init_globals (void) +{ + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. + */ + + (yy_buffer_stack) = 0; + (yy_buffer_stack_top) = 0; + (yy_buffer_stack_max) = 0; + (yy_c_buf_p) = (char *) 0; + (yy_init) = 0; + (yy_start) = 0; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = (FILE *) 0; + yyout = (FILE *) 0; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +__attribute__((unused)) static int yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. 
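+	 * pars_lexer_close() below relies on this to release every
+	 * scanner buffer, and the stack itself, at parser shutdown.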
*/ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( ); + + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s ) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size ) +{ + return (void *) malloc( size ); +} + +void *yyrealloc (void * ptr, yy_size_t size ) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return (void *) realloc( (char *) ptr, size ); +} + +void yyfree (void * ptr ) +{ + free( (char*) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 691 "pars0lex.l" + + + +/********************************************************************** +Release any resources used by the lexer. */ +UNIV_INTERN +void +pars_lexer_close(void) +/*==================*/ +{ + yylex_destroy(); + free(stringbuf); + stringbuf = NULL; + stringbuf_len_alloc = stringbuf_len = 0; +} + diff --git a/storage/xtradb/pars/make_bison.sh b/storage/xtradb/pars/make_bison.sh new file mode 100755 index 00000000000..2618be102bc --- /dev/null +++ b/storage/xtradb/pars/make_bison.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA +# +# generate parser files from bison input files. + +set -eu +TMPFILE=pars0grm.tab.c +OUTFILE=pars0grm.cc + +bison -d pars0grm.y +mv pars0grm.tab.h ../include/pars0grm.h + +sed -e ' +s/'"$TMPFILE"'/'"$OUTFILE"'/; +s/^\(\(YYSTYPE\|int\) yy\(char\|nerrs\)\)/static \1/; +s/\(\(YYSTYPE\|int\) yy\(lval\|parse\)\)/UNIV_INTERN \1/; +' < "$TMPFILE" > "$OUTFILE" + +rm "$TMPFILE" diff --git a/storage/xtradb/pars/make_flex.sh b/storage/xtradb/pars/make_flex.sh new file mode 100755 index 00000000000..581fc2342aa --- /dev/null +++ b/storage/xtradb/pars/make_flex.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. 
+# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA +# +# generate lexer files from flex input files. + +set -eu + +TMPFILE=_flex_tmp.cc +OUTFILE=lexyy.cc + +flex -o $TMPFILE pars0lex.l + +# AIX needs its includes done in a certain order, so include "univ.i" first +# to be sure we get it right. +echo '#include "univ.i"' > $OUTFILE + +# flex assigns a pointer to an int in one place without a cast, resulting in +# a warning on Win64. Add the cast. Also define some symbols as static. +sed -e ' +s/'"$TMPFILE"'/'"$OUTFILE"'/; +s/\(int offset = \)\((yy_c_buf_p) - (yytext_ptr)\);/\1(int)(\2);/; +s/\(void yy\(restart\|_\(delete\|flush\)_buffer\)\)/static \1/; +s/\(void yy_switch_to_buffer\)/__attribute__((unused)) static \1/; +s/\(void yy\(push\|pop\)_buffer_state\)/__attribute__((unused)) static \1/; +s/\(YY_BUFFER_STATE yy_create_buffer\)/static \1/; +s/\(\(int\|void\) yy[gs]et_\)/__attribute__((unused)) static \1/; +s/\(void \*\?yy\(\(re\)\?alloc\|free\)\)/static \1/; +s/\(extern \)\?\(int yy\(leng\|lineno\|_flex_debug\)\)/static \2/; +s/\(int yylex_destroy\)/__attribute__((unused)) static \1/; +s/\(extern \)\?\(int yylex \)/UNIV_INTERN \2/; +s/^\(\(FILE\|char\) *\* *yyget\)/__attribute__((unused)) static \1/; +s/^\(extern \)\?\(\(FILE\|char\) *\* *yy\)/static \2/; +' < $TMPFILE >> $OUTFILE + +rm $TMPFILE diff --git a/storage/xtradb/pars/pars0grm.cc b/storage/xtradb/pars/pars0grm.cc new file mode 100644 index 00000000000..b360f36e597 --- /dev/null +++ b/storage/xtradb/pars/pars0grm.cc @@ -0,0 +1,3034 @@ +/* A Bison parser, made by GNU Bison 2.3. */ + +/* Skeleton implementation for Bison's Yacc-like parsers in C + + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. 
Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output. */ +#define YYBISON 1 + +/* Bison version. */ +#define YYBISON_VERSION "2.3" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Using locations. */ +#define YYLSP_NEEDED 0 + + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + PARS_INT_LIT = 258, + PARS_FLOAT_LIT = 259, + PARS_STR_LIT = 260, + PARS_FIXBINARY_LIT = 261, + PARS_BLOB_LIT = 262, + PARS_NULL_LIT = 263, + PARS_ID_TOKEN = 264, + PARS_AND_TOKEN = 265, + PARS_OR_TOKEN = 266, + PARS_NOT_TOKEN = 267, + PARS_GE_TOKEN = 268, + PARS_LE_TOKEN = 269, + PARS_NE_TOKEN = 270, + PARS_PROCEDURE_TOKEN = 271, + PARS_IN_TOKEN = 272, + PARS_OUT_TOKEN = 273, + PARS_BINARY_TOKEN = 274, + PARS_BLOB_TOKEN = 275, + PARS_INT_TOKEN = 276, + PARS_INTEGER_TOKEN = 277, + PARS_FLOAT_TOKEN = 278, + PARS_CHAR_TOKEN = 279, + PARS_IS_TOKEN = 280, + PARS_BEGIN_TOKEN = 281, + PARS_END_TOKEN = 282, + PARS_IF_TOKEN = 283, + PARS_THEN_TOKEN = 284, + PARS_ELSE_TOKEN = 285, + PARS_ELSIF_TOKEN = 286, + PARS_LOOP_TOKEN = 287, + PARS_WHILE_TOKEN = 288, + PARS_RETURN_TOKEN = 289, + PARS_SELECT_TOKEN = 290, + PARS_SUM_TOKEN = 291, + PARS_COUNT_TOKEN = 292, + PARS_DISTINCT_TOKEN = 293, + PARS_FROM_TOKEN = 294, + PARS_WHERE_TOKEN = 295, + PARS_FOR_TOKEN = 296, + PARS_DDOT_TOKEN = 297, + PARS_READ_TOKEN = 298, + PARS_ORDER_TOKEN = 299, + PARS_BY_TOKEN = 300, + PARS_ASC_TOKEN = 301, + PARS_DESC_TOKEN = 302, + PARS_INSERT_TOKEN = 303, + PARS_INTO_TOKEN = 304, + PARS_VALUES_TOKEN = 305, + PARS_UPDATE_TOKEN = 306, + PARS_SET_TOKEN = 307, + PARS_DELETE_TOKEN = 308, + PARS_CURRENT_TOKEN = 309, + PARS_OF_TOKEN = 310, + PARS_CREATE_TOKEN = 311, + PARS_TABLE_TOKEN = 312, + PARS_INDEX_TOKEN = 313, + PARS_UNIQUE_TOKEN = 314, + PARS_CLUSTERED_TOKEN = 315, + PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316, + PARS_ON_TOKEN = 317, + PARS_ASSIGN_TOKEN = 318, + PARS_DECLARE_TOKEN = 319, + PARS_CURSOR_TOKEN = 320, + PARS_SQL_TOKEN = 321, + PARS_OPEN_TOKEN = 322, + PARS_FETCH_TOKEN = 323, + PARS_CLOSE_TOKEN = 324, + PARS_NOTFOUND_TOKEN = 325, + PARS_TO_CHAR_TOKEN = 326, + PARS_TO_NUMBER_TOKEN = 327, + PARS_TO_BINARY_TOKEN = 328, + PARS_BINARY_TO_NUMBER_TOKEN = 329, + PARS_SUBSTR_TOKEN = 330, + PARS_REPLSTR_TOKEN = 331, + PARS_CONCAT_TOKEN = 332, + PARS_INSTR_TOKEN = 333, + PARS_LENGTH_TOKEN = 334, + PARS_SYSDATE_TOKEN = 335, + PARS_PRINTF_TOKEN = 336, + PARS_ASSERT_TOKEN = 337, + PARS_RND_TOKEN = 338, + PARS_RND_STR_TOKEN = 339, + PARS_ROW_PRINTF_TOKEN = 340, + PARS_COMMIT_TOKEN = 341, + PARS_ROLLBACK_TOKEN = 342, + PARS_WORK_TOKEN = 343, + 
PARS_UNSIGNED_TOKEN = 344, + PARS_EXIT_TOKEN = 345, + PARS_FUNCTION_TOKEN = 346, + PARS_LOCK_TOKEN = 347, + PARS_SHARE_TOKEN = 348, + PARS_MODE_TOKEN = 349, + PARS_LIKE_TOKEN = 350, + PARS_LIKE_TOKEN_EXACT = 351, + PARS_LIKE_TOKEN_PREFIX = 352, + PARS_LIKE_TOKEN_SUFFIX = 353, + PARS_LIKE_TOKEN_SUBSTR = 354, + PARS_TABLE_NAME_TOKEN = 355, + PARS_COMPACT_TOKEN = 356, + PARS_BLOCK_SIZE_TOKEN = 357, + PARS_BIGINT_TOKEN = 358, + NEG = 359 + }; +#endif +/* Tokens. */ +#define PARS_INT_LIT 258 +#define PARS_FLOAT_LIT 259 +#define PARS_STR_LIT 260 +#define PARS_FIXBINARY_LIT 261 +#define PARS_BLOB_LIT 262 +#define PARS_NULL_LIT 263 +#define PARS_ID_TOKEN 264 +#define PARS_AND_TOKEN 265 +#define PARS_OR_TOKEN 266 +#define PARS_NOT_TOKEN 267 +#define PARS_GE_TOKEN 268 +#define PARS_LE_TOKEN 269 +#define PARS_NE_TOKEN 270 +#define PARS_PROCEDURE_TOKEN 271 +#define PARS_IN_TOKEN 272 +#define PARS_OUT_TOKEN 273 +#define PARS_BINARY_TOKEN 274 +#define PARS_BLOB_TOKEN 275 +#define PARS_INT_TOKEN 276 +#define PARS_INTEGER_TOKEN 277 +#define PARS_FLOAT_TOKEN 278 +#define PARS_CHAR_TOKEN 279 +#define PARS_IS_TOKEN 280 +#define PARS_BEGIN_TOKEN 281 +#define PARS_END_TOKEN 282 +#define PARS_IF_TOKEN 283 +#define PARS_THEN_TOKEN 284 +#define PARS_ELSE_TOKEN 285 +#define PARS_ELSIF_TOKEN 286 +#define PARS_LOOP_TOKEN 287 +#define PARS_WHILE_TOKEN 288 +#define PARS_RETURN_TOKEN 289 +#define PARS_SELECT_TOKEN 290 +#define PARS_SUM_TOKEN 291 +#define PARS_COUNT_TOKEN 292 +#define PARS_DISTINCT_TOKEN 293 +#define PARS_FROM_TOKEN 294 +#define PARS_WHERE_TOKEN 295 +#define PARS_FOR_TOKEN 296 +#define PARS_DDOT_TOKEN 297 +#define PARS_READ_TOKEN 298 +#define PARS_ORDER_TOKEN 299 +#define PARS_BY_TOKEN 300 +#define PARS_ASC_TOKEN 301 +#define PARS_DESC_TOKEN 302 +#define PARS_INSERT_TOKEN 303 +#define PARS_INTO_TOKEN 304 +#define PARS_VALUES_TOKEN 305 +#define PARS_UPDATE_TOKEN 306 +#define PARS_SET_TOKEN 307 +#define PARS_DELETE_TOKEN 308 +#define PARS_CURRENT_TOKEN 309 +#define PARS_OF_TOKEN 310 +#define PARS_CREATE_TOKEN 311 +#define PARS_TABLE_TOKEN 312 +#define PARS_INDEX_TOKEN 313 +#define PARS_UNIQUE_TOKEN 314 +#define PARS_CLUSTERED_TOKEN 315 +#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316 +#define PARS_ON_TOKEN 317 +#define PARS_ASSIGN_TOKEN 318 +#define PARS_DECLARE_TOKEN 319 +#define PARS_CURSOR_TOKEN 320 +#define PARS_SQL_TOKEN 321 +#define PARS_OPEN_TOKEN 322 +#define PARS_FETCH_TOKEN 323 +#define PARS_CLOSE_TOKEN 324 +#define PARS_NOTFOUND_TOKEN 325 +#define PARS_TO_CHAR_TOKEN 326 +#define PARS_TO_NUMBER_TOKEN 327 +#define PARS_TO_BINARY_TOKEN 328 +#define PARS_BINARY_TO_NUMBER_TOKEN 329 +#define PARS_SUBSTR_TOKEN 330 +#define PARS_REPLSTR_TOKEN 331 +#define PARS_CONCAT_TOKEN 332 +#define PARS_INSTR_TOKEN 333 +#define PARS_LENGTH_TOKEN 334 +#define PARS_SYSDATE_TOKEN 335 +#define PARS_PRINTF_TOKEN 336 +#define PARS_ASSERT_TOKEN 337 +#define PARS_RND_TOKEN 338 +#define PARS_RND_STR_TOKEN 339 +#define PARS_ROW_PRINTF_TOKEN 340 +#define PARS_COMMIT_TOKEN 341 +#define PARS_ROLLBACK_TOKEN 342 +#define PARS_WORK_TOKEN 343 +#define PARS_UNSIGNED_TOKEN 344 +#define PARS_EXIT_TOKEN 345 +#define PARS_FUNCTION_TOKEN 346 +#define PARS_LOCK_TOKEN 347 +#define PARS_SHARE_TOKEN 348 +#define PARS_MODE_TOKEN 349 +#define PARS_LIKE_TOKEN 350 +#define PARS_LIKE_TOKEN_EXACT 351 +#define PARS_LIKE_TOKEN_PREFIX 352 +#define PARS_LIKE_TOKEN_SUFFIX 353 +#define PARS_LIKE_TOKEN_SUBSTR 354 +#define PARS_TABLE_NAME_TOKEN 355 +#define PARS_COMPACT_TOKEN 356 +#define PARS_BLOCK_SIZE_TOKEN 357 +#define PARS_BIGINT_TOKEN 358 +#define NEG 
359 + + + + +/* Copy the first part of user declarations. */ +#line 28 "pars0grm.y" + +/* The value of the semantic attribute is a pointer to a query tree node +que_node_t */ + +#include "univ.i" +#include <math.h> /* Can't be before univ.i */ +#include "pars0pars.h" +#include "mem0mem.h" +#include "que0types.h" +#include "que0que.h" +#include "row0sel.h" + +#define YYSTYPE que_node_t* + +/* #define __STDC__ */ + +int +yylex(void); + + +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif + +/* Enabling verbose error messages. */ +#ifdef YYERROR_VERBOSE +# undef YYERROR_VERBOSE +# define YYERROR_VERBOSE 1 +#else +# define YYERROR_VERBOSE 0 +#endif + +/* Enabling the token table. */ +#ifndef YYTOKEN_TABLE +# define YYTOKEN_TABLE 0 +#endif + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef int YYSTYPE; +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +# define YYSTYPE_IS_TRIVIAL 1 +#endif + + + +/* Copy the second part of user declarations. */ + + +/* Line 216 of yacc.c. */ +#line 334 "pars0grm.cc" + +#ifdef short +# undef short +#endif + +#ifdef YYTYPE_UINT8 +typedef YYTYPE_UINT8 yytype_uint8; +#else +typedef unsigned char yytype_uint8; +#endif + +#ifdef YYTYPE_INT8 +typedef YYTYPE_INT8 yytype_int8; +#elif (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +typedef signed char yytype_int8; +#else +typedef short int yytype_int8; +#endif + +#ifdef YYTYPE_UINT16 +typedef YYTYPE_UINT16 yytype_uint16; +#else +typedef unsigned short int yytype_uint16; +#endif + +#ifdef YYTYPE_INT16 +typedef YYTYPE_INT16 yytype_int16; +#else +typedef short int yytype_int16; +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include <libintl.h> /* INFRINGES ON USER NAME SPACE */ +# define YY_(msgid) dgettext ("bison-runtime", msgid) +# endif +# endif +# ifndef YY_ +# define YY_(msgid) msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(e) ((void) (e)) +#else +# define YYUSE(e) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(n) (n) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int i) +#else +static int +YYID (i) + int i; +#endif +{ + return i; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include <alloca.h> /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include <malloc.h> /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! 
defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef _STDLIB_H +# define _STDLIB_H 1 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined _STDLIB_H \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef _STDLIB_H +# define _STDLIB_H 1 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void*); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yytype_int16 yyss; + YYSTYPE yyvs; + }; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +/* Copy COUNT objects from FROM to TO. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(To, From, Count) \ + __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# else +# define YYCOPY(To, From, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (To)[yyi] = (From)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack, Stack, yysize); \ + Stack = &yyptr->Stack; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 5 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 816 + +/* YYNTOKENS -- Number of terminals. 
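+   Here that is $end, error, $undefined, the named tokens (internal
+   symbols 3..103), and symbols 104..119 -- the single-character
+   operators such as '=' and ';' plus NEG -- assigned by the
+   yytranslate table below.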
*/ +#define YYNTOKENS 120 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 73 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 183 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 350 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 359 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 112, 2, 2, + 114, 115, 109, 108, 117, 107, 2, 110, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 113, + 105, 104, 106, 116, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 118, 2, 119, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, 103, 111 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint16 yyprhs[] = +{ + 0, 0, 3, 6, 8, 11, 14, 17, 20, 23, + 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, + 56, 59, 62, 65, 68, 71, 73, 76, 78, 83, + 85, 87, 89, 91, 93, 95, 97, 101, 105, 109, + 113, 116, 120, 124, 128, 132, 136, 140, 144, 148, + 152, 156, 159, 163, 167, 169, 171, 173, 175, 177, + 179, 181, 183, 185, 187, 189, 190, 192, 196, 203, + 208, 210, 212, 214, 218, 220, 224, 225, 227, 231, + 232, 234, 238, 240, 245, 251, 256, 257, 259, 263, + 265, 269, 271, 272, 275, 276, 279, 280, 285, 286, + 288, 290, 291, 296, 305, 309, 315, 318, 322, 324, + 328, 333, 338, 341, 344, 348, 351, 354, 357, 361, + 366, 368, 371, 372, 375, 377, 385, 392, 403, 405, + 407, 410, 413, 418, 423, 429, 431, 435, 436, 440, + 441, 443, 444, 447, 448, 450, 451, 453, 454, 458, + 468, 470, 474, 475, 477, 478, 480, 491, 493, 495, + 498, 501, 503, 505, 507, 509, 511, 513, 517, 521, + 522, 524, 528, 532, 533, 535, 538, 545, 550, 552, + 554, 555, 557, 560 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. 
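+   Each rule's symbols begin at yyrhs[yyprhs[rule]]; for example
+   yyprhs[2] = 3 and yyrhs[3..4] = {192, 113}, i.e. rule 2 is
+   top_statement: procedure_definition ';'.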
*/ +static const yytype_int16 yyrhs[] = +{ + 121, 0, -1, 192, 113, -1, 127, -1, 128, 113, + -1, 160, 113, -1, 161, 113, -1, 162, 113, -1, + 159, 113, -1, 163, 113, -1, 155, 113, -1, 142, + 113, -1, 144, 113, -1, 154, 113, -1, 152, 113, + -1, 153, 113, -1, 149, 113, -1, 150, 113, -1, + 164, 113, -1, 166, 113, -1, 165, 113, -1, 181, + 113, -1, 182, 113, -1, 175, 113, -1, 179, 113, + -1, 122, -1, 123, 122, -1, 9, -1, 125, 114, + 133, 115, -1, 3, -1, 4, -1, 5, -1, 6, + -1, 7, -1, 8, -1, 66, -1, 124, 108, 124, + -1, 124, 107, 124, -1, 124, 109, 124, -1, 124, + 110, 124, -1, 107, 124, -1, 114, 124, 115, -1, + 124, 104, 124, -1, 124, 95, 5, -1, 124, 105, + 124, -1, 124, 106, 124, -1, 124, 13, 124, -1, + 124, 14, 124, -1, 124, 15, 124, -1, 124, 10, + 124, -1, 124, 11, 124, -1, 12, 124, -1, 9, + 112, 70, -1, 66, 112, 70, -1, 71, -1, 72, + -1, 73, -1, 74, -1, 75, -1, 77, -1, 78, + -1, 79, -1, 80, -1, 83, -1, 84, -1, -1, + 116, -1, 126, 117, 116, -1, 118, 9, 114, 126, + 115, 119, -1, 129, 114, 133, 115, -1, 76, -1, + 81, -1, 82, -1, 9, 114, 115, -1, 180, -1, + 131, 117, 180, -1, -1, 9, -1, 132, 117, 9, + -1, -1, 124, -1, 133, 117, 124, -1, 124, -1, + 37, 114, 109, 115, -1, 37, 114, 38, 9, 115, + -1, 36, 114, 124, 115, -1, -1, 134, -1, 135, + 117, 134, -1, 109, -1, 135, 49, 132, -1, 135, + -1, -1, 40, 124, -1, -1, 41, 51, -1, -1, + 92, 17, 93, 94, -1, -1, 46, -1, 47, -1, + -1, 44, 45, 9, 140, -1, 35, 136, 39, 131, + 137, 138, 139, 141, -1, 48, 49, 180, -1, 143, + 50, 114, 133, 115, -1, 143, 142, -1, 9, 104, + 124, -1, 145, -1, 146, 117, 145, -1, 40, 54, + 55, 9, -1, 51, 180, 52, 146, -1, 148, 137, + -1, 148, 147, -1, 53, 39, 180, -1, 151, 137, + -1, 151, 147, -1, 85, 142, -1, 9, 63, 124, + -1, 31, 124, 29, 123, -1, 156, -1, 157, 156, + -1, -1, 30, 123, -1, 157, -1, 28, 124, 29, + 123, 158, 27, 28, -1, 33, 124, 32, 123, 27, + 32, -1, 41, 9, 17, 124, 42, 124, 32, 123, + 27, 32, -1, 90, -1, 34, -1, 67, 9, -1, + 69, 9, -1, 68, 9, 49, 132, -1, 68, 9, + 49, 130, -1, 9, 183, 169, 170, 171, -1, 167, + -1, 168, 117, 167, -1, -1, 114, 3, 115, -1, + -1, 89, -1, -1, 12, 8, -1, -1, 61, -1, + -1, 101, -1, -1, 102, 104, 3, -1, 56, 57, + 180, 114, 168, 115, 172, 173, 174, -1, 9, -1, + 176, 117, 9, -1, -1, 59, -1, -1, 60, -1, + 56, 177, 178, 58, 9, 62, 180, 114, 176, 115, + -1, 9, -1, 100, -1, 86, 88, -1, 87, 88, + -1, 21, -1, 22, -1, 103, -1, 24, -1, 19, + -1, 20, -1, 9, 17, 183, -1, 9, 18, 183, + -1, -1, 184, -1, 185, 117, 184, -1, 9, 183, + 113, -1, -1, 186, -1, 187, 186, -1, 64, 65, + 9, 25, 142, 113, -1, 64, 91, 9, 113, -1, + 188, -1, 189, -1, -1, 190, -1, 191, 190, -1, + 16, 9, 114, 185, 115, 25, 187, 191, 26, 123, + 27, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. 
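+   The line numbers refer to pars0grm.y; e.g. yyrline[25] = 190
+   matches the '#line 190' directive emitted with the action for
+   rule 25 (statement_list: statement) below.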
*/ +static const yytype_uint16 yyrline[] = +{ + 0, 162, 162, 165, 166, 167, 168, 169, 170, 171, + 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 190, 191, 196, 197, 199, + 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, + 221, 222, 223, 225, 230, 231, 232, 233, 235, 236, + 237, 238, 239, 240, 241, 244, 246, 247, 251, 257, + 262, 263, 264, 268, 272, 273, 278, 279, 280, 285, + 286, 287, 291, 292, 297, 303, 310, 311, 312, 317, + 319, 322, 326, 327, 331, 332, 337, 338, 343, 344, + 345, 349, 350, 357, 372, 377, 380, 388, 394, 395, + 400, 406, 415, 423, 431, 438, 446, 454, 460, 467, + 473, 474, 479, 480, 482, 486, 493, 499, 509, 513, + 517, 524, 531, 535, 543, 552, 553, 558, 559, 564, + 565, 571, 572, 578, 579, 585, 586, 591, 592, 597, + 608, 609, 614, 615, 619, 620, 624, 638, 639, 643, + 648, 653, 654, 655, 656, 657, 658, 662, 667, 675, + 676, 677, 682, 688, 690, 691, 695, 703, 709, 710, + 713, 715, 716, 720 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT", + "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT", + "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN", + "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN", + "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN", + "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN", + "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN", + "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN", + "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN", + "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN", + "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN", + "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN", + "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN", + "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN", + "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN", + "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN", + "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN", + "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN", + "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN", + "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN", + "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN", + "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN", + "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN", + "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN", + "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN", + "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN", + "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN", + "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN", + "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN", + "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN", + "PARS_LIKE_TOKEN", "PARS_LIKE_TOKEN_EXACT", "PARS_LIKE_TOKEN_PREFIX", + "PARS_LIKE_TOKEN_SUFFIX", "PARS_LIKE_TOKEN_SUBSTR", + "PARS_TABLE_NAME_TOKEN", "PARS_COMPACT_TOKEN", "PARS_BLOCK_SIZE_TOKEN", + "PARS_BIGINT_TOKEN", "'='", "'<'", "'>'", "'-'", "'+'", "'*'", "'/'", + "NEG", "'%'", "';'", "'('", "')'", "'?'", "','", "'{'", "'}'", "$accept", + "top_statement", 
"statement", "statement_list", "exp", "function_name", + "question_mark_list", "stored_procedure_call", + "predefined_procedure_call", "predefined_procedure_name", + "user_function_call", "table_list", "variable_list", "exp_list", + "select_item", "select_item_list", "select_list", "search_condition", + "for_update_clause", "lock_shared_clause", "order_direction", + "order_by_clause", "select_statement", "insert_statement_start", + "insert_statement", "column_assignment", "column_assignment_list", + "cursor_positioned", "update_statement_start", + "update_statement_searched", "update_statement_positioned", + "delete_statement_start", "delete_statement_searched", + "delete_statement_positioned", "row_printf_statement", + "assignment_statement", "elsif_element", "elsif_list", "else_part", + "if_statement", "while_statement", "for_statement", "exit_statement", + "return_statement", "open_cursor_statement", "close_cursor_statement", + "fetch_statement", "column_def", "column_def_list", "opt_column_len", + "opt_unsigned", "opt_not_null", "not_fit_in_memory", "compact", + "block_size", "create_table", "column_list", "unique_def", + "clustered_def", "create_index", "table_name", "commit_statement", + "rollback_statement", "type_name", "parameter_declaration", + "parameter_declaration_list", "variable_declaration", + "variable_declaration_list", "cursor_declaration", + "function_declaration", "declaration", "declaration_list", + "procedure_definition", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, + 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, + 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, + 355, 356, 357, 358, 61, 60, 62, 45, 43, 42, + 47, 359, 37, 59, 40, 41, 63, 44, 123, 125 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 120, 121, 122, 122, 122, 122, 122, 122, 122, + 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, + 122, 122, 122, 122, 122, 123, 123, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 125, 125, 125, 125, 125, 125, + 125, 125, 125, 125, 125, 126, 126, 126, 127, 128, + 129, 129, 129, 130, 131, 131, 132, 132, 132, 133, + 133, 133, 134, 134, 134, 134, 135, 135, 135, 136, + 136, 136, 137, 137, 138, 138, 139, 139, 140, 140, + 140, 141, 141, 142, 143, 144, 144, 145, 146, 146, + 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, + 157, 157, 158, 158, 158, 159, 160, 161, 162, 163, + 164, 165, 166, 166, 167, 168, 168, 169, 169, 170, + 170, 171, 171, 172, 172, 173, 173, 174, 174, 175, + 176, 176, 177, 177, 178, 178, 179, 180, 180, 181, + 182, 183, 183, 183, 183, 183, 183, 184, 184, 185, + 185, 185, 186, 187, 187, 187, 188, 189, 190, 190, + 191, 191, 191, 192 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. 
*/ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 1, 2, 1, 4, 1, + 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, + 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 2, 3, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 3, 6, 4, + 1, 1, 1, 3, 1, 3, 0, 1, 3, 0, + 1, 3, 1, 4, 5, 4, 0, 1, 3, 1, + 3, 1, 0, 2, 0, 2, 0, 4, 0, 1, + 1, 0, 4, 8, 3, 5, 2, 3, 1, 3, + 4, 4, 2, 2, 3, 2, 2, 2, 3, 4, + 1, 2, 0, 2, 1, 7, 6, 10, 1, 1, + 2, 2, 4, 4, 5, 1, 3, 0, 3, 0, + 1, 0, 2, 0, 1, 0, 1, 0, 3, 9, + 1, 3, 0, 1, 0, 1, 10, 1, 1, 2, + 2, 1, 1, 1, 1, 1, 1, 3, 3, 0, + 1, 3, 3, 0, 1, 2, 6, 4, 1, 1, + 0, 1, 2, 11 +}; + +/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state + STATE-NUM when YYTABLE doesn't specify something else to do. Zero + means the default is an error. */ +static const yytype_uint8 yydefact[] = +{ + 0, 0, 0, 0, 0, 1, 2, 169, 0, 170, + 0, 0, 0, 0, 0, 165, 166, 161, 162, 164, + 163, 167, 168, 173, 171, 0, 174, 180, 0, 0, + 175, 178, 179, 181, 0, 172, 0, 0, 0, 182, + 0, 0, 0, 0, 0, 129, 86, 0, 0, 0, + 0, 152, 0, 0, 0, 70, 71, 72, 0, 0, + 0, 128, 0, 25, 0, 3, 0, 0, 0, 0, + 0, 92, 0, 0, 92, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 177, 0, 29, 30, 31, 32, 33, 34, + 27, 0, 35, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 64, 0, 0, 0, 0, 0, 0, + 0, 89, 82, 87, 91, 0, 0, 0, 157, 158, + 0, 0, 0, 153, 154, 130, 0, 131, 117, 159, + 160, 0, 183, 26, 4, 79, 11, 0, 106, 12, + 0, 112, 113, 16, 17, 115, 116, 14, 15, 13, + 10, 8, 5, 6, 7, 9, 18, 20, 19, 23, + 24, 21, 22, 0, 118, 0, 51, 0, 40, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 79, 0, 0, 0, 76, 0, + 0, 0, 104, 0, 114, 0, 155, 0, 76, 65, + 80, 0, 79, 0, 93, 176, 52, 53, 41, 49, + 50, 46, 47, 48, 122, 43, 42, 44, 45, 37, + 36, 38, 39, 0, 0, 0, 0, 0, 77, 90, + 88, 92, 74, 0, 0, 108, 111, 0, 0, 77, + 133, 132, 66, 0, 69, 0, 0, 0, 0, 0, + 120, 124, 0, 28, 0, 85, 0, 83, 0, 0, + 0, 94, 0, 0, 0, 0, 135, 0, 0, 0, + 0, 0, 81, 105, 110, 123, 0, 121, 0, 126, + 84, 78, 75, 0, 96, 0, 107, 109, 137, 143, + 0, 0, 73, 68, 67, 0, 125, 95, 0, 101, + 0, 0, 139, 144, 145, 136, 0, 119, 0, 0, + 103, 0, 0, 140, 141, 146, 147, 0, 0, 0, + 0, 138, 0, 134, 0, 149, 150, 0, 97, 98, + 127, 142, 0, 156, 0, 99, 100, 102, 148, 151 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int16 yydefgoto[] = +{ + -1, 2, 63, 64, 210, 117, 253, 65, 66, 67, + 250, 241, 239, 211, 123, 124, 125, 151, 294, 309, + 347, 320, 68, 69, 70, 245, 246, 152, 71, 72, + 73, 74, 75, 76, 77, 78, 260, 261, 262, 79, + 80, 81, 82, 83, 84, 85, 86, 276, 277, 312, + 324, 333, 314, 326, 335, 87, 337, 134, 207, 88, + 130, 89, 90, 21, 9, 10, 26, 27, 31, 32, + 33, 34, 3 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. 
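+   The look-ahead's symbol number is added to this base to index
+   yytable; yycheck guards against collisions between overlapping
+   rows, and YYPACT_NINF marks states whose only action is the
+   default reduction given by yydefact.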
*/ +#define YYPACT_NINF -179 +static const yytype_int16 yypact[] = +{ + 24, 36, 58, -48, -25, -179, -179, 57, 31, -179, + -74, 14, 14, 50, 57, -179, -179, -179, -179, -179, + -179, -179, -179, 72, -179, 14, -179, 3, -26, -28, + -179, -179, -179, -179, 4, -179, 91, 95, 589, -179, + 80, -6, 43, 285, 285, -179, 19, 99, 69, -5, + 81, -13, 110, 112, 114, -179, -179, -179, 89, 37, + 41, -179, 122, -179, 406, -179, 25, 40, 44, -3, + 46, 116, 49, 51, 116, 52, 53, 54, 55, 56, + 59, 61, 62, 70, 73, 74, 75, 76, 77, 78, + 79, 89, -179, 285, -179, -179, -179, -179, -179, -179, + 82, 285, 83, -179, -179, -179, -179, -179, -179, -179, + -179, -179, -179, -179, 285, 285, 577, 92, 618, 94, + 97, -179, 706, -179, -33, 124, 153, -5, -179, -179, + 141, -5, -5, -179, 136, -179, 148, -179, -179, -179, + -179, 98, -179, -179, -179, 285, -179, 101, -179, -179, + 195, -179, -179, -179, -179, -179, -179, -179, -179, -179, + -179, -179, -179, -179, -179, -179, -179, -179, -179, -179, + -179, -179, -179, 100, 706, 135, 6, 154, -7, 206, + 285, 285, 285, 285, 285, 589, 218, 285, 285, 285, + 285, 285, 285, 285, 285, 589, 285, -27, 216, 173, + -5, 285, -179, 217, -179, 113, -179, 171, 221, 119, + 706, -56, 285, 185, 706, -179, -179, -179, -179, 6, + 6, 27, 27, 706, 345, -179, 27, 27, 27, 35, + 35, -7, -7, -53, 467, 223, 232, 127, -179, 126, + -179, -31, -179, 638, 151, -179, 142, 251, 253, 150, + -179, 126, -179, -46, -179, 285, -45, 256, 589, 285, + -179, 240, 249, -179, 245, -179, 166, -179, 273, 285, + -5, 242, 285, 285, 217, 14, -179, -39, 222, 170, + 167, 179, 706, -179, -179, 589, 679, -179, 268, -179, + -179, -179, -179, 247, 207, 686, 706, -179, 186, 243, + 251, -5, -179, -179, -179, 589, -179, -179, 286, 261, + 589, 303, 219, -179, 224, -179, 193, 589, 226, 272, + -179, 528, 205, -179, 310, -179, 233, 314, 230, 317, + 302, -179, 328, -179, 235, -179, -179, -38, -179, 7, + -179, -179, 334, -179, 331, -179, -179, -179, -179, -179 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int16 yypgoto[] = +{ + -179, -179, -63, -178, -41, -179, -179, -179, -179, -179, + -179, -179, 133, -155, 143, -179, -179, -68, -179, -179, + -179, -179, -40, -179, -179, 71, -179, 269, -179, -179, + -179, -179, -179, -179, -179, -179, 85, -179, -179, -179, + -179, -179, -179, -179, -179, -179, -179, 47, -179, -179, + -179, -179, -179, -179, -179, -179, -179, -179, -179, -179, + -117, -179, -179, -12, 330, -179, 321, -179, -179, -179, + 315, -179, -179 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If zero, do what YYDEFACT says. + If YYTABLE_NINF, syntax error. 
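+   Worked example from these tables: in state 0 with look-ahead
+   PARS_PROCEDURE_TOKEN (symbol 16), yypact[0] = 24, 24 + 16 = 40,
+   yycheck[40] == 16 and yytable[40] = 1, so the parser shifts to
+   state 1 (whose accessing symbol, yystos[1], is 16).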
*/ +#define YYTABLE_NINF -1 +static const yytype_uint16 yytable[] = +{ + 22, 143, 116, 118, 128, 122, 155, 224, 184, 269, + 202, 236, 25, 28, 204, 205, 198, 234, 138, 182, + 183, 184, 94, 95, 96, 97, 98, 99, 100, 148, + 38, 101, 46, 15, 16, 17, 18, 36, 19, 233, + 1, 13, 184, 14, 132, 4, 133, 147, 11, 12, + 184, 173, 174, 345, 346, 119, 120, 256, 5, 254, + 176, 255, 263, 37, 255, 6, 8, 29, 29, 280, + 283, 281, 255, 178, 179, 23, 299, 343, 300, 344, + 285, 25, 237, 242, 199, 102, 270, 35, 186, 7, + 103, 104, 105, 106, 107, 129, 108, 109, 110, 111, + 40, 186, 112, 113, 41, 91, 93, 92, 126, 214, + 187, 188, 189, 190, 191, 192, 193, 20, 127, 135, + 131, 136, 186, 137, 46, 139, 114, 317, 121, 140, + 186, 141, 321, 115, 190, 191, 192, 193, 144, 219, + 220, 221, 222, 223, 192, 193, 226, 227, 228, 229, + 230, 231, 232, 292, 145, 235, 150, 146, 122, 149, + 243, 143, 153, 200, 154, 157, 158, 159, 160, 161, + 201, 143, 162, 271, 163, 164, 94, 95, 96, 97, + 98, 99, 100, 165, 316, 101, 166, 167, 168, 169, + 170, 171, 172, 203, 175, 177, 206, 208, 94, 95, + 96, 97, 98, 99, 100, 216, 194, 101, 196, 119, + 120, 197, 209, 215, 282, 212, 180, 181, 286, 182, + 183, 184, 143, 225, 217, 238, 244, 247, 214, 248, + 249, 295, 296, 180, 181, 252, 182, 183, 184, 102, + 257, 266, 267, 268, 103, 104, 105, 106, 107, 213, + 108, 109, 110, 111, 143, 273, 112, 113, 143, 274, + 275, 102, 278, 298, 279, 284, 103, 104, 105, 106, + 107, 259, 108, 109, 110, 111, 288, 289, 112, 113, + 114, 290, 291, 293, 301, 302, 303, 115, 94, 95, + 96, 97, 98, 99, 100, 304, 306, 101, 307, 308, + 311, 186, 114, 318, 313, 319, 322, 327, 323, 115, + 187, 188, 189, 190, 191, 192, 193, 329, 186, 328, + 331, 218, 332, 336, 338, 325, 339, 187, 188, 189, + 190, 191, 192, 193, 340, 334, 341, 348, 265, 342, + 349, 251, 240, 156, 24, 297, 287, 315, 30, 39, + 0, 102, 0, 0, 42, 0, 103, 104, 105, 106, + 107, 0, 108, 109, 110, 111, 0, 0, 112, 113, + 0, 0, 0, 43, 0, 258, 259, 0, 44, 45, + 46, 0, 0, 0, 0, 0, 47, 0, 0, 0, + 0, 0, 114, 48, 0, 0, 49, 0, 50, 115, + 0, 51, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 52, 53, 54, 42, 0, 0, 0, 0, + 0, 55, 0, 0, 0, 0, 56, 57, 0, 0, + 58, 59, 60, 142, 43, 61, 0, 0, 0, 44, + 45, 46, 0, 0, 0, 0, 0, 47, 0, 0, + 0, 0, 0, 0, 48, 0, 0, 49, 0, 50, + 0, 0, 51, 62, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 52, 53, 54, 42, 0, 0, 0, + 0, 0, 55, 0, 0, 0, 0, 56, 57, 0, + 0, 58, 59, 60, 264, 43, 61, 0, 0, 0, + 44, 45, 46, 0, 0, 0, 0, 0, 47, 0, + 0, 0, 0, 0, 0, 48, 0, 0, 49, 0, + 50, 0, 0, 51, 62, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 52, 53, 54, 42, 0, 0, + 0, 0, 0, 55, 0, 0, 0, 0, 56, 57, + 0, 0, 58, 59, 60, 330, 43, 61, 0, 0, + 0, 44, 45, 46, 0, 0, 0, 0, 0, 47, + 0, 0, 0, 0, 0, 0, 48, 0, 0, 49, + 0, 50, 0, 0, 51, 62, 0, 180, 181, 0, + 182, 183, 184, 0, 0, 52, 53, 54, 42, 0, + 0, 0, 0, 0, 55, 0, 185, 0, 0, 56, + 57, 0, 0, 58, 59, 60, 0, 43, 61, 0, + 0, 0, 44, 45, 46, 0, 0, 0, 180, 181, + 47, 182, 183, 184, 0, 0, 0, 48, 0, 0, + 49, 0, 50, 0, 0, 51, 62, 0, 180, 181, + 195, 182, 183, 184, 0, 0, 52, 53, 54, 0, + 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, + 56, 57, 186, 0, 58, 59, 60, 0, 0, 61, + 272, 187, 188, 189, 190, 191, 192, 193, 0, 180, + 181, 0, 182, 183, 184, 0, 180, 181, 0, 182, + 183, 184, 0, 0, 0, 0, 0, 62, 305, 0, + 0, 0, 0, 186, 0, 0, 180, 181, 310, 182, + 183, 184, 187, 188, 189, 190, 191, 192, 193, 0, + 0, 0, 0, 186, 0, 0, 0, 0, 0, 0, + 0, 0, 187, 188, 189, 190, 191, 192, 193, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 186, 0, 0, 0, 0, 0, + 0, 186, 0, 187, 188, 189, 190, 191, 192, 193, + 187, 188, 189, 190, 191, 192, 193, 0, 0, 
0, + 0, 186, 0, 0, 0, 0, 0, 0, 0, 0, + 187, 188, 189, 190, 191, 192, 193 +}; + +static const yytype_int16 yycheck[] = +{ + 12, 64, 43, 44, 9, 46, 74, 185, 15, 40, + 127, 38, 9, 25, 131, 132, 49, 195, 58, 13, + 14, 15, 3, 4, 5, 6, 7, 8, 9, 69, + 26, 12, 35, 19, 20, 21, 22, 65, 24, 194, + 16, 115, 15, 117, 57, 9, 59, 50, 17, 18, + 15, 91, 93, 46, 47, 36, 37, 212, 0, 115, + 101, 117, 115, 91, 117, 113, 9, 64, 64, 115, + 115, 117, 117, 114, 115, 25, 115, 115, 117, 117, + 258, 9, 109, 200, 117, 66, 117, 113, 95, 114, + 71, 72, 73, 74, 75, 100, 77, 78, 79, 80, + 9, 95, 83, 84, 9, 25, 63, 113, 9, 150, + 104, 105, 106, 107, 108, 109, 110, 103, 49, 9, + 39, 9, 95, 9, 35, 88, 107, 305, 109, 88, + 95, 9, 310, 114, 107, 108, 109, 110, 113, 180, + 181, 182, 183, 184, 109, 110, 187, 188, 189, 190, + 191, 192, 193, 270, 114, 196, 40, 113, 199, 113, + 201, 224, 113, 39, 113, 113, 113, 113, 113, 113, + 17, 234, 113, 241, 113, 113, 3, 4, 5, 6, + 7, 8, 9, 113, 301, 12, 113, 113, 113, 113, + 113, 113, 113, 52, 112, 112, 60, 49, 3, 4, + 5, 6, 7, 8, 9, 70, 114, 12, 114, 36, + 37, 114, 114, 113, 255, 114, 10, 11, 259, 13, + 14, 15, 285, 5, 70, 9, 9, 114, 269, 58, + 9, 272, 273, 10, 11, 116, 13, 14, 15, 66, + 55, 9, 115, 117, 71, 72, 73, 74, 75, 54, + 77, 78, 79, 80, 317, 104, 83, 84, 321, 117, + 9, 66, 9, 275, 114, 9, 71, 72, 73, 74, + 75, 31, 77, 78, 79, 80, 27, 32, 83, 84, + 107, 115, 9, 41, 62, 115, 119, 114, 3, 4, + 5, 6, 7, 8, 9, 116, 28, 12, 51, 92, + 114, 95, 107, 17, 61, 44, 3, 114, 89, 114, + 104, 105, 106, 107, 108, 109, 110, 45, 95, 93, + 115, 115, 12, 9, 94, 101, 9, 104, 105, 106, + 107, 108, 109, 110, 32, 102, 8, 3, 115, 104, + 9, 208, 199, 74, 14, 274, 261, 300, 27, 34, + -1, 66, -1, -1, 9, -1, 71, 72, 73, 74, + 75, -1, 77, 78, 79, 80, -1, -1, 83, 84, + -1, -1, -1, 28, -1, 30, 31, -1, 33, 34, + 35, -1, -1, -1, -1, -1, 41, -1, -1, -1, + -1, -1, 107, 48, -1, -1, 51, -1, 53, 114, + -1, 56, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 67, 68, 69, 9, -1, -1, -1, -1, + -1, 76, -1, -1, -1, -1, 81, 82, -1, -1, + 85, 86, 87, 27, 28, 90, -1, -1, -1, 33, + 34, 35, -1, -1, -1, -1, -1, 41, -1, -1, + -1, -1, -1, -1, 48, -1, -1, 51, -1, 53, + -1, -1, 56, 118, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 67, 68, 69, 9, -1, -1, -1, + -1, -1, 76, -1, -1, -1, -1, 81, 82, -1, + -1, 85, 86, 87, 27, 28, 90, -1, -1, -1, + 33, 34, 35, -1, -1, -1, -1, -1, 41, -1, + -1, -1, -1, -1, -1, 48, -1, -1, 51, -1, + 53, -1, -1, 56, 118, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 67, 68, 69, 9, -1, -1, + -1, -1, -1, 76, -1, -1, -1, -1, 81, 82, + -1, -1, 85, 86, 87, 27, 28, 90, -1, -1, + -1, 33, 34, 35, -1, -1, -1, -1, -1, 41, + -1, -1, -1, -1, -1, -1, 48, -1, -1, 51, + -1, 53, -1, -1, 56, 118, -1, 10, 11, -1, + 13, 14, 15, -1, -1, 67, 68, 69, 9, -1, + -1, -1, -1, -1, 76, -1, 29, -1, -1, 81, + 82, -1, -1, 85, 86, 87, -1, 28, 90, -1, + -1, -1, 33, 34, 35, -1, -1, -1, 10, 11, + 41, 13, 14, 15, -1, -1, -1, 48, -1, -1, + 51, -1, 53, -1, -1, 56, 118, -1, 10, 11, + 32, 13, 14, 15, -1, -1, 67, 68, 69, -1, + -1, -1, -1, -1, -1, 76, -1, -1, -1, -1, + 81, 82, 95, -1, 85, 86, 87, -1, -1, 90, + 42, 104, 105, 106, 107, 108, 109, 110, -1, 10, + 11, -1, 13, 14, 15, -1, 10, 11, -1, 13, + 14, 15, -1, -1, -1, -1, -1, 118, 29, -1, + -1, -1, -1, 95, -1, -1, 10, 11, 32, 13, + 14, 15, 104, 105, 106, 107, 108, 109, 110, -1, + -1, -1, -1, 95, -1, -1, -1, -1, -1, -1, + -1, -1, 104, 105, 106, 107, 108, 109, 110, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 95, -1, -1, -1, -1, -1, + -1, 95, -1, 104, 105, 106, 107, 
108, 109, 110, + 104, 105, 106, 107, 108, 109, 110, -1, -1, -1, + -1, 95, -1, -1, -1, -1, -1, -1, -1, -1, + 104, 105, 106, 107, 108, 109, 110 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. */ +static const yytype_uint8 yystos[] = +{ + 0, 16, 121, 192, 9, 0, 113, 114, 9, 184, + 185, 17, 18, 115, 117, 19, 20, 21, 22, 24, + 103, 183, 183, 25, 184, 9, 186, 187, 183, 64, + 186, 188, 189, 190, 191, 113, 65, 91, 26, 190, + 9, 9, 9, 28, 33, 34, 35, 41, 48, 51, + 53, 56, 67, 68, 69, 76, 81, 82, 85, 86, + 87, 90, 118, 122, 123, 127, 128, 129, 142, 143, + 144, 148, 149, 150, 151, 152, 153, 154, 155, 159, + 160, 161, 162, 163, 164, 165, 166, 175, 179, 181, + 182, 25, 113, 63, 3, 4, 5, 6, 7, 8, + 9, 12, 66, 71, 72, 73, 74, 75, 77, 78, + 79, 80, 83, 84, 107, 114, 124, 125, 124, 36, + 37, 109, 124, 134, 135, 136, 9, 49, 9, 100, + 180, 39, 57, 59, 177, 9, 9, 9, 142, 88, + 88, 9, 27, 122, 113, 114, 113, 50, 142, 113, + 40, 137, 147, 113, 113, 137, 147, 113, 113, 113, + 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, + 113, 113, 113, 142, 124, 112, 124, 112, 124, 124, + 10, 11, 13, 14, 15, 29, 95, 104, 105, 106, + 107, 108, 109, 110, 114, 32, 114, 114, 49, 117, + 39, 17, 180, 52, 180, 180, 60, 178, 49, 114, + 124, 133, 114, 54, 124, 113, 70, 70, 115, 124, + 124, 124, 124, 124, 123, 5, 124, 124, 124, 124, + 124, 124, 124, 133, 123, 124, 38, 109, 9, 132, + 134, 131, 180, 124, 9, 145, 146, 114, 58, 9, + 130, 132, 116, 126, 115, 117, 133, 55, 30, 31, + 156, 157, 158, 115, 27, 115, 9, 115, 117, 40, + 117, 137, 42, 104, 117, 9, 167, 168, 9, 114, + 115, 117, 124, 115, 9, 123, 124, 156, 27, 32, + 115, 9, 180, 41, 138, 124, 124, 145, 183, 115, + 117, 62, 115, 119, 116, 29, 28, 51, 92, 139, + 32, 114, 169, 61, 172, 167, 180, 123, 17, 44, + 141, 123, 3, 89, 170, 101, 173, 114, 93, 45, + 27, 115, 12, 171, 102, 174, 9, 176, 94, 9, + 32, 8, 104, 115, 117, 46, 47, 140, 3, 9 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. */ + +#define YYFAIL goto yyerrlab + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + yytoken = YYTRANSLATE (yychar); \ + YYPOPSTACK (1); \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + + +#define YYTERROR 1 +#define YYERRCODE 256 + + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined). 
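+   (This grammar declares no locations, so YYLTYPE_IS_TRIVIAL is
+   presumably never defined and YY_LOCATION_PRINT below compiles to
+   a no-op.)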
*/ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (YYID (N)) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (YYID (0)) +#endif + + +/* YY_LOCATION_PRINT -- Print the location on the stream. + This macro was not mandated originally: define only if we know + we won't break user code: when these are the locations we know. */ + +#ifndef YY_LOCATION_PRINT +# if defined YYLTYPE_IS_TRIVIAL && YYLTYPE_IS_TRIVIAL +# define YY_LOCATION_PRINT(File, Loc) \ + fprintf (File, "%d.%d-%d.%d", \ + (Loc).first_line, (Loc).first_column, \ + (Loc).last_line, (Loc).last_column) +# else +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +# endif +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (YYLEX_PARAM) +#else +# define YYLEX yylex () +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include <stdio.h> /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + yy_symbol_value_print (yyoutput, yytype, yyvaluep); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). 
| +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *bottom, yytype_int16 *top) +#else +static void +yy_stack_print (bottom, top) + yytype_int16 *bottom; + yytype_int16 *top; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; bottom <= top; ++bottom) + YYFPRINTF (stderr, " %d", *bottom); + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, int yyrule) +#else +static void +yy_reduce_print (yyvsp, yyrule) + YYSTYPE *yyvsp; + int yyrule; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + fprintf (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + ); + fprintf (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, Rule); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. 
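+   This is a fallback for C libraries lacking stpcpy; glibc builds
+   with _GNU_SOURCE alias yystpcpy to stpcpy above instead.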
*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into YYRESULT an error message about the unexpected token + YYCHAR while in state YYSTATE. Return the number of bytes copied, + including the terminating null byte. If YYRESULT is null, do not + copy anything; just return the number of bytes that would be + copied. As a special case, return 0 if an ordinary "syntax error" + message will do. Return YYSIZE_MAXIMUM if overflow occurs during + size calculation. */ +static YYSIZE_T +yysyntax_error (char *yyresult, int yystate, int yychar) +{ + int yyn = yypact[yystate]; + + if (! (YYPACT_NINF < yyn && yyn <= YYLAST)) + return 0; + else + { + int yytype = YYTRANSLATE (yychar); + YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]); + YYSIZE_T yysize = yysize0; + YYSIZE_T yysize1; + int yysize_overflow = 0; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + int yyx; + +# if 0 + /* This is so xgettext sees the translatable formats that are + constructed on the fly. */ + YY_("syntax error, unexpected %s"); + YY_("syntax error, unexpected %s, expecting %s"); + YY_("syntax error, unexpected %s, expecting %s or %s"); + YY_("syntax error, unexpected %s, expecting %s or %s or %s"); + YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"); +# endif + char *yyfmt; + char const *yyf; + static char const yyunexpected[] = "syntax error, unexpected %s"; + static char const yyexpecting[] = ", expecting %s"; + static char const yyor[] = " or %s"; + char yyformat[sizeof yyunexpected + + sizeof yyexpecting - 1 + + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2) + * (sizeof yyor - 1))]; + char const *yyprefix = yyexpecting; + + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? 
yychecklim : YYNTOKENS; + int yycount = 1; + + yyarg[0] = yytname[yytype]; + yyfmt = yystpcpy (yyformat, yyunexpected); + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + yyformat[sizeof yyunexpected - 1] = '\0'; + break; + } + yyarg[yycount++] = yytname[yyx]; + yysize1 = yysize + yytnamerr (0, yytname[yyx]); + yysize_overflow |= (yysize1 < yysize); + yysize = yysize1; + yyfmt = yystpcpy (yyfmt, yyprefix); + yyprefix = yyor; + } + + yyf = YY_(yyformat); + yysize1 = yysize + yystrlen (yyf); + yysize_overflow |= (yysize1 < yysize); + yysize = yysize1; + + if (yysize_overflow) + return YYSIZE_MAXIMUM; + + if (yyresult) + { + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. */ + char *yyp = yyresult; + int yyi = 0; + while ((*yyp = *yyf) != '\0') + { + if (*yyp == '%' && yyf[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyf += 2; + } + else + { + yyp++; + yyf++; + } + } + } + return yysize; + } +} +#endif /* YYERROR_VERBOSE */ + + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yymsg, yytype, yyvaluep) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + YYUSE (yyvaluep); + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); +} + + +/* Prevent warnings from -Wmissing-prototypes. */ + +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int yyparse (void *YYPARSE_PARAM); +#else +int yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int yyparse (void); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + + +/* The look-ahead symbol. */ +int yychar; + +/* The semantic value of the look-ahead symbol. */ +YYSTYPE yylval; + +/* Number of syntax errors so far. */ +int yynerrs; + + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ + + int yystate; + int yyn; + int yyresult; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + /* Look-ahead token as an internal (translated) token number. */ + int yytoken = 0; +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + + /* Three stacks and their tools: + `yyss': related to states, + `yyvs': related to semantic values, + `yyls': related to locations. + + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. 
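+   Both stacks start as YYINITDEPTH-element automatic arrays and
+   are grown in tandem by the yyoverflow/YYSTACK_RELOCATE code
+   below, the latter capped at YYMAXDEPTH elements.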
*/ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss = yyssa; + yytype_int16 *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs = yyvsa; + YYSTYPE *yyvsp; + + + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + YYSIZE_T yystacksize = YYINITDEPTH; + + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. */ + + yyssp = yyss; + yyvsp = yyvs; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc*) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss); + YYSTACK_RELOCATE (yyvs); + +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + look-ahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to look-ahead token. */ + yyn = yypact[yystate]; + if (yyn == YYPACT_NINF) + goto yydefault; + + /* Not known => get a look-ahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol. 
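+   When empty it is refilled via YYLEX, i.e. the yylex() declared
+   near the top of this file, and then mapped to an internal
+   symbol number with YYTRANSLATE.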
*/ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yyn == 0 || yyn == YYTABLE_NINF) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + if (yyn == YYFINAL) + YYACCEPT; + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the look-ahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token unless it is eof. */ + if (yychar != YYEOF) + yychar = YYEMPTY; + + yystate = yyn; + *++yyvsp = yylval; + + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. 
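+   In this parser YYSTYPE is que_node_t* (see the #define in the
+   user declarations above), so the actions below assemble InnoDB
+   query-graph nodes, mostly through pars_op(), pars_func() and
+   que_node_list_add_last().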
*/ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 25: +#line 190 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 26: +#line 192 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;} + break; + + case 27: +#line 196 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 28: +#line 198 "pars0grm.y" + { (yyval) = pars_func((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;} + break; + + case 29: +#line 199 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 30: +#line 200 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 31: +#line 201 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 32: +#line 202 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 33: +#line 203 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 34: +#line 204 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 35: +#line 205 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]);;} + break; + + case 36: +#line 206 "pars0grm.y" + { (yyval) = pars_op('+', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 37: +#line 207 "pars0grm.y" + { (yyval) = pars_op('-', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 38: +#line 208 "pars0grm.y" + { (yyval) = pars_op('*', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 39: +#line 209 "pars0grm.y" + { (yyval) = pars_op('/', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 40: +#line 210 "pars0grm.y" + { (yyval) = pars_op('-', (yyvsp[(2) - (2)]), NULL); ;} + break; + + case 41: +#line 211 "pars0grm.y" + { (yyval) = (yyvsp[(2) - (3)]); ;} + break; + + case 42: +#line 212 "pars0grm.y" + { (yyval) = pars_op('=', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 43: +#line 214 "pars0grm.y" + { (yyval) = pars_op(PARS_LIKE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 44: +#line 215 "pars0grm.y" + { (yyval) = pars_op('<', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 45: +#line 216 "pars0grm.y" + { (yyval) = pars_op('>', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 46: +#line 217 "pars0grm.y" + { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 47: +#line 218 "pars0grm.y" + { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 48: +#line 219 "pars0grm.y" + { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 49: +#line 220 "pars0grm.y" + { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 50: +#line 221 "pars0grm.y" + { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 51: +#line 222 "pars0grm.y" + { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[(2) - (2)]), NULL); ;} + break; + + case 52: +#line 224 "pars0grm.y" + { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;} + break; + + case 53: +#line 226 "pars0grm.y" + { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;} + break; + + case 54: +#line 230 "pars0grm.y" + { (yyval) = &pars_to_char_token; ;} + break; + + case 55: +#line 231 "pars0grm.y" + { (yyval) = &pars_to_number_token; ;} + break; + + case 56: +#line 232 "pars0grm.y" + { (yyval) = &pars_to_binary_token; ;} + break; + + case 57: +#line 234 "pars0grm.y" + { (yyval) = &pars_binary_to_number_token; ;} + 
break; + + case 58: +#line 235 "pars0grm.y" + { (yyval) = &pars_substr_token; ;} + break; + + case 59: +#line 236 "pars0grm.y" + { (yyval) = &pars_concat_token; ;} + break; + + case 60: +#line 237 "pars0grm.y" + { (yyval) = &pars_instr_token; ;} + break; + + case 61: +#line 238 "pars0grm.y" + { (yyval) = &pars_length_token; ;} + break; + + case 62: +#line 239 "pars0grm.y" + { (yyval) = &pars_sysdate_token; ;} + break; + + case 63: +#line 240 "pars0grm.y" + { (yyval) = &pars_rnd_token; ;} + break; + + case 64: +#line 241 "pars0grm.y" + { (yyval) = &pars_rnd_str_token; ;} + break; + + case 68: +#line 252 "pars0grm.y" + { (yyval) = pars_stored_procedure_call( + static_cast<sym_node_t*>((yyvsp[(2) - (6)]))); ;} + break; + + case 69: +#line 258 "pars0grm.y" + { (yyval) = pars_procedure_call((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;} + break; + + case 70: +#line 262 "pars0grm.y" + { (yyval) = &pars_replstr_token; ;} + break; + + case 71: +#line 263 "pars0grm.y" + { (yyval) = &pars_printf_token; ;} + break; + + case 72: +#line 264 "pars0grm.y" + { (yyval) = &pars_assert_token; ;} + break; + + case 73: +#line 268 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (3)]); ;} + break; + + case 74: +#line 272 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 75: +#line 274 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 76: +#line 278 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 77: +#line 279 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 78: +#line 281 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 79: +#line 285 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 80: +#line 286 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)]));;} + break; + + case 81: +#line 287 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 82: +#line 291 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]); ;} + break; + + case 83: +#line 293 "pars0grm.y" + { (yyval) = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + sym_tab_add_int_lit( + pars_sym_tab_global, 1))); ;} + break; + + case 84: +#line 298 "pars0grm.y" + { (yyval) = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + pars_func(&pars_distinct_token, + que_node_list_add_last( + NULL, (yyvsp[(4) - (5)]))))); ;} + break; + + case 85: +#line 304 "pars0grm.y" + { (yyval) = pars_func(&pars_sum_token, + que_node_list_add_last(NULL, + (yyvsp[(3) - (4)]))); ;} + break; + + case 86: +#line 310 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 87: +#line 311 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 88: +#line 313 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 89: +#line 317 "pars0grm.y" + { (yyval) = pars_select_list(&pars_star_denoter, + NULL); ;} + break; + + case 90: +#line 320 "pars0grm.y" + { (yyval) = pars_select_list( + (yyvsp[(1) - (3)]), static_cast<sym_node_t*>((yyvsp[(3) - (3)]))); ;} + break; + + case 91: +#line 322 "pars0grm.y" + { (yyval) = pars_select_list((yyvsp[(1) - (1)]), NULL); ;} + break; + + case 92: +#line 326 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 93: +#line 327 "pars0grm.y" + { (yyval) = (yyvsp[(2) - (2)]); ;} + break; + + case 94: +#line 331 "pars0grm.y" + { (yyval) = NULL; ;} + break; 
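+
+  /* A recurring pattern in the optional-clause rules: an absent
+  clause (empty production, e.g. case 94 above for for_update_clause)
+  yields NULL, while a present one (case 95 below, FOR UPDATE)
+  yields the address of a reserved-word token such as
+  pars_update_token, so later passes need only test the pointer. */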
+ + case 95: +#line 333 "pars0grm.y" + { (yyval) = &pars_update_token; ;} + break; + + case 96: +#line 337 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 97: +#line 339 "pars0grm.y" + { (yyval) = &pars_share_token; ;} + break; + + case 98: +#line 343 "pars0grm.y" + { (yyval) = &pars_asc_token; ;} + break; + + case 99: +#line 344 "pars0grm.y" + { (yyval) = &pars_asc_token; ;} + break; + + case 100: +#line 345 "pars0grm.y" + { (yyval) = &pars_desc_token; ;} + break; + + case 101: +#line 349 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 102: +#line 351 "pars0grm.y" + { (yyval) = pars_order_by( + static_cast<sym_node_t*>((yyvsp[(3) - (4)])), + static_cast<pars_res_word_t*>((yyvsp[(4) - (4)]))); ;} + break; + + case 103: +#line 362 "pars0grm.y" + { (yyval) = pars_select_statement( + static_cast<sel_node_t*>((yyvsp[(2) - (8)])), + static_cast<sym_node_t*>((yyvsp[(4) - (8)])), + static_cast<que_node_t*>((yyvsp[(5) - (8)])), + static_cast<pars_res_word_t*>((yyvsp[(6) - (8)])), + static_cast<pars_res_word_t*>((yyvsp[(7) - (8)])), + static_cast<order_node_t*>((yyvsp[(8) - (8)]))); ;} + break; + + case 104: +#line 373 "pars0grm.y" + { (yyval) = (yyvsp[(3) - (3)]); ;} + break; + + case 105: +#line 378 "pars0grm.y" + { (yyval) = pars_insert_statement( + static_cast<sym_node_t*>((yyvsp[(1) - (5)])), (yyvsp[(4) - (5)]), NULL); ;} + break; + + case 106: +#line 381 "pars0grm.y" + { (yyval) = pars_insert_statement( + static_cast<sym_node_t*>((yyvsp[(1) - (2)])), + NULL, + static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;} + break; + + case 107: +#line 388 "pars0grm.y" + { (yyval) = pars_column_assignment( + static_cast<sym_node_t*>((yyvsp[(1) - (3)])), + static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;} + break; + + case 108: +#line 394 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 109: +#line 396 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 110: +#line 402 "pars0grm.y" + { (yyval) = (yyvsp[(4) - (4)]); ;} + break; + + case 111: +#line 408 "pars0grm.y" + { (yyval) = pars_update_statement_start( + FALSE, + static_cast<sym_node_t*>((yyvsp[(2) - (4)])), + static_cast<col_assign_node_t*>((yyvsp[(4) - (4)]))); ;} + break; + + case 112: +#line 416 "pars0grm.y" + { (yyval) = pars_update_statement( + static_cast<upd_node_t*>((yyvsp[(1) - (2)])), + NULL, + static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;} + break; + + case 113: +#line 424 "pars0grm.y" + { (yyval) = pars_update_statement( + static_cast<upd_node_t*>((yyvsp[(1) - (2)])), + static_cast<sym_node_t*>((yyvsp[(2) - (2)])), + NULL); ;} + break; + + case 114: +#line 432 "pars0grm.y" + { (yyval) = pars_update_statement_start( + TRUE, + static_cast<sym_node_t*>((yyvsp[(3) - (3)])), NULL); ;} + break; + + case 115: +#line 439 "pars0grm.y" + { (yyval) = pars_update_statement( + static_cast<upd_node_t*>((yyvsp[(1) - (2)])), + NULL, + static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;} + break; + + case 116: +#line 447 "pars0grm.y" + { (yyval) = pars_update_statement( + static_cast<upd_node_t*>((yyvsp[(1) - (2)])), + static_cast<sym_node_t*>((yyvsp[(2) - (2)])), + NULL); ;} + break; + + case 117: +#line 455 "pars0grm.y" + { (yyval) = pars_row_printf_statement( + static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;} + break; + + case 118: +#line 461 "pars0grm.y" + { (yyval) = pars_assignment_statement( + static_cast<sym_node_t*>((yyvsp[(1) - (3)])), + static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;} + break; + + case 119: +#line 469 
"pars0grm.y" + { (yyval) = pars_elsif_element((yyvsp[(2) - (4)]), (yyvsp[(4) - (4)])); ;} + break; + + case 120: +#line 473 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 121: +#line 475 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;} + break; + + case 122: +#line 479 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 123: +#line 481 "pars0grm.y" + { (yyval) = (yyvsp[(2) - (2)]); ;} + break; + + case 124: +#line 482 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]); ;} + break; + + case 125: +#line 489 "pars0grm.y" + { (yyval) = pars_if_statement((yyvsp[(2) - (7)]), (yyvsp[(4) - (7)]), (yyvsp[(5) - (7)])); ;} + break; + + case 126: +#line 495 "pars0grm.y" + { (yyval) = pars_while_statement((yyvsp[(2) - (6)]), (yyvsp[(4) - (6)])); ;} + break; + + case 127: +#line 503 "pars0grm.y" + { (yyval) = pars_for_statement( + static_cast<sym_node_t*>((yyvsp[(2) - (10)])), + (yyvsp[(4) - (10)]), (yyvsp[(6) - (10)]), (yyvsp[(8) - (10)])); ;} + break; + + case 128: +#line 509 "pars0grm.y" + { (yyval) = pars_exit_statement(); ;} + break; + + case 129: +#line 513 "pars0grm.y" + { (yyval) = pars_return_statement(); ;} + break; + + case 130: +#line 518 "pars0grm.y" + { (yyval) = pars_open_statement( + ROW_SEL_OPEN_CURSOR, + static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;} + break; + + case 131: +#line 525 "pars0grm.y" + { (yyval) = pars_open_statement( + ROW_SEL_CLOSE_CURSOR, + static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;} + break; + + case 132: +#line 532 "pars0grm.y" + { (yyval) = pars_fetch_statement( + static_cast<sym_node_t*>((yyvsp[(2) - (4)])), + static_cast<sym_node_t*>((yyvsp[(4) - (4)])), NULL); ;} + break; + + case 133: +#line 536 "pars0grm.y" + { (yyval) = pars_fetch_statement( + static_cast<sym_node_t*>((yyvsp[(2) - (4)])), + NULL, + static_cast<sym_node_t*>((yyvsp[(4) - (4)]))); ;} + break; + + case 134: +#line 544 "pars0grm.y" + { (yyval) = pars_column_def( + static_cast<sym_node_t*>((yyvsp[(1) - (5)])), + static_cast<pars_res_word_t*>((yyvsp[(2) - (5)])), + static_cast<sym_node_t*>((yyvsp[(3) - (5)])), + (yyvsp[(4) - (5)]), (yyvsp[(5) - (5)])); ;} + break; + + case 135: +#line 552 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 136: +#line 554 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 137: +#line 558 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 138: +#line 560 "pars0grm.y" + { (yyval) = (yyvsp[(2) - (3)]); ;} + break; + + case 139: +#line 564 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 140: +#line 566 "pars0grm.y" + { (yyval) = &pars_int_token; + /* pass any non-NULL pointer */ ;} + break; + + case 141: +#line 571 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 142: +#line 573 "pars0grm.y" + { (yyval) = &pars_int_token; + /* pass any non-NULL pointer */ ;} + break; + + case 143: +#line 578 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 144: +#line 580 "pars0grm.y" + { (yyval) = &pars_int_token; + /* pass any non-NULL pointer */ ;} + break; + + case 145: +#line 585 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 146: +#line 586 "pars0grm.y" + { (yyval) = &pars_int_token; + /* pass any non-NULL pointer */ ;} + break; + + case 147: +#line 591 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 148: +#line 593 "pars0grm.y" + { (yyval) = (yyvsp[(3) - (3)]); ;} + break; + + case 149: +#line 600 "pars0grm.y" + { (yyval) = pars_create_table( + 
static_cast<sym_node_t*>((yyvsp[(3) - (9)])), + static_cast<sym_node_t*>((yyvsp[(5) - (9)])), + static_cast<sym_node_t*>((yyvsp[(8) - (9)])), + static_cast<sym_node_t*>((yyvsp[(9) - (9)])), (yyvsp[(7) - (9)])); ;} + break; + + case 150: +#line 608 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 151: +#line 610 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 152: +#line 614 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 153: +#line 615 "pars0grm.y" + { (yyval) = &pars_unique_token; ;} + break; + + case 154: +#line 619 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 155: +#line 620 "pars0grm.y" + { (yyval) = &pars_clustered_token; ;} + break; + + case 156: +#line 629 "pars0grm.y" + { (yyval) = pars_create_index( + static_cast<pars_res_word_t*>((yyvsp[(2) - (10)])), + static_cast<pars_res_word_t*>((yyvsp[(3) - (10)])), + static_cast<sym_node_t*>((yyvsp[(5) - (10)])), + static_cast<sym_node_t*>((yyvsp[(7) - (10)])), + static_cast<sym_node_t*>((yyvsp[(9) - (10)]))); ;} + break; + + case 157: +#line 638 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]); ;} + break; + + case 158: +#line 639 "pars0grm.y" + { (yyval) = (yyvsp[(1) - (1)]); ;} + break; + + case 159: +#line 644 "pars0grm.y" + { (yyval) = pars_commit_statement(); ;} + break; + + case 160: +#line 649 "pars0grm.y" + { (yyval) = pars_rollback_statement(); ;} + break; + + case 161: +#line 653 "pars0grm.y" + { (yyval) = &pars_int_token; ;} + break; + + case 162: +#line 654 "pars0grm.y" + { (yyval) = &pars_int_token; ;} + break; + + case 163: +#line 655 "pars0grm.y" + { (yyval) = &pars_bigint_token; ;} + break; + + case 164: +#line 656 "pars0grm.y" + { (yyval) = &pars_char_token; ;} + break; + + case 165: +#line 657 "pars0grm.y" + { (yyval) = &pars_binary_token; ;} + break; + + case 166: +#line 658 "pars0grm.y" + { (yyval) = &pars_blob_token; ;} + break; + + case 167: +#line 663 "pars0grm.y" + { (yyval) = pars_parameter_declaration( + static_cast<sym_node_t*>((yyvsp[(1) - (3)])), + PARS_INPUT, + static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;} + break; + + case 168: +#line 668 "pars0grm.y" + { (yyval) = pars_parameter_declaration( + static_cast<sym_node_t*>((yyvsp[(1) - (3)])), + PARS_OUTPUT, + static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;} + break; + + case 169: +#line 675 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 170: +#line 676 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;} + break; + + case 171: +#line 678 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;} + break; + + case 172: +#line 683 "pars0grm.y" + { (yyval) = pars_variable_declaration( + static_cast<sym_node_t*>((yyvsp[(1) - (3)])), + static_cast<pars_res_word_t*>((yyvsp[(2) - (3)]))); ;} + break; + + case 176: +#line 697 "pars0grm.y" + { (yyval) = pars_cursor_declaration( + static_cast<sym_node_t*>((yyvsp[(3) - (6)])), + static_cast<sel_node_t*>((yyvsp[(5) - (6)]))); ;} + break; + + case 177: +#line 704 "pars0grm.y" + { (yyval) = pars_function_declaration( + static_cast<sym_node_t*>((yyvsp[(3) - (4)]))); ;} + break; + + case 183: +#line 726 "pars0grm.y" + { (yyval) = pars_procedure_definition( + static_cast<sym_node_t*>((yyvsp[(2) - (11)])), + static_cast<sym_node_t*>((yyvsp[(4) - (11)])), + (yyvsp[(10) - (11)])); ;} + break; + + +/* Line 1267 of yacc.c. 
*/ +#line 2826 "pars0grm.cc" + default: break; + } + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. +| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! YYERROR_VERBOSE + yyerror (YY_("syntax error")); +#else + { + YYSIZE_T yysize = yysyntax_error (0, yystate, yychar); + if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM) + { + YYSIZE_T yyalloc = 2 * yysize; + if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM)) + yyalloc = YYSTACK_ALLOC_MAXIMUM; + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char*) YYSTACK_ALLOC (yyalloc); + if (yymsg) + yymsg_alloc = yyalloc; + else + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + } + } + + if (0 < yysize && yysize <= yymsg_alloc) + { + (void) yysyntax_error (yymsg, yystate, yychar); + yyerror (yymsg); + } + else + { + yyerror (YY_("syntax error")); + if (yysize != 0) + goto yyexhaustedlab; + } + } +#endif + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse look-ahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse look-ahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + /* Do not reclaim the symbols of the rule which action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (yyn != YYPACT_NINF) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", + yystos[yystate], yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + if (yyn == YYFINAL) + YYACCEPT; + + *++yyvsp = yylval; + + + /* Shift the error token. 
*/ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + +#ifndef yyoverflow +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. | +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (YY_("memory exhausted")); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: + if (yychar != YYEOF && yychar != YYEMPTY) + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval); + /* Do not reclaim the symbols of the rule which action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + yystos[*yyssp], yyvsp); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif +#if YYERROR_VERBOSE + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); +#endif + /* Make sure YYID is used. */ + return YYID (yyresult); +} + + +#line 732 "pars0grm.y" + + diff --git a/storage/xtradb/pars/pars0grm.y b/storage/xtradb/pars/pars0grm.y new file mode 100644 index 00000000000..60913287cc4 --- /dev/null +++ b/storage/xtradb/pars/pars0grm.y @@ -0,0 +1,732 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser: input file for the GNU Bison parser generator + +Look from pars0lex.l for instructions how to generate the C files for +the InnoDB parser. 
+ +Created 12/14/1997 Heikki Tuuri +*******************************************************/ + +%{ +/* The value of the semantic attribute is a pointer to a query tree node +que_node_t */ + +#include "univ.i" +#include <math.h> /* Can't be before univ.i */ +#include "pars0pars.h" +#include "mem0mem.h" +#include "que0types.h" +#include "que0que.h" +#include "row0sel.h" + +#define YYSTYPE que_node_t* + +/* #define __STDC__ */ + +int +yylex(void); +%} + +%token PARS_INT_LIT +%token PARS_FLOAT_LIT +%token PARS_STR_LIT +%token PARS_FIXBINARY_LIT +%token PARS_BLOB_LIT +%token PARS_NULL_LIT +%token PARS_ID_TOKEN +%token PARS_AND_TOKEN +%token PARS_OR_TOKEN +%token PARS_NOT_TOKEN +%token PARS_GE_TOKEN +%token PARS_LE_TOKEN +%token PARS_NE_TOKEN +%token PARS_PROCEDURE_TOKEN +%token PARS_IN_TOKEN +%token PARS_OUT_TOKEN +%token PARS_BINARY_TOKEN +%token PARS_BLOB_TOKEN +%token PARS_INT_TOKEN +%token PARS_INTEGER_TOKEN +%token PARS_FLOAT_TOKEN +%token PARS_CHAR_TOKEN +%token PARS_IS_TOKEN +%token PARS_BEGIN_TOKEN +%token PARS_END_TOKEN +%token PARS_IF_TOKEN +%token PARS_THEN_TOKEN +%token PARS_ELSE_TOKEN +%token PARS_ELSIF_TOKEN +%token PARS_LOOP_TOKEN +%token PARS_WHILE_TOKEN +%token PARS_RETURN_TOKEN +%token PARS_SELECT_TOKEN +%token PARS_SUM_TOKEN +%token PARS_COUNT_TOKEN +%token PARS_DISTINCT_TOKEN +%token PARS_FROM_TOKEN +%token PARS_WHERE_TOKEN +%token PARS_FOR_TOKEN +%token PARS_DDOT_TOKEN +%token PARS_READ_TOKEN +%token PARS_ORDER_TOKEN +%token PARS_BY_TOKEN +%token PARS_ASC_TOKEN +%token PARS_DESC_TOKEN +%token PARS_INSERT_TOKEN +%token PARS_INTO_TOKEN +%token PARS_VALUES_TOKEN +%token PARS_UPDATE_TOKEN +%token PARS_SET_TOKEN +%token PARS_DELETE_TOKEN +%token PARS_CURRENT_TOKEN +%token PARS_OF_TOKEN +%token PARS_CREATE_TOKEN +%token PARS_TABLE_TOKEN +%token PARS_INDEX_TOKEN +%token PARS_UNIQUE_TOKEN +%token PARS_CLUSTERED_TOKEN +%token PARS_DOES_NOT_FIT_IN_MEM_TOKEN +%token PARS_ON_TOKEN +%token PARS_ASSIGN_TOKEN +%token PARS_DECLARE_TOKEN +%token PARS_CURSOR_TOKEN +%token PARS_SQL_TOKEN +%token PARS_OPEN_TOKEN +%token PARS_FETCH_TOKEN +%token PARS_CLOSE_TOKEN +%token PARS_NOTFOUND_TOKEN +%token PARS_TO_CHAR_TOKEN +%token PARS_TO_NUMBER_TOKEN +%token PARS_TO_BINARY_TOKEN +%token PARS_BINARY_TO_NUMBER_TOKEN +%token PARS_SUBSTR_TOKEN +%token PARS_REPLSTR_TOKEN +%token PARS_CONCAT_TOKEN +%token PARS_INSTR_TOKEN +%token PARS_LENGTH_TOKEN +%token PARS_SYSDATE_TOKEN +%token PARS_PRINTF_TOKEN +%token PARS_ASSERT_TOKEN +%token PARS_RND_TOKEN +%token PARS_RND_STR_TOKEN +%token PARS_ROW_PRINTF_TOKEN +%token PARS_COMMIT_TOKEN +%token PARS_ROLLBACK_TOKEN +%token PARS_WORK_TOKEN +%token PARS_UNSIGNED_TOKEN +%token PARS_EXIT_TOKEN +%token PARS_FUNCTION_TOKEN +%token PARS_LOCK_TOKEN +%token PARS_SHARE_TOKEN +%token PARS_MODE_TOKEN +%token PARS_LIKE_TOKEN +%token PARS_LIKE_TOKEN_EXACT +%token PARS_LIKE_TOKEN_PREFIX +%token PARS_LIKE_TOKEN_SUFFIX +%token PARS_LIKE_TOKEN_SUBSTR +%token PARS_TABLE_NAME_TOKEN +%token PARS_COMPACT_TOKEN +%token PARS_BLOCK_SIZE_TOKEN +%token PARS_BIGINT_TOKEN + +%left PARS_AND_TOKEN PARS_OR_TOKEN +%left PARS_NOT_TOKEN +%left '=' '<' '>' PARS_GE_TOKEN PARS_LE_TOKEN +%left '-' '+' +%left '*' '/' +%left NEG /* negation--unary minus */ +%left '%' + +/* Grammar follows */ +%% + +top_statement: + procedure_definition ';' + +statement: + stored_procedure_call + | predefined_procedure_call ';' + | while_statement ';' + | for_statement ';' + | exit_statement ';' + | if_statement ';' + | return_statement ';' + | assignment_statement ';' + | select_statement ';' + | insert_statement ';' + | 
row_printf_statement ';' + | delete_statement_searched ';' + | delete_statement_positioned ';' + | update_statement_searched ';' + | update_statement_positioned ';' + | open_cursor_statement ';' + | fetch_statement ';' + | close_cursor_statement ';' + | commit_statement ';' + | rollback_statement ';' + | create_table ';' + | create_index ';' +; + +statement_list: + statement { $$ = que_node_list_add_last(NULL, $1); } + | statement_list statement + { $$ = que_node_list_add_last($1, $2); } +; + +exp: + PARS_ID_TOKEN { $$ = $1;} + | function_name '(' exp_list ')' + { $$ = pars_func($1, $3); } + | PARS_INT_LIT { $$ = $1;} + | PARS_FLOAT_LIT { $$ = $1;} + | PARS_STR_LIT { $$ = $1;} + | PARS_FIXBINARY_LIT { $$ = $1;} + | PARS_BLOB_LIT { $$ = $1;} + | PARS_NULL_LIT { $$ = $1;} + | PARS_SQL_TOKEN { $$ = $1;} + | exp '+' exp { $$ = pars_op('+', $1, $3); } + | exp '-' exp { $$ = pars_op('-', $1, $3); } + | exp '*' exp { $$ = pars_op('*', $1, $3); } + | exp '/' exp { $$ = pars_op('/', $1, $3); } + | '-' exp %prec NEG { $$ = pars_op('-', $2, NULL); } + | '(' exp ')' { $$ = $2; } + | exp '=' exp { $$ = pars_op('=', $1, $3); } + | exp PARS_LIKE_TOKEN PARS_STR_LIT + { $$ = pars_op(PARS_LIKE_TOKEN, $1, $3); } + | exp '<' exp { $$ = pars_op('<', $1, $3); } + | exp '>' exp { $$ = pars_op('>', $1, $3); } + | exp PARS_GE_TOKEN exp { $$ = pars_op(PARS_GE_TOKEN, $1, $3); } + | exp PARS_LE_TOKEN exp { $$ = pars_op(PARS_LE_TOKEN, $1, $3); } + | exp PARS_NE_TOKEN exp { $$ = pars_op(PARS_NE_TOKEN, $1, $3); } + | exp PARS_AND_TOKEN exp{ $$ = pars_op(PARS_AND_TOKEN, $1, $3); } + | exp PARS_OR_TOKEN exp { $$ = pars_op(PARS_OR_TOKEN, $1, $3); } + | PARS_NOT_TOKEN exp { $$ = pars_op(PARS_NOT_TOKEN, $2, NULL); } + | PARS_ID_TOKEN '%' PARS_NOTFOUND_TOKEN + { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); } + | PARS_SQL_TOKEN '%' PARS_NOTFOUND_TOKEN + { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); } +; + +function_name: + PARS_TO_CHAR_TOKEN { $$ = &pars_to_char_token; } + | PARS_TO_NUMBER_TOKEN { $$ = &pars_to_number_token; } + | PARS_TO_BINARY_TOKEN { $$ = &pars_to_binary_token; } + | PARS_BINARY_TO_NUMBER_TOKEN + { $$ = &pars_binary_to_number_token; } + | PARS_SUBSTR_TOKEN { $$ = &pars_substr_token; } + | PARS_CONCAT_TOKEN { $$ = &pars_concat_token; } + | PARS_INSTR_TOKEN { $$ = &pars_instr_token; } + | PARS_LENGTH_TOKEN { $$ = &pars_length_token; } + | PARS_SYSDATE_TOKEN { $$ = &pars_sysdate_token; } + | PARS_RND_TOKEN { $$ = &pars_rnd_token; } + | PARS_RND_STR_TOKEN { $$ = &pars_rnd_str_token; } +; + +question_mark_list: + /* Nothing */ + | '?' + | question_mark_list ',' '?' 
+; + +stored_procedure_call: + '{' PARS_ID_TOKEN '(' question_mark_list ')' '}' + { $$ = pars_stored_procedure_call( + static_cast<sym_node_t*>($2)); } +; + +predefined_procedure_call: + predefined_procedure_name '(' exp_list ')' + { $$ = pars_procedure_call($1, $3); } +; + +predefined_procedure_name: + PARS_REPLSTR_TOKEN { $$ = &pars_replstr_token; } + | PARS_PRINTF_TOKEN { $$ = &pars_printf_token; } + | PARS_ASSERT_TOKEN { $$ = &pars_assert_token; } +; + +user_function_call: + PARS_ID_TOKEN '(' ')' { $$ = $1; } +; + +table_list: + table_name { $$ = que_node_list_add_last(NULL, $1); } + | table_list ',' table_name + { $$ = que_node_list_add_last($1, $3); } +; + +variable_list: + /* Nothing */ { $$ = NULL; } + | PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); } + | variable_list ',' PARS_ID_TOKEN + { $$ = que_node_list_add_last($1, $3); } +; + +exp_list: + /* Nothing */ { $$ = NULL; } + | exp { $$ = que_node_list_add_last(NULL, $1);} + | exp_list ',' exp { $$ = que_node_list_add_last($1, $3); } +; + +select_item: + exp { $$ = $1; } + | PARS_COUNT_TOKEN '(' '*' ')' + { $$ = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + sym_tab_add_int_lit( + pars_sym_tab_global, 1))); } + | PARS_COUNT_TOKEN '(' PARS_DISTINCT_TOKEN PARS_ID_TOKEN ')' + { $$ = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + pars_func(&pars_distinct_token, + que_node_list_add_last( + NULL, $4)))); } + | PARS_SUM_TOKEN '(' exp ')' + { $$ = pars_func(&pars_sum_token, + que_node_list_add_last(NULL, + $3)); } +; + +select_item_list: + /* Nothing */ { $$ = NULL; } + | select_item { $$ = que_node_list_add_last(NULL, $1); } + | select_item_list ',' select_item + { $$ = que_node_list_add_last($1, $3); } +; + +select_list: + '*' { $$ = pars_select_list(&pars_star_denoter, + NULL); } + | select_item_list PARS_INTO_TOKEN variable_list + { $$ = pars_select_list( + $1, static_cast<sym_node_t*>($3)); } + | select_item_list { $$ = pars_select_list($1, NULL); } +; + +search_condition: + /* Nothing */ { $$ = NULL; } + | PARS_WHERE_TOKEN exp { $$ = $2; } +; + +for_update_clause: + /* Nothing */ { $$ = NULL; } + | PARS_FOR_TOKEN PARS_UPDATE_TOKEN + { $$ = &pars_update_token; } +; + +lock_shared_clause: + /* Nothing */ { $$ = NULL; } + | PARS_LOCK_TOKEN PARS_IN_TOKEN PARS_SHARE_TOKEN PARS_MODE_TOKEN + { $$ = &pars_share_token; } +; + +order_direction: + /* Nothing */ { $$ = &pars_asc_token; } + | PARS_ASC_TOKEN { $$ = &pars_asc_token; } + | PARS_DESC_TOKEN { $$ = &pars_desc_token; } +; + +order_by_clause: + /* Nothing */ { $$ = NULL; } + | PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction + { $$ = pars_order_by( + static_cast<sym_node_t*>($3), + static_cast<pars_res_word_t*>($4)); } +; + +select_statement: + PARS_SELECT_TOKEN select_list + PARS_FROM_TOKEN table_list + search_condition + for_update_clause + lock_shared_clause + order_by_clause { $$ = pars_select_statement( + static_cast<sel_node_t*>($2), + static_cast<sym_node_t*>($4), + static_cast<que_node_t*>($5), + static_cast<pars_res_word_t*>($6), + static_cast<pars_res_word_t*>($7), + static_cast<order_node_t*>($8)); } +; + +insert_statement_start: + PARS_INSERT_TOKEN PARS_INTO_TOKEN + table_name { $$ = $3; } +; + +insert_statement: + insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')' + { $$ = pars_insert_statement( + static_cast<sym_node_t*>($1), $4, NULL); } + | insert_statement_start select_statement + { $$ = pars_insert_statement( + static_cast<sym_node_t*>($1), + NULL, + static_cast<sel_node_t*>($2)); } +; + +column_assignment: + 
PARS_ID_TOKEN '=' exp { $$ = pars_column_assignment( + static_cast<sym_node_t*>($1), + static_cast<que_node_t*>($3)); } +; + +column_assignment_list: + column_assignment { $$ = que_node_list_add_last(NULL, $1); } + | column_assignment_list ',' column_assignment + { $$ = que_node_list_add_last($1, $3); } +; + +cursor_positioned: + PARS_WHERE_TOKEN + PARS_CURRENT_TOKEN PARS_OF_TOKEN + PARS_ID_TOKEN { $$ = $4; } +; + +update_statement_start: + PARS_UPDATE_TOKEN table_name + PARS_SET_TOKEN + column_assignment_list { $$ = pars_update_statement_start( + FALSE, + static_cast<sym_node_t*>($2), + static_cast<col_assign_node_t*>($4)); } +; + +update_statement_searched: + update_statement_start + search_condition { $$ = pars_update_statement( + static_cast<upd_node_t*>($1), + NULL, + static_cast<que_node_t*>($2)); } +; + +update_statement_positioned: + update_statement_start + cursor_positioned { $$ = pars_update_statement( + static_cast<upd_node_t*>($1), + static_cast<sym_node_t*>($2), + NULL); } +; + +delete_statement_start: + PARS_DELETE_TOKEN PARS_FROM_TOKEN + table_name { $$ = pars_update_statement_start( + TRUE, + static_cast<sym_node_t*>($3), NULL); } +; + +delete_statement_searched: + delete_statement_start + search_condition { $$ = pars_update_statement( + static_cast<upd_node_t*>($1), + NULL, + static_cast<que_node_t*>($2)); } +; + +delete_statement_positioned: + delete_statement_start + cursor_positioned { $$ = pars_update_statement( + static_cast<upd_node_t*>($1), + static_cast<sym_node_t*>($2), + NULL); } +; + +row_printf_statement: + PARS_ROW_PRINTF_TOKEN select_statement + { $$ = pars_row_printf_statement( + static_cast<sel_node_t*>($2)); } +; + +assignment_statement: + PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp + { $$ = pars_assignment_statement( + static_cast<sym_node_t*>($1), + static_cast<que_node_t*>($3)); } +; + +elsif_element: + PARS_ELSIF_TOKEN + exp PARS_THEN_TOKEN statement_list + { $$ = pars_elsif_element($2, $4); } +; + +elsif_list: + elsif_element { $$ = que_node_list_add_last(NULL, $1); } + | elsif_list elsif_element + { $$ = que_node_list_add_last($1, $2); } +; + +else_part: + /* Nothing */ { $$ = NULL; } + | PARS_ELSE_TOKEN statement_list + { $$ = $2; } + | elsif_list { $$ = $1; } +; + +if_statement: + PARS_IF_TOKEN exp PARS_THEN_TOKEN statement_list + else_part + PARS_END_TOKEN PARS_IF_TOKEN + { $$ = pars_if_statement($2, $4, $5); } +; + +while_statement: + PARS_WHILE_TOKEN exp PARS_LOOP_TOKEN statement_list + PARS_END_TOKEN PARS_LOOP_TOKEN + { $$ = pars_while_statement($2, $4); } +; + +for_statement: + PARS_FOR_TOKEN PARS_ID_TOKEN PARS_IN_TOKEN + exp PARS_DDOT_TOKEN exp + PARS_LOOP_TOKEN statement_list + PARS_END_TOKEN PARS_LOOP_TOKEN + { $$ = pars_for_statement( + static_cast<sym_node_t*>($2), + $4, $6, $8); } +; + +exit_statement: + PARS_EXIT_TOKEN { $$ = pars_exit_statement(); } +; + +return_statement: + PARS_RETURN_TOKEN { $$ = pars_return_statement(); } +; + +open_cursor_statement: + PARS_OPEN_TOKEN PARS_ID_TOKEN + { $$ = pars_open_statement( + ROW_SEL_OPEN_CURSOR, + static_cast<sym_node_t*>($2)); } +; + +close_cursor_statement: + PARS_CLOSE_TOKEN PARS_ID_TOKEN + { $$ = pars_open_statement( + ROW_SEL_CLOSE_CURSOR, + static_cast<sym_node_t*>($2)); } +; + +fetch_statement: + PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list + { $$ = pars_fetch_statement( + static_cast<sym_node_t*>($2), + static_cast<sym_node_t*>($4), NULL); } + | PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call + { $$ = pars_fetch_statement( + static_cast<sym_node_t*>($2), + 
NULL, + static_cast<sym_node_t*>($4)); } +; + +column_def: + PARS_ID_TOKEN type_name opt_column_len opt_unsigned opt_not_null + { $$ = pars_column_def( + static_cast<sym_node_t*>($1), + static_cast<pars_res_word_t*>($2), + static_cast<sym_node_t*>($3), + $4, $5); } +; + +column_def_list: + column_def { $$ = que_node_list_add_last(NULL, $1); } + | column_def_list ',' column_def + { $$ = que_node_list_add_last($1, $3); } +; + +opt_column_len: + /* Nothing */ { $$ = NULL; } + | '(' PARS_INT_LIT ')' + { $$ = $2; } +; + +opt_unsigned: + /* Nothing */ { $$ = NULL; } + | PARS_UNSIGNED_TOKEN + { $$ = &pars_int_token; + /* pass any non-NULL pointer */ } +; + +opt_not_null: + /* Nothing */ { $$ = NULL; } + | PARS_NOT_TOKEN PARS_NULL_LIT + { $$ = &pars_int_token; + /* pass any non-NULL pointer */ } +; + +not_fit_in_memory: + /* Nothing */ { $$ = NULL; } + | PARS_DOES_NOT_FIT_IN_MEM_TOKEN + { $$ = &pars_int_token; + /* pass any non-NULL pointer */ } +; + +compact: + /* Nothing */ { $$ = NULL; } + | PARS_COMPACT_TOKEN { $$ = &pars_int_token; + /* pass any non-NULL pointer */ } +; + +block_size: + /* Nothing */ { $$ = NULL; } + | PARS_BLOCK_SIZE_TOKEN '=' PARS_INT_LIT + { $$ = $3; } +; + +create_table: + PARS_CREATE_TOKEN PARS_TABLE_TOKEN + table_name '(' column_def_list ')' + not_fit_in_memory compact block_size + { $$ = pars_create_table( + static_cast<sym_node_t*>($3), + static_cast<sym_node_t*>($5), + static_cast<sym_node_t*>($8), + static_cast<sym_node_t*>($9), $7); } +; + +column_list: + PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); } + | column_list ',' PARS_ID_TOKEN + { $$ = que_node_list_add_last($1, $3); } +; + +unique_def: + /* Nothing */ { $$ = NULL; } + | PARS_UNIQUE_TOKEN { $$ = &pars_unique_token; } +; + +clustered_def: + /* Nothing */ { $$ = NULL; } + | PARS_CLUSTERED_TOKEN { $$ = &pars_clustered_token; } +; + +create_index: + PARS_CREATE_TOKEN unique_def + clustered_def + PARS_INDEX_TOKEN + PARS_ID_TOKEN PARS_ON_TOKEN + table_name + '(' column_list ')' { $$ = pars_create_index( + static_cast<pars_res_word_t*>($2), + static_cast<pars_res_word_t*>($3), + static_cast<sym_node_t*>($5), + static_cast<sym_node_t*>($7), + static_cast<sym_node_t*>($9)); } +; + +table_name: + PARS_ID_TOKEN { $$ = $1; } + | PARS_TABLE_NAME_TOKEN { $$ = $1; } +; + +commit_statement: + PARS_COMMIT_TOKEN PARS_WORK_TOKEN + { $$ = pars_commit_statement(); } +; + +rollback_statement: + PARS_ROLLBACK_TOKEN PARS_WORK_TOKEN + { $$ = pars_rollback_statement(); } +; + +type_name: + PARS_INT_TOKEN { $$ = &pars_int_token; } + | PARS_INTEGER_TOKEN { $$ = &pars_int_token; } + | PARS_BIGINT_TOKEN { $$ = &pars_bigint_token; } + | PARS_CHAR_TOKEN { $$ = &pars_char_token; } + | PARS_BINARY_TOKEN { $$ = &pars_binary_token; } + | PARS_BLOB_TOKEN { $$ = &pars_blob_token; } +; + +parameter_declaration: + PARS_ID_TOKEN PARS_IN_TOKEN type_name + { $$ = pars_parameter_declaration( + static_cast<sym_node_t*>($1), + PARS_INPUT, + static_cast<pars_res_word_t*>($3)); } + | PARS_ID_TOKEN PARS_OUT_TOKEN type_name + { $$ = pars_parameter_declaration( + static_cast<sym_node_t*>($1), + PARS_OUTPUT, + static_cast<pars_res_word_t*>($3)); } +; + +parameter_declaration_list: + /* Nothing */ { $$ = NULL; } + | parameter_declaration { $$ = que_node_list_add_last(NULL, $1); } + | parameter_declaration_list ',' parameter_declaration + { $$ = que_node_list_add_last($1, $3); } +; + +variable_declaration: + PARS_ID_TOKEN type_name ';' + { $$ = pars_variable_declaration( + static_cast<sym_node_t*>($1), + static_cast<pars_res_word_t*>($2)); } +; + 
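
Taken together, the rules above and below define the small procedural SQL dialect InnoDB uses internally. As a quick orientation (not part of the committed files), here is a hypothetical program in that dialect, written as the C string literal it would be handed to the parser as; every table and column name in it is invented:

    /* Exercises variable_declaration ("n INT;"), select_statement with an
       INTO list, if_statement, update_statement_searched, and
       commit_statement, all from the grammar above. */
    static const char       demo_sql[] =
            "PROCEDURE DEMO_PROC () IS\n"
            "n INT;\n"
            "BEGIN\n"
            "SELECT COUNT(*) INTO n FROM SYS_DEMO_TABLE;\n"
            "IF n > 0 THEN\n"
            "UPDATE SYS_DEMO_TABLE SET DEMO_COL = DEMO_COL + 1;\n"
            "END IF;\n"
            "COMMIT WORK;\n"
            "END;\n";

    /* Elsewhere in InnoDB such a string is typically executed through
       que_eval_sql(); the call sites are outside this diff. */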
+variable_declaration_list: + /* Nothing */ + | variable_declaration + | variable_declaration_list variable_declaration +; + +cursor_declaration: + PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN + PARS_IS_TOKEN select_statement ';' + { $$ = pars_cursor_declaration( + static_cast<sym_node_t*>($3), + static_cast<sel_node_t*>($5)); } +; + +function_declaration: + PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';' + { $$ = pars_function_declaration( + static_cast<sym_node_t*>($3)); } +; + +declaration: + cursor_declaration + | function_declaration +; + +declaration_list: + /* Nothing */ + | declaration + | declaration_list declaration +; + +procedure_definition: + PARS_PROCEDURE_TOKEN PARS_ID_TOKEN '(' parameter_declaration_list ')' + PARS_IS_TOKEN + variable_declaration_list + declaration_list + PARS_BEGIN_TOKEN + statement_list + PARS_END_TOKEN { $$ = pars_procedure_definition( + static_cast<sym_node_t*>($2), + static_cast<sym_node_t*>($4), + $10); } +; + +%% diff --git a/storage/xtradb/pars/pars0lex.l b/storage/xtradb/pars/pars0lex.l new file mode 100644 index 00000000000..83c3af4b6c5 --- /dev/null +++ b/storage/xtradb/pars/pars0lex.l @@ -0,0 +1,704 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser lexical analyzer: input file for the GNU Flex lexer generator + +The InnoDB parser is frozen because MySQL takes care of SQL parsing. +Therefore we normally keep the InnoDB parser C files as they are, and do +not automatically generate them from pars0grm.y and pars0lex.l. + +How to make the InnoDB parser and lexer C files: + +1. Run ./make_flex.sh to generate lexer files. + +2. Run ./make_bison.sh to generate parser files. + +These instructions seem to work at least with bison-1.875d and flex-2.5.31 on +Linux. 
+ +Created 12/14/1997 Heikki Tuuri +*******************************************************/ + +%option nostdinit +%option 8bit +%option warn +%option pointer +%option never-interactive +%option nodefault +%option noinput +%option nounput +%option noyywrap +%option noyy_scan_buffer +%option noyy_scan_bytes +%option noyy_scan_string +%option nounistd + +%{ +#define YYSTYPE que_node_t* + +#include "univ.i" +#include "pars0pars.h" +#include "pars0grm.h" +#include "pars0sym.h" +#include "mem0mem.h" +#include "os0proc.h" + +#define malloc(A) ut_malloc(A) +#define free(A) ut_free(A) +#define realloc(P, A) ut_realloc(P, A) +#define exit(A) ut_error + +/* Note: We cast &result to int* from yysize_t* */ +#define YY_INPUT(buf, result, max_size) \ + pars_get_lex_chars(buf, (int*) &result, max_size) + +/* String buffer for removing quotes */ +static ulint stringbuf_len_alloc = 0; /* Allocated length */ +static ulint stringbuf_len = 0; /* Current length */ +static char* stringbuf; /* Start of buffer */ +/** Appends a string to the buffer. */ +static +void +string_append( +/*==========*/ + const char* str, /*!< in: string to be appended */ + ulint len) /*!< in: length of the string */ +{ + if (stringbuf == NULL) { + stringbuf = static_cast<char*>(malloc(1)); + stringbuf_len_alloc = 1; + } + + if (stringbuf_len + len > stringbuf_len_alloc) { + while (stringbuf_len + len > stringbuf_len_alloc) { + stringbuf_len_alloc <<= 1; + } + + stringbuf = static_cast<char*>( + realloc(stringbuf, stringbuf_len_alloc)); + } + + memcpy(stringbuf + stringbuf_len, str, len); + stringbuf_len += len; +} + +%} + +DIGIT [0-9] +ID [a-z_A-Z][a-z_A-Z0-9]* +TABLE_NAME [a-z_A-Z][@a-z_A-Z0-9]*\/(#sql-|[a-z_A-Z])[a-z_A-Z0-9]* +BOUND_LIT \:[a-z_A-Z0-9]+ +BOUND_ID \$[a-z_A-Z0-9]+ + +%x comment +%x quoted +%x id +%% + +{DIGIT}+ { + yylval = sym_tab_add_int_lit(pars_sym_tab_global, + atoi(yytext)); + return(PARS_INT_LIT); +} + +{DIGIT}+"."{DIGIT}* { + ut_error; /* not implemented */ + + return(PARS_FLOAT_LIT); +} + +{BOUND_LIT} { + ulint type; + + yylval = sym_tab_add_bound_lit(pars_sym_tab_global, + yytext + 1, &type); + + return((int) type); +} + +{BOUND_ID} { + yylval = sym_tab_add_bound_id(pars_sym_tab_global, + yytext + 1); + + return(PARS_ID_TOKEN); +} + +"'" { +/* Quoted character string literals are handled in an explicit +start state 'quoted'. This state is entered and the buffer for +the scanned string is emptied upon encountering a starting quote. + +In the state 'quoted', only two actions are possible (defined below). */ + BEGIN(quoted); + stringbuf_len = 0; +} +<quoted>[^\']+ { + /* Got a sequence of characters other than "'": + append to string buffer */ + string_append(yytext, yyleng); +} +<quoted>"'"+ { + /* Got a sequence of "'" characters: + append half of them to string buffer, + as "''" represents a single "'". + We apply truncating division, + so that "'''" will result in "'". */ + + string_append(yytext, yyleng / 2); + + /* If we got an odd number of quotes, then the + last quote we got is the terminating quote. + At the end of the string, we return to the + initial start state and report the scanned + string literal. */ + + if (yyleng % 2) { + BEGIN(INITIAL); + yylval = sym_tab_add_str_lit( + pars_sym_tab_global, + (byte*) stringbuf, stringbuf_len); + return(PARS_STR_LIT); + } +} + +\" { +/* Quoted identifiers are handled in an explicit start state 'id'. +This state is entered and the buffer for the scanned string is emptied +upon encountering a starting quote. 
+ +In the state 'id', only two actions are possible (defined below). */ + BEGIN(id); + stringbuf_len = 0; +} +<id>[^\"]+ { + /* Got a sequence of characters other than '"': + append to string buffer */ + string_append(yytext, yyleng); +} +<id>\"+ { + /* Got a sequence of '"' characters: + append half of them to string buffer, + as '""' represents a single '"'. + We apply truncating division, + so that '"""' will result in '"'. */ + + string_append(yytext, yyleng / 2); + + /* If we got an odd number of quotes, then the + last quote we got is the terminating quote. + At the end of the string, we return to the + initial start state and report the scanned + identifier. */ + + if (yyleng % 2) { + BEGIN(INITIAL); + yylval = sym_tab_add_id( + pars_sym_tab_global, + (byte*) stringbuf, stringbuf_len); + + return(PARS_ID_TOKEN); + } +} + +"NULL" { + yylval = sym_tab_add_null_lit(pars_sym_tab_global); + + return(PARS_NULL_LIT); +} + +"SQL" { + /* Implicit cursor name */ + yylval = sym_tab_add_str_lit(pars_sym_tab_global, + (byte*) yytext, yyleng); + return(PARS_SQL_TOKEN); +} + +"AND" { + return(PARS_AND_TOKEN); +} + +"OR" { + return(PARS_OR_TOKEN); +} + +"NOT" { + return(PARS_NOT_TOKEN); +} + +"PROCEDURE" { + return(PARS_PROCEDURE_TOKEN); +} + +"IN" { + return(PARS_IN_TOKEN); +} + +"OUT" { + return(PARS_OUT_TOKEN); +} + +"BINARY" { + return(PARS_BINARY_TOKEN); +} + +"BLOB" { + return(PARS_BLOB_TOKEN); +} + +"INT" { + return(PARS_INT_TOKEN); +} + +"INTEGER" { + return(PARS_INT_TOKEN); +} + +"FLOAT" { + return(PARS_FLOAT_TOKEN); +} + +"CHAR" { + return(PARS_CHAR_TOKEN); +} + +"IS" { + return(PARS_IS_TOKEN); +} + +"BEGIN" { + return(PARS_BEGIN_TOKEN); +} + +"END" { + return(PARS_END_TOKEN); +} + +"IF" { + return(PARS_IF_TOKEN); +} + +"THEN" { + return(PARS_THEN_TOKEN); +} + +"ELSE" { + return(PARS_ELSE_TOKEN); +} + +"ELSIF" { + return(PARS_ELSIF_TOKEN); +} + +"LOOP" { + return(PARS_LOOP_TOKEN); +} + +"WHILE" { + return(PARS_WHILE_TOKEN); +} + +"RETURN" { + return(PARS_RETURN_TOKEN); +} + +"SELECT" { + return(PARS_SELECT_TOKEN); +} + +"SUM" { + return(PARS_SUM_TOKEN); +} + +"COUNT" { + return(PARS_COUNT_TOKEN); +} + +"DISTINCT" { + return(PARS_DISTINCT_TOKEN); +} + +"FROM" { + return(PARS_FROM_TOKEN); +} + +"WHERE" { + return(PARS_WHERE_TOKEN); +} + +"FOR" { + return(PARS_FOR_TOKEN); +} + +"READ" { + return(PARS_READ_TOKEN); +} + +"ORDER" { + return(PARS_ORDER_TOKEN); +} + +"BY" { + return(PARS_BY_TOKEN); +} + +"ASC" { + return(PARS_ASC_TOKEN); +} + +"DESC" { + return(PARS_DESC_TOKEN); +} + +"INSERT" { + return(PARS_INSERT_TOKEN); +} + +"INTO" { + return(PARS_INTO_TOKEN); +} + +"VALUES" { + return(PARS_VALUES_TOKEN); +} + +"UPDATE" { + return(PARS_UPDATE_TOKEN); +} + +"SET" { + return(PARS_SET_TOKEN); +} + +"DELETE" { + return(PARS_DELETE_TOKEN); +} + +"CURRENT" { + return(PARS_CURRENT_TOKEN); +} + +"OF" { + return(PARS_OF_TOKEN); +} + +"CREATE" { + return(PARS_CREATE_TOKEN); +} + +"TABLE" { + return(PARS_TABLE_TOKEN); +} + +"COMPACT" { + return(PARS_COMPACT_TOKEN); +} + +"BLOCK_SIZE" { + return(PARS_BLOCK_SIZE_TOKEN); +} + +"INDEX" { + return(PARS_INDEX_TOKEN); +} + +"UNIQUE" { + return(PARS_UNIQUE_TOKEN); +} + +"CLUSTERED" { + return(PARS_CLUSTERED_TOKEN); +} + +"DOES_NOT_FIT_IN_MEMORY" { + return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN); +} + +"ON" { + return(PARS_ON_TOKEN); +} + +"DECLARE" { + return(PARS_DECLARE_TOKEN); +} + +"CURSOR" { + return(PARS_CURSOR_TOKEN); +} + +"OPEN" { + return(PARS_OPEN_TOKEN); +} + +"FETCH" { + return(PARS_FETCH_TOKEN); +} + +"CLOSE" { + return(PARS_CLOSE_TOKEN); +} + 
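
(Pausing between the keyword rules for a note on the quoted-literal actions earlier in this rules section: a run of consecutive quote characters is collapsed with truncating division, appending yyleng / 2 copies, and the literal terminates exactly when the run length is odd. A standalone C sketch of that arithmetic, independent of the Flex scaffolding:

    #include <stdio.h>

    int main(void)
    {
            /* q consecutive quotes seen inside a quoted literal:
               append q / 2 of them; an odd run ends the literal, so ''
               stands for one quote and ''' for a quote plus the closing
               delimiter. */
            for (int q = 1; q <= 4; q++) {
                    printf("run of %d: append %d, %s\n",
                           q, q / 2,
                           (q % 2) ? "literal ends" : "literal continues");
            }
            return(0);
    }

The keyword rules continue below.)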
+"NOTFOUND" { + return(PARS_NOTFOUND_TOKEN); +} + +"TO_CHAR" { + return(PARS_TO_CHAR_TOKEN); +} + +"TO_NUMBER" { + return(PARS_TO_NUMBER_TOKEN); +} + +"TO_BINARY" { + return(PARS_TO_BINARY_TOKEN); +} + +"BINARY_TO_NUMBER" { + return(PARS_BINARY_TO_NUMBER_TOKEN); +} + +"SUBSTR" { + return(PARS_SUBSTR_TOKEN); +} + +"REPLSTR" { + return(PARS_REPLSTR_TOKEN); +} + +"CONCAT" { + return(PARS_CONCAT_TOKEN); +} + +"INSTR" { + return(PARS_INSTR_TOKEN); +} + +"LENGTH" { + return(PARS_LENGTH_TOKEN); +} + +"SYSDATE" { + return(PARS_SYSDATE_TOKEN); +} + +"PRINTF" { + return(PARS_PRINTF_TOKEN); +} + +"ASSERT" { + return(PARS_ASSERT_TOKEN); +} + +"RND" { + return(PARS_RND_TOKEN); +} + +"RND_STR" { + return(PARS_RND_STR_TOKEN); +} + +"ROW_PRINTF" { + return(PARS_ROW_PRINTF_TOKEN); +} + +"COMMIT" { + return(PARS_COMMIT_TOKEN); +} + +"ROLLBACK" { + return(PARS_ROLLBACK_TOKEN); +} + +"WORK" { + return(PARS_WORK_TOKEN); +} + +"UNSIGNED" { + return(PARS_UNSIGNED_TOKEN); +} + +"EXIT" { + return(PARS_EXIT_TOKEN); +} + +"FUNCTION" { + return(PARS_FUNCTION_TOKEN); +} + +"LOCK" { + return(PARS_LOCK_TOKEN); +} + +"SHARE" { + return(PARS_SHARE_TOKEN); +} + +"MODE" { + return(PARS_MODE_TOKEN); +} + +"LIKE" { + return(PARS_LIKE_TOKEN); +} + +"BIGINT" { + return(PARS_BIGINT_TOKEN); +} + +{ID} { + yylval = sym_tab_add_id(pars_sym_tab_global, + (byte*) yytext, + ut_strlen(yytext)); + return(PARS_ID_TOKEN); +} + +{TABLE_NAME} { + yylval = sym_tab_add_id(pars_sym_tab_global, + (byte*) yytext, + ut_strlen(yytext)); + return(PARS_TABLE_NAME_TOKEN); +} + +".." { + return(PARS_DDOT_TOKEN); +} + +":=" { + return(PARS_ASSIGN_TOKEN); +} + +"<=" { + return(PARS_LE_TOKEN); +} + +">=" { + return(PARS_GE_TOKEN); +} + +"<>" { + return(PARS_NE_TOKEN); +} + +"(" { + + return((int)(*yytext)); +} + +"=" { + + return((int)(*yytext)); +} + +">" { + + return((int)(*yytext)); +} + +"<" { + + return((int)(*yytext)); +} + +"," { + + return((int)(*yytext)); +} + +";" { + + return((int)(*yytext)); +} + +")" { + + return((int)(*yytext)); +} + +"+" { + + return((int)(*yytext)); +} + +"-" { + + return((int)(*yytext)); +} + +"*" { + + return((int)(*yytext)); +} + +"/" { + + return((int)(*yytext)); +} + +"%" { + + return((int)(*yytext)); +} + +"{" { + + return((int)(*yytext)); +} + +"}" { + + return((int)(*yytext)); +} + +"?" { + + return((int)(*yytext)); +} + +"/*" BEGIN(comment); /* eat up comment */ + +<comment>[^*]* +<comment>"*"+[^*/]* +<comment>"*"+"/" BEGIN(INITIAL); + +[ \t\n]+ /* eat up whitespace */ + + +. { + fprintf(stderr,"Unrecognized character: %02x\n", + *yytext); + + ut_error; + + return(0); +} + +%% + +/********************************************************************** +Release any resources used by the lexer. */ +UNIV_INTERN +void +pars_lexer_close(void) +/*==================*/ +{ + yylex_destroy(); + free(stringbuf); + stringbuf = NULL; + stringbuf_len_alloc = stringbuf_len = 0; +} diff --git a/storage/xtradb/pars/pars0opt.cc b/storage/xtradb/pars/pars0opt.cc new file mode 100644 index 00000000000..cbed2b39eeb --- /dev/null +++ b/storage/xtradb/pars/pars0opt.cc @@ -0,0 +1,1259 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file pars/pars0opt.cc +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ + +#include "pars0opt.h" + +#ifdef UNIV_NONINL +#include "pars0opt.ic" +#endif + +#include "row0sel.h" +#include "row0ins.h" +#include "row0upd.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "que0que.h" +#include "pars0grm.h" +#include "pars0pars.h" +#include "lock0lock.h" + +#define OPT_EQUAL 1 /* comparison by = */ +#define OPT_COMPARISON 2 /* comparison by <, >, <=, or >= */ + +#define OPT_NOT_COND 1 +#define OPT_END_COND 2 +#define OPT_TEST_COND 3 +#define OPT_SCROLL_COND 4 + + +/*******************************************************************//** +Inverts a comparison operator. +@return the equivalent operator when the order of the arguments is switched */ +static +int +opt_invert_cmp_op( +/*==============*/ + int op) /*!< in: operator */ +{ + if (op == '<') { + return('>'); + } else if (op == '>') { + return('<'); + } else if (op == '=') { + return('='); + } else if (op == PARS_LE_TOKEN) { + return(PARS_GE_TOKEN); + } else if (op == PARS_GE_TOKEN) { + return(PARS_LE_TOKEN); + } else { + /* TODO: LIKE operator */ + ut_error; + } + + return(0); +} + +/*******************************************************************//** +Checks if the value of an expression can be calculated BEFORE the nth table +in a join is accessed. If this is the case, it can possibly be used in an +index search for the nth table. +@return TRUE if already determined */ +static +ibool +opt_check_exp_determined_before( +/*============================*/ + que_node_t* exp, /*!< in: expression */ + sel_node_t* sel_node, /*!< in: select node */ + ulint nth_table) /*!< in: nth table will be accessed */ +{ + func_node_t* func_node; + sym_node_t* sym_node; + dict_table_t* table; + que_node_t* arg; + ulint i; + + ut_ad(exp && sel_node); + + if (que_node_get_type(exp) == QUE_NODE_FUNC) { + func_node = static_cast<func_node_t*>(exp); + + arg = func_node->args; + + while (arg) { + if (!opt_check_exp_determined_before(arg, sel_node, + nth_table)) { + return(FALSE); + } + + arg = que_node_get_next(arg); + } + + return(TRUE); + } + + ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL); + + sym_node = static_cast<sym_node_t*>(exp); + + if (sym_node->token_type != SYM_COLUMN) { + + return(TRUE); + } + + for (i = 0; i < nth_table; i++) { + + table = sel_node_get_nth_plan(sel_node, i)->table; + + if (sym_node->table == table) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Looks in a comparison condition if a column value is already restricted by +it BEFORE the nth table is accessed. 
+@return expression restricting the value of the column, or NULL if not known */ +static +que_node_t* +opt_look_for_col_in_comparison_before( +/*==================================*/ + ulint cmp_type, /*!< in: OPT_EQUAL, OPT_COMPARISON */ + ulint col_no, /*!< in: column number */ + func_node_t* search_cond, /*!< in: comparison condition */ + sel_node_t* sel_node, /*!< in: select node */ + ulint nth_table, /*!< in: nth table in a join (a query + from a single table is considered a + join of 1 table) */ + ulint* op) /*!< out: comparison operator ('=', + PARS_GE_TOKEN, ... ); this is inverted + if the column appears on the right + side */ +{ + sym_node_t* sym_node; + dict_table_t* table; + que_node_t* exp; + que_node_t* arg; + + ut_ad(search_cond); + + ut_a((search_cond->func == '<') + || (search_cond->func == '>') + || (search_cond->func == '=') + || (search_cond->func == PARS_GE_TOKEN) + || (search_cond->func == PARS_LE_TOKEN) + || (search_cond->func == PARS_LIKE_TOKEN_EXACT) + || (search_cond->func == PARS_LIKE_TOKEN_PREFIX) + || (search_cond->func == PARS_LIKE_TOKEN_SUFFIX) + || (search_cond->func == PARS_LIKE_TOKEN_SUBSTR)); + + table = sel_node_get_nth_plan(sel_node, nth_table)->table; + + if ((cmp_type == OPT_EQUAL) + && (search_cond->func != '=') + && (search_cond->func != PARS_LIKE_TOKEN_EXACT) + && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)) { + + return(NULL); + + } else if ((cmp_type == OPT_COMPARISON) + && (search_cond->func != '<') + && (search_cond->func != '>') + && (search_cond->func != PARS_GE_TOKEN) + && (search_cond->func != PARS_LE_TOKEN) + && (search_cond->func != PARS_LIKE_TOKEN_PREFIX) + && (search_cond->func != PARS_LIKE_TOKEN_SUFFIX)) { + + return(NULL); + } + + arg = search_cond->args; + + if (que_node_get_type(arg) == QUE_NODE_SYMBOL) { + sym_node = static_cast<sym_node_t*>(arg); + + if ((sym_node->token_type == SYM_COLUMN) + && (sym_node->table == table) + && (sym_node->col_no == col_no)) { + + /* sym_node contains the desired column id */ + + /* Check if the expression on the right side of the + operator is already determined */ + + exp = que_node_get_next(arg); + + if (opt_check_exp_determined_before(exp, sel_node, + nth_table)) { + *op = search_cond->func; + + return(exp); + } + } + } + + exp = search_cond->args; + arg = que_node_get_next(arg); + + if (que_node_get_type(arg) == QUE_NODE_SYMBOL) { + sym_node = static_cast<sym_node_t*>(arg); + + if ((sym_node->token_type == SYM_COLUMN) + && (sym_node->table == table) + && (sym_node->col_no == col_no)) { + + if (opt_check_exp_determined_before(exp, sel_node, + nth_table)) { + *op = opt_invert_cmp_op(search_cond->func); + + return(exp); + } + } + } + + return(NULL); +} + +/*******************************************************************//** +Looks in a search condition if a column value is already restricted by the +search condition BEFORE the nth table is accessed. Takes into account that +if we will fetch in an ascending order, we cannot utilize an upper limit for +a column value; in a descending order, respectively, a lower limit. 
+@return expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_cond_before(
+/*============================*/
+	ulint		cmp_type,	/*!< in: OPT_EQUAL, OPT_COMPARISON */
+	ulint		col_no,		/*!< in: column number */
+	func_node_t*	search_cond,	/*!< in: search condition or NULL */
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		nth_table,	/*!< in: nth table in a join (a query
+					from a single table is considered a
+					join of 1 table) */
+	ulint*		op)		/*!< out: comparison operator ('=',
+					PARS_GE_TOKEN, ... ) */
+{
+	func_node_t*	new_cond;
+	que_node_t*	exp;
+
+	if (search_cond == NULL) {
+
+		return(NULL);
+	}
+
+	ut_a(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+	ut_a(search_cond->func != PARS_OR_TOKEN);
+	ut_a(search_cond->func != PARS_NOT_TOKEN);
+
+	if (search_cond->func == PARS_AND_TOKEN) {
+		new_cond = static_cast<func_node_t*>(search_cond->args);
+
+		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+						      new_cond, sel_node,
+						      nth_table, op);
+		if (exp) {
+
+			return(exp);
+		}
+
+		new_cond = static_cast<func_node_t*>(
+			que_node_get_next(new_cond));
+
+		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+						      new_cond, sel_node,
+						      nth_table, op);
+		return(exp);
+	}
+
+	exp = opt_look_for_col_in_comparison_before(cmp_type, col_no,
+						    search_cond, sel_node,
+						    nth_table, op);
+	if (exp == NULL) {
+
+		return(NULL);
+	}
+
+	/* If we will fetch in an ascending order, we cannot utilize an upper
+	limit for a column value; in a descending order, respectively, a lower
+	limit */
+
+	if (sel_node->asc && ((*op == '<') || (*op == PARS_LE_TOKEN))) {
+
+		return(NULL);
+
+	} else if (!sel_node->asc
+		   && ((*op == '>') || (*op == PARS_GE_TOKEN))) {
+
+		return(NULL);
+	}
+
+	return(exp);
+}
+
+/*******************************************************************//**
+Calculates the goodness for an index according to a select node. The
+goodness is 4 times the number of first fields in index whose values we
+already know exactly in the query. If we have a comparison condition for
+an additional field, 2 points are added. If the index is unique, and we know
+all the unique fields for the index we add 1024 points. For a clustered index
+we add 1 point.
+@return goodness */
+static
+ulint
+opt_calc_index_goodness(
+/*====================*/
+	dict_index_t*	index,		/*!< in: index */
+	sel_node_t*	sel_node,	/*!< in: parsed select node */
+	ulint		nth_table,	/*!< in: nth table in a join */
+	que_node_t**	index_plan,	/*!< in/out: comparison expressions for
+					this index */
+	ulint*		last_op)	/*!< out: last comparison operator, if
+					goodness > 1 */
+{
+	que_node_t*	exp;
+	ulint		goodness;
+	ulint		n_fields;
+	ulint		col_no;
+	ulint		op;
+	ulint		j;
+
+	/* At least for now we don't support using FTS indexes for queries
+	done through InnoDB's own SQL parser. */
+	if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
+		return(0);
+	}
+
+	goodness = 0;
+
+	/* Note that as higher level node pointers in the B-tree contain
+	page addresses as the last field, we must not put more fields in
+	the search tuple than dict_index_get_n_unique_in_tree(index); see
+	the note in btr_cur_search_to_nth_level.
 */
+
+	n_fields = dict_index_get_n_unique_in_tree(index);
+
+	for (j = 0; j < n_fields; j++) {
+
+		col_no = dict_index_get_nth_col_no(index, j);
+
+		exp = opt_look_for_col_in_cond_before(
+			OPT_EQUAL, col_no,
+			static_cast<func_node_t*>(sel_node->search_cond),
+			sel_node, nth_table, &op);
+		if (exp) {
+			/* The value for this column is exactly known already
+			at this stage of the join */
+
+			index_plan[j] = exp;
+			*last_op = op;
+			goodness += 4;
+		} else {
+			/* Look for non-equality comparisons */
+
+			exp = opt_look_for_col_in_cond_before(
+				OPT_COMPARISON, col_no,
+				static_cast<func_node_t*>(
+					sel_node->search_cond),
+				sel_node, nth_table, &op);
+			if (exp) {
+				index_plan[j] = exp;
+				*last_op = op;
+				goodness += 2;
+			}
+
+			break;
+		}
+	}
+
+	if (goodness >= 4 * dict_index_get_n_unique(index)) {
+		goodness += 1024;
+
+		if (dict_index_is_clust(index)) {
+
+			goodness += 1024;
+		}
+	}
+
+	/* We have to test for goodness here, as last_op may not be set */
+	if (goodness && dict_index_is_clust(index)) {
+
+		goodness++;
+	}
+
+	return(goodness);
+}
+
+/*******************************************************************//**
+Calculates the number of matched fields based on an index goodness.
+@return number of exactly or partially matched fields */
+UNIV_INLINE
+ulint
+opt_calc_n_fields_from_goodness(
+/*============================*/
+	ulint	goodness)	/*!< in: goodness */
+{
+	return(((goodness % 1024) + 2) / 4);
+}
+
+/*******************************************************************//**
+Converts a comparison operator to the corresponding search mode PAGE_CUR_GE,
+...
+@return search mode */
+UNIV_INLINE
+ulint
+opt_op_to_search_mode(
+/*==================*/
+	ibool	asc,	/*!< in: TRUE if the rows should be fetched in an
+			ascending order */
+	ulint	op)	/*!< in: operator '=', PARS_GE_TOKEN, ... */
+{
+	if (op == '='
+	    || op == PARS_LIKE_TOKEN_EXACT
+	    || op == PARS_LIKE_TOKEN_PREFIX
+	    || op == PARS_LIKE_TOKEN_SUFFIX
+	    || op == PARS_LIKE_TOKEN_SUBSTR) {
+
+		if (asc) {
+			return(PAGE_CUR_GE);
+		} else {
+			return(PAGE_CUR_LE);
+		}
+	} else if (op == '<') {
+		ut_a(!asc);
+		return(PAGE_CUR_L);
+	} else if (op == '>') {
+		ut_a(asc);
+		return(PAGE_CUR_G);
+	} else if (op == PARS_GE_TOKEN) {
+		ut_a(asc);
+		return(PAGE_CUR_GE);
+	} else if (op == PARS_LE_TOKEN) {
+		ut_a(!asc);
+		return(PAGE_CUR_LE);
+	} else {
+		ut_error;
+	}
+
+	return(0);
+}
+
+/*******************************************************************//**
+Determines if a node is an argument node of a function node.
+@return TRUE if is an argument */
+static
+ibool
+opt_is_arg(
+/*=======*/
+	que_node_t*	arg_node,	/*!< in: possible argument node */
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg;
+
+	arg = func_node->args;
+
+	while (arg) {
+		if (arg == arg_node) {
+
+			return(TRUE);
+		}
+
+		arg = que_node_get_next(arg);
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Decides if the fetching of rows should be made in a descending order, and
+also checks that the chosen query plan produces a result which satisfies
+the order-by.
*/
+static
+void
+opt_check_order_by(
+/*===============*/
+ sel_node_t* sel_node) /*!< in: select node; asserts an error
+ if the plan does not agree with the
+ order-by */
+{
+ order_node_t* order_node;
+ dict_table_t* order_table;
+ ulint order_col_no;
+ plan_t* plan;
+ ulint i;
+
+ if (!sel_node->order_by) {
+
+ return;
+ }
+
+ order_node = sel_node->order_by;
+ order_col_no = order_node->column->col_no;
+ order_table = order_node->column->table;
+
+ /* If there is an order-by clause, the first non-exactly matched field
+ in the index used for the last table in the table list should be the
+ column defined in the order-by clause, and for all the other tables
+ we should get only at most a single row, otherwise we cannot presently
+ calculate the order-by, as we have no sort utility */
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (i < sel_node->n_tables - 1) {
+ ut_a(dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match);
+ } else {
+ ut_a(plan->table == order_table);
+
+ ut_a((dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match)
+ || (dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match)
+ == order_col_no));
+ }
+ }
+}
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes of the tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+static
+void
+opt_search_plan_for_table(
+/*======================*/
+ sel_node_t* sel_node, /*!< in: parsed select node */
+ ulint i, /*!< in: this is the ith table */
+ dict_table_t* table) /*!< in: table */
+{
+ plan_t* plan;
+ dict_index_t* index;
+ dict_index_t* best_index;
+ ulint n_fields;
+ ulint goodness;
+ ulint last_op = 75946965; /* Eliminate a Purify
+ warning */
+ ulint best_goodness;
+ ulint best_last_op = 0; /* remove warning */
+ que_node_t* index_plan[256];
+ que_node_t* best_index_plan[256];
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ plan->table = table;
+ plan->asc = sel_node->asc;
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+
+ /* Calculate goodness for each index of the table */
+
+ index = dict_table_get_first_index(table);
+ best_index = index; /* Eliminate compiler warning */
+ best_goodness = 0;
+
+ /* should be do ... until ?
comment by Jani */
+ while (index) {
+ goodness = opt_calc_index_goodness(index, sel_node, i,
+ index_plan, &last_op);
+ if (goodness > best_goodness) {
+
+ best_index = index;
+ best_goodness = goodness;
+ n_fields = opt_calc_n_fields_from_goodness(goodness);
+
+ ut_memcpy(best_index_plan, index_plan,
+ n_fields * sizeof(void*));
+ best_last_op = last_op;
+ }
+
+ dict_table_next_uncorrupted_index(index);
+ }
+
+ plan->index = best_index;
+
+ n_fields = opt_calc_n_fields_from_goodness(best_goodness);
+
+ if (n_fields == 0) {
+ plan->tuple = NULL;
+ plan->n_exact_match = 0;
+ } else {
+ plan->tuple = dtuple_create(pars_sym_tab_global->heap,
+ n_fields);
+ dict_index_copy_types(plan->tuple, plan->index, n_fields);
+
+ plan->tuple_exps = static_cast<que_node_t**>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap,
+ n_fields * sizeof(void*)));
+
+ ut_memcpy(plan->tuple_exps, best_index_plan,
+ n_fields * sizeof(void*));
+ if (best_last_op == '='
+ || best_last_op == PARS_LIKE_TOKEN_EXACT
+ || best_last_op == PARS_LIKE_TOKEN_PREFIX
+ || best_last_op == PARS_LIKE_TOKEN_SUFFIX
+ || best_last_op == PARS_LIKE_TOKEN_SUBSTR) {
+ plan->n_exact_match = n_fields;
+ } else {
+ plan->n_exact_match = n_fields - 1;
+ }
+
+ plan->mode = opt_op_to_search_mode(sel_node->asc,
+ best_last_op);
+ }
+
+ if (dict_index_is_clust(best_index)
+ && (plan->n_exact_match >= dict_index_get_n_unique(best_index))) {
+
+ plan->unique_search = TRUE;
+ } else {
+ plan->unique_search = FALSE;
+ }
+
+ plan->old_vers_heap = NULL;
+
+ btr_pcur_init(&(plan->pcur));
+ btr_pcur_init(&(plan->clust_pcur));
+}
+
+/*******************************************************************//**
+Looks at a comparison condition and decides if it can, and needs to, be
+tested for a table AFTER the table has been accessed.
+@return OPT_NOT_COND if not for this table, else OPT_END_COND,
+OPT_TEST_COND, or OPT_SCROLL_COND, where the last means that the
+condition need not be tested, except when scroll cursors are used */
+static
+ulint
+opt_classify_comparison(
+/*====================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* cond) /*!< in: comparison condition */
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint op;
+ ulint j;
+
+ ut_ad(cond && sel_node);
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ /* Check if the condition is determined after the ith table has been
+ accessed, but not after the i - 1:th */
+
+ if (!opt_check_exp_determined_before(cond, sel_node, i + 1)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ if ((i > 0) && opt_check_exp_determined_before(cond, sel_node, i)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ /* If the condition is an exact match condition used in constructing
+ the search tuple, it is classified as OPT_END_COND */
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ for (j = 0; j < plan->n_exact_match; j++) {
+
+ if (opt_is_arg(plan->tuple_exps[j], cond)) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* If the condition is a non-exact match condition used in
+ constructing the search tuple, it is classified as OPT_SCROLL_COND.
+ When the cursor is positioned, and if a non-scroll cursor is used,
+ there is no need to test this condition; if a scroll cursor is used
+ the testing is necessary when the cursor is reversed.
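+ For instance, if the search tuple of a hypothetical ascending scan
+ was built from the conjunct a > 5, that condition positioned the
+ cursor and holds for every row reached by scrolling forward, but it
+ must be re-tested if a scroll cursor moves backwards past the
+ starting position.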
*/ + + if ((n_fields > plan->n_exact_match) + && opt_is_arg(plan->tuple_exps[n_fields - 1], cond)) { + + return(OPT_SCROLL_COND); + } + + /* If the condition is a non-exact match condition on the first field + in index for which there is no exact match, and it limits the search + range from the opposite side of the search tuple already BEFORE we + access the table, it is classified as OPT_END_COND */ + + if ((dict_index_get_n_fields(plan->index) > plan->n_exact_match) + && opt_look_for_col_in_comparison_before( + OPT_COMPARISON, + dict_index_get_nth_col_no(plan->index, + plan->n_exact_match), + cond, sel_node, i, &op)) { + + if (sel_node->asc && ((op == '<') || (op == PARS_LE_TOKEN))) { + + return(OPT_END_COND); + } + + if (!sel_node->asc && ((op == '>') || (op == PARS_GE_TOKEN))) { + + return(OPT_END_COND); + } + } + + /* Otherwise, cond is classified as OPT_TEST_COND */ + + return(OPT_TEST_COND); +} + +/*******************************************************************//** +Recursively looks for test conditions for a table in a join. */ +static +void +opt_find_test_conds( +/*================*/ + sel_node_t* sel_node, /*!< in: select node */ + ulint i, /*!< in: ith table in the join */ + func_node_t* cond) /*!< in: conjunction of search + conditions or NULL */ +{ + func_node_t* new_cond; + ulint fclass; + plan_t* plan; + + if (cond == NULL) { + + return; + } + + if (cond->func == PARS_AND_TOKEN) { + new_cond = static_cast<func_node_t*>(cond->args); + + opt_find_test_conds(sel_node, i, new_cond); + + new_cond = static_cast<func_node_t*>( + que_node_get_next(new_cond)); + + opt_find_test_conds(sel_node, i, new_cond); + + return; + } + + plan = sel_node_get_nth_plan(sel_node, i); + + fclass = opt_classify_comparison(sel_node, i, cond); + + if (fclass == OPT_END_COND) { + UT_LIST_ADD_LAST(cond_list, plan->end_conds, cond); + + } else if (fclass == OPT_TEST_COND) { + UT_LIST_ADD_LAST(cond_list, plan->other_conds, cond); + + } +} + +/*******************************************************************//** +Normalizes a list of comparison conditions so that a column of the table +appears on the left side of the comparison if possible. This is accomplished +by switching the arguments of the operator. */ +static +void +opt_normalize_cmp_conds( +/*====================*/ + func_node_t* cond, /*!< in: first in a list of comparison + conditions, or NULL */ + dict_table_t* table) /*!< in: table */ +{ + que_node_t* arg1; + que_node_t* arg2; + sym_node_t* sym_node; + + while (cond) { + arg1 = cond->args; + arg2 = que_node_get_next(arg1); + + if (que_node_get_type(arg2) == QUE_NODE_SYMBOL) { + + sym_node = static_cast<sym_node_t*>(arg2); + + if ((sym_node->token_type == SYM_COLUMN) + && (sym_node->table == table)) { + + /* Switch the order of the arguments */ + + cond->args = arg2; + que_node_list_add_last(NULL, arg2); + que_node_list_add_last(arg2, arg1); + + /* Invert the operator */ + cond->func = opt_invert_cmp_op(cond->func); + } + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } +} + +/*******************************************************************//** +Finds out the search condition conjuncts we can, and need, to test as the ith +table in a join is accessed. The search tuple can eliminate the need to test +some conjuncts. 
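+For example, with a hypothetical condition a = 5 AND b < 9 AND c + 1 < 10
+and an ascending scan on an index on (a, b): a = 5 builds the search
+tuple, b < 9 becomes an end condition that terminates the scan, and
+c + 1 < 10 must be tested separately for every fetched row.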
*/
+static
+void
+opt_determine_and_normalize_test_conds(
+/*===================================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i) /*!< in: ith table in the join */
+{
+ plan_t* plan;
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ UT_LIST_INIT(plan->end_conds);
+ UT_LIST_INIT(plan->other_conds);
+
+ /* Recursively go through the conjuncts and classify them */
+
+ opt_find_test_conds(
+ sel_node,
+ i,
+ static_cast<func_node_t*>(sel_node->search_cond));
+
+ opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds),
+ plan->table);
+
+ ut_a(UT_LIST_GET_LEN(plan->end_conds) >= plan->n_exact_match);
+}
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index of the table to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp) /*!< in: expression or condition or
+ NULL */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* col_node;
+ ulint col_pos;
+
+ if (exp == NULL) {
+
+ return;
+ }
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp);
+
+ for (arg = func_node->args;
+ arg != 0;
+ arg = que_node_get_next(arg)) {
+
+ opt_find_all_cols(
+ copy_val, index, col_list, plan, arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp);
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return;
+ }
+
+ if (sym_node->table != index->table) {
+
+ return;
+ }
+
+ /* Look for an occurrence of the same column in the plan column
+ list */
+
+ col_node = UT_LIST_GET_FIRST(*col_list);
+
+ while (col_node) {
+ if (col_node->col_no == sym_node->col_no) {
+
+ if (col_node == sym_node) {
+ /* sym_node was already in a list: do
+ nothing */
+
+ return;
+ }
+
+ /* Put an indirection */
+ sym_node->indirection = col_node;
+ sym_node->alias = col_node;
+
+ return;
+ }
+
+ col_node = UT_LIST_GET_NEXT(col_var_list, col_node);
+ }
+
+ /* The same column did not occur in the list: add it */
+
+ UT_LIST_ADD_LAST(col_var_list, *col_list, sym_node);
+
+ sym_node->copy_val = copy_val;
+
+ /* Fill in the field_no fields in sym_node */
+
+ sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos(
+ dict_table_get_first_index(index->table), sym_node->col_no);
+ if (!dict_index_is_clust(index)) {
+
+ ut_a(plan);
+
+ col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no);
+
+ if (col_pos == ULINT_UNDEFINED) {
+
+ plan->must_get_clust = TRUE;
+ }
+
+ sym_node->field_nos[SYM_SEC_FIELD_NO] = col_pos;
+ }
+}
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in conditions which are
+not yet determined AFTER the join operation has fetched a row in the ith
+table. The values for these columns must be copied to dynamic memory for
+later use.
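+For example, in a hypothetical join SELECT ... FROM t1, t2 WHERE t1.a = t2.b,
+the conjunct can only be evaluated once a row of t2 has been fetched;
+the value of t1.a must therefore be copied when the row of t1 is read,
+as the page latch on the t1 record is no longer held at that point.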
*/ +static +void +opt_find_copy_cols( +/*===============*/ + sel_node_t* sel_node, /*!< in: select node */ + ulint i, /*!< in: ith table in the join */ + func_node_t* search_cond) /*!< in: search condition or NULL */ +{ + func_node_t* new_cond; + plan_t* plan; + + if (search_cond == NULL) { + + return; + } + + ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC); + + if (search_cond->func == PARS_AND_TOKEN) { + new_cond = static_cast<func_node_t*>(search_cond->args); + + opt_find_copy_cols(sel_node, i, new_cond); + + new_cond = static_cast<func_node_t*>( + que_node_get_next(new_cond)); + + opt_find_copy_cols(sel_node, i, new_cond); + + return; + } + + if (!opt_check_exp_determined_before(search_cond, sel_node, i + 1)) { + + /* Any ith table columns occurring in search_cond should be + copied, as this condition cannot be tested already on the + fetch from the ith table */ + + plan = sel_node_get_nth_plan(sel_node, i); + + opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan, + search_cond); + } +} + +/*******************************************************************//** +Classifies the table columns according to whether we use the column only while +holding the latch on the page, or whether we have to copy the column value to +dynamic memory. Puts the first occurrence of a column to either list in the +plan node, and puts indirections to later occurrences of the column. */ +static +void +opt_classify_cols( +/*==============*/ + sel_node_t* sel_node, /*!< in: select node */ + ulint i) /*!< in: ith table in the join */ +{ + plan_t* plan; + que_node_t* exp; + + plan = sel_node_get_nth_plan(sel_node, i); + + /* The final value of the following field will depend on the + environment of the select statement: */ + + plan->must_get_clust = FALSE; + + UT_LIST_INIT(plan->columns); + + /* All select list columns should be copied: therefore TRUE as the + first argument */ + + for (exp = sel_node->select_list; + exp != 0; + exp = que_node_get_next(exp)) { + + opt_find_all_cols( + TRUE, plan->index, &(plan->columns), plan, exp); + } + + opt_find_copy_cols( + sel_node, i, static_cast<func_node_t*>(sel_node->search_cond)); + + /* All remaining columns in the search condition are temporary + columns: therefore FALSE */ + + opt_find_all_cols( + FALSE, plan->index, &plan->columns, plan, + static_cast<func_node_t*>(sel_node->search_cond)); +} + +/*******************************************************************//** +Fills in the info in plan which is used in accessing a clustered index +record. The columns must already be classified for the plan node. 
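+For a secondary index plan, the clust_ref and clust_map built below
+record, for each field of the clustered index key, where the same
+column sits in the secondary index record, so that the clustered index
+record can be looked up from a fetched secondary index record.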
*/
+static
+void
+opt_clust_access(
+/*=============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint n) /*!< in: nth table in select */
+{
+ plan_t* plan;
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dict_index_t* index;
+ mem_heap_t* heap;
+ ulint n_fields;
+ ulint pos;
+ ulint i;
+
+ plan = sel_node_get_nth_plan(sel_node, n);
+
+ index = plan->index;
+
+ /* The final value of the following field depends on the environment
+ of the select statement: */
+
+ plan->no_prefetch = FALSE;
+
+ if (dict_index_is_clust(index)) {
+ plan->clust_map = NULL;
+ plan->clust_ref = NULL;
+
+ return;
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(clust_index);
+
+ heap = pars_sym_tab_global->heap;
+
+ plan->clust_ref = dtuple_create(heap, n_fields);
+
+ dict_index_copy_types(plan->clust_ref, clust_index, n_fields);
+
+ plan->clust_map = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_fields * sizeof(ulint)));
+
+ for (i = 0; i < n_fields; i++) {
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ /* We optimize here only queries to InnoDB's internal system
+ tables, and they should not contain column prefix indexes. */
+
+ if (dict_index_get_nth_field(index, pos)->prefix_len != 0
+ || dict_index_get_nth_field(clust_index, i)
+ ->prefix_len != 0) {
+ fprintf(stderr,
+ "InnoDB: Error in pars0opt.cc:"
+ " table %s has prefix_len != 0\n",
+ index->table_name);
+ }
+
+ *(plan->clust_map + i) = pos;
+
+ ut_ad(pos != ULINT_UNDEFINED);
+ }
+}
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes of the tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement.
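+The optimization proceeds in two passes over the table list: the first
+pass picks an index and the testable condition conjuncts for each
+table, the second classifies the referenced columns and prepares
+clustered index access; finally the plan is checked against a possible
+order-by clause.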
*/ +UNIV_INTERN +void +opt_search_plan( +/*============*/ + sel_node_t* sel_node) /*!< in: parsed select node */ +{ + sym_node_t* table_node; + dict_table_t* table; + order_node_t* order_by; + ulint i; + + sel_node->plans = static_cast<plan_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, + sel_node->n_tables * sizeof(plan_t))); + + /* Analyze the search condition to find out what we know at each + join stage about the conditions that the columns of a table should + satisfy */ + + table_node = sel_node->table_list; + + if (sel_node->order_by == NULL) { + sel_node->asc = TRUE; + } else { + order_by = sel_node->order_by; + + sel_node->asc = order_by->asc; + } + + for (i = 0; i < sel_node->n_tables; i++) { + + table = table_node->table; + + /* Choose index through which to access the table */ + + opt_search_plan_for_table(sel_node, i, table); + + /* Determine the search condition conjuncts we can test at + this table; normalize the end conditions */ + + opt_determine_and_normalize_test_conds(sel_node, i); + + table_node = static_cast<sym_node_t*>( + que_node_get_next(table_node)); + } + + table_node = sel_node->table_list; + + for (i = 0; i < sel_node->n_tables; i++) { + + /* Classify the table columns into those we only need to access + but not copy, and to those we must copy to dynamic memory */ + + opt_classify_cols(sel_node, i); + + /* Calculate possible info for accessing the clustered index + record */ + + opt_clust_access(sel_node, i); + + table_node = static_cast<sym_node_t*>( + que_node_get_next(table_node)); + } + + /* Check that the plan obeys a possible order-by clause: if not, + an assertion error occurs */ + + opt_check_order_by(sel_node); + +#ifdef UNIV_SQL_DEBUG + opt_print_query_plan(sel_node); +#endif +} + +/********************************************************************//** +Prints info of a query plan. */ +UNIV_INTERN +void +opt_print_query_plan( +/*=================*/ + sel_node_t* sel_node) /*!< in: select node */ +{ + plan_t* plan; + ulint n_fields; + ulint i; + + fputs("QUERY PLAN FOR A SELECT NODE\n", stderr); + + fputs(sel_node->asc ? "Asc. search; " : "Desc. search; ", stderr); + + if (sel_node->set_x_locks) { + fputs("sets row x-locks; ", stderr); + ut_a(sel_node->row_lock_mode == LOCK_X); + ut_a(!sel_node->consistent_read); + } else if (sel_node->consistent_read) { + fputs("consistent read; ", stderr); + } else { + ut_a(sel_node->row_lock_mode == LOCK_S); + fputs("sets row s-locks; ", stderr); + } + + putc('\n', stderr); + + for (i = 0; i < sel_node->n_tables; i++) { + plan = sel_node_get_nth_plan(sel_node, i); + + if (plan->tuple) { + n_fields = dtuple_get_n_fields(plan->tuple); + } else { + n_fields = 0; + } + + fputs("Table ", stderr); + dict_index_name_print(stderr, NULL, plan->index); + fprintf(stderr,"; exact m. %lu, match %lu, end conds %lu\n", + (unsigned long) plan->n_exact_match, + (unsigned long) n_fields, + (unsigned long) UT_LIST_GET_LEN(plan->end_conds)); + } +} diff --git a/storage/xtradb/pars/pars0pars.cc b/storage/xtradb/pars/pars0pars.cc new file mode 100644 index 00000000000..f051481184b --- /dev/null +++ b/storage/xtradb/pars/pars0pars.cc @@ -0,0 +1,2668 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, +Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/**************************************************//** +@file pars/pars0pars.c +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ + +/* Historical note: Innobase executed its first SQL string (CREATE TABLE) +on 1/27/1998 */ + +#include "pars0pars.h" + +#ifdef UNIV_NONINL +#include "pars0pars.ic" +#endif + +#include "row0sel.h" +#include "row0ins.h" +#include "row0upd.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "dict0crea.h" +#include "que0que.h" +#include "pars0grm.h" +#include "pars0opt.h" +#include "data0data.h" +#include "data0type.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "lock0lock.h" +#include "eval0eval.h" + +#ifdef UNIV_SQL_DEBUG +/** If the following is set TRUE, the lexer will print the SQL string +as it tokenizes it */ +UNIV_INTERN ibool pars_print_lexed = FALSE; +#endif /* UNIV_SQL_DEBUG */ + +/* Global variable used while parsing a single procedure or query : the code is +NOT re-entrant */ +UNIV_INTERN sym_tab_t* pars_sym_tab_global; + +/* Global variables used to denote certain reserved words, used in +constructing the parsing tree */ + +UNIV_INTERN pars_res_word_t pars_to_char_token = {PARS_TO_CHAR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_to_number_token = {PARS_TO_NUMBER_TOKEN}; +UNIV_INTERN pars_res_word_t pars_to_binary_token = {PARS_TO_BINARY_TOKEN}; +UNIV_INTERN pars_res_word_t pars_binary_to_number_token = {PARS_BINARY_TO_NUMBER_TOKEN}; +UNIV_INTERN pars_res_word_t pars_substr_token = {PARS_SUBSTR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_replstr_token = {PARS_REPLSTR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_concat_token = {PARS_CONCAT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_instr_token = {PARS_INSTR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_length_token = {PARS_LENGTH_TOKEN}; +UNIV_INTERN pars_res_word_t pars_sysdate_token = {PARS_SYSDATE_TOKEN}; +UNIV_INTERN pars_res_word_t pars_printf_token = {PARS_PRINTF_TOKEN}; +UNIV_INTERN pars_res_word_t pars_assert_token = {PARS_ASSERT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_rnd_token = {PARS_RND_TOKEN}; +UNIV_INTERN pars_res_word_t pars_rnd_str_token = {PARS_RND_STR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_count_token = {PARS_COUNT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_sum_token = {PARS_SUM_TOKEN}; +UNIV_INTERN pars_res_word_t pars_distinct_token = {PARS_DISTINCT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_binary_token = {PARS_BINARY_TOKEN}; +UNIV_INTERN pars_res_word_t pars_blob_token = {PARS_BLOB_TOKEN}; +UNIV_INTERN pars_res_word_t pars_int_token = {PARS_INT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_bigint_token = {PARS_BIGINT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_char_token = {PARS_CHAR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_float_token = {PARS_FLOAT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_update_token = {PARS_UPDATE_TOKEN}; +UNIV_INTERN pars_res_word_t pars_asc_token = {PARS_ASC_TOKEN}; +UNIV_INTERN pars_res_word_t pars_desc_token = {PARS_DESC_TOKEN}; +UNIV_INTERN pars_res_word_t pars_open_token = 
{PARS_OPEN_TOKEN}; +UNIV_INTERN pars_res_word_t pars_close_token = {PARS_CLOSE_TOKEN}; +UNIV_INTERN pars_res_word_t pars_share_token = {PARS_SHARE_TOKEN}; +UNIV_INTERN pars_res_word_t pars_unique_token = {PARS_UNIQUE_TOKEN}; +UNIV_INTERN pars_res_word_t pars_clustered_token = {PARS_CLUSTERED_TOKEN}; + +/** Global variable used to denote the '*' in SELECT * FROM.. */ +UNIV_INTERN ulint pars_star_denoter = 12345678; + +/******************************************************************** +Get user function with the given name.*/ +UNIV_INLINE +pars_user_func_t* +pars_info_lookup_user_func( +/*=======================*/ + /* out: user func, or NULL if not + found */ + pars_info_t* info, /* in: info struct */ + const char* name) /* in: function name to find*/ +{ + if (info && info->funcs) { + ulint i; + ib_vector_t* vec = info->funcs; + + for (i = 0; i < ib_vector_size(vec); i++) { + pars_user_func_t* puf; + + puf = static_cast<pars_user_func_t*>( + ib_vector_get(vec, i)); + + if (strcmp(puf->name, name) == 0) { + return(puf); + } + } + } + + return(NULL); +} + +/******************************************************************** +Get bound identifier with the given name.*/ +UNIV_INLINE +pars_bound_id_t* +pars_info_lookup_bound_id( +/*======================*/ + /* out: bound literal, or NULL if + not found */ + pars_info_t* info, /* in: info struct */ + const char* name) /* in: bound literal name to find */ +{ + if (info && info->bound_ids) { + ulint i; + ib_vector_t* vec = info->bound_ids; + + for (i = 0; i < ib_vector_size(vec); i++) { + pars_bound_id_t* bid; + + bid = static_cast<pars_bound_id_t*>( + ib_vector_get(vec, i)); + + if (strcmp(bid->name, name) == 0) { + return(bid); + } + } + } + + return(NULL); +} + +/******************************************************************** +Get bound literal with the given name.*/ +UNIV_INLINE +pars_bound_lit_t* +pars_info_lookup_bound_lit( +/*=======================*/ + /* out: bound literal, or NULL if + not found */ + pars_info_t* info, /* in: info struct */ + const char* name) /* in: bound literal name to find */ +{ + if (info && info->bound_lits) { + ulint i; + ib_vector_t* vec = info->bound_lits; + + for (i = 0; i < ib_vector_size(vec); i++) { + pars_bound_lit_t* pbl; + + pbl = static_cast<pars_bound_lit_t*>( + ib_vector_get(vec, i)); + + if (strcmp(pbl->name, name) == 0) { + return(pbl); + } + } + } + + return(NULL); +} + +/*********************************************************************//** +Determines the class of a function code. +@return function class: PARS_FUNC_ARITH, ... */ +static +ulint +pars_func_get_class( +/*================*/ + int func) /*!< in: function code: '=', PARS_GE_TOKEN, ... 
*/ +{ + switch (func) { + case '+': case '-': case '*': case '/': + return(PARS_FUNC_ARITH); + + case '=': case '<': case '>': + case PARS_GE_TOKEN: case PARS_LE_TOKEN: case PARS_NE_TOKEN: + return(PARS_FUNC_CMP); + + case PARS_AND_TOKEN: case PARS_OR_TOKEN: case PARS_NOT_TOKEN: + return(PARS_FUNC_LOGICAL); + + case PARS_COUNT_TOKEN: case PARS_SUM_TOKEN: + return(PARS_FUNC_AGGREGATE); + + case PARS_TO_CHAR_TOKEN: + case PARS_TO_NUMBER_TOKEN: + case PARS_TO_BINARY_TOKEN: + case PARS_BINARY_TO_NUMBER_TOKEN: + case PARS_SUBSTR_TOKEN: + case PARS_CONCAT_TOKEN: + case PARS_LENGTH_TOKEN: + case PARS_INSTR_TOKEN: + case PARS_SYSDATE_TOKEN: + case PARS_NOTFOUND_TOKEN: + case PARS_PRINTF_TOKEN: + case PARS_ASSERT_TOKEN: + case PARS_RND_TOKEN: + case PARS_RND_STR_TOKEN: + case PARS_REPLSTR_TOKEN: + return(PARS_FUNC_PREDEFINED); + + default: + return(PARS_FUNC_OTHER); + } +} + +/*********************************************************************//** +Parses an operator or predefined function expression. +@return own: function node in a query tree */ +static +func_node_t* +pars_func_low( +/*==========*/ + int func, /*!< in: function token code */ + que_node_t* arg) /*!< in: first argument in the argument list */ +{ + func_node_t* node; + + node = static_cast<func_node_t*>( + mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t))); + + node->common.type = QUE_NODE_FUNC; + dfield_set_data(&(node->common.val), NULL, 0); + node->common.val_buf_size = 0; + + node->func = func; + + node->fclass = pars_func_get_class(func); + + node->args = arg; + + UT_LIST_ADD_LAST(func_node_list, pars_sym_tab_global->func_node_list, + node); + return(node); +} + +/*********************************************************************//** +Parses a function expression. +@return own: function node in a query tree */ +UNIV_INTERN +func_node_t* +pars_func( +/*======*/ + que_node_t* res_word,/*!< in: function name reserved word */ + que_node_t* arg) /*!< in: first argument in the argument list */ +{ + return(pars_func_low(((pars_res_word_t*) res_word)->code, arg)); +} + +/************************************************************************* +Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded +within the search string.*/ + +int +pars_like_rebind( +/*=============*/ + /* out, own: function node in a query tree */ + sym_node_t* node, /* in: The search string node.*/ + const byte* ptr, /* in: literal to (re) bind */ + ulint ptr_len)/* in: length of literal to (re) bind*/ +{ + dtype_t* dtype; + dfield_t* dfield; + ib_like_t op_check; + sym_node_t* like_node; + sym_node_t* str_node = NULL; + ib_like_t op = IB_LIKE_EXACT; + int func = PARS_LIKE_TOKEN_EXACT; + + /* Is this a STRING% ? */ + if (ptr[ptr_len - 1] == '%') { + op = IB_LIKE_PREFIX; + } + + /* Is this a '%STRING' or %STRING% ?*/ + if (*ptr == '%') { + op = (op == IB_LIKE_PREFIX) ? IB_LIKE_SUBSTR : IB_LIKE_SUFFIX; + } + + if (node->like_node == NULL) { + /* Add the LIKE operator info node to the node list. 
+ This will be used during the comparison phase to determine + how to match.*/ + like_node = sym_tab_add_int_lit(node->sym_table, op); + que_node_list_add_last(NULL, like_node); + node->like_node = like_node; + str_node = sym_tab_add_str_lit(node->sym_table, ptr, ptr_len); + que_node_list_add_last(like_node, str_node); + } else { + like_node = node->like_node; + + /* Change the value of the string in the existing + string node of like node */ + str_node = static_cast<sym_node_t*>( + que_node_list_get_last(like_node)); + + /* Must find the string node */ + ut_a(str_node); + ut_a(str_node != like_node); + ut_a(str_node->token_type == SYM_LIT); + + dfield = que_node_get_val(str_node); + dfield_set_data(dfield, ptr, ptr_len); + } + + dfield = que_node_get_val(like_node); + dtype = dfield_get_type(dfield); + + ut_a(dtype_get_mtype(dtype) == DATA_INT); + op_check = static_cast<ib_like_t>( + mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield)))); + + switch (op_check) { + case IB_LIKE_PREFIX: + case IB_LIKE_SUFFIX: + case IB_LIKE_SUBSTR: + case IB_LIKE_EXACT: + break; + + default: + ut_error; + } + + mach_write_to_4(static_cast<byte*>(dfield_get_data(dfield)), op); + + dfield = que_node_get_val(node); + + /* Adjust the length of the search value so the '%' is not + visible. Then create and add a search string node to the + search value node. Searching for %SUFFIX and %SUBSTR% requires + a full table scan and so we set the search value to ''. + For PREFIX% we simply remove the trailing '%'.*/ + + switch (op) { + case IB_LIKE_EXACT: + dfield = que_node_get_val(str_node); + dtype = dfield_get_type(dfield); + + ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR); + + dfield_set_data(dfield, ptr, ptr_len); + break; + + case IB_LIKE_PREFIX: + func = PARS_LIKE_TOKEN_PREFIX; + + /* Modify the original node */ + dfield_set_len(dfield, ptr_len - 1); + + dfield = que_node_get_val(str_node); + dtype = dfield_get_type(dfield); + + ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR); + + dfield_set_data(dfield, ptr, ptr_len - 1); + break; + + case IB_LIKE_SUFFIX: + func = PARS_LIKE_TOKEN_SUFFIX; + + /* Modify the original node */ + /* Make it an '' empty string */ + dfield_set_len(dfield, 0); + + dfield = que_node_get_val(str_node); + dtype = dfield_get_type(dfield); + + ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR); + + dfield_set_data(dfield, ptr + 1, ptr_len - 1); + break; + + case IB_LIKE_SUBSTR: + func = PARS_LIKE_TOKEN_SUBSTR; + + /* Modify the original node */ + /* Make it an '' empty string */ + dfield_set_len(dfield, 0); + + dfield = que_node_get_val(str_node); + dtype = dfield_get_type(dfield); + + ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR); + + dfield_set_data(dfield, ptr + 1, ptr_len - 2); + break; + + default: + ut_error; + } + + return(func); +} + +/************************************************************************* +Parses a LIKE operator expression. 
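+A literal 'abc' classifies as an exact match, 'abc%' as a prefix match,
+'%abc' as a suffix match and '%abc%' as a substring match; embedded '%'
+characters are ignored. Suffix and substring matches degrade to a full
+scan with an empty ('') search value, while for a prefix match only the
+trailing '%' is stripped.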
*/
+static
+int
+pars_like_op(
+/*=========*/
+ /* out, own: function node in a query tree */
+ que_node_t* arg) /* in: LIKE comparison string.*/
+{
+ char* ptr;
+ ulint ptr_len;
+ int func = PARS_LIKE_TOKEN_EXACT;
+ dfield_t* dfield = que_node_get_val(arg);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+ || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ ptr = static_cast<char*>(dfield_get_data(dfield));
+ ptr_len = strlen(ptr);
+
+ if (ptr_len) {
+
+ func = pars_like_rebind(
+ static_cast<sym_node_t*>(arg), (byte*) ptr, ptr_len);
+ }
+
+ return(func);
+}
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2) /*!< in: second argument or NULL for a unary
+ operator */
+{
+ que_node_list_add_last(NULL, arg1);
+
+ if (arg2) {
+ que_node_list_add_last(arg1, arg2);
+ }
+
+ /* We need to parse the string and determine whether it's a
+ PREFIX, SUFFIX or SUBSTRING comparison */
+ if (func == PARS_LIKE_TOKEN) {
+
+ ut_a(que_node_get_type(arg2) == QUE_NODE_SYMBOL);
+
+ func = pars_like_op(arg2);
+
+ ut_a(func == PARS_LIKE_TOKEN_EXACT
+ || func == PARS_LIKE_TOKEN_PREFIX
+ || func == PARS_LIKE_TOKEN_SUFFIX
+ || func == PARS_LIKE_TOKEN_SUBSTR);
+ }
+
+ return(pars_func_low(func, arg1));
+}
+
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc) /*!< in: &pars_asc_token or &pars_desc_token */
+{
+ order_node_t* node;
+
+ node = static_cast<order_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(order_node_t)));
+
+ node->common.type = QUE_NODE_ORDER;
+
+ node->column = column;
+
+ if (asc == &pars_asc_token) {
+ node->asc = TRUE;
+ } else {
+ ut_a(asc == &pars_desc_token);
+ node->asc = FALSE;
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Determine if a data type is a built-in string data type of the InnoDB
+SQL parser.
+@return TRUE if string data type */
+static
+ibool
+pars_is_string_type(
+/*================*/
+ ulint mtype) /*!< in: main data type */
+{
+ switch (mtype) {
+ case DATA_VARCHAR: case DATA_CHAR:
+ case DATA_FIXBINARY: case DATA_BINARY:
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Resolves the data type of a function in an expression. The argument data
+types must already be resolved.
*/ +static +void +pars_resolve_func_data_type( +/*========================*/ + func_node_t* node) /*!< in: function node */ +{ + que_node_t* arg; + + ut_a(que_node_get_type(node) == QUE_NODE_FUNC); + + arg = node->args; + + switch (node->func) { + case PARS_SUM_TOKEN: + case '+': case '-': case '*': case '/': + /* Inherit the data type from the first argument (which must + not be the SQL null literal whose type is DATA_ERROR) */ + + dtype_copy(que_node_get_data_type(node), + que_node_get_data_type(arg)); + + ut_a(dtype_get_mtype(que_node_get_data_type(node)) + == DATA_INT); + break; + + case PARS_COUNT_TOKEN: + ut_a(arg); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_TO_CHAR_TOKEN: + case PARS_RND_STR_TOKEN: + ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT); + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + DATA_ENGLISH, 0); + break; + + case PARS_TO_BINARY_TOKEN: + if (dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT) { + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + DATA_ENGLISH, 0); + } else { + dtype_set(que_node_get_data_type(node), DATA_BINARY, + 0, 0); + } + break; + + case PARS_TO_NUMBER_TOKEN: + case PARS_BINARY_TO_NUMBER_TOKEN: + case PARS_LENGTH_TOKEN: + case PARS_INSTR_TOKEN: + ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype)); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_SYSDATE_TOKEN: + ut_a(arg == NULL); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_SUBSTR_TOKEN: + case PARS_CONCAT_TOKEN: + ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype)); + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + DATA_ENGLISH, 0); + break; + + case '>': case '<': case '=': + case PARS_GE_TOKEN: + case PARS_LE_TOKEN: + case PARS_NE_TOKEN: + case PARS_AND_TOKEN: + case PARS_OR_TOKEN: + case PARS_NOT_TOKEN: + case PARS_NOTFOUND_TOKEN: + + /* We currently have no iboolean type: use integer type */ + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_RND_TOKEN: + ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_LIKE_TOKEN_EXACT: + case PARS_LIKE_TOKEN_PREFIX: + case PARS_LIKE_TOKEN_SUFFIX: + case PARS_LIKE_TOKEN_SUBSTR: + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + DATA_ENGLISH, 0); + break; + + default: + ut_error; + } +} + +/*********************************************************************//** +Resolves the meaning of variables in an expression and the data types of +functions. It is an error if some identifier cannot be resolved here. 
*/ +static +void +pars_resolve_exp_variables_and_types( +/*=================================*/ + sel_node_t* select_node, /*!< in: select node or NULL; if + this is not NULL then the variable + sym nodes are added to the + copy_variables list of select_node */ + que_node_t* exp_node) /*!< in: expression */ +{ + func_node_t* func_node; + que_node_t* arg; + sym_node_t* sym_node; + sym_node_t* node; + + ut_a(exp_node); + + if (que_node_get_type(exp_node) == QUE_NODE_FUNC) { + func_node = static_cast<func_node_t*>(exp_node); + + arg = func_node->args; + + while (arg) { + pars_resolve_exp_variables_and_types(select_node, arg); + + arg = que_node_get_next(arg); + } + + pars_resolve_func_data_type(func_node); + + return; + } + + ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL); + + sym_node = static_cast<sym_node_t*>(exp_node); + + if (sym_node->resolved) { + + return; + } + + /* Not resolved yet: look in the symbol table for a variable + or a cursor or a function with the same name */ + + node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list); + + while (node) { + if (node->resolved + && ((node->token_type == SYM_VAR) + || (node->token_type == SYM_CURSOR) + || (node->token_type == SYM_FUNCTION)) + && node->name + && (sym_node->name_len == node->name_len) + && (ut_memcmp(sym_node->name, node->name, + node->name_len) == 0)) { + + /* Found a variable or a cursor declared with + the same name */ + + break; + } + + node = UT_LIST_GET_NEXT(sym_list, node); + } + + if (!node) { + fprintf(stderr, "PARSER ERROR: Unresolved identifier %s\n", + sym_node->name); + } + + ut_a(node); + + sym_node->resolved = TRUE; + sym_node->token_type = SYM_IMPLICIT_VAR; + sym_node->alias = node; + sym_node->indirection = node; + + if (select_node) { + UT_LIST_ADD_LAST(col_var_list, select_node->copy_variables, + sym_node); + } + + dfield_set_type(que_node_get_val(sym_node), + que_node_get_data_type(node)); +} + +/*********************************************************************//** +Resolves the meaning of variables in an expression list. It is an error if +some identifier cannot be resolved here. Resolves also the data types of +functions. */ +static +void +pars_resolve_exp_list_variables_and_types( +/*======================================*/ + sel_node_t* select_node, /*!< in: select node or NULL */ + que_node_t* exp_node) /*!< in: expression list first node, or + NULL */ +{ + while (exp_node) { + pars_resolve_exp_variables_and_types(select_node, exp_node); + + exp_node = que_node_get_next(exp_node); + } +} + +/*********************************************************************//** +Resolves the columns in an expression. 
*/ +static +void +pars_resolve_exp_columns( +/*=====================*/ + sym_node_t* table_node, /*!< in: first node in a table list */ + que_node_t* exp_node) /*!< in: expression */ +{ + func_node_t* func_node; + que_node_t* arg; + sym_node_t* sym_node; + dict_table_t* table; + sym_node_t* t_node; + ulint n_cols; + ulint i; + + ut_a(exp_node); + + if (que_node_get_type(exp_node) == QUE_NODE_FUNC) { + func_node = static_cast<func_node_t*>(exp_node); + + arg = func_node->args; + + while (arg) { + pars_resolve_exp_columns(table_node, arg); + + arg = que_node_get_next(arg); + } + + return; + } + + ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL); + + sym_node = static_cast<sym_node_t*>(exp_node); + + if (sym_node->resolved) { + + return; + } + + /* Not resolved yet: look in the table list for a column with the + same name */ + + t_node = table_node; + + while (t_node) { + table = t_node->table; + + n_cols = dict_table_get_n_cols(table); + + for (i = 0; i < n_cols; i++) { + const dict_col_t* col + = dict_table_get_nth_col(table, i); + const char* col_name + = dict_table_get_col_name(table, i); + + if ((sym_node->name_len == ut_strlen(col_name)) + && (0 == ut_memcmp(sym_node->name, col_name, + sym_node->name_len))) { + /* Found */ + sym_node->resolved = TRUE; + sym_node->token_type = SYM_COLUMN; + sym_node->table = table; + sym_node->col_no = i; + sym_node->prefetch_buf = NULL; + + dict_col_copy_type( + col, + dfield_get_type(&sym_node + ->common.val)); + + return; + } + } + + t_node = static_cast<sym_node_t*>(que_node_get_next(t_node)); + } +} + +/*********************************************************************//** +Resolves the meaning of columns in an expression list. */ +static +void +pars_resolve_exp_list_columns( +/*==========================*/ + sym_node_t* table_node, /*!< in: first node in a table list */ + que_node_t* exp_node) /*!< in: expression list first node, or + NULL */ +{ + while (exp_node) { + pars_resolve_exp_columns(table_node, exp_node); + + exp_node = que_node_get_next(exp_node); + } +} + +/*********************************************************************//** +Retrieves the table definition for a table name id. */ +static +void +pars_retrieve_table_def( +/*====================*/ + sym_node_t* sym_node) /*!< in: table node */ +{ + ut_a(sym_node); + ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + /* Open the table only if it is not already opened. */ + if (sym_node->token_type != SYM_TABLE_REF_COUNTED) { + + ut_a(sym_node->table == NULL); + + sym_node->resolved = TRUE; + sym_node->token_type = SYM_TABLE_REF_COUNTED; + + sym_node->table = dict_table_open_on_name( + sym_node->name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); + + ut_a(sym_node->table != NULL); + } +} + +/*********************************************************************//** +Retrieves the table definitions for a list of table name ids. +@return number of tables */ +static +ulint +pars_retrieve_table_list_defs( +/*==========================*/ + sym_node_t* sym_node) /*!< in: first table node in list */ +{ + ulint count = 0; + + if (sym_node == NULL) { + + return(count); + } + + while (sym_node) { + pars_retrieve_table_def(sym_node); + + count++; + + sym_node = static_cast<sym_node_t*>( + que_node_get_next(sym_node)); + } + + return(count); +} + +/*********************************************************************//** +Adds all columns to the select list if the query is SELECT * FROM ... 
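+(during parsing the '*' is denoted by the address of the dummy variable
+pars_star_denoter; here it is replaced by one id node per user column
+of each table in the table list).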
*/
+static
+void
+pars_select_all_columns(
+/*====================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the table list */
+{
+ sym_node_t* col_node;
+ sym_node_t* table_node;
+ dict_table_t* table;
+ ulint i;
+
+ select_node->select_list = NULL;
+
+ table_node = select_node->table_list;
+
+ while (table_node) {
+ table = table_node->table;
+
+ for (i = 0; i < dict_table_get_n_user_cols(table); i++) {
+ const char* col_name = dict_table_get_col_name(
+ table, i);
+
+ col_node = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) col_name,
+ ut_strlen(col_name));
+
+ select_node->select_list = que_node_list_add_last(
+ select_node->select_list, col_node);
+ }
+
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node));
+ }
+}
+
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list) /*!< in: variables list or NULL */
+{
+ sel_node_t* node;
+
+ node = sel_node_create(pars_sym_tab_global->heap);
+
+ node->select_list = select_list;
+ node->into_list = into_list;
+
+ pars_resolve_exp_list_variables_and_types(NULL, into_list);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Checks if the query is an aggregate query, in which case the select list must
+contain only aggregate function items. */
+static
+void
+pars_check_aggregate(
+/*=================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the select list */
+{
+ que_node_t* exp_node;
+ func_node_t* func_node;
+ ulint n_nodes = 0;
+ ulint n_aggregate_nodes = 0;
+
+ exp_node = select_node->select_list;
+
+ while (exp_node) {
+
+ n_nodes++;
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+
+ func_node = static_cast<func_node_t*>(exp_node);
+
+ if (func_node->fclass == PARS_FUNC_AGGREGATE) {
+
+ n_aggregate_nodes++;
+ }
+ }
+
+ exp_node = que_node_get_next(exp_node);
+ }
+
+ if (n_aggregate_nodes > 0) {
+ ut_a(n_nodes == n_aggregate_nodes);
+
+ select_node->is_aggregate = TRUE;
+ } else {
+ select_node->is_aggregate = FALSE;
+ }
+}
+
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* lock_shared, /*!< in: NULL or &pars_share_token */
+ order_node_t* order_by) /*!< in: NULL or an order-by node */
+{
+ select_node->state = SEL_NODE_OPEN;
+
+ select_node->table_list = table_list;
+ select_node->n_tables = pars_retrieve_table_list_defs(table_list);
+
+ if (select_node->select_list == &pars_star_denoter) {
+
+ /* SELECT * FROM ...
*/ + pars_select_all_columns(select_node); + } + + if (select_node->into_list) { + ut_a(que_node_list_get_len(select_node->into_list) + == que_node_list_get_len(select_node->select_list)); + } + + UT_LIST_INIT(select_node->copy_variables); + + pars_resolve_exp_list_columns(table_list, select_node->select_list); + pars_resolve_exp_list_variables_and_types(select_node, + select_node->select_list); + pars_check_aggregate(select_node); + + select_node->search_cond = search_cond; + + if (search_cond) { + pars_resolve_exp_columns(table_list, search_cond); + pars_resolve_exp_variables_and_types(select_node, search_cond); + } + + if (for_update) { + ut_a(!lock_shared); + + select_node->set_x_locks = TRUE; + select_node->row_lock_mode = LOCK_X; + + select_node->consistent_read = FALSE; + select_node->read_view = NULL; + } else if (lock_shared){ + select_node->set_x_locks = FALSE; + select_node->row_lock_mode = LOCK_S; + + select_node->consistent_read = FALSE; + select_node->read_view = NULL; + } else { + select_node->set_x_locks = FALSE; + select_node->row_lock_mode = LOCK_S; + + select_node->consistent_read = TRUE; + } + + select_node->order_by = order_by; + + if (order_by) { + pars_resolve_exp_columns(table_list, order_by->column); + } + + /* The final value of the following fields depend on the environment + where the select statement appears: */ + + select_node->can_get_updated = FALSE; + select_node->explicit_cursor = NULL; + + opt_search_plan(select_node); + + return(select_node); +} + +/*********************************************************************//** +Parses a cursor declaration. +@return sym_node */ +UNIV_INTERN +que_node_t* +pars_cursor_declaration( +/*====================*/ + sym_node_t* sym_node, /*!< in: cursor id node in the symbol + table */ + sel_node_t* select_node) /*!< in: select node */ +{ + sym_node->resolved = TRUE; + sym_node->token_type = SYM_CURSOR; + sym_node->cursor_def = select_node; + + select_node->state = SEL_NODE_CLOSED; + select_node->explicit_cursor = sym_node; + + return(sym_node); +} + +/*********************************************************************//** +Parses a function declaration. +@return sym_node */ +UNIV_INTERN +que_node_t* +pars_function_declaration( +/*======================*/ + sym_node_t* sym_node) /*!< in: function id node in the symbol + table */ +{ + sym_node->resolved = TRUE; + sym_node->token_type = SYM_FUNCTION; + + /* Check that the function exists. */ + ut_a(pars_info_lookup_user_func( + pars_sym_tab_global->info, sym_node->name)); + + return(sym_node); +} + +/*********************************************************************//** +Parses a delete or update statement start. +@return own: update node in a query tree */ +UNIV_INTERN +upd_node_t* +pars_update_statement_start( +/*========================*/ + ibool is_delete, /*!< in: TRUE if delete */ + sym_node_t* table_sym, /*!< in: table name node */ + col_assign_node_t* col_assign_list)/*!< in: column assignment list, NULL + if delete */ +{ + upd_node_t* node; + + node = upd_node_create(pars_sym_tab_global->heap); + + node->is_delete = is_delete; + + node->table_sym = table_sym; + node->col_assign_list = col_assign_list; + + return(node); +} + +/*********************************************************************//** +Parses a column assignment in an update. 
+@return column assignment node */ +UNIV_INTERN +col_assign_node_t* +pars_column_assignment( +/*===================*/ + sym_node_t* column, /*!< in: column to assign */ + que_node_t* exp) /*!< in: value to assign */ +{ + col_assign_node_t* node; + + node = static_cast<col_assign_node_t*>( + mem_heap_alloc(pars_sym_tab_global->heap, + sizeof(col_assign_node_t))); + node->common.type = QUE_NODE_COL_ASSIGNMENT; + + node->col = column; + node->val = exp; + + return(node); +} + +/*********************************************************************//** +Processes an update node assignment list. */ +static +void +pars_process_assign_list( +/*=====================*/ + upd_node_t* node) /*!< in: update node */ +{ + col_assign_node_t* col_assign_list; + sym_node_t* table_sym; + col_assign_node_t* assign_node; + upd_field_t* upd_field; + dict_index_t* clust_index; + sym_node_t* col_sym; + ulint changes_ord_field; + ulint changes_field_size; + ulint n_assigns; + ulint i; + + table_sym = node->table_sym; + col_assign_list = static_cast<col_assign_node_t*>( + node->col_assign_list); + clust_index = dict_table_get_first_index(node->table); + + assign_node = col_assign_list; + n_assigns = 0; + + while (assign_node) { + pars_resolve_exp_columns(table_sym, assign_node->col); + pars_resolve_exp_columns(table_sym, assign_node->val); + pars_resolve_exp_variables_and_types(NULL, assign_node->val); +#if 0 + ut_a(dtype_get_mtype( + dfield_get_type(que_node_get_val( + assign_node->col))) + == dtype_get_mtype( + dfield_get_type(que_node_get_val( + assign_node->val)))); +#endif + + /* Add to the update node all the columns found in assignment + values as columns to copy: therefore, TRUE */ + + opt_find_all_cols(TRUE, clust_index, &(node->columns), NULL, + assign_node->val); + n_assigns++; + + assign_node = static_cast<col_assign_node_t*>( + que_node_get_next(assign_node)); + } + + node->update = upd_create(n_assigns, pars_sym_tab_global->heap); + + assign_node = col_assign_list; + + changes_field_size = UPD_NODE_NO_SIZE_CHANGE; + + for (i = 0; i < n_assigns; i++) { + upd_field = upd_get_nth_field(node->update, i); + + col_sym = assign_node->col; + + upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos( + clust_index, col_sym->col_no), + clust_index, NULL); + upd_field->exp = assign_node->val; + + if (!dict_col_get_fixed_size( + dict_index_get_nth_col(clust_index, + upd_field->field_no), + dict_table_is_comp(node->table))) { + changes_field_size = 0; + } + + assign_node = static_cast<col_assign_node_t*>( + que_node_get_next(assign_node)); + } + + /* Find out if the update can modify an ordering field in any index */ + + changes_ord_field = UPD_NODE_NO_ORD_CHANGE; + + if (row_upd_changes_some_index_ord_field_binary(node->table, + node->update)) { + changes_ord_field = 0; + } + + node->cmpl_info = changes_ord_field | changes_field_size; +} + +/*********************************************************************//** +Parses an update or delete statement. 
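+A searched update or delete (one with a WHERE search condition) is
+compiled into an update node whose embedded select node fetches the
+rows with row x-locks, whereas a positioned update through a declared
+cursor reuses the select node of that cursor.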
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond) /*!< in: search condition or NULL */
+{
+ sym_node_t* table_sym;
+ sel_node_t* sel_node;
+ plan_t* plan;
+
+ table_sym = node->table_sym;
+
+ pars_retrieve_table_def(table_sym);
+ node->table = table_sym->table;
+
+ UT_LIST_INIT(node->columns);
+
+ /* Make the single table node into a list of table nodes of length 1 */
+
+ que_node_list_add_last(NULL, table_sym);
+
+ if (cursor_sym) {
+ pars_resolve_exp_variables_and_types(NULL, cursor_sym);
+
+ sel_node = cursor_sym->alias->cursor_def;
+
+ node->searched_update = FALSE;
+ } else {
+ sel_node = pars_select_list(NULL, NULL);
+
+ pars_select_statement(sel_node, table_sym, search_cond, NULL,
+ &pars_share_token, NULL);
+ node->searched_update = TRUE;
+ sel_node->common.parent = node;
+ }
+
+ node->select = sel_node;
+
+ ut_a(!node->is_delete || (node->col_assign_list == NULL));
+ ut_a(node->is_delete || (node->col_assign_list != NULL));
+
+ if (node->is_delete) {
+ node->cmpl_info = 0;
+ } else {
+ pars_process_assign_list(node);
+ }
+
+ if (node->searched_update) {
+ node->has_clust_rec_x_lock = TRUE;
+ sel_node->set_x_locks = TRUE;
+ sel_node->row_lock_mode = LOCK_X;
+ } else {
+ node->has_clust_rec_x_lock = sel_node->set_x_locks;
+ }
+
+ ut_a(sel_node->n_tables == 1);
+ ut_a(sel_node->consistent_read == FALSE);
+ ut_a(sel_node->order_by == NULL);
+ ut_a(sel_node->is_aggregate == FALSE);
+
+ sel_node->can_get_updated = TRUE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ plan = sel_node_get_nth_plan(sel_node, 0);
+
+ plan->no_prefetch = TRUE;
+
+ if (!dict_index_is_clust(plan->index)) {
+
+ plan->must_get_clust = TRUE;
+
+ node->pcur = &(plan->clust_pcur);
+ } else {
+ node->pcur = &(plan->pcur);
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an insert statement.
+@return own: insert node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select) /*!< in: select condition or NULL */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ ulint ins_type;
+
+ ut_a(values_list || select);
+ ut_a(!values_list || !select);
+
+ if (values_list) {
+ ins_type = INS_VALUES;
+ } else {
+ ins_type = INS_SEARCHED;
+ }
+
+ pars_retrieve_table_def(table_sym);
+
+ node = ins_node_create(ins_type, table_sym->table,
+ pars_sym_tab_global->heap);
+
+ row = dtuple_create(pars_sym_tab_global->heap,
+ dict_table_get_n_cols(node->table));
+
+ dict_table_copy_types(row, table_sym->table);
+
+ ins_node_set_new_row(node, row);
+
+ node->select = select;
+
+ if (select) {
+ select->common.parent = node;
+
+ ut_a(que_node_list_get_len(select->select_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ node->values_list = values_list;
+
+ if (node->values_list) {
+ pars_resolve_exp_list_variables_and_types(NULL, values_list);
+
+ ut_a(que_node_list_get_len(values_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Set the type of a dfield.
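+As examples of the mapping implemented below: BIGINT becomes DATA_INT
+of length 8, INT becomes DATA_INT of length 4, CHAR becomes
+DATA_VARCHAR with English collation, BINARY(n) becomes DATA_FIXBINARY
+of length n, and BLOB becomes DATA_BLOB; NOT NULL and UNSIGNED set the
+DATA_NOT_NULL and DATA_UNSIGNED flags respectively.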
*/ +static +void +pars_set_dfield_type( +/*=================*/ + dfield_t* dfield, /*!< in: dfield */ + pars_res_word_t* type, /*!< in: pointer to a type + token */ + ulint len, /*!< in: length, or 0 */ + ibool is_unsigned, /*!< in: if TRUE, column is + UNSIGNED. */ + ibool is_not_null) /*!< in: if TRUE, column is + NOT NULL. */ +{ + ulint flags = 0; + + if (is_not_null) { + flags |= DATA_NOT_NULL; + } + + if (is_unsigned) { + flags |= DATA_UNSIGNED; + } + + if (type == &pars_bigint_token) { + ut_a(len == 0); + + dtype_set(dfield_get_type(dfield), DATA_INT, flags, 8); + } else if (type == &pars_int_token) { + ut_a(len == 0); + + dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4); + + } else if (type == &pars_char_token) { + //ut_a(len == 0); + + dtype_set(dfield_get_type(dfield), DATA_VARCHAR, + DATA_ENGLISH | flags, len); + } else if (type == &pars_binary_token) { + ut_a(len != 0); + + dtype_set(dfield_get_type(dfield), DATA_FIXBINARY, + DATA_BINARY_TYPE | flags, len); + } else if (type == &pars_blob_token) { + ut_a(len == 0); + + dtype_set(dfield_get_type(dfield), DATA_BLOB, + DATA_BINARY_TYPE | flags, 0); + } else { + ut_error; + } +} + +/*********************************************************************//** +Parses a variable declaration. +@return own: symbol table node of type SYM_VAR */ +UNIV_INTERN +sym_node_t* +pars_variable_declaration( +/*======================*/ + sym_node_t* node, /*!< in: symbol table node allocated for the + id of the variable */ + pars_res_word_t* type) /*!< in: pointer to a type token */ +{ + node->resolved = TRUE; + node->token_type = SYM_VAR; + + node->param_type = PARS_NOT_PARAM; + + pars_set_dfield_type(que_node_get_val(node), type, 0, FALSE, FALSE); + + return(node); +} + +/*********************************************************************//** +Parses a procedure parameter declaration. +@return own: symbol table node of type SYM_VAR */ +UNIV_INTERN +sym_node_t* +pars_parameter_declaration( +/*=======================*/ + sym_node_t* node, /*!< in: symbol table node allocated for the + id of the parameter */ + ulint param_type, + /*!< in: PARS_INPUT or PARS_OUTPUT */ + pars_res_word_t* type) /*!< in: pointer to a type token */ +{ + ut_a((param_type == PARS_INPUT) || (param_type == PARS_OUTPUT)); + + pars_variable_declaration(node, type); + + node->param_type = param_type; + + return(node); +} + +/*********************************************************************//** +Sets the parent field in a query node list. */ +static +void +pars_set_parent_in_list( +/*====================*/ + que_node_t* node_list, /*!< in: first node in a list */ + que_node_t* parent) /*!< in: parent value to set in all + nodes of the list */ +{ + que_common_t* common; + + common = static_cast<que_common_t*>(node_list); + + while (common) { + common->parent = parent; + + common = static_cast<que_common_t*>(que_node_get_next(common)); + } +} + +/*********************************************************************//** +Parses an elsif element. 
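pars_set_dfield_type() above is effectively a fixed mapping from parser type tokens to (main type, length) pairs. The same mapping expressed as a standalone lookup table, with strings standing in for the real dtype constants:

#include <stdio.h>
#include <string.h>

struct type_rule {
	const char*	token;	/* SQL type keyword */
	const char*	mtype;	/* stand-in for the DATA_* main type */
	unsigned	len;	/* fixed length; 0 = taken from the DDL */
};

static const struct type_rule type_rules[] = {
	{"BIGINT",	"DATA_INT",	  8},	/* 8-byte integer */
	{"INT",		"DATA_INT",	  4},	/* 4-byte integer */
	{"CHAR",	"DATA_VARCHAR",	  0},	/* length from column def */
	{"BINARY",	"DATA_FIXBINARY", 0},	/* length must be given */
	{"BLOB",	"DATA_BLOB",	  0},	/* stored as a blob */
};

int
main(void)
{
	const char*	token = "INT";
	unsigned	i;

	for (i = 0; i < sizeof(type_rules) / sizeof(*type_rules); i++) {
		if (!strcmp(type_rules[i].token, token)) {
			printf("%s -> %s, len %u\n", token,
			       type_rules[i].mtype, type_rules[i].len);
		}
	}
	return(0);
}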
+@return elsif node */ +UNIV_INTERN +elsif_node_t* +pars_elsif_element( +/*===============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list) /*!< in: statement list */ +{ + elsif_node_t* node; + + node = static_cast<elsif_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(elsif_node_t))); + + node->common.type = QUE_NODE_ELSIF; + + node->cond = cond; + + pars_resolve_exp_variables_and_types(NULL, cond); + + node->stat_list = stat_list; + + return(node); +} + +/*********************************************************************//** +Parses an if-statement. +@return if-statement node */ +UNIV_INTERN +if_node_t* +pars_if_statement( +/*==============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list, /*!< in: statement list */ + que_node_t* else_part) /*!< in: else-part statement list + or elsif element list */ +{ + if_node_t* node; + elsif_node_t* elsif_node; + + node = static_cast<if_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(if_node_t))); + + node->common.type = QUE_NODE_IF; + + node->cond = cond; + + pars_resolve_exp_variables_and_types(NULL, cond); + + node->stat_list = stat_list; + + if (else_part && (que_node_get_type(else_part) == QUE_NODE_ELSIF)) { + + /* There is a list of elsif conditions */ + + node->else_part = NULL; + node->elsif_list = static_cast<elsif_node_t*>(else_part); + + elsif_node = static_cast<elsif_node_t*>(else_part); + + while (elsif_node) { + pars_set_parent_in_list(elsif_node->stat_list, node); + + elsif_node = static_cast<elsif_node_t*>( + que_node_get_next(elsif_node)); + } + } else { + node->else_part = else_part; + node->elsif_list = NULL; + + pars_set_parent_in_list(else_part, node); + } + + pars_set_parent_in_list(stat_list, node); + + return(node); +} + +/*********************************************************************//** +Parses a while-statement. +@return while-statement node */ +UNIV_INTERN +while_node_t* +pars_while_statement( +/*=================*/ + que_node_t* cond, /*!< in: while-condition */ + que_node_t* stat_list) /*!< in: statement list */ +{ + while_node_t* node; + + node = static_cast<while_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(while_node_t))); + + node->common.type = QUE_NODE_WHILE; + + node->cond = cond; + + pars_resolve_exp_variables_and_types(NULL, cond); + + node->stat_list = stat_list; + + pars_set_parent_in_list(stat_list, node); + + return(node); +} + +/*********************************************************************//** +Parses a for-loop-statement. 
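pars_set_parent_in_list() relies on every query node starting with a common header that carries "next" (brother) and "parent" links. A runnable miniature of that walk; the struct is a simplified stand-in for que_common_t:

#include <stdio.h>

struct node {
	struct node*	next;	/* brother in the statement list */
	struct node*	parent;	/* enclosing statement */
	const char*	name;
};

/* Point every node of a NULL-terminated sibling list at one parent. */
static void
set_parent_in_list(struct node* list, struct node* parent)
{
	struct node*	n;

	for (n = list; n != NULL; n = n->next) {
		n->parent = parent;
	}
}

int
main(void)
{
	struct node	stmt2 = {NULL, NULL, "stmt2"};
	struct node	stmt1 = {&stmt2, NULL, "stmt1"};
	struct node	while_node = {NULL, NULL, "while"};

	set_parent_in_list(&stmt1, &while_node);

	printf("%s's parent: %s\n", stmt2.name, stmt2.parent->name);
	return(0);
}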
+@return for-statement node */ +UNIV_INTERN +for_node_t* +pars_for_statement( +/*===============*/ + sym_node_t* loop_var, /*!< in: loop variable */ + que_node_t* loop_start_limit,/*!< in: loop start expression */ + que_node_t* loop_end_limit, /*!< in: loop end expression */ + que_node_t* stat_list) /*!< in: statement list */ +{ + for_node_t* node; + + node = static_cast<for_node_t*>( + mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t))); + + node->common.type = QUE_NODE_FOR; + + pars_resolve_exp_variables_and_types(NULL, loop_var); + pars_resolve_exp_variables_and_types(NULL, loop_start_limit); + pars_resolve_exp_variables_and_types(NULL, loop_end_limit); + + node->loop_var = loop_var->indirection; + + ut_a(loop_var->indirection); + + node->loop_start_limit = loop_start_limit; + node->loop_end_limit = loop_end_limit; + + node->stat_list = stat_list; + + pars_set_parent_in_list(stat_list, node); + + return(node); +} + +/*********************************************************************//** +Parses an exit statement. +@return exit statement node */ +UNIV_INTERN +exit_node_t* +pars_exit_statement(void) +/*=====================*/ +{ + exit_node_t* node; + + node = static_cast<exit_node_t*>( + mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t))); + node->common.type = QUE_NODE_EXIT; + + return(node); +} + +/*********************************************************************//** +Parses a return-statement. +@return return-statement node */ +UNIV_INTERN +return_node_t* +pars_return_statement(void) +/*=======================*/ +{ + return_node_t* node; + + node = static_cast<return_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(return_node_t))); + node->common.type = QUE_NODE_RETURN; + + return(node); +} + +/*********************************************************************//** +Parses an assignment statement. +@return assignment statement node */ +UNIV_INTERN +assign_node_t* +pars_assignment_statement( +/*======================*/ + sym_node_t* var, /*!< in: variable to assign */ + que_node_t* val) /*!< in: value to assign */ +{ + assign_node_t* node; + + node = static_cast<assign_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(assign_node_t))); + node->common.type = QUE_NODE_ASSIGNMENT; + + node->var = var; + node->val = val; + + pars_resolve_exp_variables_and_types(NULL, var); + pars_resolve_exp_variables_and_types(NULL, val); + + ut_a(dtype_get_mtype(dfield_get_type(que_node_get_val(var))) + == dtype_get_mtype(dfield_get_type(que_node_get_val(val)))); + + return(node); +} + +/*********************************************************************//** +Parses a procedure call. +@return function node */ +UNIV_INTERN +func_node_t* +pars_procedure_call( +/*================*/ + que_node_t* res_word,/*!< in: procedure name reserved word */ + que_node_t* args) /*!< in: argument list */ +{ + func_node_t* node; + + node = pars_func(res_word, args); + + pars_resolve_exp_list_variables_and_types(NULL, args); + + return(node); +} + +/*********************************************************************//** +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. +@return fetch statement node */ +UNIV_INTERN +fetch_node_t* +pars_fetch_statement( +/*=================*/ + sym_node_t* cursor, /*!< in: cursor node */ + sym_node_t* into_list, /*!< in: variables to set, or NULL */ + sym_node_t* user_func) /*!< in: user function name, or NULL */ +{ + sym_node_t* cursor_decl; + fetch_node_t* node; + + /* Logical XOR. 
*/ + ut_a(!into_list != !user_func); + + node = static_cast<fetch_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(fetch_node_t))); + + node->common.type = QUE_NODE_FETCH; + + pars_resolve_exp_variables_and_types(NULL, cursor); + + if (into_list) { + pars_resolve_exp_list_variables_and_types(NULL, into_list); + node->into_list = into_list; + node->func = NULL; + } else { + pars_resolve_exp_variables_and_types(NULL, user_func); + + node->func = pars_info_lookup_user_func( + pars_sym_tab_global->info, user_func->name); + + ut_a(node->func); + + node->into_list = NULL; + } + + cursor_decl = cursor->alias; + + ut_a(cursor_decl->token_type == SYM_CURSOR); + + node->cursor_def = cursor_decl->cursor_def; + + if (into_list) { + ut_a(que_node_list_get_len(into_list) + == que_node_list_get_len(node->cursor_def->select_list)); + } + + return(node); +} + +/*********************************************************************//** +Parses an open or close cursor statement. +@return fetch statement node */ +UNIV_INTERN +open_node_t* +pars_open_statement( +/*================*/ + ulint type, /*!< in: ROW_SEL_OPEN_CURSOR + or ROW_SEL_CLOSE_CURSOR */ + sym_node_t* cursor) /*!< in: cursor node */ +{ + sym_node_t* cursor_decl; + open_node_t* node; + + node = static_cast<open_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(open_node_t))); + + node->common.type = QUE_NODE_OPEN; + + pars_resolve_exp_variables_and_types(NULL, cursor); + + cursor_decl = cursor->alias; + + ut_a(cursor_decl->token_type == SYM_CURSOR); + + node->op_type = static_cast<open_node_op>(type); + node->cursor_def = cursor_decl->cursor_def; + + return(node); +} + +/*********************************************************************//** +Parses a row_printf-statement. +@return row_printf-statement node */ +UNIV_INTERN +row_printf_node_t* +pars_row_printf_statement( +/*======================*/ + sel_node_t* sel_node) /*!< in: select node */ +{ + row_printf_node_t* node; + + node = static_cast<row_printf_node_t*>( + mem_heap_alloc( + pars_sym_tab_global->heap, sizeof(row_printf_node_t))); + node->common.type = QUE_NODE_ROW_PRINTF; + + node->sel_node = sel_node; + + sel_node->common.parent = node; + + return(node); +} + +/*********************************************************************//** +Parses a commit statement. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +pars_commit_statement(void) +/*=======================*/ +{ + return(trx_commit_node_create(pars_sym_tab_global->heap)); +} + +/*********************************************************************//** +Parses a rollback statement. +@return own: rollback node struct */ +UNIV_INTERN +roll_node_t* +pars_rollback_statement(void) +/*=========================*/ +{ + return(roll_node_create(pars_sym_tab_global->heap)); +} + +/*********************************************************************//** +Parses a column definition at a table creation. +@return column sym table node */ +UNIV_INTERN +sym_node_t* +pars_column_def( +/*============*/ + sym_node_t* sym_node, /*!< in: column node in the + symbol table */ + pars_res_word_t* type, /*!< in: data type */ + sym_node_t* len, /*!< in: length of column, or + NULL */ + void* is_unsigned, /*!< in: if not NULL, column + is of type UNSIGNED. */ + void* is_not_null) /*!< in: if not NULL, column + is of type NOT NULL. 
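The assertion ut_a(!into_list != !user_func) in pars_fetch_statement() uses a compact C idiom: "!" normalises each pointer to 0 or 1, so "!=" between the results is a logical XOR, i.e. exactly one of the two must be non-NULL. A standalone demonstration:

#include <assert.h>
#include <stdio.h>

/* TRUE iff exactly one of a, b is non-NULL: the "!a != !b" idiom. */
static int
exactly_one(const void* a, const void* b)
{
	return(!a != !b);
}

int
main(void)
{
	int	x = 0;

	assert(exactly_one(&x, NULL));
	assert(exactly_one(NULL, &x));
	assert(!exactly_one(&x, &x));
	assert(!exactly_one(NULL, NULL));

	puts("XOR idiom holds");
	return(0);
}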
*/ +{ + ulint len2; + + if (len) { + len2 = eval_node_get_int_val(len); + } else { + len2 = 0; + } + + pars_set_dfield_type(que_node_get_val(sym_node), type, len2, + is_unsigned != NULL, is_not_null != NULL); + + return(sym_node); +} + +/*********************************************************************//** +Parses a table creation operation. +@return table create subgraph */ +UNIV_INTERN +tab_node_t* +pars_create_table( +/*==============*/ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_defs, /*!< in: list of column names */ + sym_node_t* compact, /* in: non-NULL if COMPACT table. */ + sym_node_t* block_size, /* in: block size (can be NULL) */ + void* not_fit_in_memory __attribute__((unused))) + /*!< in: a non-NULL pointer means that + this is a table which in simulations + should be simulated as not fitting + in memory; thread is put to sleep + to simulate disk accesses; NOTE that + this flag is not stored to the data + dictionary on disk, and the database + will forget about non-NULL value if + it has to reload the table definition + from disk */ +{ + dict_table_t* table; + sym_node_t* column; + tab_node_t* node; + const dtype_t* dtype; + ulint n_cols; + ulint flags = 0; + ulint flags2 = 0; + + if (compact != NULL) { + + /* System tables currently only use the REDUNDANT row + format therefore the check for srv_file_per_table should be + safe for now. */ + + flags |= DICT_TF_COMPACT; + + /* FIXME: Ideally this should be part of the SQL syntax + or use some other mechanism. We want to reduce dependency + on global variables. There is an inherent race here but + that has always existed around this variable. */ + if (srv_file_per_table) { + flags2 |= DICT_TF2_USE_TABLESPACE; + } + } + + if (block_size != NULL) { + ulint size; + dfield_t* dfield; + + dfield = que_node_get_val(block_size); + + ut_a(dfield_get_len(dfield) == 4); + size = mach_read_from_4(static_cast<byte*>( + dfield_get_data(dfield))); + + + switch (size) { + case 0: + break; + + case 1: case 2: case 4: case 8: case 16: + flags |= DICT_TF_COMPACT; + /* FTS-FIXME: needs the zip changes */ + /* flags |= size << DICT_TF_COMPRESSED_SHIFT; */ + break; + + default: + ut_error; + } + } + + /* Set the flags2 when create table or alter tables */ + flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;); + + + n_cols = que_node_list_get_len(column_defs); + + table = dict_mem_table_create( + table_sym->name, 0, n_cols, flags, flags2, false); + +#ifdef UNIV_DEBUG + if (not_fit_in_memory != NULL) { + table->does_not_fit_in_memory = TRUE; + } +#endif /* UNIV_DEBUG */ + column = column_defs; + + while (column) { + dtype = dfield_get_type(que_node_get_val(column)); + + dict_mem_table_add_col(table, table->heap, + column->name, dtype->mtype, + dtype->prtype, dtype->len); + column->resolved = TRUE; + column->token_type = SYM_COLUMN; + + column = static_cast<sym_node_t*>(que_node_get_next(column)); + } + + node = tab_create_graph_create(table, pars_sym_tab_global->heap, true); + + table_sym->resolved = TRUE; + table_sym->token_type = SYM_TABLE; + + return(node); +} + +/*********************************************************************//** +Parses an index creation operation. 
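The switch on size in pars_create_table() accepts only 0 (no BLOCK_SIZE given) or a power of two up to 16, which appear to be the candidate compressed page sizes in KiB. The same test as a tiny runnable predicate:

#include <stdio.h>

/* Valid BLOCK_SIZE values: unset, or a power of two not above 16. */
static int
block_size_ok(unsigned long size)
{
	switch (size) {
	case 0: case 1: case 2: case 4: case 8: case 16:
		return(1);
	default:
		return(0);
	}
}

int
main(void)
{
	unsigned long	sizes[] = {0, 1, 3, 8, 16, 32};
	unsigned	i;

	for (i = 0; i < sizeof(sizes) / sizeof(*sizes); i++) {
		printf("BLOCK_SIZE=%lu: %s\n", sizes[i],
		       block_size_ok(sizes[i]) ? "ok" : "rejected");
	}
	return(0);
}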
+@return index create subgraph */ +UNIV_INTERN +ind_node_t* +pars_create_index( +/*==============*/ + pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */ + sym_node_t* index_sym, /*!< in: index name node in the symbol + table */ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_list) /*!< in: list of column names */ +{ + dict_index_t* index; + sym_node_t* column; + ind_node_t* node; + ulint n_fields; + ulint ind_type; + + n_fields = que_node_list_get_len(column_list); + + ind_type = 0; + + if (unique_def) { + ind_type = ind_type | DICT_UNIQUE; + } + + if (clustered_def) { + ind_type = ind_type | DICT_CLUSTERED; + } + + index = dict_mem_index_create(table_sym->name, index_sym->name, 0, + ind_type, n_fields); + column = column_list; + + while (column) { + dict_mem_index_add_field(index, column->name, 0); + + column->resolved = TRUE; + column->token_type = SYM_COLUMN; + + column = static_cast<sym_node_t*>(que_node_get_next(column)); + } + + node = ind_create_graph_create(index, pars_sym_tab_global->heap, true); + + table_sym->resolved = TRUE; + table_sym->token_type = SYM_TABLE; + + index_sym->resolved = TRUE; + index_sym->token_type = SYM_TABLE; + + return(node); +} + +/*********************************************************************//** +Parses a procedure definition. +@return query fork node */ +UNIV_INTERN +que_fork_t* +pars_procedure_definition( +/*======================*/ + sym_node_t* sym_node, /*!< in: procedure id node in the symbol + table */ + sym_node_t* param_list, /*!< in: parameter declaration list */ + que_node_t* stat_list) /*!< in: statement list */ +{ + proc_node_t* node; + que_fork_t* fork; + que_thr_t* thr; + mem_heap_t* heap; + + heap = pars_sym_tab_global->heap; + + fork = que_fork_create(NULL, NULL, QUE_FORK_PROCEDURE, heap); + fork->trx = NULL; + + thr = que_thr_create(fork, heap); + + node = static_cast<proc_node_t*>( + mem_heap_alloc(heap, sizeof(proc_node_t))); + + node->common.type = QUE_NODE_PROC; + node->common.parent = thr; + + sym_node->token_type = SYM_PROCEDURE_NAME; + sym_node->resolved = TRUE; + + node->proc_id = sym_node; + node->param_list = param_list; + node->stat_list = stat_list; + + pars_set_parent_in_list(stat_list, node); + + node->sym_tab = pars_sym_tab_global; + + thr->child = node; + + pars_sym_tab_global->query_graph = fork; + + return(fork); +} + +/*************************************************************//** +Parses a stored procedure call, when this is not within another stored +procedure, that is, the client issues a procedure call directly. +In MySQL/InnoDB, stored InnoDB procedures are invoked via the +parsed procedure tree, not via InnoDB SQL, so this function is not used. +@return query graph */ +UNIV_INTERN +que_fork_t* +pars_stored_procedure_call( +/*=======================*/ + sym_node_t* sym_node __attribute__((unused))) + /*!< in: stored procedure name */ +{ + ut_error; + return(NULL); +} + +/*************************************************************//** +Retrieves characters to the lexical analyzer. 
*/ +UNIV_INTERN +int +pars_get_lex_chars( +/*===============*/ + char* buf, /*!< in/out: buffer where to copy */ + int max_size) /*!< in: maximum number of characters which fit + in the buffer */ +{ + int len; + + len = static_cast<int>( + pars_sym_tab_global->string_len + - pars_sym_tab_global->next_char_pos); + if (len == 0) { +#ifdef YYDEBUG + /* fputs("SQL string ends\n", stderr); */ +#endif + return(0); + } + + if (len > max_size) { + len = max_size; + } + +#ifdef UNIV_SQL_DEBUG + if (pars_print_lexed) { + + if (len >= 5) { + len = 5; + } + + fwrite(pars_sym_tab_global->sql_string + + pars_sym_tab_global->next_char_pos, + 1, len, stderr); + } +#endif /* UNIV_SQL_DEBUG */ + + ut_memcpy(buf, pars_sym_tab_global->sql_string + + pars_sym_tab_global->next_char_pos, len); + + pars_sym_tab_global->next_char_pos += len; + + return(len); +} + +/*************************************************************//** +Called by yyparse on error. */ +UNIV_INTERN +void +yyerror( +/*====*/ + const char* s __attribute__((unused))) + /*!< in: error message string */ +{ + ut_ad(s); + + fputs("PARSER ERROR: Syntax error in SQL string\n", stderr); + + ut_error; +} + +/*************************************************************//** +Parses an SQL string returning the query graph. +@return own: the query graph */ +UNIV_INTERN +que_t* +pars_sql( +/*=====*/ + pars_info_t* info, /*!< in: extra information, or NULL */ + const char* str) /*!< in: SQL string */ +{ + sym_node_t* sym_node; + mem_heap_t* heap; + que_t* graph; + + ut_ad(str); + + heap = mem_heap_create(16000); + + /* Currently, the parser is not reentrant: */ + ut_ad(mutex_own(&(dict_sys->mutex))); + + pars_sym_tab_global = sym_tab_create(heap); + + pars_sym_tab_global->string_len = strlen(str); + pars_sym_tab_global->sql_string = static_cast<char*>( + mem_heap_dup(heap, str, pars_sym_tab_global->string_len + 1)); + pars_sym_tab_global->next_char_pos = 0; + pars_sym_tab_global->info = info; + + yyparse(); + + sym_node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list); + + while (sym_node) { + ut_a(sym_node->resolved); + + sym_node = UT_LIST_GET_NEXT(sym_list, sym_node); + } + + graph = pars_sym_tab_global->query_graph; + + graph->sym_tab = pars_sym_tab_global; + graph->info = info; + + pars_sym_tab_global = NULL; + + /* fprintf(stderr, "SQL graph size %lu\n", mem_heap_get_size(heap)); */ + + return(graph); +} + +/******************************************************************//** +Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. The fork created is of +type QUE_FORK_MYSQL_INTERFACE. +@return query thread node to run */ +UNIV_INTERN +que_thr_t* +pars_complete_graph_for_exec( +/*=========================*/ + que_node_t* node, /*!< in: root node for an incomplete + query graph, or NULL for dummy graph */ + trx_t* trx, /*!< in: transaction handle */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ +{ + que_fork_t* fork; + que_thr_t* thr; + + fork = que_fork_create(NULL, NULL, QUE_FORK_MYSQL_INTERFACE, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + + thr->child = node; + + if (node) { + que_node_set_parent(node, thr); + } + + trx->graph = NULL; + + return(thr); +} + +/****************************************************************//** +Create parser info struct. 
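pars_get_lex_chars() is the parser's input callback: each call hands the generated scanner up to max_size bytes of the remaining SQL string and advances a cursor, with 0 signalling end of input. A self-contained sketch of that contract; the struct is an illustrative stand-in for the relevant fields of the global symbol table:

#include <stdio.h>
#include <string.h>

struct lex_input {
	const char*	sql_string;
	size_t		string_len;
	size_t		next_char_pos;
};

/* Copy up to max_size pending bytes into buf; 0 means input exhausted. */
static int
get_lex_chars(struct lex_input* in, char* buf, int max_size)
{
	size_t	len = in->string_len - in->next_char_pos;

	if (len == 0) {
		return(0);
	}

	if (len > (size_t) max_size) {
		len = (size_t) max_size;
	}

	memcpy(buf, in->sql_string + in->next_char_pos, len);
	in->next_char_pos += len;

	return((int) len);
}

int
main(void)
{
	struct lex_input	in = {"SELECT 1;", 9, 0};
	char			buf[4];
	int			n;

	while ((n = get_lex_chars(&in, buf, (int) sizeof(buf))) > 0) {
		printf("fed %d byte(s): %.*s\n", n, n, buf);
	}
	return(0);
}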
+@return own: info struct */ +UNIV_INTERN +pars_info_t* +pars_info_create(void) +/*==================*/ +{ + pars_info_t* info; + mem_heap_t* heap; + + heap = mem_heap_create(512); + + info = static_cast<pars_info_t*>(mem_heap_alloc(heap, sizeof(*info))); + + info->heap = heap; + info->funcs = NULL; + info->bound_lits = NULL; + info->bound_ids = NULL; + info->graph_owns_us = TRUE; + + return(info); +} + +/****************************************************************//** +Free info struct and everything it contains. */ +UNIV_INTERN +void +pars_info_free( +/*===========*/ + pars_info_t* info) /*!< in, own: info struct */ +{ + mem_heap_free(info->heap); +} + +/****************************************************************//** +Add bound literal. */ +UNIV_INTERN +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const void* address, /*!< in: address */ + ulint length, /*!< in: length of data */ + ulint type, /*!< in: type, e.g. DATA_FIXBINARY */ + ulint prtype) /*!< in: precise type, e.g. + DATA_UNSIGNED */ +{ + pars_bound_lit_t* pbl; + + ut_ad(!pars_info_get_bound_lit(info, name)); + + pbl = static_cast<pars_bound_lit_t*>( + mem_heap_alloc(info->heap, sizeof(*pbl))); + + pbl->name = name; + + pbl->address = address; + pbl->length = length; + pbl->type = type; + pbl->prtype = prtype; + + if (!info->bound_lits) { + ib_alloc_t* heap_alloc; + + heap_alloc = ib_heap_allocator_create(info->heap); + + info->bound_lits = ib_vector_create(heap_alloc, sizeof(*pbl), 8); + } + + ib_vector_push(info->bound_lits, pbl); +} + +/****************************************************************//** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). */ +UNIV_INTERN +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* str) /*!< in: string */ +{ + pars_info_add_literal(info, name, str, strlen(str), + DATA_VARCHAR, DATA_ENGLISH); +} + +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +UNIV_INTERN +void +pars_info_bind_literal( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype) /* in: precise type, e.g. 
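The pars_info_bind_*() functions that follow all share one shape: look the name up, allocate a fresh entry only on a miss, otherwise overwrite the existing entry in place so the same prepared graph can be re-run with new values. A miniature of that pattern over a fixed-size array (the real code uses a growable vector on the info heap):

#include <assert.h>
#include <stdio.h>
#include <string.h>

struct binding {
	const char*	name;
	const void*	address;
	size_t		length;
};

#define MAX_BINDINGS	8

static struct binding	bindings[MAX_BINDINGS];
static int		n_bindings = 0;

static struct binding*
lookup_binding(const char* name)
{
	int	i;

	for (i = 0; i < n_bindings; i++) {
		if (!strcmp(bindings[i].name, name)) {
			return(&bindings[i]);
		}
	}
	return(NULL);
}

/* Rebind if the name exists, otherwise create a new entry. */
static void
bind(const char* name, const void* address, size_t length)
{
	struct binding*	b = lookup_binding(name);

	if (b == NULL) {
		assert(n_bindings < MAX_BINDINGS);
		b = &bindings[n_bindings++];
		b->name = name;
	}

	b->address = address;	/* existing entry: just repoint it */
	b->length = length;
}

int
main(void)
{
	int	v1 = 1;
	int	v2 = 2;

	bind("my_id", &v1, sizeof(v1));
	bind("my_id", &v2, sizeof(v2));	/* rebinds, no duplicate */

	printf("%d binding(s), value %d\n", n_bindings,
	       *(const int*) lookup_binding("my_id")->address);
	return(0);
}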
*/ +{ + pars_bound_lit_t* pbl; + + pbl = pars_info_lookup_bound_lit(info, name); + + if (!pbl) { + pars_info_add_literal( + info, name, address, length, type, prtype); + } else { + pbl->address = address; + pbl->length = length; + + sym_tab_rebind_lit(pbl->node, address, length); + } +} + +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +UNIV_INTERN +void +pars_info_bind_varchar_literal( +/*===========================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const byte* str, /*!< in: string */ + ulint str_len) /*!< in: string length */ +{ + pars_bound_lit_t* pbl; + + pbl = pars_info_lookup_bound_lit(info, name); + + if (!pbl) { + pars_info_add_literal( + info, name, str, str_len, DATA_VARCHAR, DATA_ENGLISH); + } else { + + pbl->address = str; + pbl->length = str_len; + + sym_tab_rebind_lit(pbl->node, str, str_len); + } +} + +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + lint val) /*!< in: value */ +{ + byte* buf = static_cast<byte*>(mem_heap_alloc(info->heap, 4)); + + mach_write_to_4(buf, val); + pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); +} + +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +UNIV_INTERN +void +pars_info_bind_int4_literal( +/*========================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const ib_uint32_t* val) /* in: value */ +{ + pars_bound_lit_t* pbl; + + pbl = pars_info_lookup_bound_lit(info, name); + + if (!pbl) { + pars_info_add_literal(info, name, val, 4, DATA_INT, 0); + } else { + + pbl->address = val; + pbl->length = sizeof(*val); + + sym_tab_rebind_lit(pbl->node, val, sizeof(*val)); + } +} + +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +UNIV_INTERN +void +pars_info_bind_int8_literal( +/*========================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const ib_uint64_t* val) /* in: value */ +{ + pars_bound_lit_t* pbl; + + pbl = pars_info_lookup_bound_lit(info, name); + + if (!pbl) { + pars_info_add_literal( + info, name, val, sizeof(*val), DATA_INT, 0); + } else { + + pbl->address = val; + pbl->length = sizeof(*val); + + sym_tab_rebind_lit(pbl->node, val, sizeof(*val)); + } +} + +/****************************************************************//** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. 
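pars_info_add_int4_literal() serialises the value with mach_write_to_4(), which stores the most significant byte first; the big-endian order means stored integers compare correctly as raw byte strings. A standalone sketch of that layout (write_be_4 is a stand-in, not the InnoDB function):

#include <stdio.h>

/* Write n as 4 bytes, most significant byte first. */
static void
write_be_4(unsigned char* b, unsigned long n)
{
	b[0] = (unsigned char) (n >> 24);
	b[1] = (unsigned char) (n >> 16);
	b[2] = (unsigned char) (n >> 8);
	b[3] = (unsigned char) n;
}

int
main(void)
{
	unsigned char	buf[4];

	write_be_4(buf, 0x0A0B0C0DUL);

	/* Prints: 0a 0b 0c 0d */
	printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
	return(0);
}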
*/ +UNIV_INTERN +void +pars_info_add_ull_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ib_uint64_t val) /*!< in: value */ +{ + byte* buf = static_cast<byte*>(mem_heap_alloc(info->heap, 8)); + + mach_write_to_8(buf, val); + + pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); +} + +/****************************************************************//** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +UNIV_INTERN +void +pars_info_bind_ull_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val) /*!< in: value */ +{ + pars_bound_lit_t* pbl; + + pbl = pars_info_lookup_bound_lit(info, name); + + if (!pbl) { + pars_info_add_literal( + info, name, val, sizeof(*val), DATA_FIXBINARY, 0); + } else { + + pbl->address = val; + pbl->length = sizeof(*val); + + sym_tab_rebind_lit(pbl->node, val, sizeof(*val)); + } +} + +/****************************************************************//** +Add user function. */ +UNIV_INTERN +void +pars_info_bind_function( +/*====================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: function name */ + pars_user_func_cb_t func, /*!< in: function address */ + void* arg) /*!< in: user-supplied argument */ +{ + pars_user_func_t* puf; + + puf = pars_info_lookup_user_func(info, name); + + if (!puf) { + if (!info->funcs) { + ib_alloc_t* heap_alloc; + + heap_alloc = ib_heap_allocator_create(info->heap); + + info->funcs = ib_vector_create( + heap_alloc, sizeof(*puf), 8); + } + + /* Create a "new" element */ + puf = static_cast<pars_user_func_t*>( + ib_vector_push(info->funcs, NULL)); + puf->name = name; + } + + puf->arg = arg; + puf->func = func; +} + +/******************************************************************** +Add bound id. */ +UNIV_INTERN +void +pars_info_bind_id( +/*==============*/ + pars_info_t* info, /*!< in: info struct */ + ibool copy_name, /* in: copy name if TRUE */ + const char* name, /*!< in: name */ + const char* id) /*!< in: id */ +{ + pars_bound_id_t* bid; + + bid = pars_info_lookup_bound_id(info, name); + + if (!bid) { + + if (!info->bound_ids) { + ib_alloc_t* heap_alloc; + + heap_alloc = ib_heap_allocator_create(info->heap); + + info->bound_ids = ib_vector_create( + heap_alloc, sizeof(*bid), 8); + } + + /* Create a "new" element */ + bid = static_cast<pars_bound_id_t*>( + ib_vector_push(info->bound_ids, NULL)); + + bid->name = (copy_name) + ? mem_heap_strdup(info->heap, name) : name; + } + + bid->id = id; +} + +/******************************************************************** +Get bound identifier with the given name.*/ + +pars_bound_id_t* +pars_info_get_bound_id( +/*===================*/ + /* out: bound id, or NULL if not + found */ + pars_info_t* info, /* in: info struct */ + const char* name) /* in: bound id name to find */ +{ + return(pars_info_lookup_bound_id(info, name)); +} + +/****************************************************************//** +Get bound literal with the given name. 
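pars_info_bind_function() stores a (callback, user argument) pair under a name; when InnoDB SQL later executes FETCH ... INTO that function, the executor invokes the callback once per fetched row. A minimal model of the registration and call; the callback signature here is an illustrative stand-in, not necessarily the exact pars_user_func_cb_t shape:

#include <stdio.h>

/* Return non-zero to continue fetching, 0 to stop. */
typedef int (*user_func_cb)(void* row, void* user_arg);

struct user_func {
	const char*	name;
	user_func_cb	func;
	void*		arg;
};

static char	demo_tag[] = "demo";

static int
print_row(void* row, void* user_arg)
{
	printf("%s: row value %d\n", (const char*) user_arg, *(int*) row);
	return(1);
}

int
main(void)
{
	struct user_func	f = {"my_func", print_row, demo_tag};
	int			row = 42;

	/* What a FETCH cursor INTO my_func() step conceptually does: */
	f.func(&row, f.arg);
	return(0);
}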
+@return bound literal, or NULL if not found */ +UNIV_INTERN +pars_bound_lit_t* +pars_info_get_bound_lit( +/*====================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name) /*!< in: bound literal name to find */ +{ + return(pars_info_lookup_bound_lit(info, name)); +} diff --git a/storage/xtradb/pars/pars0sym.cc b/storage/xtradb/pars/pars0sym.cc new file mode 100644 index 00000000000..b01a69cb33a --- /dev/null +++ b/storage/xtradb/pars/pars0sym.cc @@ -0,0 +1,440 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file pars/pars0sym.cc +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#include "pars0sym.h" + +#ifdef UNIV_NONINL +#include "pars0sym.ic" +#endif + +#include "mem0mem.h" +#include "data0type.h" +#include "data0data.h" +#include "pars0grm.h" +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" +#include "row0sel.h" + +/******************************************************************//** +Creates a symbol table for a single stored procedure or query. +@return own: symbol table */ +UNIV_INTERN +sym_tab_t* +sym_tab_create( +/*===========*/ + mem_heap_t* heap) /*!< in: memory heap where to create */ +{ + sym_tab_t* sym_tab; + + sym_tab = static_cast<sym_tab_t*>( + mem_heap_alloc(heap, sizeof(sym_tab_t))); + + UT_LIST_INIT(sym_tab->sym_list); + UT_LIST_INIT(sym_tab->func_node_list); + + sym_tab->heap = heap; + + return(sym_tab); +} + + +/******************************************************************//** +Frees the memory allocated dynamically AFTER parsing phase for variables +etc. in the symbol table. Does not free the mem heap where the table was +originally created. Frees also SQL explicit cursor definitions. */ +UNIV_INTERN +void +sym_tab_free_private( +/*=================*/ + sym_tab_t* sym_tab) /*!< in, own: symbol table */ +{ + sym_node_t* sym; + func_node_t* func; + + ut_ad(mutex_own(&dict_sys->mutex)); + + for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list); + sym != NULL; + sym = UT_LIST_GET_NEXT(sym_list, sym)) { + + /* Close the tables opened in pars_retrieve_table_def(). 
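sym_tab_create() above allocates the table, and later every node, from one mem heap; nothing is freed individually, and the whole parse goes away with the heap (sym_tab_free_private() only releases resources acquired after parsing, such as opened tables and prefetch buffers). A self-contained bump-allocator sketch of that arena pattern:

#include <stdio.h>
#include <stdlib.h>

struct arena {
	unsigned char*	buf;
	size_t		cap;
	size_t		used;
};

/* Bump-allocate n bytes, keeping 8-byte alignment; no per-object free. */
static void*
arena_alloc(struct arena* a, size_t n)
{
	void*	p;

	n = (n + 7) & ~(size_t) 7;

	if (a->used + n > a->cap) {
		return(NULL);
	}

	p = a->buf + a->used;
	a->used += n;

	return(p);
}

int
main(void)
{
	struct arena	a = {NULL, 16000, 0};
	int*		node1;
	int*		node2;

	a.buf = malloc(a.cap);
	if (a.buf == NULL) {
		return(1);
	}

	node1 = arena_alloc(&a, sizeof(*node1));
	node2 = arena_alloc(&a, sizeof(*node2));
	*node1 = 1;
	*node2 = 2;

	printf("used %zu bytes for 2 nodes\n", a.used);

	free(a.buf);	/* releases every node at once */
	return(0);
}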
*/ + + if (sym->token_type == SYM_TABLE_REF_COUNTED) { + + dict_table_close(sym->table, TRUE, FALSE); + + sym->table = NULL; + sym->resolved = FALSE; + sym->token_type = SYM_UNSET; + } + + eval_node_free_val_buf(sym); + + if (sym->prefetch_buf) { + sel_col_prefetch_buf_free(sym->prefetch_buf); + } + + if (sym->cursor_def) { + que_graph_free_recursive(sym->cursor_def); + } + } + + for (func = UT_LIST_GET_FIRST(sym_tab->func_node_list); + func != NULL; + func = UT_LIST_GET_NEXT(func_node_list, func)) { + + eval_node_free_val_buf(func); + } +} + +/******************************************************************//** +Adds an integer literal to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_int_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + ulint val) /*!< in: integer value */ +{ + sym_node_t* node; + byte* data; + + node = static_cast<sym_node_t*>( + mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t))); + + node->common.type = QUE_NODE_SYMBOL; + + node->table = NULL; + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + dtype_set(dfield_get_type(&node->common.val), DATA_INT, 0, 4); + + data = static_cast<byte*>(mem_heap_alloc(sym_tab->heap, 4)); + mach_write_to_4(data, val); + + dfield_set_data(&(node->common.val), data, 4); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + node->like_node = NULL; + + node->sym_table = sym_tab; + + return(node); +} + +/******************************************************************//** +Adds a string literal to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_str_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const byte* str, /*!< in: string with no quotes around + it */ + ulint len) /*!< in: string length */ +{ + sym_node_t* node; + byte* data; + + node = static_cast<sym_node_t*>( + mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t))); + + node->common.type = QUE_NODE_SYMBOL; + + node->table = NULL; + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + dtype_set(dfield_get_type(&node->common.val), + DATA_VARCHAR, DATA_ENGLISH, 0); + + data = (len) ? static_cast<byte*>(mem_heap_dup(sym_tab->heap, str, len)) + : NULL; + + dfield_set_data(&(node->common.val), data, len); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + node->like_node = NULL; + + node->sym_table = sym_tab; + + return(node); +} + +/******************************************************************//** +Add a bound literal to a symbol table. 
+@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_lit( +/*==================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name, /*!< in: name of bound literal */ + ulint* lit_type) /*!< out: type of literal (PARS_*_LIT) */ +{ + sym_node_t* node; + pars_bound_lit_t* blit; + ulint len = 0; + + blit = pars_info_get_bound_lit(sym_tab->info, name); + ut_a(blit); + + node = static_cast<sym_node_t*>( + mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t))); + + node->common.type = QUE_NODE_SYMBOL; + node->common.brother = node->common.parent = NULL; + + node->table = NULL; + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + switch (blit->type) { + case DATA_FIXBINARY: + len = blit->length; + *lit_type = PARS_FIXBINARY_LIT; + break; + + case DATA_BLOB: + *lit_type = PARS_BLOB_LIT; + break; + + case DATA_VARCHAR: + *lit_type = PARS_STR_LIT; + break; + + case DATA_CHAR: + ut_a(blit->length > 0); + + len = blit->length; + *lit_type = PARS_STR_LIT; + break; + + case DATA_INT: + ut_a(blit->length > 0); + ut_a(blit->length <= 8); + + len = blit->length; + *lit_type = PARS_INT_LIT; + break; + + default: + ut_error; + } + + dtype_set(dfield_get_type(&node->common.val), + blit->type, blit->prtype, len); + + dfield_set_data(&(node->common.val), blit->address, blit->length); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + blit->node = node; + node->like_node = NULL; + node->sym_table = sym_tab; + + return(node); +} + +/********************************************************************** +Rebind literal to a node in the symbol table. */ + +sym_node_t* +sym_tab_rebind_lit( +/*===============*/ + /* out: symbol table node */ + sym_node_t* node, /* in: node that is bound to literal*/ + const void* address, /* in: pointer to data */ + ulint length) /* in: length of data */ +{ + dfield_t* dfield = que_node_get_val(node); + dtype_t* dtype = dfield_get_type(dfield); + + ut_a(node->token_type == SYM_LIT); + + dfield_set_data(&node->common.val, address, length); + + if (node->like_node) { + + ut_a(dtype_get_mtype(dtype) == DATA_CHAR + || dtype_get_mtype(dtype) == DATA_VARCHAR); + + /* Don't force [FALSE] creation of sub-nodes (for LIKE) */ + pars_like_rebind( + node,static_cast<const byte*>(address), length); + } + + /* FIXME: What's this ? */ + node->common.val_buf_size = 0; + + if (node->prefetch_buf) { + sel_col_prefetch_buf_free(node->prefetch_buf); + node->prefetch_buf = NULL; + } + + if (node->cursor_def) { + que_graph_free_recursive(node->cursor_def); + node->cursor_def = NULL; + } + + return(node); +} + +/******************************************************************//** +Adds an SQL null literal to a symbol table. 
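The switch in sym_tab_add_bound_lit() encodes both the token class and the length rules for each bindable type: FIXBINARY takes its length from the binding, CHAR and INT must have one (INT at most 8 bytes), while VARCHAR and BLOB carry none at this point. The same rules as a standalone classifier with stand-in constants:

#include <assert.h>
#include <stdio.h>

enum mtype { M_FIXBINARY, M_BLOB, M_VARCHAR, M_CHAR, M_INT };
enum lit_type { L_FIXBINARY_LIT, L_BLOB_LIT, L_STR_LIT, L_INT_LIT };

static enum lit_type
classify_bound_lit(enum mtype type, unsigned length, unsigned* len_out)
{
	*len_out = 0;

	switch (type) {
	case M_FIXBINARY:
		*len_out = length;
		return(L_FIXBINARY_LIT);
	case M_BLOB:
		return(L_BLOB_LIT);
	case M_VARCHAR:
		return(L_STR_LIT);
	case M_CHAR:
		assert(length > 0);
		*len_out = length;
		return(L_STR_LIT);
	case M_INT:
		assert(length > 0 && length <= 8);
		*len_out = length;
		return(L_INT_LIT);
	}

	assert(!"unknown bound literal type");
	return(L_STR_LIT);
}

int
main(void)
{
	unsigned	len;

	assert(classify_bound_lit(M_INT, 4, &len) == L_INT_LIT && len == 4);
	assert(classify_bound_lit(M_VARCHAR, 0, &len) == L_STR_LIT);

	puts("classification rules hold");
	return(0);
}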
+@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_null_lit( +/*=================*/ + sym_tab_t* sym_tab) /*!< in: symbol table */ +{ + sym_node_t* node; + + node = static_cast<sym_node_t*>( + mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t))); + + node->common.type = QUE_NODE_SYMBOL; + + node->table = NULL; + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + dfield_get_type(&node->common.val)->mtype = DATA_ERROR; + + dfield_set_null(&node->common.val); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + node->like_node = NULL; + + node->sym_table = sym_tab; + + return(node); +} + +/******************************************************************//** +Adds an identifier to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + byte* name, /*!< in: identifier name */ + ulint len) /*!< in: identifier length */ +{ + sym_node_t* node; + + node = static_cast<sym_node_t*>( + mem_heap_zalloc(sym_tab->heap, sizeof(*node))); + + node->common.type = QUE_NODE_SYMBOL; + + node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len); + node->name_len = len; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + dfield_set_null(&node->common.val); + + node->sym_table = sym_tab; + + return(node); +} + +/******************************************************************//** +Add a bound identifier to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_id( +/*=================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name) /*!< in: name of bound id */ +{ + sym_node_t* node; + pars_bound_id_t* bid; + + bid = pars_info_get_bound_id(sym_tab->info, name); + ut_a(bid); + + node = static_cast<sym_node_t*>( + mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t))); + + node->common.type = QUE_NODE_SYMBOL; + + node->table = NULL; + node->resolved = FALSE; + node->token_type = SYM_UNSET; + node->indirection = NULL; + + node->name = mem_heap_strdup(sym_tab->heap, bid->id); + node->name_len = strlen(node->name); + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + dfield_set_null(&node->common.val); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + node->like_node = NULL; + + node->sym_table = sym_tab; + + return(node); +} diff --git a/storage/xtradb/que/que0que.cc b/storage/xtradb/que/que0que.cc new file mode 100644 index 00000000000..8d9b8fac776 --- /dev/null +++ b/storage/xtradb/que/que0que.cc @@ -0,0 +1,1334 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file que/que0que.cc +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" + +#ifdef UNIV_NONINL +#include "que0que.ic" +#endif + +#include "usr0sess.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "row0undo.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0purge.h" +#include "dict0crea.h" +#include "log0log.h" +#include "eval0proc.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0types.h" + +#define QUE_MAX_LOOPS_WITHOUT_CHECK 16 + +#ifdef UNIV_DEBUG +/* If the following flag is set TRUE, the module will print trace info +of SQL execution in the UNIV_SQL_DEBUG version */ +UNIV_INTERN ibool que_trace_on = FALSE; +#endif /* UNIV_DEBUG */ + +/* Short introduction to query graphs + ================================== + +A query graph consists of nodes linked to each other in various ways. The +execution starts at que_run_threads() which takes a que_thr_t parameter. +que_thr_t contains two fields that control query graph execution: run_node +and prev_node. run_node is the next node to execute and prev_node is the +last node executed. + +Each node has a pointer to a 'next' statement, i.e., its brother, and a +pointer to its parent node. The next pointer is NULL in the last statement +of a block. + +Loop nodes contain a link to the first statement of the enclosed statement +list. While the loop runs, que_thr_step() checks if execution to the loop +node came from its parent or from one of the statement nodes in the loop. If +it came from the parent of the loop node it starts executing the first +statement node in the loop. If it came from one of the statement nodes in +the loop, then it checks if the statement node has another statement node +following it, and runs it if so. + +To signify loop ending, the loop statements (see e.g. while_step()) set +que_thr_t->run_node to the loop node's parent node. This is noticed on the +next call of que_thr_step() and execution proceeds to the node pointed to by +the loop node's 'next' pointer. + +For example, the code: + +X := 1; +WHILE X < 5 LOOP + X := X + 1; + X := X + 1; +X := 5 + +will result in the following node hierarchy, with the X-axis indicating +'next' links and the Y-axis indicating parent/child links: + +A - W - A + | + | + A - A + +A = assign_node_t, W = while_node_t. */ + +/* How a stored procedure containing COMMIT or ROLLBACK commands +is executed? + +The commit or rollback can be seen as a subprocedure call. + +When the transaction starts to handle a rollback or commit. +It builds a query graph which, when executed, will roll back +or commit the incomplete transaction. The transaction +is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state. +If specified, the SQL cursors opened by the transaction are closed. +When the execution of the graph completes, it is like returning +from a subprocedure: the query thread which requested the operation +starts running again. */ + +/**********************************************************************//** +Moves a thread from another state to the QUE_THR_RUNNING state. 
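The run_node/prev_node convention described in the comment above can be exercised in isolation: a loop node re-tests its condition whenever control reaches it and signals termination by setting run_node to its own parent, while a statement node passes control to its brother or, lacking one, back to its parent. A runnable miniature for "X := 0; WHILE X < 3 LOOP X := X + 1; END LOOP"; all structures are simplified stand-ins:

#include <stdio.h>

struct node {
	struct node*	parent;
	struct node*	next;	/* brother in the statement list */
	const char*	name;
};

struct thr {
	struct node*	run_node;	/* next node to execute */
	struct node*	prev_node;	/* last node executed; maintained
					for fidelity, the real
					que_thr_step() inspects it */
};

static int		x = 0;
static struct node	while_node = {NULL, NULL, "while"};
static struct node	body_stmt = {&while_node, NULL, "x := x + 1"};

/* Loop node: re-test the condition; on failure, hand control to the
parent, which is how loop termination is signalled. */
static void
while_step(struct thr* thr)
{
	thr->prev_node = &while_node;
	thr->run_node = (x < 3) ? &body_stmt : while_node.parent;
}

/* Statement node: run, then go to the brother or back to the parent. */
static void
assign_step(struct thr* thr)
{
	x = x + 1;
	printf("ran '%s', x = %d\n", body_stmt.name, x);

	thr->prev_node = &body_stmt;
	thr->run_node = body_stmt.next ? body_stmt.next : body_stmt.parent;
}

int
main(void)
{
	struct thr	thr = {&while_node, NULL};

	while (thr.run_node != NULL) {	/* NULL: the (absent) graph parent */
		if (thr.run_node == &while_node) {
			while_step(&thr);
		} else {
			assign_step(&thr);
		}
	}

	puts("while node returned control to its parent");
	return(0);
}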
Increments +the n_active_thrs counters of the query graph and transaction. +***NOTE***: This is the only function in which such a transition is allowed +to happen! */ +static +void +que_thr_move_to_run_state( +/*======================*/ + que_thr_t* thr); /*!< in: an query thread */ + +/***********************************************************************//** +Creates a query graph fork node. +@return own: fork node */ +UNIV_INTERN +que_fork_t* +que_fork_create( +/*============*/ + que_t* graph, /*!< in: graph, if NULL then this + fork node is assumed to be the + graph root */ + que_node_t* parent, /*!< in: parent node */ + ulint fork_type, /*!< in: fork type */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + que_fork_t* fork; + + ut_ad(heap); + + fork = static_cast<que_fork_t*>(mem_heap_zalloc(heap, sizeof(*fork))); + + fork->heap = heap; + + fork->fork_type = fork_type; + + fork->common.parent = parent; + + fork->common.type = QUE_NODE_FORK; + + fork->state = QUE_FORK_COMMAND_WAIT; + + fork->graph = (graph != NULL) ? graph : fork; + + return(fork); +} + +/***********************************************************************//** +Creates a query graph thread node. +@return own: query thread node */ +UNIV_INTERN +que_thr_t* +que_thr_create( +/*===========*/ + que_fork_t* parent, /*!< in: parent node, i.e., a fork node */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + que_thr_t* thr; + + ut_ad(parent && heap); + + thr = static_cast<que_thr_t*>(mem_heap_zalloc(heap, sizeof(*thr))); + + thr->graph = parent->graph; + + thr->common.parent = parent; + + thr->magic_n = QUE_THR_MAGIC_N; + + thr->common.type = QUE_NODE_THR; + + thr->state = QUE_THR_COMMAND_WAIT; + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + UT_LIST_ADD_LAST(thrs, parent->thrs, thr); + + return(thr); +} + +/**********************************************************************//** +Moves a suspended query thread to the QUE_THR_RUNNING state and may release +a worker thread to execute it. This function should be used to end +the wait state of a query thread waiting for a lock or a stored procedure +completion. +@return the query thread that needs to be released. */ +UNIV_INTERN +que_thr_t* +que_thr_end_lock_wait( +/*==================*/ + trx_t* trx) /*!< in: transaction with que_state in + QUE_THR_LOCK_WAIT */ +{ + que_thr_t* thr; + ibool was_active; + ulint sec; + ulint ms; + ib_uint64_t now; + + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(trx)); + + thr = trx->lock.wait_thr; + + ut_ad(thr != NULL); + + ut_ad(trx->lock.que_state == TRX_QUE_LOCK_WAIT); + /* In MySQL this is the only possible state here */ + ut_a(thr->state == QUE_THR_LOCK_WAIT); + + was_active = thr->is_active; + + que_thr_move_to_run_state(thr); + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + now = (ib_uint64_t)sec * 1000000 + ms; + trx->lock_que_wait_timer + += (ulint)(now - trx->lock_que_wait_ustarted); + } + + trx->lock.que_state = TRX_QUE_RUNNING; + + trx->lock.wait_thr = NULL; + + /* In MySQL we let the OS thread (not just the query thread) to wait + for the lock to be released: */ + + return((!was_active && thr != NULL) ? thr : NULL); +} + +/**********************************************************************//** +Inits a query thread for a command. 
*/ +UNIV_INLINE +void +que_thr_init_command( +/*=================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + que_thr_move_to_run_state(thr); +} + +/**********************************************************************//** +Round robin scheduler. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +UNIV_INTERN +que_thr_t* +que_fork_scheduler_round_robin( +/*===========================*/ + que_fork_t* fork, /*!< in: a query fork */ + que_thr_t* thr) /*!< in: current pos */ +{ + trx_mutex_enter(fork->trx); + + /* If no current, start first available. */ + if (thr == NULL) { + thr = UT_LIST_GET_FIRST(fork->thrs); + } else { + thr = UT_LIST_GET_NEXT(thrs, thr); + } + + if (thr) { + + fork->state = QUE_FORK_ACTIVE; + + fork->last_sel_node = NULL; + + switch (thr->state) { + case QUE_THR_COMMAND_WAIT: + case QUE_THR_COMPLETED: + ut_a(!thr->is_active); + que_thr_init_command(thr); + break; + + case QUE_THR_SUSPENDED: + case QUE_THR_LOCK_WAIT: + default: + ut_error; + + } + } + + trx_mutex_exit(fork->trx); + + return(thr); +} + +/**********************************************************************//** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +UNIV_INTERN +que_thr_t* +que_fork_start_command( +/*===================*/ + que_fork_t* fork) /*!< in: a query fork */ +{ + que_thr_t* thr; + que_thr_t* suspended_thr = NULL; + que_thr_t* completed_thr = NULL; + + fork->state = QUE_FORK_ACTIVE; + + fork->last_sel_node = NULL; + + suspended_thr = NULL; + completed_thr = NULL; + + /* Choose the query thread to run: usually there is just one thread, + but in a parallelized select, which necessarily is non-scrollable, + there may be several to choose from */ + + /* First we try to find a query thread in the QUE_THR_COMMAND_WAIT + state. Then we try to find a query thread in the QUE_THR_SUSPENDED + state, finally we try to find a query thread in the QUE_THR_COMPLETED + state */ + + /* We make a single pass over the thr list within which we note which + threads are ready to run. */ + for (thr = UT_LIST_GET_FIRST(fork->thrs); + thr != NULL; + thr = UT_LIST_GET_NEXT(thrs, thr)) { + + switch (thr->state) { + case QUE_THR_COMMAND_WAIT: + + /* We have to send the initial message to query thread + to start it */ + + que_thr_init_command(thr); + + return(thr); + + case QUE_THR_SUSPENDED: + /* In this case the execution of the thread was + suspended: no initial message is needed because + execution can continue from where it was left */ + if (!suspended_thr) { + suspended_thr = thr; + } + + break; + + case QUE_THR_COMPLETED: + if (!completed_thr) { + completed_thr = thr; + } + + break; + + case QUE_THR_LOCK_WAIT: + ut_error; + + } + } + + if (suspended_thr) { + + thr = suspended_thr; + que_thr_move_to_run_state(thr); + + } else if (completed_thr) { + + thr = completed_thr; + que_thr_init_command(thr); + } else { + ut_error; + } + + return(thr); +} + +/****************************************************************//** +Tests if all the query threads in the same fork have a given state. 
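que_fork_start_command() above makes a single pass but implements a three-level preference: a QUE_THR_COMMAND_WAIT thread wins immediately, otherwise the first suspended thread, otherwise the first completed one. The selection logic on its own, over an array of states (a stand-in enum, not the real constants):

#include <stdio.h>

enum thr_state { COMMAND_WAIT, SUSPENDED, COMPLETED, LOCK_WAIT };

/* Return the index of the thread to start, or -1 if none. */
static int
pick_thread(const enum thr_state* thrs, int n)
{
	int	suspended = -1;
	int	completed = -1;
	int	i;

	for (i = 0; i < n; i++) {
		switch (thrs[i]) {
		case COMMAND_WAIT:
			return(i);	/* highest preference: take it now */
		case SUSPENDED:
			if (suspended < 0) {
				suspended = i;	/* remember the first */
			}
			break;
		case COMPLETED:
			if (completed < 0) {
				completed = i;
			}
			break;
		case LOCK_WAIT:
			break;	/* the real code asserts this cannot occur */
		}
	}

	return(suspended >= 0 ? suspended : completed);
}

int
main(void)
{
	enum thr_state	thrs[] = {COMPLETED, SUSPENDED, COMMAND_WAIT};

	printf("picked thread %d\n", pick_thread(thrs, 3));	/* -> 2 */
	return(0);
}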
+@return TRUE if all the query threads in the same fork were in the +given state */ +UNIV_INLINE +ibool +que_fork_all_thrs_in_state( +/*=======================*/ + que_fork_t* fork, /*!< in: query fork */ + ulint state) /*!< in: state */ +{ + que_thr_t* thr_node; + + for (thr_node = UT_LIST_GET_FIRST(fork->thrs); + thr_node != NULL; + thr_node = UT_LIST_GET_NEXT(thrs, thr_node)) { + + if (thr_node->state != state) { + + return(FALSE); + } + } + + return(TRUE); +} + +/**********************************************************************//** +Calls que_graph_free_recursive for statements in a statement list. */ +static +void +que_graph_free_stat_list( +/*=====================*/ + que_node_t* node) /*!< in: first query graph node in the list */ +{ + while (node) { + que_graph_free_recursive(node); + + node = que_node_get_next(node); + } +} + +/**********************************************************************//** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations, they are freed in que_graph_free. */ +UNIV_INTERN +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node) /*!< in: query graph node */ +{ + que_fork_t* fork; + que_thr_t* thr; + undo_node_t* undo; + sel_node_t* sel; + ins_node_t* ins; + upd_node_t* upd; + tab_node_t* cre_tab; + ind_node_t* cre_ind; + purge_node_t* purge; + + if (node == NULL) { + + return; + } + + switch (que_node_get_type(node)) { + + case QUE_NODE_FORK: + fork = static_cast<que_fork_t*>(node); + + thr = UT_LIST_GET_FIRST(fork->thrs); + + while (thr) { + que_graph_free_recursive(thr); + + thr = UT_LIST_GET_NEXT(thrs, thr); + } + + break; + case QUE_NODE_THR: + + thr = static_cast<que_thr_t*>(node); + + if (thr->magic_n != QUE_THR_MAGIC_N) { + fprintf(stderr, + "que_thr struct appears corrupt;" + " magic n %lu\n", + (unsigned long) thr->magic_n); + mem_analyze_corruption(thr); + ut_error; + } + + thr->magic_n = QUE_THR_MAGIC_FREED; + + que_graph_free_recursive(thr->child); + + break; + case QUE_NODE_UNDO: + + undo = static_cast<undo_node_t*>(node); + + mem_heap_free(undo->heap); + + break; + case QUE_NODE_SELECT: + + sel = static_cast<sel_node_t*>(node); + + sel_node_free_private(sel); + + break; + case QUE_NODE_INSERT: + + ins = static_cast<ins_node_t*>(node); + + que_graph_free_recursive(ins->select); + + mem_heap_free(ins->entry_sys_heap); + + break; + case QUE_NODE_PURGE: + purge = static_cast<purge_node_t*>(node); + + mem_heap_free(purge->heap); + + break; + + case QUE_NODE_UPDATE: + + upd = static_cast<upd_node_t*>(node); + + if (upd->in_mysql_interface) { + + btr_pcur_free_for_mysql(upd->pcur); + } + + que_graph_free_recursive(upd->cascade_node); + + if (upd->cascade_heap) { + mem_heap_free(upd->cascade_heap); + } + + que_graph_free_recursive(upd->select); + + mem_heap_free(upd->heap); + + break; + case QUE_NODE_CREATE_TABLE: + cre_tab = static_cast<tab_node_t*>(node); + + que_graph_free_recursive(cre_tab->tab_def); + que_graph_free_recursive(cre_tab->col_def); + que_graph_free_recursive(cre_tab->commit_node); + + mem_heap_free(cre_tab->heap); + + break; + case QUE_NODE_CREATE_INDEX: + cre_ind = static_cast<ind_node_t*>(node); + + que_graph_free_recursive(cre_ind->ind_def); + que_graph_free_recursive(cre_ind->field_def); + que_graph_free_recursive(cre_ind->commit_node); + + mem_heap_free(cre_ind->heap); + + break; + case QUE_NODE_PROC: + que_graph_free_stat_list(((proc_node_t*) node)->stat_list); + + break; + case QUE_NODE_IF: + que_graph_free_stat_list(((if_node_t*) 
node)->stat_list); + que_graph_free_stat_list(((if_node_t*) node)->else_part); + que_graph_free_stat_list(((if_node_t*) node)->elsif_list); + + break; + case QUE_NODE_ELSIF: + que_graph_free_stat_list(((elsif_node_t*) node)->stat_list); + + break; + case QUE_NODE_WHILE: + que_graph_free_stat_list(((while_node_t*) node)->stat_list); + + break; + case QUE_NODE_FOR: + que_graph_free_stat_list(((for_node_t*) node)->stat_list); + + break; + + case QUE_NODE_ASSIGNMENT: + case QUE_NODE_EXIT: + case QUE_NODE_RETURN: + case QUE_NODE_COMMIT: + case QUE_NODE_ROLLBACK: + case QUE_NODE_LOCK: + case QUE_NODE_FUNC: + case QUE_NODE_ORDER: + case QUE_NODE_ROW_PRINTF: + case QUE_NODE_OPEN: + case QUE_NODE_FETCH: + /* No need to do anything */ + + break; + default: + fprintf(stderr, + "que_node struct appears corrupt; type %lu\n", + (unsigned long) que_node_get_type(node)); + mem_analyze_corruption(node); + ut_error; + } +} + +/**********************************************************************//** +Frees a query graph. */ +UNIV_INTERN +void +que_graph_free( +/*===========*/ + que_t* graph) /*!< in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ +{ + ut_ad(graph); + + if (graph->sym_tab) { + /* The following call frees dynamic memory allocated + for variables etc. during execution. It also frees explicit + cursor definitions. */ + + sym_tab_free_private(graph->sym_tab); + } + + if (graph->info && graph->info->graph_owns_us) { + pars_info_free(graph->info); + } + + que_graph_free_recursive(graph); + + mem_heap_free(graph->heap); +} + +/****************************************************************//** +Performs an execution step on a thr node. +@return query thread to run next, or NULL if none */ +static +que_thr_t* +que_thr_node_step( +/*==============*/ + que_thr_t* thr) /*!< in: query thread where run_node must + be the thread node itself */ +{ + ut_ad(thr->run_node == thr); + + if (thr->prev_node == thr->common.parent) { + /* If control to the node came from above, it is just passed + on */ + + thr->run_node = thr->child; + + return(thr); + } + + trx_mutex_enter(thr_get_trx(thr)); + + if (que_thr_peek_stop(thr)) { + + trx_mutex_exit(thr_get_trx(thr)); + + return(thr); + } + + /* Thread execution completed */ + + thr->state = QUE_THR_COMPLETED; + + trx_mutex_exit(thr_get_trx(thr)); + + return(NULL); +} + +/**********************************************************************//** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction if thr was +not active. +***NOTE***: This and ..._mysql are the only functions in which such a +transition is allowed to happen! */ +static +void +que_thr_move_to_run_state( +/*======================*/ + que_thr_t* thr) /*!< in: a query thread */ +{ + ut_ad(thr->state != QUE_THR_RUNNING); + + if (!thr->is_active) { + trx_t* trx; + + trx = thr_get_trx(thr); + + thr->graph->n_active_thrs++; + + trx->lock.n_active_thrs++; + + thr->is_active = TRUE; + } + + thr->state = QUE_THR_RUNNING; +} + +/**********************************************************************//** +Stops a query thread if the graph or trx is in a state requiring it. The +conditions are tested in the order (1) graph, (2) trx.
+@return TRUE if stopped */ +UNIV_INTERN +ibool +que_thr_stop( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + que_t* graph; + trx_t* trx = thr_get_trx(thr); + + graph = thr->graph; + + ut_ad(trx_mutex_own(trx)); + + if (graph->state == QUE_FORK_COMMAND_WAIT) { + + thr->state = QUE_THR_SUSPENDED; + + } else if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + trx->lock.wait_thr = thr; + thr->state = QUE_THR_LOCK_WAIT; + + } else if (trx->error_state != DB_SUCCESS + && trx->error_state != DB_LOCK_WAIT) { + + /* Error handling built for the MySQL interface */ + thr->state = QUE_THR_COMPLETED; + + } else if (graph->fork_type == QUE_FORK_ROLLBACK) { + + thr->state = QUE_THR_SUSPENDED; + } else { + ut_ad(graph->state == QUE_FORK_ACTIVE); + + return(FALSE); + } + + return(TRUE); +} + +/**********************************************************************//** +Decrements the query thread reference counts in the query graph and the +transaction. +*** NOTE ***: +This and que_thr_stop_for_mysql are the only functions where the reference +count can be decremented and this function may only be called from inside +que_run_threads! These restrictions exist to make the rollback code easier +to maintain. */ +static +void +que_thr_dec_refer_count( +/*====================*/ + que_thr_t* thr, /*!< in: query thread */ + que_thr_t** next_thr) /*!< in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + trx_t* trx; + que_fork_t* fork; + + trx = thr_get_trx(thr); + + ut_a(thr->is_active); + ut_ad(trx_mutex_own(trx)); + + if (thr->state == QUE_THR_RUNNING) { + + if (!que_thr_stop(thr)) { + + ut_a(next_thr != NULL && *next_thr == NULL); + + /* The reason for the thr suspension or wait was + already canceled before we came here: continue + running the thread. + + This is also possible because in trx_commit_step() we + assume a single query thread. We set the query thread + state to QUE_THR_RUNNING. */ + + /* fprintf(stderr, + "Wait already ended: trx: %p\n", trx); */ + + /* Normally srv_suspend_mysql_thread resets + the state to DB_SUCCESS before waiting, but + in this case we have to do it here, + otherwise nobody does it. */ + + trx->error_state = DB_SUCCESS; + + *next_thr = thr; + + return; + } + } + + fork = static_cast<que_fork_t*>(thr->common.parent); + + --trx->lock.n_active_thrs; + + --fork->n_active_thrs; + + thr->is_active = FALSE; +} + +/**********************************************************************//** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The +query thread is stopped and made inactive, except in the case where +it was put to the lock wait state in lock0lock.cc, but the lock has already +been granted or the transaction chosen as a victim in deadlock resolution. */ +UNIV_INTERN +void +que_thr_stop_for_mysql( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + + trx = thr_get_trx(thr); + + /* Can't be the purge transaction. 
*/ + ut_a(trx->id != 0); + + trx_mutex_enter(trx); + + if (thr->state == QUE_THR_RUNNING) { + + if (trx->error_state != DB_SUCCESS + && trx->error_state != DB_LOCK_WAIT) { + + /* Error handling built for the MySQL interface */ + thr->state = QUE_THR_COMPLETED; + } else { + /* It must have been a lock wait but the lock was + already released, or this transaction was chosen + as a victim in selective deadlock resolution */ + + trx_mutex_exit(trx); + + return; + } + } + + ut_ad(thr->is_active == TRUE); + ut_ad(trx->lock.n_active_thrs == 1); + ut_ad(thr->graph->n_active_thrs == 1); + + thr->is_active = FALSE; + thr->graph->n_active_thrs--; + + trx->lock.n_active_thrs--; + + trx_mutex_exit(trx); +} + +/**********************************************************************//** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction if thr was +not active. */ +UNIV_INTERN +void +que_thr_move_to_run_state_for_mysql( +/*================================*/ + que_thr_t* thr, /*!< in: a query thread */ + trx_t* trx) /*!< in: transaction */ +{ + if (thr->magic_n != QUE_THR_MAGIC_N) { + fprintf(stderr, + "que_thr struct appears corrupt; magic n %lu\n", + (unsigned long) thr->magic_n); + + mem_analyze_corruption(thr); + + ut_error; + } + + if (!thr->is_active) { + + thr->graph->n_active_thrs++; + + trx->lock.n_active_thrs++; + + thr->is_active = TRUE; + } + + thr->state = QUE_THR_RUNNING; +} + +/**********************************************************************//** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL +select, when there is no error or lock wait. */ +UNIV_INTERN +void +que_thr_stop_for_mysql_no_error( +/*============================*/ + que_thr_t* thr, /*!< in: query thread */ + trx_t* trx) /*!< in: transaction */ +{ + ut_ad(thr->state == QUE_THR_RUNNING); + ut_ad(thr_get_trx(thr)->id != 0); + ut_ad(thr->is_active == TRUE); + ut_ad(trx->lock.n_active_thrs == 1); + ut_ad(thr->graph->n_active_thrs == 1); + + if (thr->magic_n != QUE_THR_MAGIC_N) { + fprintf(stderr, + "que_thr struct appears corrupt; magic n %lu\n", + (unsigned long) thr->magic_n); + + mem_analyze_corruption(thr); + + ut_error; + } + + thr->state = QUE_THR_COMPLETED; + + thr->is_active = FALSE; + thr->graph->n_active_thrs--; + + trx->lock.n_active_thrs--; +} + +/****************************************************************//** +Get the first containing loop node (e.g. while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. +@return containing loop node, or NULL. */ +UNIV_INTERN +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + que_node_t* node) /*!< in: node */ +{ + ut_ad(node); + + for (;;) { + ulint type; + + node = que_node_get_parent(node); + + if (!node) { + break; + } + + type = que_node_get_type(node); + + if ((type == QUE_NODE_FOR) || (type == QUE_NODE_WHILE)) { + break; + } + } + + return(node); +} + +/**********************************************************************//** +Prints info of an SQL query graph node.
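In debug builds (UNIV_DEBUG) this is reached from que_thr_step() below whenever que_trace_on is set, producing one line per node about to be executed, of the shape (type number and address illustrative):

    To execute: Node type 17: SELECT, address 0x7f9c...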
*/ +UNIV_INTERN +void +que_node_print_info( +/*================*/ + que_node_t* node) /*!< in: query graph node */ +{ + ulint type; + const char* str; + + type = que_node_get_type(node); + + if (type == QUE_NODE_SELECT) { + str = "SELECT"; + } else if (type == QUE_NODE_INSERT) { + str = "INSERT"; + } else if (type == QUE_NODE_UPDATE) { + str = "UPDATE"; + } else if (type == QUE_NODE_WHILE) { + str = "WHILE"; + } else if (type == QUE_NODE_ASSIGNMENT) { + str = "ASSIGNMENT"; + } else if (type == QUE_NODE_IF) { + str = "IF"; + } else if (type == QUE_NODE_FETCH) { + str = "FETCH"; + } else if (type == QUE_NODE_OPEN) { + str = "OPEN"; + } else if (type == QUE_NODE_PROC) { + str = "STORED PROCEDURE"; + } else if (type == QUE_NODE_FUNC) { + str = "FUNCTION"; + } else if (type == QUE_NODE_LOCK) { + str = "LOCK"; + } else if (type == QUE_NODE_THR) { + str = "QUERY THREAD"; + } else if (type == QUE_NODE_COMMIT) { + str = "COMMIT"; + } else if (type == QUE_NODE_UNDO) { + str = "UNDO ROW"; + } else if (type == QUE_NODE_PURGE) { + str = "PURGE ROW"; + } else if (type == QUE_NODE_ROLLBACK) { + str = "ROLLBACK"; + } else if (type == QUE_NODE_CREATE_TABLE) { + str = "CREATE TABLE"; + } else if (type == QUE_NODE_CREATE_INDEX) { + str = "CREATE INDEX"; + } else if (type == QUE_NODE_FOR) { + str = "FOR LOOP"; + } else if (type == QUE_NODE_RETURN) { + str = "RETURN"; + } else if (type == QUE_NODE_EXIT) { + str = "EXIT"; + } else { + str = "UNKNOWN NODE TYPE"; + } + + fprintf(stderr, "Node type %lu: %s, address %p\n", + (ulong) type, str, (void*) node); +} + +/**********************************************************************//** +Performs an execution step on a query thread. +@return query thread to run next: it may differ from the input +parameter if, e.g., a subprocedure call is made */ +UNIV_INLINE +que_thr_t* +que_thr_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + que_node_t* node; + que_thr_t* old_thr; + trx_t* trx; + ulint type; + + trx = thr_get_trx(thr); + + ut_ad(thr->state == QUE_THR_RUNNING); + ut_a(trx->error_state == DB_SUCCESS); + + thr->resource++; + + node = thr->run_node; + type = que_node_get_type(node); + + old_thr = thr; + +#ifdef UNIV_DEBUG + if (que_trace_on) { + fputs("To execute: ", stderr); + que_node_print_info(node); + } +#endif + if (type & QUE_NODE_CONTROL_STAT) { + if ((thr->prev_node != que_node_get_parent(node)) + && que_node_get_next(thr->prev_node)) { + + /* The control statements, like WHILE, always pass the + control to the next child statement if there is any + child left */ + + thr->run_node = que_node_get_next(thr->prev_node); + + } else if (type == QUE_NODE_IF) { + if_step(thr); + } else if (type == QUE_NODE_FOR) { + for_step(thr); + } else if (type == QUE_NODE_PROC) { + + /* We can access trx->undo_no without reserving + trx->undo_mutex, because there cannot be active query + threads doing updating or inserting at the moment! 
*/ + + if (thr->prev_node == que_node_get_parent(node)) { + trx->last_sql_stat_start.least_undo_no + = trx->undo_no; + } + + proc_step(thr); + } else if (type == QUE_NODE_WHILE) { + while_step(thr); + } else { + ut_error; + } + } else if (type == QUE_NODE_ASSIGNMENT) { + assign_step(thr); + } else if (type == QUE_NODE_SELECT) { + thr = row_sel_step(thr); + } else if (type == QUE_NODE_INSERT) { + thr = row_ins_step(thr); + } else if (type == QUE_NODE_UPDATE) { + thr = row_upd_step(thr); + } else if (type == QUE_NODE_FETCH) { + thr = fetch_step(thr); + } else if (type == QUE_NODE_OPEN) { + thr = open_step(thr); + } else if (type == QUE_NODE_FUNC) { + proc_eval_step(thr); + + } else if (type == QUE_NODE_LOCK) { + + ut_error; + } else if (type == QUE_NODE_THR) { + thr = que_thr_node_step(thr); + } else if (type == QUE_NODE_COMMIT) { + thr = trx_commit_step(thr); + } else if (type == QUE_NODE_UNDO) { + thr = row_undo_step(thr); + } else if (type == QUE_NODE_PURGE) { + thr = row_purge_step(thr); + } else if (type == QUE_NODE_RETURN) { + thr = return_step(thr); + } else if (type == QUE_NODE_EXIT) { + thr = exit_step(thr); + } else if (type == QUE_NODE_ROLLBACK) { + thr = trx_rollback_step(thr); + } else if (type == QUE_NODE_CREATE_TABLE) { + thr = dict_create_table_step(thr); + } else if (type == QUE_NODE_CREATE_INDEX) { + thr = dict_create_index_step(thr); + } else if (type == QUE_NODE_ROW_PRINTF) { + thr = row_printf_step(thr); + } else { + ut_error; + } + + if (type == QUE_NODE_EXIT) { + old_thr->prev_node = que_node_get_containing_loop_node(node); + } else { + old_thr->prev_node = node; + } + + if (thr) { + ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS); + } + + return(thr); +} + +/**********************************************************************//** +Run a query thread until it finishes or encounters e.g. a lock wait. */ +static +void +que_run_threads_low( +/*================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + que_thr_t* next_thr; + + ut_ad(thr->state == QUE_THR_RUNNING); + ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS); + ut_ad(!trx_mutex_own(thr_get_trx(thr))); + + /* cumul_resource counts how much resources the OS thread (NOT the + query thread) has spent in this function */ + + trx = thr_get_trx(thr); + + do { + /* Check that there is enough space in the log to accommodate + possible log entries by this query step; if the operation can + touch more than about 4 pages, checks must be made also within + the query step! */ + + log_free_check(); + + /* Perform the actual query step: note that the query thread + may change if, e.g., a subprocedure call is made */ + + /*-------------------------*/ + next_thr = que_thr_step(thr); + /*-------------------------*/ + + trx_mutex_enter(trx); + + ut_a(next_thr == NULL || trx->error_state == DB_SUCCESS); + + if (next_thr != thr) { + ut_a(next_thr == NULL); + + /* This can change next_thr to a non-NULL value + if there was a lock wait that already completed. */ + + que_thr_dec_refer_count(thr, &next_thr); + + if (next_thr != NULL) { + + thr = next_thr; + } + } + + ut_ad(trx == thr_get_trx(thr)); + + trx_mutex_exit(trx); + + } while (next_thr != NULL); +} + +/**********************************************************************//** +Run a query thread. Handles lock waits. 
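A typical invocation, following the pattern used by que_eval_sql() later in this file (a sketch; error handling elided):

    que_thr_t* thr = que_fork_start_command(graph);
    ut_a(thr != NULL);
    que_run_threads(thr);

Afterwards the caller inspects trx->error_state to see whether the graph completed successfully.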
*/ +UNIV_INTERN +void +que_run_threads( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(!trx_mutex_own(thr_get_trx(thr))); + +loop: + ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS); + + que_run_threads_low(thr); + + switch (thr->state) { + + case QUE_THR_RUNNING: + /* There probably was a lock wait, but it already ended + before we came here: continue running thr */ + + goto loop; + + case QUE_THR_LOCK_WAIT: + lock_wait_suspend_thread(thr); + + trx_mutex_enter(thr_get_trx(thr)); + + ut_a(thr_get_trx(thr)->id != 0); + + if (thr_get_trx(thr)->error_state != DB_SUCCESS) { + /* thr was chosen as a deadlock victim or there was + a lock wait timeout */ + + que_thr_dec_refer_count(thr, NULL); + trx_mutex_exit(thr_get_trx(thr)); + break; + } + + trx_mutex_exit(thr_get_trx(thr)); + goto loop; + + case QUE_THR_COMPLETED: + case QUE_THR_COMMAND_WAIT: + /* Do nothing */ + break; + + default: + ut_error; + } +} + +/*********************************************************************//** +Evaluate the given SQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +que_eval_sql( +/*=========*/ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql, /*!< in: SQL string */ + ibool reserve_dict_mutex, + /*!< in: if TRUE, acquire/release + dict_sys->mutex around call to pars_sql. */ + trx_t* trx) /*!< in: trx */ +{ + que_thr_t* thr; + que_t* graph; + + ut_a(trx->error_state == DB_SUCCESS); + + if (UNIV_UNLIKELY(trx->fake_changes)) { + /* fake_changes should not access to system tables */ + fprintf(stderr, "InnoDB: ERROR: innodb_fake_changes tried to access to system tables.\n"); + return(DB_ERROR); + } + + if (reserve_dict_mutex) { + mutex_enter(&dict_sys->mutex); + } + + graph = pars_sql(info, sql); + + if (reserve_dict_mutex) { + mutex_exit(&dict_sys->mutex); + } + + ut_a(graph); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + if (reserve_dict_mutex) { + mutex_enter(&dict_sys->mutex); + } + + que_graph_free(graph); + + if (reserve_dict_mutex) { + mutex_exit(&dict_sys->mutex); + } + + return(trx->error_state); +} + +/*********************************************************************//** +Initialise the query sub-system. */ +UNIV_INTERN +void +que_init(void) +/*==========*/ +{ + /* No op */ +} + +/*********************************************************************//** +Close the query sub-system. */ +UNIV_INTERN +void +que_close(void) +/*===========*/ +{ + /* No op */ +} diff --git a/storage/xtradb/read/read0read.cc b/storage/xtradb/read/read0read.cc new file mode 100644 index 00000000000..9f0921ac6d5 --- /dev/null +++ b/storage/xtradb/read/read0read.cc @@ -0,0 +1,691 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file read/read0read.cc +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#include "read0read.h" +#include "read0i_s.h" + +#ifdef UNIV_NONINL +#include "read0read.ic" +#endif + +#include "srv0srv.h" +#include "trx0sys.h" + +/* +------------------------------------------------------------------------------- +FACT A: A cursor read view on a secondary index sees only committed versions +------- +of the records in the secondary index, or those versions of rows created +by the transaction that created the cursor before the cursor was created, even +if that transaction has later changed that clustered index page. + +PROOF: We must show that the read always goes to the clustered index record +to check that the record is visible in the cursor read view. Consider e.g. the +following table and SQL clauses: + +create table t1(a int not null, b int, primary key(a), index(b)); +insert into t1 values (1,1),(2,2); +commit; + +Now consider that we have a cursor for the query + +select b from t1 where b >= 1; + +This query will use the secondary key on the table t1. Now if, after the +first fetch on this cursor, we do an update: + +update t1 set b = 5 where b = 2; + +then the second fetch of the cursor should not see record (2,5); instead it +should see record (2,2). + +We should also show that after delete from t1 where b = 5; we can still +see record (2,2). + +When we access a secondary key record, the maximum transaction id is fetched +from this record and this trx_id is compared to up_limit_id in the view. +If the trx_id in the record is greater than or equal to up_limit_id in the +view, the clustered record is accessed. Because the trx_id of the creating +transaction was stored, when this view was created, in the list of +trx_ids not seen by this read view, a previous version of the +record is requested to be built. This is built using the clustered record. +If the secondary key record is delete-marked, its corresponding +clustered record can already have been purged only if the record's +trx_id < low_limit_no. Purge can't remove any record deleted by a +transaction which was active when the cursor was created. But we still +may have a deleted secondary key record and no clustered record. This +is not a problem, because this case is handled in the +row_sel_get_clust_rec() function, which is called +whenever we note that this read view does not see the trx_id in the +record. Thus, we see the correct version. Q. E. D. + +------------------------------------------------------------------------------- +FACT B: A cursor read view on a clustered index sees only committed versions +------- +of the records in the clustered index, or those versions of rows created +by the transaction that created the cursor before the cursor was created, even +if that transaction has later changed that clustered index page. + +PROOF: Consider e.g. the following table and SQL clauses: + +create table t1(a int not null, b int, primary key(a)); +insert into t1 values (1),(2); +commit; + +Now consider that we have a cursor for the query + +select a from t1 where a >= 1; + +This query will use the clustered key on the table t1.
Now if, after the first +fetch on this cursor, we do an update: + +update t1 set a = 5 where a = 2; + +then the second fetch of the cursor should not see record (5); instead it +should see record (2). + +We should also show that if we execute delete from t1 where a = 5; after +the cursor is opened, we can still see record (2). + +When accessing a clustered record we always check whether this read view sees +the trx_id stored in the clustered record. By default we don't see any changes +if the record's trx_id >= low_limit_id, i.e. the change was made by a +transaction which started after the transaction which created the cursor. If +the row was changed by such a future transaction, a previous version of the +clustered record is built. Thus we see only a committed version in +this case. We see all changes made by committed transactions, i.e. +record trx_id < up_limit_id; in this case we don't need to do anything, +we already see the correct version of the record. We don't see any changes +made by an active transaction except the creating transaction. We have stored +the trx_id of the creating transaction in the list of trx_ids when this view +was created, so we can easily see if this record was changed by the +creating transaction. Because we already have the clustered record, we can +access roll_ptr, and using this roll_ptr we can fetch the undo record. +We can now check that the undo_no of the undo record is less than the undo_no +of the transaction which created the view when the cursor was created. We see +this clustered record only in the case when the record's undo_no is less than +the undo_no in the view; if this is not true, we build, based on undo_rec, a +previous version of the record. This record can always be found, because +purge can't remove records accessed by an active transaction. Thus we see the +correct version. Q. E. D. +------------------------------------------------------------------------------- +FACT C: Purge does not remove any delete-marked row that is visible +------- +in any cursor read view. + +PROOF: We know that: + 1: Currently active read views in trx_sys_t::view_list are ordered by + read_view_t::low_limit_no in descending order, that is, + newest read view first. + + 2: Purge clones the oldest read view and uses that to determine whether there + are any active transactions that can see the to-be-purged records. + +Therefore any joining or active transaction will not have a view older +than the purge view, according to 1. + +When purge needs to remove a delete-marked row from a secondary index, +it will first check that the DB_TRX_ID value of the corresponding +record in the clustered index is older than the purge view. It will +also check if there is a newer version of the row (clustered index +record) that is not delete-marked in the secondary index. If such a +row exists and is collation-equal to the delete-marked secondary index +record then purge will not remove the secondary index record. + +Delete-marked clustered index records will be removed by +row_purge_remove_clust_if_poss(), unless the clustered index record +(and its DB_ROLL_PTR) has been updated. Every new version of the +clustered index record will update DB_ROLL_PTR, pointing to a new UNDO +log entry that allows the old version to be reconstructed. The +DB_ROLL_PTR in the oldest remaining version in the old-version chain +may be pointing to garbage (an undo log record discarded by purge), +but it will never be dereferenced, because the purge view is older +than any active transaction.
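To make the visibility rules in FACTs A and B concrete, here is a small self-contained model of the id-based checks (an editor's sketch with invented names, not the InnoDB API; the creator test and the sorted active-id search mirror creator_trx_id and the descriptors array used in this file):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct view_model {
        uint64_t up_limit_id;         /* smallest id active at view creation */
        uint64_t low_limit_id;        /* max_trx_id at view creation */
        uint64_t creator;             /* creating transaction */
        std::vector<uint64_t> active; /* sorted active ids, creator excluded */

        bool sees(uint64_t trx_id) const {
            if (trx_id == creator) return true;       /* own changes */
            if (trx_id < up_limit_id) return true;    /* committed before */
            if (trx_id >= low_limit_id) return false; /* started after */
            return !std::binary_search(active.begin(), active.end(),
                                       trx_id);       /* active => invisible */
        }
    };

    int main() {
        view_model v{5, 12, 9, {5}};
        assert(v.sees(3));   /* committed before the view */
        assert(v.sees(9));   /* the creator sees its own changes */
        assert(!v.sees(5));  /* still active when the view was created */
        assert(!v.sees(12)); /* serialized after the view */
    }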
+ +For details see: row_vers_old_has_index_entry() and row_purge_poss_sec() + +Some additional issues: + +What if trx_sys->view_list == NULL and some transaction T1 and Purge both +try to open a read_view at the same time? Only one can acquire trx_sys->mutex. +In which order will the views be opened? Should it matter? If not, why? + +The order does not matter. No new transactions can be created and no running +transaction can commit or rollback (or free views). +*/ + +/*********************************************************************//** +Creates a read view object. +@return own: read view struct */ +UNIV_INLINE +read_view_t* +read_view_create_low( +/*=================*/ + ulint n, /*!< in: number of cells in the trx_ids array */ + read_view_t*& view) /*!< in,out: pre-allocated view array or NULL if + a new one needs to be created */ +{ + if (view == NULL) { + view = static_cast<read_view_t*>( + ut_malloc(sizeof(read_view_t))); + os_atomic_increment_ulint(&srv_read_views_memory, + sizeof(read_view_t)); + view->max_descr = 0; + view->descriptors = NULL; + } + + if (UNIV_UNLIKELY(view->max_descr < n)) { + + /* avoid frequent re-allocations by extending the array to the + desired size + 10% */ + + os_atomic_increment_ulint(&srv_read_views_memory, + (n + n / 10 - view->max_descr) * + sizeof(trx_id_t)); + view->max_descr = n + n / 10; + view->descriptors = static_cast<trx_id_t*>( + ut_realloc(view->descriptors, + view->max_descr * + sizeof *view->descriptors)); + } + + view->n_descr = n; + + return(view); +} + +/*********************************************************************//** +Clones a read view object into a pre-allocated view (which is created if it +does not yet exist). The clone gets the same content as @param view but keeps +its own descriptors array, into which view's descriptors are copied. +@return read view struct */ +UNIV_INTERN +read_view_t* +read_view_clone( +/*============*/ + const read_view_t* view, /*!< in: view to clone */ + read_view_t*& prebuilt_clone) /*!< in,out: prebuilt view or + NULL */ +{ + read_view_t* clone; + trx_id_t* old_descriptors; + ulint old_max_descr; + + ut_ad(mutex_own(&trx_sys->mutex)); + + clone = read_view_create_low(view->n_descr, prebuilt_clone); + + old_descriptors = clone->descriptors; + old_max_descr = clone->max_descr; + + memcpy(clone, view, sizeof(*view)); + + clone->descriptors = old_descriptors; + clone->max_descr = old_max_descr; + + if (view->n_descr) { + memcpy(clone->descriptors, view->descriptors, + view->n_descr * sizeof(trx_id_t)); + } + + return(clone); +} + +/*********************************************************************//** +Insert the view in the proper order into the trx_sys->view_list. The +read view list is ordered by read_view_t::low_limit_no in descending order. */ +UNIV_INTERN +void +read_view_add( +/*==========*/ + read_view_t* view) /*!< in: view to add */ +{ + read_view_t* elem; + read_view_t* prev_elem; + + ut_ad(mutex_own(&trx_sys->mutex)); + ut_ad(read_view_validate(view)); + + /* Find the correct slot for insertion.
*/ + for (elem = UT_LIST_GET_FIRST(trx_sys->view_list), prev_elem = NULL; + elem != NULL && view->low_limit_no < elem->low_limit_no; + prev_elem = elem, elem = UT_LIST_GET_NEXT(view_list, elem)) { + /* No op */ + } + + if (prev_elem == NULL) { + UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view); + } else { + UT_LIST_INSERT_AFTER( + view_list, trx_sys->view_list, prev_elem, view); + } + + ut_ad(read_view_list_validate()); +} + +/*********************************************************************//** +Opens a read view where exactly the transactions serialized before this +point in time are seen in the view. +@return own: read view struct */ +static +read_view_t* +read_view_open_now_low( +/*===================*/ + trx_id_t cr_trx_id, /*!< in: trx_id of creating + transaction, or 0 used in purge */ + read_view_t*& view) /*!< in,out: pre-allocated view array or + NULL if a new one needs to be created */ +{ + trx_id_t* descr; + ulint i; + + ut_ad(mutex_own(&trx_sys->mutex)); + + view = read_view_create_low(trx_sys->descr_n_used, view); + + view->undo_no = 0; + view->type = VIEW_NORMAL; + view->creator_trx_id = cr_trx_id; + + /* No future transactions should be visible in the view */ + + view->low_limit_no = trx_sys->max_trx_id; + view->low_limit_id = view->low_limit_no; + + descr = trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + cr_trx_id); + if (UNIV_LIKELY(descr != NULL)) { + ut_ad(trx_sys->descr_n_used > 0); + ut_ad(view->n_descr > 0); + + view->n_descr--; + + i = descr - trx_sys->descriptors; + } else { + i = trx_sys->descr_n_used; + } + + if (UNIV_LIKELY(i > 0)) { + /* Copy the [0; i-1] range */ + memcpy(view->descriptors, trx_sys->descriptors, + i * sizeof(trx_id_t)); + } + + if (UNIV_UNLIKELY(i + 1 < trx_sys->descr_n_used)) { + /* Copy the [i+1; descr_n_used-1] range */ + memcpy(view->descriptors + i, + trx_sys->descriptors + i + 1, + (trx_sys->descr_n_used - i - 1) * + sizeof(trx_id_t)); + } + + /* NOTE that a transaction whose trx number is < trx_sys->max_trx_id can + still be active, if it is in the middle of its commit! Note that when a + transaction starts, we initialize trx->no to TRX_ID_MAX. */ + + if (UT_LIST_GET_LEN(trx_sys->trx_serial_list) > 0) { + + trx_id_t trx_no; + + trx_no = UT_LIST_GET_FIRST(trx_sys->trx_serial_list)->no; + + if (trx_no < view->low_limit_no) { + view->low_limit_no = trx_no; + } + } + + if (UNIV_LIKELY(view->n_descr > 0)) { + /* The last active transaction has the smallest id: */ + view->up_limit_id = view->descriptors[0]; + } else { + view->up_limit_id = view->low_limit_id; + } + + /* Purge views are not added to the view list. */ + if (cr_trx_id > 0) { + read_view_add(view); + } + + return(view); +} + +/*********************************************************************//** +Opens a read view where exactly the transactions serialized before this +point in time are seen in the view. 
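For example (a worked illustration of read_view_open_now_low above, not taken from the source): if trx_sys->descriptors = {5, 9} with descr_n_used = 2, max_trx_id = 12 and cr_trx_id = 9, the resulting view has descriptors = {5} (the creator's slot is skipped by the two memcpy calls), up_limit_id = 5, and, assuming trx_sys->trx_serial_list is empty, low_limit_id = low_limit_no = 12.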
+@return own: read view struct */ +UNIV_INTERN +read_view_t* +read_view_open_now( +/*===============*/ + trx_id_t cr_trx_id, /*!< in: trx_id of creating + transaction, or 0 used in purge */ + read_view_t*& view) /*!< in,out: pre-allocated view array or + NULL if a new one needs to be created */ +{ + mutex_enter(&trx_sys->mutex); + + view = read_view_open_now_low(cr_trx_id, view); + + mutex_exit(&trx_sys->mutex); + + return(view); +} + +/*********************************************************************//** +Makes a copy of the oldest existing read view, with the exception that also +the creating trx of the oldest view is set as not visible in the 'copied' +view. Opens a new view if no views currently exist. The view must be closed +with ..._close. This is used in purge. +@return own: read view struct */ +UNIV_INTERN +read_view_t* +read_view_purge_open( +/*=================*/ + read_view_t*& prebuilt_clone, /*!< in,out: pre-allocated view that + will be used to clone the oldest view if + exists */ + read_view_t*& prebuilt_view) /*!< in,out: pre-allocated view array or + NULL if a new one needs to be created */ +{ + ulint i; + read_view_t* view; + read_view_t* oldest_view; + trx_id_t creator_trx_id; + ulint insert_done = 0; + + mutex_enter(&trx_sys->mutex); + + oldest_view = UT_LIST_GET_LAST(trx_sys->view_list); + + if (oldest_view == NULL) { + + view = read_view_open_now_low(0, prebuilt_view); + + mutex_exit(&trx_sys->mutex); + + return(view); + } + + /* Clone the oldest view to a pre-allocated clone view */ + + oldest_view = read_view_clone(oldest_view, prebuilt_clone); + + ut_ad(read_view_validate(oldest_view)); + + mutex_exit(&trx_sys->mutex); + + ut_a(oldest_view->creator_trx_id > 0); + creator_trx_id = oldest_view->creator_trx_id; + + view = read_view_create_low(oldest_view->n_descr + 1, prebuilt_view); + + /* Add the creator transaction id in the trx_ids array in the + correct slot. */ + + for (i = 0; i < oldest_view->n_descr; ++i) { + trx_id_t id; + + id = oldest_view->descriptors[i - insert_done]; + + if (insert_done == 0 && creator_trx_id < id) { + id = creator_trx_id; + insert_done = 1; + } + + view->descriptors[i] = id; + } + + if (insert_done == 0) { + view->descriptors[i] = creator_trx_id; + } else { + ut_a(i > 0); + view->descriptors[i] = oldest_view->descriptors[i - 1]; + } + + view->creator_trx_id = 0; + + view->low_limit_no = oldest_view->low_limit_no; + view->low_limit_id = oldest_view->low_limit_id; + + if (view->n_descr > 0) { + /* The last active transaction has the smallest id: */ + + view->up_limit_id = view->descriptors[0]; + } else { + view->up_limit_id = oldest_view->up_limit_id; + } + + return(view); +} + +/*********************************************************************//** +Closes a consistent read view for MySQL. This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ +UNIV_INTERN +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx) /*!< in: trx which has a read view */ +{ + ut_a(trx->global_read_view); + + read_view_remove(trx->global_read_view, false); + + trx->read_view = NULL; + trx->global_read_view = NULL; +} + +/*********************************************************************//** +Prints a read view to file. 
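For the example view above (descriptors = {5}, limits 5 and 12) the output of this function would look like this (illustrative; the formats are taken from the fprintf calls below):

    Normal read view
    Read view low limit trx n:o 12
    Read view up limit trx id 5
    Read view low limit trx id 12
    Read view individually stored trx ids:
    Read view trx id 5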
*/ +UNIV_INTERN +void +read_view_print( +/*============*/ + FILE* file, /*!< in: file to print to */ + const read_view_t* view) /*!< in: read view */ +{ + ulint n_ids; + ulint i; + + if (view->type == VIEW_HIGH_GRANULARITY) { + fprintf(file, + "High-granularity read view undo_n:o " TRX_ID_FMT "\n", + view->undo_no); + } else { + fprintf(file, "Normal read view\n"); + } + + fprintf(file, "Read view low limit trx n:o " TRX_ID_FMT "\n", + view->low_limit_no); + + fprintf(file, "Read view up limit trx id " TRX_ID_FMT "\n", + view->up_limit_id); + + fprintf(file, "Read view low limit trx id " TRX_ID_FMT "\n", + view->low_limit_id); + + fprintf(file, "Read view individually stored trx ids:\n"); + + n_ids = view->n_descr; + + for (i = 0; i < n_ids; i++) { + fprintf(file, "Read view trx id " TRX_ID_FMT "\n", + view->descriptors[i]); + } +} + +UNIV_INTERN +i_s_xtradb_read_view_t* +read_fill_i_s_xtradb_read_view(i_s_xtradb_read_view_t* rv) +{ + read_view_t* view; + + mutex_enter(&trx_sys->mutex); + + if (UT_LIST_GET_LEN(trx_sys->view_list)) { + view = UT_LIST_GET_LAST(trx_sys->view_list); + } else { + mutex_exit(&trx_sys->mutex); + return NULL; + } + + if (view->type == VIEW_HIGH_GRANULARITY) { + rv->undo_no = view->undo_no; + } else { + rv->undo_no = ULINT_UNDEFINED; + } + + rv->low_limit_no = view->low_limit_no; + rv->up_limit_id = view->up_limit_id; + rv->low_limit_id = view->low_limit_id; + + mutex_exit(&trx_sys->mutex); + + return rv; +} + +/*********************************************************************//** +Frees resource allocated by a read view. */ +UNIV_INTERN +void +read_view_free( +/*===========*/ + read_view_t*& view) /*< in,out: read view */ +{ + if (view == NULL) { + + return; + } + + os_atomic_decrement_lint(&srv_read_views_memory, + sizeof(read_view_t) + + view->max_descr * sizeof(trx_id_t)); + + if (view->descriptors != NULL) { + ut_free(view->descriptors); + } + + ut_free(view); + + view = NULL; +} + +/*********************************************************************//** +Create a high-granularity consistent cursor view for mysql to be used +in cursors. In this consistent read view modifications done by the +creating transaction after the cursor is created or future transactions +are not visible. */ +UNIV_INTERN +cursor_view_t* +read_cursor_view_create_for_mysql( +/*==============================*/ + trx_t* cr_trx) /*!< in: trx where cursor view is created */ +{ + read_view_t* view; + mem_heap_t* heap; + cursor_view_t* curview; + + /* Use larger heap than in trx_create when creating a read_view + because cursors are quite long. */ + + heap = mem_heap_create(512); + + curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(*curview)); + + curview->heap = heap; + + /* Drop cursor tables from consideration when evaluating the + need of auto-commit */ + + curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use; + + cr_trx->n_mysql_tables_in_use = 0; + + mutex_enter(&trx_sys->mutex); + + curview->read_view = NULL; + read_view_open_now_low(UINT64_UNDEFINED, curview->read_view); + + view = curview->read_view; + view->undo_no = cr_trx->undo_no; + view->type = VIEW_HIGH_GRANULARITY; + + mutex_exit(&trx_sys->mutex); + + return(curview); +} + +/*********************************************************************//** +Close a given consistent cursor view for mysql and restore global read view +back to a transaction read view. 
*/ +UNIV_INTERN +void +read_cursor_view_close_for_mysql( +/*=============================*/ + trx_t* trx, /*!< in: trx */ + cursor_view_t* curview)/*!< in: cursor view to be closed */ +{ + ut_a(curview); + ut_a(curview->read_view); + ut_a(curview->heap); + + /* Add cursor's tables to the global count of active tables that + belong to this transaction */ + trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use; + + read_view_remove(curview->read_view, false); + read_view_free(curview->read_view); + + trx->read_view = trx->global_read_view; + + mem_heap_free(curview->heap); +} + +/*********************************************************************//** +This function sets a given consistent cursor view to a transaction +read view if given consistent cursor view is not NULL. Otherwise, function +restores a global read view to a transaction read view. */ +UNIV_INTERN +void +read_cursor_set_for_mysql( +/*======================*/ + trx_t* trx, /*!< in: transaction where cursor is set */ + cursor_view_t* curview)/*!< in: consistent cursor view to be set */ +{ + ut_a(trx); + + mutex_enter(&trx_sys->mutex); + + if (UNIV_LIKELY(curview != NULL)) { + trx->read_view = curview->read_view; + } else { + trx->read_view = trx->global_read_view; + } + + ut_ad(read_view_validate(trx->read_view)); + + mutex_exit(&trx_sys->mutex); +} diff --git a/storage/xtradb/rem/rem0cmp.cc b/storage/xtradb/rem/rem0cmp.cc new file mode 100644 index 00000000000..426cf9e3ac5 --- /dev/null +++ b/storage/xtradb/rem/rem0cmp.cc @@ -0,0 +1,1458 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file rem/rem0cmp.cc +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +#include "rem0cmp.h" + +#ifdef UNIV_NONINL +#include "rem0cmp.ic" +#endif + +#include "ha_prototypes.h" +#include "handler0alter.h" +#include "srv0srv.h" + +/* ALPHABETICAL ORDER + ================== + +The records are put into alphabetical order in the following +way: let F be the first field where two records disagree. +If there is a character in some position n where the +records disagree, the order is determined by comparison of +the characters at position n, possibly after +collating transformation. If there is no such character, +but the corresponding fields have different lengths, then +if the data type of the fields is paddable, +shorter field is padded with a padding character. If the +data type is not paddable, longer field is considered greater. +Finally, the SQL null is bigger than any other value. 
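For the paddable case, the rule above amounts to the following (a self-contained editor's sketch with invented names; the collating transformation is omitted):

    #include <cstddef>

    /* Pad the shorter field with 'pad' and compare byte by byte. */
    static int cmp_padded(const unsigned char* a, size_t a_len,
                          const unsigned char* b, size_t b_len,
                          unsigned char pad)
    {
        size_t n = a_len > b_len ? a_len : b_len;
        for (size_t i = 0; i < n; i++) {
            unsigned ca = i < a_len ? a[i] : pad;
            unsigned cb = i < b_len ? b[i] : pad;
            if (ca != cb) {
                return ca < cb ? -1 : 1;
            }
        }
        return 0; /* equal after padding */
    }

So 'abc' and 'abc ' compare equal under a space pad, while for a non-paddable type the longer field would win.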
+ +At the present, the comparison functions return 0 in the case, +where two records disagree only in the way that one +has more fields than the other. */ + +#ifdef UNIV_DEBUG +/*************************************************************//** +Used in debug checking of cmp_dtuple_... . +This function is used to compare a data tuple to a physical record. If +dtuple has n fields then rec must have either m >= n fields, or it must +differ from dtuple in some of the m fields rec has. +@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared */ +static +int +cmp_debug_dtuple_rec_with_match( +/*============================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n_cmp, /*!< in: number of fields to compare */ + ulint* matched_fields)/*!< in/out: number of already + completely matched fields; when function + returns, contains the value for current + comparison */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +This function is used to compare two data fields for which the data type +is such that we must use MySQL code to compare them. The prototype here +must be a copy of the one in ha_innobase.cc! +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +extern +int +innobase_mysql_cmp( +/*===============*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number, /*!< in: number of the charset */ + const unsigned char* a, /*!< in: data field */ + unsigned int a_length, /*!< in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /*!< in: data field */ + unsigned int b_length); /*!< in: data field length, + not UNIV_SQL_NULL */ +/*************************************************************//** +This function is used to compare two data fields for which the data type +is such that we must use MySQL code to compare them. The prototype here +must be a copy of the one in ha_innobase.cc! +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +extern +int +innobase_mysql_cmp_prefix( +/*======================*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number, /*!< in: number of the charset */ + const unsigned char* a, /*!< in: data field */ + unsigned int a_length, /*!< in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /*!< in: data field */ + unsigned int b_length); /*!< in: data field length, + not UNIV_SQL_NULL */ +/*********************************************************************//** +Transforms the character code so that it is ordered appropriately for the +language. This is only used for the latin1 char set. MySQL does the +comparisons for other char sets. +@return collation order position */ +UNIV_INLINE +ulint +cmp_collate( +/*========*/ + ulint code) /*!< in: code of a character stored in database record */ +{ + return((ulint) srv_latin1_ordering[code]); +} + +/*************************************************************//** +Returns TRUE if two columns are equal for comparison purposes. 
+@return TRUE if the columns are considered equal in comparisons */ +UNIV_INTERN +ibool +cmp_cols_are_equal( +/*===============*/ + const dict_col_t* col1, /*!< in: column 1 */ + const dict_col_t* col2, /*!< in: column 2 */ + ibool check_charsets) + /*!< in: whether to check charsets */ +{ + if (dtype_is_non_binary_string_type(col1->mtype, col1->prtype) + && dtype_is_non_binary_string_type(col2->mtype, col2->prtype)) { + + /* Both are non-binary string types: they can be compared if + and only if the charset-collation is the same */ + + if (check_charsets) { + return(dtype_get_charset_coll(col1->prtype) + == dtype_get_charset_coll(col2->prtype)); + } else { + return(TRUE); + } + } + + if (dtype_is_binary_string_type(col1->mtype, col1->prtype) + && dtype_is_binary_string_type(col2->mtype, col2->prtype)) { + + /* Both are binary string types: they can be compared */ + + return(TRUE); + } + + if (col1->mtype != col2->mtype) { + + return(FALSE); + } + + if (col1->mtype == DATA_INT + && (col1->prtype & DATA_UNSIGNED) + != (col2->prtype & DATA_UNSIGNED)) { + + /* The storage format of an unsigned integer is different + from a signed integer: in a signed integer we OR + 0x8000... to the value of positive integers. */ + + return(FALSE); + } + + return(col1->mtype != DATA_INT || col1->len == col2->len); +} + +/*************************************************************//** +Innobase uses this function to compare two data fields for which the data type +is such that we must compare whole fields or call MySQL to do the comparison +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +static +int +cmp_whole_field( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* a, /*!< in: data field */ + unsigned int a_length, /*!< in: data field length, + not UNIV_SQL_NULL */ + const byte* b, /*!< in: data field */ + unsigned int b_length) /*!< in: data field length, + not UNIV_SQL_NULL */ +{ + float f_1; + float f_2; + double d_1; + double d_2; + int swap_flag = 1; + + switch (mtype) { + + case DATA_DECIMAL: + /* Remove preceding spaces */ + for (; a_length && *a == ' '; a++, a_length--) { } + for (; b_length && *b == ' '; b++, b_length--) { } + + if (*a == '-') { + if (*b != '-') { + return(-1); + } + + a++; b++; + a_length--; + b_length--; + + swap_flag = -1; + + } else if (*b == '-') { + + return(1); + } + + while (a_length > 0 && (*a == '+' || *a == '0')) { + a++; a_length--; + } + + while (b_length > 0 && (*b == '+' || *b == '0')) { + b++; b_length--; + } + + if (a_length != b_length) { + if (a_length < b_length) { + return(-swap_flag); + } + + return(swap_flag); + } + + while (a_length > 0 && *a == *b) { + + a++; b++; a_length--; + } + + if (a_length == 0) { + + return(0); + } + + if (*a > *b) { + return(swap_flag); + } + + return(-swap_flag); + case DATA_DOUBLE: + d_1 = mach_double_read(a); + d_2 = mach_double_read(b); + + if (d_1 > d_2) { + return(1); + } else if (d_2 > d_1) { + return(-1); + } + + return(0); + + case DATA_FLOAT: + f_1 = mach_float_read(a); + f_2 = mach_float_read(b); + + if (f_1 > f_2) { + return(1); + } else if (f_2 > f_1) { + return(-1); + } + + return(0); + case DATA_BLOB: + if (prtype & DATA_BINARY_TYPE) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: comparing a binary BLOB" + " with a character set sensitive\n" + "InnoDB: comparison!\n"); + } + /* fall through */ + case DATA_VARMYSQL: + case DATA_MYSQL: + return(innobase_mysql_cmp( + (int)(prtype & DATA_MYSQL_TYPE_MASK), + (uint) 
dtype_get_charset_coll(prtype), + a, a_length, b, b_length)); + default: + fprintf(stderr, + "InnoDB: unknown type number %lu\n", + (ulong) mtype); + ut_error; + } + + return(0); +} + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INTERN +int +cmp_dfield_dfield_like_prefix( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2)/* in: data field */ +{ + const dtype_t* type; + int ret; + + ut_ad(dfield_check_typed(dfield1)); + + type = dfield_get_type(dfield1); + + if (type->mtype >= DATA_FLOAT) { + ret = innobase_mysql_cmp_prefix( + static_cast<int>(type->prtype & DATA_MYSQL_TYPE_MASK), + static_cast<uint>(dtype_get_charset_coll(type->prtype)), + static_cast<byte*>(dfield_get_data(dfield1)), + static_cast<uint>(dfield_get_len(dfield1)), + static_cast<byte*>(dfield_get_data(dfield2)), + static_cast<uint>(dfield_get_len(dfield2))); + } else { + ret = (cmp_data_data_like_prefix( + static_cast<byte*>(dfield_get_data(dfield1)), + dfield_get_len(dfield1), + static_cast<byte*>(dfield_get_data(dfield2)), + dfield_get_len(dfield2))); + } + + return(ret); +} + +/*************************************************************//** +This function is used to compare two data fields for which we know the +data type. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INTERN +int +cmp_data_data_slow( +/*===============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* data1, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len1, /*!< in: data field length or UNIV_SQL_NULL */ + const byte* data2, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len2) /*!< in: data field length or UNIV_SQL_NULL */ +{ + ulint data1_byte; + ulint data2_byte; + ulint cur_bytes; + + if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) { + + if (len1 == len2) { + + return(0); + } + + if (len1 == UNIV_SQL_NULL) { + /* We define the SQL null to be the smallest possible + value of a field in the alphabetical order */ + + return(-1); + } + + return(1); + } + + if (mtype >= DATA_FLOAT + || (mtype == DATA_BLOB + && 0 == (prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + + return(cmp_whole_field(mtype, prtype, + data1, (unsigned) len1, + data2, (unsigned) len2)); + } + + /* Compare then the fields */ + + cur_bytes = 0; + + for (;;) { + if (len1 <= cur_bytes) { + if (len2 <= cur_bytes) { + + return(0); + } + + data1_byte = dtype_get_pad_char(mtype, prtype); + + if (data1_byte == ULINT_UNDEFINED) { + + return(-1); + } + } else { + data1_byte = *data1; + } + + if (len2 <= cur_bytes) { + data2_byte = dtype_get_pad_char(mtype, prtype); + + if (data2_byte == ULINT_UNDEFINED) { + + return(1); + } + } else { + data2_byte = *data2; + } + + if (data1_byte == data2_byte) { + /* If the bytes are equal, they will remain such even + after the collation transformation below */ + + goto next_byte; + } + + if (mtype <= DATA_CHAR + || (mtype == DATA_BLOB + && 0 == (prtype & DATA_BINARY_TYPE))) { + + data1_byte = cmp_collate(data1_byte); + data2_byte = cmp_collate(data2_byte); + } + + if (data1_byte > data2_byte) { + + return(1); + } else if (data1_byte < data2_byte) { + + return(-1); + } +next_byte: + /* 
Next byte */ + cur_bytes++; + data1++; + data2++; + } + + return(0); /* Not reached */ +} + +/***************************************************************** +This function is used to compare two data fields for which we know the +data type to be VARCHAR */ + +int +cmp_data_data_slow_varchar( +/*=======================*/ + /* out: 1, 0, -1, if lhs is greater, equal, + less than rhs, respectively */ + const byte* lhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint lhs_len,/* in: data field length or UNIV_SQL_NULL */ + const byte* rhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint rhs_len)/* in: data field length or UNIV_SQL_NULL */ +{ + ulint i; + + ut_a(rhs_len != UNIV_SQL_NULL); + + if (lhs_len == UNIV_SQL_NULL) { + + /* We define the SQL null to be the smallest possible + value of a field in the alphabetical order */ + + return(-1); + } + + /* Compare the values.*/ + + for (i = 0; i < lhs_len && i < rhs_len; ++i, ++rhs, ++lhs) { + ulint lhs_byte = *lhs; + ulint rhs_byte = *rhs; + + if (lhs_byte != rhs_byte) { + /* If the bytes are equal, they will remain such even + after the collation transformation below */ + + lhs_byte = cmp_collate(lhs_byte); + rhs_byte = cmp_collate(rhs_byte); + + if (lhs_byte > rhs_byte) { + + return(1); + } else if (lhs_byte < rhs_byte) { + + return(-1); + } + } + } + + return((i == lhs_len && i == rhs_len) ? 0 : + static_cast<int>(rhs_len - lhs_len)); +} + +/***************************************************************** +This function is used to compare two data fields for which we know the +data type. The comparison is done for the LIKE operator.*/ + +int +cmp_data_data_slow_like_prefix( +/*===========================*/ + /* out: 1, 0, -1, if lhs is greater, equal, + less than rhs, respectively */ + const byte* lhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* rhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + ulint i; + + ut_a(len2 != UNIV_SQL_NULL); + + if (len1 == UNIV_SQL_NULL) { + + /* We define the SQL null to be the smallest possible + value of a field in the alphabetical order */ + + return(-1); + } + + /* Compare the values.*/ + + for (i = 0; i < len1 && i < len2; ++i, ++rhs, ++lhs) { + ulint lhs_byte = *lhs; + ulint rhs_byte = *rhs; + + if (lhs_byte != rhs_byte) { + /* If the bytes are equal, they will remain such even + after the collation transformation below */ + + lhs_byte = cmp_collate(lhs_byte); + rhs_byte = cmp_collate(rhs_byte); + + if (lhs_byte > rhs_byte) { + + return(1); + } else if (lhs_byte < rhs_byte) { + + return(-1); + } + } + } + + return(i == len2 ? 0 : 1); +} + +/***************************************************************** +This function is used to compare two data fields for which we know the +data type. 
The comparison is done for the LIKE operator.*/ + +int +cmp_data_data_slow_like_suffix( +/*===========================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + /* in: data field (== a pointer to a + memory buffer) */ + const byte* data1 UNIV_UNUSED, + /* in: data field length or UNIV_SQL_NULL */ + ulint len1 UNIV_UNUSED, + /* in: data field (== a pointer to a memory + buffer) */ + const byte* data2 UNIV_UNUSED, + /* in: data field length or UNIV_SQL_NULL */ + ulint len2 UNIV_UNUSED) + +{ + ut_error; // FIXME: + return(1); +} + +/***************************************************************** +This function is used to compare two data fields for which we know the +data type. The comparison is done for the LIKE operator.*/ + +int +cmp_data_data_slow_like_substr( +/*===========================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + /* in: data field (== a pointer to a + memory buffer) */ + const byte* data1 UNIV_UNUSED, + /* in: data field length or UNIV_SQL_NULL */ + ulint len1 UNIV_UNUSED, + /* in: data field (== a pointer to a memory + buffer) */ + const byte* data2 UNIV_UNUSED, + /* in: data field length or UNIV_SQL_NULL */ + ulint len2 UNIV_UNUSED) +{ + ut_error; // FIXME: + return(1); +} +/*************************************************************//** +This function is used to compare a data tuple to a physical record. +Only dtuple->n_fields_cmp first fields are taken into account for +the data tuple! If we denote by n = n_fields_cmp, then rec must +have either m >= n fields, or it must differ from dtuple in some of +the m fields rec has. If rec has an externally stored field we do not +compare it but return with value 0 if such a comparison should be +made. 
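A caller seeds the match counters once and can reuse them across successive comparisons (an editor's sketch using names from this file):

    ulint matched_fields = 0;
    ulint matched_bytes = 0;

    int cmp = cmp_dtuple_rec_with_match_low(
        dtuple, rec, offsets, dtuple_get_n_fields_cmp(dtuple),
        &matched_fields, &matched_bytes);

On return the counters tell how long a prefix of dtuple agreed with rec; the B-tree search code uses this to avoid re-comparing an already matched prefix against the next record.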
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared, or until +the first externally stored field in rec */ +UNIV_INTERN +int +cmp_dtuple_rec_with_match_low( +/*==========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n_cmp, /*!< in: number of fields to compare */ + ulint* matched_fields, /*!< in/out: number of already completely + matched fields; when function returns, + contains the value for current comparison */ + ulint* matched_bytes) /*!< in/out: number of already matched + bytes within the first field not completely + matched; when function returns, contains the + value for current comparison */ +{ + const dfield_t* dtuple_field; /* current field in logical record */ + ulint dtuple_f_len; /* the length of the current field + in the logical record */ + const byte* dtuple_b_ptr; /* pointer to the current byte in + logical field data */ + ulint dtuple_byte; /* value of current byte to be compared + in dtuple*/ + ulint rec_f_len; /* length of current field in rec */ + const byte* rec_b_ptr; /* pointer to the current byte in + rec field */ + ulint rec_byte; /* value of current byte to be + compared in rec */ + ulint cur_field; /* current field number */ + ulint cur_bytes; /* number of already matched bytes + in current field */ + int ret; /* return value */ + + ut_ad(dtuple && rec && matched_fields && matched_bytes); + ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + cur_field = *matched_fields; + cur_bytes = *matched_bytes; + + ut_ad(n_cmp > 0); + ut_ad(n_cmp <= dtuple_get_n_fields(dtuple)); + ut_ad(cur_field <= n_cmp); + ut_ad(cur_field <= rec_offs_n_fields(offsets)); + + if (cur_bytes == 0 && cur_field == 0) { + ulint rec_info = rec_get_info_bits(rec, + rec_offs_comp(offsets)); + ulint tup_info = dtuple_get_info_bits(dtuple); + + if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) { + ret = !(tup_info & REC_INFO_MIN_REC_FLAG); + goto order_resolved; + } else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) { + ret = -1; + goto order_resolved; + } + } + + /* Match fields in a loop; stop if we run out of fields in dtuple + or find an externally stored field */ + + while (cur_field < n_cmp) { + + ulint mtype; + ulint prtype; + + dtuple_field = dtuple_get_nth_field(dtuple, cur_field); + { + const dtype_t* type + = dfield_get_type(dtuple_field); + + mtype = type->mtype; + prtype = type->prtype; + } + + dtuple_f_len = dfield_get_len(dtuple_field); + + rec_b_ptr = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); + + /* If we have matched yet 0 bytes, it may be that one or + both the fields are SQL null, or the record or dtuple may be + the predefined minimum record, or the field is externally + stored */ + + if (UNIV_LIKELY(cur_bytes == 0)) { + if (rec_offs_nth_extern(offsets, cur_field)) { + /* We do not compare to an externally + stored field */ + + ret = 0; + + goto order_resolved; + } + + if (dtuple_f_len == UNIV_SQL_NULL) { + if (rec_f_len == UNIV_SQL_NULL) { + + goto next_field; + } + + ret = -1; + goto order_resolved; + } else if (rec_f_len == UNIV_SQL_NULL) { + /* We define the SQL null to be the + smallest possible value of a field + in the alphabetical order */ + + ret = 1; + goto order_resolved; + 
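+/* Editorial aside, not part of the original patch: the in/out pair
+matched_fields / matched_bytes lets a caller resume this comparison from a
+prefix already known to match, instead of from byte 0.  A hypothetical
+calling pattern, using the cmp_dtuple_rec_with_match wrapper seen below:
+
+	ulint	fields = 0;
+	ulint	bytes = 0;
+	int	c = cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+					      &fields, &bytes);
+
+On return, fields and bytes describe the common prefix of dtuple and rec;
+a later comparison against a neighbouring record on the same ordered page
+may feed them back in, so bytes already known equal are never scanned
+twice. */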
} + } + + if (mtype >= DATA_FLOAT + || (mtype == DATA_BLOB + && 0 == (prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + + ret = cmp_whole_field( + mtype, prtype, + static_cast<const byte*>( + dfield_get_data(dtuple_field)), + (unsigned) dtuple_f_len, + rec_b_ptr, (unsigned) rec_f_len); + + if (ret != 0) { + cur_bytes = 0; + + goto order_resolved; + } else { + goto next_field; + } + } + + /* Set the pointers at the current byte */ + + rec_b_ptr = rec_b_ptr + cur_bytes; + dtuple_b_ptr = (byte*) dfield_get_data(dtuple_field) + + cur_bytes; + /* Compare then the fields */ + + for (;;) { + if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) { + if (dtuple_f_len <= cur_bytes) { + + goto next_field; + } + + rec_byte = dtype_get_pad_char(mtype, prtype); + + if (rec_byte == ULINT_UNDEFINED) { + ret = 1; + + goto order_resolved; + } + } else { + rec_byte = *rec_b_ptr; + } + + if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) { + dtuple_byte = dtype_get_pad_char(mtype, + prtype); + + if (dtuple_byte == ULINT_UNDEFINED) { + ret = -1; + + goto order_resolved; + } + } else { + dtuple_byte = *dtuple_b_ptr; + } + + if (dtuple_byte == rec_byte) { + /* If the bytes are equal, they will + remain such even after the collation + transformation below */ + + goto next_byte; + } + + if (mtype <= DATA_CHAR + || (mtype == DATA_BLOB + && !(prtype & DATA_BINARY_TYPE))) { + + rec_byte = cmp_collate(rec_byte); + dtuple_byte = cmp_collate(dtuple_byte); + } + + ret = (int) (dtuple_byte - rec_byte); + if (UNIV_LIKELY(ret)) { + if (ret < 0) { + ret = -1; + goto order_resolved; + } else { + ret = 1; + goto order_resolved; + } + } +next_byte: + /* Next byte */ + cur_bytes++; + rec_b_ptr++; + dtuple_b_ptr++; + } + +next_field: + cur_field++; + cur_bytes = 0; + } + + ut_ad(cur_bytes == 0); + + ret = 0; /* If we ran out of fields, dtuple was equal to rec + up to the common fields */ +order_resolved: + ut_ad((ret >= - 1) && (ret <= 1)); + ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets, + n_cmp, matched_fields)); + ut_ad(*matched_fields == cur_field); /* In the debug version, the + above cmp_debug_... sets + *matched_fields to a value */ + *matched_fields = cur_field; + *matched_bytes = cur_bytes; + + return(ret); +} + +/**************************************************************//** +Compares a data tuple to a physical record. +@see cmp_dtuple_rec_with_match +@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */ +UNIV_INTERN +int +cmp_dtuple_rec( +/*===========*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint matched_fields = 0; + ulint matched_bytes = 0; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes)); +} + +/**************************************************************//** +Checks if a dtuple is a prefix of a record. The last field in dtuple +is allowed to be a prefix of the corresponding field in the record. 
+@return TRUE if prefix */ +UNIV_INTERN +ibool +cmp_dtuple_is_prefix_of_rec( +/*========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + ulint matched_fields = 0; + ulint matched_bytes = 0; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + n_fields = dtuple_get_n_fields(dtuple); + + if (n_fields > rec_offs_n_fields(offsets)) { + + return(FALSE); + } + + cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes); + if (matched_fields == n_fields) { + + return(TRUE); + } + + if (matched_fields == n_fields - 1 + && matched_bytes == dfield_get_len( + dtuple_get_nth_field(dtuple, n_fields - 1))) { + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************//** +Compare two physical record fields. +@retval 1 if rec1 field is greater than rec2 +@retval -1 if rec1 field is less than rec2 +@retval 0 if rec1 field equals to rec2 */ +static __attribute__((nonnull, warn_unused_result)) +int +cmp_rec_rec_simple_field( +/*=====================*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */ + const dict_index_t* index, /*!< in: data dictionary index */ + ulint n) /*!< in: field to compare */ +{ + const byte* rec1_b_ptr; + const byte* rec2_b_ptr; + ulint rec1_f_len; + ulint rec2_f_len; + const dict_col_t* col = dict_index_get_nth_col(index, n); + + ut_ad(!rec_offs_nth_extern(offsets1, n)); + ut_ad(!rec_offs_nth_extern(offsets2, n)); + + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, n, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, n, &rec2_f_len); + + if (rec1_f_len == UNIV_SQL_NULL || rec2_f_len == UNIV_SQL_NULL) { + if (rec1_f_len == rec2_f_len) { + return(0); + } + /* We define the SQL null to be the smallest possible + value of a field in the alphabetical order */ + return(rec1_f_len == UNIV_SQL_NULL ? 
-1 : 1); + } + + if (col->mtype >= DATA_FLOAT + || (col->mtype == DATA_BLOB + && !(col->prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(col->prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + return(cmp_whole_field(col->mtype, col->prtype, + rec1_b_ptr, (unsigned) rec1_f_len, + rec2_b_ptr, (unsigned) rec2_f_len)); + } + + /* Compare the fields */ + for (ulint cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) { + ulint rec1_byte; + ulint rec2_byte; + + if (rec2_f_len <= cur_bytes) { + if (rec1_f_len <= cur_bytes) { + return(0); + } + + rec2_byte = dtype_get_pad_char( + col->mtype, col->prtype); + + if (rec2_byte == ULINT_UNDEFINED) { + return(1); + } + } else { + rec2_byte = *rec2_b_ptr; + } + + if (rec1_f_len <= cur_bytes) { + rec1_byte = dtype_get_pad_char( + col->mtype, col->prtype); + + if (rec1_byte == ULINT_UNDEFINED) { + return(-1); + } + } else { + rec1_byte = *rec1_b_ptr; + } + + if (rec1_byte == rec2_byte) { + /* If the bytes are equal, they will remain such + even after the collation transformation below */ + continue; + } + + if (col->mtype <= DATA_CHAR + || (col->mtype == DATA_BLOB + && !(col->prtype & DATA_BINARY_TYPE))) { + + rec1_byte = cmp_collate(rec1_byte); + rec2_byte = cmp_collate(rec2_byte); + } + + if (rec1_byte < rec2_byte) { + return(-1); + } else if (rec1_byte > rec2_byte) { + return(1); + } + } +} + +/*************************************************************//** +Compare two physical records that contain the same number of columns, +none of which are stored externally. +@retval 1 if rec1 (including non-ordering columns) is greater than rec2 +@retval -1 if rec1 (including non-ordering columns) is less than rec2 +@retval 0 if rec1 is a duplicate of rec2 */ +UNIV_INTERN +int +cmp_rec_rec_simple( +/*===============*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */ + const dict_index_t* index, /*!< in: data dictionary index */ + struct TABLE* table) /*!< in: MySQL table, for reporting + duplicate key value if applicable, + or NULL */ +{ + ulint n; + ulint n_uniq = dict_index_get_n_unique(index); + bool null_eq = false; + + ut_ad(rec_offs_n_fields(offsets1) >= n_uniq); + ut_ad(rec_offs_n_fields(offsets2) == rec_offs_n_fields(offsets2)); + + ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); + + for (n = 0; n < n_uniq; n++) { + int cmp = cmp_rec_rec_simple_field( + rec1, rec2, offsets1, offsets2, index, n); + + if (cmp) { + return(cmp); + } + + /* If the fields are internally equal, they must both + be NULL or non-NULL. */ + ut_ad(rec_offs_nth_sql_null(offsets1, n) + == rec_offs_nth_sql_null(offsets2, n)); + + if (rec_offs_nth_sql_null(offsets1, n)) { + ut_ad(!(dict_index_get_nth_col(index, n)->prtype + & DATA_NOT_NULL)); + null_eq = true; + } + } + + /* If we ran out of fields, the ordering columns of rec1 were + equal to rec2. Issue a duplicate key error if needed. */ + + if (!null_eq && table && dict_index_is_unique(index)) { + /* Report erroneous row using new version of table. */ + innobase_rec_to_mysql(table, rec1, index, offsets1); + return(0); + } + + /* Else, keep comparing so that we have the full internal + order. 
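+(Editorial aside, not part of the original patch: the byte loop in
+cmp_rec_rec_simple_field above gives character columns trailing-pad
+semantics.  For example, assuming mtype DATA_CHAR, for which
+dtype_get_pad_char() yields the space character 0x20:
+
+	rec1 field = "AB"    (len 2)
+	rec2 field = "AB  "  (len 4)
+
+	cur_bytes 0..1:  'A' == 'A', 'B' == 'B'
+	cur_bytes 2..3:  rec1 exhausted, rec1_byte = pad = 0x20 == rec2_byte
+	result: 0, the values compare equal
+
+For types whose pad char is ULINT_UNDEFINED the exhausted field simply
+sorts first.)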
*/ + for (; n < dict_index_get_n_fields(index); n++) { + int cmp = cmp_rec_rec_simple_field( + rec1, rec2, offsets1, offsets2, index, n); + + if (cmp) { + return(cmp); + } + + /* If the fields are internally equal, they must both + be NULL or non-NULL. */ + ut_ad(rec_offs_nth_sql_null(offsets1, n) + == rec_offs_nth_sql_null(offsets2, n)); + } + + /* This should never be reached. Internally, an index must + never contain duplicate entries. */ + ut_ad(0); + return(0); +} + +/*************************************************************//** +This function is used to compare two physical records. Only the common +first fields are compared, and if an externally stored field is +encountered, then 0 is returned. +@return 1, 0, -1 if rec1 is greater, equal, less, respectively */ +UNIV_INTERN +int +cmp_rec_rec_with_match( +/*===================*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ + dict_index_t* index, /*!< in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ + ulint* matched_fields, /*!< in/out: number of already completely + matched fields; when the function returns, + contains the value the for current + comparison */ + ulint* matched_bytes) /*!< in/out: number of already matched + bytes within the first field not completely + matched; when the function returns, contains + the value for the current comparison */ +{ + ulint rec1_n_fields; /* the number of fields in rec */ + ulint rec1_f_len; /* length of current field in rec */ + const byte* rec1_b_ptr; /* pointer to the current byte + in rec field */ + ulint rec1_byte; /* value of current byte to be + compared in rec */ + ulint rec2_n_fields; /* the number of fields in rec */ + ulint rec2_f_len; /* length of current field in rec */ + const byte* rec2_b_ptr; /* pointer to the current byte + in rec field */ + ulint rec2_byte; /* value of current byte to be + compared in rec */ + ulint cur_field; /* current field number */ + ulint cur_bytes; /* number of already matched + bytes in current field */ + int ret = 0; /* return value */ + ulint comp; + + ut_ad(rec1 && rec2 && index); + ut_ad(rec_offs_validate(rec1, index, offsets1)); + ut_ad(rec_offs_validate(rec2, index, offsets2)); + ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); + + comp = rec_offs_comp(offsets1); + rec1_n_fields = rec_offs_n_fields(offsets1); + rec2_n_fields = rec_offs_n_fields(offsets2); + + cur_field = *matched_fields; + cur_bytes = *matched_bytes; + + /* Match fields in a loop */ + + while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) { + + ulint mtype; + ulint prtype; + + if (dict_index_is_univ(index)) { + /* This is for the insert buffer B-tree. 
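+(Editorial aside, not part of the original patch: no per-column dictionary
+information is available for insert buffer records, so every field is
+compared as mtype DATA_BINARY with prtype 0.  Since DATA_BINARY is above
+DATA_CHAR in the mtype numbering, cmp_collate() is never applied, and the
+loop below degenerates to a raw unsigned byte comparison; schematically,
+with p1/p2 and len1/len2 the two field pointers and lengths:
+
+	for (i = 0; i < len1 && i < len2; i++) {
+		if (p1[i] != p2[i]) {
+			return(p1[i] < p2[i] ? -1 : 1);
+		}
+	}
+
+with the usual padding rules resolving the case where one field runs out
+first.)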
*/ + mtype = DATA_BINARY; + prtype = 0; + } else { + const dict_col_t* col + = dict_index_get_nth_col(index, cur_field); + + mtype = col->mtype; + prtype = col->prtype; + } + + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, + cur_field, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, + cur_field, &rec2_f_len); + + if (cur_bytes == 0) { + if (cur_field == 0) { + /* Test if rec is the predefined minimum + record */ + if (UNIV_UNLIKELY(rec_get_info_bits(rec1, comp) + & REC_INFO_MIN_REC_FLAG)) { + + if (!(rec_get_info_bits(rec2, comp) + & REC_INFO_MIN_REC_FLAG)) { + ret = -1; + } + + goto order_resolved; + + } else if (UNIV_UNLIKELY + (rec_get_info_bits(rec2, comp) + & REC_INFO_MIN_REC_FLAG)) { + + ret = 1; + + goto order_resolved; + } + } + + if (rec_offs_nth_extern(offsets1, cur_field) + || rec_offs_nth_extern(offsets2, cur_field)) { + /* We do not compare to an externally + stored field */ + + goto order_resolved; + } + + if (rec1_f_len == UNIV_SQL_NULL + || rec2_f_len == UNIV_SQL_NULL) { + + if (rec1_f_len == rec2_f_len) { + /* This is limited to stats collection, + cannot use it for regular search */ + if (nulls_unequal) { + ret = -1; + } else { + goto next_field; + } + } else if (rec2_f_len == UNIV_SQL_NULL) { + + /* We define the SQL null to be the + smallest possible value of a field + in the alphabetical order */ + + ret = 1; + } else { + ret = -1; + } + + goto order_resolved; + } + } + + if (mtype >= DATA_FLOAT + || (mtype == DATA_BLOB + && 0 == (prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + + ret = cmp_whole_field(mtype, prtype, + rec1_b_ptr, + (unsigned) rec1_f_len, + rec2_b_ptr, + (unsigned) rec2_f_len); + if (ret != 0) { + cur_bytes = 0; + + goto order_resolved; + } else { + goto next_field; + } + } + + /* Set the pointers at the current byte */ + rec1_b_ptr = rec1_b_ptr + cur_bytes; + rec2_b_ptr = rec2_b_ptr + cur_bytes; + + /* Compare then the fields */ + for (;;) { + if (rec2_f_len <= cur_bytes) { + + if (rec1_f_len <= cur_bytes) { + + goto next_field; + } + + rec2_byte = dtype_get_pad_char(mtype, prtype); + + if (rec2_byte == ULINT_UNDEFINED) { + ret = 1; + + goto order_resolved; + } + } else { + rec2_byte = *rec2_b_ptr; + } + + if (rec1_f_len <= cur_bytes) { + rec1_byte = dtype_get_pad_char(mtype, prtype); + + if (rec1_byte == ULINT_UNDEFINED) { + ret = -1; + + goto order_resolved; + } + } else { + rec1_byte = *rec1_b_ptr; + } + + if (rec1_byte == rec2_byte) { + /* If the bytes are equal, they will remain + such even after the collation transformation + below */ + + goto next_byte; + } + + if (mtype <= DATA_CHAR + || (mtype == DATA_BLOB + && !(prtype & DATA_BINARY_TYPE))) { + + rec1_byte = cmp_collate(rec1_byte); + rec2_byte = cmp_collate(rec2_byte); + } + + if (rec1_byte < rec2_byte) { + ret = -1; + goto order_resolved; + } else if (rec1_byte > rec2_byte) { + ret = 1; + goto order_resolved; + } +next_byte: + /* Next byte */ + + cur_bytes++; + rec1_b_ptr++; + rec2_b_ptr++; + } + +next_field: + cur_field++; + cur_bytes = 0; + } + + ut_ad(cur_bytes == 0); + + /* If we ran out of fields, rec1 was equal to rec2 up + to the common fields */ + ut_ad(ret == 0); +order_resolved: + + ut_ad((ret >= - 1) && (ret <= 1)); + + *matched_fields = cur_field; + *matched_bytes = cur_bytes; + + return(ret); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Used in debug checking of cmp_dtuple_... . +This function is used to compare a data tuple to a physical record. 
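+(Editorial aside, not part of the original patch: this slow path exists as
+a reference oracle for the optimised comparator.  In debug builds,
+cmp_dtuple_rec_with_match_low above cross-checks its result with
+
+	ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets,
+						     n_cmp, matched_fields));
+
+so any divergence between the byte-at-a-time fast path and this
+field-at-a-time reference fires an assertion instead of silently breaking
+the B-tree ordering.)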
If +dtuple has n fields then rec must have either m >= n fields, or it must +differ from dtuple in some of the m fields rec has. If encounters an +externally stored field, returns 0. +@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared */ +static +int +cmp_debug_dtuple_rec_with_match( +/*============================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n_cmp, /*!< in: number of fields to compare */ + ulint* matched_fields) /*!< in/out: number of already + completely matched fields; when function + returns, contains the value for current + comparison */ +{ + const dfield_t* dtuple_field; /* current field in logical record */ + ulint dtuple_f_len; /* the length of the current field + in the logical record */ + const byte* dtuple_f_data; /* pointer to the current logical + field data */ + ulint rec_f_len; /* length of current field in rec */ + const byte* rec_f_data; /* pointer to the current rec field */ + int ret; /* return value */ + ulint cur_field; /* current field number */ + + ut_ad(dtuple && rec && matched_fields); + ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + ut_ad(n_cmp > 0); + ut_ad(n_cmp <= dtuple_get_n_fields(dtuple)); + ut_ad(*matched_fields <= n_cmp); + ut_ad(*matched_fields <= rec_offs_n_fields(offsets)); + + cur_field = *matched_fields; + + if (cur_field == 0) { + if (UNIV_UNLIKELY + (rec_get_info_bits(rec, rec_offs_comp(offsets)) + & REC_INFO_MIN_REC_FLAG)) { + + ret = !(dtuple_get_info_bits(dtuple) + & REC_INFO_MIN_REC_FLAG); + + goto order_resolved; + } + + if (UNIV_UNLIKELY + (dtuple_get_info_bits(dtuple) & REC_INFO_MIN_REC_FLAG)) { + ret = -1; + + goto order_resolved; + } + } + + /* Match fields in a loop; stop if we run out of fields in dtuple */ + + while (cur_field < n_cmp) { + + ulint mtype; + ulint prtype; + + dtuple_field = dtuple_get_nth_field(dtuple, cur_field); + { + const dtype_t* type + = dfield_get_type(dtuple_field); + + mtype = type->mtype; + prtype = type->prtype; + } + + dtuple_f_data = static_cast<const byte*>( + dfield_get_data(dtuple_field)); + + dtuple_f_len = dfield_get_len(dtuple_field); + + rec_f_data = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); + + if (rec_offs_nth_extern(offsets, cur_field)) { + /* We do not compare to an externally stored field */ + + ret = 0; + + goto order_resolved; + } + + ret = cmp_data_data(mtype, prtype, dtuple_f_data, dtuple_f_len, + rec_f_data, rec_f_len); + if (ret != 0) { + goto order_resolved; + } + + cur_field++; + } + + ret = 0; /* If we ran out of fields, dtuple was equal to rec + up to the common fields */ +order_resolved: + ut_ad((ret >= - 1) && (ret <= 1)); + + *matched_fields = cur_field; + + return(ret); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/xtradb/rem/rem0rec.cc b/storage/xtradb/rem/rem0rec.cc new file mode 100644 index 00000000000..0d7b7c16785 --- /dev/null +++ b/storage/xtradb/rem/rem0rec.cc @@ -0,0 +1,1963 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file rem/rem0rec.cc +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "rem0rec.h" + +#ifdef UNIV_NONINL +#include "rem0rec.ic" +#endif + +#include "page0page.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "fts0fts.h" + +/* PHYSICAL RECORD (OLD STYLE) + =========================== + +The physical record, which is the data type of all the records +found in index pages of the database, has the following format +(lower addresses and more significant bits inside a byte are below +represented on a higher text line): + +| offset of the end of the last field of data, the most significant + bit is set to 1 if and only if the field is SQL-null, + if the offset is 2-byte, then the second most significant + bit is set to 1 if the field is stored on another page: + mostly this will occur in the case of big BLOB fields | +... +| offset of the end of the first field of data + the SQL-null bit | +| 4 bits used to delete mark a record, and mark a predefined + minimum record in alphabetical order | +| 4 bits giving the number of records owned by this record + (this term is explained in page0page.h) | +| 13 bits giving the order number of this record in the + heap of the index page | +| 10 bits giving the number of fields in this record | +| 1 bit which is set to 1 if the offsets above are given in + one byte format, 0 if in two byte format | +| two bytes giving an absolute pointer to the next record in the page | +ORIGIN of the record +| first field of data | +... +| last field of data | + +The origin of the record is the start address of the first field +of data. The offsets are given relative to the origin. +The offsets of the data fields are stored in an inverted +order because then the offset of the first fields are near the +origin, giving maybe a better processor cache hit rate in searches. + +The offsets of the data fields are given as one-byte +(if there are less than 127 bytes of data in the record) +or two-byte unsigned integers. The most significant bit +is not part of the offset, instead it indicates the SQL-null +if the bit is set to 1. */ + +/* PHYSICAL RECORD (NEW STYLE) + =========================== + +The physical record, which is the data type of all the records +found in index pages of the database, has the following format +(lower addresses and more significant bits inside a byte are below +represented on a higher text line): + +| length of the last non-null variable-length field of data: + if the maximum length is 255, one byte; otherwise, + 0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes, + length=128..16383, extern storage flag) | +... 
+| length of first variable-length field of data | +| SQL-null flags (1 bit per nullable field), padded to full bytes | +| 4 bits used to delete mark a record, and mark a predefined + minimum record in alphabetical order | +| 4 bits giving the number of records owned by this record + (this term is explained in page0page.h) | +| 13 bits giving the order number of this record in the + heap of the index page | +| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree), + 010=infimum, 011=supremum, 1xx=reserved | +| two bytes giving a relative pointer to the next record in the page | +ORIGIN of the record +| first field of data | +... +| last field of data | + +The origin of the record is the start address of the first field +of data. The offsets are given relative to the origin. +The offsets of the data fields are stored in an inverted +order because then the offset of the first fields are near the +origin, giving maybe a better processor cache hit rate in searches. + +The offsets of the data fields are given as one-byte +(if there are less than 127 bytes of data in the record) +or two-byte unsigned integers. The most significant bit +is not part of the offset, instead it indicates the SQL-null +if the bit is set to 1. */ + +/* CANONICAL COORDINATES. A record can be seen as a single +string of 'characters' in the following way: catenate the bytes +in each field, in the order of fields. An SQL-null field +is taken to be an empty sequence of bytes. Then after +the position of each field insert in the string +the 'character' <FIELD-END>, except that after an SQL-null field +insert <NULL-FIELD-END>. Now the ordinal position of each +byte in this canonical string is its canonical coordinate. +So, for the record ("AA", SQL-NULL, "BB", ""), the canonical +string is "AA<FIELD_END><NULL-FIELD-END>BB<FIELD-END><FIELD-END>". +We identify prefixes (= initial segments) of a record +with prefixes of the canonical string. The canonical +length of the prefix is the length of the corresponding +prefix of the canonical string. The canonical length of +a record is the length of its canonical string. + +For example, the maximal common prefix of records +("AA", SQL-NULL, "BB", "C") and ("AA", SQL-NULL, "B", "C") +is "AA<FIELD-END><NULL-FIELD-END>B", and its canonical +length is 5. + +A complete-field prefix of a record is a prefix which ends at the +end of some field (containing also <FIELD-END>). +A record is a complete-field prefix of another record, if +the corresponding canonical strings have the same property. */ + +/* this is used to fool compiler in rec_validate */ +UNIV_INTERN ulint rec_dummy; + +/***************************************************************//** +Validates the consistency of an old-style physical record. +@return TRUE if ok */ +static +ibool +rec_validate_old( +/*=============*/ + const rec_t* rec); /*!< in: physical record */ + +/******************************************************//** +Determine how many of the first n columns in a compact +physical record are stored externally. 
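+(Editorial aside, not part of the original patch: one entry of the
+variable-length column array is decoded as sketched below; lens walks
+downwards, exactly as in the loop that follows:
+
+	len = *lens--;
+	if (len & 0x80) {
+		extern_flag = (len & 0x40) != 0;
+		len = ((len << 8) | *lens--) & 0x3fff;
+	}
+
+The two-byte form can only occur for columns whose maximum length exceeds
+255 bytes or whose mtype is DATA_BLOB; everything else always fits the
+one-byte form.)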
+@return number of externally stored columns */ +UNIV_INTERN +ulint +rec_get_n_extern_new( +/*=================*/ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ +{ + const byte* nulls; + const byte* lens; + ulint null_mask; + ulint n_extern; + ulint i; + + ut_ad(dict_table_is_comp(index->table)); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index)); + + if (n == ULINT_UNDEFINED) { + n = dict_index_get_n_fields(index); + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + null_mask = 1; + n_extern = 0; + i = 0; + + /* read the lengths of fields 0..n */ + do { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint len; + + if (!(col->prtype & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. */ + continue; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + len = *lens--; + /* If the maximum length of the field is up + to 255 bytes, the actual length is always + stored in one byte. If the maximum length is + more than 255 bytes, the actual length is + stored in one byte for 0..127. The length + will be encoded in two bytes when it is 128 or + more, or when the field is stored externally. */ + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + if (len & 0x40) { + n_extern++; + } + lens--; + } + } + } + } while (++i < n); + + return(n_extern); +} + +/******************************************************//** +Determine the offset to each field in a leaf-page record +in ROW_FORMAT=COMPACT. This is a special case of +rec_init_offsets() and rec_get_offsets_func(). */ +UNIV_INLINE __attribute__((nonnull)) +void +rec_init_offsets_comp_ordinary( +/*===========================*/ + const rec_t* rec, /*!< in: physical record in + ROW_FORMAT=COMPACT */ + bool temp, /*!< in: whether to use the + format for temporary files in + index creation */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + ulint i = 0; + ulint offs = 0; + ulint any_ext = 0; + ulint n_null = index->n_nullable; + const byte* nulls = temp + ? rec - 1 + : rec - (1 + REC_N_NEW_EXTRA_BYTES); + const byte* lens = nulls - UT_BITS_IN_BYTES(n_null); + ulint null_mask = 1; + +#ifdef UNIV_DEBUG + /* We cannot invoke rec_offs_make_valid() here if temp=true. + Similarly, rec_offs_validate() will fail in that case, because + it invokes rec_get_status(). */ + offsets[2] = (ulint) rec; + offsets[3] = (ulint) index; +#endif /* UNIV_DEBUG */ + + ut_ad(temp || dict_table_is_comp(index->table)); + + if (temp && dict_table_is_comp(index->table)) { + /* No need to do adjust fixed_len=0. We only need to + adjust it for ROW_FORMAT=REDUNDANT. 
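+(Editorial aside, not part of the original patch: the null flags consumed
+in the loop below live in a bitmap growing downwards from the record
+origin, one bit per nullable column, and the walk is, schematically:
+
+	if (!(byte) null_mask) {
+		nulls--;
+		null_mask = 1;
+	}
+	is_null = (*nulls & null_mask) != 0;
+	null_mask <<= 1;
+
+A NULL column consumes its bit but contributes no length entry and no data
+bytes.)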
*/ + temp = false; + } + + /* read the lengths of fields 0..n */ + do { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint len; + + if (!(col->prtype & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + ut_ad(n_null--); + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. */ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (!field->fixed_len + || (temp && !dict_col_get_fixed_size(col, temp))) { + /* Variable-length field: read the length */ + len = *lens--; + /* If the maximum length of the field is up + to 255 bytes, the actual length is always + stored in one byte. If the maximum length is + more than 255 bytes, the actual length is + stored in one byte for 0..127. The length + will be encoded in two bytes when it is 128 or + more, or when the field is stored externally. */ + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype + == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens--; + + offs += len & 0x3fff; + if (UNIV_UNLIKELY(len + & 0x4000)) { + ut_ad(dict_index_is_clust + (index)); + any_ext = REC_OFFS_EXTERNAL; + len = offs + | REC_OFFS_EXTERNAL; + } else { + len = offs; + } + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } +resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + *rec_offs_base(offsets) + = (rec - (lens + 1)) | REC_OFFS_COMPACT | any_ext; +} + +/******************************************************//** +The following function determines the offsets to each field in the +record. The offsets are written to a previously allocated array of +ulint, where rec_offs_n_fields(offsets) has been initialized to the +number of fields in the record. The rest of the array will be +initialized by this function. rec_offs_base(offsets)[0] will be set +to the extra size (if REC_OFFS_COMPACT is set, the record is in the +new format; if REC_OFFS_EXTERNAL is set, the record contains externally +stored columns), and rec_offs_base(offsets)[1..n_fields] will be set to +offsets past the end of fields 0..n_fields, or to the beginning of +fields 1..n_fields+1. When the high-order bit of the offset at [i+1] +is set (REC_OFFS_SQL_NULL), the field i is NULL. When the second +high-order bit of the offset at [i+1] is set (REC_OFFS_EXTERNAL), the +field i is being stored externally. 
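+(Editorial aside, not part of the original patch: a consumer never parses
+this layout by hand; field i of a record is read through the accessors
+used throughout this file, schematically:
+
+	data = rec_get_nth_field(rec, offsets, i, &len);
+	if (len == UNIV_SQL_NULL) {
+		... field i is NULL ...
+	} else if (rec_offs_nth_extern(offsets, i)) {
+		... only a local prefix plus an off-page pointer is here ...
+	}
+
+which hides the flag bits described above.)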
*/ +static +void +rec_init_offsets( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + ulint i = 0; + ulint offs; + + rec_offs_make_valid(rec, index, offsets); + + if (dict_table_is_comp(index->table)) { + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint status = rec_get_status(rec); + ulint n_node_ptr_field = ULINT_UNDEFINED; + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* the field is 8 bytes long */ + rec_offs_base(offsets)[0] + = REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT; + rec_offs_base(offsets)[1] = 8; + return; + case REC_STATUS_NODE_PTR: + n_node_ptr_field + = dict_index_get_n_unique_in_tree(index); + break; + case REC_STATUS_ORDINARY: + rec_init_offsets_comp_ordinary( + rec, false, index, offsets); + return; + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + offs = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + do { + ulint len; + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + len = offs += REC_NODE_PTR_SIZE; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + if (!(dict_field_get_col(field)->prtype + & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. */ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + const dict_col_t* col + = dict_field_get_col(field); + len = *lens--; + /* If the maximum length of the field + is up to 255 bytes, the actual length + is always stored in one byte. If the + maximum length is more than 255 bytes, + the actual length is stored in one + byte for 0..127. The length will be + encoded in two bytes when it is 128 or + more, or when the field is stored + externally. */ + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype + == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + + len <<= 8; + len |= *lens--; + + /* B-tree node pointers + must not contain externally + stored columns. Thus + the "e" flag must be 0. 
*/ + ut_a(!(len & 0x4000)); + offs += len & 0x3fff; + len = offs; + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } +resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + *rec_offs_base(offsets) + = (rec - (lens + 1)) | REC_OFFS_COMPACT; + } else { + /* Old-style record: determine extra size and end offsets */ + offs = REC_N_OLD_EXTRA_BYTES; + if (rec_get_1byte_offs_flag(rec)) { + offs += rec_offs_n_fields(offsets); + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + do { + offs = rec_1_get_field_end_info(rec, i); + if (offs & REC_1BYTE_SQL_NULL_MASK) { + offs &= ~REC_1BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + rec_offs_base(offsets)[1 + i] = offs; + } while (++i < rec_offs_n_fields(offsets)); + } else { + offs += 2 * rec_offs_n_fields(offsets); + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + do { + offs = rec_2_get_field_end_info(rec, i); + if (offs & REC_2BYTE_SQL_NULL_MASK) { + offs &= ~REC_2BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + if (offs & REC_2BYTE_EXTERN_MASK) { + offs &= ~REC_2BYTE_EXTERN_MASK; + offs |= REC_OFFS_EXTERNAL; + *rec_offs_base(offsets) |= REC_OFFS_EXTERNAL; + } + rec_offs_base(offsets)[1 + i] = offs; + } while (++i < rec_offs_n_fields(offsets)); + } + } +} + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously returned array. +@return the new offsets */ +UNIV_INTERN +ulint* +rec_get_offsets_func( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets,/*!< in/out: array consisting of + offsets[0] allocated elements, + or an array from rec_get_offsets(), + or NULL */ + ulint n_fields,/*!< in: maximum number of + initialized fields + (ULINT_UNDEFINED if all fields) */ +#ifdef UNIV_DEBUG + const char* file, /*!< in: file name where called */ + ulint line, /*!< in: line number where called */ +#endif /* UNIV_DEBUG */ + mem_heap_t** heap) /*!< in/out: memory heap */ +{ + ulint n; + ulint size; + + ut_ad(rec); + ut_ad(index); + ut_ad(heap); + + if (dict_table_is_comp(index->table)) { + switch (UNIV_EXPECT(rec_get_status(rec), + REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + n = dict_index_get_n_fields(index); + break; + case REC_STATUS_NODE_PTR: + /* Node pointer records consist of the + uniquely identifying fields of the record + followed by a child page number field. */ + n = dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record */ + n = 1; + break; + default: + ut_error; + return(NULL); + } + } else { + n = rec_get_n_fields_old(rec); + } + + if (UNIV_UNLIKELY(n_fields < n)) { + n = n_fields; + } + + /* The offsets header consists of the allocation size at + offsets[0] and the REC_OFFS_HEADER_SIZE bytes. 
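+(Editorial aside, not part of the original patch: the usual calling
+pattern, also visible in the debug block of rec_convert_dtuple_to_rec
+later in this file, starts with a stack array and lets this function
+spill to a heap only when the record has too many fields:
+
+	mem_heap_t*	heap = NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+	... use offsets ...
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+)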
*/ + size = n + (1 + REC_OFFS_HEADER_SIZE); + + if (UNIV_UNLIKELY(!offsets) + || UNIV_UNLIKELY(rec_offs_get_n_alloc(offsets) < size)) { + if (UNIV_UNLIKELY(!*heap)) { + *heap = mem_heap_create_at(size * sizeof(ulint), + file, line); + } + offsets = static_cast<ulint*>( + mem_heap_alloc(*heap, size * sizeof(ulint))); + + rec_offs_set_n_alloc(offsets, size); + } + + rec_offs_set_n_fields(offsets, n); + rec_init_offsets(rec, index, offsets); + return(offsets); +} + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ +UNIV_INTERN +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /*!< in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint node_ptr,/*!< in: nonzero=node pointer, + 0=leaf node */ + ulint* offsets)/*!< in/out: array consisting of + offsets[0] allocated elements */ +{ + ulint n; + ulint i; + ulint offs; + ulint any_ext; + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint n_node_ptr_field; + + ut_ad(extra); + ut_ad(index); + ut_ad(offsets); + ut_ad(dict_table_is_comp(index->table)); + + if (UNIV_UNLIKELY(node_ptr)) { + n_node_ptr_field = dict_index_get_n_unique_in_tree(index); + n = n_node_ptr_field + 1; + } else { + n_node_ptr_field = ULINT_UNDEFINED; + n = dict_index_get_n_fields(index); + } + + ut_a(rec_offs_get_n_alloc(offsets) >= n + (1 + REC_OFFS_HEADER_SIZE)); + rec_offs_set_n_fields(offsets, n); + + nulls = extra; + lens = nulls + UT_BITS_IN_BYTES(index->n_nullable); + i = offs = 0; + null_mask = 1; + any_ext = 0; + + /* read the lengths of fields 0..n */ + do { + ulint len; + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + len = offs += REC_NODE_PTR_SIZE; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls++; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. */ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + const dict_col_t* col + = dict_field_get_col(field); + len = *lens++; + /* If the maximum length of the field is up + to 255 bytes, the actual length is always + stored in one byte. If the maximum length is + more than 255 bytes, the actual length is + stored in one byte for 0..127. The length + will be encoded in two bytes when it is 128 or + more, or when the field is stored externally. 
*/ + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens++; + + offs += len & 0x3fff; + if (UNIV_UNLIKELY(len & 0x4000)) { + any_ext = REC_OFFS_EXTERNAL; + len = offs | REC_OFFS_EXTERNAL; + } else { + len = offs; + } + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } +resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + ut_ad(lens >= extra); + *rec_offs_base(offsets) = (lens - extra + REC_N_NEW_EXTRA_BYTES) + | REC_OFFS_COMPACT | any_ext; +} + +/************************************************************//** +The following function is used to get the offset to the nth +data field in an old-style record. +@return offset to the field */ +UNIV_INTERN +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; + UNIV_SQL_NULL if SQL null */ +{ + ulint os; + ulint next_os; + + ut_ad(len); + ut_a(rec); + ut_a(n < rec_get_n_fields_old(rec)); + + if (rec_get_1byte_offs_flag(rec)) { + os = rec_1_get_field_start_offs(rec, n); + + next_os = rec_1_get_field_end_info(rec, n); + + if (next_os & REC_1BYTE_SQL_NULL_MASK) { + *len = UNIV_SQL_NULL; + + return(os); + } + + next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK; + } else { + os = rec_2_get_field_start_offs(rec, n); + + next_os = rec_2_get_field_end_info(rec, n); + + if (next_os & REC_2BYTE_SQL_NULL_MASK) { + *len = UNIV_SQL_NULL; + + return(os); + } + + next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK + | REC_2BYTE_EXTERN_MASK); + } + + *len = next_os - os; + + ut_ad(*len < UNIV_PAGE_SIZE); + + return(os); +} + +/**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. +@return total size */ +UNIV_INLINE __attribute__((warn_unused_result, nonnull(1,2))) +ulint +rec_get_converted_size_comp_prefix_low( +/*===================================*/ + const dict_index_t* index, /*!< in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra, /*!< out: extra size */ + bool temp) /*!< in: whether this is a + temporary file record */ +{ + ulint extra_size; + ulint data_size; + ulint i; + ulint n_null = index->n_nullable; + ut_ad(n_fields > 0); + ut_ad(n_fields <= dict_index_get_n_fields(index)); + ut_ad(!temp || extra); + + extra_size = temp + ? UT_BITS_IN_BYTES(n_null) + : REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(n_null); + data_size = 0; + + if (temp && dict_table_is_comp(index->table)) { + /* No need to do adjust fixed_len=0. We only need to + adjust it for ROW_FORMAT=REDUNDANT. */ + temp = false; + } + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + const dict_field_t* field; + ulint len; + ulint fixed_len; + const dict_col_t* col; + + field = dict_index_get_nth_field(index, i); + len = dfield_get_len(&fields[i]); + col = dict_field_get_col(field); + + ut_ad(dict_col_type_assert_equal(col, + dfield_get_type(&fields[i]))); + /* All NULLable fields must be included in the n_null count. */ + ut_ad((col->prtype & DATA_NOT_NULL) || n_null--); + + if (dfield_is_null(&fields[i])) { + /* No length is stored for NULL fields. 
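+(Editorial aside, not part of the original patch: a worked example of this
+accounting, for a hypothetical COMPACT index on
+(a INT NOT NULL, b VARCHAR(10) NULL, c VARCHAR(300) NULL)
+with b = 5 bytes and c = 200 bytes stored, neither NULL:
+
+	extra_size = REC_N_NEW_EXTRA_BYTES    fixed header
+		   + 1    null bitmap, UT_BITS_IN_BYTES(2)
+		   + 1    length of b: max len <= 255, one byte
+		   + 2    length of c: max len > 255 and len >= 128
+	data_size  = 4 + 5 + 200
+
+Were c NULL instead, it would contribute only its bitmap bit: no length
+entry, no data bytes.)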
*/ + ut_ad(!(col->prtype & DATA_NOT_NULL)); + continue; + } + + ut_ad(len <= col->len || col->mtype == DATA_BLOB + || (col->len == 0 && col->mtype == DATA_VARCHAR)); + + fixed_len = field->fixed_len; + if (temp && fixed_len + && !dict_col_get_fixed_size(col, temp)) { + fixed_len = 0; + } + /* If the maximum length of a variable-length field + is up to 255 bytes, the actual length is always stored + in one byte. If the maximum length is more than 255 + bytes, the actual length is stored in one byte for + 0..127. The length will be encoded in two bytes when + it is 128 or more, or when the field is stored externally. */ + + if (fixed_len) { +#ifdef UNIV_DEBUG + ulint mbminlen = DATA_MBMINLEN(col->mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen); + + ut_ad(len <= fixed_len); + + ut_ad(!mbmaxlen || len >= mbminlen + * (fixed_len / mbmaxlen)); + + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || fixed_len == field->prefix_len); +#endif /* UNIV_DEBUG */ + } else if (dfield_is_ext(&fields[i])) { + ut_ad(col->len >= 256 || col->mtype == DATA_BLOB); + extra_size += 2; + } else if (len < 128 + || (col->len < 256 && col->mtype != DATA_BLOB)) { + extra_size++; + } else { + /* For variable-length columns, we look up the + maximum length from the column itself. If this + is a prefix index column shorter than 256 bytes, + this will waste one byte. */ + extra_size += 2; + } + data_size += len; + } + + if (extra) { + *extra = extra_size; + } + + return(extra_size + data_size); +} + +/**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_comp_prefix( +/*===============================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ +{ + ut_ad(dict_table_is_comp(index->table)); + return(rec_get_converted_size_comp_prefix_low( + index, fields, n_fields, extra, false)); +} + +/**********************************************************//** +Determines the size of a data tuple in ROW_FORMAT=COMPACT. 
+@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_comp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + ulint status, /*!< in: status bits of the record */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ +{ + ulint size; + ut_ad(n_fields > 0); + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields == dict_index_get_n_fields(index)); + size = 0; + break; + case REC_STATUS_NODE_PTR: + n_fields--; + ut_ad(n_fields == dict_index_get_n_unique_in_tree(index)); + ut_ad(dfield_get_len(&fields[n_fields]) == REC_NODE_PTR_SIZE); + size = REC_NODE_PTR_SIZE; /* child page number */ + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record, 8 data bytes */ + if (UNIV_LIKELY_NULL(extra)) { + *extra = REC_N_NEW_EXTRA_BYTES; + } + return(REC_N_NEW_EXTRA_BYTES + 8); + default: + ut_error; + return(ULINT_UNDEFINED); + } + + return(size + rec_get_converted_size_comp_prefix_low( + index, fields, n_fields, extra, false)); +} + +/***********************************************************//** +Sets the value of the ith field SQL null bit of an old-style record. */ +UNIV_INTERN +void +rec_set_nth_field_null_bit( +/*=======================*/ + rec_t* rec, /*!< in: record */ + ulint i, /*!< in: ith field */ + ibool val) /*!< in: value to set */ +{ + ulint info; + + if (rec_get_1byte_offs_flag(rec)) { + + info = rec_1_get_field_end_info(rec, i); + + if (val) { + info = info | REC_1BYTE_SQL_NULL_MASK; + } else { + info = info & ~REC_1BYTE_SQL_NULL_MASK; + } + + rec_1_set_field_end_info(rec, i, info); + + return; + } + + info = rec_2_get_field_end_info(rec, i); + + if (val) { + info = info | REC_2BYTE_SQL_NULL_MASK; + } else { + info = info & ~REC_2BYTE_SQL_NULL_MASK; + } + + rec_2_set_field_end_info(rec, i, info); +} + +/***********************************************************//** +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ +UNIV_INTERN +void +rec_set_nth_field_sql_null( +/*=======================*/ + rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ +{ + ulint offset; + + offset = rec_get_field_start_offs(rec, n); + + data_write_sql_null(rec + offset, rec_get_nth_field_size(rec, n)); + + rec_set_nth_field_null_bit(rec, n, TRUE); +} + +/*********************************************************//** +Builds an old-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. 
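+(Editorial aside, not part of the original patch: the builder below picks
+the offset width with a single test, schematically
+
+	one-byte offsets  iff  n_ext == 0 && data_size <= REC_1BYTE_OFFS_LIMIT
+
+since a one-byte end offset can neither address more data than that limit
+nor carry the extern-storage bit, which exists only in the two-byte form
+as REC_2BYTE_EXTERN_MASK.  Each stored end offset may also be OR-ed with
+the matching SQL-null mask when the field is NULL.)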
+@return pointer to the origin of physical record */ +static +rec_t* +rec_convert_dtuple_to_rec_old( +/*==========================*/ + byte* buf, /*!< in: start address of the physical record */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + const dfield_t* field; + ulint n_fields; + ulint data_size; + rec_t* rec; + ulint end_offset; + ulint ored_offset; + ulint len; + ulint i; + + ut_ad(buf && dtuple); + ut_ad(dtuple_validate(dtuple)); + ut_ad(dtuple_check_typed(dtuple)); + + n_fields = dtuple_get_n_fields(dtuple); + data_size = dtuple_get_data_size(dtuple, 0); + + ut_ad(n_fields > 0); + + /* Calculate the offset of the origin in the physical record */ + + rec = buf + rec_get_converted_extra_size(data_size, n_fields, n_ext); +#ifdef UNIV_DEBUG + /* Suppress Valgrind warnings of ut_ad() + in mach_write_to_1(), mach_write_to_2() et al. */ + memset(buf, 0xff, rec - buf + data_size); +#endif /* UNIV_DEBUG */ + /* Store the number of fields */ + rec_set_n_fields_old(rec, n_fields); + + /* Set the info bits of the record */ + rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple) + & REC_INFO_BITS_MASK); + + /* Store the data and the offsets */ + + end_offset = 0; + + if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) { + + rec_set_1byte_offs_flag(rec, TRUE); + + for (i = 0; i < n_fields; i++) { + + field = dtuple_get_nth_field(dtuple, i); + + if (dfield_is_null(field)) { + len = dtype_get_sql_null_size( + dfield_get_type(field), 0); + data_write_sql_null(rec + end_offset, len); + + end_offset += len; + ored_offset = end_offset + | REC_1BYTE_SQL_NULL_MASK; + } else { + /* If the data is not SQL null, store it */ + len = dfield_get_len(field); + + memcpy(rec + end_offset, + dfield_get_data(field), len); + + end_offset += len; + ored_offset = end_offset; + } + + rec_1_set_field_end_info(rec, i, ored_offset); + } + } else { + rec_set_1byte_offs_flag(rec, FALSE); + + for (i = 0; i < n_fields; i++) { + + field = dtuple_get_nth_field(dtuple, i); + + if (dfield_is_null(field)) { + len = dtype_get_sql_null_size( + dfield_get_type(field), 0); + data_write_sql_null(rec + end_offset, len); + + end_offset += len; + ored_offset = end_offset + | REC_2BYTE_SQL_NULL_MASK; + } else { + /* If the data is not SQL null, store it */ + len = dfield_get_len(field); + + memcpy(rec + end_offset, + dfield_get_data(field), len); + + end_offset += len; + ored_offset = end_offset; + + if (dfield_is_ext(field)) { + ored_offset |= REC_2BYTE_EXTERN_MASK; + } + } + + rec_2_set_field_end_info(rec, i, ored_offset); + } + } + + return(rec); +} + +/*********************************************************//** +Builds a ROW_FORMAT=COMPACT record out of a data tuple. 
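+(Editorial aside, not part of the original patch: a hypothetical
+end-to-end use of the top-level entry point rec_convert_dtuple_to_rec
+defined further below, with rec_get_converted_size() as the companion
+that sizes the buffer:
+
+	ulint	size = rec_get_converted_size(index, dtuple, n_ext);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, size));
+	rec_t*	rec = rec_convert_dtuple_to_rec(buf, index, dtuple, n_ext);
+
+buf points at the start of the allocation while rec points at the record
+origin, i.e. past the extra bytes.)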
*/ +UNIV_INLINE __attribute__((nonnull)) +void +rec_convert_dtuple_to_rec_comp( +/*===========================*/ + rec_t* rec, /*!< in: origin of record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint status, /*!< in: status bits of the record */ + bool temp) /*!< in: whether to use the + format for temporary files in + index creation */ +{ + const dfield_t* field; + const dtype_t* type; + byte* end; + byte* nulls; + byte* lens; + ulint len; + ulint i; + ulint n_node_ptr_field; + ulint fixed_len; + ulint null_mask = 1; + ulint n_null; + + ut_ad(temp || dict_table_is_comp(index->table)); + ut_ad(n_fields > 0); + + if (temp) { + ut_ad(status == REC_STATUS_ORDINARY); + ut_ad(n_fields <= dict_index_get_n_fields(index)); + n_node_ptr_field = ULINT_UNDEFINED; + nulls = rec - 1; + if (dict_table_is_comp(index->table)) { + /* No need to do adjust fixed_len=0. We only + need to adjust it for ROW_FORMAT=REDUNDANT. */ + temp = false; + } + } else { + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + n_node_ptr_field = ULINT_UNDEFINED; + break; + case REC_STATUS_NODE_PTR: + ut_ad(n_fields + == dict_index_get_n_unique_in_tree(index) + 1); + n_node_ptr_field = n_fields - 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(n_fields == 1); + n_node_ptr_field = ULINT_UNDEFINED; + break; + default: + ut_error; + return; + } + } + + end = rec; + n_null = index->n_nullable; + lens = nulls - UT_BITS_IN_BYTES(n_null); + /* clear the SQL-null flags */ + memset(lens + 1, 0, nulls - lens); + + /* Store the data and the offsets */ + + for (i = 0, field = fields; i < n_fields; i++, field++) { + const dict_field_t* ifield; + + type = dfield_get_type(field); + len = dfield_get_len(field); + + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(len == REC_NODE_PTR_SIZE); + memcpy(end, dfield_get_data(field), len); + end += REC_NODE_PTR_SIZE; + break; + } + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field */ + ut_ad(n_null--); + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + ut_ad(*nulls < null_mask); + + /* set the null flag if necessary */ + if (dfield_is_null(field)) { + *nulls |= null_mask; + null_mask <<= 1; + continue; + } + + null_mask <<= 1; + } + /* only nullable fields can be null */ + ut_ad(!dfield_is_null(field)); + + ifield = dict_index_get_nth_field(index, i); + fixed_len = ifield->fixed_len; + if (temp && fixed_len + && !dict_col_get_fixed_size(ifield->col, temp)) { + fixed_len = 0; + } + /* If the maximum length of a variable-length field + is up to 255 bytes, the actual length is always stored + in one byte. If the maximum length is more than 255 + bytes, the actual length is stored in one byte for + 0..127. The length will be encoded in two bytes when + it is 128 or more, or when the field is stored externally. 
*/ + if (fixed_len) { +#ifdef UNIV_DEBUG + ulint mbminlen = DATA_MBMINLEN( + ifield->col->mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN( + ifield->col->mbminmaxlen); + + ut_ad(len <= fixed_len); + ut_ad(!mbmaxlen || len >= mbminlen + * (fixed_len / mbmaxlen)); + ut_ad(!dfield_is_ext(field)); +#endif /* UNIV_DEBUG */ + } else if (dfield_is_ext(field)) { + ut_ad(ifield->col->len >= 256 + || ifield->col->mtype == DATA_BLOB); + ut_ad(len <= REC_ANTELOPE_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE); + *lens-- = (byte) (len >> 8) | 0xc0; + *lens-- = (byte) len; + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB + || !strcmp(index->name, + FTS_INDEX_TABLE_IND_NAME)); + if (len < 128 + || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + + *lens-- = (byte) len; + } else { + ut_ad(len < 16384); + *lens-- = (byte) (len >> 8) | 0x80; + *lens-- = (byte) len; + } + } + + memcpy(end, dfield_get_data(field), len); + end += len; + } +} + +/*********************************************************//** +Builds a new-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. +@return pointer to the origin of physical record */ +static +rec_t* +rec_convert_dtuple_to_rec_new( +/*==========================*/ + byte* buf, /*!< in: start address of + the physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple) /*!< in: data tuple */ +{ + ulint extra_size; + ulint status; + rec_t* rec; + + status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK; + rec_get_converted_size_comp( + index, status, dtuple->fields, dtuple->n_fields, &extra_size); + rec = buf + extra_size; + + rec_convert_dtuple_to_rec_comp( + rec, index, dtuple->fields, dtuple->n_fields, status, false); + + /* Set the info bits of the record */ + rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple)); + + return(rec); +} + +/*********************************************************//** +Builds a physical record out of a data tuple and +stores it beginning from the start of the given buffer. +@return pointer to the origin of physical record */ +UNIV_INTERN +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + byte* buf, /*!< in: start address of the + physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of + externally stored columns */ +{ + rec_t* rec; + + ut_ad(buf && index && dtuple); + ut_ad(dtuple_validate(dtuple)); + ut_ad(dtuple_check_typed(dtuple)); + + if (dict_table_is_comp(index->table)) { + rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple); + } else { + rec = rec_convert_dtuple_to_rec_old(buf, dtuple, n_ext); + } + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + ulint i; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, + offsets_, ULINT_UNDEFINED, &heap); + ut_ad(rec_validate(rec, offsets)); + ut_ad(dtuple_get_n_fields(dtuple) + == rec_offs_n_fields(offsets)); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ut_ad(!dfield_is_ext(dtuple_get_nth_field(dtuple, i)) + == !rec_offs_nth_extern(offsets, i)); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ + return(rec); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. 
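+A hedged sketch of how this pairs with rec_convert_dtuple_to_temp() and
+rec_init_offsets_temp() below (buffer allocation elided; "buf", "fields",
+"n_fields" and "offsets" are assumed to be prepared by the caller):
+
+ ulint extra;
+ ulint size = rec_get_converted_size_temp(
+ index, fields, n_fields, &extra);
+ /* size bytes are needed in total, of which extra
+ precede the record origin at buf + extra */
+ rec_convert_dtuple_to_temp(buf + extra, index, fields, n_fields);
+ /* to read the record back later: */
+ rec_init_offsets_temp(buf + extra, index, offsets);
+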
+@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_temp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ +{ + return(rec_get_converted_size_comp_prefix_low( + index, fields, n_fields, extra, true)); +} + +/******************************************************//** +Determine the offset to each field in temporary file. +@see rec_convert_dtuple_to_temp() */ +UNIV_INTERN +void +rec_init_offsets_temp( +/*==================*/ + const rec_t* rec, /*!< in: temporary file record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + rec_init_offsets_comp_ordinary(rec, true, index, offsets); +} + +/*********************************************************//** +Builds a temporary file record out of a data tuple. +@see rec_init_offsets_temp() */ +UNIV_INTERN +void +rec_convert_dtuple_to_temp( +/*=======================*/ + rec_t* rec, /*!< out: record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields) /*!< in: number of fields */ +{ + rec_convert_dtuple_to_rec_comp(rec, index, fields, n_fields, + REC_STATUS_ORDINARY, true); +} + +/**************************************************************//** +Copies the first n fields of a physical record to a data tuple. The fields +are copied to the memory heap. */ +UNIV_INTERN +void +rec_copy_prefix_to_dtuple( +/*======================*/ + dtuple_t* tuple, /*!< out: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n_fields, /*!< in: number of fields + to copy */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint i; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap); + + ut_ad(rec_validate(rec, offsets)); + ut_ad(dtuple_check_typed(tuple)); + + dtuple_set_info_bits(tuple, rec_get_info_bits( + rec, dict_table_is_comp(index->table))); + + for (i = 0; i < n_fields; i++) { + dfield_t* field; + const byte* data; + ulint len; + + field = dtuple_get_nth_field(tuple, i); + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + dfield_set_data(field, + mem_heap_dup(heap, data, len), len); + ut_ad(!rec_offs_nth_extern(offsets, i)); + } else { + dfield_set_null(field); + } + } +} + +/**************************************************************//** +Copies the first n fields of an old-style physical record +to a new physical record in a buffer. 
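+
+For example (an added note based on the offset-size flag handled below):
+copying a 3-field prefix of a record with 1-byte offsets copies an extra
+area of REC_N_OLD_EXTRA_BYTES + 3 bytes before the origin; with 2-byte
+offsets it would be REC_N_OLD_EXTRA_BYTES + 6 bytes, followed by area_end
+bytes of data.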
+@return own: copied record */ +static +rec_t* +rec_copy_prefix_to_buf_old( +/*=======================*/ + const rec_t* rec, /*!< in: physical record */ + ulint n_fields, /*!< in: number of fields to copy */ + ulint area_end, /*!< in: end of the prefix data */ + byte** buf, /*!< in/out: memory buffer for + the copied prefix, or NULL */ + ulint* buf_size) /*!< in/out: buffer size */ +{ + rec_t* copy_rec; + ulint area_start; + ulint prefix_len; + + if (rec_get_1byte_offs_flag(rec)) { + area_start = REC_N_OLD_EXTRA_BYTES + n_fields; + } else { + area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields; + } + + prefix_len = area_start + area_end; + + if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf != NULL) { + mem_free(*buf); + } + + *buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size)); + } + + ut_memcpy(*buf, rec - area_start, prefix_len); + + copy_rec = *buf + area_start; + + rec_set_n_fields_old(copy_rec, n_fields); + + return(copy_rec); +} + +/**************************************************************//** +Copies the first n fields of a physical record to a new physical record in +a buffer. +@return own: copied record */ +UNIV_INTERN +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n_fields, /*!< in: number of fields + to copy */ + byte** buf, /*!< in/out: memory buffer + for the copied prefix, + or NULL */ + ulint* buf_size) /*!< in/out: buffer size */ +{ + const byte* nulls; + const byte* lens; + ulint i; + ulint prefix_len; + ulint null_mask; + ulint status; + + UNIV_PREFETCH_RW(*buf); + + if (!dict_table_is_comp(index->table)) { + ut_ad(rec_validate_old(rec)); + return(rec_copy_prefix_to_buf_old( + rec, n_fields, + rec_get_field_start_offs(rec, n_fields), + buf, buf_size)); + } + + status = rec_get_status(rec); + + switch (status) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + break; + case REC_STATUS_NODE_PTR: + /* it doesn't make sense to copy the child page number field */ + ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index)); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record: no sense to copy anything */ + default: + ut_error; + return(NULL); + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + UNIV_PREFETCH_R(lens); + prefix_len = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + const dict_field_t* field; + const dict_col_t* col; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + + if (!(col->prtype & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + continue; + } + + null_mask <<= 1; + } + + if (field->fixed_len) { + prefix_len += field->fixed_len; + } else { + ulint len = *lens--; + /* If the maximum length of the column is up + to 255 bytes, the actual length is always + stored in one byte. If the maximum length is + more than 255 bytes, the actual length is + stored in one byte for 0..127. The length + will be encoded in two bytes when it is 128 or + more, or when the column is stored externally. 
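+
+Decoding mirrors the write side (an added note): when the first length byte
+has bit 0x80 set and the column may use two-byte lengths, the actual length
+is ((first & 0x3f) << 8) | second, so bytes 0x81 0x2C decode to length 300;
+the 0x3f mask also drops the external-storage flag bit (0x40).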
*/ + if (col->len > 255 || col->mtype == DATA_BLOB) { + if (len & 0x80) { + /* 1exxxxxx */ + len &= 0x3f; + len <<= 8; + len |= *lens--; + UNIV_PREFETCH_R(lens); + } + } + prefix_len += len; + } + } + + UNIV_PREFETCH_R(rec + prefix_len); + + prefix_len += rec - (lens + 1); + + if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf != NULL) { + mem_free(*buf); + } + + *buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size)); + } + + memcpy(*buf, lens + 1, prefix_len); + + return(*buf + (rec - (lens + 1))); +} +#endif /* UNIV_HOTBACKUP */ + +/***************************************************************//** +Validates the consistency of an old-style physical record. +@return TRUE if ok */ +static +ibool +rec_validate_old( +/*=============*/ + const rec_t* rec) /*!< in: physical record */ +{ + const byte* data; + ulint len; + ulint n_fields; + ulint len_sum = 0; + ulint sum = 0; + ulint i; + + ut_a(rec); + n_fields = rec_get_n_fields_old(rec); + + if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + (ulong) n_fields); + return(FALSE); + } + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field_old(rec, i, &len); + + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", + (ulong) i, + (ulong) len); + return(FALSE); + } + + if (len != UNIV_SQL_NULL) { + len_sum += len; + sum += *(data + len -1); /* dereference the + end of the field to + cause a memory trap + if possible */ + } else { + len_sum += rec_get_nth_field_size(rec, i); + } + } + + if (len_sum != rec_get_data_size_old(rec)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + (ulong) len_sum, + rec_get_data_size_old(rec)); + return(FALSE); + } + + rec_dummy = sum; /* This is here only to fool the compiler */ + + return(TRUE); +} + +/***************************************************************//** +Validates the consistency of a physical record. 
+@return TRUE if ok */ +UNIV_INTERN +ibool +rec_validate( +/*=========*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + const byte* data; + ulint len; + ulint n_fields; + ulint len_sum = 0; + ulint sum = 0; + ulint i; + + ut_a(rec); + n_fields = rec_offs_n_fields(offsets); + + if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + (ulong) n_fields); + return(FALSE); + } + + ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec)); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", + (ulong) i, + (ulong) len); + return(FALSE); + } + + if (len != UNIV_SQL_NULL) { + len_sum += len; + sum += *(data + len -1); /* dereference the + end of the field to + cause a memory trap + if possible */ + } else if (!rec_offs_comp(offsets)) { + len_sum += rec_get_nth_field_size(rec, i); + } + } + + if (len_sum != rec_offs_data_size(offsets)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + (ulong) len_sum, + (ulong) rec_offs_data_size(offsets)); + return(FALSE); + } + + rec_dummy = sum; /* This is here only to fool the compiler */ + + if (!rec_offs_comp(offsets)) { + ut_a(rec_validate_old(rec)); + } + + return(TRUE); +} + +/***************************************************************//** +Prints an old-style physical record. */ +UNIV_INTERN +void +rec_print_old( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec) /*!< in: physical record */ +{ + const byte* data; + ulint len; + ulint n; + ulint i; + + ut_ad(rec); + + n = rec_get_n_fields_old(rec); + + fprintf(file, "PHYSICAL RECORD: n_fields %lu;" + " %u-byte offsets; info bits %lu\n", + (ulong) n, + rec_get_1byte_offs_flag(rec) ? 1 : 2, + (ulong) rec_get_info_bits(rec, FALSE)); + + for (i = 0; i < n; i++) { + + data = rec_get_nth_field_old(rec, i, &len); + + fprintf(file, " %lu:", (ulong) i); + + if (len != UNIV_SQL_NULL) { + if (len <= 30) { + + ut_print_buf(file, data, len); + } else { + ut_print_buf(file, data, 30); + + fprintf(file, " (total %lu bytes)", + (ulong) len); + } + } else { + fprintf(file, " SQL NULL, size %lu ", + rec_get_nth_field_size(rec, i)); + } + + putc(';', file); + putc('\n', file); + } + + rec_validate_old(rec); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Prints a physical record in ROW_FORMAT=COMPACT. Ignores the +record header. 
*/ +UNIV_INTERN +void +rec_print_comp( +/*===========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint i; + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* data; + ulint len; + + data = rec_get_nth_field(rec, offsets, i, &len); + + fprintf(file, " %lu:", (ulong) i); + + if (len != UNIV_SQL_NULL) { + if (len <= 30) { + + ut_print_buf(file, data, len); + } else if (rec_offs_nth_extern(offsets, i)) { + ut_print_buf(file, data, 30); + fprintf(file, " (total %lu bytes, external)", + (ulong) len); + ut_print_buf(file, data + len + - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + } else { + ut_print_buf(file, data, 30); + + fprintf(file, " (total %lu bytes)", + (ulong) len); + } + } else { + fputs(" SQL NULL", file); + } + putc(';', file); + putc('\n', file); + } +} + +/***************************************************************//** +Prints a physical record. */ +UNIV_INTERN +void +rec_print_new( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec); + ut_ad(offsets); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_comp(offsets)) { + rec_print_old(file, rec); + return; + } + + fprintf(file, "PHYSICAL RECORD: n_fields %lu;" + " compact format; info bits %lu\n", + (ulong) rec_offs_n_fields(offsets), + (ulong) rec_get_info_bits(rec, TRUE)); + + rec_print_comp(file, rec, offsets); + rec_validate(rec, offsets); +} + +/***************************************************************//** +Prints a physical record. */ +UNIV_INTERN +void +rec_print( +/*======*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + rec_print_old(file, rec); + return; + } else { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + rec_print_new(file, rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +} + +# ifdef UNIV_DEBUG +/************************************************************//** +Reads the DB_TRX_ID of a clustered index record. 
+@return the value of DB_TRX_ID */ +UNIV_INTERN +trx_id_t +rec_get_trx_id( +/*===========*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index) /*!< in: clustered index */ +{ + const page_t* page + = page_align(rec); + ulint trx_id_col + = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + const byte* trx_id; + ulint len; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id); + ut_ad(dict_index_is_clust(index)); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + offsets = rec_get_offsets(rec, index, offsets, trx_id_col + 1, &heap); + + trx_id = rec_get_nth_field(rec, offsets, trx_id_col, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + if (heap) { + mem_heap_free(heap); + } + + return(trx_read_trx_id(trx_id)); +} +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/row/row0ext.cc b/storage/xtradb/row/row0ext.cc new file mode 100644 index 00000000000..32b78391d6a --- /dev/null +++ b/storage/xtradb/row/row0ext.cc @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0ext.cc +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "row0ext.h" + +#ifdef UNIV_NONINL +#include "row0ext.ic" +#endif + +#include "btr0cur.h" + +/********************************************************************//** +Fills the column prefix cache of an externally stored column. 
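+The last BTR_EXTERN_FIELD_REF_SIZE bytes of the stored field value form the
+external BLOB reference; if they are still all zero (field_ref_zero), the
+BLOB has not been written yet and no prefix can be cached (an added note,
+inferred from the checks below).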
*/
+static
+void
+row_ext_cache_fill(
+/*===============*/
+ row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint zip_size,/*!< compressed page size in bytes, or 0 */
+ const dfield_t* dfield) /*!< in: data field */
+{
+ const byte* field = static_cast<const byte*>(
+ dfield_get_data(dfield));
+ ulint f_len = dfield_get_len(dfield);
+ byte* buf = ext->buf + i * ext->max_len;
+
+ ut_ad(ext->max_len > 0);
+ ut_ad(i < ext->n_ext);
+ ut_ad(dfield_is_ext(dfield));
+ ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+ field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The BLOB pointer is not set: we cannot fetch it */
+ ext->len[i] = 0;
+ } else {
+ if (ext->max_len == REC_VERSION_56_MAX_INDEX_COL_LEN
+ && f_len > BTR_EXTERN_FIELD_REF_SIZE) {
+ /* In this case the field is in B format or beyond
+ (refer to the definition of row_ext_t.max_len),
+ and the field is already filled with the prefix;
+ otherwise f_len would be BTR_EXTERN_FIELD_REF_SIZE.
+ So there is no need to re-read the prefix externally;
+ just copy the local prefix to buf. Note that if
+ ext->len[i] is zero, it means an error, as above. */
+ memcpy(buf, field, f_len - BTR_EXTERN_FIELD_REF_SIZE);
+ ext->len[i] = f_len - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ /* Fetch at most ext->max_len of the column.
+ The column should be non-empty. However,
+ trx_rollback_or_clean_all_recovered() may try to
+ access a half-deleted BLOB if the server previously
+ crashed during the execution of
+ btr_free_externally_stored_field(). */
+ ext->len[i] = btr_copy_externally_stored_field_prefix(
+ buf, ext->max_len, zip_size, field, f_len);
+ }
+ }
+}
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ ulint flags, /*!< in: table->flags */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge).
*/
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ ulint i;
+ ulint zip_size = dict_tf_get_zip_size(flags);
+
+ row_ext_t* ret;
+
+ ut_ad(n_ext > 0);
+
+ ret = static_cast<row_ext_t*>(
+ mem_heap_alloc(heap,
+ (sizeof *ret) + (n_ext - 1) * sizeof ret->len));
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ ret->n_ext = n_ext;
+ ret->ext = ext;
+ ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags);
+
+ ret->buf = static_cast<byte*>(
+ mem_heap_alloc(heap, n_ext * ret->max_len));
+
+#ifdef UNIV_DEBUG
+ memset(ret->buf, 0xaa, n_ext * ret->max_len);
+ UNIV_MEM_ALLOC(ret->buf, n_ext * ret->max_len);
+#endif
+
+ /* Fetch the BLOB prefixes */
+ for (i = 0; i < n_ext; i++) {
+ const dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple, ext[i]);
+ row_ext_cache_fill(ret, i, zip_size, dfield);
+ }
+
+ return(ret);
+}
diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc
new file mode 100644
index 00000000000..54f6f7bcc0f
--- /dev/null
+++ b/storage/xtradb/row/row0ftsort.cc
@@ -0,0 +1,1573 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ftsort.cc
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h" /* dict_table_stats_lock() */
+#include "row0merge.h"
+#include "pars0pars.h"
+#include "row0ftsort.h"
+#include "row0merge.h"
+#include "row0row.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+
+/** Read the next record to buffer N.
+@param N index into array of merge info structure */
+#define ROW_MERGE_READ_GET_NEXT(N) \
+ do { \
+ b[N] = row_merge_read_rec( \
+ block[N], buf[N], b[N], index, \
+ fd[N], &foffs[N], &mrec[N], offsets[N]); \
+ if (UNIV_UNLIKELY(!b[N])) { \
+ if (mrec[N]) { \
+ goto exit; \
+ } \
+ } \
+ } while (0)
+
+/** Parallel sort degree */
+UNIV_INTERN ulong fts_sort_pll_degree = 2;
+
+/*********************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID (depending on the number of records to sort, this can be a 4-byte
+or 8-byte integer value)
+3) Word's position in original doc.
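+
+For example (an illustration, not from the original comment), tokenizing
+the document "hello world hello" with Doc ID 5 produces the sort tuples
+(hello, 5, 0), (world, 5, 6) and (hello, 5, 12), which are merge sorted
+by word, then Doc ID, then position.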
+ +@return dict_index_t structure for the fts sort index */ +UNIV_INTERN +dict_index_t* +row_merge_create_fts_sort_index( +/*============================*/ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + const dict_table_t* table, /*!< in: table that FTS index + is being created on */ + ibool* opt_doc_id_size) + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ +{ + dict_index_t* new_index; + dict_field_t* field; + dict_field_t* idx_field; + CHARSET_INFO* charset; + + // FIXME: This name shouldn't be hard coded here. + new_index = dict_mem_index_create( + index->table->name, "tmp_fts_idx", 0, DICT_FTS, 3); + + new_index->id = index->id; + new_index->table = (dict_table_t*) table; + new_index->n_uniq = FTS_NUM_FIELDS_SORT; + new_index->n_def = FTS_NUM_FIELDS_SORT; + new_index->cached = TRUE; + + btr_search_index_init(new_index); + + idx_field = dict_index_get_nth_field(index, 0); + charset = fts_index_get_charset(index); + + /* The first field is on the Tokenized Word */ + field = dict_index_get_nth_field(new_index, 0); + field->name = NULL; + field->prefix_len = 0; + field->col = static_cast<dict_col_t*>( + mem_heap_alloc(new_index->heap, sizeof(dict_col_t))); + field->col->len = FTS_MAX_WORD_LEN; + + if (strcmp(charset->name, "latin1_swedish_ci") == 0) { + field->col->mtype = DATA_VARCHAR; + } else { + field->col->mtype = DATA_VARMYSQL; + } + + field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL; + field->col->mbminmaxlen = idx_field->col->mbminmaxlen; + field->fixed_len = 0; + + /* Doc ID */ + field = dict_index_get_nth_field(new_index, 1); + field->name = NULL; + field->prefix_len = 0; + field->col = static_cast<dict_col_t*>( + mem_heap_alloc(new_index->heap, sizeof(dict_col_t))); + field->col->mtype = DATA_INT; + *opt_doc_id_size = FALSE; + + /* Check whether we can use 4 bytes instead of 8 bytes integer + field to hold the Doc ID, thus reduce the overall sort size */ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + /* If Doc ID column is being added by this create + index, then just check the number of rows in the table */ + if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) { + *opt_doc_id_size = TRUE; + } + } else { + doc_id_t max_doc_id; + + /* If the Doc ID column is supplied by user, then + check the maximum Doc ID in the table */ + max_doc_id = fts_get_max_doc_id((dict_table_t*) table); + + if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) { + *opt_doc_id_size = TRUE; + } + } + + if (*opt_doc_id_size) { + field->col->len = sizeof(ib_uint32_t); + field->fixed_len = sizeof(ib_uint32_t); + } else { + field->col->len = FTS_DOC_ID_LEN; + field->fixed_len = FTS_DOC_ID_LEN; + } + + field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE; + + field->col->mbminmaxlen = 0; + + /* The third field is on the word's position in the original doc */ + field = dict_index_get_nth_field(new_index, 2); + field->name = NULL; + field->prefix_len = 0; + field->col = static_cast<dict_col_t*>( + mem_heap_alloc(new_index->heap, sizeof(dict_col_t))); + field->col->mtype = DATA_INT; + field->col->len = 4 ; + field->fixed_len = 4; + field->col->prtype = DATA_NOT_NULL; + field->col->mbminmaxlen = 0; + + return(new_index); +} +/*********************************************************************//** +Initialize FTS parallel sort structures. 
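+Each of the fts_sort_pll_degree tokenizer threads owns FTS_NUM_AUX_INDEX
+sort buffers, merge files and merge blocks, so the allocation below is
+conceptually (an added sketch):
+
+ psort_info[0 .. fts_sort_pll_degree - 1]
+ .merge_buf[0 .. FTS_NUM_AUX_INDEX - 1]
+ .merge_file[0 .. FTS_NUM_AUX_INDEX - 1]
+ .merge_block[0 .. FTS_NUM_AUX_INDEX - 1]
+ (1024-byte aligned for O_DIRECT writes)
+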
+@return TRUE if all successful */ +UNIV_INTERN +ibool +row_fts_psort_info_init( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + row_merge_dup_t* dup, /*!< in,own: descriptor of + FTS index being created */ + const dict_table_t* new_table,/*!< in: table on which indexes are + created */ + ibool opt_doc_id_size, + /*!< in: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ + fts_psort_t** psort, /*!< out: parallel sort info to be + instantiated */ + fts_psort_t** merge) /*!< out: parallel merge info + to be instantiated */ +{ + ulint i; + ulint j; + fts_psort_common_t* common_info = NULL; + fts_psort_t* psort_info = NULL; + fts_psort_t* merge_info = NULL; + ulint block_size; + ibool ret = TRUE; + + block_size = 3 * srv_sort_buf_size; + + *psort = psort_info = static_cast<fts_psort_t*>(mem_zalloc( + fts_sort_pll_degree * sizeof *psort_info)); + + if (!psort_info) { + ut_free(dup); + return(FALSE); + } + + /* Common Info for all sort threads */ + common_info = static_cast<fts_psort_common_t*>( + mem_alloc(sizeof *common_info)); + + if (!common_info) { + ut_free(dup); + mem_free(psort_info); + return(FALSE); + } + + common_info->dup = dup; + common_info->new_table = (dict_table_t*) new_table; + common_info->trx = trx; + common_info->all_info = psort_info; + common_info->sort_event = os_event_create(); + common_info->merge_event = os_event_create(); + common_info->opt_doc_id_size = opt_doc_id_size; + + /* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for + each parallel sort thread. Each "sort bucket" holds records for + a particular "FTS index partition" */ + for (j = 0; j < fts_sort_pll_degree; j++) { + + UT_LIST_INIT(psort_info[j].fts_doc_list); + + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + + psort_info[j].merge_file[i] = + static_cast<merge_file_t*>( + mem_zalloc(sizeof(merge_file_t))); + + if (!psort_info[j].merge_file[i]) { + ret = FALSE; + goto func_exit; + } + + psort_info[j].merge_buf[i] = row_merge_buf_create( + dup->index); + + if (row_merge_file_create(psort_info[j].merge_file[i]) + < 0) { + goto func_exit; + } + + /* Need to align memory for O_DIRECT write */ + psort_info[j].block_alloc[i] = + static_cast<row_merge_block_t*>(ut_malloc( + block_size + 1024)); + + psort_info[j].merge_block[i] = + static_cast<row_merge_block_t*>( + ut_align( + psort_info[j].block_alloc[i], 1024)); + + if (!psort_info[j].merge_block[i]) { + ret = FALSE; + goto func_exit; + } + } + + psort_info[j].child_status = 0; + psort_info[j].state = 0; + psort_info[j].psort_common = common_info; + psort_info[j].error = DB_SUCCESS; + psort_info[j].memory_used = 0; + mutex_create(fts_pll_tokenize_mutex_key, &psort_info[j].mutex, SYNC_FTS_TOKENIZE); + } + + /* Initialize merge_info structures parallel merge and insert + into auxiliary FTS tables (FTS_INDEX_TABLE) */ + *merge = merge_info = static_cast<fts_psort_t*>( + mem_alloc(FTS_NUM_AUX_INDEX * sizeof *merge_info)); + + for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { + + merge_info[j].child_status = 0; + merge_info[j].state = 0; + merge_info[j].psort_common = common_info; + } + +func_exit: + if (!ret) { + row_fts_psort_info_destroy(psort_info, merge_info); + } + + return(ret); +} +/*********************************************************************//** +Clean up and deallocate FTS parallel sort structures, and close the +merge sort files */ +UNIV_INTERN +void +row_fts_psort_info_destroy( +/*=======================*/ + fts_psort_t* psort_info, /*!< parallel sort info */ + fts_psort_t* merge_info) /*!< 
parallel merge info */
+{
+ ulint i;
+ ulint j;
+
+ if (psort_info) {
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (psort_info[j].merge_file[i]) {
+ row_merge_file_destroy(
+ psort_info[j].merge_file[i]);
+ }
+
+ if (psort_info[j].block_alloc[i]) {
+ ut_free(psort_info[j].block_alloc[i]);
+ }
+ mem_free(psort_info[j].merge_file[i]);
+ }
+
+ mutex_free(&psort_info[j].mutex);
+ }
+
+ os_event_free(merge_info[0].psort_common->sort_event);
+ os_event_free(merge_info[0].psort_common->merge_event);
+ ut_free(merge_info[0].psort_common->dup);
+ mem_free(merge_info[0].psort_common);
+ mem_free(psort_info);
+ }
+
+ if (merge_info) {
+ mem_free(merge_info);
+ }
+}
+/*********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+ fts_psort_t* psort_info) /*!< in: parallel sort info */
+{
+ ulint j;
+ ulint i;
+
+ if (!psort_info) {
+ return;
+ }
+
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ row_merge_buf_free(psort_info[j].merge_buf[i]);
+ }
+ }
+
+ return;
+}
+
+/*********************************************************************//**
+Tokenize incoming text data and add to the sort buffer.
+@return TRUE if the record passed, FALSE if out of space */
+static
+ibool
+row_merge_fts_doc_tokenize(
+/*=======================*/
+ row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */
+ doc_id_t doc_id, /*!< in: Doc ID */
+ fts_doc_t* doc, /*!< in: Doc to be tokenized */
+ dtype_t* word_dtype, /*!< in: data structure for
+ word col */
+ merge_file_t** merge_file, /*!< in/out: merge file */
+ ibool opt_doc_id_size,/*!< in: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort */
+ fts_tokenize_ctx_t* t_ctx) /*!< in/out: tokenize context */
+{
+ ulint i;
+ ulint inc;
+ fts_string_t str;
+ ulint len;
+ row_merge_buf_t* buf;
+ dfield_t* field;
+ fts_string_t t_str;
+ ibool buf_full = FALSE;
+ byte str_buf[FTS_MAX_WORD_LEN + 1];
+ ulint data_size[FTS_NUM_AUX_INDEX];
+ ulint n_tuple[FTS_NUM_AUX_INDEX];
+
+ t_str.f_n_char = 0;
+ t_ctx->buf_used = 0;
+
+ memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+ memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+ /* Tokenize the data and add each word string, its corresponding
+ doc id and position to sort buffer */
+ for (i = t_ctx->processed_len; i < doc->text.f_len; i += inc) {
+ ib_rbt_bound_t parent;
+ ulint idx = 0;
+ ib_uint32_t position;
+ ulint offset = 0;
+ ulint cur_len = 0;
+ doc_id_t write_doc_id;
+
+ inc = innobase_mysql_fts_get_token(
+ doc->charset, doc->text.f_str + i,
+ doc->text.f_str + doc->text.f_len, &str, &offset);
+
+ ut_a(inc > 0);
+
+ /* Ignore strings whose character count is less than
+ "fts_min_token_size" or more than "fts_max_token_size" */
+ if (str.f_n_char < fts_min_token_size
+ || str.f_n_char > fts_max_token_size) {
+
+ t_ctx->processed_len += inc;
+ continue;
+ }
+
+ t_str.f_len = innobase_fts_casedn_str(
+ doc->charset, (char*) str.f_str, str.f_len,
+ (char*) &str_buf, FTS_MAX_WORD_LEN + 1);
+
+ t_str.f_str = (byte*) &str_buf;
+
+ /* if "cached_stopword" is defined, ignore words in the
+ stopword list */
+ if (t_ctx->cached_stopword
+ && rbt_search(t_ctx->cached_stopword,
+ &parent, &t_str) == 0) {
+
+ t_ctx->processed_len += inc;
+ continue;
+ }
+
+ /* There are FTS_NUM_AUX_INDEX auxiliary tables; find
+ out which sort buffer to put this word record in */
+ t_ctx->buf_used = fts_select_index(
+ doc->charset, t_str.f_str, t_str.f_len);
+
+ buf = sort_buf[t_ctx->buf_used];
+
+ ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX);
+ idx = t_ctx->buf_used;
+
+ mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]];
+
+ field = mtuple->fields = static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap,
+ FTS_NUM_FIELDS_SORT * sizeof *field));
+
+ /* The first field is the tokenized word */
+ dfield_set_data(field, t_str.f_str, t_str.f_len);
+ len = dfield_get_len(field);
+
+ field->type.mtype = word_dtype->mtype;
+ field->type.prtype = word_dtype->prtype | DATA_NOT_NULL;
+
+ /* Variable length field, set to max size. */
+ field->type.len = FTS_MAX_WORD_LEN;
+ field->type.mbminmaxlen = word_dtype->mbminmaxlen;
+
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+ field++;
+
+ /* The second field is the Doc ID */
+
+ ib_uint32_t doc_id_32_bit;
+
+ if (!opt_doc_id_size) {
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+
+ dfield_set_data(
+ field, &write_doc_id, sizeof(write_doc_id));
+ } else {
+ mach_write_to_4(
+ (byte*) &doc_id_32_bit, (ib_uint32_t) doc_id);
+
+ dfield_set_data(
+ field, &doc_id_32_bit, sizeof(doc_id_32_bit));
+ }
+
+ len = field->len;
+ ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t));
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+ field->type.len = len;
+ field->type.mbminmaxlen = 0;
+
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+ ++field;
+
+ /* The third field is the position */
+ mach_write_to_4(
+ (byte*) &position,
+ (i + offset + inc - str.f_len + t_ctx->init_pos));
+
+ dfield_set_data(field, &position, sizeof(position));
+ len = dfield_get_len(field);
+ ut_ad(len == sizeof(ib_uint32_t));
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL;
+ field->type.len = len;
+ field->type.mbminmaxlen = 0;
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+ /* There is one variable-length column, the word, whose
+ length is less than fts_max_token_size; add one extra
+ length byte and one extra byte for it */
+ cur_len += 2;
+
+ /* Reserve one byte for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size[idx] + cur_len
+ >= srv_sort_buf_size - 1) {
+
+ buf_full = TRUE;
+ break;
+ }
+
+ /* Increment the number of tuples */
+ n_tuple[idx]++;
+ t_ctx->processed_len += inc;
+ data_size[idx] += cur_len;
+ }
+
+ /* Update the data length and the number of new word tuples
+ added in this round of tokenization */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ /* The computation of total_size below assumes that no
+ delete-mark flags will be stored and that all fields
+ are NOT NULL and fixed-length.
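+ For example (added arithmetic, not in the original
+ comment): with 8-byte Doc IDs, the word "hello" adds
+ 5 + 8 + 4 + 2 = 19 bytes to data_size[idx] above:
+ 5 bytes of word data, 8 bytes of Doc ID, 4 bytes of
+ position, plus the 2 extra bytes.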
*/
+
+ sort_buf[i]->total_size += data_size[i];
+
+ sort_buf[i]->n_tuples += n_tuple[i];
+
+ merge_file[i]->n_rec += n_tuple[i];
+ t_ctx->rows_added[i] += n_tuple[i];
+ }
+
+ if (!buf_full) {
+ /* we pad one byte between texts across two fields */
+ t_ctx->init_pos += doc->text.f_len + 1;
+ }
+
+ return(!buf_full);
+}
+
+/*********************************************************************//**
+Get next doc item from fts_doc_list */
+UNIV_INLINE
+void
+row_merge_fts_get_next_doc_item(
+/*============================*/
+ fts_psort_t* psort_info, /*!< in: psort_info */
+ fts_doc_item_t** doc_item) /*!< in/out: doc item */
+{
+ if (*doc_item != NULL) {
+ ut_free(*doc_item);
+ }
+
+ mutex_enter(&psort_info->mutex);
+
+ *doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+ if (*doc_item != NULL) {
+ UT_LIST_REMOVE(doc_list, psort_info->fts_doc_list,
+ *doc_item);
+
+ ut_ad(psort_info->memory_used >= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len);
+ psort_info->memory_used -= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len;
+ }
+
+ mutex_exit(&psort_info->mutex);
+}
+
+/*********************************************************************//**
+Performs parallel tokenization of the incoming doc strings.
+It also performs the initial in-memory sort of the parsed records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+ void* arg) /*!< in: psort_info for the thread */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint i;
+ fts_doc_item_t* doc_item = NULL;
+ row_merge_buf_t** buf;
+ ibool processed = FALSE;
+ merge_file_t** merge_file;
+ row_merge_block_t** block;
+ int tmpfd[FTS_NUM_AUX_INDEX];
+ ulint mycount[FTS_NUM_AUX_INDEX];
+ ib_uint64_t total_rec = 0;
+ ulint num_doc_processed = 0;
+ doc_id_t last_doc_id = 0;
+ ulint zip_size;
+ mem_heap_t* blob_heap = NULL;
+ fts_doc_t doc;
+ dict_table_t* table = psort_info->psort_common->new_table;
+ dtype_t word_dtype;
+ dict_field_t* idx_field;
+ fts_tokenize_ctx_t t_ctx;
+ ulint retried = 0;
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(psort_info);
+
+ buf = psort_info->merge_buf;
+ merge_file = psort_info->merge_file;
+ blob_heap = mem_heap_create(512);
+ memset(&doc, 0, sizeof(doc));
+ memset(&t_ctx, 0, sizeof(t_ctx));
+ memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(int));
+
+ doc.charset = fts_index_get_charset(
+ psort_info->psort_common->dup->index);
+
+ idx_field = dict_index_get_nth_field(
+ psort_info->psort_common->dup->index, 0);
+ word_dtype.prtype = idx_field->col->prtype;
+ word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen;
+ word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0)
+ ? DATA_VARCHAR : DATA_VARMYSQL;
+
+ block = psort_info->merge_block;
+ zip_size = dict_table_zip_size(table);
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ t_ctx.cached_stopword = table->fts->cache->stopword_info.cached_stopword;
+ processed = TRUE;
+loop:
+ while (doc_item) {
+ dfield_t* dfield = doc_item->field;
+
+ last_doc_id = doc_item->doc_id;
+
+ ut_ad (dfield->data != NULL
+ && dfield_get_len(dfield) != UNIV_SQL_NULL);
+
+ /* If we finished processing the last item, update "doc" with
+ the strings in doc_item; otherwise continue processing the
+ last item */
+ if (processed) {
+ byte* data;
+ ulint data_len;
+
+ dfield = doc_item->field;
+ data = static_cast<byte*>(dfield_get_data(dfield));
+ data_len = dfield_get_len(dfield);
+
+ if (dfield_is_ext(dfield)) {
+ doc.text.f_str =
+ btr_copy_externally_stored_field(
+ &doc.text.f_len, data,
+ zip_size, data_len, blob_heap);
+ } else {
+ doc.text.f_str = data;
+ doc.text.f_len = data_len;
+ }
+
+ doc.tokens = 0;
+ t_ctx.processed_len = 0;
+ } else {
+ /* Not yet finished processing the "doc" on hand;
+ continue processing it */
+ ut_ad(doc.text.f_str);
+ ut_ad(t_ctx.processed_len < doc.text.f_len);
+ }
+
+ processed = row_merge_fts_doc_tokenize(
+ buf, doc_item->doc_id, &doc,
+ &word_dtype,
+ merge_file, psort_info->psort_common->opt_doc_id_size,
+ &t_ctx);
+
+ /* Current sort buffer full, need to recycle */
+ if (!processed) {
+ ut_ad(t_ctx.processed_len < doc.text.f_len);
+ ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
+ break;
+ }
+
+ num_doc_processed++;
+
+ if (fts_enable_diag_print && num_doc_processed % 10000 == 1) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "number of docs processed %d\n",
+ (int) num_doc_processed);
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "ID %d, partition %d, word "
+ "%d\n",(int) psort_info->psort_id,
+ (int) i, (int) mycount[i]);
+ }
+#endif
+ }
+
+ mem_heap_empty(blob_heap);
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item && last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+ }
+
+ /* If we ran out of the current sort buffer, we need to sort
+ and flush the sort buffer to disk */
+ if (t_ctx.rows_added[t_ctx.buf_used] && !processed) {
+ row_merge_buf_sort(buf[t_ctx.buf_used], NULL);
+ row_merge_buf_write(buf[t_ctx.buf_used],
+ merge_file[t_ctx.buf_used],
+ block[t_ctx.buf_used]);
+
+ if (!row_merge_write(merge_file[t_ctx.buf_used]->fd,
+ merge_file[t_ctx.buf_used]->offset++,
+ block[t_ctx.buf_used])) {
+ error = DB_TEMP_FILE_WRITE_FAILURE;
+ goto func_exit;
+ }
+
+ UNIV_MEM_INVALID(block[t_ctx.buf_used][0], srv_sort_buf_size);
+ buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]);
+ mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used];
+ t_ctx.rows_added[t_ctx.buf_used] = 0;
+
+ ut_a(doc_item);
+ goto loop;
+ }
+
+ /* The parent is done scanning; if we have finished processing
+ all the docs, exit */
+ if (psort_info->state == FTS_PARENT_COMPLETE) {
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) == 0) {
+ goto exit;
+ } else if (retried > 10000) {
+ ut_ad(!doc_item);
+ /* retried too many times and cannot get a new record */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "InnoDB: FTS parallel sort processed "
+ "%lu records, the sort queue has "
+ "%lu records. But sort cannot get "
+ "the next records", num_doc_processed,
+ UT_LIST_GET_LEN(
+ psort_info->fts_doc_list));
+ goto exit;
+ }
+ } else if (psort_info->state == FTS_PARENT_EXITING) {
+ /* Parent abort */
+ goto func_exit;
+ }
+
+ if (doc_item == NULL) {
+ os_thread_yield();
+ }
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item != NULL) {
+ if (last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+
+ retried = 0;
+ } else if (psort_info->state == FTS_PARENT_COMPLETE) {
+ retried++;
+ }
+
+ goto loop;
+
+exit:
+ /* Do a final sort of the last (or latest) batch of records
+ in block memory. Flush them to the temp file if the records
+ cannot be held in one block of memory */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (t_ctx.rows_added[i]) {
+ row_merge_buf_sort(buf[i], NULL);
+ row_merge_buf_write(
+ buf[i], merge_file[i], block[i]);
+
+ /* Write to the temp file only if records have
+ been flushed to the temp file before (offset > 0).
+ The pseudo code for the sort is as follows:
+
+ while (there are rows) {
+ tokenize rows, put result in block[]
+ if (block[] runs out) {
+ sort rows;
+ write to temp file with
+ row_merge_write();
+ offset++;
+ }
+ }
+
+ # write out the last batch
+ if (offset > 0) {
+ row_merge_write();
+ offset++;
+ } else {
+ # no need to write anything
+ offset stays 0
+ }
+
+ So if merge_file[i]->offset is 0 when we get
+ here for the last batch, the rows have never
+ been flushed to the temp file and can all be
+ held in memory */
+ if (merge_file[i]->offset != 0) {
+ if (!row_merge_write(merge_file[i]->fd,
+ merge_file[i]->offset++,
+ block[i])) {
+ error = DB_TEMP_FILE_WRITE_FAILURE;
+ goto func_exit;
+ }
+
+ UNIV_MEM_INVALID(block[i][0],
+ srv_sort_buf_size);
+ }
+
+ buf[i] = row_merge_buf_empty(buf[i]);
+ t_ctx.rows_added[i] = 0;
+ }
+ }
+
+ if (fts_enable_diag_print) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: start merge sort\n");
+ }
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (!merge_file[i]->offset) {
+ continue;
+ }
+
+ tmpfd[i] = row_merge_file_create_low();
+ if (tmpfd[i] < 0) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ error = row_merge_sort(psort_info->psort_common->trx,
+ psort_info->psort_common->dup,
+ merge_file[i], block[i], &tmpfd[i]);
+ if (error != DB_SUCCESS) {
+ close(tmpfd[i]);
+ goto func_exit;
+ }
+
+ total_rec += merge_file[i]->n_rec;
+ close(tmpfd[i]);
+ }
+
+func_exit:
+ if (fts_enable_diag_print) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n");
+ }
+
+ mem_heap_free(blob_heap);
+
+ mutex_enter(&psort_info->mutex);
+ psort_info->error = error;
+ mutex_exit(&psort_info->mutex);
+
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) > 0) {
+ /* the child can exit either on error or when told
+ by the parent */
+ ut_ad(error != DB_SUCCESS
+ || psort_info->state == FTS_PARENT_EXITING);
+ }
+
+ /* Free the fts doc list in case of error.
*/
+ do {
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+ } while (doc_item != NULL);
+
+ psort_info->child_status = FTS_CHILD_COMPLETE;
+ os_event_set(psort_info->psort_common->sort_event);
+ psort_info->child_status = FTS_CHILD_EXITING;
+
+#ifdef __WIN__
+ CloseHandle(psort_info->thread_hdl);
+#endif /*__WIN__ */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+ fts_psort_t* psort_info) /*!< parallel sort structure */
+{
+ ulint i = 0;
+ os_thread_id_t thd_id;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ psort_info[i].psort_id = i;
+ psort_info[i].thread_hdl = os_thread_create(
+ fts_parallel_tokenization,
+ (void*) &psort_info[i], &thd_id);
+ }
+}
+
+/*********************************************************************//**
+Performs the merge and insertion of the sorted records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+ void* arg) /*!< in: parallel merge info */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint id;
+
+ ut_ad(psort_info);
+
+ id = psort_info->psort_id;
+
+ row_fts_merge_insert(psort_info->psort_common->dup->index,
+ psort_info->psort_common->new_table,
+ psort_info->psort_common->all_info, id);
+
+ psort_info->child_status = FTS_CHILD_COMPLETE;
+ os_event_set(psort_info->psort_common->merge_event);
+ psort_info->child_status = FTS_CHILD_EXITING;
+
+#ifdef __WIN__
+ CloseHandle(psort_info->thread_hdl);
+#endif /*__WIN__ */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Kick off the parallel merge and insert threads */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+ fts_psort_t* merge_info) /*!< in: parallel sort info */
+{
+ int i = 0;
+ os_thread_id_t thd_id;
+
+ /* Kick off merge/insert threads */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ merge_info[i].psort_id = i;
+ merge_info[i].child_status = 0;
+
+ merge_info[i].thread_hdl = os_thread_create(
+ fts_parallel_merge, (void*) &merge_info[i], &thd_id);
+ }
+}
+
+/********************************************************************//**
+Insert processed FTS data to auxiliary index tables.
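+The target table is selected from the word itself by fts_select_index(),
+so every occurrence of a given word always lands in the same one of the
+FTS_NUM_AUX_INDEX auxiliary tables (an added note; e.g. all postings for
+"hello" go to one fixed partition, written out by fts_write_node() below).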
+@return DB_SUCCESS if insertion runs fine */
+static __attribute__((nonnull))
+dberr_t
+row_merge_write_fts_word(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** ins_graph, /*!< in: Insert query graphs */
+ fts_tokenizer_word_t* word, /*!< in: sorted and tokenized
+ word */
+ fts_table_t* fts_table, /*!< in: fts aux table instance */
+ CHARSET_INFO* charset) /*!< in: charset */
+{
+ ulint selected;
+ dberr_t ret = DB_SUCCESS;
+
+ selected = fts_select_index(
+ charset, word->text.f_str, word->text.f_len);
+ fts_table->suffix = fts_get_suffix(selected);
+
+ /* Pop each fts_node in word->nodes and write it to the
+ auxiliary table */
+ while (ib_vector_size(word->nodes) > 0) {
+ dberr_t error;
+ fts_node_t* fts_node;
+
+ fts_node = static_cast<fts_node_t*>(ib_vector_pop(word->nodes));
+
+ error = fts_write_node(
+ trx, &ins_graph[selected], fts_table, &word->text,
+ fts_node);
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr, "InnoDB: failed to write"
+ " word %s to FTS auxiliary index"
+ " table, error (%s) \n",
+ word->text.f_str, ut_strerr(error));
+ ret = error;
+ }
+
+ ut_free(fts_node->ilist);
+ fts_node->ilist = NULL;
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Read sorted FTS data files and insert the data tuples to the
+auxiliary tables. */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+ fts_psort_insert_t*
+ ins_ctx, /*!< in: insert context */
+ fts_tokenizer_word_t* word, /*!< in: last processed
+ tokenized word */
+ ib_vector_t* positions, /*!< in: word position */
+ doc_id_t* in_doc_id, /*!< in: last item doc id */
+ dtuple_t* dtuple) /*!< in: entry to insert */
+{
+ fts_node_t* fts_node = NULL;
+ dfield_t* dfield;
+ doc_id_t doc_id;
+ ulint position;
+ fts_string_t token_word;
+ ulint i;
+
+ /* Get the fts_node for the FTS auxiliary INDEX table */
+ if (ib_vector_size(word->nodes) > 0) {
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_last(word->nodes));
+ }
+
+ if (fts_node == NULL
+ || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) {
+
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ memset(fts_node, 0x0, sizeof(*fts_node));
+ }
+
+ /* If dtuple == NULL, this is the last word to be processed */
+ if (!dtuple) {
+ if (fts_node && ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id,
+ positions);
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx->trx,
+ ins_ctx->ins_graph, word,
+ &ins_ctx->fts_table,
+ ins_ctx->charset);
+
+ }
+
+ return;
+ }
+
+ /* Get the first field for the tokenized word */
+ dfield = dtuple_get_nth_field(dtuple, 0);
+
+ token_word.f_n_char = 0;
+ token_word.f_len = dfield->len;
+ token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (!word->text.f_str) {
+ fts_utf8_string_dup(&word->text, &token_word, ins_ctx->heap);
+ }
+
+ /* Compare to the last word to see if they are the same word */
+ if (innobase_fts_text_cmp(ins_ctx->charset,
+ &word->text, &token_word) != 0) {
+ ulint num_item;
+
+ /* We are getting a new word; flush the last position
+ info for the current word in fts_node */
+ if (ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id, positions);
+ }
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx->trx, ins_ctx->ins_graph,
+ word, &ins_ctx->fts_table,
+ ins_ctx->charset);
+
+ /* Copy the new word */
+ fts_utf8_string_dup(&word->text, &token_word,
ins_ctx->heap); + + num_item = ib_vector_size(positions); + + /* Clean up position queue */ + for (i = 0; i < num_item; i++) { + ib_vector_pop(positions); + } + + /* Reset Doc ID */ + *in_doc_id = 0; + memset(fts_node, 0x0, sizeof(*fts_node)); + } + + /* Get the word's Doc ID */ + dfield = dtuple_get_nth_field(dtuple, 1); + + if (!ins_ctx->opt_doc_id_size) { + doc_id = fts_read_doc_id( + static_cast<byte*>(dfield_get_data(dfield))); + } else { + doc_id = (doc_id_t) mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + } + + /* Get the word's position info */ + dfield = dtuple_get_nth_field(dtuple, 2); + position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield))); + + /* If this is the same word as the last word, and they + have the same Doc ID, we just need to add its position + info. Otherwise, we will flush position info to the + fts_node and initiate a new position vector */ + if (!(*in_doc_id) || *in_doc_id == doc_id) { + ib_vector_push(positions, &position); + } else { + ulint num_pos = ib_vector_size(positions); + + fts_cache_node_add_positions(NULL, fts_node, + *in_doc_id, positions); + for (i = 0; i < num_pos; i++) { + ib_vector_pop(positions); + } + ib_vector_push(positions, &position); + } + + /* record the current Doc ID */ + *in_doc_id = doc_id; +} + +/*********************************************************************//** +Propagate a newly added record up one level in the selection tree +@return parent where this value propagated to */ +static +int +row_fts_sel_tree_propagate( +/*=======================*/ + int propogated, /*<! in: tree node propagated */ + int* sel_tree, /*<! in: selection tree */ + const mrec_t** mrec, /*<! in: sort record */ + ulint** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in/out: FTS index */ +{ + ulint parent; + int child_left; + int child_right; + int selected; + + /* Find which parent this value will be propagated to */ + parent = (propogated - 1) / 2; + + /* Find out which value is smaller, and to propagate */ + child_left = sel_tree[parent * 2 + 1]; + child_right = sel_tree[parent * 2 + 2]; + + if (child_left == -1 || mrec[child_left] == NULL) { + if (child_right == -1 + || mrec[child_right] == NULL) { + selected = -1; + } else { + selected = child_right ; + } + } else if (child_right == -1 + || mrec[child_right] == NULL) { + selected = child_left; + } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right], + offsets[child_left], + offsets[child_right], + index, NULL) < 0) { + selected = child_left; + } else { + selected = child_right; + } + + sel_tree[parent] = selected; + + return(static_cast<int>(parent)); +} + +/*********************************************************************//** +Readjust selection tree after popping the root and read a new value +@return the new root */ +static +int +row_fts_sel_tree_update( +/*====================*/ + int* sel_tree, /*<! in/out: selection tree */ + ulint propagated, /*<! in: node to propagate up */ + ulint height, /*<! in: tree height */ + const mrec_t** mrec, /*<! in: sort record */ + ulint** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! 
in: index dictionary */ +{ + ulint i; + + for (i = 1; i <= height; i++) { + propagated = static_cast<ulint>(row_fts_sel_tree_propagate( + static_cast<int>(propagated), sel_tree, mrec, offsets, index)); + } + + return(sel_tree[0]); +} + +/*********************************************************************//** +Build selection tree at a specified level */ +static +void +row_fts_build_sel_tree_level( +/*=========================*/ + int* sel_tree, /*<! in/out: selection tree */ + ulint level, /*<! in: selection tree level */ + const mrec_t** mrec, /*<! in: sort record */ + ulint** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in: index dictionary */ +{ + ulint start; + int child_left; + int child_right; + ulint i; + ulint num_item; + + start = static_cast<ulint>((1 << level) - 1); + num_item = static_cast<ulint>(1 << level); + + for (i = 0; i < num_item; i++) { + child_left = sel_tree[(start + i) * 2 + 1]; + child_right = sel_tree[(start + i) * 2 + 2]; + + if (child_left == -1) { + if (child_right == -1) { + sel_tree[start + i] = -1; + } else { + sel_tree[start + i] = child_right; + } + continue; + } else if (child_right == -1) { + sel_tree[start + i] = child_left; + continue; + } + + /* Deal with NULL child conditions */ + if (!mrec[child_left]) { + if (!mrec[child_right]) { + sel_tree[start + i] = -1; + } else { + sel_tree[start + i] = child_right; + } + continue; + } else if (!mrec[child_right]) { + sel_tree[start + i] = child_left; + continue; + } + + /* Select the smaller one to set parent pointer */ + int cmp = cmp_rec_rec_simple( + mrec[child_left], mrec[child_right], + offsets[child_left], offsets[child_right], + index, NULL); + + sel_tree[start + i] = cmp < 0 ? child_left : child_right; + } +} + +/*********************************************************************//** +Build a selection tree for merge. The selection tree is a binary tree +and should have fts_sort_pll_degree / 2 levels. With root as level 0 +@return number of tree levels */ +static +ulint +row_fts_build_sel_tree( +/*===================*/ + int* sel_tree, /*<! in/out: selection tree */ + const mrec_t** mrec, /*<! in: sort record */ + ulint** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! 
in: index dictionary */ +{ + ulint treelevel = 1; + ulint num = 2; + int i = 0; + ulint start; + + /* No need to build selection tree if we only have two merge threads */ + if (fts_sort_pll_degree <= 2) { + return(0); + } + + while (num < fts_sort_pll_degree) { + num = num << 1; + treelevel++; + } + + start = (1 << treelevel) - 1; + + for (i = 0; i < (int) fts_sort_pll_degree; i++) { + sel_tree[i + start] = i; + } + + for (i = static_cast<int>(treelevel) - 1; i >= 0; i--) { + row_fts_build_sel_tree_level( + sel_tree, static_cast<ulint>(i), mrec, offsets, index); + } + + return(treelevel); +} + +/*********************************************************************//** +Read sorted file containing index data tuples and insert these data +tuples to the index +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +row_fts_merge_insert( +/*=================*/ + dict_index_t* index, /*!< in: index */ + dict_table_t* table, /*!< in: new table */ + fts_psort_t* psort_info, /*!< parallel sort info */ + ulint id) /* !< in: which auxiliary table's data + to insert to */ +{ + const byte** b; + mem_heap_t* tuple_heap; + mem_heap_t* heap; + dberr_t error = DB_SUCCESS; + ulint* foffs; + ulint** offsets; + fts_tokenizer_word_t new_word; + ib_vector_t* positions; + doc_id_t last_doc_id; + ib_alloc_t* heap_alloc; + ulint n_bytes; + ulint i; + mrec_buf_t** buf; + int* fd; + byte** block; + const mrec_t** mrec; + ulint count = 0; + int* sel_tree; + ulint height; + ulint start; + fts_psort_insert_t ins_ctx; + ulint count_diag = 0; + + ut_ad(index); + ut_ad(table); + + /* We use the insert query graph as the dummy graph + needed in the row module call */ + + ins_ctx.trx = trx_allocate_for_background(); + + ins_ctx.trx->op_info = "inserting index entries"; + + ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size; + + heap = mem_heap_create(500 + sizeof(mrec_buf_t)); + + b = (const byte**) mem_heap_alloc( + heap, sizeof (*b) * fts_sort_pll_degree); + foffs = (ulint*) mem_heap_alloc( + heap, sizeof(*foffs) * fts_sort_pll_degree); + offsets = (ulint**) mem_heap_alloc( + heap, sizeof(*offsets) * fts_sort_pll_degree); + buf = (mrec_buf_t**) mem_heap_alloc( + heap, sizeof(*buf) * fts_sort_pll_degree); + fd = (int*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree); + block = (byte**) mem_heap_alloc( + heap, sizeof(*block) * fts_sort_pll_degree); + mrec = (const mrec_t**) mem_heap_alloc( + heap, sizeof(*mrec) * fts_sort_pll_degree); + sel_tree = (int*) mem_heap_alloc( + heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2)); + + tuple_heap = mem_heap_create(1000); + + ins_ctx.charset = fts_index_get_charset(index); + ins_ctx.heap = heap; + + for (i = 0; i < fts_sort_pll_degree; i++) { + ulint num; + + num = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + offsets[i] = static_cast<ulint*>(mem_heap_zalloc( + heap, num * sizeof *offsets[i])); + offsets[i][0] = num; + offsets[i][1] = dict_index_get_n_fields(index); + block[i] = psort_info[i].merge_block[id]; + b[i] = psort_info[i].merge_block[id]; + fd[i] = psort_info[i].merge_file[id]->fd; + foffs[i] = 0; + + buf[i] = static_cast<unsigned char (*)[16384]>( + mem_heap_alloc(heap, sizeof *buf[i])); + count_diag += (int) psort_info[i].merge_file[id]->n_rec; + } + + if (fts_enable_diag_print) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB_FTS: to inserted %lu records\n", + (ulong) count_diag); + } + + /* Initialize related variables if creating FTS indexes */ + heap_alloc = ib_heap_allocator_create(heap); + + 
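+	/* new_word accumulates the fts_node_t entries of the token
+	currently being merged and positions collects that token's word
+	positions; both are reset whenever the token or its Doc ID
+	changes, see row_fts_insert_tuple() above. */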
memset(&new_word, 0, sizeof(new_word)); + + new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4); + positions = ib_vector_create(heap_alloc, sizeof(ulint), 32); + last_doc_id = 0; + + /* Allocate insert query graphs for FTS auxillary + Index Table, note we have FTS_NUM_AUX_INDEX such index tables */ + n_bytes = sizeof(que_t*) * (FTS_NUM_AUX_INDEX + 1); + ins_ctx.ins_graph = static_cast<que_t**>(mem_heap_alloc(heap, n_bytes)); + memset(ins_ctx.ins_graph, 0x0, n_bytes); + + /* We should set the flags2 with aux_table_name here, + in order to get the correct aux table names. */ + index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;); + + ins_ctx.fts_table.type = FTS_INDEX_TABLE; + ins_ctx.fts_table.index_id = index->id; + ins_ctx.fts_table.table_id = table->id; + ins_ctx.fts_table.parent = index->table->name; + ins_ctx.fts_table.table = index->table; + + for (i = 0; i < fts_sort_pll_degree; i++) { + if (psort_info[i].merge_file[id]->n_rec == 0) { + /* No Rows to read */ + mrec[i] = b[i] = NULL; + } else { + /* Read from temp file only if it has been + written to. Otherwise, block memory holds + all the sorted records */ + if (psort_info[i].merge_file[id]->offset > 0 + && (!row_merge_read( + fd[i], foffs[i], + (row_merge_block_t*) block[i]))) { + error = DB_CORRUPTION; + goto exit; + } + + ROW_MERGE_READ_GET_NEXT(i); + } + } + + height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec, + offsets, index); + + start = (1 << height) - 1; + + /* Fetch sorted records from sort buffer and insert them into + corresponding FTS index auxiliary tables */ + for (;;) { + dtuple_t* dtuple; + ulint n_ext; + int min_rec = 0; + + if (fts_sort_pll_degree <= 2) { + while (!mrec[min_rec]) { + min_rec++; + + if (min_rec >= (int) fts_sort_pll_degree) { + row_fts_insert_tuple( + &ins_ctx, &new_word, + positions, &last_doc_id, + NULL); + + goto exit; + } + } + + for (i = min_rec + 1; i < fts_sort_pll_degree; i++) { + if (!mrec[i]) { + continue; + } + + if (cmp_rec_rec_simple( + mrec[i], mrec[min_rec], + offsets[i], offsets[min_rec], + index, NULL) < 0) { + min_rec = static_cast<int>(i); + } + } + } else { + min_rec = sel_tree[0]; + + if (min_rec == -1) { + row_fts_insert_tuple( + &ins_ctx, &new_word, + positions, &last_doc_id, + NULL); + + goto exit; + } + } + + dtuple = row_rec_to_index_entry_low( + mrec[min_rec], index, offsets[min_rec], &n_ext, + tuple_heap); + + row_fts_insert_tuple( + &ins_ctx, &new_word, positions, + &last_doc_id, dtuple); + + + ROW_MERGE_READ_GET_NEXT(min_rec); + + if (fts_sort_pll_degree > 2) { + if (!mrec[min_rec]) { + sel_tree[start + min_rec] = -1; + } + + row_fts_sel_tree_update(sel_tree, start + min_rec, + height, mrec, + offsets, index); + } + + count++; + + mem_heap_empty(tuple_heap); + } + +exit: + fts_sql_commit(ins_ctx.trx); + + ins_ctx.trx->op_info = ""; + + mem_heap_free(tuple_heap); + + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + if (ins_ctx.ins_graph[i]) { + fts_que_graph_free(ins_ctx.ins_graph[i]); + } + } + + trx_free_for_background(ins_ctx.trx); + + mem_heap_free(heap); + + if (fts_enable_diag_print) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB_FTS: inserted %lu records\n", + (ulong) count); + } + + return(error); +} diff --git a/storage/xtradb/row/row0import.cc b/storage/xtradb/row/row0import.cc new file mode 100644 index 00000000000..b753574158a --- /dev/null +++ b/storage/xtradb/row/row0import.cc @@ -0,0 +1,3806 @@ 
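Before moving on to row0import.cc, it is worth seeing the shape of the merge
that row_fts_merge_insert() and the row_fts_sel_tree_* helpers above
implement: a tournament ("selection") tree stored in an array, with the
leaves starting at (1 << height) - 1, each parent (i - 1) / 2 holding the
index of its smaller child's stream, and only the leaf-to-root path being
re-evaluated after the root's stream yields a record. The following is a
minimal, self-contained C++11 sketch of that idea, merging sorted int
vectors instead of sort records; all names here are illustrative, not the
InnoDB code, which compares records with cmp_rec_rec_simple() and marks
exhausted streams with -1 in mrec[].

#include <cstdio>
#include <vector>

struct Merger {
	std::vector<std::vector<int>>	streams;  /* each sorted ascending */
	std::vector<size_t>		pos;      /* read position per stream */
	std::vector<int>		sel_tree; /* parents + leaves, -1 = none */
	size_t				start;    /* index of the first leaf */

	explicit Merger(const std::vector<std::vector<int>>& s)
		: streams(s), pos(s.size(), 0)
	{
		size_t leaves = 1;
		while (leaves < streams.size()) {
			leaves <<= 1;
		}
		start = leaves - 1;
		sel_tree.assign(start + leaves, -1);
		for (size_t i = 0; i < streams.size(); i++) {
			sel_tree[start + i] = (int) i;
		}
		/* Build bottom-up: each parent selects its smaller child. */
		for (size_t n = start; n-- > 0; ) {
			sel_tree[n] = select(sel_tree[2 * n + 1],
					     sel_tree[2 * n + 2]);
		}
	}

	bool empty(int s) const
	{
		return(s < 0 || pos[s] >= streams[s].size());
	}

	int select(int l, int r) const
	{
		if (empty(l)) {
			return(empty(r) ? -1 : r);
		} else if (empty(r)) {
			return(l);
		}
		return(streams[l][pos[l]] <= streams[r][pos[r]] ? l : r);
	}

	/* Pop the global minimum, then re-play only the leaf-to-root
	path, as row_fts_sel_tree_update() does above. */
	bool next(int* out)
	{
		int s = sel_tree[0];
		if (empty(s)) {
			return(false);
		}
		*out = streams[s][pos[s]++];
		for (size_t n = start + s; n > 0; n = (n - 1) / 2) {
			size_t p = (n - 1) / 2;
			sel_tree[p] = select(sel_tree[2 * p + 1],
					     sel_tree[2 * p + 2]);
		}
		return(true);
	}
};

int main()
{
	Merger m({{1, 4, 9}, {2, 3, 8}, {5, 7}, {0, 6}});
	for (int v; m.next(&v); ) {
		std::printf("%d ", v);	/* prints: 0 1 2 3 4 5 6 7 8 9 */
	}
	std::printf("\n");
	return(0);
}

With k streams each pop costs O(log k) comparisons, which is why the code
above only falls back to a linear scan of mrec[] when
fts_sort_pll_degree <= 2.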
+/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0import.cc +Import a tablespace to a running instance. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0import.h" + +#ifdef UNIV_NONINL +#include "row0import.ic" +#endif + +#include "btr0pcur.h" +#include "que0que.h" +#include "dict0boot.h" +#include "ibuf0ibuf.h" +#include "pars0pars.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0mysql.h" +#include "srv0start.h" +#include "row0quiesce.h" + +#include <vector> + +/** The size of the buffer to use for IO. Note: os_file_read() doesn't expect +reads to fail. If you set the buffer size to be greater than a multiple of the +file size then it will assert. TODO: Fix this limitation of the IO functions. +@param n - page size of the tablespace. +@retval number of pages */ +#define IO_BUFFER_SIZE(n) ((1024 * 1024) / n) + +/** For gathering stats on records during phase I */ +struct row_stats_t { + ulint m_n_deleted; /*!< Number of deleted records + found in the index */ + + ulint m_n_purged; /*!< Number of records purged + optimisatically */ + + ulint m_n_rows; /*!< Number of rows */ + + ulint m_n_purge_failed; /*!< Number of deleted rows + that could not be purged */ +}; + +/** Index information required by IMPORT. */ +struct row_index_t { + index_id_t m_id; /*!< Index id of the table + in the exporting server */ + byte* m_name; /*!< Index name */ + + ulint m_space; /*!< Space where it is placed */ + + ulint m_page_no; /*!< Root page number */ + + ulint m_type; /*!< Index type */ + + ulint m_trx_id_offset; /*!< Relevant only for clustered + indexes, offset of transaction + id system column */ + + ulint m_n_user_defined_cols; /*!< User defined columns */ + + ulint m_n_uniq; /*!< Number of columns that can + uniquely identify the row */ + + ulint m_n_nullable; /*!< Number of nullable + columns */ + + ulint m_n_fields; /*!< Total number of fields */ + + dict_field_t* m_fields; /*!< Index fields */ + + const dict_index_t* + m_srv_index; /*!< Index instance in the + importing server */ + + row_stats_t m_stats; /*!< Statistics gathered during + the import phase */ + +}; + +/** Meta data required by IMPORT. */ +struct row_import { + row_import() UNIV_NOTHROW + : + m_table(), + m_version(), + m_hostname(), + m_table_name(), + m_autoinc(), + m_page_size(), + m_flags(), + m_n_cols(), + m_cols(), + m_col_names(), + m_n_indexes(), + m_indexes(), + m_missing(true) { } + + ~row_import() UNIV_NOTHROW; + + /** + Find the index entry in in the indexes array. + @param name - index name + @return instance if found else 0. 
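+	Lookup is a byte-for-byte strcmp() of the stored name, so it is
+	case-sensitive (see the definition later in this file).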
*/ + row_index_t* get_index(const char* name) const UNIV_NOTHROW; + + /** + Get the number of rows in the index. + @param name - index name + @return number of rows (doesn't include delete marked rows). */ + ulint get_n_rows(const char* name) const UNIV_NOTHROW; + + /** + Find the ordinal value of the column name in the cfg table columns. + @param name - of column to look for. + @return ULINT_UNDEFINED if not found. */ + ulint find_col(const char* name) const UNIV_NOTHROW; + + /** + Find the index field entry in in the cfg indexes fields. + @name - of the index to look for + @return instance if found else 0. */ + const dict_field_t* find_field( + const row_index_t* cfg_index, + const char* name) const UNIV_NOTHROW; + + /** + Get the number of rows for which purge failed during the convert phase. + @param name - index name + @return number of rows for which purge failed. */ + ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW; + + /** + Check if the index is clean. ie. no delete-marked records + @param name - index name + @return true if index needs to be purged. */ + bool requires_purge(const char* name) const UNIV_NOTHROW + { + return(get_n_purge_failed(name) > 0); + } + + /** + Set the index root <space, pageno> using the index name */ + void set_root_by_name() UNIV_NOTHROW; + + /** + Set the index root <space, pageno> using a heuristic + @return DB_SUCCESS or error code */ + dberr_t set_root_by_heuristic() UNIV_NOTHROW; + + /** Check if the index schema that was read from the .cfg file + matches the in memory index definition. + Note: It will update row_import_t::m_srv_index to map the meta-data + read from the .cfg file to the server index instance. + @return DB_SUCCESS or error code. */ + dberr_t match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW; + + /** + Check if the table schema that was read from the .cfg file matches the + in memory table definition. + @param thd - MySQL session variable + @return DB_SUCCESS or error code. */ + dberr_t match_table_columns( + THD* thd) UNIV_NOTHROW; + + /** + Check if the table (and index) schema that was read from the .cfg file + matches the in memory table definition. + @param thd - MySQL session variable + @return DB_SUCCESS or error code. */ + dberr_t match_schema( + THD* thd) UNIV_NOTHROW; + + dict_table_t* m_table; /*!< Table instance */ + + ulint m_version; /*!< Version of config file */ + + byte* m_hostname; /*!< Hostname where the + tablespace was exported */ + byte* m_table_name; /*!< Exporting instance table + name */ + + ib_uint64_t m_autoinc; /*!< Next autoinc value */ + + ulint m_page_size; /*!< Tablespace page size */ + + ulint m_flags; /*!< Table flags */ + + ulint m_n_cols; /*!< Number of columns in the + meta-data file */ + + dict_col_t* m_cols; /*!< Column data */ + + byte** m_col_names; /*!< Column names, we store the + column naems separately becuase + there is no field to store the + value in dict_col_t */ + + ulint m_n_indexes; /*!< Number of indexes, + including clustered index */ + + row_index_t* m_indexes; /*!< Index meta data */ + + bool m_missing; /*!< true if a .cfg file was + found and was readable */ +}; + +/** Use the page cursor to iterate over records in a block. */ +class RecIterator { +public: + /** + Default constructor */ + RecIterator() UNIV_NOTHROW + { + memset(&m_cur, 0x0, sizeof(m_cur)); + } + + /** + Position the cursor on the first user record. 
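+	The cursor starts on the infimum and is advanced once, so on a
+	page with no user records it lands on the supremum and end() is
+	true immediately.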
*/
+	void	open(buf_block_t* block) UNIV_NOTHROW
+	{
+		page_cur_set_before_first(block, &m_cur);
+
+		if (!end()) {
+			next();
+		}
+	}
+
+	/**
+	Move to the next record. */
+	void	next() UNIV_NOTHROW
+	{
+		page_cur_move_to_next(&m_cur);
+	}
+
+	/**
+	@return the current record */
+	rec_t*	current() UNIV_NOTHROW
+	{
+		ut_ad(!end());
+		return(page_cur_get_rec(&m_cur));
+	}
+
+	/**
+	@return true if cursor is at the end */
+	bool	end() UNIV_NOTHROW
+	{
+		return(page_cur_is_after_last(&m_cur) == TRUE);
+	}
+
+	/** Remove the current record
+	@return true on success */
+	bool	remove(
+		const dict_index_t*	index,
+		page_zip_des_t*		page_zip,
+		ulint*			offsets) UNIV_NOTHROW
+	{
+		/* We can't end up with an empty page unless it is root. */
+		if (page_get_n_recs(m_cur.block->frame) <= 1) {
+			return(false);
+		}
+
+		return(page_delete_rec(index, &m_cur, page_zip, offsets));
+	}
+
+private:
+	page_cur_t	m_cur;
+};
+
+/** Class that purges delete marked records from indexes, both secondary
+and clustered. It does a pessimistic delete. This should only be done if we
+couldn't purge the delete marked records during Phase I. */
+class IndexPurge {
+public:
+	/** Constructor
+	@param trx - the user transaction covering the import tablespace
+	@param index - index to be imported */
+	IndexPurge(
+		trx_t*		trx,
+		dict_index_t*	index) UNIV_NOTHROW
+		:
+		m_trx(trx),
+		m_index(index),
+		m_n_rows(0)
+	{
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Phase II - Purge records from index %s",
+			index->name);
+	}
+
+	/** Destructor */
+	~IndexPurge() UNIV_NOTHROW { }
+
+	/** Purge delete marked records.
+	@return DB_SUCCESS or error code. */
+	dberr_t	garbage_collect() UNIV_NOTHROW;
+
+	/** The number of records that are not delete marked.
+	@return total records in the index after purge */
+	ulint	get_n_rows() const UNIV_NOTHROW
+	{
+		return(m_n_rows);
+	}
+
+private:
+	/**
+	Begin import, position the cursor on the first record. */
+	void	open() UNIV_NOTHROW;
+
+	/**
+	Close the persistent cursor and commit the mini-transaction. */
+	void	close() UNIV_NOTHROW;
+
+	/**
+	Position the cursor on the next record.
+	@return DB_SUCCESS or error code */
+	dberr_t	next() UNIV_NOTHROW;
+
+	/**
+	Store the persistent cursor position and reopen the
+	B-tree cursor in BTR_MODIFY_TREE mode, because the
+	tree structure may be changed during a pessimistic delete. */
+	void	purge_pessimistic_delete() UNIV_NOTHROW;
+
+	/**
+	Purge delete-marked records. */
+	void	purge() UNIV_NOTHROW;
+
+protected:
+	// Disable copying
+	IndexPurge();
+	IndexPurge(const IndexPurge&);
+	IndexPurge &operator=(const IndexPurge&);
+
+private:
+	trx_t*		m_trx;		/*!< User transaction */
+	mtr_t		m_mtr;		/*!< Mini-transaction */
+	btr_pcur_t	m_pcur;		/*!< Persistent cursor */
+	dict_index_t*	m_index;	/*!< Index to be processed */
+	ulint		m_n_rows;	/*!< Records in index */
+};
+
+/** Functor that is called for each physical page that is read from the
+tablespace file.
*/ +class AbstractCallback : public PageCallback { +public: + /** Constructor + @param trx - covering transaction */ + AbstractCallback(trx_t* trx) + : + m_trx(trx), + m_space(ULINT_UNDEFINED), + m_xdes(), + m_xdes_page_no(ULINT_UNDEFINED), + m_space_flags(ULINT_UNDEFINED), + m_table_flags(ULINT_UNDEFINED) UNIV_NOTHROW { } + + /** + Free any extent descriptor instance */ + virtual ~AbstractCallback() + { + delete [] m_xdes; + } + + /** Determine the page size to use for traversing the tablespace + @param file_size - size of the tablespace file in bytes + @param block - contents of the first page in the tablespace file. + @retval DB_SUCCESS or error code. */ + virtual dberr_t init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW; + + /** @return true if compressed table. */ + bool is_compressed_table() const UNIV_NOTHROW + { + return(get_zip_size() > 0); + } + +protected: + /** + Get the data page depending on the table type, compressed or not. + @param block - block read from disk + @retval the buffer frame */ + buf_frame_t* get_frame(buf_block_t* block) const UNIV_NOTHROW + { + if (is_compressed_table()) { + return(block->page.zip.data); + } + + return(buf_block_get_frame(block)); + } + + /** Check for session interrupt. If required we could + even flush to disk here every N pages. + @retval DB_SUCCESS or error code */ + dberr_t periodic_check() UNIV_NOTHROW + { + if (trx_is_interrupted(m_trx)) { + return(DB_INTERRUPTED); + } + + return(DB_SUCCESS); + } + + /** + Get the physical offset of the extent descriptor within the page. + @param page_no - page number of the extent descriptor + @param page - contents of the page containing the extent descriptor. + @return the start of the xdes array in a page */ + const xdes_t* xdes( + ulint page_no, + const page_t* page) const UNIV_NOTHROW + { + ulint offset; + + offset = xdes_calc_descriptor_index(get_zip_size(), page_no); + + return(page + XDES_ARR_OFFSET + XDES_SIZE * offset); + } + + /** + Set the current page directory (xdes). If the extent descriptor is + marked as free then free the current extent descriptor and set it to + 0. This implies that all pages that are covered by this extent + descriptor are also freed. + + @param page_no - offset of page within the file + @param page - page contents + @return DB_SUCCESS or error code. */ + dberr_t set_current_xdes( + ulint page_no, + const page_t* page) UNIV_NOTHROW + { + m_xdes_page_no = page_no; + + delete[] m_xdes; + + m_xdes = 0; + + ulint state; + const xdes_t* xdesc = page + XDES_ARR_OFFSET; + + state = mach_read_ulint(xdesc + XDES_STATE, MLOG_4BYTES); + + if (state != XDES_FREE) { + + m_xdes = new(std::nothrow) xdes_t[m_page_size]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_13", + delete [] m_xdes; m_xdes = 0;); + + if (m_xdes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(m_xdes, page, m_page_size); + } + + return(DB_SUCCESS); + } + + /** + @return true if it is a root page */ + bool is_root_page(const page_t* page) const UNIV_NOTHROW + { + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + return(mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL + && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL); + } + + /** + Check if the page is marked as free in the extent descriptor. + @param page_no - page number to check in the extent descriptor. 
+ @return true if the page is marked as free */ + bool is_free(ulint page_no) const UNIV_NOTHROW + { + ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no) + == m_xdes_page_no); + + if (m_xdes != 0) { + const xdes_t* xdesc = xdes(page_no, m_xdes); + ulint pos = page_no % FSP_EXTENT_SIZE; + + return(xdes_get_bit(xdesc, XDES_FREE_BIT, pos)); + } + + /* If the current xdes was free, the page must be free. */ + return(true); + } + +protected: + /** Covering transaction. */ + trx_t* m_trx; + + /** Space id of the file being iterated over. */ + ulint m_space; + + /** Minimum page number for which the free list has not been + initialized: the pages >= this limit are, by definition, free; + note that in a single-table tablespace where size < 64 pages, + this number is 64, i.e., we have initialized the space about + the first extent, but have not physically allocted those pages + to the file. @see FSP_LIMIT. */ + ulint m_free_limit; + + /** Current size of the space in pages */ + ulint m_size; + + /** Current extent descriptor page */ + xdes_t* m_xdes; + + /** Physical page offset in the file of the extent descriptor */ + ulint m_xdes_page_no; + + /** Flags value read from the header page */ + ulint m_space_flags; + + /** Derived from m_space_flags and row format type, the row format + type is determined from the page header. */ + ulint m_table_flags; +}; + +/** Determine the page size to use for traversing the tablespace +@param file_size - size of the tablespace file in bytes +@param block - contents of the first page in the tablespace file. +@retval DB_SUCCESS or error code. */ +dberr_t +AbstractCallback::init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW +{ + const page_t* page = block->frame; + + m_space_flags = fsp_header_get_flags(page); + + /* Since we don't know whether it is a compressed table + or not, the data is always read into the block->frame. */ + + dberr_t err = set_zip_size(block->frame); + + if (err != DB_SUCCESS) { + return(DB_CORRUPTION); + } + + /* Set the page size used to traverse the tablespace. */ + + m_page_size = (is_compressed_table()) + ? get_zip_size() : fsp_flags_get_page_size(m_space_flags); + + if (m_page_size == 0) { + ib_logf(IB_LOG_LEVEL_ERROR, "Page size is 0"); + return(DB_CORRUPTION); + } else if (!is_compressed_table() && m_page_size != UNIV_PAGE_SIZE) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Page size %lu of ibd file is not the same " + "as the server page size %lu", + m_page_size, UNIV_PAGE_SIZE); + + return(DB_CORRUPTION); + + } else if ((file_size % m_page_size)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "File size " UINT64PF " is not a multiple " + "of the page size %lu", + (ib_uint64_t) file_size, (ulong) m_page_size); + + return(DB_CORRUPTION); + } + + ut_a(m_space == ULINT_UNDEFINED); + + m_size = mach_read_from_4(page + FSP_SIZE); + m_free_limit = mach_read_from_4(page + FSP_FREE_LIMIT); + m_space = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID); + + if ((err = set_current_xdes(0, page)) != DB_SUCCESS) { + return(err); + } + + return(DB_SUCCESS); +} + +/** +Try and determine the index root pages by checking if the next/prev +pointers are both FIL_NULL. We need to ensure that skip deleted pages. */ +struct FetchIndexRootPages : public AbstractCallback { + + /** Index information gathered from the .ibd file. 
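+	(A root page is recognised by having both FIL_PAGE_PREV and
+	FIL_PAGE_NEXT set to FIL_NULL, see is_root_page() above.)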
*/
+	struct Index {
+
+		Index(index_id_t id, ulint page_no)
+			:
+			m_id(id),
+			m_page_no(page_no) { }
+
+		index_id_t	m_id;		/*!< Index id */
+		ulint		m_page_no;	/*!< Root page number */
+	};
+
+	typedef std::vector<Index> Indexes;
+
+	/** Constructor
+	@param trx - covering (user) transaction
+	@param table - table definition in server. */
+	FetchIndexRootPages(const dict_table_t* table, trx_t* trx)
+		:
+		AbstractCallback(trx),
+		m_table(table) UNIV_NOTHROW { }
+
+	/** Destructor */
+	virtual ~FetchIndexRootPages() UNIV_NOTHROW { }
+
+	/**
+	@retval the space id of the tablespace being iterated over */
+	virtual ulint get_space_id() const UNIV_NOTHROW
+	{
+		return(m_space);
+	}
+
+	/**
+	Check if the .ibd file row format is the same as the table's.
+	@param ibd_table_flags - determined from space and page.
+	@return DB_SUCCESS or error code. */
+	dberr_t check_row_format(ulint ibd_table_flags) UNIV_NOTHROW
+	{
+		dberr_t		err;
+		rec_format_t	ibd_rec_format;
+		rec_format_t	table_rec_format;
+
+		if (!dict_tf_is_valid(ibd_table_flags)) {
+
+			ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLE_SCHEMA_MISMATCH,
+				".ibd file has invalid table flags: %lx",
+				ibd_table_flags);
+
+			return(DB_CORRUPTION);
+		}
+
+		ibd_rec_format = dict_tf_get_rec_format(ibd_table_flags);
+		table_rec_format = dict_tf_get_rec_format(m_table->flags);
+
+		if (table_rec_format != ibd_rec_format) {
+
+			ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLE_SCHEMA_MISMATCH,
+				"Table has %s row format, .ibd "
+				"file has %s row format.",
+				dict_tf_to_row_format_string(m_table->flags),
+				dict_tf_to_row_format_string(ibd_table_flags));
+
+			err = DB_CORRUPTION;
+		} else {
+			err = DB_SUCCESS;
+		}
+
+		return(err);
+	}
+
+	/**
+	Called for each block as it is read from the file.
+	@param offset - physical offset in the file
+	@param block - block to convert, it is not from the buffer pool.
+	@retval DB_SUCCESS or error code. */
+	virtual dberr_t operator() (
+		os_offset_t	offset,
+		buf_block_t*	block) UNIV_NOTHROW;
+
+	/** Update the import configuration that will be used to import
+	the tablespace. */
+	dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW;
+
+	/** Table definition in server. */
+	const dict_table_t*	m_table;
+
+	/** Index information */
+	Indexes			m_indexes;
+};
+
+/**
+Called for each block as it is read from the file. Check index pages to
+determine the exact row format. We can't get that from the tablespace
+header flags alone.
+
+@param offset - physical offset in the file
+@param block - block to convert, it is not from the buffer pool.
+@retval DB_SUCCESS or error code.
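+Note that the row format is checked only against the first root page
+encountered (the m_indexes.size() == 1 test below); later root pages are
+simply recorded.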
*/ +dberr_t +FetchIndexRootPages::operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = periodic_check()) != DB_SUCCESS) { + return(err); + } + + const page_t* page = get_frame(block); + + ulint page_type = fil_page_get_type(page); + + if (block->page.offset * m_page_size != offset) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Page offset doesn't match file offset: " + "page offset: %lu, file offset: %lu", + (ulint) block->page.offset, + (ulint) (offset / m_page_size)); + + err = DB_CORRUPTION; + } else if (page_type == FIL_PAGE_TYPE_XDES) { + err = set_current_xdes(block->page.offset, page); + } else if (page_type == FIL_PAGE_INDEX + && !is_free(block->page.offset) + && is_root_page(page)) { + + index_id_t id = btr_page_get_index_id(page); + ulint page_no = buf_block_get_page_no(block); + + m_indexes.push_back(Index(id, page_no)); + + if (m_indexes.size() == 1) { + + m_table_flags = dict_sys_tables_type_to_tf( + m_space_flags, + page_is_comp(page) ? DICT_N_COLS_COMPACT : 0); + + err = check_row_format(m_table_flags); + } + } + + return(err); +} + +/** +Update the import configuration that will be used to import the tablespace. +@return error code or DB_SUCCESS */ +dberr_t +FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW +{ + Indexes::const_iterator end = m_indexes.end(); + + ut_a(cfg->m_table == m_table); + cfg->m_page_size = m_page_size; + cfg->m_n_indexes = m_indexes.size(); + + if (cfg->m_n_indexes == 0) { + + ib_logf(IB_LOG_LEVEL_ERROR, "No B+Tree found in tablespace"); + + return(DB_CORRUPTION); + } + + cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_11", + delete [] cfg->m_indexes; cfg->m_indexes = 0;); + + if (cfg->m_indexes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + row_index_t* cfg_index = cfg->m_indexes; + + for (Indexes::const_iterator it = m_indexes.begin(); + it != end; + ++it, ++cfg_index) { + + char name[BUFSIZ]; + + ut_snprintf(name, sizeof(name), "index" IB_ID_FMT, it->m_id); + + ulint len = strlen(name) + 1; + + cfg_index->m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_12", + delete [] cfg_index->m_name; + cfg_index->m_name = 0;); + + if (cfg_index->m_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(cfg_index->m_name, name, len); + + cfg_index->m_id = it->m_id; + + cfg_index->m_space = m_space; + + cfg_index->m_page_no = it->m_page_no; + } + + return(DB_SUCCESS); +} + +/* Functor that is called for each physical page that is read from the +tablespace file. + + 1. Check each page for corruption. + + 2. Update the space id and LSN on every page + * For the header page + - Validate the flags + - Update the LSN + + 3. On Btree pages + * Set the index id + * Update the max trx id + * In a cluster index, update the system columns + * In a cluster index, update the BLOB ptr, set the space id + * Purge delete marked records, but only if they can be easily + removed from the page + * Keep a counter of number of rows, ie. non-delete-marked rows + * Keep a counter of number of delete marked rows + * Keep a counter of number of purge failure + * If a page is stamped with an index id that isn't in the .cfg file + we assume it is deleted and the page can be ignored. + + 4. Set the page state to dirty so that it will be written to disk. 
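+
+ In code form, the per-page dispatch implemented below is roughly
+ (simplified sketch; arguments and error handling elided):
+
+	switch (validate(offset, block)) {
+	case IMPORT_PAGE_STATUS_OK:
+		err = update_page(block, page_type);
+		// stamp the LSN and recalculate the checksum before write
+		buf_flush_init_for_writing(...);
+		break;
+	case IMPORT_PAGE_STATUS_ALL_ZERO:
+		break;			// leave an all-zero page as is
+	case IMPORT_PAGE_STATUS_CORRUPTED:
+		err = DB_CORRUPTION;
+		break;
+	}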
+*/ +class PageConverter : public AbstractCallback { +public: + /** Constructor + * @param cfg - config of table being imported. + * @param trx - transaction covering the import */ + PageConverter(row_import* cfg, trx_t* trx) UNIV_NOTHROW; + + virtual ~PageConverter() UNIV_NOTHROW + { + if (m_heap != 0) { + mem_heap_free(m_heap); + } + } + + /** + @retval the server space id of the tablespace being iterated over */ + virtual ulint get_space_id() const UNIV_NOTHROW + { + return(m_cfg->m_table->space); + } + + /** + Called for each block as it is read from the file. + @param offset - physical offset in the file + @param block - block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW; +private: + + /** Status returned by PageConverter::validate() */ + enum import_page_status_t { + IMPORT_PAGE_STATUS_OK, /*!< Page is OK */ + IMPORT_PAGE_STATUS_ALL_ZERO, /*!< Page is all zeros */ + IMPORT_PAGE_STATUS_CORRUPTED /*!< Page is corrupted */ + }; + + /** + Update the page, set the space id, max trx id and index id. + @param block - block read from file + @param page_type - type of the page + @retval DB_SUCCESS or error code */ + dberr_t update_page( + buf_block_t* block, + ulint& page_type) UNIV_NOTHROW; + +#if defined UNIV_DEBUG + /** + @return true error condition is enabled. */ + bool trigger_corruption() UNIV_NOTHROW + { + return(false); + } + #else +#define trigger_corruption() (false) +#endif /* UNIV_DEBUG */ + + /** + Update the space, index id, trx id. + @param block - block to convert + @return DB_SUCCESS or error code */ + dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW; + + /** Update the BLOB refrences and write UNDO log entries for + rows that can't be purged optimistically. + @param block - block to update + @retval DB_SUCCESS or error code */ + dberr_t update_records(buf_block_t* block) UNIV_NOTHROW; + + /** + Validate the page, check for corruption. + @param offset - physical offset within file. + @param page - page read from file. + @return 0 on success, 1 if all zero, 2 if corrupted */ + import_page_status_t validate( + os_offset_t offset, + buf_block_t* page) UNIV_NOTHROW; + + /** + Validate the space flags and update tablespace header page. + @param block - block read from file, not from the buffer pool. + @retval DB_SUCCESS or error code */ + dberr_t update_header(buf_block_t* block) UNIV_NOTHROW; + + /** + Adjust the BLOB reference for a single column that is externally stored + @param rec - record to update + @param offsets - column offsets for the record + @param i - column ordinal value + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_column( + rec_t* rec, + const ulint* offsets, + ulint i) UNIV_NOTHROW; + + /** + Adjusts the BLOB reference in the clustered index row for all + externally stored columns. + @param rec - record to update + @param offsets - column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_columns( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW; + + /** + In the clustered index, adjist the BLOB pointers as needed. + Also update the BLOB reference, write the new space id. 
+ @param rec - record to update + @param offsets - column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_ref( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW; + + /** + Purge delete-marked records, only if it is possible to do + so without re-organising the B+tree. + @param offsets - current row offsets. + @retval true if purged */ + bool purge(const ulint* offsets) UNIV_NOTHROW; + + /** + Adjust the BLOB references and sys fields for the current record. + @param index - the index being converted + @param rec - record to update + @param offsets - column offsets for the record + @param deleted - true if row is delete marked + @return DB_SUCCESS or error code. */ + dberr_t adjust_cluster_record( + const dict_index_t* index, + rec_t* rec, + const ulint* offsets, + bool deleted) UNIV_NOTHROW; + + /** + Find an index with the matching id. + @return row_index_t* instance or 0 */ + row_index_t* find_index(index_id_t id) UNIV_NOTHROW + { + row_index_t* index = &m_cfg->m_indexes[0]; + + for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) { + if (id == index->m_id) { + return(index); + } + } + + return(0); + + } +private: + /** Config for table that is being imported. */ + row_import* m_cfg; + + /** Current index whose pages are being imported */ + row_index_t* m_index; + + /** Current system LSN */ + lsn_t m_current_lsn; + + /** Alias for m_page_zip, only set for compressed pages. */ + page_zip_des_t* m_page_zip_ptr; + + /** Iterator over records in a block */ + RecIterator m_rec_iter; + + /** Record offset */ + ulint m_offsets_[REC_OFFS_NORMAL_SIZE]; + + /** Pointer to m_offsets_ */ + ulint* m_offsets; + + /** Memory heap for the record offsets */ + mem_heap_t* m_heap; + + /** Cluster index instance */ + dict_index_t* m_cluster_index; +}; + +/** +row_import destructor. */ +row_import::~row_import() UNIV_NOTHROW +{ + for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) { + delete [] m_indexes[i].m_name; + + if (m_indexes[i].m_fields == 0) { + continue; + } + + dict_field_t* fields = m_indexes[i].m_fields; + ulint n_fields = m_indexes[i].m_n_fields; + + for (ulint j = 0; j < n_fields; ++j) { + delete [] fields[j].name; + } + + delete [] fields; + } + + for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) { + delete [] m_col_names[i]; + } + + delete [] m_cols; + delete [] m_indexes; + delete [] m_col_names; + delete [] m_table_name; + delete [] m_hostname; +} + +/** +Find the index entry in in the indexes array. +@param name - index name +@return instance if found else 0. */ +row_index_t* +row_import::get_index( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_indexes; ++i) { + const char* index_name; + row_index_t* index = &m_indexes[i]; + + index_name = reinterpret_cast<const char*>(index->m_name); + + if (strcmp(index_name, name) == 0) { + + return(index); + } + } + + return(0); +} + +/** +Get the number of rows in the index. +@param name - index name +@return number of rows (doesn't include delete marked rows). */ +ulint +row_import::get_n_rows( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_rows); +} + +/** +Get the number of rows for which purge failed uding the convert phase. +@param name - index name +@return number of rows for which purge failed. 
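+A non-zero count is what makes requires_purge() return true and so
+triggers the Phase II IndexPurge pass.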
*/ +ulint +row_import::get_n_purge_failed( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_purge_failed); +} + +/** +Find the ordinal value of the column name in the cfg table columns. +@param name - of column to look for. +@return ULINT_UNDEFINED if not found. */ +ulint +row_import::find_col( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_cols; ++i) { + const char* col_name; + + col_name = reinterpret_cast<const char*>(m_col_names[i]); + + if (strcmp(col_name, name) == 0) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/** +Find the index field entry in in the cfg indexes fields. +@name - of the index to look for +@return instance if found else 0. */ +const dict_field_t* +row_import::find_field( + const row_index_t* cfg_index, + const char* name) const UNIV_NOTHROW +{ + const dict_field_t* field = cfg_index->m_fields; + + for (ulint i = 0; i < cfg_index->m_n_fields; ++i, ++field) { + const char* field_name; + + field_name = reinterpret_cast<const char*>(field->name); + + if (strcmp(field_name, name) == 0) { + return(field); + } + } + + return(0); +} + +/** +Check if the index schema that was read from the .cfg file matches the +in memory index definition. +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW +{ + row_index_t* cfg_index; + dberr_t err = DB_SUCCESS; + + cfg_index = get_index(index->name); + + if (cfg_index == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s not found in tablespace meta-data file.", + index->name); + + return(DB_ERROR); + } + + cfg_index->m_srv_index = index; + + const dict_field_t* field = index->fields; + + for (ulint i = 0; i < index->n_fields; ++i, ++field) { + + const dict_field_t* cfg_field; + + cfg_field = find_field(cfg_index, field->name); + + if (cfg_field == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s not found in tablespace " + "meta-data file.", + index->name, field->name); + + err = DB_ERROR; + } else { + + if (cfg_field->prefix_len != field->prefix_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s prefix len %lu " + "doesn't match meta-data file value " + "%lu", + index->name, field->name, + (ulong) field->prefix_len, + (ulong) cfg_field->prefix_len); + + err = DB_ERROR; + } + + if (cfg_field->fixed_len != field->fixed_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s fixed len %lu " + "doesn't match meta-data file value " + "%lu", + index->name, field->name, + (ulong) field->fixed_len, + (ulong) cfg_field->fixed_len); + + err = DB_ERROR; + } + } + } + + return(err); +} + +/** +Check if the table schema that was read from the .cfg file matches the +in memory table definition. +@param thd - MySQL session variable +@return DB_SUCCESS or error code. 
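+Every column is checked and each mismatch is reported, so one run
+surfaces the complete list of differences rather than only the first one.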
*/
+dberr_t
+row_import::match_table_columns(
+	THD*			thd) UNIV_NOTHROW
+{
+	dberr_t			err = DB_SUCCESS;
+	const dict_col_t*	col = m_table->cols;
+
+	for (ulint i = 0; i < m_table->n_cols; ++i, ++col) {
+
+		const char*	col_name;
+		ulint		cfg_col_index;
+
+		col_name = dict_table_get_col_name(
+			m_table, dict_col_get_no(col));
+
+		cfg_col_index = find_col(col_name);
+
+		if (cfg_col_index == ULINT_UNDEFINED) {
+
+			ib_errf(thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLE_SCHEMA_MISMATCH,
+				"Column %s not found in tablespace.",
+				col_name);
+
+			err = DB_ERROR;
+		} else if (cfg_col_index != col->ind) {
+
+			ib_errf(thd, IB_LOG_LEVEL_ERROR,
+				ER_TABLE_SCHEMA_MISMATCH,
+				"Column %s ordinal value mismatch, it's at "
+				"%lu in the table and %lu in the tablespace "
+				"meta-data file",
+				col_name,
+				(ulong) col->ind, (ulong) cfg_col_index);
+
+			err = DB_ERROR;
+		} else {
+			const dict_col_t*	cfg_col;
+
+			cfg_col = &m_cols[cfg_col_index];
+			ut_a(cfg_col->ind == cfg_col_index);
+
+			if (cfg_col->prtype != col->prtype) {
+				ib_errf(thd,
+					IB_LOG_LEVEL_ERROR,
+					ER_TABLE_SCHEMA_MISMATCH,
+					"Column %s precise type mismatch.",
+					col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->mtype != col->mtype) {
+				ib_errf(thd,
+					IB_LOG_LEVEL_ERROR,
+					ER_TABLE_SCHEMA_MISMATCH,
+					"Column %s main type mismatch.",
+					col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->len != col->len) {
+				ib_errf(thd,
+					IB_LOG_LEVEL_ERROR,
+					ER_TABLE_SCHEMA_MISMATCH,
+					"Column %s length mismatch.",
+					col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->mbminmaxlen != col->mbminmaxlen) {
+				ib_errf(thd,
+					IB_LOG_LEVEL_ERROR,
+					ER_TABLE_SCHEMA_MISMATCH,
+					"Column %s multi-byte len mismatch.",
+					col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->ind != col->ind) {
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->ord_part != col->ord_part) {
+				ib_errf(thd,
+					IB_LOG_LEVEL_ERROR,
+					ER_TABLE_SCHEMA_MISMATCH,
+					"Column %s ordering mismatch.",
+					col_name);
+				err = DB_ERROR;
+			}
+
+			if (cfg_col->max_prefix != col->max_prefix) {
+				ib_errf(thd,
+					IB_LOG_LEVEL_ERROR,
+					ER_TABLE_SCHEMA_MISMATCH,
+					"Column %s max prefix mismatch.",
+					col_name);
+				err = DB_ERROR;
+			}
+		}
+	}
+
+	return(err);
+}
+
+/**
+Check if the table (and index) schema that was read from the .cfg file
+matches the in memory table definition.
+@param thd - MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_schema(
+	THD*		thd) UNIV_NOTHROW
+{
+	/* Do some simple checks. */
+
+	if (m_flags != m_table->flags) {
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+			"Table flags don't match, server table has 0x%lx "
+			"and the meta-data file has 0x%lx",
+			(ulong) m_table->flags, (ulong) m_flags);
+
+		return(DB_ERROR);
+	} else if (m_table->n_cols != m_n_cols) {
+		ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+			"Number of columns don't match, table has %lu "
+			"columns but the tablespace meta-data file has "
+			"%lu columns",
+			(ulong) m_table->n_cols, (ulong) m_n_cols);
+
+		return(DB_ERROR);
+	} else if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+		/* If the number of indexes doesn't match then it is better
+		to abort the IMPORT. It is easy for the user to create a
+		table matching the IMPORT definition.
*/ + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Number of indexes don't match, table has %lu " + "indexes but the tablespace meta-data file has " + "%lu indexes", + (ulong) UT_LIST_GET_LEN(m_table->indexes), + (ulong) m_n_indexes); + + return(DB_ERROR); + } + + dberr_t err = match_table_columns(thd); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Check if the index definitions match. */ + + const dict_index_t* index; + + for (index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + dberr_t index_err; + + index_err = match_index_columns(thd, index); + + if (index_err != DB_SUCCESS) { + err = index_err; + } + } + + return(err); +} + +/** +Set the index root <space, pageno>, using index name. */ +void +row_import::set_root_by_name() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) { + dict_index_t* index; + + const char* index_name; + + index_name = reinterpret_cast<const char*>(cfg_index->m_name); + + index = dict_table_get_index_on_name(m_table, index_name); + + /* We've already checked that it exists. */ + ut_a(index != 0); + + /* Set the root page number and space id. */ + index->space = m_table->space; + index->page = cfg_index->m_page_no; + } +} + +/** +Set the index root <space, pageno>, using a heuristic. +@return DB_SUCCESS or error code */ +dberr_t +row_import::set_root_by_heuristic() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + ut_a(m_n_indexes > 0); + + // TODO: For now use brute force, based on ordinality + + if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), m_table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_WARN, + "Table %s should have %lu indexes but the tablespace " + "has %lu indexes", + table_name, + UT_LIST_GET_LEN(m_table->indexes), + m_n_indexes); + } + + dict_mutex_enter_for_mysql(); + + ulint i = 0; + dberr_t err = DB_SUCCESS; + + for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + index->type |= DICT_CORRUPT; + ib_logf(IB_LOG_LEVEL_WARN, + "Skipping FTS index: %s", index->name); + } else if (i < m_n_indexes) { + + delete [] cfg_index[i].m_name; + + ulint len = strlen(index->name) + 1; + + cfg_index[i].m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_14", + delete[] cfg_index[i].m_name; + cfg_index[i].m_name = 0;); + + if (cfg_index[i].m_name == 0) { + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(cfg_index[i].m_name, index->name, len); + + cfg_index[i].m_srv_index = index; + + index->space = m_table->space; + index->page = cfg_index[i].m_page_no; + + ++i; + } + } + + dict_mutex_exit_for_mysql(); + + return(err); +} + +/** +Purge delete marked records. +@return DB_SUCCESS or error code. */ +dberr_t +IndexPurge::garbage_collect() UNIV_NOTHROW +{ + dberr_t err; + ibool comp = dict_table_is_comp(m_index->table); + + /* Open the persistent cursor and start the mini-transaction. */ + + open(); + + while ((err = next()) == DB_SUCCESS) { + + rec_t* rec = btr_pcur_get_rec(&m_pcur); + ibool deleted = rec_get_deleted_flag(rec, comp); + + if (!deleted) { + ++m_n_rows; + } else { + purge(); + } + } + + /* Close the persistent cursor and commit the mini-transaction. */ + + close(); + + return(err == DB_END_OF_INDEX ? 
DB_SUCCESS : err); +} + +/** +Begin import, position the cursor on the first record. */ +void +IndexPurge::open() UNIV_NOTHROW +{ + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_open_at_index_side( + true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr); +} + +/** +Close the persistent curosr and commit the mini-transaction. */ +void +IndexPurge::close() UNIV_NOTHROW +{ + btr_pcur_close(&m_pcur); + mtr_commit(&m_mtr); +} + +/** +Position the cursor on the next record. +@return DB_SUCCESS or error code */ +dberr_t +IndexPurge::next() UNIV_NOTHROW +{ + btr_pcur_move_to_next_on_page(&m_pcur); + + /* When switching pages, commit the mini-transaction + in order to release the latch on the old page. */ + + if (!btr_pcur_is_after_last_on_page(&m_pcur)) { + return(DB_SUCCESS); + } else if (trx_is_interrupted(m_trx)) { + /* Check after every page because the check + is expensive. */ + return(DB_INTERRUPTED); + } + + btr_pcur_store_position(&m_pcur, &m_mtr); + + mtr_commit(&m_mtr); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr); + + if (!btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr)) { + + return(DB_END_OF_INDEX); + } + + return(DB_SUCCESS); +} + +/** +Store the persistent cursor position and reopen the +B-tree cursor in BTR_MODIFY_TREE mode, because the +tree structure may be changed during a pessimistic delete. */ +void +IndexPurge::purge_pessimistic_delete() UNIV_NOTHROW +{ + dberr_t err; + + btr_pcur_restore_position(BTR_MODIFY_TREE, &m_pcur, &m_mtr); + + ut_ad(rec_get_deleted_flag( + btr_pcur_get_rec(&m_pcur), + dict_table_is_comp(m_index->table))); + + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, RB_NONE, &m_mtr); + + ut_a(err == DB_SUCCESS); + + /* Reopen the B-tree cursor in BTR_MODIFY_LEAF mode */ + mtr_commit(&m_mtr); +} + +/** +Purge delete-marked records. */ +void +IndexPurge::purge() UNIV_NOTHROW +{ + btr_pcur_store_position(&m_pcur, &m_mtr); + + purge_pessimistic_delete(); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr); +} + +/** +Constructor +* @param cfg - config of table being imported. 
+* @param trx - transaction covering the import */ +PageConverter::PageConverter( + row_import* cfg, + trx_t* trx) + : + AbstractCallback(trx), + m_cfg(cfg), + m_page_zip_ptr(0), + m_heap(0) UNIV_NOTHROW +{ + m_index = m_cfg->m_indexes; + + m_current_lsn = log_get_lsn(); + ut_a(m_current_lsn > 0); + + m_offsets = m_offsets_; + rec_offs_init(m_offsets_); + + m_cluster_index = dict_table_get_first_index(m_cfg->m_table); +} + +/** +Adjust the BLOB reference for a single column that is externally stored +@param rec - record to update +@param offsets - column offsets for the record +@param i - column ordinal value +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_column( + rec_t* rec, + const ulint* offsets, + ulint i) UNIV_NOTHROW +{ + ulint len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &len); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_2", + len = BTR_EXTERN_FIELD_REF_SIZE - 1;); + + if (len < BTR_EXTERN_FIELD_REF_SIZE) { + + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), + m_cluster_index->name, TRUE); + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Externally stored column(%lu) has a reference " + "length of %lu in the cluster index %s", + (ulong) i, (ulong) len, index_name); + + return(DB_CORRUPTION); + } + + field += BTR_EXTERN_SPACE_ID - BTR_EXTERN_FIELD_REF_SIZE + len; + + if (is_compressed_table()) { + mach_write_to_4(field, get_space_id()); + + page_zip_write_blob_ptr( + m_page_zip_ptr, rec, m_cluster_index, offsets, i, 0); + } else { + mlog_write_ulint(field, get_space_id(), MLOG_4BYTES, 0); + } + + return(DB_SUCCESS); +} + +/** +Adjusts the BLOB reference in the clustered index row for all externally +stored columns. +@param rec - record to update +@param offsets - column offsets for the record +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_columns( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW +{ + ut_ad(rec_offs_any_extern(offsets)); + + /* Adjust the space_id in the BLOB pointers. */ + + for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) { + + /* Only if the column is stored "externally". */ + + if (rec_offs_nth_extern(offsets, i)) { + dberr_t err; + + err = adjust_cluster_index_blob_column(rec, offsets, i); + + if (err != DB_SUCCESS) { + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/** +In the clustered index, adjust BLOB pointers as needed. Also update the +BLOB reference, write the new space id. +@param rec - record to update +@param offsets - column offsets for the record +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_ref( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW +{ + if (rec_offs_any_extern(offsets)) { + dberr_t err; + + err = adjust_cluster_index_blob_columns(rec, offsets); + + if (err != DB_SUCCESS) { + return(err); + } + } + + return(DB_SUCCESS); +} + +/** +Purge delete-marked records, only if it is possible to do so without +re-organising the B+tree. +@param offsets - current row offsets. +@return true if purge succeeded */ +bool +PageConverter::purge(const ulint* offsets) UNIV_NOTHROW +{ + const dict_index_t* index = m_index->m_srv_index; + + /* We can't have a page that is empty and not root. 
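+RecIterator::remove() enforces this by refusing to delete the last user
+record of a page; such rows are counted in m_n_purge_failed and left to
+the pessimistic Phase II purge (IndexPurge), which is allowed to
+reorganise the tree.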
*/ + if (m_rec_iter.remove(index, m_page_zip_ptr, m_offsets)) { + + ++m_index->m_stats.m_n_purged; + + return(true); + } else { + ++m_index->m_stats.m_n_purge_failed; + } + + return(false); +} + +/** +Adjust the BLOB references and sys fields for the current record. +@param rec - record to update +@param offsets - column offsets for the record +@param deleted - true if row is delete marked +@return DB_SUCCESS or error code. */ +dberr_t +PageConverter::adjust_cluster_record( + const dict_index_t* index, + rec_t* rec, + const ulint* offsets, + bool deleted) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) { + + /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields + are only written in conjunction with other changes to the + record. */ + + row_upd_rec_sys_fields( + rec, m_page_zip_ptr, m_cluster_index, m_offsets, + m_trx, 0); + } + + return(err); +} + +/** +Update the BLOB refrences and write UNDO log entries for +rows that can't be purged optimistically. +@param block - block to update +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_records( + buf_block_t* block) UNIV_NOTHROW +{ + ibool comp = dict_table_is_comp(m_cfg->m_table); + bool clust_index = m_index->m_srv_index == m_cluster_index; + + /* This will also position the cursor on the first user record. */ + + m_rec_iter.open(block); + + while (!m_rec_iter.end()) { + + rec_t* rec = m_rec_iter.current(); + + /* FIXME: Move out of the loop */ + + if (rec_get_status(rec) == REC_STATUS_NODE_PTR) { + break; + } + + ibool deleted = rec_get_deleted_flag(rec, comp); + + /* For the clustered index we have to adjust the BLOB + reference and the system fields irrespective of the + delete marked flag. The adjustment of delete marked + cluster records is required for purge to work later. */ + + if (deleted || clust_index) { + m_offsets = rec_get_offsets( + rec, m_index->m_srv_index, m_offsets, + ULINT_UNDEFINED, &m_heap); + } + + if (clust_index) { + + dberr_t err = adjust_cluster_record( + m_index->m_srv_index, rec, m_offsets, + deleted); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* If it is a delete marked record then try an + optimistic delete. */ + + if (deleted) { + /* A successful purge will move the cursor to the + next record. */ + + if (!purge(m_offsets)) { + m_rec_iter.next(); + } + + ++m_index->m_stats.m_n_deleted; + } else { + ++m_index->m_stats.m_n_rows; + m_rec_iter.next(); + } + } + + return(DB_SUCCESS); +} + +/** +Update the space, index id, trx id. +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::update_index_page( + buf_block_t* block) UNIV_NOTHROW +{ + index_id_t id; + buf_frame_t* page = block->frame; + + if (is_free(buf_block_get_page_no(block))) { + return(DB_SUCCESS); + } else if ((id = btr_page_get_index_id(page)) != m_index->m_id) { + + row_index_t* index = find_index(id); + + if (index == 0) { + m_index = 0; + return(DB_CORRUPTION); + } + + /* Update current index */ + m_index = index; + } + + /* If the .cfg file is missing and there is an index mismatch + then ignore the error. */ + if (m_cfg->m_missing && (m_index == 0 || m_index->m_srv_index == 0)) { + return(DB_SUCCESS); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!is_compressed_table() + || page_zip_validate(m_page_zip_ptr, page, m_index->m_srv_index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* This has to be written to uncompressed index header. Set it to + the current index id. 
*/ + btr_page_set_index_id( + page, m_page_zip_ptr, m_index->m_srv_index->id, 0); + + page_set_max_trx_id(block, m_page_zip_ptr, m_trx->id, 0); + + if (page_is_empty(block->frame)) { + + /* Only a root page can be empty. */ + if (!is_root_page(block->frame)) { + // TODO: We should relax this and skip secondary + // indexes. Mark them as corrupt because they can + // always be rebuilt. + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); + } + + return(update_records(block)); +} + +/** +Validate the space flags and update tablespace header page. +@param block - block read from file, not from the buffer pool. +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_header( + buf_block_t* block) UNIV_NOTHROW +{ + /* Check for valid header */ + switch(fsp_header_get_space_id(get_frame(block))) { + case 0: + return(DB_CORRUPTION); + case ULINT_UNDEFINED: + ib_logf(IB_LOG_LEVEL_WARN, + "Space id check in the header failed " + "- ignored"); + } + + ulint space_flags = fsp_header_get_flags(get_frame(block)); + + if (!fsp_flags_is_valid(space_flags)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unsupported tablespace format %lu", + (ulong) space_flags); + + return(DB_UNSUPPORTED); + } + + mach_write_to_8( + get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN, m_current_lsn); + + /* Write space_id to the tablespace header, page 0. */ + mach_write_to_4( + get_frame(block) + FSP_HEADER_OFFSET + FSP_SPACE_ID, + get_space_id()); + + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + get_space_id()); + + return(DB_SUCCESS); +} + +/** +Update the page, set the space id, max trx id and index id. +@param block - block read from file +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_page( + buf_block_t* block, + ulint& page_type) UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + + switch (page_type = fil_page_get_type(get_frame(block))) { + case FIL_PAGE_TYPE_FSP_HDR: + /* Work directly on the uncompressed page headers. */ + ut_a(buf_block_get_page_no(block) == 0); + return(update_header(block)); + + case FIL_PAGE_INDEX: + /* We need to decompress the contents into block->frame + before we can do any thing with Btree pages. */ + + if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) { + return(DB_CORRUPTION); + } + + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + /* Only update the Btree nodes. */ + return(update_index_page(block)); + + case FIL_PAGE_TYPE_SYS: + /* This is page 0 in the system tablespace. */ + return(DB_CORRUPTION); + + case FIL_PAGE_TYPE_XDES: + err = set_current_xdes( + buf_block_get_page_no(block), get_frame(block)); + case FIL_PAGE_INODE: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + + /* Work directly on the uncompressed page headers. */ + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + return(err); + } + + ib_logf(IB_LOG_LEVEL_WARN, "Unknown page type (%lu)", page_type); + + return(DB_CORRUPTION); +} + +/** +Validate the page +@param offset - physical offset within file. +@param page - page read from file. 
+@return status */ +PageConverter::import_page_status_t +PageConverter::validate( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + buf_frame_t* page = get_frame(block); + + /* Check that the page number corresponds to the offset in + the file. Flag as corrupt if it doesn't. Disable the check + for LSN in buf_page_is_corrupted() */ + + if (buf_page_is_corrupted(false, page, get_zip_size()) + || (page_get_page_no(page) != offset / m_page_size + && page_get_page_no(page) != 0)) { + + return(IMPORT_PAGE_STATUS_CORRUPTED); + + } else if (offset > 0 && page_get_page_no(page) == 0) { + const byte* b = page; + const byte* e = b + m_page_size; + + /* If the page number is zero and offset > 0 then + the entire page MUST consist of zeroes. If not then + we flag it as corrupt. */ + + while (b != e) { + + if (*b++ && !trigger_corruption()) { + return(IMPORT_PAGE_STATUS_CORRUPTED); + } + } + + /* The page is all zero: do nothing. */ + return(IMPORT_PAGE_STATUS_ALL_ZERO); + } + + return(IMPORT_PAGE_STATUS_OK); +} + +/** +Called for every page in the tablespace. If the page was not +updated then its state must be set to BUF_PAGE_NOT_USED. +@param offset - physical offset within the file +@param block - block read from file, note it is not from the buffer pool +@retval DB_SUCCESS or error code. */ +dberr_t +PageConverter::operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + ulint page_type; + dberr_t err = DB_SUCCESS; + + if ((err = periodic_check()) != DB_SUCCESS) { + return(err); + } + + if (is_compressed_table()) { + m_page_zip_ptr = &block->page.zip; + } else { + ut_ad(m_page_zip_ptr == 0); + } + + switch(validate(offset, block)) { + case IMPORT_PAGE_STATUS_OK: + + /* We have to decompress the compressed pages before + we can work on them */ + + if ((err = update_page(block, page_type)) != DB_SUCCESS) { + return(err); + } + + /* Note: For compressed pages this function will write to the + zip descriptor and for uncompressed pages it will write to + page (ie. the block->frame). Therefore the caller should write + out the descriptor contents and not block->frame for compressed + pages. */ + + if (!is_compressed_table() || page_type == FIL_PAGE_INDEX) { + + buf_flush_init_for_writing( + !is_compressed_table() + ? block->frame : block->page.zip.data, + !is_compressed_table() ? 0 : m_page_zip_ptr, + m_current_lsn); + } else { + /* Calculate and update the checksum of non-btree + pages for compressed tables explicitly here. */ + + buf_flush_update_zip_checksum( + get_frame(block), get_zip_size(), + m_current_lsn); + } + + break; + + case IMPORT_PAGE_STATUS_ALL_ZERO: + /* The page is all zero: leave it as is. */ + break; + + case IMPORT_PAGE_STATUS_CORRUPTED: + + ib_logf(IB_LOG_LEVEL_WARN, + "%s: Page %lu at offset " UINT64PF " looks corrupted.", + m_filepath, (ulong) (offset / m_page_size), offset); + + return(DB_CORRUPTION); + } + + return(err); +} + +/*****************************************************************//** +Clean up after import tablespace failure, this function will acquire +the dictionary latches on behalf of the transaction if the transaction +hasn't already acquired them. 
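+
+   (The all-zero test in PageConverter::validate() above amounts to the
+   following standalone check; a sketch with our own names, not code
+   from this patch:
+
+	#include <algorithm>
+	#include <cstddef>
+
+	// True if the page image contains only zero bytes, i.e. the page
+	// was allocated in the file but never written.
+	static bool page_is_all_zero(
+		const unsigned char*	page,
+		std::size_t		page_size)
+	{
+		return(std::all_of(page, page + page_size,
+				   [](unsigned char b) { return b == 0; }));
+	}
+
+   Such pages are passed through unchanged and reported as
+   IMPORT_PAGE_STATUS_ALL_ZERO rather than corrupted.)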
*/ +static __attribute__((nonnull)) +void +row_import_discard_changes( +/*=======================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + dict_table_t* table = prebuilt->table; + + ut_a(err != DB_SUCCESS); + + prebuilt->trx->error_info = NULL; + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Discarding tablespace of table %s: %s", + table_name, ut_strerr(err)); + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + ut_a(trx->dict_operation_lock_mode == 0); + row_mysql_lock_data_dictionary(trx); + } + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Since we update the index root page numbers on disk after + we've done a successful import. The table will not be loadable. + However, we need to ensure that the in memory root page numbers + are reset to "NULL". */ + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + index->page = FIL_NULL; + index->space = FIL_NULL; + } + + table->ibd_file_missing = TRUE; + + fil_close_tablespace(trx, table->space); +} + +/*****************************************************************//** +Clean up after import tablespace. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_cleanup( +/*===============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + ut_a(prebuilt->trx != trx); + + if (err != DB_SUCCESS) { + row_import_discard_changes(prebuilt, trx, err); + } + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE();); + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + prebuilt->trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE();); + + log_make_checkpoint_at(LSN_MAX, TRUE); + + return(err); +} + +/*****************************************************************//** +Report error during tablespace import. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_error( +/*=============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + if (!trx_is_interrupted(trx)) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name, FALSE); + + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_INNODB_IMPORT_ERROR, + table_name, (ulong) err, ut_strerr(err)); + } + + return(row_import_cleanup(prebuilt, trx, err)); +} + +/*****************************************************************//** +Adjust the root page index node and leaf node segment headers, update +with the new space id. For all the table's secondary indexes. 
+@return error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_adjust_root_pages_of_secondary_indexes( +/*==============================================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + trx_t* trx, /*!< in: transaction used for + the import */ + dict_table_t* table, /*!< in: table the indexes + belong to */ + const row_import& cfg) /*!< Import context */ +{ + dict_index_t* index; + ulint n_rows_in_table; + dberr_t err = DB_SUCCESS; + + /* Skip the clustered index. */ + index = dict_table_get_first_index(table); + + n_rows_in_table = cfg.get_n_rows(index->name); + + DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure", + n_rows_in_table++;); + + /* Adjust the root pages of the secondary indexes only. */ + while ((index = dict_table_get_next_index(index)) != NULL) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), index->name, TRUE); + + ut_a(!dict_index_is_clust(index)); + + if (!(index->type & DICT_CORRUPT) + && index->space != FIL_NULL + && index->page != FIL_NULL) { + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + } else { + ib_logf(IB_LOG_LEVEL_WARN, + "Skip adjustment of root pages for " + "index %s.", index->name); + + err = DB_CORRUPTION; + } + + if (err != DB_SUCCESS) { + + if (index->type & DICT_CLUSTERED) { + break; + } + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' not found or corrupt, " + "you should recreate this index.", + index_name); + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + index->type |= DICT_CORRUPT; + continue; + } + + /* If we failed to purge any records in the index then + do it the hard way. + + TODO: We can do this in the first pass by generating UNDO log + records for the failed rows. */ + + if (!cfg.requires_purge(index->name)) { + continue; + } + + IndexPurge purge(trx, index); + + trx->op_info = "secondary: purge delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + break; + } else if (purge.get_n_rows() != n_rows_in_table) { + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' contains %lu entries, " + "should be %lu, you should recreate " + "this index.", index_name, + (ulong) purge.get_n_rows(), + (ulong) n_rows_in_table); + + index->type |= DICT_CORRUPT; + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + } + } + + return(err); +} + +/*****************************************************************//** +Ensure that dict_sys->row_id exceeds SELECT MAX(DB_ROW_ID). 
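+(The invariant restored below is, in essence, the following; a sketch
+with our own names, not code from this patch:
+
+	typedef unsigned long long row_id_type;
+
+	// After import, the next row id handed out by the server must be
+	// strictly greater than any DB_ROW_ID stored in the imported table.
+	static void bump_next_row_id(
+		row_id_type&	next_row_id,	// in/out: global counter
+		row_id_type	imported_max)	// in: max DB_ROW_ID seen
+	{
+		if (imported_max >= next_row_id) {
+			next_row_id = imported_max + 1;
+		}
+	}
+
+The real code does this under dict_sys->mutex and makes the new counter
+durable with dict_hdr_flush_row_id().)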
+@return error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_set_sys_max_row_id( +/*==========================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + const dict_table_t* table) /*!< in: table to import */ +{ + dberr_t err; + const rec_t* rec; + mtr_t mtr; + btr_pcur_t pcur; + row_id_t row_id = 0; + dict_index_t* index; + + index = dict_table_get_first_index(table); + ut_a(dict_index_is_clust(index)); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + btr_pcur_open_at_index_side( + false, // High end + index, + BTR_SEARCH_LEAF, + &pcur, + true, // Init cursor + 0, // Leaf level + &mtr); + + btr_pcur_move_to_prev_on_page(&pcur); + rec = btr_pcur_get_rec(&pcur); + + /* Check for empty table. */ + if (!page_rec_is_infimum(rec)) { + ulint len; + const byte* field; + mem_heap_t* heap = NULL; + ulint offsets_[1 + REC_OFFS_HEADER_SIZE]; + ulint* offsets; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap); + + field = rec_get_nth_field( + rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), + &len); + + if (len == DATA_ROW_ID_LEN) { + row_id = mach_read_from_6(field); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + + if (heap != NULL) { + mem_heap_free(heap); + } + } else { + /* The table is empty. */ + err = DB_SUCCESS; + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + DBUG_EXECUTE_IF("ib_import_set_max_rowid_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), index->name, TRUE); + + ib_errf(prebuilt->trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' corruption detected, invalid DB_ROW_ID " + "in index.", index_name); + + return(err); + + } else if (row_id > 0) { + + /* Update the system row id if the imported index row id is + greater than the max system row id. */ + + mutex_enter(&dict_sys->mutex); + + if (row_id >= dict_sys->row_id) { + dict_sys->row_id = row_id + 1; + dict_hdr_flush_row_id(); + } + + mutex_exit(&dict_sys->mutex); + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the a string from the meta data file. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_import_cfg_read_string( +/*=======================*/ + FILE* file, /*!< in/out: File to read from */ + byte* ptr, /*!< out: string to read */ + ulint max_len) /*!< in: maximum length of the output + buffer in bytes */ +{ + DBUG_EXECUTE_IF("ib_import_string_read_error", + errno = EINVAL; return(DB_IO_ERROR);); + + ulint len = 0; + + while (!feof(file)) { + int ch = fgetc(file); + + if (ch == EOF) { + break; + } else if (ch != 0) { + if (len < max_len) { + ptr[len++] = ch; + } else { + break; + } + /* max_len includes the NUL byte */ + } else if (len != max_len - 1) { + break; + } else { + ptr[len] = 0; + return(DB_SUCCESS); + } + } + + errno = EINVAL; + + return(DB_IO_ERROR); +} + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. 
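+(Strings in the .cfg file, as parsed by row_import_cfg_read_string()
+above, are stored as a 32-bit length that counts the terminating NUL,
+followed by exactly that many bytes.  A standalone equivalent of the
+reader, using only stdio; a sketch with our own names, not code from
+this patch:
+
+	#include <cstddef>
+	#include <cstdio>
+	#include <cstring>
+
+	// Read a len byte string whose last byte must be the only NUL.
+	// Returns true on success, false on short read or format error.
+	static bool read_cfg_string(std::FILE* f, char* buf, std::size_t len)
+	{
+		if (len == 0 || std::fread(buf, 1, len, f) != len) {
+			return(false);
+		}
+
+		return(std::memchr(buf, '\0', len) == buf + len - 1);
+	}
+)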
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_cfg_read_index_fields( +/*=============================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_index_t* index, /*!< Index being read in */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t) * 3]; + ulint n_fields = index->m_n_fields; + + index->m_fields = new(std::nothrow) dict_field_t[n_fields]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_4", + delete [] index->m_fields; index->m_fields = 0;); + + if (index->m_fields == 0) { + return(DB_OUT_OF_MEMORY); + } + + dict_field_t* field = index->m_fields; + + memset(field, 0x0, sizeof(*field) * n_fields); + + for (ulint i = 0; i < n_fields; ++i, ++field) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_1", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading index fields."); + + return(DB_IO_ERROR); + } + + field->prefix_len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + field->fixed_len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* Include the NUL byte in the length. */ + ulint len = mach_read_from_4(ptr); + + byte* name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_5", delete [] name; name = 0;); + + if (name == 0) { + return(DB_OUT_OF_MEMORY); + } + + field->name = reinterpret_cast<const char*>(name); + + dberr_t err = row_import_cfg_read_string(file, name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the index names and root page numbers of the indexes and set the values. +Row format [root_page_no, len of str, str ... ] +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_index_data( +/*=======================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte* ptr; + row_index_t* cfg_index; + byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9]; + + /* FIXME: What is the max value? */ + ut_a(cfg->m_n_indexes > 0); + ut_a(cfg->m_n_indexes < 1024); + + cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_6", + delete [] cfg->m_indexes; cfg->m_indexes = 0;); + + if (cfg->m_indexes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + cfg_index = cfg->m_indexes; + + for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) { + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_2", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the index data. 
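+	   The fixed part of each per-index row is 44 bytes, read below in
+	   a single fread() and decoded with mach_read_from_8()/_4() in
+	   this order: index id (8), space id (4), root page no (4),
+	   type (4), trx id offset (4), n_user_defined_cols (4),
+	   n_uniq (4), n_nullable (4), n_fields (4), then the name
+	   length (4, counting the NUL) and finally the name bytes.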
*/ + size_t n_bytes = fread(row, 1, sizeof(row), file); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error", + (void) fseek(file, 0L, SEEK_END);); + + if (n_bytes != sizeof(row)) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), + "while reading index meta-data, expected " + "to read %lu bytes but read only %lu " + "bytes", + (ulong) sizeof(row), (ulong) n_bytes); + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), msg); + + ib_logf(IB_LOG_LEVEL_ERROR, "IO Error: %s", msg); + + return(DB_IO_ERROR); + } + + ptr = row; + + cfg_index->m_id = mach_read_from_8(ptr); + ptr += sizeof(index_id_t); + + cfg_index->m_space = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_page_no = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_type = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_trx_id_offset = mach_read_from_4(ptr); + if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) { + ut_ad(0); + /* Overflow. Pretend that the clustered index + has a variable-length PRIMARY KEY. */ + cfg_index->m_trx_id_offset = 0; + } + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_uniq = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_nullable = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_fields = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* The NUL byte is included in the name length. */ + ulint len = mach_read_from_4(ptr); + + if (len > OS_FILE_MAX_PATH) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Index name length (%lu) is too long, " + "the meta-data is corrupt", len); + + return(DB_CORRUPTION); + } + + cfg_index->m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_7", + delete [] cfg_index->m_name; + cfg_index->m_name = 0;); + + if (cfg_index->m_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string(file, cfg_index->m_name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing index name."); + + return(err); + } + + err = row_import_cfg_read_index_fields( + file, thd, cfg_index, cfg); + + if (err != DB_SUCCESS) { + return(err); + } + + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the index root page number for v1 format. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_import_read_indexes( +/*====================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_3", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the number of indexes. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading number of indexes."); + + return(DB_IO_ERROR); + } + + cfg->m_n_indexes = mach_read_from_4(row); + + if (cfg->m_n_indexes == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is 0"); + + return(DB_CORRUPTION); + + } else if (cfg->m_n_indexes > 1024) { + // FIXME: What is the upper limit? 
*/ + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is too high: %lu", + (ulong) cfg->m_n_indexes); + cfg->m_n_indexes = 0; + + return(DB_CORRUPTION); + } + + return(row_import_read_index_data(file, thd, cfg)); +} + +/*********************************************************************//** +Read the meta data (table columns) config file. Deserialise the contents of +dict_col_t structure, along with the column name. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_columns( +/*====================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 8]; + + /* FIXME: What should the upper limit be? */ + ut_a(cfg->m_n_cols > 0); + ut_a(cfg->m_n_cols < 1024); + + cfg->m_cols = new(std::nothrow) dict_col_t[cfg->m_n_cols]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_8", + delete [] cfg->m_cols; cfg->m_cols = 0;); + + if (cfg->m_cols == 0) { + return(DB_OUT_OF_MEMORY); + } + + cfg->m_col_names = new(std::nothrow) byte* [cfg->m_n_cols]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_9", + delete [] cfg->m_col_names; cfg->m_col_names = 0;); + + if (cfg->m_col_names == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_cols, 0x0, sizeof(cfg->m_cols) * cfg->m_n_cols); + memset(cfg->m_col_names, 0x0, sizeof(cfg->m_col_names) * cfg->m_n_cols); + + col = cfg->m_cols; + + for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_4", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading table column meta-data."); + + return(DB_IO_ERROR); + } + + col->prtype = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mtype = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mbminmaxlen = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->ind = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->ord_part = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->max_prefix = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* Read in the column name as [len, byte array]. The len + includes the NUL byte. */ + + ulint len = mach_read_from_4(ptr); + + /* FIXME: What is the maximum column name length? */ + if (len == 0 || len > 128) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_IO_READ_ERROR, + "Column name length %lu, is invalid", + (ulong) len); + + return(DB_CORRUPTION); + } + + cfg->m_col_names[i] = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_10", + delete [] cfg->m_col_names[i]; + cfg->m_col_names[i] = 0;); + + if (cfg->m_col_names[i] == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string( + file, cfg->m_col_names[i], len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table column name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. 
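+
+(For reference, the v1 stream decoded below is laid out as follows,
+reconstructed from the reads in this function and its helpers; all
+integers are big-endian:
+
+	version			4 bytes (already consumed by the caller)
+	hostname length		4 bytes, then the NUL-terminated hostname
+	table name length	4 bytes, then the NUL-terminated table name
+	autoinc value		8 bytes
+	page size		4 bytes
+	table flags		4 bytes
+	number of columns	4 bytes
+	per-column rows		see row_import_read_columns()
+	number of indexes	4 bytes
+	per-index rows		see row_import_read_index_data()
+)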
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_v1( +/*===============*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< out: meta data */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_5", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the hostname where the tablespace was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data export hostname length."); + + return(DB_IO_ERROR); + } + + ulint len = mach_read_from_4(value); + + /* NUL byte is part of name length. */ + cfg->m_hostname = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_1", + delete [] cfg->m_hostname; cfg->m_hostname = 0;); + + if (cfg->m_hostname == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing export hostname."); + + return(err); + } + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_6", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the table name of tablespace that was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data table name length."); + + return(DB_IO_ERROR); + } + + len = mach_read_from_4(value); + + /* NUL byte is part of name length. */ + cfg->m_table_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_2", + delete [] cfg->m_table_name; cfg->m_table_name = 0;); + + if (cfg->m_table_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + err = row_import_cfg_read_string(file, cfg->m_table_name, len); + + if (err != DB_SUCCESS) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table name."); + + return(err); + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Importing tablespace for table '%s' that was exported " + "from host '%s'", cfg->m_table_name, cfg->m_hostname); + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_7", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the autoinc value. */ + if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading autoinc value."); + + return(DB_IO_ERROR); + } + + cfg->m_autoinc = mach_read_from_8(row); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_8", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the tablespace page size. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data header."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + cfg->m_page_size = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + if (cfg->m_page_size != UNIV_PAGE_SIZE) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Tablespace to be imported has a different " + "page size than this server. 
Server page size " + "is %lu, whereas tablespace page size is %lu", + UNIV_PAGE_SIZE, (ulong) cfg->m_page_size); + + return(DB_ERROR); + } + + cfg->m_flags = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg->m_n_cols = mach_read_from_4(ptr); + + if (!dict_tf_is_valid(cfg->m_flags)) { + + return(DB_CORRUPTION); + + } else if ((err = row_import_read_columns(file, thd, cfg)) + != DB_SUCCESS) { + + return(err); + + } else if ((err = row_import_read_indexes(file, thd, cfg)) + != DB_SUCCESS) { + + return(err); + } + + ut_a(err == DB_SUCCESS); + return(err); +} + +/** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_meta_data( +/*======================*/ + dict_table_t* table, /*!< in: table */ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_9", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(&row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data version."); + + return(DB_IO_ERROR); + } + + cfg.m_version = mach_read_from_4(row); + + /* Check the version number. */ + switch (cfg.m_version) { + case IB_EXPORT_CFG_VERSION_V1: + + return(row_import_read_v1(file, thd, &cfg)); + default: + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Unsupported meta-data version number (%lu), " + "file ignored", (ulong) cfg.m_version); + } + + return(DB_ERROR); +} + +/** +Read the contents of the <tablename>.cfg file. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_cfg( +/*================*/ + dict_table_t* table, /*!< in: table */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + cfg.m_table = table; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + FILE* file = fopen(name, "rb"); + + if (file == NULL) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), + "Error opening '%s', will attempt to import " + "without schema verification", name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR, + errno, strerror(errno), msg); + + cfg.m_missing = true; + + err = DB_FAIL; + } else { + + cfg.m_missing = false; + + err = row_import_read_meta_data(table, file, thd, cfg); + fclose(file); + } + + return(err); +} + +/*****************************************************************//** +Update the <space, root page> of a table's indexes from the values +in the data dictionary. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_import_update_index_root( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + const dict_table_t* table, /*!< in: Table for which we want + to set the root page_no */ + bool reset, /*!< in: if true then set to + FIL_NUL */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. 
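+	   (Note: the update below runs through InnoDB's internal SQL
+	   parser.  The literals are bound by pointer with
+	   pars_info_bind_int4_literal() and friends, and the loop
+	   re-encodes type, space, page and the two ids with
+	   mach_write_to_4()/mach_write_to_8() before each execution, so
+	   one prepared graph is reused for every index of the table.)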
*/ + +{ + const dict_index_t* index; + que_t* graph = 0; + dberr_t err = DB_SUCCESS; + + static const char sql[] = { + "PROCEDURE UPDATE_INDEX_ROOT() IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES\n" + "SET SPACE = :space,\n" + " PAGE_NO = :page,\n" + " TYPE = :type\n" + "WHERE TABLE_ID = :table_id AND ID = :index_id;\n" + "END;\n"}; + + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + + for (index = dict_table_get_first_index(table); + index != 0; + index = dict_table_get_next_index(index)) { + + pars_info_t* info; + ib_uint32_t page; + ib_uint32_t space; + ib_uint32_t type; + index_id_t index_id; + table_id_t table_id; + + info = (graph != 0) ? graph->info : pars_info_create(); + + mach_write_to_4( + reinterpret_cast<byte*>(&type), + index->type); + + mach_write_to_4( + reinterpret_cast<byte*>(&page), + reset ? FIL_NULL : index->page); + + mach_write_to_4( + reinterpret_cast<byte*>(&space), + reset ? FIL_NULL : index->space); + + mach_write_to_8( + reinterpret_cast<byte*>(&index_id), + index->id); + + mach_write_to_8( + reinterpret_cast<byte*>(&table_id), + table->id); + + /* If we set the corrupt bit during the IMPORT phase then + we need to update the system tables. */ + pars_info_bind_int4_literal(info, "type", &type); + pars_info_bind_int4_literal(info, "space", &space); + pars_info_bind_int4_literal(info, "page", &page); + pars_info_bind_ull_literal(info, "index_id", &index_id); + pars_info_bind_ull_literal(info, "table_id", &table_id); + + if (graph == 0) { + graph = pars_sql(info, sql); + ut_a(graph); + graph->trx = trx; + } + + que_thr_t* thr; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + DBUG_EXECUTE_IF("ib_import_internal_error", + trx->error_state = DB_ERROR;); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), + index->name, TRUE); + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "While updating the <space, root page " + "number> of index %s - %s", + index_name, ut_strerr(err)); + + break; + } + } + + que_graph_free(graph); + + if (!dict_locked) { + mutex_exit(&dict_sys->mutex); + } + + return(err); +} + +/** Callback arg for row_import_set_discarded. */ +struct discard_t { + ib_uint32_t flags2; /*!< Value read from column */ + bool state; /*!< New state of the flag */ + ulint n_recs; /*!< Number of recs processed */ +}; + +/******************************************************************//** +Fetch callback that sets or unsets the DISCARDED tablespace flag in +SYS_TABLES. The flags is stored in MIX_LEN column. +@return FALSE if all OK */ +static +ibool +row_import_set_discarded( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: bool set/unset flag */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + discard_t* discard = static_cast<discard_t*>(user_arg); + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == sizeof(ib_uint32_t)); + + ulint flags2 = mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + + if (discard->state) { + flags2 |= DICT_TF2_DISCARDED; + } else { + flags2 &= ~DICT_TF2_DISCARDED; + } + + mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2); + + ++discard->n_recs; + + /* There should be at most one matching record. 
*/ + ut_a(discard->n_recs == 1); + + return(FALSE); +} + +/*****************************************************************//** +Update the DICT_TF2_DISCARDED flag in SYS_TABLES. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +row_import_update_discarded_flag( +/*=============================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + table_id_t table_id, /*!< in: Table for which we want + to set the root table->flags2 */ + bool discarded, /*!< in: set MIX_LEN column bit + to discarded, if true */ + bool dict_locked) /*!< in: set to true if the + caller already owns the + dict_sys_t:: mutex. */ + +{ + pars_info_t* info; + discard_t discard; + + static const char sql[] = + "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS\n" + " SELECT MIX_LEN " + " FROM SYS_TABLES " + " WHERE ID = :table_id FOR UPDATE;" + "\n" + "BEGIN\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_TABLES" + " SET MIX_LEN = :flags2" + " WHERE ID = :table_id;\n" + "CLOSE c;\n" + "END;\n"; + + discard.n_recs = 0; + discard.state = discarded; + discard.flags2 = ULINT32_UNDEFINED; + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "table_id", table_id); + pars_info_bind_int4_literal(info, "flags2", &discard.flags2); + + pars_info_bind_function( + info, "my_func", row_import_set_discarded, &discard); + + dberr_t err = que_eval_sql(info, sql, !dict_locked, trx); + + ut_a(discard.n_recs == 1); + ut_a(discard.flags2 != ULINT32_UNDEFINED); + + return(err); +} + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ +{ + dberr_t err; + trx_t* trx; + ib_uint64_t autoinc = 0; + char table_name[MAX_FULL_NAME_LEN + 1]; + char* filepath = NULL; + + ut_ad(!srv_read_only_mode); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ut_a(table->space); + ut_ad(prebuilt->trx); + ut_a(table->ibd_file_missing); + + trx_start_if_not_started(prebuilt->trx); + + trx = trx_allocate_for_mysql(); + + /* So that the table is not DROPped during recovery. */ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + trx_start_if_not_started(trx); + + /* So that we can send error messages to the user. */ + trx->mysql_thd = prebuilt->trx->mysql_thd; + + /* Ensure that the table will be dropped by trx_rollback_active() + in case of a crash. */ + + trx->table_id = table->id; + + /* Assign an undo segment for the transaction, so that the + transaction will be recovered after a crash. */ + + mutex_enter(&trx->undo_mutex); + + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + mutex_exit(&trx->undo_mutex); + + DBUG_EXECUTE_IF("ib_import_undo_assign_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err != DB_SUCCESS) { + + return(row_import_cleanup(prebuilt, trx, err)); + + } else if (trx->update_undo == 0) { + + err = DB_TOO_MANY_CONCURRENT_TRXS; + return(row_import_cleanup(prebuilt, trx, err)); + } + + prebuilt->trx->op_info = "read meta-data file"; + + /* Prevent DDL operations while we are checking. 
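+
+	   For orientation, the user-level sequence that reaches this
+	   function is roughly the following (the .cfg file read next is
+	   written by FLUSH TABLES ... FOR EXPORT on the source server):
+
+		-- on the source server
+		FLUSH TABLES t FOR EXPORT;	-- quiesce t, write t.cfg
+		-- copy t.ibd and t.cfg to the destination datadir
+		UNLOCK TABLES;
+
+		-- on the destination server
+		ALTER TABLE t DISCARD TABLESPACE;
+		ALTER TABLE t IMPORT TABLESPACE;  -- ends up here
+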
*/ + + rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + + row_import cfg; + + memset(&cfg, 0x0, sizeof(cfg)); + + err = row_import_read_cfg(table, trx->mysql_thd, cfg); + + /* Check if the table column definitions match the contents + of the config file. */ + + if (err == DB_SUCCESS) { + + /* We have a schema file, try and match it with the our + data dictionary. */ + + err = cfg.match_schema(trx->mysql_thd); + + /* Update index->page and SYS_INDEXES.PAGE_NO to match the + B-tree root page numbers in the tablespace. Use the index + name from the .cfg file to find match. */ + + if (err == DB_SUCCESS) { + cfg.set_root_by_name(); + autoinc = cfg.m_autoinc; + } + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + DBUG_EXECUTE_IF("ib_import_set_index_root_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + } else if (cfg.m_missing) { + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + /* We don't have a schema file, we will have to discover + the index root pages from the .ibd file and skip the schema + matching step. */ + + ut_a(err == DB_FAIL); + + cfg.m_page_size = UNIV_PAGE_SIZE; + + FetchIndexRootPages fetchIndexRootPages(table, trx); + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_page_size), + fetchIndexRootPages); + + if (err == DB_SUCCESS) { + + err = fetchIndexRootPages.build_row_import(&cfg); + + /* Update index->page and SYS_INDEXES.PAGE_NO + to match the B-tree root page numbers in the + tablespace. */ + + if (err == DB_SUCCESS) { + err = cfg.set_root_by_heuristic(); + } + } + + } else { + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + } + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + prebuilt->trx->op_info = "importing tablespace"; + + ib_logf(IB_LOG_LEVEL_INFO, "Phase I - Update all pages"); + + /* Iterate over all the pages and do the sanity checking and + the conversion required to import the tablespace. */ + + PageConverter converter(&cfg, trx); + + /* Set the IO buffer size in pages. */ + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_page_size), converter); + + DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err != DB_SUCCESS) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "Cannot reset LSNs in table '%s' : %s", + table_name, ut_strerr(err)); + + return(row_import_cleanup(prebuilt, trx, err)); + } + + row_mysql_lock_data_dictionary(trx); + + /* If the table is stored in a remote tablespace, we need to + determine that filepath from the link file and system tables. + Find the space ID in SYS_TABLES since this is an ALTER TABLE. */ + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, true); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + ut_a(filepath); + + /* Open the tablespace so that we can access via the buffer pool. + We set the 2nd param (fix_dict = true) here because we already + have an x-lock on dict_operation_lock and dict_sys->mutex. 
*/ + + err = fil_open_single_table_tablespace( + true, true, table->space, + dict_tf_to_fsp_flags(table->flags), + table->name, filepath); + + DBUG_EXECUTE_IF("ib_import_open_tablespace_failure", + err = DB_TABLESPACE_NOT_FOUND;); + + if (err != DB_SUCCESS) { + row_mysql_unlock_data_dictionary(trx); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_FILE_NOT_FOUND, + filepath, err, ut_strerr(err)); + + mem_free(filepath); + + return(row_import_cleanup(prebuilt, trx, err)); + } + + row_mysql_unlock_data_dictionary(trx); + + mem_free(filepath); + + err = ibuf_check_bitmap_on_import(trx, table->space); + + DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_cleanup(prebuilt, trx, err)); + } + + /* The first index must always be the clustered index. */ + + dict_index_t* index = dict_table_get_first_index(table); + + if (!dict_index_is_clust(index)) { + return(row_import_error(prebuilt, trx, DB_CORRUPTION)); + } + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + + DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } else if (cfg.requires_purge(index->name)) { + + /* Purge any delete-marked records that couldn't be + purged during the page conversion phase from the + cluster index. */ + + IndexPurge purge(trx, index); + + trx->op_info = "cluster: purging delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + } + + DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* For secondary indexes, purge any records that couldn't be purged + during the page conversion phase. */ + + err = row_import_adjust_root_pages_of_secondary_indexes( + prebuilt, trx, table, cfg); + + DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* Ensure that the next available DB_ROW_ID is not smaller than + any DB_ROW_ID stored in the table. */ + + if (prebuilt->clust_index_was_generated) { + + err = row_import_set_sys_max_row_id(prebuilt, table); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + } + + ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush changes to disk"); + + /* Ensure that all pages dirtied during the IMPORT make it to disk. + The only dirty pages generated should be from the pessimistic purge + of delete marked records that couldn't be purged in Phase I. */ + + buf_LRU_flush_or_remove_pages( + prebuilt->table->space, BUF_REMOVE_FLUSH_WRITE, trx); + + if (trx_is_interrupted(trx)) { + ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush interrupted"); + return(row_import_error(prebuilt, trx, DB_INTERRUPTED)); + } else { + ib_logf(IB_LOG_LEVEL_INFO, "Phase IV - Flush complete"); + } + + /* The dictionary latches will be released in in row_import_cleanup() + after the transaction commit, for both success and error. */ + + row_mysql_lock_data_dictionary(trx); + + /* Update the root pages of the table's indexes. 
*/
+	err = row_import_update_index_root(trx, table, false, true);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	/* Update the table's discarded flag, unset it. */
+	err = row_import_update_discarded_flag(trx, table->id, false, true);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	}
+
+	table->ibd_file_missing = false;
+	table->flags2 &= ~DICT_TF2_DISCARDED;
+
+	if (autoinc != 0) {
+		char	table_name[MAX_FULL_NAME_LEN + 1];
+
+		innobase_format_name(
+			table_name, sizeof(table_name), table->name, FALSE);
+
+		ib_logf(IB_LOG_LEVEL_INFO, "%s autoinc value set to " IB_ID_FMT,
+			table_name, autoinc);
+
+		dict_table_autoinc_lock(table);
+		dict_table_autoinc_initialize(table, autoinc);
+		dict_table_autoinc_unlock(table);
+	}
+
+	ut_a(err == DB_SUCCESS);
+
+	return(row_import_cleanup(prebuilt, trx, err));
+}
+
diff --git a/storage/xtradb/row/row0ins.cc b/storage/xtradb/row/row0ins.cc
new file mode 100644
index 00000000000..a1de3d7894f
--- /dev/null
+++ b/storage/xtradb/row/row0ins.cc
@@ -0,0 +1,3358 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.cc
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "row0log.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+#include "buf0lru.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "m_string.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module, make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/*********************************************************************//**
+Creates an insert node struct.
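+(Per the IMPORTANT NOTE above, the DML code paths in this file reserve
+redo space before taking any latches, in the pattern
+
+	log_free_check();	// may wait for redo space; no latches held
+	mtr_start(&mtr);
+	// ... latch and modify pages ...
+	mtr_commit(&mtr);
+
+a sketch of the idiom, not a quote from this file.)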
+@return own: insert node struct */ +UNIV_INTERN +ins_node_t* +ins_node_create( +/*============*/ + ulint ins_type, /*!< in: INS_VALUES, ... */ + dict_table_t* table, /*!< in: table where to insert */ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + ins_node_t* node; + + node = static_cast<ins_node_t*>( + mem_heap_alloc(heap, sizeof(ins_node_t))); + + node->common.type = QUE_NODE_INSERT; + + node->ins_type = ins_type; + + node->state = INS_NODE_SET_IX_LOCK; + node->table = table; + node->index = NULL; + node->entry = NULL; + + node->select = NULL; + + node->trx_id = 0; + + node->entry_sys_heap = mem_heap_create(128); + + node->magic_n = INS_NODE_MAGIC_N; + + return(node); +} + +/***********************************************************//** +Creates an entry template for each index of a table. */ +static +void +ins_node_create_entry_list( +/*=======================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + dict_index_t* index; + dtuple_t* entry; + + ut_ad(node->entry_sys_heap); + + UT_LIST_INIT(node->entry_list); + + /* We will include all indexes (include those corrupted + secondary indexes) in the entry list. Filteration of + these corrupted index will be done in row_ins() */ + + for (index = dict_table_get_first_index(node->table); + index != 0; + index = dict_table_get_next_index(index)) { + + entry = row_build_index_entry( + node->row, NULL, index, node->entry_sys_heap); + + UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry); + } +} + +/*****************************************************************//** +Adds system field buffers to a row. */ +static +void +row_ins_alloc_sys_fields( +/*=====================*/ + ins_node_t* node) /*!< in: insert node */ +{ + dtuple_t* row; + dict_table_t* table; + mem_heap_t* heap; + const dict_col_t* col; + dfield_t* dfield; + byte* ptr; + + row = node->row; + table = node->table; + heap = node->entry_sys_heap; + + ut_ad(row && table && heap); + ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); + + /* allocate buffer to hold the needed system created hidden columns. */ + uint len = DATA_ROW_ID_LEN + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + ptr = static_cast<byte*>(mem_heap_zalloc(heap, len)); + + /* 1. Populate row-id */ + col = dict_table_get_sys_col(table, DATA_ROW_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN); + + node->row_id_buf = ptr; + + ptr += DATA_ROW_ID_LEN; + + /* 2. Populate trx id */ + col = dict_table_get_sys_col(table, DATA_TRX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN); + + node->trx_id_buf = ptr; + + ptr += DATA_TRX_ID_LEN; + + /* 3. Populate roll ptr */ + + col = dict_table_get_sys_col(table, DATA_ROLL_PTR); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN); +} + +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. 
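+(Size note for row_ins_alloc_sys_fields() above: the three hidden
+system columns are fixed width, DATA_ROW_ID_LEN = 6, DATA_TRX_ID_LEN = 6
+and DATA_ROLL_PTR_LEN = 7 bytes, so the single mem_heap_zalloc() there
+reserves 19 bytes that back DB_ROW_ID, DB_TRX_ID and DB_ROLL_PTR for
+the row.)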
*/ +UNIV_INTERN +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row) /*!< in: new row (or first row) for the node */ +{ + node->state = INS_NODE_SET_IX_LOCK; + node->index = NULL; + node->entry = NULL; + + node->row = row; + + mem_heap_empty(node->entry_sys_heap); + + /* Create templates for index entries */ + + ins_node_create_entry_list(node); + + /* Allocate from entry_sys_heap buffers for sys fields */ + + row_ins_alloc_sys_fields(node); + + /* As we allocated a new trx id buf, the trx id should be written + there again: */ + + node->trx_id = 0; +} + +/*******************************************************************//** +Does an insert operation by updating a delete-marked existing record +in the index. This situation can occur if the delete-marked record is +kept in the index for consistent reads. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_sec_index_entry_by_modify( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + ulint** offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + big_rec_t* dummy_big_rec; + upd_t* update; + rec_t* rec; + dberr_t err; + + rec = btr_cur_get_rec(cursor); + + ut_ad(!dict_index_is_clust(cursor->index)); + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + ut_ad(!entry->info_bits); + + /* We know that in the alphabetical ordering, entry and rec are + identified. But in their binary form there may be differences if + there are char fields in them. Therefore we have to calculate the + difference. */ + + update = row_upd_build_sec_rec_difference_binary( + rec, cursor->index, *offsets, entry, heap); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* We should never insert in place of a record that + has not been delete-marked. The only exception is when + online CREATE INDEX copied the changes that we already + made to the clustered index, and completed the + secondary index creation before we got here. In this + case, the change would already be there. The CREATE + INDEX should be waiting for a MySQL meta-data lock + upgrade at least until this INSERT or UPDATE + returns. After that point, the TEMP_INDEX_PREFIX + would be dropped from the index name in + commit_inplace_alter_table(). 
*/ + ut_a(update->n_fields == 0); + ut_a(*cursor->index->name == TEMP_INDEX_PREFIX); + ut_ad(!dict_index_is_online_ddl(cursor->index)); + return(DB_SUCCESS); + } + + if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping changes + within the page */ + + /* TODO: pass only *offsets */ + err = btr_cur_optimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, + heap, &dummy_big_rec, update, 0, + thr, thr_get_trx(thr)->id, mtr); + ut_ad(!dummy_big_rec); + } + + return(err); +} + +/*******************************************************************//** +Does an insert operation by delete unmarking and updating a delete marked +existing record in the index. This situation can occur if the delete marked +record is kept in the index for consistent reads. +@return DB_SUCCESS, DB_FAIL, or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_clust_index_entry_by_modify( +/*================================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap that can + be emptied, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ + big_rec_t** big_rec,/*!< out: possible big rec vector of fields + which have to be stored externally by the + caller */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + const rec_t* rec; + const upd_t* update; + dberr_t err; + + ut_ad(dict_index_is_clust(cursor->index)); + + *big_rec = NULL; + + rec = btr_cur_get_rec(cursor); + + ut_ad(rec_get_deleted_flag(rec, + dict_table_is_comp(cursor->index->table))); + + /* Build an update vector containing all the fields to be modified; + NOTE that this vector may NOT contain system columns trx_id or + roll_ptr */ + + update = row_upd_build_difference_binary( + cursor->index, entry, rec, NULL, true, + thr_get_trx(thr), heap); + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); + + /* Try optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update( + flags, cursor, offsets, offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + + } + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_POS_FLAG, + cursor, offsets, offsets_heap, heap, + big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr); + } + + return(err); +} + +/*********************************************************************//** +Returns TRUE if in a cascaded update/delete an ancestor node of node +updates (not DELETE, but UPDATE) table. 
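+(Aside on the two functions above: mapping DB_OVERFLOW, DB_UNDERFLOW
+and DB_ZIP_OVERFLOW to DB_FAIL lets the caller fall back from the
+optimistic, page-local attempt to the pessimistic one, in the pattern
+
+	err = modify(BTR_MODIFY_LEAF, ...);	// optimistic try
+	if (err == DB_FAIL) {
+		err = modify(BTR_MODIFY_TREE, ...);	// may split pages
+	}
+
+a sketch of the calling idiom, not a quote from this file.)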
+@return TRUE if an ancestor updates table */ +static +ibool +row_ins_cascade_ancestor_updates_table( +/*===================================*/ + que_node_t* node, /*!< in: node in a query graph */ + dict_table_t* table) /*!< in: table */ +{ + que_node_t* parent; + + for (parent = que_node_get_parent(node); + que_node_get_type(parent) == QUE_NODE_UPDATE; + parent = que_node_get_parent(parent)) { + + upd_node_t* upd_node; + + upd_node = static_cast<upd_node_t*>(parent); + + if (upd_node->table == table && upd_node->is_delete == FALSE) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*********************************************************************//** +Returns the number of ancestor UPDATE or DELETE nodes of a +cascaded update/delete node. +@return number of ancestors */ +static __attribute__((nonnull, warn_unused_result)) +ulint +row_ins_cascade_n_ancestors( +/*========================*/ + que_node_t* node) /*!< in: node in a query graph */ +{ + que_node_t* parent; + ulint n_ancestors = 0; + + for (parent = que_node_get_parent(node); + que_node_get_type(parent) == QUE_NODE_UPDATE; + parent = que_node_get_parent(parent)) { + + n_ancestors++; + } + + return(n_ancestors); +} + +/******************************************************************//** +Calculates the update vector node->cascade->update for a child table in +a cascaded update. +@return number of fields in the calculated update vector; the value +can also be 0 if no foreign key fields changed; the returned value is +ULINT_UNDEFINED if the column type in the child table is too short to +fit the new value in the parent table: that means the update fails */ +static __attribute__((nonnull, warn_unused_result)) +ulint +row_ins_cascade_calc_update_vec( +/*============================*/ + upd_node_t* node, /*!< in: update node of the parent + table */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + mem_heap_t* heap, /*!< in: memory heap to use as + temporary storage */ + trx_t* trx, /*!< in: update transaction */ + ibool* fts_col_affected)/*!< out: is FTS column affected */ +{ + upd_node_t* cascade = node->cascade_node; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index = foreign->foreign_index; + upd_t* update; + dict_table_t* parent_table; + dict_index_t* parent_index; + upd_t* parent_update; + ulint n_fields_updated; + ulint parent_field_no; + ulint i; + ulint j; + ibool doc_id_updated = FALSE; + ulint doc_id_pos = 0; + doc_id_t new_doc_id = FTS_NULL_DOC_ID; + + ut_a(node); + ut_a(foreign); + ut_a(cascade); + ut_a(table); + ut_a(index); + + /* Calculate the appropriate update vector which will set the fields + in the child index record to the same value (possibly padded with + spaces if the column is a fixed length CHAR or FIXBINARY column) as + the referenced index record will get in the update. 
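+
+	For example, with a parent key column CHAR(8) referenced by a
+	child column CHAR(10) under ON UPDATE CASCADE, the cascaded value
+	is space-padded from 8 to 10 characters below; if instead the new
+	value cannot fit in the child column, or a NULL would land in a
+	NOT NULL child column, the function gives up and returns
+	ULINT_UNDEFINED so that the update fails.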
*/ + + parent_table = node->table; + ut_a(parent_table == foreign->referenced_table); + parent_index = foreign->referenced_index; + parent_update = node->update; + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + + n_fields_updated = 0; + + *fts_col_affected = FALSE; + + if (table->fts) { + doc_id_pos = dict_table_get_nth_col_pos( + table, table->fts->doc_col); + } + + for (i = 0; i < foreign->n_fields; i++) { + + parent_field_no = dict_table_get_nth_col_pos( + parent_table, + dict_index_get_nth_col_no(parent_index, i)); + + for (j = 0; j < parent_update->n_fields; j++) { + const upd_field_t* parent_ufield + = &parent_update->fields[j]; + + if (parent_ufield->field_no == parent_field_no) { + + ulint min_size; + const dict_col_t* col; + ulint ufield_len; + upd_field_t* ufield; + + col = dict_index_get_nth_col(index, i); + + /* A field in the parent index record is + updated. Let us make the update vector + field for the child table. */ + + ufield = update->fields + n_fields_updated; + + ufield->field_no + = dict_table_get_nth_col_pos( + table, dict_col_get_no(col)); + + ufield->orig_len = 0; + ufield->exp = NULL; + + ufield->new_val = parent_ufield->new_val; + ufield_len = dfield_get_len(&ufield->new_val); + + /* Clear the "external storage" flag */ + dfield_set_len(&ufield->new_val, ufield_len); + + /* Do not allow a NOT NULL column to be + updated as NULL */ + + if (dfield_is_null(&ufield->new_val) + && (col->prtype & DATA_NOT_NULL)) { + + return(ULINT_UNDEFINED); + } + + /* If the new value would not fit in the + column, do not allow the update */ + + if (!dfield_is_null(&ufield->new_val) + && dtype_get_at_most_n_mbchars( + col->prtype, col->mbminmaxlen, + col->len, + ufield_len, + static_cast<char*>( + dfield_get_data( + &ufield->new_val))) + < ufield_len) { + + return(ULINT_UNDEFINED); + } + + /* If the parent column type has a different + length than the child column type, we may + need to pad with spaces the new value of the + child column */ + + min_size = dict_col_get_min_size(col); + + /* Because UNIV_SQL_NULL (the marker + of SQL NULL values) exceeds all possible + values of min_size, the test below will + not hold for SQL NULL columns. 
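+				(dfield_get_len() returns UNIV_SQL_NULL for +				SQL NULL values, and that constant is larger +				than any possible min_size.)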
*/ + + if (min_size > ufield_len) { + + byte* pad; + ulint pad_len; + byte* padded_data; + ulint mbminlen; + + padded_data = static_cast<byte*>( + mem_heap_alloc( + heap, min_size)); + + pad = padded_data + ufield_len; + pad_len = min_size - ufield_len; + + memcpy(padded_data, + dfield_get_data(&ufield + ->new_val), + ufield_len); + + mbminlen = dict_col_get_mbminlen(col); + + ut_ad(!(ufield_len % mbminlen)); + ut_ad(!(min_size % mbminlen)); + + if (mbminlen == 1 + && dtype_get_charset_coll( + col->prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL) { + /* Do not pad BINARY columns */ + return(ULINT_UNDEFINED); + } + + row_mysql_pad_col(mbminlen, + pad, pad_len); + dfield_set_data(&ufield->new_val, + padded_data, min_size); + } + + /* Check whether the current column has + FTS index on it */ + if (table->fts + && dict_table_is_fts_column( + table->fts->indexes, + dict_col_get_no(col)) + != ULINT_UNDEFINED) { + *fts_col_affected = TRUE; + } + + /* If Doc ID is updated, check whether the + Doc ID is valid */ + if (table->fts + && ufield->field_no == doc_id_pos) { + doc_id_t n_doc_id; + + n_doc_id = + table->fts->cache->next_doc_id; + + new_doc_id = fts_read_doc_id( + static_cast<const byte*>( + dfield_get_data( + &ufield->new_val))); + + if (new_doc_id <= 0) { + fprintf(stderr, + "InnoDB: FTS Doc ID " + "must be larger than " + "0 \n"); + return(ULINT_UNDEFINED); + } + + if (new_doc_id < n_doc_id) { + fprintf(stderr, + "InnoDB: FTS Doc ID " + "must be larger than " + IB_ID_FMT" for table", + n_doc_id -1); + + ut_print_name(stderr, trx, + TRUE, + table->name); + + putc('\n', stderr); + return(ULINT_UNDEFINED); + } + + *fts_col_affected = TRUE; + doc_id_updated = TRUE; + } + + n_fields_updated++; + } + } + } + + /* Generate a new Doc ID if FTS index columns get updated */ + if (table->fts && *fts_col_affected) { + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + doc_id_t doc_id; + upd_field_t* ufield; + + ut_ad(!doc_id_updated); + ufield = update->fields + n_fields_updated; + fts_get_next_doc_id(table, &trx->fts_next_doc_id); + doc_id = fts_update_doc_id(table, ufield, + &trx->fts_next_doc_id); + n_fields_updated++; + fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL); + } else { + if (doc_id_updated) { + ut_ad(new_doc_id); + fts_trx_add_op(trx, table, new_doc_id, + FTS_INSERT, NULL); + } else { + fprintf(stderr, "InnoDB: FTS Doc ID must be " + "updated along with FTS indexed " + "column for table "); + ut_print_name(stderr, trx, TRUE, table->name); + putc('\n', stderr); + return(ULINT_UNDEFINED); + } + } + } + + update->n_fields = n_fields_updated; + + return(n_fields_updated); +} + +/*********************************************************************//** +Set detailed error message associated with foreign key errors for +the given transaction. 
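+The message is assembled in srv_misc_tmpfile under srv_misc_tmpfile_mutex and +then copied into the transaction object, from where it can later be retrieved +as the detailed error text.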
*/ +static +void +row_ins_set_detailed( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign) /*!< in: foreign key constraint */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&srv_misc_tmpfile_mutex); + rewind(srv_misc_tmpfile); + + if (os_file_set_eof(srv_misc_tmpfile)) { + ut_print_name(srv_misc_tmpfile, trx, TRUE, + foreign->foreign_table_name); + dict_print_info_on_foreign_key_in_create_format( + srv_misc_tmpfile, trx, foreign, FALSE); + trx_set_detailed_error_from_file(trx, srv_misc_tmpfile); + } else { + trx_set_detailed_error(trx, "temp file operation failed"); + } + + mutex_exit(&srv_misc_tmpfile_mutex); +} + +/*********************************************************************//** +Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file +and displays information about the given transaction. +The caller must release dict_foreign_err_mutex. */ +static +void +row_ins_foreign_trx_print( +/*======================*/ + trx_t* trx) /*!< in: transaction */ +{ + ulint n_rec_locks; + ulint n_trx_locks; + ulint heap_size; + + if (srv_read_only_mode) { + return; + } + + lock_mutex_enter(); + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size = mem_heap_get_size(trx->lock.lock_heap); + lock_mutex_exit(); + + mutex_enter(&trx_sys->mutex); + + mutex_enter(&dict_foreign_err_mutex); + rewind(dict_foreign_err_file); + ut_print_timestamp(dict_foreign_err_file); + fputs(" Transaction:\n", dict_foreign_err_file); + + trx_print_low(dict_foreign_err_file, trx, 600, + n_rec_locks, n_trx_locks, heap_size); + + mutex_exit(&trx_sys->mutex); + + ut_ad(mutex_own(&dict_foreign_err_mutex)); +} + +/*********************************************************************//** +Reports a foreign key error associated with an update or a delete of a +parent table index entry. 
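+The report goes to dict_foreign_err_file, the buffer behind the LATEST +FOREIGN KEY ERROR section of SHOW ENGINE INNODB STATUS. Note that +dict_foreign_err_mutex, acquired in row_ins_foreign_trx_print(), is only +released at the end of this function.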
*/ +static +void +row_ins_foreign_report_err( +/*=======================*/ + const char* errstr, /*!< in: error string from the viewpoint + of the parent table */ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a matching index record in the + child table */ + const dtuple_t* entry) /*!< in: index entry in the parent + table */ +{ + if (srv_read_only_mode) { + return; + } + + FILE* ef = dict_foreign_err_file; + trx_t* trx = thr_get_trx(thr); + + row_ins_set_detailed(trx, foreign); + + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign, + TRUE); + putc('\n', ef); + fputs(errstr, ef); + fputs(" in parent table, in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->referenced_index->name); + if (entry) { + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + } + fputs("\nBut in child table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->foreign_index->name); + if (rec) { + fputs(", there is a record:\n", ef); + rec_print(ef, rec, foreign->foreign_index); + } else { + fputs(", the record is not available\n", ef); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Reports a foreign key error to dict_foreign_err_file when we are trying +to add an index entry to a child table. Note that the adding may be the result +of an update, too. */ +static +void +row_ins_foreign_report_add_err( +/*===========================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a record in the parent table: + it does not match entry because we + have an error! */ + const dtuple_t* entry) /*!< in: index entry to insert in the + child table */ +{ + if (srv_read_only_mode) { + return; + } + + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign, + TRUE); + fputs("\nTrying to add in child table, in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->foreign_index->name); + if (entry) { + fputs(" tuple:\n", ef); + /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized. + It would be better to only display the user columns. */ + dtuple_print(ef, entry); + } + fputs("\nBut in parent table ", ef); + ut_print_name(ef, trx, TRUE, foreign->referenced_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->referenced_index->name); + fputs(",\nthe closest match we can find is record:\n", ef); + if (rec && page_rec_is_supremum(rec)) { + /* If the cursor ended on a supremum record, it is better + to report the previous record in the error message, so that + the user gets a more descriptive error message. 
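+		(The supremum is the page-internal dummy "maximum" record, +		so the user record just before it is the closest real match.)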
*/ + rec = page_rec_get_prev_const(rec); + } + + if (rec) { + rec_print(ef, rec, foreign->referenced_index); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Invalidate the query cache for the given table. */ +static +void +row_ins_invalidate_query_cache( +/*===========================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + const char* name) /*!< in: table name prefixed with + database name and a '/' character */ +{ + char* buf; + char* ptr; + ulint len = strlen(name) + 1; + + buf = mem_strdupl(name, len); + + ptr = strchr(buf, '/'); + ut_a(ptr); + *ptr = '\0'; + + innobase_invalidate_query_cache(thr_get_trx(thr), buf, len); + mem_free(buf); +} + +/*********************************************************************//** +Perform referential actions or checks when a parent row is deleted or updated +and the constraint had an ON DELETE or ON UPDATE condition which was not +RESTRICT. +@return DB_SUCCESS, DB_LOCK_WAIT, or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_foreign_check_on_constraint( +/*================================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + btr_pcur_t* pcur, /*!< in: cursor placed on a matching + index record in the child table */ + dtuple_t* entry, /*!< in: index entry in the parent + table */ + mtr_t* mtr) /*!< in: mtr holding the latch of pcur + page */ +{ + upd_node_t* node; + upd_node_t* cascade; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index; + dict_index_t* clust_index; + dtuple_t* ref; + mem_heap_t* upd_vec_heap = NULL; + const rec_t* rec; + const rec_t* clust_rec; + const buf_block_t* clust_block; + upd_t* update; + ulint n_to_update; + dberr_t err; + ulint i; + trx_t* trx; + mem_heap_t* tmp_heap = NULL; + doc_id_t doc_id = FTS_NULL_DOC_ID; + ibool fts_col_affacted = FALSE; + + ut_a(thr); + ut_a(foreign); + ut_a(pcur); + ut_a(mtr); + + trx = thr_get_trx(thr); + + /* Since we are going to delete or update a row, we have to invalidate + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the sync0sync.h rank above the lock_sys_t::mutex. The query cache mutex + has a rank just above the lock_sys_t::mutex. */ + + row_ins_invalidate_query_cache(thr, table->name); + + node = static_cast<upd_node_t*>(thr->run_node); + + if (node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_DELETE_CASCADE + | DICT_FOREIGN_ON_DELETE_SET_NULL))) { + + row_ins_foreign_report_err("Trying to delete", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (!node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_UPDATE_CASCADE + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* This is an UPDATE */ + + row_ins_foreign_report_err("Trying to update", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (node->cascade_node == NULL) { + /* Extend our query graph by creating a child to current + update node. The child is used in the cascade or set null + operation. 
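+		The node is created lazily here, on the first cascaded +		operation, and is then reused; see the note below.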
*/ + + node->cascade_heap = mem_heap_create(128); + node->cascade_node = row_create_update_node_for_mysql( + table, node->cascade_heap); + que_node_set_parent(node->cascade_node, node); + } + + /* Initialize cascade_node to do the operation we want. Note that we + use the SAME cascade node to do all foreign key operations of the + SQL DELETE: the table of the cascade node may change if there are + several child tables to the table where the delete is done! */ + + cascade = node->cascade_node; + + cascade->table = table; + + cascade->foreign = foreign; + + if (node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) { + cascade->is_delete = TRUE; + } else { + cascade->is_delete = FALSE; + + if (foreign->n_fields > cascade->update_n_fields) { + /* We have to make the update vector longer */ + + cascade->update = upd_create(foreign->n_fields, + node->cascade_heap); + cascade->update_n_fields = foreign->n_fields; + } + } + + /* We do not allow cyclic cascaded updating (DELETE is allowed, + but not UPDATE) of the same table, as this can lead to an infinite + cycle. Check that we are not updating the same table which is + already being modified in this cascade chain. We have to check + this also because the modification of the indexes of a 'parent' + table may still be incomplete, and we must avoid seeing the indexes + of the parent table in an inconsistent state! */ + + if (!cascade->is_delete + && row_ins_cascade_ancestor_updates_table(cascade, table)) { + + /* We do not know if this would break foreign key + constraints, but play safe and return an error */ + + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying an update, possibly causing a cyclic" + " cascaded update\n" + "in the child table,", thr, foreign, + btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (row_ins_cascade_n_ancestors(cascade) >= 15) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a too deep cascaded delete or update\n", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + index = btr_pcur_get_btr_cur(pcur)->index; + + ut_a(index == foreign->foreign_index); + + rec = btr_pcur_get_rec(pcur); + + tmp_heap = mem_heap_create(256); + + if (dict_index_is_clust(index)) { + /* pcur is already positioned in the clustered index of + the child table */ + + clust_index = index; + clust_rec = rec; + clust_block = btr_pcur_get_block(pcur); + } else { + /* We have to look for the record in the clustered index + in the child table */ + + clust_index = dict_table_get_first_index(table); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, + tmp_heap); + btr_pcur_open_with_no_init(clust_index, ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + cascade->pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(cascade->pcur); + clust_block = btr_pcur_get_block(cascade->pcur); + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(cascade->pcur) + < dict_index_get_n_unique(clust_index)) { + + fputs("InnoDB: error in cascade of a foreign key op\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + fputs("\n" + "InnoDB: clustered record ", stderr); + rec_print(stderr, clust_rec, clust_index); + fputs("\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com\n", stderr); + ut_ad(0); + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Set an X-lock on the row to delete or update in the child table */ + + 
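+	/* The table intention lock (LOCK_IX) is taken first; the explicit +	record X-lock requested below presumes that the transaction already +	holds it. */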
err = lock_table(0, table, LOCK_IX, thr); + + if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + + err = lock_clust_rec_read_check_and_lock_alt( + 0, clust_block, clust_rec, clust_index, + LOCK_X, LOCK_REC_NOT_GAP, thr); + } + + if (err != DB_SUCCESS) { + + goto nonstandard_exit_func; + } + + if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) { + /* This can happen if there is a circular reference of + rows such that cascading delete comes to delete a row + already in the process of being delete marked */ + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + if (table->fts) { + doc_id = fts_get_doc_id_from_rec(table, clust_rec, tmp_heap); + } + + if (node->is_delete + ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) + : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) { + + /* Build the appropriate update vector which sets + foreign->n_fields first fields in rec to SQL NULL */ + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + UNIV_MEM_INVALID(update->fields, + update->n_fields * sizeof *update->fields); + + for (i = 0; i < foreign->n_fields; i++) { + upd_field_t* ufield = &update->fields[i]; + + ufield->field_no = dict_table_get_nth_col_pos( + table, + dict_index_get_nth_col_no(index, i)); + ufield->orig_len = 0; + ufield->exp = NULL; + dfield_set_null(&ufield->new_val); + + if (table->fts && dict_table_is_fts_column( + table->fts->indexes, + dict_index_get_nth_col_no(index, i)) + != ULINT_UNDEFINED) { + fts_col_affacted = TRUE; + } + } + + if (fts_col_affacted) { + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + } else if (table->fts && cascade->is_delete) { + /* DICT_FOREIGN_ON_DELETE_CASCADE case */ + for (i = 0; i < foreign->n_fields; i++) { + if (table->fts && dict_table_is_fts_column( + table->fts->indexes, + dict_index_get_nth_col_no(index, i)) + != ULINT_UNDEFINED) { + fts_col_affacted = TRUE; + } + } + + if (fts_col_affacted) { + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + } + + if (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) { + + /* Build the appropriate update vector which sets changing + foreign->n_fields first fields in rec to new values */ + + upd_vec_heap = mem_heap_create(256); + + n_to_update = row_ins_cascade_calc_update_vec( + node, foreign, upd_vec_heap, trx, &fts_col_affacted); + + if (n_to_update == ULINT_UNDEFINED) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a cascaded update where the" + " updated value in the child\n" + "table would not fit in the length" + " of the column, or the value would\n" + "be NULL and the column is" + " declared as not NULL in the child table,", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (cascade->update->n_fields == 0) { + + /* The update does not change any columns referred + to in this foreign key constraint: no need to do + anything */ + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + /* Mark the old Doc ID as deleted */ + if (fts_col_affacted) { + ut_ad(table->fts); + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + } + + /* Store pcur position and initialize or store the cascade node + pcur stored position */ + + btr_pcur_store_position(pcur, mtr); + + if (index == clust_index) { + btr_pcur_copy_stored_position(cascade->pcur, pcur); + } else { + 
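+		/* In the secondary index case cascade->pcur was positioned +		on the clustered index record above; store that position +		as well. */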
btr_pcur_store_position(cascade->pcur, mtr); + } + + mtr_commit(mtr); + + ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON); + + cascade->state = UPD_NODE_UPDATE_CLUSTERED; + + err = row_update_cascade_for_mysql(thr, cascade, + foreign->foreign_table); + + if (foreign->foreign_table->n_foreign_key_checks_running == 0) { + fprintf(stderr, + "InnoDB: error: table %s has the counter 0" + " though there is\n" + "InnoDB: a FOREIGN KEY check running on it.\n", + foreign->foreign_table->name); + } + + /* Release the data dictionary latch for a while, so that we do not + starve other threads from doing CREATE TABLE etc. if we have a huge + cascaded operation running. The counter n_foreign_key_checks_running + will prevent other users from dropping or ALTERing the table when we + release the latch. */ + + row_mysql_unfreeze_data_dictionary(thr_get_trx(thr)); + + DEBUG_SYNC_C("innodb_dml_cascade_dict_unfreeze"); + + row_mysql_freeze_data_dictionary(thr_get_trx(thr)); + + mtr_start(mtr); + + /* Restore pcur position */ + + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + + return(err); + +nonstandard_exit_func: + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + if (upd_vec_heap) { + mem_heap_free(upd_vec_heap); + } + + btr_pcur_store_position(pcur, mtr); + + mtr_commit(mtr); + mtr_start(mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + + return(err); +} + +/*********************************************************************//** +Sets a shared lock on a record. Used in locking possible duplicate key +records and also in checking foreign key constraints. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +static +dberr_t +row_ins_set_shared_rec_lock( +/*========================*/ + ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } + + return(err); +} + +/*********************************************************************//** +Sets an exclusive lock on a record.
Used in locking possible duplicate key +records +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +static +dberr_t +row_ins_set_exclusive_rec_lock( +/*===========================*/ + ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } + + return(err); +} + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_operation_lock. +@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ +UNIV_INTERN +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE if we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + upd_node_t* upd_node; + dict_table_t* check_table; + dict_index_t* check_index; + ulint n_fields_cmp; + btr_pcur_t pcur; + int cmp; + ulint i; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + +run_again: +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + err = DB_SUCCESS; + + if (trx->check_foreigns == FALSE) { + /* The user has suppressed foreign key checks currently for + this session */ + goto exit_func; + } + + /* If any of the foreign key fields in entry is SQL NULL, we + suppress the foreign key check: this is compatible with Oracle, + for example */ + + for (i = 0; i < foreign->n_fields; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + goto exit_func; + } + } + + if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) { + upd_node = static_cast<upd_node_t*>(thr->run_node); + + if (!(upd_node->is_delete) && upd_node->foreign == foreign) { + /* If a cascaded update is done as defined by a + foreign key constraint, do not check that + constraint for the child row. In ON UPDATE CASCADE + the update of the parent row is only half done when + we come here: if we would check the constraint here + for the child row it would fail. + + A QUESTION remains: if in the child table there are + several constraints which refer to the same parent + table, we should merge all updates to the child as + one update? And the updates can be contradictory! 
+ Currently we just perform the update associated + with each foreign key constraint, one after + another, and the user has problems predicting in + which order they are performed. */ + + goto exit_func; + } + } + + if (check_ref) { + check_table = foreign->referenced_table; + check_index = foreign->referenced_index; + } else { + check_table = foreign->foreign_table; + check_index = foreign->foreign_index; + } + + if (check_table == NULL + || check_table->ibd_file_missing + || check_index == NULL) { + + if (!srv_read_only_mode && check_ref) { + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, + foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format( + ef, trx, foreign, TRUE); + fputs("\nTrying to add to index ", ef); + ut_print_name(ef, trx, FALSE, + foreign->foreign_index->name); + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + fputs("\nBut the parent table ", ef); + ut_print_name(ef, trx, TRUE, + foreign->referenced_table_name); + fputs("\nor its .ibd file does" + " not currently exist!\n", ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_NO_REFERENCED_ROW; + } + + goto exit_func; + } + + if (check_table != table) { + /* We already have a LOCK_IX on table, but not necessarily + on check_table */ + + err = lock_table(0, check_table, LOCK_IS, thr); + + if (err != DB_SUCCESS) { + + goto do_possible_lock_wait; + } + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, foreign->n_fields); + + btr_pcur_open(check_index, entry, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + /* Scan index records and check if there is a matching record */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + + SRV_CORRUPT_TABLE_CHECK(block, + { + err = DB_CORRUPTION; + goto exit_loop; + }); + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, check_index, + offsets, ULINT_UNDEFINED, &heap); + + if (page_rec_is_supremum(rec)) { + + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block, + rec, check_index, + offsets, thr); + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + continue; + default: + goto end_scan; + } + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, block, + rec, check_index, offsets, thr); + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto end_scan; + } + } else { + /* Found a matching record. 
Lock only + a record because we can allow inserts + into gaps */ + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, block, + rec, check_index, offsets, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto end_scan; + } + + if (check_ref) { + err = DB_SUCCESS; + + goto end_scan; + } else if (foreign->type != 0) { + /* There is an ON UPDATE or ON DELETE + condition: check them in a separate + function */ + + err = row_ins_foreign_check_on_constraint( + thr, foreign, &pcur, entry, + &mtr); + if (err != DB_SUCCESS) { + /* Since reporting a plain + "duplicate key" error + message to the user in + cases where a long CASCADE + operation would lead to a + duplicate key in some + other table is very + confusing, map duplicate + key errors resulting from + FK constraints to a + separate error code. */ + + if (err == DB_DUPLICATE_KEY) { + err = DB_FOREIGN_DUPLICATE_KEY; + } + + goto end_scan; + } + + /* row_ins_foreign_check_on_constraint + may have repositioned pcur on a + different block */ + block = btr_pcur_get_block(&pcur); + } else { + row_ins_foreign_report_err( + "Trying to delete or update", + thr, foreign, rec, entry); + + err = DB_ROW_IS_REFERENCED; + goto end_scan; + } + } + } else { + ut_a(cmp < 0); + + err = row_ins_set_shared_rec_lock( + LOCK_GAP, block, + rec, check_index, offsets, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + if (check_ref) { + err = DB_NO_REFERENCED_ROW; + row_ins_foreign_report_add_err( + trx, foreign, rec, entry); + } else { + err = DB_SUCCESS; + } + default: + break; + } + + goto end_scan; + } + } while (btr_pcur_move_to_next(&pcur, &mtr)); + +exit_loop: + if (check_ref) { + row_ins_foreign_report_add_err( + trx, foreign, btr_pcur_get_rec(&pcur), entry); + err = DB_NO_REFERENCED_ROW; + } else { + err = DB_SUCCESS; + } + +end_scan: + btr_pcur_close(&pcur); + + mtr_commit(&mtr); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + +do_possible_lock_wait: + if (err == DB_LOCK_WAIT) { + bool verified = false; + + trx->error_state = err; + + que_thr_stop_for_mysql(thr); + + lock_wait_suspend_thread(thr); + + if (check_table->to_be_dropped) { + /* The table is being dropped. We shall timeout + this operation */ + err = DB_LOCK_WAIT_TIMEOUT; + goto exit_func; + } + + /* We had temporarily released dict_operation_lock in + above lock sleep wait, now we have the lock again, and + we will need to re-check whether the foreign key has been + dropped. We only need to verify if the table is referenced + table case (check_ref == 0), since MDL lock will prevent + concurrent DDL and DML on the same table */ + if (!check_ref) { + for (dict_foreign_set::iterator it + = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + if (*it == foreign) { + verified = true; + break; + } + } + } else { + verified = true; + } + + if (!verified) { + err = DB_DICT_CHANGED; + } else if (trx->error_state == DB_SUCCESS) { + goto run_again; + } else { + err = trx->error_state; + } + } + +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (UNIV_UNLIKELY(trx->fake_changes)) { + err = DB_SUCCESS; + } + + return(err); +} + +/***************************************************************//** +Checks if foreign key constraints fail for an index entry. 
If index +is not mentioned in any constraint, this function does nothing, +Otherwise does searches to the indexes of referenced tables and +sets shared locks which lock either the success or the failure of +a constraint. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_check_foreign_constraints( +/*==============================*/ + dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_foreign_t* foreign; + dberr_t err; + trx_t* trx; + ibool got_s_lock = FALSE; + + trx = thr_get_trx(thr); + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "foreign_constraint_check_for_ins"); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_index == index) { + dict_table_t* ref_table = NULL; + dict_table_t* foreign_table = foreign->foreign_table; + dict_table_t* referenced_table + = foreign->referenced_table; + + if (referenced_table == NULL) { + + ref_table = dict_table_open_on_name( + foreign->referenced_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); + } + + if (0 == trx->dict_operation_lock_mode) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + if (referenced_table) { + os_inc_counter(dict_sys->mutex, + foreign_table + ->n_foreign_key_checks_running); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects the referenced + table from being dropped while the check is running. */ + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, entry, thr); + + DBUG_EXECUTE_IF("row_ins_dict_change_err", + err = DB_DICT_CHANGED;); + + if (referenced_table) { + os_dec_counter(dict_sys->mutex, + foreign_table + ->n_foreign_key_checks_running); + } + + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + if (ref_table != NULL) { + dict_table_close(ref_table, FALSE, FALSE); + } + + if (err != DB_SUCCESS) { + + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/***************************************************************//** +Checks if a unique key violation to rec would occur at the index entry +insert. +@return TRUE if error */ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + const rec_t* rec, /*!< in: user record; NOTE that we assume + that the caller already has a record lock on + the record! 
*/ + const dtuple_t* entry, /*!< in: entry to insert */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint matched_fields; + ulint matched_bytes; + ulint n_unique; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + matched_bytes = 0; + + cmp_dtuple_rec_with_match(entry, rec, offsets, + &matched_fields, &matched_bytes); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!dict_index_is_clust(index)) { + + for (i = 0; i < n_unique; i++) { + if (dfield_is_null(dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); +} + +/***************************************************************//** +Scans a unique non-clustered index at a given index entry to determine +whether a uniqueness violation has occurred for the key value of the entry. +Set shared locks on possible duplicate records. +@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_scan_sec_index_for_duplicate( +/*=================================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: non-clustered unique index */ + dtuple_t* entry, /*!< in: index entry */ + que_thr_t* thr, /*!< in: query thread */ + bool s_latch,/*!< in: whether index->lock is being held */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mem_heap_t* offsets_heap) + /*!< in/out: memory heap that can be emptied */ +{ + ulint n_unique; + int cmp; + ulint n_fields_cmp; + btr_pcur_t pcur; + dberr_t err = DB_SUCCESS; + ulint allow_duplicates; + ulint* offsets = NULL; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(s_latch == rw_lock_own(&index->lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + n_unique = dict_index_get_n_unique(index); + + /* If the secondary index is unique, but one of the fields in the + n_unique first fields is NULL, a unique key violation cannot occur, + since we define NULL != NULL in this case */ + + for (ulint i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(DB_SUCCESS); + } + } + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, n_unique); + + btr_pcur_open(index, entry, PAGE_CUR_GE, + s_latch + ? BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED + : BTR_SEARCH_LEAF, + &pcur, mtr); + + allow_duplicates = thr_get_trx(thr)->duplicates; + + /* Scan index records and check if there is a duplicate */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + const ulint lock_type = LOCK_ORDINARY; + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &offsets_heap); + + if (flags & BTR_NO_LOCKING_FLAG) { + /* Set no locks when applying log + in online table rebuild. */ + } else if (allow_duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). 
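+			For example, INSERT ... ON DUPLICATE KEY UPDATE will +			go on to modify the conflicting row, so a shared lock +			would not suffice.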
*/ + + err = row_ins_set_exclusive_rec_lock( + lock_type, block, rec, index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + lock_type, block, rec, index, offsets, thr); + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + case DB_SUCCESS: + break; + default: + goto end_scan; + } + + if (page_rec_is_supremum(rec)) { + + continue; + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { + err = DB_DUPLICATE_KEY; + + thr_get_trx(thr)->error_info = index; + + /* If the duplicate is on hidden FTS_DOC_ID, + state so in the error log */ + if (DICT_TF2_FLAG_IS_SET( + index->table, + DICT_TF2_FTS_HAS_DOC_ID) + && strcmp(index->name, + FTS_DOC_ID_INDEX_NAME) == 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Duplicate FTS_DOC_ID value" + " on table %s", + index->table->name); + } + + goto end_scan; + } + } else { + ut_a(cmp < 0); + goto end_scan; + } + } while (btr_pcur_move_to_next(&pcur, mtr)); + +end_scan: + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + + return(err); +} + +/** Checks for a duplicate when the table is being rebuilt online. +@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_online( +/*=====================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const rec_t* rec, /*!< in: clustered index record */ + ulint* offsets)/*!< in/out: rec_get_offsets(rec) */ +{ + ulint fields = 0; + ulint bytes = 0; + + /* During rebuild, there should not be any delete-marked rows + in the new table. */ + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq); + + /* Compare the PRIMARY KEY fields and the + DB_TRX_ID, DB_ROLL_PTR. */ + cmp_dtuple_rec_with_match_low( + entry, rec, offsets, n_uniq + 2, &fields, &bytes); + + if (fields < n_uniq) { + /* Not a duplicate. */ + return(DB_SUCCESS); + } + + if (fields == n_uniq + 2) { + /* rec is an exact match of entry. */ + ut_ad(bytes == 0); + return(DB_SUCCESS_LOCKED_REC); + } + + return(DB_DUPLICATE_KEY); +} + +/** Checks for a duplicate when the table is being rebuilt online. 
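+This wrapper applies row_ins_duplicate_online() to the record the cursor is +positioned on and, if necessary, to the record following it.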
+@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust_online( +/*====================================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const btr_cur_t*cursor, /*!< in: cursor on insert position */ + ulint** offsets,/*!< in/out: rec_get_offsets(rec) */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + dberr_t err = DB_SUCCESS; + const rec_t* rec = btr_cur_get_rec(cursor); + + if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); + if (err != DB_SUCCESS) { + return(err); + } + } + + rec = page_rec_get_next_const(btr_cur_get_rec(cursor)); + + if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); + } + + return(err); +} + +/***************************************************************//** +Checks if a unique key violation error would occur at an index entry +insert. Sets shared locks on possible duplicate records. Works only +for a clustered index! +@retval DB_SUCCESS if no error +@retval DB_DUPLICATE_KEY if error +@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate +record +@retval DB_SUCCESS_LOCKED_REC if an exact match of the record was found +in online table rebuild (flags & (BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG)) */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust( +/*=============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + const dtuple_t* entry, /*!< in: entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dberr_t err; + rec_t* rec; + ulint n_unique; + trx_t* trx = thr_get_trx(thr); + mem_heap_t*heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + UT_NOT_USED(mtr); + + ut_ad(dict_index_is_clust(cursor->index)); + + /* NOTE: For unique non-clustered indexes there may be any number + of delete marked records with the same value for the non-clustered + index key (remember multiversioning), and which differ only in + the row reference part of the index record, containing the + clustered index key fields. For such a secondary index record, + to avoid a race condition, we must FIRST do the insertion and after + that check that the uniqueness condition is not breached! */ + + /* NOTE: A problem is that node pointers on the upper levels of the + B-tree may match the entry more closely than the actual existing + user records on the leaf level. So, even if low_match would suggest + that a duplicate key violation may occur, this may not be the case.
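+	The code below therefore requires a real user record (not the page +	infimum or supremum) before reporting a duplicate.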
*/ + + n_unique = dict_index_get_n_unique(cursor->index); + + if (cursor->low_match >= n_unique) { + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + /* We set a lock on the possible duplicate: this + is needed in logical logging of MySQL to make + sure that in roll-forward we get the same duplicate + errors as in original execution */ + + if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), rec, + cursor->index, offsets, thr); + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index, offsets)) { +duplicate: + trx->error_info = cursor->index; + err = DB_DUPLICATE_KEY; + goto func_exit; + } + } + } + + if (cursor->up_match >= n_unique) { + + rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + if (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index, offsets)) { + goto duplicate; + } + } + + /* This should never happen */ + ut_error; + } + + err = DB_SUCCESS; +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/***************************************************************//** +Checks if an index entry has long enough common prefix with an +existing record so that the intended insert of the entry must be +changed to a modify of the existing record. In the case of a clustered +index, the prefix must be n_unique fields long. In the case of a +secondary index, all fields must be equal. InnoDB never updates +secondary index records in place, other than clearing or setting the +delete-mark flag. We could be able to update the non-unique fields +of a unique secondary index record by checking the cursor->up_match, +but we do not do so, because it could have some locking implications. +@return TRUE if the existing record should be updated; FALSE if not */ +UNIV_INLINE +ibool +row_ins_must_modify_rec( +/*====================*/ + const btr_cur_t* cursor) /*!< in: B-tree cursor */ +{ + /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust) + Because node pointers on upper levels of the B-tree may match more + to entry than to actual user records on the leaf level, we + have to check if the candidate record is actually a user record. + A clustered index node pointer contains index->n_unique first fields, + and a secondary index node pointer contains all index fields. 
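+	dict_index_get_n_unique_in_tree() returns exactly that number of +	fields for either kind of index.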
*/ + + return(cursor->low_match + >= dict_index_get_n_unique_in_tree(cursor->index) + && !page_rec_is_infimum(btr_cur_get_rec(cursor))); +} + +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_cur_t cursor; + ulint* offsets = NULL; + dberr_t err; + big_rec_t* big_rec = NULL; + mtr_t mtr; + mem_heap_t* offsets_heap = NULL; + + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_unique(index) + || n_uniq == dict_index_get_n_unique(index)); + ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index)); + + mtr_start(&mtr); + + if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) { + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + mode = BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED; + } else { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + } + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + mode = (mode & BTR_MODIFY_TREE) + ? BTR_SEARCH_TREE : BTR_SEARCH_LEAF; + } + + cursor.thr = thr; + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, mode, + &cursor, 0, __FILE__, __LINE__, &mtr); + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } +#endif + + if (n_uniq && (cursor.up_match >= n_uniq + || cursor.low_match >= n_uniq)) { + + if (flags + == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) { + /* Set no locks when applying log + in online table rebuild. Only check for duplicates. 
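+			This flag combination is used when applying the row +			log in online table rebuild; compare +			row_log_table_apply().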
*/ + err = row_ins_duplicate_error_in_clust_online( + n_uniq, entry, &cursor, + &offsets, &offsets_heap); + + switch (err) { + case DB_SUCCESS: + break; + default: + ut_ad(0); + /* fall through */ + case DB_SUCCESS_LOCKED_REC: + case DB_DUPLICATE_KEY: + thr_get_trx(thr)->error_info = cursor.index; + } + } else { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust( + flags, &cursor, entry, thr, &mtr); + } + + if (err != DB_SUCCESS) { +err_exit: + mtr_commit(&mtr); + goto func_exit; + } + } + + if (row_ins_must_modify_rec(&cursor)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + mem_heap_t* entry_heap = mem_heap_create(1024); + + err = row_ins_clust_index_entry_by_modify( + flags, mode, &cursor, &offsets, &offsets_heap, + entry_heap, &big_rec, entry, thr, &mtr); + + rec_t* rec = btr_cur_get_rec(&cursor); + + if (big_rec && UNIV_LIKELY(!thr_get_trx(thr)->fake_changes)) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. Allocate + pages for big_rec in the mtr that + modified the B-tree, but be sure to skip + any pages that were freed in mtr. We will + write out the big_rec pages before + committing the B-tree mini-transaction. If + the system crashes so that crash recovery + will not replay the mtr_commit(&mtr), the + big_rec pages will be left orphaned until + the pages are allocated for something else. + + TODO: If the allocation extends the + tablespace, it will not be redo + logged, in either mini-transaction. + Tablespace extension should be + redo-logged in the big_rec + mini-transaction, so that recovery + will not fail when the big_rec was + written to the extended portion of the + file, in case the file was somehow + truncated in the crash. */ + + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "before_row_ins_upd_extern"); + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr, + BTR_STORE_INSERT_UPDATE); + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "after_row_ins_upd_extern"); + /* If writing big_rec fails (for + example, because of DB_OUT_OF_FILE_SPACE), + the record will be corrupted. Even if + we did not update any externally + stored columns, our update could cause + the record to grow so that a + non-updated column was selected for + external storage. This non-update + would not have been written to the + undo log, and thus the record cannot + be rolled back. + + However, because we have not executed + mtr_commit(mtr) yet, the update will + not be replayed in crash recovery, and + the following assertion failure will + effectively "roll back" the operation. 
*/ + ut_a(err == DB_SUCCESS); + dtuple_big_rec_free(big_rec); + } + + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_insert(rec, index, offsets); + } + + mtr_commit(&mtr); + mem_heap_free(entry_heap); + } else { + rec_t* insert_rec; + + if (mode != BTR_MODIFY_TREE) { + ut_ad(((mode & ~BTR_ALREADY_S_LATCHED) + == BTR_MODIFY_LEAF) + || thr_get_trx(thr)->fake_changes); + err = btr_cur_optimistic_insert( + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } else { + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + goto err_exit; + } + + err = btr_cur_optimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } + } + + if (UNIV_LIKELY_NULL(big_rec)) { + mtr_commit(&mtr); + + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + + /* skip store extern */ + mem_heap_free(big_rec->heap); + goto func_exit; + } + + /* Online table rebuild could read (and + ignore) the incomplete record at this point. + If online rebuild is in progress, the + row_ins_index_entry_big_rec() will write log. */ + + DBUG_EXECUTE_IF( + "row_ins_extern_checkpoint", + log_make_checkpoint_at( + LSN_MAX, TRUE);); + err = row_ins_index_entry_big_rec( + entry, big_rec, offsets, &offsets_heap, index, + thr_get_trx(thr)->mysql_thd, + __FILE__, __LINE__); + dtuple_convert_back_big_rec(index, entry, big_rec); + } else { + if (err == DB_SUCCESS + && dict_index_is_online_ddl(index)) { + row_log_table_insert( + insert_rec, index, offsets); + } + + mtr_commit(&mtr); + } + } + +func_exit: + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + return(err); +} + +/***************************************************************//** +Starts a mini-transaction and checks if the index will be dropped. +@return true if the index is to be dropped */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_ins_sec_mtr_start_and_check_if_aborted( +/*=======================================*/ + mtr_t* mtr, /*!< out: mini-transaction */ + dict_index_t* index, /*!< in/out: secondary index */ + bool check, /*!< in: whether to check */ + ulint search_mode) + /*!< in: flags */ +{ + ut_ad(!dict_index_is_clust(index)); + + mtr_start(mtr); + + if (!check) { + return(false); + } + + if (search_mode & BTR_ALREADY_S_LATCHED) { + mtr_s_lock(dict_index_get_lock(index), mtr); + } else { + mtr_x_lock(dict_index_get_lock(index), mtr); + } + + switch (index->online_status) { + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + ut_ad(*index->name == TEMP_INDEX_PREFIX); + return(true); + case ONLINE_INDEX_COMPLETE: + return(false); + case ONLINE_INDEX_CREATION: + break; + } + + ut_error; + return(true); +} + +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. 
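+(The delete-unmark is performed by row_ins_sec_index_entry_by_modify(), +called below when row_ins_must_modify_rec() holds.)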
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_cur_t cursor; + ulint search_mode = mode | BTR_INSERT; + dberr_t err = DB_SUCCESS; + ulint n_unique; + mtr_t mtr; + ulint* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE); + + cursor.thr = thr; + ut_ad(thr_get_trx(thr)->id); + mtr_start(&mtr); + + /* Ensure that we acquire index->lock when inserting into an + index with index->online_status == ONLINE_INDEX_COMPLETE, but + could still be subject to rollback_inplace_alter_table(). + This prevents a concurrent change of index->online_status. + The memory object cannot be freed as long as we have an open + reference to the table, or index->table->n_ref_count > 0. */ + const bool check = *index->name == TEMP_INDEX_PREFIX; + if (check) { + DEBUG_SYNC_C("row_ins_sec_index_enter"); + if (mode == BTR_MODIFY_LEAF) { + search_mode |= BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try( + index, entry, thr_get_trx(thr)->id)) { + goto func_exit; + } + } + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (!thr_get_trx(thr)->check_unique_secondary) { + search_mode |= BTR_IGNORE_SEC_UNIQUE; + } + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + search_mode, + &cursor, 0, __FILE__, __LINE__, &mtr); + + if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { + /* The insert was buffered during the search: we are done */ + goto func_exit; + } + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } +#endif + + n_unique = dict_index_get_n_unique(index); + + if (dict_index_is_unique(index) + && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) { + mtr_commit(&mtr); + + DEBUG_SYNC_C("row_ins_sec_index_unique"); + + if (row_ins_sec_mtr_start_and_check_if_aborted( + &mtr, index, check, search_mode)) { + goto func_exit; + } + + err = row_ins_scan_sec_index_for_duplicate( + flags, index, entry, thr, check, &mtr, offsets_heap); + + mtr_commit(&mtr); + + switch (err) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + if (*index->name == TEMP_INDEX_PREFIX) { + ut_ad(!thr_get_trx(thr) + ->dict_operation_lock_mode); + mutex_enter(&dict_sys->mutex); + dict_set_corrupted_index_cache_only( + index, index->table); + mutex_exit(&dict_sys->mutex); + /* Do not return any error to the + caller. 
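row_ins_sec_index_entry_low() above builds search_mode by OR-ing the latch mode with hint bits such as BTR_INSERT (allow buffering the insert), BTR_ALREADY_S_LATCHED and BTR_IGNORE_SEC_UNIQUE; before the cursor is repositioned for the actual insert, the hint bits are stripped again with a mask. A minimal sketch of that flag style, with made-up values standing in for the real BTR_* constants:

    #include <cstdio>

    // Hypothetical flag values for illustration only; the real BTR_*
    // constants live elsewhere and differ from these.
    enum {
        MODIFY_LEAF        = 0x01,
        MODIFY_TREE        = 0x02,
        INSERT_TO_IBUF     = 0x10,  // models BTR_INSERT
        ALREADY_S_LATCHED  = 0x20,  // models BTR_ALREADY_S_LATCHED
        IGNORE_SEC_UNIQUE  = 0x40   // models BTR_IGNORE_SEC_UNIQUE
    };

    int main()
    {
        unsigned search_mode = MODIFY_LEAF | INSERT_TO_IBUF;
        search_mode |= ALREADY_S_LATCHED;   // index lock already held

        // Re-position for the insert: strip the hint flags but keep the
        // latch-related bits, as done above with
        // search_mode & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE).
        unsigned reposition
            = search_mode & ~(INSERT_TO_IBUF | IGNORE_SEC_UNIQUE);

        std::printf("search=%#x reposition=%#x\n",
                    search_mode, reposition);
        return 0;
    }
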
The duplicate will be reported + by ALTER TABLE or CREATE UNIQUE INDEX. + Unfortunately we cannot report the + duplicate key value to the DDL thread, + because the altered_table object is + private to its call stack. */ + err = DB_SUCCESS; + } + /* fall through */ + default: + return(err); + } + + if (row_ins_sec_mtr_start_and_check_if_aborted( + &mtr, index, check, search_mode)) { + goto func_exit; + } + + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion. */ + + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes) + ? BTR_SEARCH_LEAF + : (btr_latch_mode) + (search_mode & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE)), + &cursor, 0, __FILE__, __LINE__, &mtr); + } + + if (row_ins_must_modify_rec(&cursor)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + offsets = rec_get_offsets( + btr_cur_get_rec(&cursor), index, offsets, + ULINT_UNDEFINED, &offsets_heap); + + err = row_ins_sec_index_entry_by_modify( + flags, mode, &cursor, &offsets, + offsets_heap, heap, entry, thr, &mtr); + } else { + rec_t* insert_rec; + big_rec_t* big_rec; + + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert( + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + goto func_exit; + } + + err = btr_cur_optimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + } + } + + if (err == DB_SUCCESS && trx_id) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + + ut_ad(!big_rec); + } + +func_exit: + mtr_commit(&mtr); + return(err); +} + +/***************************************************************//** +Tries to insert the externally stored fields (off-page columns) +of a clustered index entry. 
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +row_ins_index_entry_big_rec_func( +/*=============================*/ + const dtuple_t* entry, /*!< in/out: index entry to insert */ + const big_rec_t* big_rec,/*!< in: externally stored fields */ + ulint* offsets,/*!< in/out: rec offsets */ + mem_heap_t** heap, /*!< in/out: memory heap */ + dict_index_t* index, /*!< in: index */ + const char* file, /*!< in: file name of caller */ +#ifndef DBUG_OFF + const void* thd, /*!< in: connection, or NULL */ +#endif /* DBUG_OFF */ + ulint line) /*!< in: line number of caller */ +{ + mtr_t mtr; + btr_cur_t cursor; + rec_t* rec; + dberr_t error; + + ut_ad(dict_index_is_clust(index)); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch"); + + mtr_start(&mtr); + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + file, line, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, heap); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern"); + error = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr, BTR_STORE_INSERT); + DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern"); + + if (error == DB_SUCCESS + && dict_index_is_online_ddl(index)) { + row_log_table_insert(rec, index, offsets); + } + + mtr_commit(&mtr); + + return(error); +} + +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + dberr_t err; + ulint n_uniq; + + if (!index->table->foreign_set.empty()) { + err = row_ins_check_foreign_constraints( + index->table, index, entry, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } + + n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0; + + /* Try first optimistic descent to the B-tree */ + + log_free_check(); + + err = row_ins_clust_index_entry_low( + 0, BTR_MODIFY_LEAF, index, n_uniq, entry, n_ext, thr); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_clust_index_entry_leaf"); + } +#endif /* UNIV_DEBUG */ + + if (err != DB_FAIL) { + DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after"); + return(err); + } + + /* Try then pessimistic descent to the B-tree */ + + log_free_check(); + + return(row_ins_clust_index_entry_low( + 0, BTR_MODIFY_TREE, index, n_uniq, entry, n_ext, thr)); +} + +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. 
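row_ins_clust_index_entry() above is an instance of InnoDB's standard two-phase descent: a cheap BTR_MODIFY_LEAF attempt first, and only on DB_FAIL (a page split or other tree reorganization would be required) a retry with BTR_MODIFY_TREE. Any other status, success or a real error such as a lock wait, is returned as-is. A generic sketch of the shape, with illustrative names:

    #include <functional>

    enum class Err { SUCCESS, FAIL, LOCK_WAIT };

    Err insert_two_phase(const std::function<Err()>& optimistic,
                         const std::function<Err()>& pessimistic)
    {
        Err err = optimistic();     // BTR_MODIFY_LEAF: leaf-only latch
        if (err != Err::FAIL) {
            return err;             // done, or a genuine error
        }
        return pessimistic();       // BTR_MODIFY_TREE: may split pages
    }
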
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + + if (!index->table->foreign_set.empty()) { + err = row_ins_check_foreign_constraints(index->table, index, + entry, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } + + ut_ad(thr_get_trx(thr)->id); + + offsets_heap = mem_heap_create(1024); + heap = mem_heap_create(1024); + + /* Try first optimistic descent to the B-tree */ + + log_free_check(); + + err = row_ins_sec_index_entry_low( + 0, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry, 0, thr); + if (err == DB_FAIL) { + mem_heap_empty(heap); + + /* Try then pessimistic descent to the B-tree */ + + log_free_check(); + + err = row_ins_sec_index_entry_low( + 0, BTR_MODIFY_TREE, index, + offsets_heap, heap, entry, 0, thr); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + return(err); +} + +/***************************************************************//** +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +static +dberr_t +row_ins_index_entry( +/*================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + DBUG_EXECUTE_IF("row_ins_index_entry_timeout", { + DBUG_SET("-d,row_ins_index_entry_timeout"); + return(DB_LOCK_WAIT);}); + + if (dict_index_is_clust(index)) { + return(row_ins_clust_index_entry(index, entry, thr, 0)); + } else { + return(row_ins_sec_index_entry(index, entry, thr)); + } +} + +/***********************************************************//** +Sets the values of the dtuple fields in entry from the values of appropriate +columns in row. */ +static __attribute__((nonnull)) +void +row_ins_index_entry_set_vals( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to make */ + const dtuple_t* row) /*!< in: row */ +{ + ulint n_fields; + ulint i; + + n_fields = dtuple_get_n_fields(entry); + + for (i = 0; i < n_fields; i++) { + dict_field_t* ind_field; + dfield_t* field; + const dfield_t* row_field; + ulint len; + + field = dtuple_get_nth_field(entry, i); + ind_field = dict_index_get_nth_field(index, i); + row_field = dtuple_get_nth_field(row, ind_field->col->ind); + len = dfield_get_len(row_field); + + /* Check column prefix indexes */ + if (ind_field->prefix_len > 0 + && dfield_get_len(row_field) != UNIV_SQL_NULL) { + + const dict_col_t* col + = dict_field_get_col(ind_field); + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminmaxlen, + ind_field->prefix_len, + len, + static_cast<const char*>( + dfield_get_data(row_field))); + + ut_ad(!dfield_is_ext(row_field)); + } + + dfield_set_data(field, dfield_get_data(row_field), len); + if (dfield_is_ext(row_field)) { + ut_ad(dict_index_is_clust(index)); + dfield_set_ext(field); + } + } +} + +/***********************************************************//** +Inserts a single index entry to the table. 
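In row_ins_index_entry_set_vals() above, prefix columns are cut with dtype_get_at_most_n_mbchars(), which counts characters rather than bytes so that a multi-byte character is never split. A simplified stand-in for the UTF-8 case (an assumption: the real function is charset-generic and the input here must be valid UTF-8):

    #include <cstddef>

    // Return the byte length of at most n_chars UTF-8 characters from
    // s, never splitting a multi-byte sequence.
    std::size_t at_most_n_utf8_chars(const char* s, std::size_t len,
                                     std::size_t n_chars)
    {
        std::size_t i = 0, chars = 0;
        while (i < len && chars < n_chars) {
            unsigned char b = static_cast<unsigned char>(s[i]);
            std::size_t seq = (b < 0x80)      ? 1
                            : (b >> 5) == 0x6 ? 2
                            : (b >> 4) == 0xE ? 3 : 4;
            if (i + seq > len) break;   // truncated sequence: stop
            i += seq;
            chars++;
        }
        return i;
    }
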
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_index_entry_step( +/*=====================*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + ut_ad(dtuple_check_typed(node->row)); + + row_ins_index_entry_set_vals(node->index, node->entry, node->row); + + ut_ad(dtuple_check_typed(node->entry)); + + err = row_ins_index_entry(node->index, node->entry, thr); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_index_entry_step"); + } +#endif /* UNIV_DEBUG */ + + return(err); +} + +/***********************************************************//** +Allocates a row id for row and inits the node->index field. */ +UNIV_INLINE +void +row_ins_alloc_row_id_step( +/*======================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + row_id_t row_id; + + ut_ad(node->state == INS_NODE_ALLOC_ROW_ID); + + if (dict_index_is_unique(dict_table_get_first_index(node->table))) { + + /* No row id is stored if the clustered index is unique */ + + return; + } + + /* Fill in row id value to row */ + + row_id = dict_sys_get_new_row_id(); + + dict_sys_write_row_id(node->row_id_buf, row_id); +} + +/***********************************************************//** +Gets a row to insert from the values list. */ +UNIV_INLINE +void +row_ins_get_row_from_values( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->values_list; + + while (list_node) { + eval_exp(list_node); + + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Gets a row to insert from the select list. */ +UNIV_INLINE +void +row_ins_get_row_from_select( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->select->select_list; + + while (list_node) { + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Inserts a row to a table. 
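row_ins_alloc_row_id_step() above draws a global row id with dict_sys_get_new_row_id() and serializes it with dict_sys_write_row_id(); InnoDB row ids occupy 6 bytes (DATA_ROW_ID_LEN) stored big-endian. A minimal sketch of that serialization (names are illustrative):

    #include <cstdint>

    typedef unsigned char byte;

    // Write the low 48 bits of row_id big-endian into a 6-byte buffer,
    // as dict_sys_write_row_id() does via mach_write_to_6().
    void write_row_id(byte* buf, std::uint64_t row_id)
    {
        for (int i = 5; i >= 0; i--) {
            buf[i] = (byte)(row_id & 0xff);
            row_id >>= 8;
        }
    }
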
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins( +/*====*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + if (node->state == INS_NODE_ALLOC_ROW_ID) { + + row_ins_alloc_row_id_step(node); + + node->index = dict_table_get_first_index(node->table); + node->entry = UT_LIST_GET_FIRST(node->entry_list); + + if (node->ins_type == INS_SEARCHED) { + + row_ins_get_row_from_select(node); + + } else if (node->ins_type == INS_VALUES) { + + row_ins_get_row_from_values(node); + } + + node->state = INS_NODE_INSERT_ENTRIES; + } + + ut_ad(node->state == INS_NODE_INSERT_ENTRIES); + + while (node->index != NULL) { + if (node->index->type != DICT_FTS) { + err = row_ins_index_entry_step(node, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + } + + node->index = dict_table_get_next_index(node->index); + node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + + DBUG_EXECUTE_IF( + "row_ins_skip_sec", + node->index = NULL; node->entry = NULL; break;); + + /* Skip corrupted secondary index and its entry */ + while (node->index && dict_index_is_corrupted(node->index)) { + + node->index = dict_table_get_next_index(node->index); + node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + } + } + + ut_ad(node->entry == NULL); + + node->state = INS_NODE_ALLOC_ROW_ID; + + return(DB_SUCCESS); +} + +/***********************************************************//** +Inserts a row to a table. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_ins_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ins_node_t* node; + que_node_t* parent; + sel_node_t* sel_node; + trx_t* trx; + dberr_t err; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started_xa(trx); + + node = static_cast<ins_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); + + parent = que_node_get_parent(node); + sel_node = node->select; + + if (thr->prev_node == parent) { + node->state = INS_NODE_SET_IX_LOCK; + } + + /* If this is the first time this node is executed (or when + execution resumes after wait for the table IX lock), set an + IX lock on the table and reset the possible select node. MySQL's + partitioned table code may also call an insert within the same + SQL statement AFTER it has used this table handle to do a search. + This happens, for example, when a row update moves it to another + partition. In that case, we have already set the IX lock on the + table during the search operation, and there is no need to set + it again here. But we must write trx->id to node->trx_id_buf. 
*/ + + trx_write_trx_id(node->trx_id_buf, trx->id); + + if (node->state == INS_NODE_SET_IX_LOCK) { + + node->state = INS_NODE_ALLOC_ROW_ID; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + if (trx->id == node->trx_id) { + /* No need to do IX-locking */ + + goto same_trx; + } + + err = lock_table(0, node->table, LOCK_IX, thr); + + DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait", + err = DB_LOCK_WAIT;); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + + node->trx_id = trx->id; +same_trx: + if (node->ins_type == INS_SEARCHED) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to insert */ + + thr->run_node = sel_node; + + return(thr); + } + } + + if ((node->ins_type == INS_SEARCHED) + && (sel_node->state != SEL_NODE_FETCH)) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to insert */ + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_ins(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* err == DB_LOCK_WAIT or SQL error detected */ + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->ins_type == INS_SEARCHED) { + /* Fetch a row to insert */ + + thr->run_node = sel_node; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/storage/xtradb/row/row0log.cc b/storage/xtradb/row/row0log.cc new file mode 100644 index 00000000000..1240cf7fcc5 --- /dev/null +++ b/storage/xtradb/row/row0log.cc @@ -0,0 +1,3634 @@ +/***************************************************************************** + +Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0log.cc +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#include "row0log.h" + +#ifdef UNIV_NONINL +#include "row0log.ic" +#endif + +#include "row0row.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0merge.h" +#include "row0ext.h" +#include "data0data.h" +#include "que0que.h" +#include "handler0alter.h" + +#include<map> + +/** Table row modification operations during online table rebuild. +Delete-marked records are not copied to the rebuilt table. 
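The node->trx_id bookkeeping in row_ins_step() above makes the table IX lock idempotent per transaction: if the same statement re-enters the insert node (as MySQL's partitioning code can after a search on the same handle), lock_table() is skipped. A small model of the pattern, with the lock call abstracted into a callback; all names here are illustrative:

    #include <cstdint>

    struct InsNode { std::uint64_t trx_id = 0; };  // last locker

    template <class LockFn>
    bool ensure_table_ix_lock(InsNode& node, std::uint64_t trx_id,
                              LockFn acquire)
    {
        if (node.trx_id == trx_id) {
            return true;        // already IX-locked in this transaction
        }
        if (!acquire()) {
            return false;       // e.g. a lock wait
        }
        node.trx_id = trx_id;   // record only after a successful lock
        return true;
    }
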
*/ +enum row_tab_op { + /** Insert a record */ + ROW_T_INSERT = 0x41, + /** Update a record in place */ + ROW_T_UPDATE, + /** Delete (purge) a record */ + ROW_T_DELETE +}; + +/** Index record modification operations during online index creation */ +enum row_op { + /** Insert a record */ + ROW_OP_INSERT = 0x61, + /** Delete a record */ + ROW_OP_DELETE +}; + +#ifdef UNIV_DEBUG +/** Write information about the applied record to the error log */ +# define ROW_LOG_APPLY_PRINT +#endif /* UNIV_DEBUG */ + +#ifdef ROW_LOG_APPLY_PRINT +/** When set, write information about the applied record to the error log */ +static bool row_log_apply_print; +#endif /* ROW_LOG_APPLY_PRINT */ + +/** Size of the modification log entry header, in bytes */ +#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/ + +/** Log block for modifications during online ALTER TABLE */ +struct row_log_buf_t { + byte* block; /*!< file block buffer */ + mrec_buf_t buf; /*!< buffer for accessing a record + that spans two blocks */ + ulint blocks; /*!< current position in blocks */ + ulint bytes; /*!< current position within block */ + ulonglong total; /*!< logical position, in bytes from + the start of the row_log_table log; + 0 for row_log_online_op() and + row_log_apply(). */ + ulint size; /*!< allocated size of block */ +}; + +/** Tracks BLOB allocation during online ALTER TABLE */ +class row_log_table_blob_t { +public: + /** Constructor (declaring a BLOB freed) + @param offset_arg row_log_t::tail::total */ +#ifdef UNIV_DEBUG + row_log_table_blob_t(ulonglong offset_arg) : + old_offset (0), free_offset (offset_arg), + offset (BLOB_FREED) {} +#else /* UNIV_DEBUG */ + row_log_table_blob_t() : + offset (BLOB_FREED) {} +#endif /* UNIV_DEBUG */ + + /** Declare a BLOB freed again. + @param offset_arg row_log_t::tail::total */ +#ifdef UNIV_DEBUG + void blob_free(ulonglong offset_arg) +#else /* UNIV_DEBUG */ + void blob_free() +#endif /* UNIV_DEBUG */ + { + ut_ad(offset < offset_arg); + ut_ad(offset != BLOB_FREED); + ut_d(old_offset = offset); + ut_d(free_offset = offset_arg); + offset = BLOB_FREED; + } + /** Declare a freed BLOB reused. + @param offset_arg row_log_t::tail::total */ + void blob_alloc(ulonglong offset_arg) { + ut_ad(free_offset <= offset_arg); + ut_d(old_offset = offset); + offset = offset_arg; + } + /** Determine if a BLOB was freed at a given log position + @param offset_arg row_log_t::head::total after the log record + @return true if freed */ + bool is_freed(ulonglong offset_arg) const { + /* This is supposed to be the offset at the end of the + current log record. */ + ut_ad(offset_arg > 0); + /* We should never get anywhere close the magic value. */ + ut_ad(offset_arg < BLOB_FREED); + return(offset_arg < offset); + } +private: + /** Magic value for a freed BLOB */ + static const ulonglong BLOB_FREED = ~0ULL; +#ifdef UNIV_DEBUG + /** Old offset, in case a page was freed, reused, freed, ... */ + ulonglong old_offset; + /** Offset of last blob_free() */ + ulonglong free_offset; +#endif /* UNIV_DEBUG */ + /** Byte offset to the log file */ + ulonglong offset; +}; + +/** @brief Map of off-page column page numbers to 0 or log byte offsets. + +If there is no mapping for a page number, it is safe to access. +If a page number maps to 0, it is an off-page column that has been freed. +If a page number maps to a nonzero number, the number is a byte offset +into the index->online_log, indicating that the page is safe to access +when applying log records starting from that offset. 
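The row_log_buf_t fields above split one logical write position into three coordinates: total is the byte offset from the start of the log, blocks counts full blocks already flushed to the temporary file, and bytes is the fill level of the in-memory tail block. Assuming a block size (the real one is srv_sort_buf_size), the relationship is plain arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const std::uint64_t sort_buf_size = 1u << 20;  // assumed 1 MiB
        std::uint64_t total = 5 * sort_buf_size + 1234;

        std::uint64_t blocks = total / sort_buf_size;  // flushed so far
        std::uint64_t bytes  = total % sort_buf_size;  // in tail.block

        // File offset of the block being filled, as computed in
        // row_log_online_op(): (os_offset_t) blocks * srv_sort_buf_size.
        std::uint64_t byte_offset = blocks * sort_buf_size;

        std::printf("blocks=%llu bytes=%llu file_offset=%llu\n",
                    (unsigned long long) blocks,
                    (unsigned long long) bytes,
                    (unsigned long long) byte_offset);
        return 0;
    }
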
*/ +typedef std::map<ulint, row_log_table_blob_t> page_no_map; + +/** @brief Buffer for logging modifications during online index creation + +All modifications to an index that is being created will be logged by +row_log_online_op() to this buffer. + +All modifications to a table that is being rebuilt will be logged by +row_log_table_delete(), row_log_table_update(), row_log_table_insert() +to this buffer. + +When head.blocks == tail.blocks, the reader will access tail.block +directly. When also head.bytes == tail.bytes, both counts will be +reset to 0 and the file will be truncated. */ +struct row_log_t { + int fd; /*!< file descriptor */ + ib_mutex_t mutex; /*!< mutex protecting error, + max_trx and tail */ + page_no_map* blobs; /*!< map of page numbers of off-page columns + that have been freed during table-rebuilding + ALTER TABLE (row_log_table_*); protected by + index->lock X-latch only */ + dict_table_t* table; /*!< table that is being rebuilt, + or NULL when this is a secondary + index that is being created online */ + bool same_pk;/*!< whether the definition of the PRIMARY KEY + has remained the same */ + const dtuple_t* add_cols; + /*!< default values of added columns, or NULL */ + const ulint* col_map;/*!< mapping of old column numbers to + new ones, or NULL if !table */ + dberr_t error; /*!< error that occurred during online + table rebuild */ + trx_id_t max_trx;/*!< biggest observed trx_id in + row_log_online_op(); + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t tail; /*!< writer context; + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t head; /*!< reader context; protected by MDL only; + modifiable by row_log_apply_ops() */ +}; + + +/** Allocate the memory for the log buffer. +@param[in,out] log_buf Buffer used for log operation +@return TRUE if success, false if not */ +static __attribute__((warn_unused_result)) +bool +row_log_block_allocate( + row_log_buf_t& log_buf) +{ + DBUG_ENTER("row_log_block_allocate"); + if (log_buf.block == NULL) { + log_buf.size = srv_sort_buf_size; + log_buf.block = (byte*) os_mem_alloc_large(&log_buf.size, + FALSE); + DBUG_EXECUTE_IF("simulate_row_log_allocation_failure", + if (log_buf.block) + os_mem_free_large(log_buf.block, log_buf.size); + log_buf.block = NULL;); + if (!log_buf.block) { + DBUG_RETURN(false); + } + } + DBUG_RETURN(true); +} + +/** Free the log buffer. +@param[in,out] log_buf Buffer used for log operation */ +static +void +row_log_block_free( + row_log_buf_t& log_buf) +{ + DBUG_ENTER("row_log_block_free"); + if (log_buf.block != NULL) { + os_mem_free_large(log_buf.block, log_buf.size); + log_buf.block = NULL; + } + DBUG_VOID_RETURN; +} + +/******************************************************//** +Logs an operation to a secondary index that is (or was) being created. 
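The page_no_map described above answers one question for the log reader: given a BLOB start page and the reader's current log offset, may the BLOB be dereferenced, or was the page freed (and possibly reused) in the meantime? A compact model with the same accessibility rule (a sketch, not the real row_log_table_blob_t):

    #include <cstdint>
    #include <map>

    class BlobTracker {
        // For each BLOB start page: log offset of its last allocation,
        // or FREED if it is currently freed.
        static const std::uint64_t FREED = ~0ULL;
        std::map<std::uint32_t, std::uint64_t> pages_;
    public:
        void on_free(std::uint32_t page_no)
        { pages_[page_no] = FREED; }

        void on_alloc(std::uint32_t page_no, std::uint64_t log_offset)
        { pages_[page_no] = log_offset; }

        // A reader at log position `pos` may follow the BLOB pointer
        // only if the page was allocated at or before `pos`.
        bool is_accessible(std::uint32_t page_no,
                           std::uint64_t pos) const
        {
            std::map<std::uint32_t, std::uint64_t>::const_iterator it
                = pages_.find(page_no);
            if (it == pages_.end()) return true;  // never freed: safe
            return pos >= it->second;  // FREED compares greater than pos
        }
    };
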
*/ +UNIV_INTERN +void +row_log_online_op( +/*==============*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ +{ + byte* b; + ulint extra_size; + ulint size; + ulint mrec_size; + ulint avail_size; + row_log_t* log; + + ut_ad(dtuple_validate(tuple)); + ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index)) { + return; + } + + ut_ad(dict_index_is_online_ddl(index)); + + /* Compute the size of the record. This differs from + row_merge_buf_encode(), because here we do not encode + extra_size+1 (and reserve 0 as the end-of-chunk marker). */ + + size = rec_get_converted_size_temp( + index, tuple->fields, tuple->n_fields, &extra_size); + ut_ad(size >= extra_size); + ut_ad(size <= sizeof log->tail.buf); + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + size + + (trx_id ? DATA_TRX_ID_LEN : 0); + + log = index->online_log; + mutex_enter(&log->mutex); + + if (trx_id > log->max_trx) { + log->max_trx = trx_id; + } + + if (!row_log_block_allocate(log->tail)) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + + ut_ad(log->tail.bytes < srv_sort_buf_size); + avail_size = srv_sort_buf_size - log->tail.bytes; + + if (mrec_size > avail_size) { + b = log->tail.buf; + } else { + b = log->tail.block + log->tail.bytes; + } + + if (trx_id != 0) { + *b++ = ROW_OP_INSERT; + trx_write_trx_id(b, trx_id); + b += DATA_TRX_ID_LEN; + } else { + *b++ = ROW_OP_DELETE; + } + + if (extra_size < 0x80) { + *b++ = (byte) extra_size; + } else { + ut_ad(extra_size < 0x8000); + *b++ = (byte) (0x80 | (extra_size >> 8)); + *b++ = (byte) extra_size; + } + + rec_convert_dtuple_to_temp( + b + extra_size, index, tuple->fields, tuple->n_fields); + b += size; + + if (mrec_size >= avail_size) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + ibool ret; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (mrec_size == avail_size) { + ut_ad(b == &log->tail.block[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + mrec_size); + memcpy(log->tail.block + log->tail.bytes, + log->tail.buf, avail_size); + } + UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size); + ret = os_file_write( + "(modification log)", + OS_FILE_FROM_FD(log->fd), + log->tail.block, byte_offset, srv_sort_buf_size); + log->tail.blocks++; + if (!ret) { +write_failed: + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. */ + index->type |= DICT_CORRUPT; + } + UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail_size, + mrec_size - avail_size); + log->tail.bytes = mrec_size - avail_size; + } else { + log->tail.bytes += mrec_size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); +err_exit: + mutex_exit(&log->mutex); +} + +/******************************************************//** +Gets the error status of the online index rebuild log. 
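row_log_online_op() above encodes extra_size in one byte when it is below 0x80, and otherwise in two bytes with the high bit of the first byte set, which bounds it at 0x7fff. The same encoding and its inverse, extracted into standalone helpers:

    #include <cassert>
    #include <cstddef>

    typedef unsigned char byte;

    byte* write_extra_size(byte* b, std::size_t extra_size)
    {
        if (extra_size < 0x80) {
            *b++ = (byte) extra_size;
        } else {
            assert(extra_size < 0x8000);
            *b++ = (byte) (0x80 | (extra_size >> 8));
            *b++ = (byte) extra_size;
        }
        return b;
    }

    const byte* read_extra_size(const byte* b, std::size_t* extra_size)
    {
        if (*b < 0x80) {
            *extra_size = *b++;
        } else {
            *extra_size = ((std::size_t)(b[0] & 0x7f) << 8) | b[1];
            b += 2;
        }
        return b;
    }
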
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + return(index->online_log->error); +} + +/******************************************************//** +Starts logging an operation to a table that is being rebuilt. +@return pointer to log, or NULL if no logging is necessary */ +static __attribute__((nonnull, warn_unused_result)) +byte* +row_log_table_open( +/*===============*/ + row_log_t* log, /*!< in/out: online rebuild log */ + ulint size, /*!< in: size of log record */ + ulint* avail) /*!< out: available size for log record */ +{ + mutex_enter(&log->mutex); + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + + if (log->error != DB_SUCCESS) { +err_exit: + mutex_exit(&log->mutex); + return(NULL); + } + + if (!row_log_block_allocate(log->tail)) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + ut_ad(log->tail.bytes < srv_sort_buf_size); + *avail = srv_sort_buf_size - log->tail.bytes; + + if (size > *avail) { + return(log->tail.buf); + } else { + return(log->tail.block + log->tail.bytes); + } +} + +/******************************************************//** +Stops logging an operation to a table that is being rebuilt. */ +static __attribute__((nonnull)) +void +row_log_table_close_func( +/*=====================*/ + row_log_t* log, /*!< in/out: online rebuild log */ +#ifdef UNIV_DEBUG + const byte* b, /*!< in: end of log record */ +#endif /* UNIV_DEBUG */ + ulint size, /*!< in: size of log record */ + ulint avail) /*!< in: available size for log record */ +{ + ut_ad(mutex_own(&log->mutex)); + + if (size >= avail) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + ibool ret; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (size == avail) { + ut_ad(b == &log->tail.block[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + size); + memcpy(log->tail.block + log->tail.bytes, + log->tail.buf, avail); + } + UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size); + ret = os_file_write( + "(modification log)", + OS_FILE_FROM_FD(log->fd), + log->tail.block, byte_offset, srv_sort_buf_size); + log->tail.blocks++; + if (!ret) { +write_failed: + log->error = DB_ONLINE_LOG_TOO_BIG; + } + UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail, size - avail); + log->tail.bytes = size - avail; + } else { + log->tail.bytes += size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + log->tail.total += size; + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + mutex_exit(&log->mutex); +} + +#ifdef UNIV_DEBUG +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(log, b, size, avail) +#else /* UNIV_DEBUG */ +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(log, size, avail) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). 
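row_log_table_open() and row_log_table_close_func() above implement a spill protocol: a record that still fits in the current block is written in place; one that does not is staged in a side buffer, its first avail bytes complete the block (which is then flushed to the log file), and the remainder starts the next block. A toy version of the same protocol (names and the flush callback are illustrative):

    #include <cstddef>
    #include <cstring>
    #include <vector>

    typedef unsigned char byte;

    struct LogTail {
        std::vector<byte> block;     // fixed size: srv_sort_buf_size
        std::vector<byte> side_buf;  // models mrec_buf_t
        std::size_t       bytes;     // fill level of `block`
        LogTail(std::size_t block_size) : block(block_size), bytes(0) {}
    };

    byte* open_record(LogTail& t, std::size_t size, std::size_t* avail)
    {
        *avail = t.block.size() - t.bytes;
        if (size > *avail) {
            t.side_buf.assign(size, 0);
            return &t.side_buf[0];   // caller stages the record here
        }
        return &t.block[t.bytes];    // record fits: write in place
    }

    template <class FlushFn>
    void close_record(LogTail& t, std::size_t size, std::size_t avail,
                      FlushFn flush)
    {
        if (size >= avail) {         // the block is now full
            if (size > avail) {      // staged record: copy first part
                std::memcpy(&t.block[t.bytes], &t.side_buf[0], avail);
            }
            flush(&t.block[0], t.block.size());
            std::size_t rest = size - avail;
            if (rest) {              // remainder begins the next block
                std::memcpy(&t.block[0], &t.side_buf[avail], rest);
            }
            t.bytes = rest;
        } else {
            t.bytes += size;
        }
    }
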
*/ +UNIV_INTERN +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should + be logged, or NULL to use those in rec */ +{ + ulint old_pk_extra_size; + ulint old_pk_size; + ulint ext_size = 0; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + const dtuple_t* old_pk; + row_ext_t* ext; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index) + || !dict_index_is_online_ddl(index) + || index->online_log->error != DB_SUCCESS) { + return; + } + + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index(new_table); + + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + + /* Create the tuple PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in new_table. */ + if (index->online_log->same_pk) { + dtuple_t* tuple; + ut_ad(new_index->n_uniq == index->n_uniq); + + /* The PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR are in the first + fields of the record. */ + heap = mem_heap_create( + DATA_TRX_ID_LEN + + DTUPLE_EST_ALLOC(new_index->n_uniq + 2)); + old_pk = tuple = dtuple_create(heap, new_index->n_uniq + 2); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_index->n_uniq); + + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + ulint len; + const void* field = rec_get_nth_field( + rec, offsets, i, &len); + dfield_t* dfield = dtuple_get_nth_field( + tuple, i); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(!rec_offs_nth_extern(offsets, i)); + dfield_set_data(dfield, field, len); + } + + if (sys) { + dfield_set_data( + dtuple_get_nth_field(tuple, + new_index->n_uniq), + sys, DATA_TRX_ID_LEN); + dfield_set_data( + dtuple_get_nth_field(tuple, + new_index->n_uniq + 1), + sys + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN); + } + } else { + /* The PRIMARY KEY has changed. Translate the tuple. */ + old_pk = row_log_table_get_pk( + rec, index, offsets, NULL, &heap); + + if (!old_pk) { + ut_ad(index->online_log->error != DB_SUCCESS); + if (heap) { + goto func_exit; + } + return; + } + } + + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + + mrec_size = 4 + old_pk_size; + + /* Log enough prefix of the BLOB unless both the + old and new table are in COMPACT or REDUNDANT format, + which store the prefix in the clustered index record. */ + if (rec_offs_any_extern(offsets) + && (dict_table_get_format(index->table) >= UNIV_FORMAT_B + || dict_table_get_format(new_table) >= UNIV_FORMAT_B)) { + + /* Build a cache of those off-page column prefixes + that are referenced by secondary indexes. It can be + that none of the off-page columns are needed. 
*/ + row_build(ROW_COPY_DATA, index, rec, + offsets, NULL, NULL, NULL, &ext, heap); + if (ext) { + /* Log the row_ext_t, ext->ext and ext->buf */ + ext_size = ext->n_ext * ext->max_len + + sizeof(*ext) + + ext->n_ext * sizeof(ulint) + + (ext->n_ext - 1) * sizeof ext->len; + mrec_size += ext_size; + } + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = ROW_T_DELETE; + *b++ = static_cast<byte>(old_pk_extra_size); + + /* Log the size of external prefix we saved */ + mach_write_to_2(b, ext_size); + b += 2; + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + + b += old_pk_size; + + if (ext_size) { + ulint cur_ext_size = sizeof(*ext) + + (ext->n_ext - 1) * sizeof ext->len; + + memcpy(b, ext, cur_ext_size); + b += cur_ext_size; + + /* Check if we need to col_map to adjust the column + number. If columns were added/removed/reordered, + adjust the column number. */ + if (const ulint* col_map = + index->online_log->col_map) { + for (ulint i = 0; i < ext->n_ext; i++) { + const_cast<ulint&>(ext->ext[i]) = + col_map[ext->ext[i]]; + } + } + + memcpy(b, ext->ext, ext->n_ext * sizeof(*ext->ext)); + b += ext->n_ext * sizeof(*ext->ext); + + ext_size -= cur_ext_size + + ext->n_ext * sizeof(*ext->ext); + memcpy(b, ext->buf, ext_size); + b += ext_size; + } + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } + +func_exit: + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. */ +static +void +row_log_table_low_redundant( +/*========================*/ + const rec_t* rec, /*!< in: clustered index leaf + page record in ROW_FORMAT=REDUNDANT, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + bool insert, /*!< in: true if insert, + false if update */ + const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value + (if !insert and a PRIMARY KEY + is being created) */ + const dict_index_t* new_index) + /*!< in: clustered index of the + new table, not latched */ +{ + ulint old_pk_size; + ulint old_pk_extra_size; + ulint size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + dtuple_t* tuple; + + ut_ad(!page_is_comp(page_align(rec))); + ut_ad(dict_index_get_n_fields(index) == rec_get_n_fields_old(rec)); + ut_ad(dict_tf_is_valid(index->table->flags)); + ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */ + ut_ad(dict_index_is_clust(new_index)); + + heap = mem_heap_create(DTUPLE_EST_ALLOC(index->n_fields)); + tuple = dtuple_create(heap, index->n_fields); + dict_index_copy_types(tuple, index, index->n_fields); + dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index)); + + if (rec_get_1byte_offs_flag(rec)) { + for (ulint i = 0; i < index->n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + } else { + for (ulint i = 0; i < index->n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_2_is_field_extern(rec, i)) { + dfield_set_ext(dfield); + } + } + } + + size = rec_get_converted_size_temp( + index, tuple->fields, tuple->n_fields, &extra_size); + + mrec_size = ROW_LOG_HEADER_SIZE + size + 
(extra_size >= 0x80); + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + rec_convert_dtuple_to_temp( + b + extra_size, index, tuple->fields, tuple->n_fields); + b += size; + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } + + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. */ +static __attribute__((nonnull(1,2,3))) +void +row_log_table_low( +/*==============*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + bool insert, /*!< in: true if insert, false if update */ + const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert + and a PRIMARY KEY is being created) */ +{ + ulint omit_size; + ulint old_pk_size; + ulint old_pk_extra_size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + const dict_index_t* new_index = dict_table_get_first_index( + index->online_log->table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); + ut_ad(page_is_leaf(page_align(rec))); + ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets)); + + if (dict_index_is_corrupted(index) + || !dict_index_is_online_ddl(index) + || index->online_log->error != DB_SUCCESS) { + return; + } + + if (!rec_offs_comp(offsets)) { + row_log_table_low_redundant( + rec, index, insert, old_pk, new_index); + return; + } + + ut_ad(page_is_comp(page_align(rec))); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + + omit_size = REC_N_NEW_EXTRA_BYTES; + + extra_size = rec_offs_extra_size(offsets) - omit_size; + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size; + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + 
ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + memcpy(b, rec - rec_offs_extra_size(offsets), extra_size); + b += extra_size; + memcpy(b, rec, rec_offs_data_size(offsets)); + b += rec_offs_data_size(offsets); + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } +} + +/******************************************************//** +Logs an update to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). */ +UNIV_INTERN +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk) /*!< in: row_log_table_get_pk() + before the update */ +{ + row_log_table_low(rec, index, offsets, false, old_pk); +} + +/** Gets the old table column of a PRIMARY KEY column. +@param table old table (before ALTER TABLE) +@param col_map mapping of old column numbers to new ones +@param col_no column position in the new table +@return old table column, or NULL if this is an added column */ +static +const dict_col_t* +row_log_table_get_pk_old_col( +/*=========================*/ + const dict_table_t* table, + const ulint* col_map, + ulint col_no) +{ + for (ulint i = 0; i < table->n_cols; i++) { + if (col_no == col_map[i]) { + return(dict_table_get_nth_col(table, i)); + } + } + + return(NULL); +} + +/** Maps an old table column of a PRIMARY KEY column. 
+@param col old table column (before ALTER TABLE) +@param ifield clustered index field in the new table (after ALTER TABLE) +@param dfield clustered index tuple field in the new table +@param heap memory heap for allocating dfield contents +@param rec clustered index leaf page record in the old table +@param offsets rec_get_offsets(rec) +@param i rec field corresponding to col +@param zip_size compressed page size of the old table, or 0 for uncompressed +@param max_len maximum length of dfield +@retval DB_INVALID_NULL if a NULL value is encountered +@retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */ +static +dberr_t +row_log_table_get_pk_col( +/*=====================*/ + const dict_col_t* col, + const dict_field_t* ifield, + dfield_t* dfield, + mem_heap_t* heap, + const rec_t* rec, + const ulint* offsets, + ulint i, + ulint zip_size, + ulint max_len) +{ + const byte* field; + ulint len; + + ut_ad(ut_is_2pow(zip_size)); + + field = rec_get_nth_field(rec, offsets, i, &len); + + if (len == UNIV_SQL_NULL) { + return(DB_INVALID_NULL); + } + + if (rec_offs_nth_extern(offsets, i)) { + ulint field_len = ifield->prefix_len; + byte* blob_field; + + if (!field_len) { + field_len = ifield->fixed_len; + if (!field_len) { + field_len = max_len + 1; + } + } + + blob_field = static_cast<byte*>( + mem_heap_alloc(heap, field_len)); + + len = btr_copy_externally_stored_field_prefix( + blob_field, field_len, zip_size, field, len); + if (len >= max_len + 1) { + return(DB_TOO_BIG_INDEX_COL); + } + + dfield_set_data(dfield, blob_field, len); + } else { + dfield_set_data(dfield, mem_heap_dup(heap, field, len), len); + } + + return(DB_SUCCESS); +} + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. +@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +UNIV_INTERN +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for + row_log_table_delete(), or NULL */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ +{ + dtuple_t* tuple = NULL; + row_log_t* log = index->online_log; + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!offsets || rec_offs_validate(rec, index, offsets)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(log); + ut_ad(log->table); + + if (log->same_pk) { + /* The PRIMARY KEY columns are unchanged. */ + if (sys) { + /* Store the DB_TRX_ID,DB_ROLL_PTR. */ + ulint trx_id_offs = index->trx_id_offset; + + if (!trx_id_offs) { + ulint pos = dict_index_get_sys_col_pos( + index, DATA_TRX_ID); + ulint len; + ut_ad(pos > 0); + + if (!offsets) { + offsets = rec_get_offsets( + rec, index, NULL, pos + 1, + heap); + } + + trx_id_offs = rec_get_nth_field_offs( + offsets, pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + } + + memcpy(sys, rec + trx_id_offs, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + + return(NULL); + } + + mutex_enter(&log->mutex); + + /* log->error is protected by log->mutex. 
*/ + if (log->error == DB_SUCCESS) { + dict_table_t* new_table = log->table; + dict_index_t* new_index + = dict_table_get_first_index(new_table); + const ulint new_n_uniq + = dict_index_get_n_unique(new_index); + + if (!*heap) { + ulint size = 0; + + if (!offsets) { + size += (1 + REC_OFFS_HEADER_SIZE + + index->n_fields) + * sizeof *offsets; + } + + for (ulint i = 0; i < new_n_uniq; i++) { + size += dict_col_get_min_size( + dict_index_get_nth_col(new_index, i)); + } + + *heap = mem_heap_create( + DTUPLE_EST_ALLOC(new_n_uniq + 2) + size); + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, NULL, + ULINT_UNDEFINED, heap); + } + + tuple = dtuple_create(*heap, new_n_uniq + 2); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_n_uniq); + + const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table); + const ulint zip_size = dict_table_zip_size(index->table); + + for (ulint new_i = 0; new_i < new_n_uniq; new_i++) { + dict_field_t* ifield; + dfield_t* dfield; + ulint prtype; + ulint mbminmaxlen; + + ifield = dict_index_get_nth_field(new_index, new_i); + dfield = dtuple_get_nth_field(tuple, new_i); + + const ulint col_no + = dict_field_get_col(ifield)->ind; + + if (const dict_col_t* col + = row_log_table_get_pk_old_col( + index->table, log->col_map, col_no)) { + ulint i = dict_col_get_clust_pos(col, index); + + if (i == ULINT_UNDEFINED) { + ut_ad(0); + log->error = DB_CORRUPTION; + goto err_exit; + } + + log->error = row_log_table_get_pk_col( + col, ifield, dfield, *heap, + rec, offsets, i, zip_size, max_len); + + if (log->error != DB_SUCCESS) { +err_exit: + tuple = NULL; + goto func_exit; + } + + mbminmaxlen = col->mbminmaxlen; + prtype = col->prtype; + } else { + /* No matching column was found in the old + table, so this must be an added column. + Copy the default value. */ + ut_ad(log->add_cols); + + dfield_copy(dfield, dtuple_get_nth_field( + log->add_cols, col_no)); + mbminmaxlen = dfield->type.mbminmaxlen; + prtype = dfield->type.prtype; + } + + ut_ad(!dfield_is_ext(dfield)); + ut_ad(!dfield_is_null(dfield)); + + if (ifield->prefix_len) { + ulint len = dtype_get_at_most_n_mbchars( + prtype, mbminmaxlen, + ifield->prefix_len, + dfield_get_len(dfield), + static_cast<const char*>( + dfield_get_data(dfield))); + + ut_ad(len <= dfield_get_len(dfield)); + dfield_set_len(dfield, len); + } + } + + const byte* trx_roll = rec + + row_get_trx_id_offset(index, offsets); + + /* Copy the fields, because the fields will be updated + or the record may be moved somewhere else in the B-tree + as part of the upcoming operation. */ + if (sys) { + memcpy(sys, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + trx_roll = sys; + } else { + trx_roll = static_cast<const byte*>( + mem_heap_dup( + *heap, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); + } + + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq), + trx_roll, DATA_TRX_ID_LEN); + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1), + trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN); + } + +func_exit: + mutex_exit(&log->mutex); + return(tuple); +} + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). 
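Throughout row_log_table_get_pk() above, log->col_map translates old-table column numbers into new-table positions, with ULINT_UNDEFINED marking dropped columns; row_log_table_get_pk_old_col() inverts that mapping by linear scan, and a miss means the column exists only in the new table (its value comes from log->add_cols). The reverse lookup in isolation, as a minimal sketch:

    #include <cstddef>

    static const std::size_t UNDEF = (std::size_t) -1;  // ULINT_UNDEFINED

    // col_map[i] is the position of old column i in the rebuilt table,
    // or UNDEF if the column was dropped. Find the old column that maps
    // to a given new-table position.
    std::size_t old_col_for(const std::size_t* col_map,
                            std::size_t n_old, std::size_t new_col)
    {
        for (std::size_t i = 0; i < n_old; i++) {
            if (col_map[i] == new_col) {
                return i;
            }
        }
        return UNDEF;  // added column: value comes from the defaults
    }
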
*/ +UNIV_INTERN +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets)/*!< in: rec_get_offsets(rec,index) */ +{ + row_log_table_low(rec, index, offsets, true, NULL); +} + +/******************************************************//** +Notes that a BLOB is being freed during online ALTER TABLE. */ +UNIV_INTERN +void +row_log_table_blob_free( +/*====================*/ + dict_index_t* index, /*!< in/out: clustered index, X-latched */ + ulint page_no)/*!< in: starting page number of the BLOB */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(page_no != FIL_NULL); + + if (index->online_log->error != DB_SUCCESS) { + return; + } + + page_no_map* blobs = index->online_log->blobs; + + if (!blobs) { + index->online_log->blobs = blobs = new page_no_map(); + } + +#ifdef UNIV_DEBUG + const ulonglong log_pos = index->online_log->tail.total; +#else +# define log_pos /* empty */ +#endif /* UNIV_DEBUG */ + + const page_no_map::value_type v(page_no, + row_log_table_blob_t(log_pos)); + + std::pair<page_no_map::iterator,bool> p = blobs->insert(v); + + if (!p.second) { + /* Update the existing mapping. */ + ut_ad(p.first->first == page_no); + p.first->second.blob_free(log_pos); + } +#undef log_pos +} + +/******************************************************//** +Notes that a BLOB is being allocated during online ALTER TABLE. */ +UNIV_INTERN +void +row_log_table_blob_alloc( +/*=====================*/ + dict_index_t* index, /*!< in/out: clustered index, X-latched */ + ulint page_no)/*!< in: starting page number of the BLOB */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(page_no != FIL_NULL); + + if (index->online_log->error != DB_SUCCESS) { + return; + } + + /* Only track allocations if the same page has been freed + earlier. Double allocation without a free is not allowed. */ + if (page_no_map* blobs = index->online_log->blobs) { + page_no_map::iterator p = blobs->find(page_no); + + if (p != blobs->end()) { + ut_ad(p->first == page_no); + p->second.blob_alloc(index->online_log->tail.total); + } + } +} + +/******************************************************//** +Converts a log record to a table row. +@return converted row, or NULL if the conversion fails */ +static __attribute__((nonnull, warn_unused_result)) +const dtuple_t* +row_log_table_apply_convert_mrec( +/*=============================*/ + const mrec_t* mrec, /*!< in: merge record */ + dict_index_t* index, /*!< in: index of mrec */ + const ulint* offsets, /*!< in: offsets of mrec */ + const row_log_t* log, /*!< in: rebuild context */ + mem_heap_t* heap, /*!< in/out: memory heap */ + trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ + dberr_t* error) /*!< out: DB_SUCCESS or + DB_MISSING_HISTORY or + reason of failure */ +{ + dtuple_t* row; + + *error = DB_SUCCESS; + + /* This is based on row_build(). 
*/ + if (log->add_cols) { + row = dtuple_copy(log->add_cols, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(log->table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(log->table)); + dict_table_copy_types(row, log->table); + } + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + const dict_col_t* col + = dict_field_get_col(ind_field); + ulint col_no + = log->col_map[dict_col_get_no(col)]; + + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } + + dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + ulint len; + const byte* data; + + if (rec_offs_nth_extern(offsets, i)) { + ut_ad(rec_offs_any_extern(offsets)); + rw_lock_x_lock(dict_index_get_lock(index)); + + if (const page_no_map* blobs = log->blobs) { + data = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + + ulint page_no = mach_read_from_4( + data + len - (BTR_EXTERN_FIELD_REF_SIZE + - BTR_EXTERN_PAGE_NO)); + page_no_map::const_iterator p = blobs->find( + page_no); + if (p != blobs->end() + && p->second.is_freed(log->head.total)) { + /* This BLOB has been freed. + We must not access the row. */ + *error = DB_MISSING_HISTORY; + dfield_set_data(dfield, data, len); + dfield_set_ext(dfield); + goto blob_done; + } + } + + data = btr_rec_copy_externally_stored_field( + mrec, offsets, + dict_table_zip_size(index->table), + i, &len, heap); + ut_a(data); + dfield_set_data(dfield, data, len); +blob_done: + rw_lock_x_unlock(dict_index_get_lock(index)); + } else { + data = rec_get_nth_field(mrec, offsets, i, &len); + dfield_set_data(dfield, data, len); + } + + /* See if any columns were changed to NULL or NOT NULL. */ + const dict_col_t* new_col + = dict_table_get_nth_col(log->table, col_no); + ut_ad(new_col->mtype == col->mtype); + + /* Assert that prtype matches except for nullability. */ + ut_ad(!((new_col->prtype ^ col->prtype) & ~DATA_NOT_NULL)); + ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype) + & ~DATA_NOT_NULL)); + + if (new_col->prtype == col->prtype) { + continue; + } + + if ((new_col->prtype & DATA_NOT_NULL) + && dfield_is_null(dfield)) { + /* We got a NULL value for a NOT NULL column. */ + *error = DB_INVALID_NULL; + return(NULL); + } + + /* Adjust the DATA_NOT_NULL flag in the parsed row. */ + dfield_get_type(dfield)->prtype = new_col->prtype; + + ut_ad(dict_col_type_assert_equal(new_col, + dfield_get_type(dfield))); + } + + return(row); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. 
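+(Illustrative scenario, not taken from this change: for
+ALTER TABLE t ADD PRIMARY KEY(a), a logged row (a=1, b='x') is first
+inserted into the new clustered index; only if that succeeds is it
+also inserted into every non-FTS secondary index of the new table.)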
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert_low( +/*===========================*/ + que_thr_t* thr, /*!< in: query graph */ + const dtuple_t* row, /*!< in: table row + in the old table definition */ + trx_id_t trx_id, /*!< in: trx_id of the row */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup) /*!< in/out: for reporting + duplicate key errors */ +{ + dberr_t error; + dtuple_t* entry; + const row_log_t*log = dup->index->online_log; + dict_index_t* index = dict_table_get_first_index(log->table); + + ut_ad(dtuple_validate(row)); + ut_ad(trx_id); + +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply insert " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + dtuple_print(stderr, row); + } +#endif /* ROW_LOG_APPLY_PRINT */ + + static const ulint flags + = (BTR_CREATE_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG); + + entry = row_build_index_entry(row, NULL, index, heap); + + error = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_TREE, index, index->n_uniq, entry, 0, thr); + + switch (error) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + /* The row had already been copied to the table. */ + return(DB_SUCCESS); + default: + return(error); + } + + do { + if (!(index = dict_table_get_next_index(index))) { + break; + } + + if (index->type & DICT_FTS) { + continue; + } + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + flags, BTR_MODIFY_TREE, + index, offsets_heap, heap, entry, trx_id, thr); + } while (error == DB_SUCCESS); + + return(error); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + const mrec_t* mrec, /*!< in: record to insert */ + const ulint* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + trx_id_t trx_id) /*!< in: DB_TRX_ID of mrec */ +{ + const row_log_t*log = dup->index->online_log; + dberr_t error; + const dtuple_t* row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, trx_id, &error); + + switch (error) { + case DB_MISSING_HISTORY: + ut_ad(log->blobs); + /* Because some BLOBs are missing, we know that the + transaction was rolled back later (a rollback of + an insert can free BLOBs). + We can simply skip the insert: the subsequent + ROW_T_DELETE will be ignored, or a ROW_T_UPDATE will + be interpreted as ROW_T_INSERT. */ + return(DB_SUCCESS); + case DB_SUCCESS: + ut_ad(row != NULL); + break; + default: + ut_ad(0); + case DB_INVALID_NULL: + ut_ad(row == NULL); + return(error); + } + + error = row_log_table_apply_insert_low( + thr, row, trx_id, offsets_heap, heap, dup); + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + return(error); +} + +/******************************************************//** +Deletes a record from a table that is being rebuilt. 
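+(For illustration of the DB_MISSING_HISTORY case handled above,
+consider: BEGIN; INSERT INTO t SET blob_col='v'; ROLLBACK; this logs
+ROW_T_INSERT followed by ROW_T_DELETE while the rollback frees the
+BLOB. The insert is skipped, and the subsequent ROW_T_DELETE will
+simply find nothing to delete.)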
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull(1, 2, 4, 5), warn_unused_result)) +dberr_t +row_log_table_apply_delete_low( +/*===========================*/ + btr_pcur_t* pcur, /*!< in/out: B-tree cursor, + will be trashed */ + const ulint* offsets, /*!< in: offsets on pcur */ + const row_ext_t* save_ext, /*!< in: saved external field + info, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction, + will be committed */ +{ + dberr_t error; + row_ext_t* ext; + dtuple_t* row; + dict_index_t* index = btr_pcur_get_btr_cur(pcur)->index; + + ut_ad(dict_index_is_clust(index)); + +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply delete " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + rec_print_new(stderr, btr_pcur_get_rec(pcur), offsets); + } +#endif /* ROW_LOG_APPLY_PRINT */ + if (dict_table_get_next_index(index)) { + /* Build a row template for purging secondary index entries. */ + row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(pcur), + offsets, NULL, NULL, NULL, + save_ext ? NULL : &ext, heap); + if (!save_ext) { + save_ext = ext; + } + } else { + row = NULL; + } + + btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, RB_NONE, mtr); + mtr_commit(mtr); + + if (error != DB_SUCCESS) { + return(error); + } + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (index->type & DICT_FTS) { + continue; + } + + const dtuple_t* entry = row_build_index_entry( + row, save_ext, index, heap); + mtr_start(mtr); + btr_pcur_open(index, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, pcur, mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(pcur)) + || btr_pcur_get_low_match(pcur) < index->n_uniq) { + /* All secondary index entries should be + found, because new_table is being modified by + this thread only, and all indexes should be + updated in sync. */ + mtr_commit(mtr); + return(DB_INDEX_CORRUPT); + } + + btr_cur_pessimistic_delete(&error, FALSE, + btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, RB_NONE, mtr); + mtr_commit(mtr); + } + + return(error); +} + +/******************************************************//** +Replays a delete operation on a table that was rebuilt. 
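+(A worked example of the "record found" test used below, with an
+assumed PRIMARY KEY (a,b), i.e. index->n_uniq == 2: the PAGE_CUR_LE
+search for (a=5, b=7) positions the cursor on the largest record not
+exceeding the key; btr_pcur_get_low_match() == 2 means both key
+fields match exactly, while any smaller value means the row is not
+present.)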
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull(1, 3, 4, 5, 6, 7), warn_unused_result)) +dberr_t +row_log_table_apply_delete( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: merge record */ + const ulint* moffsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const row_log_t* log, /*!< in: online log */ + const row_ext_t* save_ext) /*!< in: saved external field + info, or NULL */ +{ + dict_table_t* new_table = log->table; + dict_index_t* index = dict_table_get_first_index(new_table); + dtuple_t* old_pk; + mtr_t mtr; + btr_pcur_t pcur; + ulint* offsets; + + ut_ad(rec_offs_n_fields(moffsets) + == dict_index_get_n_unique(index) + 2); + ut_ad(!rec_offs_any_extern(moffsets)); + + /* Convert the row to a search tuple. */ + old_pk = dtuple_create(heap, index->n_uniq); + dict_index_copy_types(old_pk, index, index->n_uniq); + + for (ulint i = 0; i < index->n_uniq; i++) { + ulint len; + const void* field; + field = rec_get_nth_field(mrec, moffsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + dfield_set_data(dtuple_get_nth_field(old_pk, i), + field, len); + } + + mtr_start(&mtr); + btr_pcur_open(index, old_pk, PAGE_CUR_LE, + BTR_MODIFY_TREE, &pcur, &mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { +all_done: + mtr_commit(&mtr); + /* The record was not found. All done. */ + /* This should only happen when an earlier + ROW_T_INSERT was skipped or + ROW_T_UPDATE was interpreted as ROW_T_DELETE + due to BLOBs having been freed by rollback. */ + return(DB_SUCCESS); + } + + offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL, + ULINT_UNDEFINED, &offsets_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Only remove the record if DB_TRX_ID,DB_ROLL_PTR match. */ + + { + ulint len; + const byte* mrec_trx_id + = rec_get_nth_field(mrec, moffsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + const byte* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + ut_ad(rec_get_nth_field(mrec, moffsets, trx_id_col + 1, &len) + == mrec_trx_id + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + ut_ad(rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col + 1, &len) + == rec_trx_id + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + + if (memcmp(mrec_trx_id, rec_trx_id, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + /* The ROW_T_DELETE was logged for a different + PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR. + This is possible if a ROW_T_INSERT was skipped + or a ROW_T_UPDATE was interpreted as ROW_T_DELETE + because some BLOBs were missing due to + (1) rolling back the initial insert, or + (2) purging the BLOB for a later ROW_T_DELETE + (3) purging 'old values' for a later ROW_T_UPDATE + or ROW_T_DELETE. 
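+
+			An illustrative timeline for case (1), with
+			assumed values: trx 10 inserts pk=1 with a
+			BLOB and rolls back, so its ROW_T_INSERT is
+			skipped; trx 20 then re-inserts pk=1. When
+			the ROW_T_DELETE logged for trx 10 arrives
+			here, the record in new_table carries the
+			DB_TRX_ID of trx 20, the memcmp() above
+			differs, and the delete is correctly ignored.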
*/ + ut_ad(!log->same_pk); + goto all_done; + } + } + + return(row_log_table_apply_delete_low(&pcur, offsets, save_ext, + heap, &mtr)); +} + +/******************************************************//** +Replays an update operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_update( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: new value */ + const ulint* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ + const dtuple_t* old_pk) /*!< in: PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR + of the old value, + or PRIMARY KEY if same_pk */ +{ + const row_log_t*log = dup->index->online_log; + const dtuple_t* row; + dict_index_t* index = dict_table_get_first_index(log->table); + mtr_t mtr; + btr_pcur_t pcur; + dberr_t error; + + ut_ad(dtuple_get_n_fields_cmp(old_pk) + == dict_index_get_n_unique(index)); + ut_ad(dtuple_get_n_fields(old_pk) + == dict_index_get_n_unique(index) + + (log->same_pk ? 0 : 2)); + + row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, trx_id, &error); + + switch (error) { + case DB_MISSING_HISTORY: + /* The record contained BLOBs that are now missing. */ + ut_ad(log->blobs); + /* Whether or not we are updating the PRIMARY KEY, we + know that there should be a subsequent + ROW_T_DELETE for rolling back a preceding ROW_T_INSERT, + overriding this ROW_T_UPDATE record. (*1) + + This allows us to interpret this ROW_T_UPDATE + as ROW_T_DELETE. + + When applying the subsequent ROW_T_DELETE, no matching + record will be found. */ + case DB_SUCCESS: + ut_ad(row != NULL); + break; + default: + ut_ad(0); + case DB_INVALID_NULL: + ut_ad(row == NULL); + return(error); + } + + mtr_start(&mtr); + btr_pcur_open(index, old_pk, PAGE_CUR_LE, + BTR_MODIFY_TREE, &pcur, &mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + ut_ad(0);/* We did not request buffering. */ + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { + /* The record was not found. This should only happen + when an earlier ROW_T_INSERT or ROW_T_UPDATE was + diverted because BLOBs were freed when the insert was + later rolled back. */ + + ut_ad(log->blobs); + + if (error == DB_SUCCESS) { + /* An earlier ROW_T_INSERT could have been + skipped because of a missing BLOB, like this: + + BEGIN; + INSERT INTO t SET blob_col='blob value'; + UPDATE t SET blob_col=''; + ROLLBACK; + + This would generate the following records: + ROW_T_INSERT (referring to 'blob value') + ROW_T_UPDATE + ROW_T_UPDATE (referring to 'blob value') + ROW_T_DELETE + [ROLLBACK removes the 'blob value'] + + The ROW_T_INSERT would have been skipped + because of a missing BLOB. Now we are + executing the first ROW_T_UPDATE. + The second ROW_T_UPDATE (for the ROLLBACK) + would be interpreted as ROW_T_DELETE, because + the BLOB would be missing. 
+ + We could probably assume that the transaction + has been rolled back and simply skip the + 'insert' part of this ROW_T_UPDATE record. + However, there might be some complex scenario + that could interfere with such a shortcut. + So, we will insert the row (and risk + introducing a bogus duplicate key error + for the ALTER TABLE), and a subsequent + ROW_T_UPDATE or ROW_T_DELETE will delete it. */ + mtr_commit(&mtr); + error = row_log_table_apply_insert_low( + thr, row, trx_id, offsets_heap, heap, dup); + } else { + /* Some BLOBs are missing, so we are interpreting + this ROW_T_UPDATE as ROW_T_DELETE (see *1). + Because the record was not found, we do nothing. */ + ut_ad(error == DB_MISSING_HISTORY); + error = DB_SUCCESS; +func_exit: + mtr_commit(&mtr); + } +func_exit_committed: + ut_ad(mtr.state == MTR_COMMITTED); + + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + + return(error); + } + + /* Prepare to update (or delete) the record. */ + ulint* cur_offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), + index, NULL, ULINT_UNDEFINED, &offsets_heap); + + if (!log->same_pk) { + /* Only update the record if DB_TRX_ID,DB_ROLL_PTR match what + was buffered. */ + ulint len; + const void* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), + cur_offsets, index->n_uniq, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(dtuple_get_nth_field(old_pk, index->n_uniq)->len + == DATA_TRX_ID_LEN); + ut_ad(dtuple_get_nth_field(old_pk, index->n_uniq + 1)->len + == DATA_ROLL_PTR_LEN); + ut_ad(DATA_TRX_ID_LEN + static_cast<const char*>( + dtuple_get_nth_field(old_pk, + index->n_uniq)->data) + == dtuple_get_nth_field(old_pk, + index->n_uniq + 1)->data); + if (memcmp(rec_trx_id, + dtuple_get_nth_field(old_pk, index->n_uniq)->data, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + /* The ROW_T_UPDATE was logged for a different + DB_TRX_ID,DB_ROLL_PTR. This is possible if an + earlier ROW_T_INSERT or ROW_T_UPDATE was diverted + because some BLOBs were missing due to rolling + back the initial insert or due to purging + the old BLOB values of an update. */ + ut_ad(log->blobs); + if (error != DB_SUCCESS) { + ut_ad(error == DB_MISSING_HISTORY); + /* Some BLOBs are missing, so we are + interpreting this ROW_T_UPDATE as + ROW_T_DELETE (see *1). + Because this is a different row, + we will do nothing. */ + error = DB_SUCCESS; + } else { + /* Because the user record is missing due to + BLOBs that were missing when processing + an earlier log record, we should + interpret the ROW_T_UPDATE as ROW_T_INSERT. + However, there is a different user record + with the same PRIMARY KEY value already. */ + error = DB_DUPLICATE_KEY; + } + + goto func_exit; + } + } + + if (error != DB_SUCCESS) { + ut_ad(error == DB_MISSING_HISTORY); + ut_ad(log->blobs); + /* Some BLOBs are missing, so we are interpreting + this ROW_T_UPDATE as ROW_T_DELETE (see *1). */ + error = row_log_table_apply_delete_low( + &pcur, cur_offsets, NULL, heap, &mtr); + goto func_exit_committed; + } + + dtuple_t* entry = row_build_index_entry( + row, NULL, index, heap); + const upd_t* update = row_upd_build_difference_binary( + index, entry, btr_pcur_get_rec(&pcur), cur_offsets, + false, NULL, heap); + + if (!update->n_fields) { + /* Nothing to do. 
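+	For example (illustrative): if the UPDATE only modified
+	columns that the ALTER TABLE drops, the converted row is
+	identical to the stored record, so the update vector built
+	above has n_fields == 0 and the record is left alone.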
*/ + goto func_exit; + } + + const bool pk_updated + = upd_get_nth_field(update, 0)->field_no < new_trx_id_col; + + if (pk_updated || rec_offs_any_extern(cur_offsets)) { + /* If the record contains any externally stored + columns, perform the update by delete and insert, + because we will not write any undo log that would + allow purge to free any orphaned externally stored + columns. */ + + if (pk_updated && log->same_pk) { + /* The ROW_T_UPDATE log record should only be + written when the PRIMARY KEY fields of the + record did not change in the old table. We + can only get a change of PRIMARY KEY columns + in the rebuilt table if the PRIMARY KEY was + redefined (!same_pk). */ + ut_ad(0); + error = DB_CORRUPTION; + goto func_exit; + } + + error = row_log_table_apply_delete_low( + &pcur, cur_offsets, NULL, heap, &mtr); + ut_ad(mtr.state == MTR_COMMITTED); + + if (error == DB_SUCCESS) { + error = row_log_table_apply_insert_low( + thr, row, trx_id, offsets_heap, heap, dup); + } + + goto func_exit_committed; + } + + dtuple_t* old_row; + row_ext_t* old_ext; + + if (dict_table_get_next_index(index)) { + /* Construct the row corresponding to the old value of + the record. */ + old_row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur), + cur_offsets, NULL, NULL, NULL, &old_ext, heap); + ut_ad(old_row); +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply update " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + dtuple_print(stderr, old_row); + dtuple_print(stderr, row); + } +#endif /* ROW_LOG_APPLY_PRINT */ + } else { + old_row = NULL; + old_ext = NULL; + } + + big_rec_t* big_rec; + + error = btr_cur_pessimistic_update( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG + | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &cur_offsets, &offsets_heap, heap, &big_rec, + update, 0, thr, 0, &mtr); + + if (big_rec) { + if (error == DB_SUCCESS) { + error = btr_store_big_rec_extern_fields( + index, btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), cur_offsets, + big_rec, &mtr, BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (error != DB_SUCCESS) { + break; + } + + if (index->type & DICT_FTS) { + continue; + } + + if (!row_upd_changes_ord_field_binary( + index, update, thr, old_row, NULL)) { + continue; + } + + mtr_commit(&mtr); + + entry = row_build_index_entry(old_row, old_ext, index, heap); + if (!entry) { + ut_ad(0); + return(DB_CORRUPTION); + } + + mtr_start(&mtr); + + if (ROW_FOUND != row_search_index_entry( + index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) { + ut_ad(0); + error = DB_CORRUPTION; + break; + } + + btr_cur_pessimistic_delete( + &error, FALSE, btr_pcur_get_btr_cur(&pcur), + BTR_CREATE_FLAG, RB_NONE, &mtr); + + if (error != DB_SUCCESS) { + break; + } + + mtr_commit(&mtr); + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, + BTR_MODIFY_TREE, index, offsets_heap, heap, + entry, trx_id, thr); + + mtr_start(&mtr); + } + + goto func_exit; +} + +/******************************************************//** +Applies an operation to a table that was rebuilt. 
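+(Record framing, as reconstructed from the parser below: every entry
+starts with a one-byte type, ROW_T_INSERT, ROW_T_DELETE or
+ROW_T_UPDATE, followed by a 1-2 byte extra_size (a single byte for
+ROW_T_DELETE) and one or more records in the temporary format of
+rec_init_offsets_temp(); ROW_T_DELETE additionally carries a 2-byte
+ext_size for logged BLOB column prefixes.)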
+@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static __attribute__((nonnull, warn_unused_result)) +const mrec_t* +row_log_table_apply_op( +/*===================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in old index */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in new index */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS + or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + ulint* offsets) /*!< in/out: work area + for parsing mrec */ +{ + row_log_t* log = dup->index->online_log; + dict_index_t* new_index = dict_table_get_first_index(log->table); + ulint extra_size; + const mrec_t* next_mrec; + dtuple_t* old_pk; + row_ext_t* ext; + ulint ext_size; + + ut_ad(dict_index_is_clust(dup->index)); + ut_ad(dup->index->table != log->table); + ut_ad(log->head.total <= log->tail.total); + + *error = DB_SUCCESS; + + /* 3 = 1 (op type) + 1 (ext_size) + at least 1 byte payload */ + if (mrec + 3 >= mrec_end) { + return(NULL); + } + + const mrec_t* const mrec_start = mrec; + + switch (*mrec++) { + default: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + case ROW_T_INSERT: + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } else { + log->head.total += next_mrec - mrec_start; + + ulint len; + const byte* db_trx_id + = rec_get_nth_field( + mrec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + *error = row_log_table_apply_insert( + thr, mrec, offsets, offsets_heap, + heap, dup, trx_read_trx_id(db_trx_id)); + } + break; + + case ROW_T_DELETE: + /* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */ + if (mrec + 4 >= mrec_end) { + return(NULL); + } + + extra_size = *mrec++; + ext_size = mach_read_from_2(mrec); + mrec += 2; + ut_ad(mrec < mrec_end); + + /* We assume extra_size < 0x100 for the PRIMARY KEY prefix. + For fixed-length PRIMARY key columns, it is 0. 
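+
+		A worked example of the extra_size encoding read in
+		the ROW_T_INSERT case above, with illustrative
+		values: 0x05 is stored as the single byte 0x05,
+		while 0x1a3 is stored as the two bytes 0x81 0xa3 and
+		decoded as ((0x81 & 0x7f) << 8) | 0xa3 == 0x1a3;
+		the two-byte form covers sizes up to 0x7fff.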
*/
+		mrec += extra_size;
+
+		rec_offs_set_n_fields(offsets, new_index->n_uniq + 2);
+		rec_init_offsets_temp(mrec, new_index, offsets);
+		next_mrec = mrec + rec_offs_data_size(offsets) + ext_size;
+		if (next_mrec > mrec_end) {
+			return(NULL);
+		}
+
+		log->head.total += next_mrec - mrec_start;
+
+		/* If there are external fields, retrieve the logged
+		prefix info and reconstruct the row_ext_t. */
+		if (ext_size) {
+			/* We use memcpy to avoid unaligned
+			access on some non-x86 platforms. */
+			ext = static_cast<row_ext_t*>(
+				mem_heap_dup(heap,
+					     mrec + rec_offs_data_size(offsets),
+					     ext_size));
+
+			byte*	ext_start = reinterpret_cast<byte*>(ext);
+
+			ulint	ext_len = sizeof(*ext)
+				+ (ext->n_ext - 1) * sizeof ext->len;
+
+			ext->ext = reinterpret_cast<ulint*>(ext_start + ext_len);
+			ext_len += ext->n_ext * sizeof(*ext->ext);
+
+			ext->buf = static_cast<byte*>(ext_start + ext_len);
+		} else {
+			ext = NULL;
+		}
+
+		*error = row_log_table_apply_delete(
+			thr, new_trx_id_col,
+			mrec, offsets, offsets_heap, heap,
+			log, ext);
+		break;
+
+	case ROW_T_UPDATE:
+		/* Logically, the log entry consists of the
+		(PRIMARY KEY,DB_TRX_ID) of the old value (converted
+		to the new primary key definition) followed by
+		the new value in the old table definition. If the
+		definition of the columns belonging to PRIMARY KEY
+		is not changed, the log will only contain
+		DB_TRX_ID,new_row. */
+
+		if (dup->index->online_log->same_pk) {
+			ut_ad(new_index->n_uniq == dup->index->n_uniq);
+
+			extra_size = *mrec++;
+
+			if (extra_size >= 0x80) {
+				/* Read another byte of extra_size. */
+
+				extra_size = (extra_size & 0x7f) << 8;
+				extra_size |= *mrec++;
+			}
+
+			mrec += extra_size;
+
+			if (mrec > mrec_end) {
+				return(NULL);
+			}
+
+			rec_offs_set_n_fields(offsets, dup->index->n_fields);
+			rec_init_offsets_temp(mrec, dup->index, offsets);
+
+			next_mrec = mrec + rec_offs_data_size(offsets);
+
+			if (next_mrec > mrec_end) {
+				return(NULL);
+			}
+
+			old_pk = dtuple_create(heap, new_index->n_uniq);
+			dict_index_copy_types(
+				old_pk, new_index, old_pk->n_fields);
+
+			/* Copy the PRIMARY KEY fields from mrec to old_pk. */
+			for (ulint i = 0; i < new_index->n_uniq; i++) {
+				const void*	field;
+				ulint		len;
+				dfield_t*	dfield;
+
+				ut_ad(!rec_offs_nth_extern(offsets, i));
+
+				field = rec_get_nth_field(
+					mrec, offsets, i, &len);
+				ut_ad(len != UNIV_SQL_NULL);
+
+				dfield = dtuple_get_nth_field(old_pk, i);
+				dfield_set_data(dfield, field, len);
+			}
+		} else {
+			/* We assume extra_size < 0x100
+			for the PRIMARY KEY prefix. */
+			mrec += *mrec + 1;
+
+			if (mrec > mrec_end) {
+				return(NULL);
+			}
+
+			/* Get offsets for PRIMARY KEY,
+			DB_TRX_ID, DB_ROLL_PTR. */
+			rec_offs_set_n_fields(offsets, new_index->n_uniq + 2);
+			rec_init_offsets_temp(mrec, new_index, offsets);
+
+			next_mrec = mrec + rec_offs_data_size(offsets);
+			if (next_mrec + 2 > mrec_end) {
+				return(NULL);
+			}
+
+			/* Copy the PRIMARY KEY fields and
+			DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */
+			old_pk = dtuple_create(heap, new_index->n_uniq + 2);
+			dict_index_copy_types(old_pk, new_index,
+					      old_pk->n_fields);
+
+			for (ulint i = 0;
+			     i < dict_index_get_n_unique(new_index) + 2;
+			     i++) {
+				const void*	field;
+				ulint		len;
+				dfield_t*	dfield;
+
+				ut_ad(!rec_offs_nth_extern(offsets, i));
+
+				field = rec_get_nth_field(
+					mrec, offsets, i, &len);
+				ut_ad(len != UNIV_SQL_NULL);
+
+				dfield = dtuple_get_nth_field(old_pk, i);
+				dfield_set_data(dfield, field, len);
+			}
+
+			mrec = next_mrec;
+
+			/* Fetch the new value of the row as it was
+			in the old table definition.
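+
+			To illustrate the layout being parsed here
+			(summarized from the comment at the top of
+			this case): when the PRIMARY KEY was
+			redefined, the entry reads ROW_T_UPDATE,
+			extra byte count, old PK plus
+			DB_TRX_ID,DB_ROLL_PTR in the new definition,
+			extra_size, new row in the old definition;
+			the loop above has consumed everything up to
+			the new-row image decoded next.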
*/
+			extra_size = *mrec++;
+
+			if (extra_size >= 0x80) {
+				/* Read another byte of extra_size. */
+
+				extra_size = (extra_size & 0x7f) << 8;
+				extra_size |= *mrec++;
+			}
+
+			mrec += extra_size;
+
+			if (mrec > mrec_end) {
+				return(NULL);
+			}
+
+			rec_offs_set_n_fields(offsets, dup->index->n_fields);
+			rec_init_offsets_temp(mrec, dup->index, offsets);
+
+			next_mrec = mrec + rec_offs_data_size(offsets);
+
+			if (next_mrec > mrec_end) {
+				return(NULL);
+			}
+		}
+
+		ut_ad(next_mrec <= mrec_end);
+		log->head.total += next_mrec - mrec_start;
+		dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq);
+
+		{
+			ulint		len;
+			const byte*	db_trx_id
+				= rec_get_nth_field(
+					mrec, offsets, trx_id_col, &len);
+			ut_ad(len == DATA_TRX_ID_LEN);
+			*error = row_log_table_apply_update(
+				thr, new_trx_id_col,
+				mrec, offsets, offsets_heap,
+				heap, dup, trx_read_trx_id(db_trx_id), old_pk);
+		}
+
+		break;
+	}
+
+	ut_ad(log->head.total <= log->tail.total);
+	mem_heap_empty(offsets_heap);
+	mem_heap_empty(heap);
+	return(next_mrec);
+}
+
+/******************************************************//**
+Applies operations to a table that was rebuilt.
+@return DB_SUCCESS, or error code on failure */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_ops(
+/*====================*/
+	que_thr_t*	thr,	/*!< in: query graph */
+	row_merge_dup_t*dup)	/*!< in/out: for reporting duplicate key
+				errors */
+{
+	dberr_t		error;
+	const mrec_t*	mrec		= NULL;
+	const mrec_t*	next_mrec;
+	const mrec_t*	mrec_end	= NULL; /* silence bogus warning */
+	const mrec_t*	next_mrec_end;
+	mem_heap_t*	heap;
+	mem_heap_t*	offsets_heap;
+	ulint*		offsets;
+	bool		has_index_lock;
+	dict_index_t*	index		= const_cast<dict_index_t*>(
+		dup->index);
+	dict_table_t*	new_table	= index->online_log->table;
+	dict_index_t*	new_index	= dict_table_get_first_index(
+		new_table);
+	const ulint	i		= 1 + REC_OFFS_HEADER_SIZE
+		+ ut_max(dict_index_get_n_fields(index),
+			 dict_index_get_n_unique(new_index) + 2);
+	const ulint	trx_id_col	= dict_col_get_clust_pos(
+		dict_table_get_sys_col(index->table, DATA_TRX_ID), index);
+	const ulint	new_trx_id_col	= dict_col_get_clust_pos(
+		dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index);
+	trx_t*		trx		= thr_get_trx(thr);
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(dict_index_is_online_ddl(index));
+	ut_ad(trx->mysql_thd);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!dict_index_is_online_ddl(new_index));
+	ut_ad(trx_id_col > 0);
+	ut_ad(trx_id_col != ULINT_UNDEFINED);
+	ut_ad(new_trx_id_col > 0);
+	ut_ad(new_trx_id_col != ULINT_UNDEFINED);
+
+	UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end);
+
+	offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets));
+	offsets[0] = i;
+	offsets[1] = dict_index_get_n_fields(index);
+
+	heap = mem_heap_create(UNIV_PAGE_SIZE);
+	offsets_heap = mem_heap_create(UNIV_PAGE_SIZE);
+	has_index_lock = true;
+
+next_block:
+	ut_ad(has_index_lock);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(index->online_log->head.bytes == 0);
+
+	if (trx_is_interrupted(trx)) {
+		goto interrupted;
+	}
+
+	if (dict_index_is_corrupted(index)) {
+		error = DB_INDEX_CORRUPT;
+		goto func_exit;
+	}
+
+	ut_ad(dict_index_is_online_ddl(index));
+
+	error = index->online_log->error;
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(index->online_log->head.blocks
+			  > index->online_log->tail.blocks)) {
+unexpected_eof:
fprintf(stderr, "InnoDB: unexpected end of temporary file" + " for table %s\n", index->table_name); +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + if (ftruncate(index->online_log->fd, 0) == -1) { + perror("ftruncate"); + } +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. */ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + index->online_log->head.bytes = 0; + index->online_log->tail.bytes = 0; + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + ibool success; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + rw_lock_x_unlock(dict_index_get_lock(index)); + + log_free_check(); + + ut_ad(dict_index_is_online_ddl(index)); + + if (!row_log_block_allocate(index->online_log->head)) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + success = os_file_read_no_error_handling( + OS_FILE_FROM_FD(index->online_log->fd), + index->online_log->head.block, ofs, + srv_sort_buf_size); + + if (!success) { + fprintf(stderr, "InnoDB: unable to read temporary file" + " for table %s\n", index->table_name); + goto corruption; + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ +#if 0 //def FALLOC_FL_PUNCH_HOLE + /* Try to deallocate the space for the file on disk. + This should work on ext4 on Linux 2.6.39 and later, + and be ignored when the operation is unsupported. */ + fallocate(index->online_log->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + ofs, srv_buf_size); +#endif /* FALLOC_FL_PUNCH_HOLE */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + /* This read is not protected by index->online_log->mutex for + performance reasons. We will eventually notice any error that + was flagged by a DML thread. */ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. */ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + (&index->online_log->head.buf)[1] - mrec_end); + mrec = row_log_table_apply_op( + thr, trx_id_col, new_trx_id_col, + dup, &error, offsets_heap, heap, + index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. 
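+
+		A worked example with assumed numbers: if a record
+		begins 3 bytes before the end of the previous block,
+		those 3 bytes were saved in head.buf; the memcpy
+		above appends the start of the new block, the record
+		is re-parsed from head.buf, and head.bytes
+		(mrec - mrec_end) counts how far the record extends
+		into the new block, so that next_mrec can skip it.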
*/ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = mrec - mrec_end; + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. */ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + /* This read is not protected by index->online_log->mutex + for performance reasons. We will eventually notice any + error that was flagged by a DML thread. */ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + next_mrec = row_log_table_apply_op( + thr, trx_id_col, new_trx_id_col, + dup, &error, offsets_heap, heap, + mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. 
*/ + goto all_done; + } + + mrec = NULL; +process_next_block: + rw_lock_x_lock(dict_index_get_lock(index)); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes += next_mrec - mrec; + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + mrec_end - mrec); + mrec_end += index->online_log->head.buf - mrec; + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + rw_lock_x_lock(dict_index_get_lock(index)); + } + + mem_heap_free(offsets_heap); + mem_heap_free(heap); + row_log_block_free(index->online_log->head); + ut_free(offsets); + return(error); +} + +/******************************************************//** +Apply the row_log_table log to a table upon completing rebuild. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_table_apply( +/*================*/ + que_thr_t* thr, /*!< in: query graph */ + dict_table_t* old_table, + /*!< in: old table */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ +{ + dberr_t error; + dict_index_t* clust_index; + + thr_get_trx(thr)->error_key_num = 0; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + clust_index = dict_table_get_first_index(old_table); + + rw_lock_x_lock(dict_index_get_lock(clust_index)); + + if (!clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + /* This function should not be called unless + rebuilding a table online. Build in some fault + tolerance. */ + ut_ad(0); + error = DB_ERROR; + } else { + row_merge_dup_t dup = { + clust_index, table, + clust_index->online_log->col_map, 0 + }; + + error = row_log_table_apply_ops(thr, &dup); + + ut_ad(error != DB_SUCCESS + || clust_index->online_log->head.total + == clust_index->online_log->tail.total); + } + + rw_lock_x_unlock(dict_index_get_lock(clust_index)); + return(error); +} + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. 
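+(Illustrative call, with assumed arguments: for a plain secondary
+index build, where no table rebuild is involved, the expected
+invocation is row_log_allocate(index, NULL, true, NULL, NULL);
+table, add_cols and col_map are only supplied when rebuilding the
+whole table.)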
+@retval true if success, false if not */ +UNIV_INTERN +bool +row_log_allocate( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map)/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ +{ + row_log_t* log; + DBUG_ENTER("row_log_allocate"); + + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(dict_index_is_clust(index) == !!table); + ut_ad(!table || index->table != table); + ut_ad(same_pk || table); + ut_ad(!table || col_map); + ut_ad(!add_cols || col_map); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + log = (row_log_t*) ut_malloc(sizeof *log); + if (!log) { + DBUG_RETURN(false); + } + + log->fd = row_merge_file_create_low(); + if (log->fd < 0) { + ut_free(log); + DBUG_RETURN(false); + } + mutex_create(index_online_log_key, &log->mutex, + SYNC_INDEX_ONLINE_LOG); + log->blobs = NULL; + log->table = table; + log->same_pk = same_pk; + log->add_cols = add_cols; + log->col_map = col_map; + log->error = DB_SUCCESS; + log->max_trx = 0; + log->tail.blocks = log->tail.bytes = 0; + log->tail.total = 0; + log->tail.block = log->head.block = NULL; + log->head.blocks = log->head.bytes = 0; + log->head.total = 0; + dict_index_set_online_status(index, ONLINE_INDEX_CREATION); + index->online_log = log; + + /* While we might be holding an exclusive data dictionary lock + here, in row_log_abort_sec() we will not always be holding it. Use + atomic operations in both cases. */ + MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX); + + DBUG_RETURN(true); +} + +/******************************************************//** +Free the row log for an index that was being created online. */ +UNIV_INTERN +void +row_log_free( +/*=========*/ + row_log_t*& log) /*!< in,own: row log */ +{ + MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX); + + delete log->blobs; + row_log_block_free(log->tail); + row_log_block_free(log->head); + row_merge_file_destroy_low(log->fd); + mutex_free(&log->mutex); + ut_free(log); + log = 0; +} + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +UNIV_INTERN +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ +{ + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION); +#ifdef UNIV_SYNC_DEBUG + ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + && mutex_own(&index->online_log->mutex)) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + return(index->online_log->max_trx); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. 
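+(A worked example of the matching logic in this function, with an
+assumed UNIQUE secondary index on (a) over a table with PRIMARY
+KEY(id), so n_unique == 1 and n_fields == 2: cursor.low_match == 1
+means some record carries the same unique value a, but only
+cursor.low_match == 2 means that this exact row, including its
+PRIMARY KEY, exists.)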
*/ +static __attribute__((nonnull)) +void +row_log_apply_op_low( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + enum row_op op, /*!< in: operation being applied */ + trx_id_t trx_id, /*!< in: transaction identifier */ + const dtuple_t* entry) /*!< in: row */ +{ + mtr_t mtr; + btr_cur_t cursor; + ulint* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX) + == has_index_lock); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!dict_index_is_corrupted(index)); + ut_ad(trx_id != 0 || op == ROW_OP_DELETE); + + mtr_start(&mtr); + + /* We perform the pessimistic variant of the operations if we + already hold index->lock exclusively. First, search the + record. The operation may already have been performed, + depending on when the row in the clustered index was + scanned. */ + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + has_index_lock + ? BTR_MODIFY_TREE + : BTR_MODIFY_LEAF, + &cursor, 0, __FILE__, __LINE__, + &mtr); + + ut_ad(dict_index_get_n_unique(index) > 0); + /* This test is somewhat similar to row_ins_must_modify_rec(), + but not identical for unique secondary indexes. */ + if (cursor.low_match >= dict_index_get_n_unique(index) + && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) { + /* We have a matching record. */ + bool exists = (cursor.low_match + == dict_index_get_n_fields(index)); +#ifdef UNIV_DEBUG + rec_t* rec = btr_cur_get_rec(&cursor); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); +#endif /* UNIV_DEBUG */ + + ut_ad(exists || dict_index_is_unique(index)); + + switch (op) { + case ROW_OP_DELETE: + if (!exists) { + /* The existing record matches the + unique secondary index key, but the + PRIMARY KEY columns differ. So, this + exact record does not exist. For + example, we could detect a duplicate + key error in some old index before + logging an ROW_OP_INSERT for our + index. This ROW_OP_DELETE could have + been logged for rolling back + TRX_UNDO_INSERT_REC. */ + goto func_exit; + } + + if (btr_cur_optimistic_delete( + &cursor, BTR_CREATE_FLAG, &mtr)) { + *error = DB_SUCCESS; + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + __FILE__, __LINE__, &mtr); + + /* No other thread than the current one + is allowed to modify the index tree. + Thus, the record should still exist. */ + ut_ad(cursor.low_match + >= dict_index_get_n_fields(index)); + ut_ad(page_rec_is_user_rec( + btr_cur_get_rec(&cursor))); + } + + /* As there are no externally stored fields in + a secondary index record, the parameter + rb_ctx = RB_NONE will be ignored. */ + + btr_cur_pessimistic_delete( + error, FALSE, &cursor, + BTR_CREATE_FLAG, RB_NONE, &mtr); + break; + case ROW_OP_INSERT: + if (exists) { + /* The record already exists. There + is nothing to be inserted. 
+ This could happen when processing + TRX_UNDO_DEL_MARK_REC in statement + rollback: + + UPDATE of PRIMARY KEY can lead to + statement rollback if the updated + value of the PRIMARY KEY already + exists. In this case, the UPDATE would + be mapped to DELETE;INSERT, and we + only wrote undo log for the DELETE + part. The duplicate key error would be + triggered before logging the INSERT + part. + + Theoretically, we could also get a + similar situation when a DELETE operation + is blocked by a FOREIGN KEY constraint. */ + goto func_exit; + } + + if (dtuple_contains_null(entry)) { + /* The UNIQUE KEY columns match, but + there is a NULL value in the key, and + NULL!=NULL. */ + goto insert_the_rec; + } + + goto duplicate; + } + } else { + switch (op) { + rec_t* rec; + big_rec_t* big_rec; + case ROW_OP_DELETE: + /* The record does not exist. For example, we + could detect a duplicate key error in some old + index before logging an ROW_OP_INSERT for our + index. This ROW_OP_DELETE could be logged for + rolling back TRX_UNDO_INSERT_REC. */ + goto func_exit; + case ROW_OP_INSERT: + if (dict_index_is_unique(index) + && (cursor.up_match + >= dict_index_get_n_unique(index) + || cursor.low_match + >= dict_index_get_n_unique(index)) + && (!index->n_nullable + || !dtuple_contains_null(entry))) { +duplicate: + /* Duplicate key */ + ut_ad(dict_index_is_unique(index)); + row_merge_dup_report(dup, entry->fields); + *error = DB_DUPLICATE_KEY; + goto func_exit; + } +insert_the_rec: + /* Insert the record. As we are inserting into + a secondary index, there cannot be externally + stored columns (!big_rec). */ + *error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, 0, NULL, &mtr); + ut_ad(!big_rec); + if (*error != DB_FAIL) { + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + __FILE__, __LINE__, &mtr); + } + + /* We already determined that the + record did not exist. No other thread + than the current one is allowed to + modify the index tree. Thus, the + record should still not exist. */ + + *error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, + 0, NULL, &mtr); + ut_ad(!big_rec); + break; + } + mem_heap_empty(offsets_heap); + } + + if (*error == DB_SUCCESS && trx_id) { + page_update_max_trx_id(btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + +func_exit: + mtr_commit(&mtr); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. 
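+(On the dtuple_contains_null() checks in row_log_apply_op_low()
+above, an illustrative case: with UNIQUE KEY(a,b) and an existing
+row (NULL, 1), inserting another (NULL, 1) must not raise a
+duplicate key error, because a NULL in a unique key never compares
+equal to another NULL; the entry is inserted instead.)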
+@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static __attribute__((nonnull, warn_unused_result)) +const mrec_t* +row_log_apply_op( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap for + allocating data tuples */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + ulint* offsets) /*!< in/out: work area for + rec_init_offsets_temp() */ + +{ + enum row_op op; + ulint extra_size; + ulint data_size; + ulint n_ext; + dtuple_t* entry; + trx_id_t trx_id; + + /* Online index creation is only used for secondary indexes. */ + ut_ad(!dict_index_is_clust(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX) + == has_index_lock); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index)) { + *error = DB_INDEX_CORRUPT; + return(NULL); + } + + *error = DB_SUCCESS; + + if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) { + return(NULL); + } + + switch (*mrec) { + case ROW_OP_INSERT: + if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) { + return(NULL); + } + + op = static_cast<enum row_op>(*mrec++); + trx_id = trx_read_trx_id(mrec); + mrec += DATA_TRX_ID_LEN; + break; + case ROW_OP_DELETE: + op = static_cast<enum row_op>(*mrec++); + trx_id = 0; + break; + default: +corrupted: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + } + + extra_size = *mrec++; + + ut_ad(mrec < mrec_end); + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_init_offsets_temp(mrec, index, offsets); + + if (rec_offs_any_extern(offsets)) { + /* There should never be any externally stored fields + in a secondary index, which is what online index + creation is used for. Therefore, the log file must be + corrupted. */ + goto corrupted; + } + + data_size = rec_offs_data_size(offsets); + + mrec += data_size; + + if (mrec > mrec_end) { + return(NULL); + } + + entry = row_rec_to_index_entry_low( + mrec - data_size, index, offsets, &n_ext, heap); + /* Online index creation is only implemented for secondary + indexes, which never contain off-page columns. */ + ut_ad(n_ext == 0); +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "apply " IB_ID_FMT " " TRX_ID_FMT " %u %u ", + index->id, trx_id, + unsigned (op), unsigned (has_index_lock)); + for (const byte* m = mrec - data_size; m < mrec; m++) { + fprintf(stderr, "%02x", *m); + } + putc('\n', stderr); + } +#endif /* ROW_LOG_APPLY_PRINT */ + row_log_apply_op_low(index, dup, error, offsets_heap, + has_index_lock, op, trx_id, entry); + return(mrec); +} + +/******************************************************//** +Applies operations to a secondary index that was being created. 
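+(Layout of the entries consumed here, reconstructed from
+row_log_apply_op() above: a one-byte ROW_OP_INSERT or ROW_OP_DELETE;
+for ROW_OP_INSERT a 6-byte DB_TRX_ID follows, for ROW_OP_DELETE it
+does not; then a 1-2 byte extra_size and the index entry in the
+temporary format of rec_init_offsets_temp().)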
+@return DB_SUCCESS, or error code on failure */ +static __attribute__((nonnull)) +dberr_t +row_log_apply_ops( +/*==============*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup) /*!< in/out: for reporting duplicate key + errors */ +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end= NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + ulint* offsets; + bool has_index_lock; + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(*index->name == TEMP_INDEX_PREFIX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(index->online_log); + UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end); + + offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets)); + offsets[0] = i; + offsets[1] = dict_index_get_n_fields(index); + + offsets_heap = mem_heap_create(UNIV_PAGE_SIZE); + heap = mem_heap_create(UNIV_PAGE_SIZE); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(index->online_log->head.bytes == 0); + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + error = index->online_log->error; + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (dict_index_is_corrupted(index)) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + fprintf(stderr, "InnoDB: unexpected end of temporary file" + " for index %s\n", index->name + 1); +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + if (ftruncate(index->online_log->fd, 0) == -1) { + perror("ftruncate"); + } +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. */ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + ibool success; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + rw_lock_x_unlock(dict_index_get_lock(index)); + + log_free_check(); + + if (!row_log_block_allocate(index->online_log->head)) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + success = os_file_read_no_error_handling( + OS_FILE_FROM_FD(index->online_log->fd), + index->online_log->head.block, ofs, + srv_sort_buf_size); + + if (!success) { + fprintf(stderr, "InnoDB: unable to read temporary file" + " for index %s\n", index->name + 1); + goto corruption; + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. 
*/ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ +#if 0 //def FALLOC_FL_PUNCH_HOLE + /* Try to deallocate the space for the file on disk. + This should work on ext4 on Linux 2.6.39 and later, + and be ignored when the operation is unsupported. */ + fallocate(index->online_log->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + ofs, srv_buf_size); +#endif /* FALLOC_FL_PUNCH_HOLE */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. */ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + (&index->online_log->head.buf)[1] - mrec_end); + mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = mrec - mrec_end; + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. */ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. 
*/ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + next_mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + rw_lock_x_lock(dict_index_get_lock(index)); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes += next_mrec - mrec; + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + mrec_end - mrec); + mrec_end += index->online_log->head.buf - mrec; + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + rw_lock_x_lock(dict_index_get_lock(index)); + } + + switch (error) { + case DB_SUCCESS: + break; + case DB_INDEX_CORRUPT: + if (((os_offset_t) index->online_log->tail.blocks + 1) + * srv_sort_buf_size >= srv_online_max_size) { + /* The log file grew too big. */ + error = DB_ONLINE_LOG_TOO_BIG; + } + /* fall through */ + default: + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. */ + index->type |= DICT_CORRUPT; + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + row_log_block_free(index->online_log->head); + ut_free(offsets); + return(error); +} + +/******************************************************//** +Apply the row log to the index upon completing index creation. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_apply( +/*==========*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: secondary index */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ +{ + dberr_t error; + row_log_t* log; + row_merge_dup_t dup = { index, table, NULL, 0 }; + DBUG_ENTER("row_log_apply"); + + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!dict_index_is_clust(index)); + + log_free_check(); + + rw_lock_x_lock(dict_index_get_lock(index)); + + if (!dict_table_is_corrupted(index->table)) { + error = row_log_apply_ops(trx, index, &dup); + } else { + error = DB_SUCCESS; + } + + if (error != DB_SUCCESS) { + ut_a(!dict_table_is_discarded(index->table)); + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. 
 */
+		index->type |= DICT_CORRUPT;
+		index->table->drop_aborted = TRUE;
+
+		dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+	} else {
+		ut_ad(dup.n_dup == 0);
+		dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE);
+	}
+
+	log = index->online_log;
+	index->online_log = NULL;
+	/* We could remove the TEMP_INDEX_PREFIX and update the data
+	dictionary to say that this index is complete, if we had
+	access to the .frm file here. If the server crashes before
+	all requested indexes have been created, this completed index
+	will be dropped. */
+	rw_lock_x_unlock(dict_index_get_lock(index));
+
+	row_log_free(log);
+
+	DBUG_RETURN(error);
+}
diff --git a/storage/xtradb/row/row0merge.cc b/storage/xtradb/row/row0merge.cc
new file mode 100644
index 00000000000..e074604e3cb
--- /dev/null
+++ b/storage/xtradb/row/row0merge.cc
@@ -0,0 +1,3738 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.cc
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0log.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "dict0crea.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include "ut0sort.h"
+#include "row0ftsort.h"
+#include "row0import.h"
+#include "handler0alter.h"
+#include "ha_prototypes.h"
+
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined __WIN__
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* __WIN__ */
+
+#ifdef UNIV_DEBUG
+/** Set these in order to enable debug printout. */
+/* @{ */
+/** Log each record read from temporary file. */
+static ibool	row_merge_print_read;
+/** Log each record write to temporary file. */
+static ibool	row_merge_print_write;
+/** Log each row_merge_blocks() call, merging two blocks of records to
+a bigger one. */
+static ibool	row_merge_print_block;
+/** Log each block read from temporary file. */
+static ibool	row_merge_print_block_read;
+/** Log each block written to temporary file. */
+static ibool	row_merge_print_block_write;
+/* @} */
+#endif /* UNIV_DEBUG */
+
+/* Whether to disable file system cache */
+UNIV_INTERN char	srv_disable_sort_file_cache;
+
+/* Maximum pending doc memory limit in bytes for a fts tokenization thread */
+#define FTS_PENDING_DOC_MEMORY_LIMIT	1000000
+
+#ifdef UNIV_DEBUG
+/******************************************************//**
+Display a merge tuple.
*/ +static __attribute__((nonnull)) +void +row_merge_tuple_print( +/*==================*/ + FILE* f, /*!< in: output stream */ + const mtuple_t* entry, /*!< in: tuple to print */ + ulint n_fields)/*!< in: number of fields in the tuple */ +{ + ulint j; + + for (j = 0; j < n_fields; j++) { + const dfield_t* field = &entry->fields[j]; + + if (dfield_is_null(field)) { + fputs("\n NULL;", f); + } else { + ulint field_len = dfield_get_len(field); + ulint len = ut_min(field_len, 20); + if (dfield_is_ext(field)) { + fputs("\nE", f); + } else { + fputs("\n ", f); + } + ut_print_buf(f, dfield_get_data(field), len); + if (len != field_len) { + fprintf(f, " (total %lu bytes)", field_len); + } + } + } + putc('\n', f); +} +#endif /* UNIV_DEBUG */ + +/******************************************************//** +Encode an index record. */ +static __attribute__((nonnull)) +void +row_merge_buf_encode( +/*=================*/ + byte** b, /*!< in/out: pointer to + current end of output buffer */ + const dict_index_t* index, /*!< in: index */ + const mtuple_t* entry, /*!< in: index fields + of the record to encode */ + ulint n_fields) /*!< in: number of fields + in the entry */ +{ + ulint size; + ulint extra_size; + + size = rec_get_converted_size_temp( + index, entry->fields, n_fields, &extra_size); + ut_ad(size >= extra_size); + + /* Encode extra_size + 1 */ + if (extra_size + 1 < 0x80) { + *(*b)++ = (byte) (extra_size + 1); + } else { + ut_ad((extra_size + 1) < 0x8000); + *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8)); + *(*b)++ = (byte) (extra_size + 1); + } + + rec_convert_dtuple_to_temp(*b + extra_size, index, + entry->fields, n_fields); + + *b += size; +} + +/******************************************************//** +Allocate a sort buffer. +@return own: sort buffer */ +static __attribute__((malloc, nonnull)) +row_merge_buf_t* +row_merge_buf_create_low( +/*=====================*/ + mem_heap_t* heap, /*!< in: heap where allocated */ + dict_index_t* index, /*!< in: secondary index */ + ulint max_tuples, /*!< in: maximum number of + data tuples */ + ulint buf_size) /*!< in: size of the buffer, + in bytes */ +{ + row_merge_buf_t* buf; + + ut_ad(max_tuples > 0); + + ut_ad(max_tuples <= srv_sort_buf_size); + + buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size)); + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = static_cast<mtuple_t*>( + ut_malloc(2 * max_tuples * sizeof *buf->tuples)); + buf->tmp_tuples = buf->tuples + max_tuples; + + return(buf); +} + +/******************************************************//** +Allocate a sort buffer. +@return own: sort buffer */ +UNIV_INTERN +row_merge_buf_t* +row_merge_buf_create( +/*=================*/ + dict_index_t* index) /*!< in: secondary index */ +{ + row_merge_buf_t* buf; + ulint max_tuples; + ulint buf_size; + mem_heap_t* heap; + + max_tuples = srv_sort_buf_size + / ut_max(1, dict_index_get_min_size(index)); + + buf_size = (sizeof *buf); + + heap = mem_heap_create(buf_size); + + buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size); + + return(buf); +} + +/******************************************************//** +Empty a sort buffer. 
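The length prefix written by row_merge_buf_encode() above stores extra_size + 1, so a leading zero byte stays free to act as the end-of-chunk marker, and values of 0x80 and above spill into a second byte with the high bit of the first byte set. A self-contained round trip of just that encoding (helper names invented here):

	#include <assert.h>

	/* Encode e = extra_size + 1 (must be nonzero and below 0x8000). */
	static unsigned char*
	sketch_put_len(unsigned char* b, unsigned e)
	{
		assert(e > 0 && e < 0x8000);
		if (e < 0x80) {
			*b++ = (unsigned char) e;
		} else {
			*b++ = (unsigned char) (0x80 | (e >> 8));
			*b++ = (unsigned char) e;
		}
		return(b);
	}

	/* Decode; a leading 0 would mean "end of chunk" and is not handled. */
	static const unsigned char*
	sketch_get_len(const unsigned char* b, unsigned* e)
	{
		*e = *b++;
		if (*e >= 0x80) {
			*e = (*e & 0x7f) << 8;
			*e |= *b++;
		}
		return(b);
	}

	int
	main(void)
	{
		unsigned char	buf[2];
		unsigned	e;

		for (unsigned v = 1; v < 0x8000; v++) {
			sketch_put_len(buf, v);
			sketch_get_len(buf, &e);
			assert(e == v);
		}
		return(0);
	}
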
+@return sort buffer */ +UNIV_INTERN +row_merge_buf_t* +row_merge_buf_empty( +/*================*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer */ +{ + ulint buf_size = sizeof *buf; + ulint max_tuples = buf->max_tuples; + mem_heap_t* heap = buf->heap; + dict_index_t* index = buf->index; + mtuple_t* tuples = buf->tuples; + + mem_heap_empty(heap); + + buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size)); + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = tuples; + buf->tmp_tuples = buf->tuples + max_tuples; + + return(buf); +} + +/******************************************************//** +Deallocate a sort buffer. */ +UNIV_INTERN +void +row_merge_buf_free( +/*===============*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ +{ + ut_free(buf->tuples); + mem_heap_free(buf->heap); +} + +/******************************************************//** +Insert a data tuple into a sort buffer. +@return number of rows added, 0 if out of space */ +static +ulint +row_merge_buf_add( +/*==============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + dict_index_t* fts_index,/*!< in: fts index to be created */ + const dict_table_t* old_table,/*!< in: original table */ + fts_psort_t* psort_info, /*!< in: parallel sort info */ + const dtuple_t* row, /*!< in: table row */ + const row_ext_t* ext, /*!< in: cache of externally stored + column prefixes, or NULL */ + doc_id_t* doc_id) /*!< in/out: Doc ID if we are + creating FTS index */ +{ + ulint i; + const dict_index_t* index; + mtuple_t* entry; + dfield_t* field; + const dict_field_t* ifield; + ulint n_fields; + ulint data_size; + ulint extra_size; + ulint bucket = 0; + doc_id_t write_doc_id; + ulint n_row_added = 0; + DBUG_ENTER("row_merge_buf_add"); + + if (buf->n_tuples >= buf->max_tuples) { + DBUG_RETURN(0); + } + + DBUG_EXECUTE_IF( + "ib_row_merge_buf_add_two", + if (buf->n_tuples >= 2) DBUG_RETURN(0);); + + UNIV_PREFETCH_R(row->fields); + + /* If we are building FTS index, buf->index points to + the 'fts_sort_idx', and real FTS index is stored in + fts_index */ + index = (buf->index->type & DICT_FTS) ? fts_index : buf->index; + + n_fields = dict_index_get_n_fields(index); + + entry = &buf->tuples[buf->n_tuples]; + field = entry->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); + + data_size = 0; + extra_size = UT_BITS_IN_BYTES(index->n_nullable); + + ifield = dict_index_get_nth_field(index, 0); + + for (i = 0; i < n_fields; i++, field++, ifield++) { + ulint len; + const dict_col_t* col; + ulint col_no; + ulint fixed_len; + const dfield_t* row_field; + + col = ifield->col; + col_no = dict_col_get_no(col); + + /* Process the Doc ID column */ + if (*doc_id > 0 + && col_no == index->table->fts->doc_col) { + fts_write_doc_id((byte*) &write_doc_id, *doc_id); + + /* Note: field->data now points to a value on the + stack: &write_doc_id after dfield_set_data(). Because + there is only one doc_id per row, it shouldn't matter. + We allocate a new buffer before we leave the function + later below. 
*/ + + dfield_set_data( + field, &write_doc_id, sizeof(write_doc_id)); + + field->type.mtype = ifield->col->mtype; + field->type.prtype = ifield->col->prtype; + field->type.mbminmaxlen = DATA_MBMINMAXLEN(0, 0); + field->type.len = ifield->col->len; + } else { + row_field = dtuple_get_nth_field(row, col_no); + + dfield_copy(field, row_field); + + /* Tokenize and process data for FTS */ + if (index->type & DICT_FTS) { + fts_doc_item_t* doc_item; + byte* value; + void* ptr; + const ulint max_trial_count = 10000; + ulint trial_count = 0; + + /* fetch Doc ID if it already exists + in the row, and not supplied by the + caller. Even if the value column is + NULL, we still need to get the Doc + ID so to maintain the correct max + Doc ID */ + if (*doc_id == 0) { + const dfield_t* doc_field; + doc_field = dtuple_get_nth_field( + row, + index->table->fts->doc_col); + *doc_id = (doc_id_t) mach_read_from_8( + static_cast<byte*>( + dfield_get_data(doc_field))); + + if (*doc_id == 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "FTS Doc ID is zero. " + "Record Skipped"); + DBUG_RETURN(0); + } + } + + if (dfield_is_null(field)) { + n_row_added = 1; + continue; + } + + ptr = ut_malloc(sizeof(*doc_item) + + field->len); + + doc_item = static_cast<fts_doc_item_t*>(ptr); + value = static_cast<byte*>(ptr) + + sizeof(*doc_item); + memcpy(value, field->data, field->len); + field->data = value; + + doc_item->field = field; + doc_item->doc_id = *doc_id; + + bucket = *doc_id % fts_sort_pll_degree; + + /* Add doc item to fts_doc_list */ + mutex_enter(&psort_info[bucket].mutex); + + if (psort_info[bucket].error == DB_SUCCESS) { + UT_LIST_ADD_LAST( + doc_list, + psort_info[bucket].fts_doc_list, + doc_item); + psort_info[bucket].memory_used += + sizeof(*doc_item) + field->len; + } else { + ut_free(doc_item); + } + + mutex_exit(&psort_info[bucket].mutex); + + /* Sleep when memory used exceeds limit*/ + while (psort_info[bucket].memory_used + > FTS_PENDING_DOC_MEMORY_LIMIT + && trial_count++ < max_trial_count) { + os_thread_sleep(1000); + } + + n_row_added = 1; + continue; + } + } + + len = dfield_get_len(field); + + if (dfield_is_null(field)) { + ut_ad(!(col->prtype & DATA_NOT_NULL)); + continue; + } else if (!ext) { + } else if (dict_index_is_clust(index)) { + /* Flag externally stored fields. */ + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + if (i < dict_index_get_n_unique(index)) { + dfield_set_data(field, buf, len); + } else { + dfield_set_ext(field); + len = dfield_get_len(field); + } + } + } else { + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + dfield_set_data(field, buf, len); + } + } + + /* If a column prefix index, take only the prefix */ + + if (ifield->prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminmaxlen, + ifield->prefix_len, + len, + static_cast<char*>(dfield_get_data(field))); + dfield_set_len(field, len); + } + + ut_ad(len <= col->len || col->mtype == DATA_BLOB); + + fixed_len = ifield->fixed_len; + if (fixed_len && !dict_table_is_comp(index->table) + && DATA_MBMINLEN(col->mbminmaxlen) + != DATA_MBMAXLEN(col->mbminmaxlen)) { + /* CHAR in ROW_FORMAT=REDUNDANT is always + fixed-length, but in the temporary file it is + variable-length for variable-length character + sets. 
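Rows destined for the full-text index are handed to one of fts_sort_pll_degree parallel sort threads by taking the Doc ID modulo the thread count, and the producer throttles itself when a consumer queue holds too much pending memory. A generic pthread sketch of that hand-off; all names are invented, and the unlocked re-read of memory_used in the wait loop mirrors the code above:

	#include <pthread.h>
	#include <unistd.h>
	#include <stdint.h>
	#include <stddef.h>

	#define SKETCH_DEGREE		4	/* cf. fts_sort_pll_degree */
	#define SKETCH_MEM_LIMIT	1000000	/* cf. FTS_PENDING_DOC_MEMORY_LIMIT */

	struct sketch_bucket {
		pthread_mutex_t	mutex;
		size_t		memory_used;
		/* ... a list of pending doc items would live here ... */
	};

	static struct sketch_bucket	sketch_buckets[SKETCH_DEGREE];

	static void
	sketch_init(void)
	{
		for (int i = 0; i < SKETCH_DEGREE; i++) {
			pthread_mutex_init(&sketch_buckets[i].mutex, NULL);
			sketch_buckets[i].memory_used = 0;
		}
	}

	/* Enqueue a document of 'len' bytes for the worker owning doc_id. */
	static void
	sketch_enqueue(uint64_t doc_id, size_t len)
	{
		struct sketch_bucket*	b
			= &sketch_buckets[doc_id % SKETCH_DEGREE];

		pthread_mutex_lock(&b->mutex);
		/* ... append the item to b's list ... */
		b->memory_used += len;
		pthread_mutex_unlock(&b->mutex);

		/* Back-pressure: sleep while the consumer is behind, but
		give up after a bounded number of tries, not forever. */
		for (int i = 0;
		     i < 10000 && b->memory_used > SKETCH_MEM_LIMIT; i++) {
			usleep(1000);
		}
	}
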
 */
+			fixed_len = 0;
+		}
+
+		if (fixed_len) {
+#ifdef UNIV_DEBUG
+			ulint	mbminlen = DATA_MBMINLEN(col->mbminmaxlen);
+			ulint	mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen);
+
+			/* len should be between the sizes calculated
+			based on mbmaxlen and mbminlen */
+			ut_ad(len <= fixed_len);
+			ut_ad(!mbmaxlen || len >= mbminlen
+			      * (fixed_len / mbmaxlen));
+
+			ut_ad(!dfield_is_ext(field));
+#endif /* UNIV_DEBUG */
+		} else if (dfield_is_ext(field)) {
+			extra_size += 2;
+		} else if (len < 128
+			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
+			extra_size++;
+		} else {
+			/* For variable-length columns, we look up the
+			maximum length from the column itself. If this
+			is a prefix index column shorter than 256 bytes,
+			this will waste one byte. */
+			extra_size += 2;
+		}
+		data_size += len;
+	}
+
+	/* If this is an FTS index, we already populated the sort buffer,
+	return here */
+	if (index->type & DICT_FTS) {
+		DBUG_RETURN(n_row_added);
+	}
+
+#ifdef UNIV_DEBUG
+	{
+		ulint	size;
+		ulint	extra;
+
+		size = rec_get_converted_size_temp(
+			index, entry->fields, n_fields, &extra);
+
+		ut_ad(data_size + extra_size == size);
+		ut_ad(extra_size == extra);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Add to the total size of the record in row_merge_block_t
+	the encoded length of extra_size and the extra bytes (extra_size).
+	See row_merge_buf_write() for the variable-length encoding
+	of extra_size. */
+	data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+	ut_ad(data_size < srv_sort_buf_size);
+
+	/* Reserve one byte for the end marker of row_merge_block_t. */
+	if (buf->total_size + data_size >= srv_sort_buf_size - 1) {
+		DBUG_RETURN(0);
+	}
+
+	buf->total_size += data_size;
+	buf->n_tuples++;
+	n_row_added++;
+
+	field = entry->fields;
+
+	/* Copy the data fields. */
+
+	do {
+		dfield_dup(field++, buf->heap);
+	} while (--n_fields);
+
+	DBUG_RETURN(n_row_added);
+}
+
+/*************************************************************//**
+Report a duplicate key. */
+UNIV_INTERN
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+{
+	if (!dup->n_dup++) {
+		/* Only report the first duplicate record,
+		but count all duplicate records. */
+		innobase_fields_to_mysql(dup->table, dup->index, entry);
+	}
+}
+
+/*************************************************************//**
+Compare two tuples.
+@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
+static __attribute__((warn_unused_result))
+int
+row_merge_tuple_cmp(
+/*================*/
+	ulint		n_uniq,	/*!< in: number of unique fields */
+	ulint		n_field,/*!< in: number of fields */
+	const mtuple_t&	a,	/*!< in: first tuple to be compared */
+	const mtuple_t&	b,	/*!< in: second tuple to be compared */
+	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates,
+				NULL if non-unique index */
+{
+	int		cmp;
+	const dfield_t*	af	= a.fields;
+	const dfield_t*	bf	= b.fields;
+	ulint		n	= n_uniq;
+
+	ut_ad(n_uniq > 0);
+	ut_ad(n_uniq <= n_field);
+
+	/* Compare the fields of the tuples until a difference is
+	found or we run out of fields to compare. If !cmp at the
+	end, the tuples are equal. */
+	do {
+		cmp = cmp_dfield_dfield(af++, bf++);
+	} while (!cmp && --n);
+
+	if (cmp) {
+		return(cmp);
+	}
+
+	if (dup) {
+		/* Report a duplicate value error if the tuples are
+		logically equal. NULL columns are logically unequal,
+		although they are equal in the sorting order. Find
+		out if any of the fields are NULL.
*/ + for (const dfield_t* df = a.fields; df != af; df++) { + if (dfield_is_null(df)) { + goto no_report; + } + } + + row_merge_dup_report(dup, a.fields); + } + +no_report: + /* The n_uniq fields were equal, but we compare all fields so + that we will get the same (internal) order as in the B-tree. */ + for (n = n_field - n_uniq + 1; --n; ) { + cmp = cmp_dfield_dfield(af++, bf++); + if (cmp) { + return(cmp); + } + } + + /* This should never be reached, except in a secondary index + when creating a secondary index and a PRIMARY KEY, and there + is a duplicate in the PRIMARY KEY that has not been detected + yet. Internally, an index must never contain duplicates. */ + return(cmp); +} + +/** Wrapper for row_merge_tuple_sort() to inject some more context to +UT_SORT_FUNCTION_BODY(). +@param tuples array of tuples that being sorted +@param aux work area, same size as tuples[] +@param low lower bound of the sorting area, inclusive +@param high upper bound of the sorting area, inclusive */ +#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \ + row_merge_tuple_sort(n_uniq, n_field, dup, tuples, aux, low, high) +/** Wrapper for row_merge_tuple_cmp() to inject some more context to +UT_SORT_FUNCTION_BODY(). +@param a first tuple to be compared +@param b second tuple to be compared +@return 1, 0, -1 if a is greater, equal, less, respectively, than b */ +#define row_merge_tuple_cmp_ctx(a,b) \ + row_merge_tuple_cmp(n_uniq, n_field, a, b, dup) + +/**********************************************************************//** +Merge sort the tuple buffer in main memory. */ +static __attribute__((nonnull(4,5))) +void +row_merge_tuple_sort( +/*=================*/ + ulint n_uniq, /*!< in: number of unique fields */ + ulint n_field,/*!< in: number of fields */ + row_merge_dup_t* dup, /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + mtuple_t* tuples, /*!< in/out: tuples */ + mtuple_t* aux, /*!< in/out: work area */ + ulint low, /*!< in: lower bound of the + sorting area, inclusive */ + ulint high) /*!< in: upper bound of the + sorting area, exclusive */ +{ + ut_ad(n_field > 0); + ut_ad(n_uniq <= n_field); + + UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx, + tuples, aux, low, high, row_merge_tuple_cmp_ctx); +} + +/******************************************************//** +Sort a buffer. */ +UNIV_INTERN +void +row_merge_buf_sort( +/*===============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ +{ + row_merge_tuple_sort(dict_index_get_n_unique(buf->index), + dict_index_get_n_fields(buf->index), + dup, + buf->tuples, buf->tmp_tuples, 0, buf->n_tuples); +} + +/******************************************************//** +Write a buffer to a block. 
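UT_SORT_FUNCTION_BODY() sees only a two-argument comparator, so the extra context (n_uniq, n_field, dup) is smuggled in through wrapper macros that pick those names up from the enclosing scope. The same trick in miniature, with an insertion sort and a comparison flag injected from a local variable; everything here is invented for illustration:

	#include <stddef.h>

	/* A generic sort body that only knows a two-argument comparison
	macro, standing in for UT_SORT_FUNCTION_BODY(). */
	#define SKETCH_SORT_BODY(arr, n, CMP)				\
		do {							\
			for (size_t i_ = 1; i_ < (n); i_++) {		\
				for (size_t j_ = i_;			\
				     j_ > 0				\
				     && CMP((arr)[j_ - 1], (arr)[j_]) > 0; \
				     j_--) {				\
					int t_ = (arr)[j_];		\
					(arr)[j_] = (arr)[j_ - 1];	\
					(arr)[j_ - 1] = t_;		\
				}					\
			}						\
		} while (0)

	static int
	sketch_cmp(int descending, int a, int b)
	{
		int	c = (a > b) - (a < b);
		return(descending ? -c : c);
	}

	/* Inject the 'descending' local into the two-argument macro,
	just as row_merge_tuple_cmp_ctx() injects n_uniq, n_field, dup. */
	#define sketch_cmp_ctx(a, b)	sketch_cmp(descending, a, b)

	static void
	sketch_sort(int* arr, size_t n, int descending)
	{
		SKETCH_SORT_BODY(arr, n, sketch_cmp_ctx);
	}
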
 */
+UNIV_INTERN
+void
+row_merge_buf_write(
+/*================*/
+	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
+	const merge_file_t*	of UNIV_UNUSED,
+					/*!< in: output file */
+	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
+{
+	const dict_index_t*	index	= buf->index;
+	ulint			n_fields= dict_index_get_n_fields(index);
+	byte*			b	= &block[0];
+
+	for (ulint i = 0; i < buf->n_tuples; i++) {
+		const mtuple_t*	entry	= &buf->tuples[i];
+
+		row_merge_buf_encode(&b, index, entry, n_fields);
+		ut_ad(b < &block[srv_sort_buf_size]);
+#ifdef UNIV_DEBUG
+		if (row_merge_print_write) {
+			fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
+				(void*) b, of->fd, (ulong) of->offset,
+				(ulong) i);
+			row_merge_tuple_print(stderr, entry, n_fields);
+		}
+#endif /* UNIV_DEBUG */
+	}
+
+	/* Write an "end-of-chunk" marker. */
+	ut_a(b < &block[srv_sort_buf_size]);
+	ut_a(b == &block[0] + buf->total_size);
+	*b++ = 0;
+#ifdef UNIV_DEBUG_VALGRIND
+	/* The rest of the block is uninitialized. Initialize it
+	to avoid bogus warnings. */
+	memset(b, 0xff, &block[srv_sort_buf_size] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+#ifdef UNIV_DEBUG
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
+			(void*) b, of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
+@return memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	mrec_buf_t**		buf,		/*!< out: 3 buffers */
+	ulint**			offsets1,	/*!< out: offsets */
+	ulint**			offsets2)	/*!< out: offsets */
+{
+	ulint		i	= 1 + REC_OFFS_HEADER_SIZE
+		+ dict_index_get_n_fields(index);
+	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof **offsets1
+						  + 3 * sizeof **buf);
+
+	*buf = static_cast<mrec_buf_t*>(
+		mem_heap_alloc(heap, 3 * sizeof **buf));
+	*offsets1 = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof **offsets1));
+	*offsets2 = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof **offsets2));
+
+	(*offsets1)[0] = (*offsets2)[0] = i;
+	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
+
+	return(heap);
+}
+
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_read(
+/*===========*/
+	int			fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf)	/*!< out: data */
+{
+	os_offset_t	ofs = ((os_offset_t) offset) * srv_sort_buf_size;
+	ibool		success;
+
+	DBUG_EXECUTE_IF("row_merge_read_failure", return(FALSE););
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_read) {
+		fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
+						 ofs, srv_sort_buf_size);
+#ifdef POSIX_FADV_DONTNEED
+	/* Each block is read exactly once.  Free up the file cache.
*/ + posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + if (UNIV_UNLIKELY(!success)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: failed to read merge block at " UINT64PF "\n", + ofs); + } + + return(UNIV_LIKELY(success)); +} + +/********************************************************************//** +Write a merge block to the file system. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +row_merge_write( +/*============*/ + int fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to write, + in number of row_merge_block_t elements */ + const void* buf) /*!< in: data */ +{ + size_t buf_len = srv_sort_buf_size; + os_offset_t ofs = buf_len * (os_offset_t) offset; + ibool ret; + + DBUG_EXECUTE_IF("row_merge_write_failure", return(FALSE);); + + ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, ofs, buf_len); + +#ifdef UNIV_DEBUG + if (row_merge_print_block_write) { + fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n", + fd, (ulong) offset); + } +#endif /* UNIV_DEBUG */ + +#ifdef POSIX_FADV_DONTNEED + /* The block will be needed on the next merge pass, + but it can be evicted from the file cache meanwhile. */ + posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + return(UNIV_LIKELY(ret)); +} + +/********************************************************************//** +Read a merge record. +@return pointer to next record, or NULL on I/O error or end of list */ +UNIV_INTERN +const byte* +row_merge_read_rec( +/*===============*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + const byte* b, /*!< in: pointer to record */ + const dict_index_t* index, /*!< in: index of the record */ + int fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t** mrec, /*!< out: pointer to merge record, + or NULL on end of list + (non-NULL on I/O error) */ + ulint* offsets)/*!< out: offsets of mrec */ +{ + ulint extra_size; + ulint data_size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + ut_ad(index); + ut_ad(foffs); + ut_ad(mrec); + ut_ad(offsets); + + ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index)); + + extra_size = *b++; + + if (UNIV_UNLIKELY(!extra_size)) { + /* End of list */ + *mrec = NULL; +#ifdef UNIV_DEBUG + if (row_merge_print_read) { + fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n", + (const void*) b, (const void*) block, + fd, (ulong) *foffs); + } +#endif /* UNIV_DEBUG */ + return(NULL); + } + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) { + if (!row_merge_read(fd, ++(*foffs), block)) { +err_exit: + /* Signal I/O error. */ + *mrec = b; + return(NULL); + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + } + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *b++; + } + + /* Normalize extra_size. Above, value 0 signals "end of list". */ + extra_size--; + + /* Read the extra bytes. */ + + if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) { + /* The record spans two blocks. Copy the entire record + to the auxiliary buffer and handle this as a special + case. 
*/ + + avail_size = &block[srv_sort_buf_size] - b; + ut_ad(avail_size < sizeof *buf); + memcpy(*buf, b, avail_size); + + if (!row_merge_read(fd, ++(*foffs), block)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + + /* Copy the record. */ + memcpy(*buf + avail_size, b, extra_size - avail_size); + b += extra_size - avail_size; + + *mrec = *buf + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + data_size = rec_offs_data_size(offsets); + + /* These overflows should be impossible given that + records are much smaller than either buffer, and + the record starts near the beginning of each buffer. */ + ut_a(extra_size + data_size < sizeof *buf); + ut_a(b + data_size < &block[srv_sort_buf_size]); + + /* Copy the data bytes. */ + memcpy(*buf + extra_size, b, data_size); + b += data_size; + + goto func_exit; + } + + *mrec = b + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + data_size = rec_offs_data_size(offsets); + ut_ad(extra_size + data_size < sizeof *buf); + + b += extra_size + data_size; + + if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) { + /* The record fits entirely in the block. + This is the normal case. */ + goto func_exit; + } + + /* The record spans two blocks. Copy it to buf. */ + + b -= extra_size + data_size; + avail_size = &block[srv_sort_buf_size] - b; + memcpy(*buf, b, avail_size); + *mrec = *buf + extra_size; +#ifdef UNIV_DEBUG + /* We cannot invoke rec_offs_make_valid() here, because there + are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size. + Similarly, rec_offs_validate() would fail, because it invokes + rec_get_status(). */ + offsets[2] = (ulint) *mrec; + offsets[3] = (ulint) index; +#endif /* UNIV_DEBUG */ + + if (!row_merge_read(fd, ++(*foffs), block)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + + /* Copy the rest of the record. */ + memcpy(*buf + avail_size, b, extra_size + data_size - avail_size); + b += extra_size + data_size - avail_size; + +func_exit: +#ifdef UNIV_DEBUG + if (row_merge_print_read) { + fprintf(stderr, "row_merge_read %p,%p,%d,%lu ", + (const void*) b, (const void*) block, + fd, (ulong) *foffs); + rec_print_comp(stderr, *mrec, offsets); + putc('\n', stderr); + } +#endif /* UNIV_DEBUG */ + + return(b); +} + +/********************************************************************//** +Write a merge record. 
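When a record straddles a block boundary, row_merge_read_rec() copies its head out of the old block and its tail out of the freshly read block into the side buffer, and parsing continues from there. The copy arithmetic in isolation, as a runnable toy with 8-byte blocks (names invented):

	#include <assert.h>
	#include <string.h>

	int
	main(void)
	{
		const char	rec[] = "0123456789";	/* 10 data bytes + NUL */
		const size_t	len   = sizeof rec - 1;
		char		block[8];
		char		aux[16];	/* side buffer, cf. mrec_buf_t */
		const size_t	avail = 5;	/* record bytes left in block 1 */

		/* Block 1 ends with the first 'avail' bytes of the record. */
		memcpy(block + 3, rec, avail);
		memcpy(aux, block + 3, avail);	/* copy head to side buffer */

		/* "Read" block 2: it begins with the rest of the record. */
		memcpy(block, rec + avail, len - avail);
		memcpy(aux + avail, block, len - avail);	/* append tail */

		assert(memcmp(aux, rec, len) == 0);
		return(0);
	}
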
*/ +static +void +row_merge_write_rec_low( +/*====================*/ + byte* b, /*!< out: buffer */ + ulint e, /*!< in: encoded extra_size */ +#ifdef UNIV_DEBUG + ulint size, /*!< in: total size to write */ + int fd, /*!< in: file descriptor */ + ulint foffs, /*!< in: file offset */ +#endif /* UNIV_DEBUG */ + const mrec_t* mrec, /*!< in: record to write */ + const ulint* offsets)/*!< in: offsets of mrec */ +#ifndef UNIV_DEBUG +# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \ + row_merge_write_rec_low(b, e, mrec, offsets) +#endif /* !UNIV_DEBUG */ +{ +#ifdef UNIV_DEBUG + const byte* const end = b + size; + ut_ad(e == rec_offs_extra_size(offsets) + 1); + + if (row_merge_print_write) { + fprintf(stderr, "row_merge_write %p,%d,%lu ", + (void*) b, fd, (ulong) foffs); + rec_print_comp(stderr, mrec, offsets); + putc('\n', stderr); + } +#endif /* UNIV_DEBUG */ + + if (e < 0x80) { + *b++ = (byte) e; + } else { + *b++ = (byte) (0x80 | (e >> 8)); + *b++ = (byte) e; + } + + memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets)); + ut_ad(b + rec_offs_size(offsets) == end); +} + +/********************************************************************//** +Write a merge record. +@return pointer to end of block, or NULL on error */ +static +byte* +row_merge_write_rec( +/*================*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + byte* b, /*!< in: pointer to end of block */ + int fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t* mrec, /*!< in: record to write */ + const ulint* offsets)/*!< in: offsets of mrec */ +{ + ulint extra_size; + ulint size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + ut_ad(mrec); + ut_ad(foffs); + ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]); + ut_ad(mrec < buf[0] || mrec > buf[1]); + + /* Normalize extra_size. Value 0 signals "end of list". */ + extra_size = rec_offs_extra_size(offsets) + 1; + + size = extra_size + (extra_size >= 0x80) + + rec_offs_data_size(offsets); + + if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) { + /* The record spans two blocks. + Copy it to the temporary buffer first. */ + avail_size = &block[srv_sort_buf_size] - b; + + row_merge_write_rec_low(buf[0], + extra_size, size, fd, *foffs, + mrec, offsets); + + /* Copy the head of the temporary buffer, write + the completed block, and copy the tail of the + record to the head of the new block. */ + memcpy(b, buf[0], avail_size); + + if (!row_merge_write(fd, (*foffs)++, block)) { + return(NULL); + } + + UNIV_MEM_INVALID(&block[0], srv_sort_buf_size); + + /* Copy the rest. */ + b = &block[0]; + memcpy(b, buf[0] + avail_size, size - avail_size); + b += size - avail_size; + } else { + row_merge_write_rec_low(b, extra_size, size, fd, *foffs, + mrec, offsets); + b += size; + } + + return(b); +} + +/********************************************************************//** +Write an end-of-list marker. 
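row_merge_write_rec_low() above declares extra parameters that exist only under UNIV_DEBUG and then, in release builds, defines a same-named macro that rewrites every call site to drop them, so callers always pass the full argument list. The pattern reduced to its essentials, with hypothetical names:

	#include <stdio.h>

	static void
	sketch_write(char* b, unsigned e,
	#ifdef SKETCH_DEBUG
		     int fd, unsigned long foffs,
	#endif
		     const char* rec)
	#ifndef SKETCH_DEBUG
	/* Release builds: silently drop the debug-only arguments.
	A function-like macro does not recursively expand its own name,
	so the call below resolves to the real function. */
	# define sketch_write(b, e, fd, foffs, rec) sketch_write(b, e, rec)
	#endif
	{
	#ifdef SKETCH_DEBUG
		fprintf(stderr, "sketch_write fd=%d ofs=%lu\n", fd, foffs);
	#endif
		/* ... encode e and copy rec into b, as the real one does ... */
		(void) b; (void) e; (void) rec;
	}

	/* Call sites look identical in both builds:
	sketch_write(buf, 1, fd, offset, rec); */
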
+@return pointer to end of block, or NULL on error */ +static +byte* +row_merge_write_eof( +/*================*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + byte* b, /*!< in: pointer to end of block */ + int fd, /*!< in: file descriptor */ + ulint* foffs) /*!< in/out: file offset */ +{ + ut_ad(block); + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + ut_ad(foffs); +#ifdef UNIV_DEBUG + if (row_merge_print_write) { + fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n", + (void*) b, (void*) block, fd, (ulong) *foffs); + } +#endif /* UNIV_DEBUG */ + + *b++ = 0; + UNIV_MEM_ASSERT_RW(&block[0], b - &block[0]); + UNIV_MEM_ASSERT_W(&block[0], srv_sort_buf_size); +#ifdef UNIV_DEBUG_VALGRIND + /* The rest of the block is uninitialized. Initialize it + to avoid bogus warnings. */ + memset(b, 0xff, &block[srv_sort_buf_size] - b); +#endif /* UNIV_DEBUG_VALGRIND */ + + if (!row_merge_write(fd, (*foffs)++, block)) { + return(NULL); + } + + UNIV_MEM_INVALID(&block[0], srv_sort_buf_size); + return(&block[0]); +} + +/********************************************************************//** +Reads clustered index of the table and create temporary files +containing the index entries for the indexes to be built. +@return DB_SUCCESS or error */ +static __attribute__((nonnull(1,2,3,4,6,9,10,16), warn_unused_result)) +dberr_t +row_merge_read_clustered_index( +/*===========================*/ + trx_t* trx, /*!< in: transaction */ + struct TABLE* table, /*!< in/out: MySQL table object, + for reporting erroneous records */ + const dict_table_t* old_table,/*!< in: table where rows are + read from */ + const dict_table_t* new_table,/*!< in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ + dict_index_t** index, /*!< in: indexes to be created */ + dict_index_t* fts_sort_idx, + /*!< in: full-text index to be created, + or NULL */ + fts_psort_t* psort_info, + /*!< in: parallel sort info for + fts_sort_idx creation, or NULL */ + merge_file_t* files, /*!< in: temporary files */ + const ulint* key_numbers, + /*!< in: MySQL key numbers to create */ + ulint n_index,/*!< in: number of indexes to create */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, + /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence,/*!< in/out: autoinc sequence */ + row_merge_block_t* block) /*!< in/out: file buffer */ +{ + dict_index_t* clust_index; /* Clustered index */ + mem_heap_t* row_heap; /* Heap memory to create + clustered index tuples */ + row_merge_buf_t** merge_buf; /* Temporary list for records*/ + btr_pcur_t pcur; /* Cursor on the clustered + index */ + mtr_t mtr; /* Mini transaction */ + dberr_t err = DB_SUCCESS;/* Return code */ + ulint n_nonnull = 0; /* number of columns + changed to NOT NULL */ + ulint* nonnull = NULL; /* NOT NULL columns */ + dict_index_t* fts_index = NULL;/* FTS index */ + doc_id_t doc_id = 0; + doc_id_t max_doc_id = 0; + ibool add_doc_id = FALSE; + os_event_t fts_parallel_sort_event = NULL; + ibool fts_pll_sort = FALSE; + ib_int64_t sig_count = 0; + DBUG_ENTER("row_merge_read_clustered_index"); + + ut_ad((old_table == new_table) == !col_map); + ut_ad(!add_cols || col_map); + + trx->op_info = "reading clustered index"; + +#ifdef 
FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n"); +#endif + + /* Create and initialize memory for record buffers */ + + merge_buf = static_cast<row_merge_buf_t**>( + mem_alloc(n_index * sizeof *merge_buf)); + + for (ulint i = 0; i < n_index; i++) { + if (index[i]->type & DICT_FTS) { + + /* We are building a FT index, make sure + we have the temporary 'fts_sort_idx' */ + ut_a(fts_sort_idx); + + fts_index = index[i]; + + merge_buf[i] = row_merge_buf_create(fts_sort_idx); + + add_doc_id = DICT_TF2_FLAG_IS_SET( + new_table, DICT_TF2_FTS_ADD_DOC_ID); + + /* If Doc ID does not exist in the table itself, + fetch the first FTS Doc ID */ + if (add_doc_id) { + fts_get_next_doc_id( + (dict_table_t*) new_table, + &doc_id); + ut_ad(doc_id > 0); + } + + fts_pll_sort = TRUE; + row_fts_start_psort(psort_info); + fts_parallel_sort_event = + psort_info[0].psort_common->sort_event; + } else { + merge_buf[i] = row_merge_buf_create(index[i]); + } + } + + mtr_start(&mtr); + + /* Find the clustered index and create a persistent cursor + based on that. */ + + clust_index = dict_table_get_first_index(old_table); + + btr_pcur_open_at_index_side( + true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); + + if (old_table != new_table) { + /* The table is being rebuilt. Identify the columns + that were flagged NOT NULL in the new table, so that + we can quickly check that the records in the old table + do not violate the added NOT NULL constraints. */ + + nonnull = static_cast<ulint*>( + mem_alloc(dict_table_get_n_cols(new_table) + * sizeof *nonnull)); + + for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) { + if (dict_table_get_nth_col(old_table, i)->prtype + & DATA_NOT_NULL) { + continue; + } + + const ulint j = col_map[i]; + + if (j == ULINT_UNDEFINED) { + /* The column was dropped. */ + continue; + } + + if (dict_table_get_nth_col(new_table, j)->prtype + & DATA_NOT_NULL) { + nonnull[n_nonnull++] = j; + } + } + + if (!n_nonnull) { + mem_free(nonnull); + nonnull = NULL; + } + } + + row_heap = mem_heap_create(sizeof(mrec_buf_t)); + + /* Scan the clustered index. */ + for (;;) { + const rec_t* rec; + ulint* offsets; + const dtuple_t* row; + row_ext_t* ext; + page_cur_t* cur = btr_pcur_get_page_cur(&pcur); + + page_cur_move_to_next(cur); + + if (page_cur_is_after_last(cur)) { + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + err = DB_INTERRUPTED; + trx->error_key_num = 0; + goto func_exit; + } + + if (online && old_table != new_table) { + err = row_log_table_get_error(clust_index); + if (err != DB_SUCCESS) { + trx->error_key_num = 0; + goto func_exit; + } + } +#ifdef DBUG_OFF +# define dbug_run_purge false +#else /* DBUG_OFF */ + bool dbug_run_purge = false; +#endif /* DBUG_OFF */ + DBUG_EXECUTE_IF( + "ib_purge_on_create_index_page_switch", + dbug_run_purge = true;); + + if (dbug_run_purge + || rw_lock_get_waiters( + dict_index_get_lock(clust_index))) { + /* There are waiters on the clustered + index tree lock, likely the purge + thread. Store and restore the cursor + position, and yield so that scanning a + large table will not starve other + threads. */ + + /* Store the cursor position on the last user + record on the page. */ + btr_pcur_move_to_prev_on_page(&pcur); + /* Leaf pages must never be empty, unless + this is the only page in the index tree. 
*/ + ut_ad(btr_pcur_is_on_user_rec(&pcur) + || buf_block_get_page_no( + btr_pcur_get_block(&pcur)) + == clust_index->page); + + btr_pcur_store_position(&pcur, &mtr); + mtr_commit(&mtr); + + if (dbug_run_purge) { + /* This is for testing + purposes only (see + DBUG_EXECUTE_IF above). We + signal the purge thread and + hope that the purge batch will + complete before we execute + btr_pcur_restore_position(). */ + trx_purge_run(); + os_thread_sleep(1000000); + } + + /* Give the waiters a chance to proceed. */ + os_thread_yield(); + + mtr_start(&mtr); + /* Restore position on the record, or its + predecessor if the record was purged + meanwhile. */ + btr_pcur_restore_position( + BTR_SEARCH_LEAF, &pcur, &mtr); + /* Move to the successor of the + original record. */ + if (!btr_pcur_move_to_next_user_rec( + &pcur, &mtr)) { +end_of_index: + row = NULL; + mtr_commit(&mtr); + mem_heap_free(row_heap); + if (nonnull) { + mem_free(nonnull); + } + goto write_buffers; + } + } else { + ulint next_page_no; + buf_block_t* block; + + next_page_no = btr_page_get_next( + page_cur_get_page(cur), &mtr); + + if (next_page_no == FIL_NULL) { + goto end_of_index; + } + + block = page_cur_get_block(cur); + block = btr_block_get( + buf_block_get_space(block), + buf_block_get_zip_size(block), + next_page_no, BTR_SEARCH_LEAF, + clust_index, &mtr); + + btr_leaf_page_release(page_cur_get_block(cur), + BTR_SEARCH_LEAF, &mtr); + page_cur_set_before_first(block, cur); + page_cur_move_to_next(cur); + + ut_ad(!page_cur_is_after_last(cur)); + } + } + + rec = page_cur_get_rec(cur); + + SRV_CORRUPT_TABLE_CHECK(rec, + { + err = DB_CORRUPTION; + goto func_exit; + }); + + offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &row_heap); + + if (online) { + /* Perform a REPEATABLE READ. + + When rebuilding the table online, + row_log_table_apply() must not see a newer + state of the table when applying the log. + This is mainly to prevent false duplicate key + errors, because the log will identify records + by the PRIMARY KEY, and also to prevent unsafe + BLOB access. + + When creating a secondary index online, this + table scan must not see records that have only + been inserted to the clustered index, but have + not been written to the online_log of + index[]. If we performed READ UNCOMMITTED, it + could happen that the ADD INDEX reaches + ONLINE_INDEX_COMPLETE state between the time + the DML thread has updated the clustered index + but has not yet accessed secondary index. */ + ut_ad(trx->read_view); + + if (!read_view_sees_trx_id( + trx->read_view, + row_get_rec_trx_id( + rec, clust_index, offsets))) { + rec_t* old_vers; + + row_vers_build_for_consistent_read( + rec, &mtr, clust_index, &offsets, + trx->read_view, &row_heap, + row_heap, &old_vers); + + rec = old_vers; + + if (!rec) { + continue; + } + } + + if (rec_get_deleted_flag( + rec, + dict_table_is_comp(old_table))) { + /* This record was deleted in the latest + committed version, or it was deleted and + then reinserted-by-update before purge + kicked in. Skip it. */ + continue; + } + + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + } else if (rec_get_deleted_flag( + rec, dict_table_is_comp(old_table))) { + /* Skip delete-marked records. + + Skipping delete-marked records will make the + created indexes unuseable for transactions + whose read views were created before the index + creation completed, but preserving the history + would make it tricky to detect duplicate + keys. 
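The consistent-read logic above hinges on read_view_sees_trx_id(): a record version is visible only if the transaction that wrote it had committed when the view was created. A simplified model of such a visibility test; the field names are invented here, and InnoDB's real view additionally special-cases the transaction's own changes:

	#include <stdint.h>
	#include <stddef.h>

	struct sketch_view {
		uint64_t	up_limit_id;	/* ids below this had committed */
		uint64_t	low_limit_id;	/* ids at or above had not started */
		const uint64_t*	open_ids;	/* ids active at view creation */
		size_t		n_open;
	};

	/* Is a record version written by trx_id visible to this view? */
	static bool
	sketch_sees(const struct sketch_view* v, uint64_t trx_id)
	{
		if (trx_id < v->up_limit_id) {
			return(true);	/* committed before the view */
		}
		if (trx_id >= v->low_limit_id) {
			return(false);	/* started after the view */
		}
		for (size_t i = 0; i < v->n_open; i++) {
			if (v->open_ids[i] == trx_id) {
				return(false);	/* was still active */
			}
		}
		return(true);
	}
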
*/ + continue; + } + + /* When !online, we are holding a lock on old_table, preventing + any inserts that could have written a record 'stub' before + writing out off-page columns. */ + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + + /* Build a row based on the clustered index. */ + + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, offsets, new_table, + add_cols, col_map, &ext, row_heap); + ut_ad(row); + + for (ulint i = 0; i < n_nonnull; i++) { + const dfield_t* field = &row->fields[nonnull[i]]; + + ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL); + + if (dfield_is_null(field)) { + err = DB_INVALID_NULL; + trx->error_key_num = 0; + goto func_exit; + } + } + + /* Get the next Doc ID */ + if (add_doc_id) { + doc_id++; + } else { + doc_id = 0; + } + + if (add_autoinc != ULINT_UNDEFINED) { + + ut_ad(add_autoinc + < dict_table_get_n_user_cols(new_table)); + + const dfield_t* dfield; + + dfield = dtuple_get_nth_field(row, add_autoinc); + if (dfield_is_null(dfield)) { + goto write_buffers; + } + + const dtype_t* dtype = dfield_get_type(dfield); + byte* b = static_cast<byte*>(dfield_get_data(dfield)); + + if (sequence.eof()) { + err = DB_ERROR; + trx->error_key_num = 0; + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_AUTOINC_READ_FAILED, "[NULL]"); + + goto func_exit; + } + + ulonglong value = sequence++; + + switch (dtype_get_mtype(dtype)) { + case DATA_INT: { + ibool usign; + ulint len = dfield_get_len(dfield); + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + mach_write_ulonglong(b, value, len, usign); + + break; + } + + case DATA_FLOAT: + mach_float_write( + b, static_cast<float>(value)); + break; + + case DATA_DOUBLE: + mach_double_write( + b, static_cast<double>(value)); + break; + + default: + ut_ad(0); + } + } + +write_buffers: + /* Build all entries for all the indexes to be created + in a single scan of the clustered index. */ + + for (ulint i = 0; i < n_index; i++) { + row_merge_buf_t* buf = merge_buf[i]; + merge_file_t* file = &files[i]; + ulint rows_added = 0; + + if (UNIV_LIKELY + (row && (rows_added = row_merge_buf_add( + buf, fts_index, old_table, + psort_info, row, ext, &doc_id)))) { + + /* If we are creating FTS index, + a single row can generate more + records for tokenized word */ + file->n_rec += rows_added; + if (doc_id > max_doc_id) { + max_doc_id = doc_id; + } + + if (buf->index->type & DICT_FTS) { + /* Check if error occurs in child thread */ + for (ulint j = 0; j < fts_sort_pll_degree; j++) { + if (psort_info[j].error != DB_SUCCESS) { + err = psort_info[j].error; + trx->error_key_num = i; + break; + } + } + + if (err != DB_SUCCESS) { + break; + } + } + + continue; + } + + if (buf->index->type & DICT_FTS) { + if (!row || !doc_id) { + continue; + } + } + + /* The buffer must be sufficiently large + to hold at least one record. It may only + be empty when we reach the end of the + clustered index. row_merge_buf_add() + must not have been called in this loop. */ + ut_ad(buf->n_tuples || row == NULL); + + /* We have enough data tuples to form a block. + Sort them and write to disk. */ + + if (buf->n_tuples) { + if (dict_index_is_unique(buf->index)) { + row_merge_dup_t dup = { + buf->index, table, col_map, 0}; + + row_merge_buf_sort(buf, &dup); + + if (dup.n_dup) { + err = DB_DUPLICATE_KEY; + trx->error_key_num + = key_numbers[i]; + break; + } + } else { + row_merge_buf_sort(buf, NULL); + } + } else if (online && new_table == old_table) { + /* Note the newest transaction that + modified this index when the scan was + completed. 
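The DATA_INT case above serializes the generated AUTO_INCREMENT value into the column's fixed on-disk width; InnoDB stores such integers big-endian, flipping the sign bit of signed columns so that byte-wise order matches numeric order. A sketch of that convention (assumed here rather than copied from mach_write_ulonglong()):

	#include <assert.h>
	#include <string.h>
	#include <stdint.h>

	/* Store 'value' big-endian in 'len' bytes; flip the sign bit when
	the column is signed so memcmp() order equals numeric order. */
	static void
	sketch_write_int(unsigned char* b, uint64_t value, size_t len,
			 int is_unsigned)
	{
		assert(len >= 1 && len <= 8);

		if (!is_unsigned) {
			value ^= 1ULL << (8 * len - 1);	/* flip sign bit */
		}
		for (size_t i = 0; i < len; i++) {
			b[len - 1 - i] = (unsigned char) (value >> (8 * i));
		}
	}

	int
	main(void)
	{
		unsigned char	a[4], b[4];

		sketch_write_int(a, (uint64_t) -1, sizeof a, 0);	/* -1 */
		sketch_write_int(b, 1, sizeof b, 0);			/*  1 */
		assert(memcmp(a, b, sizeof a) < 0);	/* -1 sorts first */
		return(0);
	}
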
We prevent older readers + from accessing this index, to ensure + read consistency. */ + + trx_id_t max_trx_id; + + ut_a(row == NULL); + rw_lock_x_lock( + dict_index_get_lock(buf->index)); + ut_a(dict_index_get_online_status(buf->index) + == ONLINE_INDEX_CREATION); + + max_trx_id = row_log_get_max_trx(buf->index); + + if (max_trx_id > buf->index->trx_id) { + buf->index->trx_id = max_trx_id; + } + + rw_lock_x_unlock( + dict_index_get_lock(buf->index)); + } + + row_merge_buf_write(buf, file, block); + + if (!row_merge_write(file->fd, file->offset++, + block)) { + err = DB_TEMP_FILE_WRITE_FAILURE; + trx->error_key_num = i; + break; + } + + UNIV_MEM_INVALID(&block[0], srv_sort_buf_size); + merge_buf[i] = row_merge_buf_empty(buf); + + if (UNIV_LIKELY(row != NULL)) { + /* Try writing the record again, now + that the buffer has been written out + and emptied. */ + + if (UNIV_UNLIKELY + (!(rows_added = row_merge_buf_add( + buf, fts_index, old_table, + psort_info, row, ext, + &doc_id)))) { + /* An empty buffer should have enough + room for at least one record. */ + ut_error; + } + + file->n_rec += rows_added; + } + } + + if (row == NULL) { + goto all_done; + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + mem_heap_empty(row_heap); + } + +func_exit: + mtr_commit(&mtr); + mem_heap_free(row_heap); + + if (nonnull) { + mem_free(nonnull); + } + +all_done: +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n"); +#endif + if (fts_pll_sort) { + bool all_exit = false; + ulint trial_count = 0; + const ulint max_trial_count = 10000; + +wait_again: + /* Check if error occurs in child thread */ + for (ulint j = 0; j < fts_sort_pll_degree; j++) { + if (psort_info[j].error != DB_SUCCESS) { + err = psort_info[j].error; + trx->error_key_num = j; + break; + } + } + + /* Tell all children that parent has done scanning */ + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (err == DB_SUCCESS) { + psort_info[i].state = FTS_PARENT_COMPLETE; + } else { + psort_info[i].state = FTS_PARENT_EXITING; + } + } + + /* Now wait all children to report back to be completed */ + os_event_wait_time_low(fts_parallel_sort_event, + 1000000, sig_count); + + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (psort_info[i].child_status != FTS_CHILD_COMPLETE + && psort_info[i].child_status != FTS_CHILD_EXITING) { + sig_count = os_event_reset( + fts_parallel_sort_event); + goto wait_again; + } + } + + /* Now all children should complete, wait a bit until + they all finish setting the event, before we free everything. + This has a 10 second timeout */ + do { + all_exit = true; + + for (ulint j = 0; j < fts_sort_pll_degree; j++) { + if (psort_info[j].child_status + != FTS_CHILD_EXITING) { + all_exit = false; + os_thread_sleep(1000); + break; + } + } + trial_count++; + } while (!all_exit && trial_count < max_trial_count); + + if (!all_exit) { + ut_ad(0); + ib_logf(IB_LOG_LEVEL_FATAL, + "Not all child sort threads exited" + " when creating FTS index '%s'", + fts_sort_idx->name); + } + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n"); +#endif + for (ulint i = 0; i < n_index; i++) { + row_merge_buf_free(merge_buf[i]); + } + + row_fts_free_pll_merge_buf(psort_info); + + mem_free(merge_buf); + + btr_pcur_close(&pcur); + + /* Update the next Doc ID we used. Table should be locked, so + no concurrent DML */ + if (max_doc_id && err == DB_SUCCESS) { + /* Sync fts cache for other fts indexes to keep all + fts indexes consistent in sync_doc_id. 
*/ + err = fts_sync_table(const_cast<dict_table_t*>(new_table)); + + if (err == DB_SUCCESS) { + fts_update_next_doc_id( + 0, new_table, old_table->name, max_doc_id); + } + } + + trx->op_info = ""; + + DBUG_RETURN(err); +} + +/** Write a record via buffer 2 and read the next record to buffer N. +@param N number of the buffer (0 or 1) +@param INDEX record descriptor +@param AT_END statement to execute at end of input */ +#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \ + do { \ + b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \ + &buf[2], b2, \ + of->fd, &of->offset, \ + mrec##N, offsets##N); \ + if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \ + goto corrupt; \ + } \ + b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\ + &buf[N], b##N, INDEX, \ + file->fd, foffs##N, \ + &mrec##N, offsets##N); \ + if (UNIV_UNLIKELY(!b##N)) { \ + if (mrec##N) { \ + goto corrupt; \ + } \ + AT_END; \ + } \ + } while (0) + +/*************************************************************//** +Merge two blocks of records on disk and write a bigger block. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_merge_blocks( +/*=============*/ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ + const merge_file_t* file, /*!< in: file containing + index entries */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + ulint* foffs0, /*!< in/out: offset of first + source list in the file */ + ulint* foffs1, /*!< in/out: offset of second + source list in the file */ + merge_file_t* of) /*!< in/out: output file */ +{ + mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ + + mrec_buf_t* buf; /*!< buffer for handling + split mrec in block[] */ + const byte* b0; /*!< pointer to block[0] */ + const byte* b1; /*!< pointer to block[srv_sort_buf_size] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ + const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */ + const mrec_t* mrec1; /*!< merge rec, points to + block[srv_sort_buf_size] or buf[1] */ + ulint* offsets0;/* offsets of mrec0 */ + ulint* offsets1;/* offsets of mrec1 */ + +#ifdef UNIV_DEBUG + if (row_merge_print_block) { + fprintf(stderr, + "row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu" + " = fd=%d ofs=%lu\n", + file->fd, (ulong) *foffs0, + file->fd, (ulong) *foffs1, + of->fd, (ulong) of->offset); + } +#endif /* UNIV_DEBUG */ + + heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1); + + /* Write a record and read the next record. Split the output + file in two halves, which can be merged on the following pass. 
*/ + + if (!row_merge_read(file->fd, *foffs0, &block[0]) + || !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size])) { +corrupt: + mem_heap_free(heap); + return(DB_CORRUPTION); + } + + b0 = &block[0]; + b1 = &block[srv_sort_buf_size]; + b2 = &block[2 * srv_sort_buf_size]; + + b0 = row_merge_read_rec( + &block[0], &buf[0], b0, dup->index, + file->fd, foffs0, &mrec0, offsets0); + b1 = row_merge_read_rec( + &block[srv_sort_buf_size], + &buf[srv_sort_buf_size], b1, dup->index, + file->fd, foffs1, &mrec1, offsets1); + if (UNIV_UNLIKELY(!b0 && mrec0) + || UNIV_UNLIKELY(!b1 && mrec1)) { + + goto corrupt; + } + + while (mrec0 && mrec1) { + switch (cmp_rec_rec_simple( + mrec0, mrec1, offsets0, offsets1, + dup->index, dup->table)) { + case 0: + mem_heap_free(heap); + return(DB_DUPLICATE_KEY); + case -1: + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged); + break; + case 1: + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged); + break; + default: + ut_error; + } + } + +merged: + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0); + } + } +done0: + if (mrec1) { + /* append all mrec1 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1); + } + } +done1: + + mem_heap_free(heap); + b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset); + return(b2 ? DB_SUCCESS : DB_CORRUPTION); +} + +/*************************************************************//** +Copy a block of index entries. +@return TRUE on success, FALSE on failure */ +static __attribute__((nonnull, warn_unused_result)) +ibool +row_merge_blocks_copy( +/*==================*/ + const dict_index_t* index, /*!< in: index being created */ + const merge_file_t* file, /*!< in: input file */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + ulint* foffs0, /*!< in/out: input file offset */ + merge_file_t* of) /*!< in/out: output file */ +{ + mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ + + mrec_buf_t* buf; /*!< buffer for handling + split mrec in block[] */ + const byte* b0; /*!< pointer to block[0] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ + const mrec_t* mrec0; /*!< merge rec, points to block[0] */ + ulint* offsets0;/* offsets of mrec0 */ + ulint* offsets1;/* dummy offsets */ + +#ifdef UNIV_DEBUG + if (row_merge_print_block) { + fprintf(stderr, + "row_merge_blocks_copy fd=%d ofs=%lu" + " = fd=%d ofs=%lu\n", + file->fd, (ulong) foffs0, + of->fd, (ulong) of->offset); + } +#endif /* UNIV_DEBUG */ + + heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1); + + /* Write a record and read the next record. Split the output + file in two halves, which can be merged on the following pass. */ + + if (!row_merge_read(file->fd, *foffs0, &block[0])) { +corrupt: + mem_heap_free(heap); + return(FALSE); + } + + b0 = &block[0]; + + b2 = &block[2 * srv_sort_buf_size]; + + b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, + file->fd, foffs0, &mrec0, offsets0); + if (UNIV_UNLIKELY(!b0 && mrec0)) { + + goto corrupt; + } + + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0); + } + } +done0: + + /* The file offset points to the beginning of the last page + that has been read. Update it to point to the next block. 
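Note that offsets such as foffs0 and of->offset throughout this file count blocks of srv_sort_buf_size bytes, not bytes. A sketch of the read contract this relies on, using a hypothetical helper (the real one is row_merge_read(), which goes through os_file_read() and also issues read-ahead hints):

    #include <unistd.h>

    // Read block number 'block_no' of a merge file into 'buf'.
    static bool read_block(int fd, unsigned long block_no,
                           void* buf, size_t block_size)
    {
        off_t ofs = (off_t) block_no * (off_t) block_size;
        return pread(fd, buf, block_size, ofs) == (ssize_t) block_size;
    }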
*/ + (*foffs0)++; + + mem_heap_free(heap); + return(row_merge_write_eof(&block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset) + != NULL); +} + +/*************************************************************//** +Merge disk files. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull)) +dberr_t +row_merge( +/*======*/ + trx_t* trx, /*!< in: transaction */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ + merge_file_t* file, /*!< in/out: file containing + index entries */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + int* tmpfd, /*!< in/out: temporary file handle */ + ulint* num_run,/*!< in/out: Number of runs remain + to be merged */ + ulint* run_offset) /*!< in/out: Array contains the + first offset number for each merge + run */ +{ + ulint foffs0; /*!< first input offset */ + ulint foffs1; /*!< second input offset */ + dberr_t error; /*!< error code */ + merge_file_t of; /*!< output file */ + const ulint ihalf = run_offset[*num_run / 2]; + /*!< half the input file */ + ulint n_run = 0; + /*!< num of runs generated from this merge */ + + UNIV_MEM_ASSERT_W(&block[0], 3 * srv_sort_buf_size); + + ut_ad(ihalf < file->offset); + + of.fd = *tmpfd; + of.offset = 0; + of.n_rec = 0; + +#ifdef POSIX_FADV_SEQUENTIAL + /* The input file will be read sequentially, starting from the + beginning and the middle. In Linux, the POSIX_FADV_SEQUENTIAL + affects the entire file. Each block will be read exactly once. */ + posix_fadvise(file->fd, 0, 0, + POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE); +#endif /* POSIX_FADV_SEQUENTIAL */ + + /* Merge blocks to the output file. */ + foffs0 = 0; + foffs1 = ihalf; + + UNIV_MEM_INVALID(run_offset, *num_run * sizeof *run_offset); + + for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) { + + if (trx_is_interrupted(trx)) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + error = row_merge_blocks(dup, file, block, + &foffs0, &foffs1, &of); + + if (error != DB_SUCCESS) { + return(error); + } + + } + + /* Copy the last blocks, if there are any. */ + + while (foffs0 < ihalf) { + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs0, &of)) { + return(DB_CORRUPTION); + } + } + + ut_ad(foffs0 == ihalf); + + while (foffs1 < file->offset) { + if (trx_is_interrupted(trx)) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs1, &of)) { + return(DB_CORRUPTION); + } + } + + ut_ad(foffs1 == file->offset); + + if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) { + return(DB_CORRUPTION); + } + + ut_ad(n_run <= *num_run); + + *num_run = n_run; + + /* Each run can contain one or more offsets. As merge goes on, + the number of runs (to merge) will reduce until we have one + single run. So the number of runs will always be smaller than + the number of offsets in file */ + ut_ad((*num_run) <= file->offset); + + /* The number of offsets in output file is always equal or + smaller than input file */ + ut_ad(of.offset <= file->offset); + + /* Swap file descriptors for the next pass. 
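Each pass of row_merge() pairs run k of the first half of the file with run k of the second half, so one pass roughly halves the number of runs; the input file and the scratch output file then trade places, which is what the swap just below does. Sorting N initial runs therefore takes about ceil(log2(N)) passes. A tiny sketch of the pass arithmetic:

    // Number of row_merge() passes needed until a single run remains.
    unsigned merge_passes(unsigned long n_runs)
    {
        unsigned passes = 0;
        while (n_runs > 1) {
            n_runs = (n_runs + 1) / 2; // an odd leftover run is copied through
            passes++;
        }
        return passes;
    }
    // e.g. merge_passes(1000) == 10: a 1000-run file is sorted in 10 passes.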
*/ + *tmpfd = file->fd; + *file = of; + + UNIV_MEM_INVALID(&block[0], 3 * srv_sort_buf_size); + + return(DB_SUCCESS); +} + +/*************************************************************//** +Merge disk files. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_merge_sort( +/*===========*/ + trx_t* trx, /*!< in: transaction */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ + merge_file_t* file, /*!< in/out: file containing + index entries */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + int* tmpfd) /*!< in/out: temporary file handle */ +{ + const ulint half = file->offset / 2; + ulint num_runs; + ulint* run_offset; + dberr_t error = DB_SUCCESS; + DBUG_ENTER("row_merge_sort"); + + /* Record the number of merge runs we need to perform */ + num_runs = file->offset; + + /* If num_runs are less than 1, nothing to merge */ + if (num_runs <= 1) { + DBUG_RETURN(error); + } + + /* "run_offset" records each run's first offset number */ + run_offset = (ulint*) mem_alloc(file->offset * sizeof(ulint)); + + /* This tells row_merge() where to start for the first round + of merge. */ + run_offset[half] = half; + + /* The file should always contain at least one byte (the end + of file marker). Thus, it must be at least one block. */ + ut_ad(file->offset > 0); + + /* Merge the runs until we have one big run */ + do { + error = row_merge(trx, dup, file, block, tmpfd, + &num_runs, run_offset); + + if (error != DB_SUCCESS) { + break; + } + + UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset); + } while (num_runs > 1); + + mem_free(run_offset); + + DBUG_RETURN(error); +} + +/*************************************************************//** +Copy externally stored columns to the data tuple. */ +static __attribute__((nonnull)) +void +row_merge_copy_blobs( +/*=================*/ + const mrec_t* mrec, /*!< in: merge record */ + const ulint* offsets,/*!< in: offsets of mrec */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + dtuple_t* tuple, /*!< in/out: data tuple */ + mem_heap_t* heap) /*!< in/out: memory heap */ +{ + ut_ad(rec_offs_any_extern(offsets)); + + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + ulint len; + const void* data; + dfield_t* field = dtuple_get_nth_field(tuple, i); + + if (!dfield_is_ext(field)) { + continue; + } + + ut_ad(!dfield_is_null(field)); + + /* During the creation of a PRIMARY KEY, the table is + X-locked, and we skip copying records that have been + marked for deletion. Therefore, externally stored + columns cannot possibly be freed between the time the + BLOB pointers are read (row_merge_read_clustered_index()) + and dereferenced (below). */ + data = btr_rec_copy_externally_stored_field( + mrec, offsets, zip_size, i, &len, heap); + /* Because we have locked the table, any records + written by incomplete transactions must have been + rolled back already. There must not be any incomplete + BLOB columns. 
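In outline, this loop turns every externally stored ("ext") field of the tuple from a BLOB-pointer stub into the full column value, materialized into the supplied heap. A simplified sketch with hypothetical types; the real fetch is btr_rec_copy_externally_stored_field():

    #include <cstddef>

    struct field_stub {
        bool        is_ext; // externally stored column?
        const void* data;
        size_t      len;
    };

    static void copy_blobs(field_stub* fields, size_t n_fields,
                           const void* (*fetch_extern)(size_t i, size_t* len))
    {
        for (size_t i = 0; i < n_fields; i++) {
            if (!fields[i].is_ext) {
                continue;
            }
            size_t      len;
            const void* data = fetch_extern(i, &len);
            // data must not be NULL here: the table is locked, so no
            // half-written BLOB can be observed (the ut_a(data) below).
            fields[i].data = data;
            fields[i].len  = len;
        }
    }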
*/ + ut_a(data); + + dfield_set_data(field, data, len); + } +} + +/********************************************************************//** +Read sorted file containing index data tuples and insert these data +tuples to the index +@return DB_SUCCESS or error number */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_merge_insert_index_tuples( +/*==========================*/ + trx_id_t trx_id, /*!< in: transaction identifier */ + dict_index_t* index, /*!< in: index */ + const dict_table_t* old_table,/*!< in: old table */ + int fd, /*!< in: file descriptor */ + row_merge_block_t* block) /*!< in/out: file buffer */ +{ + const byte* b; + mem_heap_t* heap; + mem_heap_t* tuple_heap; + mem_heap_t* ins_heap; + dberr_t error = DB_SUCCESS; + ulint foffs = 0; + ulint* offsets; + mrec_buf_t* buf; + DBUG_ENTER("row_merge_insert_index_tuples"); + + ut_ad(!srv_read_only_mode); + ut_ad(!(index->type & DICT_FTS)); + ut_ad(trx_id); + + tuple_heap = mem_heap_create(1000); + + { + ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); + ins_heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); + offsets = static_cast<ulint*>( + mem_heap_alloc(heap, i * sizeof *offsets)); + offsets[0] = i; + offsets[1] = dict_index_get_n_fields(index); + } + + b = block; + + if (!row_merge_read(fd, foffs, block)) { + error = DB_CORRUPTION; + } else { + buf = static_cast<mrec_buf_t*>( + mem_heap_alloc(heap, sizeof *buf)); + + for (;;) { + const mrec_t* mrec; + dtuple_t* dtuple; + ulint n_ext; + big_rec_t* big_rec; + rec_t* rec; + btr_cur_t cursor; + mtr_t mtr; + + b = row_merge_read_rec(block, buf, b, index, + fd, &foffs, &mrec, offsets); + if (UNIV_UNLIKELY(!b)) { + /* End of list, or I/O error */ + if (mrec) { + error = DB_CORRUPTION; + } + break; + } + + dict_index_t* old_index + = dict_table_get_first_index(old_table); + + if (dict_index_is_clust(index) + && dict_index_is_online_ddl(old_index)) { + error = row_log_table_get_error(old_index); + if (error != DB_SUCCESS) { + break; + } + } + + dtuple = row_rec_to_index_entry_low( + mrec, index, offsets, &n_ext, tuple_heap); + + if (!n_ext) { + /* There are no externally stored columns. */ + } else { + ut_ad(dict_index_is_clust(index)); + /* Off-page columns can be fetched safely + when concurrent modifications to the table + are disabled. (Purge can process delete-marked + records, but row_merge_read_clustered_index() + would have skipped them.) + + When concurrent modifications are enabled, + row_merge_read_clustered_index() will + only see rows from transactions that were + committed before the ALTER TABLE started + (REPEATABLE READ). + + Any modifications after the + row_merge_read_clustered_index() scan + will go through row_log_table_apply(). + Any modifications to off-page columns + will be tracked by + row_log_table_blob_alloc() and + row_log_table_blob_free(). */ + row_merge_copy_blobs( + mrec, offsets, + dict_table_zip_size(old_table), + dtuple, tuple_heap); + } + + ut_ad(dtuple_validate(dtuple)); + log_free_check(); + + mtr_start(&mtr); + /* Insert after the last user record. */ + btr_cur_open_at_index_side( + false, index, BTR_MODIFY_LEAF, + &cursor, 0, &mtr); + page_cur_position( + page_rec_get_prev(btr_cur_get_rec(&cursor)), + btr_cur_get_block(&cursor), + btr_cur_get_page_cur(&cursor)); + cursor.flag = BTR_CUR_BINARY; +#ifdef UNIV_DEBUG + /* Check that the records are inserted in order. 
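Because the input file is already sorted, every tuple can be appended at the rightmost leaf; the debug check below merely asserts that ordering. The insert that follows it uses the standard optimistic/pessimistic B-tree pattern: try the cheap leaf-page-only insert first, and redo the work with the page-splitting variant only on DB_FAIL. A sketch of the fallback, with function pointers standing in for btr_cur_optimistic_insert() and btr_cur_pessimistic_insert():

    enum class Err { SUCCESS, FAIL, OTHER };

    static Err insert_with_fallback(Err (*optimistic_insert)(),
                                    Err (*pessimistic_insert)())
    {
        Err err = optimistic_insert();  // leaf page only; FAIL if it is full
        if (err == Err::FAIL) {
            // Restart the mini-transaction and reposition the cursor,
            // then insert again, splitting pages as needed.
            err = pessimistic_insert();
        }
        return err;
    }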
*/ + rec = btr_cur_get_rec(&cursor); + + if (!page_rec_is_infimum(rec)) { + ulint* rec_offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &tuple_heap); + ut_ad(cmp_dtuple_rec(dtuple, rec, rec_offsets) + > 0); + } +#endif /* UNIV_DEBUG */ + ulint* ins_offsets = NULL; + + error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG, + &cursor, &ins_offsets, &ins_heap, + dtuple, &rec, &big_rec, 0, NULL, &mtr); + + if (error == DB_FAIL) { + ut_ad(!big_rec); + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_open_at_index_side( + false, index, BTR_MODIFY_TREE, + &cursor, 0, &mtr); + page_cur_position( + page_rec_get_prev(btr_cur_get_rec( + &cursor)), + btr_cur_get_block(&cursor), + btr_cur_get_page_cur(&cursor)); + + error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG, + &cursor, &ins_offsets, &ins_heap, + dtuple, &rec, &big_rec, 0, NULL, &mtr); + } + + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + + mtr_commit(&mtr); + + if (UNIV_LIKELY_NULL(big_rec)) { + /* If the system crashes at this + point, the clustered index record will + contain a null BLOB pointer. This + should not matter, because the copied + table will be dropped on crash + recovery anyway. */ + + ut_ad(dict_index_is_clust(index)); + ut_ad(error == DB_SUCCESS); + error = row_ins_index_entry_big_rec( + dtuple, big_rec, + ins_offsets, &ins_heap, + index, NULL, __FILE__, __LINE__); + dtuple_convert_back_big_rec( + index, dtuple, big_rec); + } + + if (error != DB_SUCCESS) { + goto err_exit; + } + + mem_heap_empty(tuple_heap); + mem_heap_empty(ins_heap); + } + } + +err_exit: + mem_heap_free(tuple_heap); + mem_heap_free(ins_heap); + mem_heap_free(heap); + + DBUG_RETURN(error); +} + +/*********************************************************************//** +Sets an exclusive lock on a table, for the duration of creating indexes. 
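The body retries the lock request after every handled lock wait. In skeleton form, as a sketch only, with hypothetical callbacks replacing lock_table() and row_mysql_handle_errors():

    enum class LockErr { SUCCESS, LOCK_WAIT, OTHER };

    static LockErr lock_with_retry(LockErr (*request_lock)(),
                                   bool (*handle_error)(LockErr))
    {
        for (;;) {
            LockErr err = request_lock();
            if (err == LockErr::SUCCESS) {
                return err;
            }
            if (!handle_error(err)) {
                return err;    // a real error: give up
            }
            // The lock wait (or suspended thread) was resolved: run again.
        }
    }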
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_merge_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ +{ + mem_heap_t* heap; + que_thr_t* thr; + dberr_t err; + sel_node_t* node; + + ut_ad(!srv_read_only_mode); + ut_ad(mode == LOCK_X || mode == LOCK_S); + + heap = mem_heap_create(512); + + trx->op_info = "setting table lock for creating or dropping index"; + + node = sel_node_create(heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + thr->graph->state = QUE_FORK_ACTIVE; + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = static_cast<que_thr_t*>( + que_fork_get_first_thr( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + err = lock_table(0, table, mode, thr); + + trx->error_state = err; + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + que_thr_stop_for_mysql_no_error(thr, trx); + } else { + que_thr_stop_for_mysql(thr); + + if (err != DB_QUE_THR_SUSPENDED) { + bool was_lock_wait; + + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + } else { + que_thr_t* run_thr; + que_node_t* parent; + + parent = que_node_get_parent(thr); + + run_thr = que_fork_start_command( + static_cast<que_fork_t*>(parent)); + + ut_a(run_thr == thr); + + /* There was a lock wait but the thread was not + in a ready to run or running state. */ + trx->error_state = DB_LOCK_WAIT; + + goto run_again; + } + } + + que_graph_free(thr->graph); + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Drop an index that was created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +static +void +row_merge_drop_index_dict( +/*======================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + index_id_t index_id)/*!< in: index identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEX_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n" + "END;\n"; + dberr_t error; + pars_info_t* info; + + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + info = pars_info_create(); + pars_info_add_ull_literal(info, "indexid", index_id); + trx->op_info = "dropping index from dictionary"; + error = que_eval_sql(info, sql, FALSE, trx); + + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: row_merge_drop_index_dict " + "failed with error code: %u.\n", (unsigned) error); + } + + trx->op_info = ""; +} + +/*********************************************************************//** +Drop indexes that were created before an error occurred. 
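Both this function and row_merge_drop_index_dict() above follow the same recipe for dictionary maintenance through InnoDB's private SQL parser: bind the literals, evaluate a PROCEDURE, and on failure reset trx->error_state and log rather than abort. The recurring shape, in outline (simplified from the calls above; que_eval_sql() consumes the pars_info_t, which is why it is never freed explicitly here):

    pars_info_t* info = pars_info_create();
    pars_info_add_ull_literal(info, "indexid", index_id);

    dberr_t err = que_eval_sql(info,
        "PROCEDURE P () IS\n"
        "BEGIN\n"
        "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n"
        "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n"
        "END;\n",
        FALSE, trx);

    if (err != DB_SUCCESS) {
        trx->error_state = DB_SUCCESS;  // e.g. DB_TOO_MANY_CONCURRENT_TRXS
        /* log the failure and carry on */
    }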
+The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +UNIV_INTERN +void +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE TABLE_ID=:tableid AND\n" + " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + + "END;\n"; + dberr_t error; + pars_info_t* info; + + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by dict_operation_lock. */ + + info = pars_info_create(); + pars_info_add_ull_literal(info, "tableid", table_id); + trx->op_info = "dropping indexes"; + error = que_eval_sql(info, sql, FALSE, trx); + + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: row_merge_drop_indexes_dict " + "failed with error code: %u.\n", (unsigned) error); + } + + trx->op_info = ""; +} + +/*********************************************************************//** +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +UNIV_INTERN +void +row_merge_drop_indexes( +/*===================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + dict_table_t* table, /*!< in/out: table containing the indexes */ + ibool locked) /*!< in: TRUE=table locked, + FALSE=may need to do a lazy drop */ +{ + dict_index_t* index; + dict_index_t* next_index; + + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = dict_table_get_first_index(table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE); + + /* the caller should have an open handle to the table */ + ut_ad(table->n_ref_count >= 1); + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by dict_operation_lock. 
*/ + + if (!locked && table->n_ref_count > 1) { + /* We will have to drop the indexes later, when the + table is guaranteed to be no longer in use. Mark the + indexes as incomplete and corrupted, so that other + threads will stop using them. Let dict_table_close() + or crash recovery or the next invocation of + prepare_inplace_alter_table() take care of dropping + the indexes. */ + + while ((index = dict_table_get_next_index(index)) != NULL) { + ut_ad(!dict_index_is_clust(index)); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_ABORTED_DROPPED: + continue; + case ONLINE_INDEX_COMPLETE: + if (*index->name != TEMP_INDEX_PREFIX) { + /* Do nothing to already + published indexes. */ + } else if (index->type & DICT_FTS) { + /* Drop a completed FULLTEXT + index, due to a timeout during + MDL upgrade for + commit_inplace_alter_table(). + Because only concurrent reads + are allowed (and they are not + seeing this index yet) we + are safe to drop the index. */ + dict_index_t* prev = UT_LIST_GET_PREV( + indexes, index); + /* At least there should be + the clustered index before + this one. */ + ut_ad(prev); + ut_a(table->fts); + fts_drop_index(table, index, trx); + /* Since + INNOBASE_SHARE::idx_trans_tbl + is shared between all open + ha_innobase handles to this + table, no thread should be + accessing this dict_index_t + object. Also, we should be + holding LOCK=SHARED MDL on the + table even after the MDL + upgrade timeout. */ + + /* We can remove a DICT_FTS + index from the cache, because + we do not allow ADD FULLTEXT INDEX + with LOCK=NONE. If we allowed that, + we should exclude FTS entries from + prebuilt->ins_node->entry_list + in ins_node_create_entry_list(). */ + dict_index_remove_from_cache( + table, index); + index = prev; + } else { + rw_lock_x_lock( + dict_index_get_lock(index)); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED); + index->type |= DICT_CORRUPT; + table->drop_aborted = TRUE; + goto drop_aborted; + } + continue; + case ONLINE_INDEX_CREATION: + rw_lock_x_lock(dict_index_get_lock(index)); + ut_ad(*index->name == TEMP_INDEX_PREFIX); + row_log_abort_sec(index); + drop_aborted: + rw_lock_x_unlock(dict_index_get_lock(index)); + + DEBUG_SYNC_C("merge_drop_index_after_abort"); + /* covered by dict_sys->mutex */ + MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX); + /* fall through */ + case ONLINE_INDEX_ABORTED: + /* Drop the index tree from the + data dictionary and free it from + the tablespace, but keep the object + in the data dictionary cache. */ + row_merge_drop_index_dict(trx, index->id); + rw_lock_x_lock(dict_index_get_lock(index)); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED_DROPPED); + rw_lock_x_unlock(dict_index_get_lock(index)); + table->drop_aborted = TRUE; + continue; + } + ut_error; + } + + return; + } + + row_merge_drop_indexes_dict(trx, table->id); + + /* Invalidate all row_prebuilt_t::ins_graph that are referring + to this table. That is, force row_get_prebuilt_insert_row() to + rebuild prebuilt->ins_node->entry_list). 
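The bump of table->def_trx_id just below is version-counter invalidation: cached structures remember the counter value they were built against and are rebuilt once it has moved. The general shape, as a sketch with hypothetical types:

    struct table_def   { unsigned long version;   }; // table->def_trx_id
    struct cached_plan { unsigned long built_for; }; // recorded at build time

    static bool plan_is_stale(const cached_plan* p, const table_def* t)
    {
        return p->built_for != t->version; // rebuild ins_node->entry_list
    }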
*/ + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; + + next_index = dict_table_get_next_index(index); + + while ((index = next_index) != NULL) { + /* read the next pointer before freeing the index */ + next_index = dict_table_get_next_index(index); + + ut_ad(!dict_index_is_clust(index)); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* If it is FTS index, drop from table->fts + and also drop its auxiliary tables */ + if (index->type & DICT_FTS) { + ut_a(table->fts); + fts_drop_index(table, index, trx); + } + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + /* This state should only be possible + when prepare_inplace_alter_table() fails + after invoking row_merge_create_index(). + In inplace_alter_table(), + row_merge_build_indexes() + should never leave the index in this state. + It would invoke row_log_abort_sec() on + failure. */ + case ONLINE_INDEX_COMPLETE: + /* In these cases, we are able to drop + the index straight. The DROP INDEX was + never deferred. */ + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* covered by dict_sys->mutex */ + MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX); + } + + dict_index_remove_from_cache(table, index); + } + } + + table->drop_aborted = FALSE; + ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); +} + +/*********************************************************************//** +Drop all partially created indexes during crash recovery. */ +UNIV_INTERN +void +row_merge_drop_temp_indexes(void) +/*=============================*/ +{ + static const char sql[] = + "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + "END;\n"; + trx_t* trx; + dberr_t error; + + /* Load the table definitions that contain partially defined + indexes, so that the data dictionary information can be checked + when accessing the tablename.ibd files. */ + trx = trx_allocate_for_background(); + trx->op_info = "dropping partially created indexes"; + row_mysql_lock_data_dictionary(trx); + /* Ensure that this transaction will be rolled back and locks + will be released, if the server gets killed before the commit + gets written to the redo log. */ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + trx->op_info = "dropping indexes"; + error = que_eval_sql(NULL, sql, FALSE, trx); + + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: row_merge_drop_temp_indexes " + "failed with error code: %u.\n", (unsigned) error); + } + + trx_commit_for_mysql(trx); + row_mysql_unlock_data_dictionary(trx); + trx_free_for_background(trx); +} + +/*********************************************************************//** +Creates temporary merge files, and if UNIV_PFS_IO defined, register +the file descriptor with Performance Schema. 
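Because innobase_mysql_tmpfile() bypasses the instrumented file API, the open below is bracketed by explicit begin/end hooks so that Performance Schema still sees it. The bracket pattern in the abstract, with hypothetical hook types:

    static int open_instrumented(int (*raw_open)(),
                                 void (*psi_begin)(),
                                 void (*psi_end)(int fd))
    {
        psi_begin();            // register_pfs_file_open_begin(...)
        int fd = raw_open();    // the uninstrumented open itself
        psi_end(fd);            // register_pfs_file_open_end(...)
        return fd;
    }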
+@return file descriptor, or -1 on failure */ +UNIV_INTERN +int +row_merge_file_create_low(void) +/*===========================*/ +{ + int fd; +#ifdef UNIV_PFS_IO + /* This temp file open does not go through normal + file APIs, add instrumentation to register with + performance schema */ + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + register_pfs_file_open_begin(&state, locker, innodb_file_temp_key, + PSI_FILE_OPEN, + "Innodb Merge Temp File", + __FILE__, __LINE__); +#endif + fd = innobase_mysql_tmpfile(); +#ifdef UNIV_PFS_IO + register_pfs_file_open_end(locker, fd); +#endif + + if (fd < 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create temporary merge file"); + return (-1); + } + return(fd); +} + +/*********************************************************************//** +Create a merge file. +@return file descriptor, or -1 on failure */ +UNIV_INTERN +int +row_merge_file_create( +/*==================*/ + merge_file_t* merge_file) /*!< out: merge file structure */ +{ + merge_file->fd = row_merge_file_create_low(); + merge_file->offset = 0; + merge_file->n_rec = 0; + + if (merge_file->fd >= 0) { + if (srv_disable_sort_file_cache) { + os_file_set_nocache(merge_file->fd, + "row0merge.cc", "sort"); + } + } + return(merge_file->fd); +} + +/*********************************************************************//** +Destroy a merge file. And de-register the file from Performance Schema +if UNIV_PFS_IO is defined. */ +UNIV_INTERN +void +row_merge_file_destroy_low( +/*=======================*/ + int fd) /*!< in: merge file descriptor */ +{ +#ifdef UNIV_PFS_IO + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + register_pfs_file_io_begin(&state, locker, + fd, 0, PSI_FILE_CLOSE, + __FILE__, __LINE__); +#endif + if (fd >= 0) { + close(fd); + } +#ifdef UNIV_PFS_IO + register_pfs_file_io_end(locker, 0); +#endif +} +/*********************************************************************//** +Destroy a merge file. */ +UNIV_INTERN +void +row_merge_file_destroy( +/*===================*/ + merge_file_t* merge_file) /*!< in/out: merge file structure */ +{ + ut_ad(!srv_read_only_mode); + + if (merge_file->fd != -1) { + row_merge_file_destroy_low(merge_file->fd); + merge_file->fd = -1; + } +} + +/*********************************************************************//** +Rename an index in the dictionary that was created. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ +{ + dberr_t err = DB_SUCCESS; + pars_info_t* info = pars_info_create(); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. 
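The rename below relies on InnoDB's index-naming convention: an index that is not yet committed (or is doomed) carries TEMP_INDEX_PREFIX as the first byte of its name, which is why the drop procedures above select on SUBSTR(NAME,0,1), i.e. SUBSTR is 0-based in this SQL dialect. Publishing an index strips that first byte (SUBSTR(NAME,1,LENGTH(NAME)-1)); scheduling a drop prepends it again (CONCAT). A string-level sketch of the two renames, with kTempPrefix as a stand-in for TEMP_INDEX_PREFIX:

    #include <string>

    static const char kTempPrefix = '\377';

    std::string publish_name(const std::string& tmp_name)
    {
        return tmp_name.substr(1);                 // drop the marker byte
    }

    std::string doom_name(const std::string& name)
    {
        return std::string(1, kTempPrefix) + name; // hide from readers again
    }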
*/ + + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" + "END;\n"; + + ut_ad(trx); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + + trx->op_info = "renaming index to add"; + + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); + + err = que_eval_sql(info, rename_index, FALSE, trx); + + if (err != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: row_merge_rename_index_to_add " + "failed with error code: %u.\n", (unsigned) err); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Rename an index in the dictionary that is to be dropped. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +row_merge_rename_index_to_drop( +/*===========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ +{ + dberr_t err; + pars_info_t* info = pars_info_create(); + + ut_ad(!srv_read_only_mode); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. */ + + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=CONCAT('" + TEMP_INDEX_PREFIX_STR "',NAME)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" + "END;\n"; + + ut_ad(trx); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + + trx->op_info = "renaming index to drop"; + + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); + + err = que_eval_sql(info, rename_index, FALSE, trx); + + if (err != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: row_merge_rename_index_to_drop " + "failed with error code: %u.\n", (unsigned) err); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Provide a new pathname for a table that is being renamed if it belongs to +a file-per-table tablespace. The caller is responsible for freeing the +memory allocated for the return value. 
+@return new pathname of tablespace file, or NULL if space = 0 */ +UNIV_INTERN +char* +row_make_new_pathname( +/*==================*/ + dict_table_t* table, /*!< in: table to be renamed */ + const char* new_name) /*!< in: new name */ +{ + char* new_path; + char* old_path; + + ut_ad(table->space != TRX_SYS_SPACE); + + old_path = fil_space_get_first_path(table->space); + ut_a(old_path); + + new_path = os_file_make_new_pathname(old_path, new_name); + + mem_free(old_path); + + return(new_path); +} + +/*********************************************************************//** +Rename the tables in the data dictionary. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_merge_rename_tables_dict( +/*=========================*/ + dict_table_t* old_table, /*!< in/out: old table, renamed to + tmp_name */ + dict_table_t* new_table, /*!< in/out: new table, renamed to + old_table->name */ + const char* tmp_name, /*!< in: new name for old_table */ + trx_t* trx) /*!< in/out: dictionary transaction */ +{ + dberr_t err = DB_ERROR; + pars_info_t* info; + + ut_ad(!srv_read_only_mode); + ut_ad(old_table != new_table); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE + || trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); + + trx->op_info = "renaming tables"; + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data in system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_name", new_table->name); + pars_info_add_str_literal(info, "old_name", old_table->name); + pars_info_add_str_literal(info, "tmp_name", tmp_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLES () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET NAME = :tmp_name\n" + " WHERE NAME = :old_name;\n" + "UPDATE SYS_TABLES SET NAME = :old_name\n" + " WHERE NAME = :new_name;\n" + "END;\n", FALSE, trx); + + /* Update SYS_TABLESPACES and SYS_DATAFILES if the old + table is in a non-system tablespace where space > 0. */ + if (err == DB_SUCCESS + && old_table->space != TRX_SYS_SPACE + && !old_table->ibd_file_missing) { + /* Make pathname to update SYS_DATAFILES. */ + char* tmp_path = row_make_new_pathname(old_table, tmp_name); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "tmp_name", tmp_name); + pars_info_add_str_literal(info, "tmp_path", tmp_path); + pars_info_add_int4_literal(info, "old_space", + (lint) old_table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_OLD_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :tmp_name\n" + " WHERE SPACE = :old_space;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :tmp_path\n" + " WHERE SPACE = :old_space;\n" + "END;\n", FALSE, trx); + + mem_free(tmp_path); + } + + /* Update SYS_TABLESPACES and SYS_DATAFILES if the new + table is in a non-system tablespace where space > 0. */ + if (err == DB_SUCCESS && new_table->space != TRX_SYS_SPACE) { + /* Make pathname to update SYS_DATAFILES. 
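The two UPDATEs on SYS_TABLES above implement a swap through a temporary name: the row that held old_name moves to tmp_name, and the row that held new_name takes old_name, all inside one uncommitted dictionary transaction so that a failure rolls back both steps together. Modeling SYS_TABLES as a name-to-id map makes the effect explicit (a sketch, not the real dictionary code):

    #include <map>
    #include <string>

    void rename_tables(std::map<std::string, unsigned long>& sys_tables,
                       const std::string& old_name,
                       const std::string& new_name,
                       const std::string& tmp_name)
    {
        unsigned long old_id = sys_tables.at(old_name);
        unsigned long new_id = sys_tables.at(new_name);

        sys_tables.erase(old_name);
        sys_tables.erase(new_name);

        sys_tables[tmp_name] = old_id; // old table parked under tmp_name
        sys_tables[old_name] = new_id; // new table published as old_name
    }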
*/ + char* old_path = row_make_new_pathname( + new_table, old_table->name); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "old_name", old_table->name); + pars_info_add_str_literal(info, "old_path", old_path); + pars_info_add_int4_literal(info, "new_space", + (lint) new_table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_NEW_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :old_name\n" + " WHERE SPACE = :new_space;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :old_path\n" + " WHERE SPACE = :new_space;\n" + "END;\n", FALSE, trx); + + mem_free(old_path); + } + + if (err == DB_SUCCESS && dict_table_is_discarded(new_table)) { + err = row_import_update_discarded_flag( + trx, new_table->id, true, true); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Create and execute a query graph for creating an index. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_merge_create_index_graph( +/*=========================*/ + trx_t* trx, /*!< in: trx */ + dict_table_t* table, /*!< in: table */ + dict_index_t* index) /*!< in: index */ +{ + ind_node_t* node; /*!< Index creation node */ + mem_heap_t* heap; /*!< Memory heap */ + que_thr_t* thr; /*!< Query thread */ + dberr_t err; + + ut_ad(trx); + ut_ad(table); + ut_ad(index); + + heap = mem_heap_create(512); + + index->table = table; + node = ind_create_graph_create(index, heap, false); + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); + + que_run_threads(thr); + + err = trx->error_state; + + que_graph_free((que_t*) que_node_get_parent(thr)); + + return(err); +} + +/*********************************************************************//** +Create the index and load in to the dictionary. +@return index, or NULL on error */ +UNIV_INTERN +dict_index_t* +row_merge_create_index( +/*===================*/ + trx_t* trx, /*!< in/out: trx (sets error_state) */ + dict_table_t* table, /*!< in: the index is on this table */ + const index_def_t* index_def) + /*!< in: the index definition */ +{ + dict_index_t* index; + dberr_t err; + ulint n_fields = index_def->n_fields; + ulint i; + + ut_ad(!srv_read_only_mode); + + /* Create the index prototype, using the passed in def, this is not + a persistent operation. We pass 0 as the space id, and determine at + a lower level the space id where to store the table. */ + + index = dict_mem_index_create(table->name, index_def->name, + 0, index_def->ind_type, n_fields); + + ut_a(index); + + for (i = 0; i < n_fields; i++) { + index_field_t* ifield = &index_def->fields[i]; + + dict_mem_index_add_field( + index, dict_table_get_col_name(table, ifield->col_no), + ifield->prefix_len); + } + + /* Add the index to SYS_INDEXES, using the index prototype. */ + err = row_merge_create_index_graph(trx, table, index); + + if (err == DB_SUCCESS) { + + index = dict_table_get_index_on_name(table, index_def->name); + + ut_a(index); + + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. */ + ut_ad(index->trx_id == trx->id); + } else { + index = NULL; + } + + return(index); +} + +/*********************************************************************//** +Check if a transaction can use an index. 
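The check below enforces an MVCC rule: an index created by transaction T is usable only by readers whose read view sees T as committed; older consistent reads must keep ignoring it (this is the index->trx_id noted in row_merge_create_index() above). A deliberately simplified sketch of the visibility test; the real read_view_sees_trx_id() also consults the view's list of transactions that were active when the view was created:

    struct read_view_sketch {
        unsigned long up_limit_id;  // all trx ids below this are visible
    };

    static bool view_sees(const read_view_sketch* v, unsigned long trx_id)
    {
        return trx_id < v->up_limit_id; // fast path of the real check
    }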
*/ +UNIV_INTERN +ibool +row_merge_is_index_usable( +/*======================*/ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to check */ +{ + if (!dict_index_is_clust(index) + && dict_index_is_online_ddl(index)) { + /* Indexes that are being created are not useable. */ + return(FALSE); + } + + return(!dict_index_is_corrupted(index) + && (dict_table_is_temporary(index->table) + || !trx->read_view + || read_view_sees_trx_id(trx->read_view, index->trx_id))); +} + +/*********************************************************************//** +Drop a table. The caller must have ensured that the background stats +thread is not processing the table. This can be done by calling +dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and +before calling this function. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_merge_drop_table( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table to drop */ +{ + ut_ad(!srv_read_only_mode); + + /* There must be no open transactions on the table. */ + ut_a(table->n_ref_count == 0); + + return(row_drop_table_for_mysql(table->name, trx, false, false)); +} + +/*********************************************************************//** +Build indexes on a table by reading a clustered index, +creating a temporary file containing index entries, merge sorting +these index entries and inserting sorted index entries to indexes. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_merge_build_indexes( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* old_table, /*!< in: table where rows are + read from */ + dict_table_t* new_table, /*!< in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ + dict_index_t** indexes, /*!< in: indexes to be created */ + const ulint* key_numbers, /*!< in: MySQL key numbers */ + ulint n_indexes, /*!< in: size of indexes[] */ + struct TABLE* table, /*!< in/out: MySQL table, for + reporting erroneous key value + if applicable */ + const dtuple_t* add_cols, /*!< in: default values of + added columns, or NULL */ + const ulint* col_map, /*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence) /*!< in: autoinc instance if + add_autoinc != ULINT_UNDEFINED */ +{ + merge_file_t* merge_files; + row_merge_block_t* block; + ulint block_size; + ulint i; + ulint j; + dberr_t error; + int tmpfd = -1; + dict_index_t* fts_sort_idx = NULL; + fts_psort_t* psort_info = NULL; + fts_psort_t* merge_info = NULL; + ib_int64_t sig_count = 0; + bool fts_psort_initiated = false; + DBUG_ENTER("row_merge_build_indexes"); + + ut_ad(!srv_read_only_mode); + ut_ad((old_table == new_table) == !col_map); + ut_ad(!add_cols || col_map); + + /* Allocate memory for merge file data structure and initialize + fields */ + + block_size = 3 * srv_sort_buf_size; + block = static_cast<row_merge_block_t*>( + os_mem_alloc_large(&block_size, FALSE)); + + if (block == NULL) { + DBUG_RETURN(DB_OUT_OF_MEMORY); + } + + trx_start_if_not_started_xa(trx); + + merge_files = static_cast<merge_file_t*>( + mem_alloc(n_indexes * sizeof *merge_files)); + + /* Initialize all the merge file descriptors, so that we + don't call row_merge_file_destroy() on uninitialized + merge file 
descriptor */ + + for (i = 0; i < n_indexes; i++) { + merge_files[i].fd = -1; + } + + for (i = 0; i < n_indexes; i++) { + if (row_merge_file_create(&merge_files[i]) < 0) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + if (indexes[i]->type & DICT_FTS) { + ibool opt_doc_id_size = FALSE; + + /* To build FTS index, we would need to extract + doc's word, Doc ID, and word's position, so + we need to build a "fts sort index" indexing + on above three 'fields' */ + fts_sort_idx = row_merge_create_fts_sort_index( + indexes[i], old_table, &opt_doc_id_size); + + row_merge_dup_t* dup = static_cast<row_merge_dup_t*>( + ut_malloc(sizeof *dup)); + dup->index = fts_sort_idx; + dup->table = table; + dup->col_map = col_map; + dup->n_dup = 0; + + row_fts_psort_info_init( + trx, dup, new_table, opt_doc_id_size, + &psort_info, &merge_info); + + /* "We need to ensure that we free the resources + allocated */ + fts_psort_initiated = true; + } + } + + tmpfd = row_merge_file_create_low(); + + if (tmpfd < 0) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + /* Reset the MySQL row buffer that is used when reporting + duplicate keys. */ + innobase_rec_reset(table); + + /* Read clustered index of the table and create files for + secondary index entries for merge sort */ + + error = row_merge_read_clustered_index( + trx, table, old_table, new_table, online, indexes, + fts_sort_idx, psort_info, merge_files, key_numbers, + n_indexes, add_cols, col_map, + add_autoinc, sequence, block); + + if (error != DB_SUCCESS) { + + goto func_exit; + } + + DEBUG_SYNC_C("row_merge_after_scan"); + + /* Now we have files containing index entries ready for + sorting and inserting. */ + + for (i = 0; i < n_indexes; i++) { + dict_index_t* sort_idx = indexes[i]; + + if (indexes[i]->type & DICT_FTS) { + os_event_t fts_parallel_merge_event; + + sort_idx = fts_sort_idx; + + fts_parallel_merge_event + = merge_info[0].psort_common->merge_event; + + if (FTS_PLL_MERGE) { + ulint trial_count = 0; + bool all_exit = false; + + os_event_reset(fts_parallel_merge_event); + row_fts_start_parallel_merge(merge_info); +wait_again: + os_event_wait_time_low( + fts_parallel_merge_event, 1000000, + sig_count); + + for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { + if (merge_info[j].child_status + != FTS_CHILD_COMPLETE + && merge_info[j].child_status + != FTS_CHILD_EXITING) { + sig_count = os_event_reset( + fts_parallel_merge_event); + + goto wait_again; + } + } + + /* Now all children should complete, wait + a bit until they all finish using event */ + while (!all_exit && trial_count < 10000) { + all_exit = true; + + for (j = 0; j < FTS_NUM_AUX_INDEX; + j++) { + if (merge_info[j].child_status + != FTS_CHILD_EXITING) { + all_exit = false; + os_thread_sleep(1000); + break; + } + } + trial_count++; + } + + if (!all_exit) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Not all child merge threads" + " exited when creating FTS" + " index '%s'", + indexes[i]->name); + } + } else { + /* This cannot report duplicates; an + assertion would fail in that case. */ + error = row_fts_merge_insert( + sort_idx, new_table, + psort_info, 0); + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); +#endif + } else { + row_merge_dup_t dup = { + sort_idx, table, col_map, 0}; + + error = row_merge_sort( + trx, &dup, &merge_files[i], + block, &tmpfd); + + if (error == DB_SUCCESS) { + error = row_merge_insert_index_tuples( + trx->id, sort_idx, old_table, + merge_files[i].fd, block); + } + } + + /* Close the temporary file to free up space. 
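Two cleanup habits keep this function's error handling simple: every merge file descriptor is initialized to the sentinel -1 before anything can fail (the loop at the top), so the single func_exit path may destroy all of them unconditionally; and each file is destroyed as soon as its index has been built rather than at the very end, releasing temporary disk space progressively. The sentinel idiom, in miniature:

    #include <unistd.h>

    struct merge_file_sketch { int fd; };

    static void destroy_all(merge_file_sketch* files, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            if (files[i].fd != -1) {   // -1 == never created: skip safely
                close(files[i].fd);
                files[i].fd = -1;
            }
        }
    }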
*/ + row_merge_file_destroy(&merge_files[i]); + + if (indexes[i]->type & DICT_FTS) { + row_fts_psort_info_destroy(psort_info, merge_info); + fts_psort_initiated = false; + } else if (error != DB_SUCCESS || !online) { + /* Do not apply any online log. */ + } else if (old_table != new_table) { + ut_ad(!sort_idx->online_log); + ut_ad(sort_idx->online_status + == ONLINE_INDEX_COMPLETE); + } else { + DEBUG_SYNC_C("row_log_apply_before"); + error = row_log_apply(trx, sort_idx, table); + DEBUG_SYNC_C("row_log_apply_after"); + } + + if (error != DB_SUCCESS) { + trx->error_key_num = key_numbers[i]; + goto func_exit; + } + + if (indexes[i]->type & DICT_FTS && fts_enable_diag_print) { + char* name = (char*) indexes[i]->name; + + if (*name == TEMP_INDEX_PREFIX) { + name++; + } + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Finished building " + "full-text index %s\n", name); + } + } + +func_exit: + DBUG_EXECUTE_IF( + "ib_build_indexes_too_many_concurrent_trxs", + error = DB_TOO_MANY_CONCURRENT_TRXS; + trx->error_state = error;); + + if (fts_psort_initiated) { + /* Clean up FTS psort related resource */ + row_fts_psort_info_destroy(psort_info, merge_info); + fts_psort_initiated = false; + } + + row_merge_file_destroy_low(tmpfd); + + for (i = 0; i < n_indexes; i++) { + row_merge_file_destroy(&merge_files[i]); + } + + if (fts_sort_idx) { + dict_mem_index_free(fts_sort_idx); + } + + mem_free(merge_files); + os_mem_free_large(block, block_size); + + DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); + + if (online && old_table == new_table && error != DB_SUCCESS) { + /* On error, flag all online secondary index creation + as aborted. */ + for (i = 0; i < n_indexes; i++) { + ut_ad(!(indexes[i]->type & DICT_FTS)); + ut_ad(*indexes[i]->name == TEMP_INDEX_PREFIX); + ut_ad(!dict_index_is_clust(indexes[i])); + + /* Completed indexes should be dropped as + well, and indexes whose creation was aborted + should be dropped from the persistent + storage. However, at this point we can only + set some flags in the not-yet-published + indexes. These indexes will be dropped later + in row_merge_drop_indexes(), called by + rollback_inplace_alter_table(). */ + + switch (dict_index_get_online_status(indexes[i])) { + case ONLINE_INDEX_COMPLETE: + break; + case ONLINE_INDEX_CREATION: + rw_lock_x_lock( + dict_index_get_lock(indexes[i])); + row_log_abort_sec(indexes[i]); + indexes[i]->type |= DICT_CORRUPT; + rw_lock_x_unlock( + dict_index_get_lock(indexes[i])); + new_table->drop_aborted = TRUE; + /* fall through */ + case ONLINE_INDEX_ABORTED_DROPPED: + case ONLINE_INDEX_ABORTED: + MONITOR_MUTEX_INC( + &dict_sys->mutex, + MONITOR_BACKGROUND_DROP_INDEX); + } + } + } + + DBUG_RETURN(error); +} diff --git a/storage/xtradb/row/row0mysql.cc b/storage/xtradb/row/row0mysql.cc new file mode 100644 index 00000000000..7d486eaf53b --- /dev/null +++ b/storage/xtradb/row/row0mysql.cc @@ -0,0 +1,5461 @@ +/***************************************************************************** + +Copyright (c) 2000, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0mysql.cc +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. + +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#include <debug_sync.h> +#include <my_dbug.h> + +#include <sql_const.h> +#include "row0ins.h" +#include "row0merge.h" +#include "row0sel.h" +#include "row0upd.h" +#include "row0row.h" +#include "que0que.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "dict0crea.h" +#include "dict0load.h" +#include "dict0priv.h" +#include "dict0boot.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0undo.h" +#include "lock0lock.h" +#include "rem0cmp.h" +#include "log0log.h" +#include "btr0sea.h" +#include "fil0fil.h" +#include "ibuf0ibuf.h" +#include "fts0fts.h" +#include "fts0types.h" +#include "srv0start.h" +#include "row0import.h" +#include "m_string.h" +#include "my_sys.h" +#include "ha_prototypes.h" +#include <algorithm> + +/** Provide optional 4.x backwards compatibility for 5.0 and above */ +UNIV_INTERN ibool row_rollback_on_timeout = FALSE; + +/** Chain node of the list of tables to drop in the background. */ +struct row_mysql_drop_t{ + char* table_name; /*!< table name */ + UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list; + /*!< list chain node */ +}; + +#ifdef UNIV_PFS_MUTEX +/* Key to register drop list mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t row_drop_list_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/** @brief List of tables we should drop in background. + +ALTER TABLE in MySQL requires that the table handler can drop the +table in background when there are no queries to it any +more. Protected by row_drop_list_mutex. */ +static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list; + +/** Mutex protecting the background table drop list. */ +static ib_mutex_t row_drop_list_mutex; + +/** Flag: has row_mysql_drop_list been initialized? */ +static ibool row_mysql_drop_list_inited = FALSE; + +/** Magic table names for invoking various monitor threads */ +/* @{ */ +static const char S_innodb_monitor[] = "innodb_monitor"; +static const char S_innodb_lock_monitor[] = "innodb_lock_monitor"; +static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor"; +static const char S_innodb_table_monitor[] = "innodb_table_monitor"; +#ifdef UNIV_MEM_DEBUG +static const char S_innodb_mem_validate[] = "innodb_mem_validate"; +#endif /* UNIV_MEM_DEBUG */ +/* @} */ + +/** Evaluates to true if str1 equals str2_onstack, used for comparing +the magic table names. 
+@param str1 in: string to compare +@param str1_len in: length of str1, in bytes, including terminating NUL +@param str2_onstack in: char[] array containing a NUL terminated string +@return TRUE if str1 equals str2_onstack */ +#define STR_EQ(str1, str1_len, str2_onstack) \ + ((str1_len) == sizeof(str2_onstack) \ + && memcmp(str1, str2_onstack, sizeof(str2_onstack)) == 0) + +/*******************************************************************//** +Determine if the given name is a name reserved for MySQL system tables. +@return TRUE if name is a MySQL system table name */ +static +ibool +row_mysql_is_system_table( +/*======================*/ + const char* name) +{ + if (strncmp(name, "mysql/", 6) != 0) { + + return(FALSE); + } + + return(0 == strcmp(name + 6, "host") + || 0 == strcmp(name + 6, "user") + || 0 == strcmp(name + 6, "db")); +} + +/*********************************************************************//** +If a table is not yet in the drop list, adds the table to the list of tables +which the master thread drops in background. We need this on Unix because in +ALTER TABLE MySQL may call drop table even if the table has running queries on +it. Also, if there are running foreign key checks on the table, we drop the +table lazily. +@return TRUE if the table was not yet in the drop list, and was added there */ +static +ibool +row_add_table_to_background_drop_list( +/*==================================*/ + const char* name); /*!< in: table name */ + +/*******************************************************************//** +Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */ +static +void +row_mysql_delay_if_needed(void) +/*===========================*/ +{ + if (srv_dml_needed_delay) { + os_thread_sleep(srv_dml_needed_delay); + } +} + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +UNIV_INTERN +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +{ + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; +} + +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +UNIV_INTERN +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + ut_a(len < 256 * 256); + + mach_write_to_2_little_endian(dest, len); + + return(dest + 2); + } + + ut_a(lenlen == 1); + ut_a(len < 256); + + mach_write_to_1(dest, len); + + return(dest + 1); +} + +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. 
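The length prefix written by row_mysql_store_true_var_len() above and parsed here is 1 byte when the column's maximum length fits in 255 bytes and otherwise 2 bytes, little-endian. A self-contained round-trip sketch, with raw uint8_t buffers in place of the mach_* helpers:

    #include <cassert>
    #include <cstdint>

    static uint8_t* store_len(uint8_t* dest, unsigned len, unsigned lenlen)
    {
        if (lenlen == 2) {
            assert(len < 256u * 256u);
            dest[0] = (uint8_t)(len & 0xff);   // low byte first
            dest[1] = (uint8_t)(len >> 8);
            return dest + 2;
        }
        assert(lenlen == 1 && len < 256u);
        dest[0] = (uint8_t) len;
        return dest + 1;
    }

    static const uint8_t* read_len(unsigned* len, const uint8_t* field,
                                   unsigned lenlen)
    {
        if (lenlen == 2) {
            *len = field[0] | ((unsigned) field[1] << 8);
            return field + 2;
        }
        *len = field[0];
        return field + 1;
    }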
+@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +UNIV_INTERN +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen) /*!< in: storage length of len: either 1 + or 2 bytes */ +{ + if (lenlen == 2) { + *len = mach_read_from_2_little_endian(field); + + return(field + 2); + } + + ut_a(lenlen == 1); + + *len = mach_read_from_1(field); + + return(field + 1); +} + +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +UNIV_INTERN +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len) /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + + memset(dest, '\0', col_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. */ + + ut_a(col_len - 8 > 1 || len < 256); + ut_a(col_len - 8 > 2 || len < 256 * 256); + ut_a(col_len - 8 > 3 || len < 256 * 256 * 256); + + mach_write_to_n_little_endian(dest, col_len - 8, len); + + memcpy(dest + col_len - 8, &data, sizeof data); +} + +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. +@return pointer to BLOB data */ +UNIV_INTERN +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len) /*!< in: BLOB reference length + (not BLOB length) */ +{ + byte* data; + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + memcpy(&data, ref + col_len - 8, sizeof data); + + return(data); +} + +/**************************************************************//** +Pad a column with spaces. */ +UNIV_INTERN +void +row_mysql_pad_col( +/*==============*/ + ulint mbminlen, /*!< in: minimum size of a character, + in bytes */ + byte* pad, /*!< out: padded buffer */ + ulint len) /*!< in: number of bytes to pad */ +{ + const byte* pad_end; + + switch (UNIV_EXPECT(mbminlen, 1)) { + default: + ut_error; + case 1: + /* space=0x20 */ + memset(pad, 0x20, len); + break; + case 2: + /* space=0x0020 */ + pad_end = pad + len; + ut_a(!(len % 2)); + while (pad < pad_end) { + *pad++ = 0x00; + *pad++ = 0x20; + }; + break; + case 4: + /* space=0x00000020 */ + pad_end = pad + len; + ut_a(!(len % 4)); + while (pad < pad_end) { + *pad++ = 0x00; + *pad++ = 0x00; + *pad++ = 0x00; + *pad++ = 0x20; + } + break; + } +} + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.cc. 
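[Annotation] The two BLOB-reference helpers above agree on one layout: col_len - 8 bytes of little-endian length, followed by 8 bytes reserved for the data pointer (of which a 32-bit build uses only the first 4). A standalone round-trip sketch of that layout; write_le()/read_le() are simplified stand-ins for the mach_*_little_endian() routines, not the real functions:

#include <cassert>
#include <cstring>
#include <stdint.h>

/* Illustrative stand-ins for mach_write_to_n_little_endian() /
   mach_read_from_n_little_endian(): n is 1..4 bytes, LSB first. */
static void write_le(unsigned char* dest, unsigned n, uint32_t val)
{
    for (unsigned i = 0; i < n; i++) {
        dest[i] = (unsigned char) (val >> (8 * i));
    }
}

static uint32_t read_le(const unsigned char* src, unsigned n)
{
    uint32_t val = 0;
    for (unsigned i = 0; i < n; i++) {
        val |= ((uint32_t) src[i]) << (8 * i);
    }
    return val;
}

int main()
{
    /* A BLOB reference: col_len - 8 bytes of length, then the pointer. */
    unsigned char ref[12];              /* col_len = 12 -> 4-byte length */
    static const char payload[] = "hello, blob";
    const void* data = payload;

    memset(ref, 0, sizeof(ref));        /* row_mysql_store_blob_ref() zeroes dest */
    write_le(ref, sizeof(ref) - 8, sizeof(payload));
    memcpy(ref + sizeof(ref) - 8, &data, sizeof data);

    /* Read it back, as row_mysql_read_blob_ref() does. */
    uint32_t len = read_le(ref, sizeof(ref) - 8);
    const void* p;
    memcpy(&p, ref + sizeof(ref) - 8, sizeof p);

    assert(len == sizeof(payload));
    assert(p == payload);
    return 0;
}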
+@return up to which byte we used buf in the conversion */ +UNIV_INTERN +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp) /*!< in: nonzero=compact format */ +{ + const byte* ptr = mysql_data; + const dtype_t* dtype; + ulint type; + ulint lenlen; + + dtype = dfield_get_type(dfield); + + type = dtype->mtype; + + if (type == DATA_INT) { + /* Store integer data in Innobase in a big-endian format, + sign bit negated if the data is a signed integer. In MySQL, + integers are stored in a little-endian format. */ + + byte* p = buf + col_len; + + for (;;) { + p--; + *p = *mysql_data; + if (p == buf) { + break; + } + mysql_data++; + } + + if (!(dtype->prtype & DATA_UNSIGNED)) { + + *buf ^= 128; + } + + ptr = buf; + buf += col_len; + } else if ((type == DATA_VARCHAR + || type == DATA_VARMYSQL + || type == DATA_BINARY)) { + + if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) { + /* The length of the actual data is stored to 1 or 2 + bytes at the start of the field */ + + if (row_format_col) { + if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) { + lenlen = 2; + } else { + lenlen = 1; + } + } else { + /* In a MySQL key value, lenlen is always 2 */ + lenlen = 2; + } + + ptr = row_mysql_read_true_varchar(&col_len, mysql_data, + lenlen); + } else { + /* Remove trailing spaces from old style VARCHAR + columns. */ + + /* Handle Unicode strings differently. */ + ulint mbminlen = dtype_get_mbminlen(dtype); + + ptr = mysql_data; + + switch (mbminlen) { + default: + ut_error; + case 4: + /* space=0x00000020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~3; + + while (col_len >= 4 + && ptr[col_len - 4] == 0x00 + && ptr[col_len - 3] == 0x00 + && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 4; + } + break; + case 2: + /* space=0x0020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~1; + + while (col_len >= 2 && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 2; + } + break; + case 1: + /* space=0x20 */ + while (col_len > 0 + && ptr[col_len - 1] == 0x20) { + col_len--; + } + } + } + } else if (comp && type == DATA_MYSQL + && dtype_get_mbminlen(dtype) == 1 + && dtype_get_mbmaxlen(dtype) > 1) { + /* In some cases we strip trailing spaces from UTF-8 and other + multibyte charsets, from FIXED-length CHAR columns, to save + space. UTF-8 would otherwise normally use 3 * the string length + bytes to store an ASCII string! 
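[Annotation] The DATA_INT branch above produces a memcmp()-sortable key: the little-endian MySQL bytes are reversed into big-endian and, for signed columns, the sign bit is flipped. A standalone sketch of the transform (it assumes a little-endian host so that memcpy() of the int yields MySQL's byte order; the function name is illustrative):

#include <cassert>
#include <cstring>
#include <stdint.h>

/* Reverse the little-endian bytes into big-endian and flip the sign
   bit, so plain memcmp() sorts the buffers in numeric order. */
static void store_int_innobase(unsigned char* buf, int32_t v)
{
    unsigned char le[4];
    memcpy(le, &v, 4);              /* assumes a little-endian host */

    for (int i = 0; i < 4; i++) {   /* byte-reverse, like the for(;;) loop */
        buf[i] = le[3 - i];
    }
    buf[0] ^= 128;                  /* signed column: negate the sign bit */
}

int main()
{
    unsigned char a[4], b[4], c[4];

    store_int_innobase(a, -5);      /* becomes 7F FF FF FB */
    store_int_innobase(b, 0);       /* becomes 80 00 00 00 */
    store_int_innobase(c, 7);       /* becomes 80 00 00 07 */

    /* memcmp() order now matches numeric order. */
    assert(memcmp(a, b, 4) < 0);
    assert(memcmp(b, c, 4) < 0);
    return 0;
}

Without the sign-bit flip, a negative value would start with 0xFF and sort above every positive value.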
*/ + + /* We assume that this CHAR field is encoded in a + variable-length character set where spaces have + 1:1 correspondence to 0x20 bytes, such as UTF-8. + + Consider a CHAR(n) field, a field of n characters. + It will contain between n * mbminlen and n * mbmaxlen bytes. + We will try to truncate it to n bytes by stripping + space padding. If the field contains single-byte + characters only, it will be truncated to n characters. + Consider a CHAR(5) field containing the string ".a " + where "." denotes a 3-byte character represented by + the bytes "$%&". After our stripping, the string will + be stored as "$%&a " (5 bytes). The string ".abc " + will be stored as "$%&abc" (6 bytes). + + The space padding will be restored in row0sel.cc, function + row_sel_field_store_in_mysql_format(). */ + + ulint n_chars; + + ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars && ptr[col_len - 1] == 0x20) { + col_len--; + } + } else if (type == DATA_BLOB && row_format_col) { + + ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); + } + + dfield_set_data(dfield, ptr, col_len); + + return(buf); +} + +/**************************************************************//** +Convert a row in the MySQL format to a row in the Innobase format. Note that +the function to convert a MySQL format key value to an InnoDB dtuple is +row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */ +static +void +row_mysql_convert_row_to_innobase( +/*==============================*/ + dtuple_t* row, /*!< in/out: Innobase row where the + field type information is already + copied there! */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template + must be of type ROW_MYSQL_WHOLE_ROW */ + byte* mysql_rec) /*!< in: row in the MySQL format; + NOTE: do not discard as long as + row is used, as row may contain + pointers to this record! */ +{ + const mysql_row_templ_t*templ; + dfield_t* dfield; + ulint i; + + ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(prebuilt->mysql_template); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + dfield = dtuple_get_nth_field(row, i); + + if (templ->mysql_null_bit_mask != 0) { + /* Column may be SQL NULL */ + + if (mysql_rec[templ->mysql_null_byte_offset] + & (byte) (templ->mysql_null_bit_mask)) { + + /* It is SQL NULL */ + + dfield_set_null(dfield); + + goto next_column; + } + } + + row_mysql_store_col_in_innobase_format( + dfield, + prebuilt->ins_upd_rec_buff + templ->mysql_col_offset, + TRUE, /* MySQL row format data */ + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, + dict_table_is_comp(prebuilt->table)); +next_column: + ; + } + + /* If there is a FTS doc id column and it is not user supplied ( + generated by server) then assign it a new doc id. */ + if (prebuilt->table->fts) { + + ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED); + + fts_create_doc_id(prebuilt->table, row, prebuilt->heap); + } +} + +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread and in that case the thr is ALREADY in the running state. 
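[Annotation] The stripping logic above can be checked against the worked example in the comment: a CHAR(5) holding ".a" where "." is the 3-byte sequence "$%&". A standalone sketch; strip_char_padding() is an illustrative reduction of the while loop in the source:

#include <cassert>
#include <cstring>

/* Try to truncate a space-padded multi-byte CHAR value down to
   n_chars bytes by removing trailing 0x20 bytes, never going below
   n_chars. */
static size_t strip_char_padding(const unsigned char* ptr, size_t col_len,
                                 size_t n_chars)
{
    while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
        col_len--;
    }
    return col_len;
}

int main()
{
    /* CHAR(5), utf8 (mbmaxlen = 3): MySQL hands over 15 bytes, ".a"
       plus 11 padding spaces, "." being the 3-byte stand-in "$%&". */
    const unsigned char row[15] = {'$', '%', '&', 'a',
                                   ' ', ' ', ' ', ' ', ' ', ' ',
                                   ' ', ' ', ' ', ' ', ' '};

    size_t n_chars = 15 / 3;        /* dtype_get_len() / mbmaxlen */
    size_t stored  = strip_char_padding(row, sizeof(row), n_chars);

    /* Stored as "$%&a " -- 5 bytes, exactly as the comment says. */
    assert(stored == 5);
    assert(memcmp(row, "$%&a ", 5) == 0);
    return 0;
}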
*/ +UNIV_INTERN +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + lock wait, or if no new error, the value + of trx->error_state at the entry of this + function */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ +{ + dberr_t err; + +handle_new_error: + err = trx->error_state; + + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + switch (err) { + case DB_LOCK_WAIT_TIMEOUT: + if (row_rollback_on_timeout) { + trx_rollback_to_savepoint(trx, NULL); + break; + } + /* fall through */ + case DB_DUPLICATE_KEY: + case DB_FOREIGN_DUPLICATE_KEY: + case DB_TOO_BIG_RECORD: + case DB_UNDO_RECORD_TOO_BIG: + case DB_ROW_IS_REFERENCED: + case DB_NO_REFERENCED_ROW: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_TOO_MANY_CONCURRENT_TRXS: + case DB_OUT_OF_FILE_SPACE: + case DB_READ_ONLY: + case DB_FTS_INVALID_DOCID: + case DB_INTERRUPTED: + case DB_DICT_CHANGED: + if (savept) { + /* Roll back the latest, possibly incomplete insertion + or update */ + + trx_rollback_to_savepoint(trx, savept); + } + /* MySQL will roll back the latest SQL statement */ + break; + case DB_LOCK_WAIT: + lock_wait_suspend_thread(thr); + + if (trx->error_state != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + goto handle_new_error; + } + + *new_err = err; + + return(true); + + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx_rollback_to_savepoint(trx, NULL); + break; + + case DB_MUST_GET_MORE_FILE_SPACE: + fputs("InnoDB: The database cannot continue" + " operation because of\n" + "InnoDB: lack of space. You must add" + " a new data file to\n" + "InnoDB: my.cnf and restart the database.\n", stderr); + + ut_ad(0); + exit(1); + + case DB_CORRUPTION: + fputs("InnoDB: We detected index corruption" + " in an InnoDB type table.\n" + "InnoDB: You have to dump + drop + reimport" + " the table or, in\n" + "InnoDB: a case of widespread corruption," + " dump all InnoDB\n" + "InnoDB: tables and recreate the" + " whole InnoDB tablespace.\n" + "InnoDB: If the mysqld server crashes" + " after the startup or when\n" + "InnoDB: you dump the tables, look at\n" + "InnoDB: " REFMAN "forcing-innodb-recovery.html" + " for help.\n", stderr); + break; + case DB_FOREIGN_EXCEED_MAX_CASCADE: + fprintf(stderr, "InnoDB: Cannot delete/update rows with" + " cascading foreign key constraints that exceed max" + " depth of %lu\n" + "Please drop excessive foreign constraints" + " and try again\n", (ulong) DICT_FK_MAX_RECURSIVE_LOAD); + break; + default: + fprintf(stderr, "InnoDB: unknown error code %lu\n", + (ulong) err); + ut_error; + } + + if (trx->error_state != DB_SUCCESS) { + *new_err = trx->error_state; + } else { + *new_err = err; + } + + trx->error_state = DB_SUCCESS; + + return(false); +} + +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. 
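[Annotation] Callers of row_mysql_handle_errors() all follow one protocol: on a lock wait the function returns true and the caller jumps back to its run_again label; on any other error the statement has already been rolled back to the savepoint and the error is returned. A compressed sketch of that protocol; err_t, step() and handle_errors() are illustrative stand-ins, not InnoDB API:

#include <cstdio>

enum err_t { ERR_OK, ERR_LOCK_WAIT, ERR_DUPLICATE_KEY };

static int attempts = 0;

/* Stands in for row_ins_step()/row_upd_step(): hits a lock wait
   twice, then succeeds. */
static err_t step()
{
    return ++attempts < 3 ? ERR_LOCK_WAIT : ERR_OK;
}

/* Mirrors the contract above: true means "it was a lock wait, run
   the node again"; false means the statement was rolled back to its
   savepoint and the error stands. */
static bool handle_errors(err_t err)
{
    return err == ERR_LOCK_WAIT;    /* suspend/wake-up elided */
}

int main()
{
    err_t err;

run_again:
    err = step();

    if (err != ERR_OK) {
        if (handle_errors(err)) {
            goto run_again;         /* lock granted: retry */
        }
        printf("statement failed\n");
        return 1;
    }

    printf("succeeded after %d attempts\n", attempts);
    return 0;
}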
+@return own: a prebuilt struct */ +UNIV_INTERN +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len) /*!< in: length in bytes of a row in + the MySQL format */ +{ + row_prebuilt_t* prebuilt; + mem_heap_t* heap; + dict_index_t* clust_index; + dict_index_t* temp_index; + dtuple_t* ref; + ulint ref_len; + uint srch_key_len = 0; + ulint search_tuple_n_fields; + + search_tuple_n_fields = 2 * dict_table_get_n_cols(table); + + clust_index = dict_table_get_first_index(table); + + /* Make sure that search_tuple is long enough for clustered index */ + ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields); + + ref_len = dict_index_get_n_unique(clust_index); + + + /* Maximum size of the buffer needed for conversion of INTs from + little endian format to big endian format in an index. An index + can have maximum 16 columns (MAX_REF_PARTS) in it. Therfore + Max size for PK: 16 * 8 bytes (BIGINT's size) = 128 bytes + Max size Secondary index: 16 * 8 bytes + PK = 256 bytes. */ +#define MAX_SRCH_KEY_VAL_BUFFER 2* (8 * MAX_REF_PARTS) + +#define PREBUILT_HEAP_INITIAL_SIZE \ + ( \ + sizeof(*prebuilt) \ + /* allocd in this function */ \ + + DTUPLE_EST_ALLOC(search_tuple_n_fields) \ + + DTUPLE_EST_ALLOC(ref_len) \ + /* allocd in row_prebuild_sel_graph() */ \ + + sizeof(sel_node_t) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + /* allocd in row_get_prebuilt_update_vector() */ \ + + sizeof(upd_node_t) \ + + sizeof(upd_t) \ + + sizeof(upd_field_t) \ + * dict_table_get_n_cols(table) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + /* allocd in row_get_prebuilt_insert_row() */ \ + + sizeof(ins_node_t) \ + /* mysql_row_len could be huge and we are not \ + sure if this prebuilt instance is going to be \ + used in inserts */ \ + + (mysql_row_len < 256 ? mysql_row_len : 0) \ + + DTUPLE_EST_ALLOC(dict_table_get_n_cols(table)) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + ) + + /* Calculate size of key buffer used to store search key in + InnoDB format. MySQL stores INTs in little endian format and + InnoDB stores INTs in big endian format with the sign bit + flipped. All other field types are stored/compared the same + in MySQL and InnoDB, so we must create a buffer containing + the INT key parts in InnoDB format.We need two such buffers + since both start and end keys are used in records_in_range(). 
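[Annotation] PREBUILT_HEAP_INITIAL_SIZE exists to make one malloc() cover many later allocations: the macro sums the sizes of the graphs and tuples that will later be carved out of the heap. A toy arena illustrating the same idea; arena_t and the two node structs are made up for the example, and mem_heap_create() is considerably more elaborate:

#include <cassert>
#include <cstdlib>

struct arena_t {
    unsigned char* base;
    size_t         used;
    size_t         cap;
};

static arena_t arena_create(size_t cap)
{
    arena_t a;
    a.base = (unsigned char*) malloc(cap);
    a.used = 0;
    a.cap  = cap;
    return a;
}

static void* arena_alloc(arena_t* a, size_t n)
{
    assert(a->used + n <= a->cap);  /* the up-front estimate covers this */
    void* p = a->base + a->used;
    a->used += n;
    return p;
}

/* Dummy stand-ins for the node types the macro accounts for. */
struct sel_node_t { int filler[8]; };
struct upd_node_t { int filler[16]; };

int main()
{
    /* Sum the sizes of everything allocated later, as the macro does. */
    arena_t heap = arena_create(sizeof(sel_node_t) + sizeof(upd_node_t));

    sel_node_t* sel = (sel_node_t*) arena_alloc(&heap, sizeof(*sel));
    upd_node_t* upd = (upd_node_t*) arena_alloc(&heap, sizeof(*upd));

    assert(sel != NULL && upd != NULL && heap.used == heap.cap);

    free(heap.base);                /* one free releases everything */
    return 0;
}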
*/ + + for (temp_index = dict_table_get_first_index(table); temp_index; + temp_index = dict_table_get_next_index(temp_index)) { + DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value", + ut_a(temp_index->n_user_defined_cols + == MAX_REF_PARTS);); + uint temp_len = 0; + for (uint i = 0; i < temp_index->n_uniq; i++) { + if (temp_index->fields[i].col->mtype == DATA_INT) { + temp_len += + temp_index->fields[i].fixed_len; + } + } + srch_key_len = max(srch_key_len,temp_len); + } + + ut_a(srch_key_len <= MAX_SRCH_KEY_VAL_BUFFER); + + DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value", + ut_a(srch_key_len == MAX_SRCH_KEY_VAL_BUFFER);); + + /* We allocate enough space for the objects that are likely to + be created later in order to minimize the number of malloc() + calls */ + heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE + 2 * srch_key_len); + + prebuilt = static_cast<row_prebuilt_t*>( + mem_heap_zalloc(heap, sizeof(*prebuilt))); + + prebuilt->magic_n = ROW_PREBUILT_ALLOCATED; + prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED; + + prebuilt->table = table; + + prebuilt->sql_stat_start = TRUE; + prebuilt->heap = heap; + + prebuilt->srch_key_val_len = srch_key_len; + if (prebuilt->srch_key_val_len) { + prebuilt->srch_key_val1 = static_cast<byte*>( + mem_heap_alloc(prebuilt->heap, + 2 * prebuilt->srch_key_val_len)); + prebuilt->srch_key_val2 = prebuilt->srch_key_val1 + + prebuilt->srch_key_val_len; + } else { + prebuilt->srch_key_val1 = NULL; + prebuilt->srch_key_val2 = NULL; + } + + btr_pcur_reset(&prebuilt->pcur); + btr_pcur_reset(&prebuilt->clust_pcur); + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE_UNSET; + + prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + prebuilt->clust_ref = ref; + + prebuilt->autoinc_error = DB_SUCCESS; + prebuilt->autoinc_offset = 0; + + /* Default to 1, we will set the actual value later in + ha_innobase::get_auto_increment(). */ + prebuilt->autoinc_increment = 1; + + prebuilt->autoinc_last_value = 0; + + /* During UPDATE and DELETE we need the doc id. */ + prebuilt->fts_doc_id = 0; + + prebuilt->mysql_row_len = mysql_row_len; + + return(prebuilt); +} + +/********************************************************************//** +Free a prebuilt struct for a MySQL table handle. */ +UNIV_INTERN +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */ + ibool dict_locked) /*!< in: TRUE=data dictionary locked */ +{ + ulint i; + + if (UNIV_UNLIKELY + (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED + || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED)) { + + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. 
Magic n %lu," + " magic n2 %lu, table name ", + (ulong) prebuilt->magic_n, + (ulong) prebuilt->magic_n2); + ut_print_name(stderr, NULL, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + prebuilt->magic_n = ROW_PREBUILT_FREED; + prebuilt->magic_n2 = ROW_PREBUILT_FREED; + + btr_pcur_reset(&prebuilt->pcur); + btr_pcur_reset(&prebuilt->clust_pcur); + + if (prebuilt->mysql_template) { + mem_free(prebuilt->mysql_template); + } + + if (prebuilt->ins_graph) { + que_graph_free_recursive(prebuilt->ins_graph); + } + + if (prebuilt->sel_graph) { + que_graph_free_recursive(prebuilt->sel_graph); + } + + if (prebuilt->upd_graph) { + que_graph_free_recursive(prebuilt->upd_graph); + } + + if (prebuilt->blob_heap) { + mem_heap_free(prebuilt->blob_heap); + } + + if (prebuilt->old_vers_heap) { + mem_heap_free(prebuilt->old_vers_heap); + } + + if (prebuilt->fetch_cache[0] != NULL) { + byte* base = prebuilt->fetch_cache[0] - 4; + byte* ptr = base; + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + byte* row; + ulint magic1; + ulint magic2; + + magic1 = mach_read_from_4(ptr); + ptr += 4; + + row = ptr; + ptr += prebuilt->mysql_row_len; + + magic2 = mach_read_from_4(ptr); + ptr += 4; + + if (ROW_PREBUILT_FETCH_MAGIC_N != magic1 + || row != prebuilt->fetch_cache[i] + || ROW_PREBUILT_FETCH_MAGIC_N != magic2) { + + fputs("InnoDB: Error: trying to free" + " a corrupt fetch buffer.\n", stderr); + + mem_analyze_corruption(base); + ut_error; + } + } + + mem_free(base); + } + + dict_table_close(prebuilt->table, dict_locked, TRUE); + + mem_heap_free(prebuilt->heap); +} + +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +UNIV_INTERN +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx) /*!< in: transaction handle */ +{ + if (trx->magic_n != TRX_MAGIC_N) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: trx handle. Magic n %lu\n", + (ulong) trx->magic_n); + + mem_analyze_corruption(trx); + + ut_error; + } + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + prebuilt->trx = trx; + + if (prebuilt->ins_graph) { + prebuilt->ins_graph->trx = trx; + } + + if (prebuilt->upd_graph) { + prebuilt->upd_graph->trx = trx; + } + + if (prebuilt->sel_graph) { + prebuilt->sel_graph->trx = trx; + } +} + +/*********************************************************************//** +Gets pointer to a prebuilt dtuple used in insertions. If the insert graph +has not yet been built in the prebuilt struct, then this function first +builds it. +@return prebuilt dtuple; the column type information is also set in it */ +static +dtuple_t* +row_get_prebuilt_insert_row( +/*========================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->ins_node != 0) { + + /* Check if indexes have been dropped or added and we + may need to rebuild the row insert template. 
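[Annotation] The fetch-cache check above relies on each cached row being framed as [magic1][row][magic2], so an overrun of either 4-byte guard is caught when the cache is freed. A standalone sketch of that framing; FETCH_MAGIC is an arbitrary value, not the real ROW_PREBUILT_FETCH_MAGIC_N, and memcpy() stands in for mach_read_from_4():

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <stdint.h>

static const uint32_t FETCH_MAGIC = 465767261;  /* any fixed pattern */

int main()
{
    const size_t  row_len = 32;
    unsigned char* base = (unsigned char*) malloc(4 + row_len + 4);

    memcpy(base, &FETCH_MAGIC, 4);                  /* magic1 */
    memset(base + 4, 0xAB, row_len);                /* the row itself */
    memcpy(base + 4 + row_len, &FETCH_MAGIC, 4);    /* magic2 */

    /* ... the row buffer is used for a while ... */

    /* On free, verify that both guard words are intact. */
    uint32_t magic1, magic2;
    memcpy(&magic1, base, 4);
    memcpy(&magic2, base + 4 + row_len, 4);

    assert(magic1 == FETCH_MAGIC && magic2 == FETCH_MAGIC);
    free(base);
    return 0;
}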
*/ + + if (prebuilt->trx_id == table->def_trx_id + && UT_LIST_GET_LEN(prebuilt->ins_node->entry_list) + == UT_LIST_GET_LEN(table->indexes)) { + + return(prebuilt->ins_node->row); + } + + ut_ad(prebuilt->trx_id < table->def_trx_id); + + que_graph_free_recursive(prebuilt->ins_graph); + + prebuilt->ins_graph = 0; + } + + /* Create an insert node and query graph to the prebuilt struct */ + + ins_node_t* node; + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == 0) { + prebuilt->ins_upd_rec_buff = static_cast<byte*>( + mem_heap_alloc( + prebuilt->heap, + prebuilt->mysql_row_len)); + } + + dtuple_t* row; + + row = dtuple_create(prebuilt->heap, dict_table_get_n_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + + prebuilt->ins_graph = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec( + node, + prebuilt->trx, prebuilt->heap))); + + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + + prebuilt->trx_id = table->def_trx_id; + + return(prebuilt->ins_node->row); +} + +/*********************************************************************//** +Updates the table modification counter and calculates new estimates +for table and index statistics if necessary. */ +UNIV_INLINE +void +row_update_statistics_if_needed( +/*============================*/ + dict_table_t* table) /*!< in: table */ +{ + ib_uint64_t counter; + ib_uint64_t n_rows; + + if (!table->stat_initialized) { + DBUG_EXECUTE_IF( + "test_upd_stats_if_needed_not_inited", + fprintf(stderr, "test_upd_stats_if_needed_not_inited " + "was executed\n"); + ); + return; + } + + counter = table->stat_modified_counter++; + n_rows = dict_table_get_n_rows(table); + + if (dict_stats_is_persistent_enabled(table)) { + if (counter > n_rows / 10 /* 10% */ + && dict_stats_auto_recalc_is_enabled(table)) { + + dict_stats_recalc_pool_add(table); + table->stat_modified_counter = 0; + } + return; + } + + /* Calculate new statistics if 1 / 16 of table has been modified + since the last time a statistics batch was run. + We calculate statistics at most every 16th round, since we may have + a counter table which is very small and updated very often. */ + + if (counter > 16 + n_rows / 16 /* 6.25% */) { + + ut_ad(!mutex_own(&dict_sys->mutex)); + /* this will reset table->stat_modified_counter to 0 */ + dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT); + } +} + +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ +{ + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + const dict_table_t* table = prebuilt->table; + que_thr_t* thr; + dberr_t err; + ibool was_lock_wait; + + ut_ad(trx); + + /* If we already hold an AUTOINC lock on the table then do nothing. + Note: We peek at the value of the current owner without acquiring + the lock mutex. 
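[Annotation] The two thresholds in row_update_statistics_if_needed() are worth making concrete: persistent statistics are recalculated after roughly 10% of the rows change, transient statistics after 16 + ~6.25%, where the constant 16 keeps very small, frequently updated tables from being re-analyzed on almost every write. A small illustrative calculation:

#include <cstdio>
#include <stdint.h>

static bool needs_recalc(uint64_t counter, uint64_t n_rows, bool persistent)
{
    if (persistent) {
        return counter > n_rows / 10;       /* 10% */
    }
    return counter > 16 + n_rows / 16;      /* 16 + 6.25% */
}

int main()
{
    /* A 1M-row table: recalc after 100001 changes (persistent) or
       62517 changes (transient). */
    printf("persistent: %d\n", needs_recalc(100001, 1000000, true));
    printf("transient:  %d\n", needs_recalc(62517, 1000000, false));

    /* A 10-row counter table: the +16 means the transient path waits
       for 17 changes instead of recalculating after every update. */
    printf("tiny table: %d\n", needs_recalc(16, 10, false));
    return 0;
}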
**/ + if (trx == table->autoinc_trx) { + + return(DB_SUCCESS); + } + + trx->op_info = "setting auto-inc lock"; + + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + + /* We use the insert query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(trx); + + err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return(err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Sets a table lock on the table mentioned in prebuilt. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_lock_table_for_mysql( +/*=====================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL + table handle */ + dict_table_t* table, /*!< in: table to lock, or NULL + if prebuilt->table should be + locked as + prebuilt->select_lock_type */ + ulint mode) /*!< in: lock mode of table + (ignored if table==NULL) */ +{ + trx_t* trx = prebuilt->trx; + que_thr_t* thr; + dberr_t err; + ibool was_lock_wait; + + ut_ad(trx); + + trx->op_info = "setting table lock"; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(trx); + + if (table) { + err = lock_table( + 0, table, + static_cast<enum lock_mode>(mode), thr); + } else { + err = lock_table( + 0, prebuilt->table, + static_cast<enum lock_mode>( + prebuilt->select_lock_type), + thr); + } + + trx->error_state = err; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return(err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Does an insert for MySQL. 
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_insert_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: row in the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + que_thr_t* thr; + dberr_t err; + ibool was_lock_wait; + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + dict_table_t* table = prebuilt->table; + + ut_ad(trx); + + if (dict_table_is_discarded(prebuilt->table)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "The table %s doesn't have a corresponding " + "tablespace, it was discarded.", + prebuilt->table->name); + + return(DB_TABLESPACE_DELETED); + + } else if (prebuilt->table->ibd_file_missing) { + + ib_logf(IB_LOG_LEVEL_ERROR, + ".ibd file is missing for table %s", + prebuilt->table->name); + + return(DB_TABLESPACE_NOT_FOUND); + + } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } else if (srv_created_new_raw || srv_force_recovery) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that" + " newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + if(srv_force_recovery) { + return(DB_READ_ONLY); + } + return(DB_ERROR); + } + + trx->op_info = "inserting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started_xa(trx); + + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + + row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec); + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + if (prebuilt->sql_stat_start) { + node->state = INS_NODE_SET_IX_LOCK; + prebuilt->sql_stat_start = FALSE; + } else { + node->state = INS_NODE_ALLOC_ROW_ID; + } + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { +error_exit: + que_thr_stop_for_mysql(thr); + + /* FIXME: What's this ? 
*/ + thr->lock_state = QUE_THR_LOCK_ROW; + + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, &savept); + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + ut_ad(node->state == INS_NODE_INSERT_ENTRIES + || node->state == INS_NODE_ALLOC_ROW_ID); + goto run_again; + } + + trx->op_info = ""; + + return(err); + } + + if (dict_table_has_fts_index(table)) { + doc_id_t doc_id; + + /* Extract the doc id from the hidden FTS column */ + doc_id = fts_get_doc_id_from_row(table, node->row); + + if (doc_id <= 0) { + fprintf(stderr, + "InnoDB: FTS Doc ID must be large than 0 \n"); + err = DB_FTS_INVALID_DOCID; + trx->error_state = DB_FTS_INVALID_DOCID; + goto error_exit; + } + + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + doc_id_t next_doc_id + = table->fts->cache->next_doc_id; + + if (doc_id < next_doc_id) { + fprintf(stderr, + "InnoDB: FTS Doc ID must be large than" + " " UINT64PF " for table", + next_doc_id - 1); + ut_print_name(stderr, trx, TRUE, table->name); + putc('\n', stderr); + + err = DB_FTS_INVALID_DOCID; + trx->error_state = DB_FTS_INVALID_DOCID; + goto error_exit; + } + + /* Difference between Doc IDs are restricted within + 4 bytes integer. See fts_get_encoded_len() */ + + if (doc_id - next_doc_id >= FTS_DOC_ID_MAX_STEP) { + fprintf(stderr, + "InnoDB: Doc ID " UINT64PF " is too" + " big. Its difference with largest" + " used Doc ID " UINT64PF " cannot" + " exceed or equal to %d\n", + doc_id, next_doc_id - 1, + FTS_DOC_ID_MAX_STEP); + err = DB_FTS_INVALID_DOCID; + trx->error_state = DB_FTS_INVALID_DOCID; + goto error_exit; + } + } + + /* Pass NULL for the columns affected, since an INSERT affects + all FTS indexes. */ + fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (UNIV_LIKELY(!(trx->fake_changes))) { + + srv_stats.n_rows_inserted.add((size_t)trx->id, 1); + + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_inc(table); + + row_update_statistics_if_needed(table); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +UNIV_INTERN +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + sel_node_t* node; + + ut_ad(prebuilt && prebuilt->trx); + + if (prebuilt->sel_graph == NULL) { + + node = sel_node_create(prebuilt->heap); + + prebuilt->sel_graph = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec( + static_cast<sel_node_t*>(node), + prebuilt->trx, prebuilt->heap))); + + prebuilt->sel_graph->state = QUE_FORK_ACTIVE; + } +} + +/*********************************************************************//** +Creates an query graph node of 'update' type to be used in the MySQL +interface. 
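[Annotation] The FTS Doc ID checks in the insert path above reduce to three rules: the ID must be nonzero, must not fall behind the next expected ID, and must not jump ahead by FTS_DOC_ID_MAX_STEP or more (the delta must fit the 4-byte encoding mentioned in the comment). An illustrative predicate; MAX_STEP here is a placeholder, not necessarily the real constant:

#include <cassert>
#include <stdint.h>

typedef uint64_t doc_id_t;

static const doc_id_t MAX_STEP = 65536;  /* placeholder for FTS_DOC_ID_MAX_STEP */

static bool fts_doc_id_ok(doc_id_t doc_id, doc_id_t next_doc_id)
{
    if (doc_id == 0) {
        return false;   /* the source checks doc_id <= 0: IDs start at 1 */
    }
    if (doc_id < next_doc_id) {
        return false;   /* must not reuse an ID the table already passed */
    }
    if (doc_id - next_doc_id >= MAX_STEP) {
        return false;   /* gap must fit the 4-byte delta encoding */
    }
    return true;
}

int main()
{
    assert(!fts_doc_id_ok(0, 100));                 /* zero is invalid    */
    assert(!fts_doc_id_ok(99, 100));                /* behind the cursor  */
    assert(fts_doc_id_ok(100, 100));                /* next expected ID   */
    assert(fts_doc_id_ok(100 + MAX_STEP - 1, 100)); /* largest legal gap  */
    assert(!fts_doc_id_ok(100 + MAX_STEP, 100));    /* jump too large     */
    return 0;
}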
+@return own: update node */ +UNIV_INTERN +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap) /*!< in: mem heap from which allocated */ +{ + upd_node_t* node; + + node = upd_node_create(heap); + + node->in_mysql_interface = TRUE; + node->is_delete = FALSE; + node->searched_update = FALSE; + node->select = NULL; + node->pcur = btr_pcur_create_for_mysql(); + node->table = table; + + node->update = upd_create(dict_table_get_n_cols(table), heap); + + node->update_n_fields = dict_table_get_n_cols(table); + + UT_LIST_INIT(node->columns); + node->has_clust_rec_x_lock = TRUE; + node->cmpl_info = 0; + + node->table_sym = NULL; + node->col_assign_list = NULL; + + return(node); +} + +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +UNIV_INTERN +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + upd_node_t* node; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->upd_node == NULL) { + + /* Not called before for this handle: create an update node + and query graph to the prebuilt struct */ + + node = row_create_update_node_for_mysql(table, prebuilt->heap); + + prebuilt->upd_node = node; + + prebuilt->upd_graph = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec( + static_cast<upd_node_t*>(node), + prebuilt->trx, prebuilt->heap))); + + prebuilt->upd_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->upd_node->update); +} + +/******************************************************************** +Handle an update of a column that has an FTS index. */ +static +void +row_fts_do_update( +/*==============*/ + trx_t* trx, /* in: transaction */ + dict_table_t* table, /* in: Table with FTS index */ + doc_id_t old_doc_id, /* in: old document id */ + doc_id_t new_doc_id) /* in: new document id */ +{ + if (trx->fts_next_doc_id) { + fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL); + fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL); + } +} + +/************************************************************************ +Handles FTS matters for an update or a delete. +NOTE: should not be called if the table does not have an FTS index. .*/ +static +dberr_t +row_fts_update_or_delete( +/*=====================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_t* trx = prebuilt->trx; + dict_table_t* table = prebuilt->table; + upd_node_t* node = prebuilt->upd_node; + doc_id_t old_doc_id = prebuilt->fts_doc_id; + + ut_a(dict_table_has_fts_index(prebuilt->table)); + + /* Deletes are simple; get them out of the way first. 
*/ + if (node->is_delete) { + /* A delete affects all FTS indexes, so we pass NULL */ + fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL); + } else { + doc_id_t new_doc_id; + + new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id); + + if (new_doc_id == 0) { + fprintf(stderr, " InnoDB FTS: Doc ID cannot be 0 \n"); + return(DB_FTS_INVALID_DOCID); + } + + row_fts_do_update(trx, table, old_doc_id, new_doc_id); + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Initialize the Doc ID system for FK table with FTS index */ +static +void +init_fts_doc_id_for_ref( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ulint* depth) /*!< in: recusive call depth */ +{ + dict_foreign_t* foreign; + + table->fk_max_recusive_level = 0; + + (*depth)++; + + /* Limit on tables involved in cascading delete/update */ + if (*depth > FK_MAX_CASCADE_DEL) { + return; + } + + /* Loop through this table's referenced list and also + recursively traverse each table's foreign table list */ + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_table == NULL) { + break; + } + + if (foreign->foreign_table->fts != NULL) { + fts_init_doc_id(foreign->foreign_table); + } + + if (!foreign->foreign_table->referenced_set.empty() + && foreign->foreign_table != table) { + init_fts_doc_id_for_ref( + foreign->foreign_table, depth); + } + } +} + +/*********************************************************************//** +Does an update or delete of a row for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_update_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + dberr_t err; + que_thr_t* thr; + ibool was_lock_wait; + dict_index_t* clust_index; + /* ulint ref_len; */ + upd_node_t* node; + dict_table_t* table = prebuilt->table; + trx_t* trx = prebuilt->trx; + ulint fk_depth = 0; + + ut_ad(prebuilt && trx); + UT_NOT_USED(mysql_rec); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you" + " used DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw" + " is replaced\n" + "InnoDB: with raw, and innodb_force_... 
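[Annotation] init_fts_doc_id_for_ref() above shows the usual guard for walking the foreign-key graph: a single depth counter, incremented on every call and capped, so a reference cycle or a very deep cascade chain cannot recurse without bound. A standalone sketch of the same shape; table_t, visit() and MAX_DEPTH are illustrative, with MAX_DEPTH playing the role of FK_MAX_CASCADE_DEL:

#include <cstdio>
#include <vector>

static const unsigned MAX_DEPTH = 15;   /* plays the role of FK_MAX_CASCADE_DEL */

struct table_t {
    const char*           name;
    std::vector<table_t*> referenced;   /* tables whose FKs point at this one */
};

static void visit(table_t* table, unsigned* depth)
{
    if (++*depth > MAX_DEPTH) {
        return;                         /* cascade chain too deep: give up */
    }

    printf("init FTS doc id for %s (depth %u)\n", table->name, *depth);

    for (size_t i = 0; i < table->referenced.size(); i++) {
        table_t* child = table->referenced[i];

        if (child != table) {           /* skip direct self-references */
            visit(child, depth);
        }
    }
}

int main()
{
    table_t child;
    table_t parent;

    child.name  = "test/child";
    parent.name = "test/parent";
    parent.referenced.push_back(&child);

    unsigned depth = 0;
    visit(&parent, &depth);
    return 0;
}

Note that, as in the source, the counter is shared across the whole traversal and never decremented, so it bounds the total work, not just the depth of one branch.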
is removed.\n", + stderr); + if(srv_force_recovery) { + return(DB_READ_ONLY); + } + return(DB_ERROR); + } + + DEBUG_SYNC_C("innodb_row_update_for_mysql_begin"); + + trx->op_info = "updating or deleting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started_xa(trx); + + if (dict_table_is_referenced_by_foreign_key(table)) { + /* Share lock the data dictionary to prevent any + table dictionary (for foreign constraint) change. + This is similar to row_ins_check_foreign_constraint + check protect by the dictionary lock as well. + In the future, this can be removed once the Foreign + key MDL is implemented */ + row_mysql_freeze_data_dictionary(trx); + init_fts_doc_id_for_ref(table, &fk_depth); + row_mysql_unfreeze_data_dictionary(trx); + } + + node = prebuilt->upd_node; + + clust_index = dict_table_get_first_index(table); + + if (prebuilt->pcur.btr_cur.index == clust_index) { + btr_pcur_copy_stored_position(node->pcur, &prebuilt->pcur); + } else { + btr_pcur_copy_stored_position(node->pcur, + &prebuilt->clust_pcur); + } + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + /* MySQL seems to call rnd_pos before updating each row it + has cached: we can get the correct cursor position from + prebuilt->pcur; NOTE that we cannot build the row reference + from mysql_rec if the clustered index was automatically + generated for the table: MySQL does not know anything about + the row id used as the clustered index key */ + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->upd_graph); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + ut_ad(!prebuilt->sql_stat_start); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + thr->fk_cascade_depth = 0; + + row_upd_step(thr); + + err = trx->error_state; + + /* Reset fk_cascade_depth back to 0 */ + thr->fk_cascade_depth = 0; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + + return(err); + } + + thr->lock_state= QUE_THR_LOCK_ROW; + + DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error"); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return(err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (UNIV_UNLIKELY(trx->fake_changes)) { + + trx->op_info = ""; + return(err); + } + + if (dict_table_has_fts_index(table) + && trx->fts_next_doc_id != UINT64_UNDEFINED) { + err = row_fts_update_or_delete(prebuilt); + if (err != DB_SUCCESS) { + trx->op_info = ""; + return(err); + } + } + + if (node->is_delete) { + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_dec(prebuilt->table); + + srv_stats.n_rows_deleted.add((size_t)trx->id, 1); + } else { + srv_stats.n_rows_updated.add((size_t)trx->id, 1); + } + + /* We update table statistics only if it is a DELETE or UPDATE + that changes indexed columns, UPDATEs that change only non-indexed + columns would not affect statistics. 
*/ + if (node->is_delete || !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + row_update_statistics_if_needed(prebuilt->table); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +This can only be used when srv_locks_unsafe_for_binlog is TRUE or this +session is using a READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_for_mysql() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. +@return error code or DB_SUCCESS */ +UNIV_INTERN +void +row_unlock_for_mysql( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs)/*!< in: TRUE if called so + that we have the latches on + the records under pcur and + clust_pcur, and we do not need + to reposition the cursors. */ +{ + btr_pcur_t* pcur = &prebuilt->pcur; + btr_pcur_t* clust_pcur = &prebuilt->clust_pcur; + trx_t* trx = prebuilt->trx; + + ut_ad(prebuilt && trx); + + if (UNIV_UNLIKELY + (!srv_locks_unsafe_for_binlog + && trx->isolation_level > TRX_ISO_READ_COMMITTED)) { + + fprintf(stderr, + "InnoDB: Error: calling row_unlock_for_mysql though\n" + "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n" + "InnoDB: this session is not using" + " READ COMMITTED isolation level.\n"); + return; + } + + trx->op_info = "unlock_row"; + + if (prebuilt->new_rec_locks >= 1) { + + const rec_t* rec; + dict_index_t* index; + trx_id_t rec_trx_id; + mtr_t mtr; + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr); + } + + rec = btr_pcur_get_rec(pcur); + index = btr_pcur_get_btr_cur(pcur)->index; + + if (prebuilt->new_rec_locks >= 2) { + /* Restore the cursor position and find the record + in the clustered index. */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, + clust_pcur, &mtr); + } + + rec = btr_pcur_get_rec(clust_pcur); + index = btr_pcur_get_btr_cur(clust_pcur)->index; + } + + if (!dict_index_is_clust(index)) { + /* This is not a clustered index record. We + do not know how to unlock the record. */ + goto no_unlock; + } + + /* If the record has been modified by this + transaction, do not unlock it. 
*/ + + if (index->trx_id_offset) { + rec_trx_id = trx_read_trx_id(rec + + index->trx_id_offset); + } else { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + if (rec_trx_id != trx->id) { + /* We did not update the record: unlock it */ + + rec = btr_pcur_get_rec(pcur); + + lock_rec_unlock( + trx, + btr_pcur_get_block(pcur), + rec, + static_cast<enum lock_mode>( + prebuilt->select_lock_type)); + + if (prebuilt->new_rec_locks >= 2) { + rec = btr_pcur_get_rec(clust_pcur); + + lock_rec_unlock( + trx, + btr_pcur_get_block(clust_pcur), + rec, + static_cast<enum lock_mode>( + prebuilt->select_lock_type)); + } + } +no_unlock: + mtr_commit(&mtr); + } + + trx->op_info = ""; +} + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ +{ + dberr_t err; + trx_t* trx; + + trx = thr_get_trx(thr); + + /* Increment fk_cascade_depth to record the recursive call depth on + a single update/delete that affects multiple tables chained + together with foreign key relations. */ + thr->fk_cascade_depth++; + + if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) { + return(DB_FOREIGN_EXCEED_MAX_CASCADE); + } +run_again: + thr->run_node = node; + thr->prev_node = node; + + DEBUG_SYNC_C("foreign_constraint_update_cascade"); + + row_upd_step(thr); + + /* The recursive call for cascading update/delete happens + in above row_upd_step(), reset the counter once we come + out of the recursive call, so it does not accumulate for + different row deletes */ + thr->fk_cascade_depth = 0; + + err = trx->error_state; + + /* Note that the cascade node is a subnode of another InnoDB + query graph node. We do a normal lock wait in this node, but + all errors are handled by the parent node. */ + + if (err == DB_LOCK_WAIT) { + /* Handle lock wait here */ + + que_thr_stop_for_mysql(thr); + + lock_wait_suspend_thread(thr); + + /* Note that a lock wait may also end in a lock wait timeout, + or this transaction is picked as a victim in selective + deadlock resolution */ + + if (trx->error_state != DB_SUCCESS) { + + return(trx->error_state); + } + + /* Retry operation after a normal lock wait */ + + goto run_again; + } + + if (err != DB_SUCCESS) { + + return(err); + } + + if (UNIV_UNLIKELY((trx->fake_changes))) { + + return(err); + } + + if (node->is_delete) { + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_dec(table); + + srv_stats.n_rows_deleted.add((size_t)trx->id, 1); + } else { + srv_stats.n_rows_updated.add((size_t)trx->id, 1); + } + + row_update_statistics_if_needed(table); + + return(err); +} + +/*********************************************************************//** +Checks if a table is such that we automatically created a clustered +index on it (on row id). 
+@return TRUE if the clustered index was generated automatically */ +UNIV_INTERN +ibool +row_table_got_default_clust_index( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + const dict_index_t* clust_index; + + clust_index = dict_table_get_first_index(table); + + return(dict_index_get_nth_col(clust_index, 0)->mtype == DATA_SYS); +} + +/*********************************************************************//** +Locks the data dictionary in shared mode from modifications, for performing +foreign key check, rollback, or other operation invisible to MySQL. */ +UNIV_INTERN +void +row_mysql_freeze_data_dictionary_func( +/*==================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + ulint line) /*!< in: line number */ +{ + ut_a(trx->dict_operation_lock_mode == 0); + + rw_lock_s_lock_inline(&dict_operation_lock, 0, file, line); + + trx->dict_operation_lock_mode = RW_S_LATCH; +} + +/*********************************************************************//** +Unlocks the data dictionary shared lock. */ +UNIV_INTERN +void +row_mysql_unfreeze_data_dictionary( +/*===============================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_ad(lock_trx_has_sys_table_locks(trx) == NULL); + + ut_a(trx->dict_operation_lock_mode == RW_S_LATCH); + + rw_lock_s_unlock(&dict_operation_lock); + + trx->dict_operation_lock_mode = 0; +} + +/*********************************************************************//** +Locks the data dictionary exclusively for performing a table create or other +data dictionary modification operation. */ +UNIV_INTERN +void +row_mysql_lock_data_dictionary_func( +/*================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + ulint line) /*!< in: line number */ +{ + ut_a(trx->dict_operation_lock_mode == 0 + || trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks or lock waits can occur then in these operations */ + + rw_lock_x_lock_inline(&dict_operation_lock, 0, file, line); + trx->dict_operation_lock_mode = RW_X_LATCH; + + mutex_enter(&(dict_sys->mutex)); +} + +/*********************************************************************//** +Unlocks the data dictionary exclusive lock. */ +UNIV_INTERN +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_ad(lock_trx_has_sys_table_locks(trx) == NULL); + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + mutex_exit(&(dict_sys->mutex)); + rw_lock_x_unlock(&dict_operation_lock); + + trx->dict_operation_lock_mode = 0; +} + +/*********************************************************************//** +Creates a table for MySQL. If the name of the table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also start the printing of monitor +output by the master thread. If the table name ends in "innodb_mem_validate", +InnoDB will try to invoke mem_validate(). On failure the transaction will +be rolled back and the 'table' object will be freed. 
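[Annotation] The four latching functions above define two modes: "freezing" takes dict_operation_lock shared (enough to keep the dictionary from changing under a foreign-key check or rollback), while "locking" takes it exclusive and additionally acquires the dictionary mutex for DDL. A sketch of the same pairing using plain POSIX primitives; this is only an analogy, since InnoDB's rw_lock_t and mutex carry extra instrumentation:

#include <cstdio>
#include <pthread.h>

static pthread_rwlock_t dict_operation_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t  dict_sys_mutex      = PTHREAD_MUTEX_INITIALIZER;

static void freeze_data_dictionary()   /* FK checks, rollback, ... */
{
    pthread_rwlock_rdlock(&dict_operation_lock);
}

static void unfreeze_data_dictionary()
{
    pthread_rwlock_unlock(&dict_operation_lock);
}

static void lock_data_dictionary()     /* CREATE/DROP TABLE, ... */
{
    pthread_rwlock_wrlock(&dict_operation_lock);
    pthread_mutex_lock(&dict_sys_mutex);
}

static void unlock_data_dictionary()
{
    /* Release in the reverse order of acquisition. */
    pthread_mutex_unlock(&dict_sys_mutex);
    pthread_rwlock_unlock(&dict_operation_lock);
}

int main()
{
    freeze_data_dictionary();
    printf("dictionary frozen (shared)\n");
    unfreeze_data_dictionary();

    lock_data_dictionary();
    printf("dictionary locked (exclusive)\n");
    unlock_data_dictionary();
    return 0;
}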
+@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true, commit the transaction */ +{ + tab_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + const char* table_name; + ulint table_name_len; + dberr_t err; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_start_of_row_create_table_for_mysql", + goto err_exit; + ); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); +err_exit: + dict_mem_table_free(table); + + if (commit) { + trx_commit_for_mysql(trx); + } + + return(DB_ERROR); + } + + trx->op_info = "creating table"; + + if (row_mysql_is_system_table(table->name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL system" + " table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be" + " of the MyISAM type!\n", + table->name); + goto err_exit; + } + + trx_start_if_not_started_xa(trx); + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. */ + table_name = dict_remove_db_name(table->name); + table_name_len = strlen(table_name) + 1; + + if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) { + + /* Table equals "innodb_monitor": + start monitor prints */ + + srv_print_innodb_monitor = TRUE; + + /* The lock timeout monitor thread also takes care + of InnoDB monitor prints */ + + os_event_set(lock_sys->timeout_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_lock_monitor)) { + + srv_print_innodb_monitor = TRUE; + srv_print_innodb_lock_monitor = TRUE; + os_event_set(lock_sys->timeout_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = TRUE; + os_event_set(lock_sys->timeout_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = TRUE; + os_event_set(lock_sys->timeout_event); +#ifdef UNIV_MEM_DEBUG + } else if (STR_EQ(table_name, table_name_len, + S_innodb_mem_validate)) { + /* We define here a debugging feature intended for + developers */ + + fputs("Validating InnoDB memory:\n" + "to use this feature you must compile InnoDB with\n" + "UNIV_MEM_DEBUG defined in univ.i and" + " the server must be\n" + "quiet because allocation from a mem heap" + " is not protected\n" + "by any semaphore.\n", stderr); + ut_a(mem_validate()); + fputs("Memory validated\n", stderr); +#endif /* UNIV_MEM_DEBUG */ + } + + heap = mem_heap_create(512); + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + case TRX_DICT_OP_TABLE: + break; + case TRX_DICT_OP_INDEX: + /* If the transaction was previously flagged as + TRX_DICT_OP_INDEX, we should be creating auxiliary + tables for full-text indexes. 
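[Annotation] The magic-name matching above works on the table name with its database prefix removed, so a table named innodb_monitor triggers monitor output no matter which database it is created in. An illustrative reduction; remove_db_name() mimics what dict_remove_db_name() is used for here and is not the real function:

#include <cassert>
#include <cstring>

/* Table names are "database/table"; the magic monitor names are
   matched on the part after the '/'. */
static const char* remove_db_name(const char* name)
{
    const char* slash = strchr(name, '/');
    return slash ? slash + 1 : name;
}

int main()
{
    assert(strcmp(remove_db_name("test/innodb_monitor"),
                  "innodb_monitor") == 0);
    assert(strcmp(remove_db_name("prod/innodb_monitor"),
                  "innodb_monitor") == 0);
    return 0;
}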
*/ + ut_ad(strstr(table->name, "/FTS_") != NULL); + } + + node = tab_create_graph_create(table, heap, commit); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); + + que_run_threads(thr); + + err = trx->error_state; + + if (table->space != TRX_SYS_SPACE) { + ut_a(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE)); + + /* Update SYS_TABLESPACES and SYS_DATAFILES if a new + tablespace was created. */ + if (err == DB_SUCCESS) { + char* path; + path = fil_space_get_first_path(table->space); + + err = dict_create_add_tablespace_to_dictionary( + table->space, table->name, + fil_space_get_flags(table->space), + path, trx, commit); + + mem_free(path); + } + + if (err != DB_SUCCESS) { + /* We must delete the link file. */ + fil_delete_link_file(table->name); + } + } + + switch (err) { + case DB_SUCCESS: + break; + case DB_OUT_OF_FILE_SPACE: + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: cannot create table ", + stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" because tablespace full\n", stderr); + + if (dict_table_open_on_name(table->name, TRUE, FALSE, + DICT_ERR_IGNORE_NONE)) { + + /* Make things easy for the drop table code. */ + + if (table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(table); + } + + dict_table_close(table, TRUE, FALSE); + + row_drop_table_for_mysql(table->name, trx, FALSE); + + if (commit) { + trx_commit_for_mysql(trx); + } + } else { + dict_mem_table_free(table); + } + + break; + + case DB_TOO_MANY_CONCURRENT_TRXS: + /* We already have .ibd file here. it should be deleted. */ + + if (table->space + && fil_delete_tablespace( + table->space, + BUF_REMOVE_FLUSH_NO_WRITE) + != DB_SUCCESS) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: not able to" + " delete tablespace %lu of table ", + (ulong) table->space); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("!\n", stderr); + } + /* fall through */ + + case DB_DUPLICATE_KEY: + case DB_TABLESPACE_EXISTS: + default: + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + dict_mem_table_free(table); + break; + } + + que_graph_free((que_t*) que_node_get_parent(thr)); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. +@return error number or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths) /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. 
*/ +{ + ind_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + dberr_t err; + ulint i; + ulint len; + char* table_name; + char* index_name; + dict_table_t* table; + ibool is_fts; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx->op_info = "creating index"; + + /* Copy the table name because we may want to drop the + table later, after the index object is freed (inside + que_run_threads()) and thus index->table_name is not available. */ + table_name = mem_strdup(index->table_name); + index_name = mem_strdup(index->name); + + is_fts = (index->type == DICT_FTS); + + table = dict_table_open_on_name(table_name, TRUE, TRUE, + DICT_ERR_IGNORE_NONE); + + trx_start_if_not_started_xa(trx); + + for (i = 0; i < index->n_def; i++) { + /* Check that prefix_len and actual length + < DICT_MAX_INDEX_COL_LEN */ + + len = dict_index_get_nth_field(index, i)->prefix_len; + + if (field_lengths && field_lengths[i]) { + len = ut_max(len, field_lengths[i]); + } + + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_create_index", + len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1; + ); + + /* Column or prefix length exceeds maximum column length */ + if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) { + err = DB_TOO_BIG_INDEX_COL; + + dict_mem_index_free(index); + goto error_handling; + } + } + + heap = mem_heap_create(512); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + /* Note that the space id where we store the index is inherited from + the table in dict_build_index_def_step() in dict0crea.cc. */ + + node = ind_create_graph_create(index, heap, true); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); + + que_run_threads(thr); + + err = trx->error_state; + + que_graph_free((que_t*) que_node_get_parent(thr)); + + /* Create the index specific FTS auxiliary tables. */ + if (err == DB_SUCCESS && is_fts) { + dict_index_t* idx; + + idx = dict_table_get_index_on_name(table, index_name); + + ut_ad(idx); + err = fts_create_index_tables(trx, idx); + } + +error_handling: + dict_table_close(table, TRUE, FALSE); + + if (err != DB_SUCCESS) { + /* We have special error handling here */ + + trx->error_state = DB_SUCCESS; + + trx_rollback_to_savepoint(trx, NULL); + + row_drop_table_for_mysql(table_name, trx, FALSE); + + trx_commit_for_mysql(trx); + + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; + + mem_free(table_name); + mem_free(index_name); + + return(err); +} + +/*********************************************************************//** +Scans a table create SQL string and adds to the data dictionary +the foreign key constraints declared in the string. This function +should be called after the indexes for a table have been created. +Each foreign key constraint must be accompanied with indexes in +both participating tables. The indexes are allowed to contain more +fields than mentioned in the constraint. Check also that foreign key +constraints which reference this table are ok. 
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_table_add_foreign_constraints(
+/*==============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	sql_string,	/*!< in: table create statement where
+					foreign keys are declared like:
+					FOREIGN KEY (a, b) REFERENCES table2(c, d),
+					table2 can be written also with the
+					database name before it: test.table2 */
+	size_t		sql_length,	/*!< in: length of sql_string */
+	const char*	name,		/*!< in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks)	/*!< in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+{
+	dberr_t	err;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_a(sql_string);
+
+	trx->op_info = "adding foreign keys";
+
+	trx_start_if_not_started_xa(trx);
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	err = dict_create_foreign_constraints(trx, sql_string, sql_length,
+					      name, reject_fks);
+
+	DBUG_EXECUTE_IF("ib_table_add_foreign_fail",
+			err = DB_DUPLICATE_KEY;);
+
+	DEBUG_SYNC_C("table_add_foreign_constraints");
+
+	if (err == DB_SUCCESS) {
+		/* Check that the referencing constraints are also ok */
+		err = dict_load_foreigns(name, NULL, false, true,
+					 DICT_ERR_IGNORE_NONE);
+	}
+
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_rollback_to_savepoint(trx, NULL);
+
+		row_drop_table_for_mysql(name, trx, FALSE);
+
+		trx_commit_for_mysql(trx);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL as a background operation. In ALTER TABLE on
+Unix, MySQL relies on the fact that the table handler does not remove
+the table before all handles to it have been removed. Furthermore,
+MySQL's call to drop a table must be non-blocking. Therefore we do the
+drop table as a background operation, which is taken care of by the
+master thread in srv0srv.cc.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+	const char*	name)	/*!< in: table name */
+{
+	dberr_t	error;
+	trx_t*	trx;
+
+	trx = trx_allocate_for_background();
+
+	/* If the original transaction was dropping a table referenced by
+	foreign keys, we must set the following to be able to drop the
+	table: */
+
+	trx->check_foreigns = FALSE;
+
+	/* fputs("InnoDB: Error: Dropping table ", stderr);
+	ut_print_name(stderr, trx, TRUE, name);
+	fputs(" in background drop list\n", stderr); */
+
+	/* Try to drop the table in InnoDB */
+
+	error = row_drop_table_for_mysql(name, trx, FALSE);
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	trx_commit_for_mysql(trx);
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+The master thread in srv0srv.cc calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
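+A failed drop leaves the entry in the list, and the master thread simply
+retries it on a later pass.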
+@return how many tables dropped + remaining tables in list */ +UNIV_INTERN +ulint +row_drop_tables_for_mysql_in_background(void) +/*=========================================*/ +{ + row_mysql_drop_t* drop; + dict_table_t* table; + ulint n_tables; + ulint n_tables_dropped = 0; +loop: + mutex_enter(&row_drop_list_mutex); + + ut_a(row_mysql_drop_list_inited); + + drop = UT_LIST_GET_FIRST(row_mysql_drop_list); + + n_tables = UT_LIST_GET_LEN(row_mysql_drop_list); + + mutex_exit(&row_drop_list_mutex); + + if (drop == NULL) { + /* All tables dropped */ + + return(n_tables + n_tables_dropped); + } + + table = dict_table_open_on_name(drop->table_name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); + + if (table == NULL) { + /* If for some reason the table has already been dropped + through some other mechanism, do not try to drop it */ + + goto already_dropped; + } + + ut_a(!table->can_be_evicted); + + dict_table_close(table, FALSE, FALSE); + + if (DB_SUCCESS != row_drop_table_for_mysql_in_background( + drop->table_name)) { + /* If the DROP fails for some table, we return, and let the + main thread retry later */ + + return(n_tables + n_tables_dropped); + } + + n_tables_dropped++; + +already_dropped: + mutex_enter(&row_drop_list_mutex); + + UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop); + + MONITOR_DEC(MONITOR_BACKGROUND_DROP_TABLE); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Dropped table ", stderr); + ut_print_name(stderr, NULL, TRUE, drop->table_name); + fputs(" in background drop queue.\n", stderr); + + mem_free(drop->table_name); + + mem_free(drop); + + mutex_exit(&row_drop_list_mutex); + + goto loop; +} + +/*********************************************************************//** +Get the background drop list length. NOTE: the caller must own the +drop list mutex! +@return how many tables in list */ +UNIV_INTERN +ulint +row_get_background_drop_list_len_low(void) +/*======================================*/ +{ + ulint len; + + mutex_enter(&row_drop_list_mutex); + + ut_a(row_mysql_drop_list_inited); + + len = UT_LIST_GET_LEN(row_mysql_drop_list); + + mutex_exit(&row_drop_list_mutex); + + return(len); +} + +/*********************************************************************//** +If a table is not yet in the drop list, adds the table to the list of tables +which the master thread drops in background. We need this on Unix because in +ALTER TABLE MySQL may call drop table even if the table has running queries on +it. Also, if there are running foreign key checks on the table, we drop the +table lazily. 
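+The drop list mutex is acquired inside this function; the caller must not
+hold it.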
+@return TRUE if the table was not yet in the drop list, and was added there */ +static +ibool +row_add_table_to_background_drop_list( +/*==================================*/ + const char* name) /*!< in: table name */ +{ + row_mysql_drop_t* drop; + + mutex_enter(&row_drop_list_mutex); + + ut_a(row_mysql_drop_list_inited); + + /* Look if the table already is in the drop list */ + for (drop = UT_LIST_GET_FIRST(row_mysql_drop_list); + drop != NULL; + drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop)) { + + if (strcmp(drop->table_name, name) == 0) { + /* Already in the list */ + + mutex_exit(&row_drop_list_mutex); + + return(FALSE); + } + } + + drop = static_cast<row_mysql_drop_t*>( + mem_alloc(sizeof(row_mysql_drop_t))); + + drop->table_name = mem_strdup(name); + + UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop); + + MONITOR_INC(MONITOR_BACKGROUND_DROP_TABLE); + + /* fputs("InnoDB: Adding table ", stderr); + ut_print_name(stderr, trx, TRUE, drop->table_name); + fputs(" to background drop list\n", stderr); */ + + mutex_exit(&row_drop_list_mutex); + + return(TRUE); +} + +/*********************************************************************//** +Reassigns the table identifier of a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_table_id_reassign( +/*========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx, /*!< in/out: transaction */ + table_id_t* new_id) /*!< out: new table id */ +{ + dberr_t err; + pars_info_t* info = pars_info_create(); + + dict_hdr_get_new_id(new_id, NULL, NULL); + + /* Remove all locks except the table-level S and X locks. */ + lock_remove_all_on_table(table, FALSE); + + pars_info_add_ull_literal(info, "old_id", table->id); + pars_info_add_ull_literal(info, "new_id", *new_id); + + err = que_eval_sql( + info, + "PROCEDURE RENUMBER_TABLE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET ID = :new_id\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "END;\n", FALSE, trx); + + return(err); +} + +/*********************************************************************//** +Setup the pre-requisites for DISCARD TABLESPACE. It will start the transaction, +acquire the data dictionary lock in X mode and open the table. +@return table instance or 0 if not found. */ +static +dict_table_t* +row_discard_tablespace_begin( +/*=========================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ +{ + trx->op_info = "discarding tablespace"; + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + trx_start_if_not_started_xa(trx); + + /* Serialize data dictionary operations with dictionary mutex: + this is to avoid deadlocks during data dictionary operations */ + + row_mysql_lock_data_dictionary(trx); + + dict_table_t* table; + + table = dict_table_open_on_name( + name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); + + if (table) { + dict_stats_wait_bg_to_stop_using_table(table, trx); + ut_a(table->space != TRX_SYS_SPACE); + ut_a(table->n_foreign_key_checks_running == 0); + } + + return(table); +} + +/*********************************************************************//** +Do the foreign key constraint checks. +@return DB_SUCCESS or error code. 
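+(Discarding a referenced table is only allowed when FOREIGN_KEY_CHECKS=0;
+see the check below.)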
*/
+static
+dberr_t
+row_discard_tablespace_foreign_key_checks(
+/*======================================*/
+	const trx_t*		trx,	/*!< in: transaction handle */
+	const dict_table_t*	table)	/*!< in: table to be discarded */
+{
+
+	if (srv_read_only_mode || !trx->check_foreigns) {
+		return(DB_SUCCESS);
+	}
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+	dict_foreign_set::iterator	it
+		= std::find_if(table->referenced_set.begin(),
+			       table->referenced_set.end(),
+			       dict_foreign_different_tables());
+
+	if (it == table->referenced_set.end()) {
+		return(DB_SUCCESS);
+	}
+
+	const dict_foreign_t*	foreign	= *it;
+	FILE*			ef	= dict_foreign_err_file;
+
+	ut_ad(foreign->foreign_table != table);
+	ut_ad(foreign->referenced_table == table);
+
+	/* We only allow discarding a referenced table if
+	FOREIGN_KEY_CHECKS is set to 0 */
+
+	mutex_enter(&dict_foreign_err_mutex);
+
+	rewind(ef);
+
+	ut_print_timestamp(ef);
+
+	fputs("  Cannot DISCARD table ", ef);
+	ut_print_name(ef, trx, TRUE, table->name);
+	fputs("\n"
+	      "because it is referenced by ", ef);
+	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+	putc('\n', ef);
+
+	mutex_exit(&dict_foreign_err_mutex);
+
+	return(DB_CANNOT_DROP_CONSTRAINT);
+}
+
+/*********************************************************************//**
+Cleanup after the DISCARD TABLESPACE operation.
+@return error code. */
+static
+dberr_t
+row_discard_tablespace_end(
+/*=======================*/
+	trx_t*		trx,	/*!< in/out: transaction handle */
+	dict_table_t*	table,	/*!< in/out: table to be discarded */
+	dberr_t		err)	/*!< in: error code */
+{
+	if (table != 0) {
+		dict_table_close(table, TRUE, FALSE);
+	}
+
+	DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
+			log_make_checkpoint_at(LSN_MAX, TRUE);
+			DBUG_SUICIDE(););
+
+	trx_commit_for_mysql(trx);
+
+	DBUG_EXECUTE_IF("ib_discard_after_commit_crash",
+			log_make_checkpoint_at(LSN_MAX, TRUE);
+			DBUG_SUICIDE(););
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Do the DISCARD TABLESPACE operation.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace(
+/*===================*/
+	trx_t*		trx,	/*!< in/out: transaction handle */
+	dict_table_t*	table)	/*!< in/out: table to be discarded */
+{
+	dberr_t		err;
+
+	/* How do we prevent crashes caused by ongoing operations on
+	the table? Old operations could try to access non-existent
+	pages. MySQL will block all DML on the table using MDL and a
+	DISCARD will not start unless all existing operations on the
+	table to be discarded are completed.
+
+	1) Acquire the data dictionary latch in X mode, to prevent any
+	internal operations that MySQL is not aware of, and also for
+	the internal SQL parser.
+
+	2) Purge and rollback: we assign a new table id for the
+	table. Since purge and rollback look for the table based on
+	the table id, they see the table as 'dropped' and discard
+	their operations.
+
+	3) Insert buffer: we remove all entries for the tablespace in
+	the insert buffer tree.
+
+	4) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0,
+	we do not allow the discard.
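+
+	As an illustrative client-side sequence that exercises this code
+	path (transportable tablespaces), one would roughly do:
+
+		SET FOREIGN_KEY_CHECKS=0;
+		ALTER TABLE t DISCARD TABLESPACE;
+		-- copy a replacement t.ibd into place, then:
+		ALTER TABLE t IMPORT TABLESPACE;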
*/ + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + table_id_t new_id; + + /* Set the TABLESPACE DISCARD flag in the table definition on disk. */ + + err = row_import_update_discarded_flag(trx, table->id, true, true); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Update the index root pages in the system tables, on disk */ + + err = row_import_update_index_root(trx, table, true, true); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Drop all the FTS auxiliary tables. */ + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + + fts_drop_tables(trx, table); + } + + /* Assign a new space ID to the table definition so that purge + can ignore the changes. Update the system table on disk. */ + + err = row_mysql_table_id_reassign(table, trx, &new_id); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Discard the physical file that is used for the tablespace. */ + + err = fil_discard_tablespace(table->space); + + switch(err) { + case DB_SUCCESS: + case DB_IO_ERROR: + case DB_TABLESPACE_NOT_FOUND: + /* All persistent operations successful, update the + data dictionary memory cache. */ + + table->ibd_file_missing = TRUE; + + table->flags2 |= DICT_TF2_DISCARDED; + + dict_table_change_id_in_cache(table, new_id); + + /* Reset the root page numbers. */ + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + index->page = FIL_NULL; + index->space = FIL_NULL; + } + + /* If the tablespace did not already exist or we couldn't + write to it, we treat that as a successful DISCARD. It is + unusable anyway. */ + + err = DB_SUCCESS; + break; + + default: + /* We need to rollback the disk changes, something failed. */ + + trx->error_state = DB_SUCCESS; + + trx_rollback_to_savepoint(trx, NULL); + + trx->error_state = DB_SUCCESS; + } + + return(err); +} + +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function renames the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set to TRUE. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_discard_tablespace_for_mysql( +/*=============================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ +{ + dberr_t err; + dict_table_t* table; + + /* Open the table and start the transaction if not started. */ + + table = row_discard_tablespace_begin(name, trx); + + if (table == 0) { + err = DB_TABLE_NOT_FOUND; + } else if (table->space == TRX_SYS_SPACE) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); + + err = DB_ERROR; + + } else if (table->n_foreign_key_checks_running > 0) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_DISCARD_FK_CHECKS_RUNNING, table_name); + + err = DB_ERROR; + + } else { + /* Do foreign key constraint checks. 
*/ + + err = row_discard_tablespace_foreign_key_checks(trx, table); + + if (err == DB_SUCCESS) { + err = row_discard_tablespace(trx, table); + } + } + + return(row_discard_tablespace_end(trx, table, err)); +} + +/*********************************************************************//** +Sets an exclusive lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */ + const char* op_info) /*!< in: string for trx->op_info */ +{ + mem_heap_t* heap; + que_thr_t* thr; + dberr_t err; + sel_node_t* node; + + ut_ad(trx); + ut_ad(mode == LOCK_X || mode == LOCK_S); + + heap = mem_heap_create(512); + + trx->op_info = op_info; + + node = sel_node_create(heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + thr->graph->state = QUE_FORK_ACTIVE; + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr( + static_cast<que_fork_t*>(que_node_get_parent(thr))); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + err = lock_table(0, table, mode, thr); + + trx->error_state = err; + + if (err == DB_SUCCESS) { + que_thr_stop_for_mysql_no_error(thr, trx); + } else { + que_thr_stop_for_mysql(thr); + + if (err != DB_QUE_THR_SUSPENDED) { + ibool was_lock_wait; + + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + } else { + que_thr_t* run_thr; + que_node_t* parent; + + parent = que_node_get_parent(thr); + + run_thr = que_fork_start_command( + static_cast<que_fork_t*>(parent)); + + ut_a(run_thr == thr); + + /* There was a lock wait but the thread was not + in a ready to run or running state. */ + trx->error_state = DB_LOCK_WAIT; + + goto run_again; + } + } + + que_graph_free(thr->graph); + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Truncates a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_truncate_table_for_mysql( +/*=========================*/ + dict_table_t* table, /*!< in: table handle */ + trx_t* trx) /*!< in: transaction handle */ +{ + dberr_t err; + mem_heap_t* heap; + byte* buf; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_index; + btr_pcur_t pcur; + mtr_t mtr; + table_id_t new_id; + ulint recreate_space = 0; + pars_info_t* info = NULL; + ibool has_internal_doc_id; + ulint old_space = table->space; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. + + 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive + InnoDB table lock on the table before we can do TRUNCATE + TABLE. Then there are no running queries on the table. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. + + 3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, + so we do not have to remove insert buffer records, as the + insert buffer works at a low level. If a freed page is later + reallocated, the allocator will remove the ibuf entries for + it. 
+ + When we truncate *.ibd files by recreating them (analogous to + DISCARD TABLESPACE), we remove all entries for the table in the + insert buffer tree. This is not strictly necessary, because + in 6) we will assign a new tablespace identifier, but we can + free up some space in the system tablespace. + + 4) Linear readahead and random readahead: we use the same + method as in 3) to discard ongoing operations. (This is only + relevant for TRUNCATE TABLE by DISCARD TABLESPACE.) + + 5) FOREIGN KEY operations: if + table->n_foreign_key_checks_running > 0, we do not allow the + TRUNCATE. We also reserve the data dictionary latch. + + 6) Crash recovery: To prevent the application of pre-truncation + redo log records on the truncated tablespace, we will assign + a new tablespace identifier to the truncated tablespace. */ + + ut_ad(table); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); + + return(DB_ERROR); + } + + if (dict_table_is_discarded(table)) { + return(DB_TABLESPACE_DELETED); + } else if (table->ibd_file_missing) { + return(DB_TABLESPACE_NOT_FOUND); + } + + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + + trx->op_info = "truncating table"; + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + ut_a(trx->dict_operation_lock_mode == 0); + /* Prevent foreign key checks etc. while we are truncating the + table */ + row_mysql_lock_data_dictionary(trx); + + ut_ad(mutex_own(&(dict_sys->mutex))); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + dict_stats_wait_bg_to_stop_using_table(table, trx); + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + dict_foreign_set::iterator it + = std::find_if(table->referenced_set.begin(), + table->referenced_set.end(), + dict_foreign_different_tables()); + + if (!srv_read_only_mode + && it != table->referenced_set.end() + && trx->check_foreigns) { + + FILE* ef = dict_foreign_err_file; + dict_foreign_t* foreign = *it; + + /* We only allow truncating a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot truncate table ", ef); + ut_print_name(ef, trx, TRUE, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because it is referenced by ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_ERROR; + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that + they can cope with the table having been truncated here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot truncate table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because there is a foreign key check" + " running on it.\n", + stderr); + err = DB_ERROR; + + goto funct_exit; + } + + /* Check if memcached plugin is running on this table. 
If it is, we do not
+	allow truncating the table. */
+	if (table->memcached_sync_count != 0) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Cannot truncate table ", stderr);
+		ut_print_name(stderr, trx, TRUE, table->name);
+		fputs(" by DROP+CREATE\n"
+		      "InnoDB: because there are memcached operations"
+		      " running on it.\n",
+		      stderr);
+		err = DB_ERROR;
+
+		goto funct_exit;
+	} else {
+		/* We need to set this counter to -1 to block
+		memcached operations. */
+		table->memcached_sync_count = DICT_TABLE_IN_DDL;
+	}
+
+	/* Remove all locks except the table-level X lock. */
+
+	lock_remove_all_on_table(table, FALSE);
+
+	/* Ensure that the table will be dropped by
+	trx_rollback_active() in case of a crash. */
+
+	trx->table_id = table->id;
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	/* Assign an undo segment for the transaction, so that the
+	transaction will be recovered after a crash. */
+
+	mutex_enter(&trx->undo_mutex);
+
+	err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+	mutex_exit(&trx->undo_mutex);
+
+	if (err != DB_SUCCESS) {
+
+		goto funct_exit;
+	}
+
+	if (table->space && !table->dir_path_of_temp_table) {
+		/* Discard and create the single-table tablespace. */
+		ulint	space	= table->space;
+		ulint	flags	= fil_space_get_flags(space);
+
+		ut_a(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY));
+
+		dict_get_and_save_data_dir_path(table, true);
+
+		if (flags != ULINT_UNDEFINED
+		    && fil_discard_tablespace(space) == DB_SUCCESS) {
+
+			dict_index_t*	index;
+
+			dict_hdr_get_new_id(NULL, NULL, &space);
+
+			/* Lock all index trees for this table. We must
+			do so after dict_hdr_get_new_id() to preserve
+			the latch order */
+			dict_table_x_lock_indexes(table);
+
+			if (space == ULINT_UNDEFINED
+			    || fil_create_new_single_table_tablespace(
+				    space, table->name,
+				    table->data_dir_path,
+				    flags, table->flags2,
+				    FIL_IBD_FILE_INITIAL_SIZE)
+			    != DB_SUCCESS) {
+				dict_table_x_unlock_indexes(table);
+
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"TRUNCATE TABLE %s failed to "
+					"create a new tablespace",
+					table->name);
+
+				table->ibd_file_missing = 1;
+				err = DB_ERROR;
+				goto funct_exit;
+			}
+
+			recreate_space = space;
+
+			/* Replace the space_id in the data dictionary cache.
+			The persistent data dictionary (SYS_TABLES.SPACE
+			and SYS_INDEXES.SPACE) is updated later in this
+			function. */
+			table->space = space;
+			index = dict_table_get_first_index(table);
+			do {
+				index->space = space;
+				index = dict_table_get_next_index(index);
+			} while (index);
+
+			mtr_start(&mtr);
+			fsp_header_init(space,
+					FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+			mtr_commit(&mtr);
+		}
+	} else {
+		/* Lock all index trees for this table, as we will
+		truncate the table/index and possibly change their metadata.
+ All DML/DDL are blocked by table level lock, with + a few exceptions such as queries into information schema + about the table, MySQL could try to access index stats + for this kind of query, we need to use index locks to + sync up */ + dict_table_x_lock_indexes(table); + } + + /* scan SYS_INDEXES for all indexes of the table */ + heap = mem_heap_create(800); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + sys_index = dict_table_get_first_index(dict_sys->sys_indexes); + dict_index_copy_types(tuple, sys_index, 1); + + mtr_start(&mtr); + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_MODIFY_LEAF, &pcur, &mtr); + for (;;) { + rec_t* rec; + const byte* field; + ulint len; + ulint root_page_no; + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* The end of SYS_INDEXES has been reached. */ + break; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); + ut_ad(len == 8); + + if (memcmp(buf, field, len) != 0) { + /* End of indexes for the table (TABLE_ID mismatch). */ + break; + } + + if (rec_get_deleted_flag(rec, FALSE)) { + /* The index has been dropped. */ + goto next_rec; + } + + /* This call may commit and restart mtr + and reposition pcur. */ + root_page_no = dict_truncate_index_tree(table, recreate_space, + &pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + if (root_page_no != FIL_NULL) { + page_rec_write_field( + rec, DICT_FLD__SYS_INDEXES__PAGE_NO, + root_page_no, &mtr); + /* We will need to commit and restart the + mini-transaction in order to avoid deadlocks. + The dict_truncate_index_tree() call has allocated + a page in this mini-transaction, and the rest of + this loop could latch another index page. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, + &pcur, &mtr); + } + +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + mem_heap_free(heap); + /* Done with index truncation, release index tree locks, + subsequent work relates to table level metadata change */ + dict_table_x_unlock_indexes(table); + + dict_hdr_get_new_id(&new_id, NULL, NULL); + + /* Create new FTS auxiliary tables with the new_id, and + drop the old index later, only if everything runs successful. 
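+	(The auxiliary table names embed the table id, which is
+	presumably why a parallel set must be created under new_id.)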
*/ + has_internal_doc_id = dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET( + table, DICT_TF2_FTS_HAS_DOC_ID); + if (has_internal_doc_id) { + dict_table_t fts_table; + ulint i; + + fts_table.name = table->name; + fts_table.id = new_id; + fts_table.flags2 = table->flags2; + + err = fts_create_common_tables( + trx, &fts_table, table->name, TRUE); + + for (i = 0; + i < ib_vector_size(table->fts->indexes) + && err == DB_SUCCESS; + i++) { + + dict_index_t* fts_index; + + fts_index = static_cast<dict_index_t*>( + ib_vector_getp(table->fts->indexes, i)); + + err = fts_create_index_tables_low( + trx, fts_index, table->name, new_id); + } + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); + fputs(" InnoDB: Unable to truncate FTS index for" + " table", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n", stderr); + + goto funct_exit; + } else { + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + } + } + + info = pars_info_create(); + + pars_info_add_int4_literal(info, "new_space", (lint) table->space); + pars_info_add_ull_literal(info, "old_id", table->id); + pars_info_add_ull_literal(info, "new_id", new_id); + + err = que_eval_sql(info, + "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES" + " SET ID = :new_id, SPACE = :new_space\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES" + " SET TABLE_ID = :new_id, SPACE = :new_space\n" + " WHERE TABLE_ID = :old_id;\n" + "END;\n" + , FALSE, trx); + + if (err == DB_SUCCESS && old_space != table->space) { + info = pars_info_create(); + + pars_info_add_int4_literal(info, "old_space", (lint) old_space); + + pars_info_add_int4_literal( + info, "new_space", (lint) table->space); + + err = que_eval_sql(info, + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET SPACE = :new_space\n" + " WHERE SPACE = :old_space;\n" + "UPDATE SYS_DATAFILES" + " SET SPACE = :new_space" + " WHERE SPACE = :old_space;\n" + "END;\n" + , FALSE, trx); + } + DBUG_EXECUTE_IF("ib_ddl_crash_before_fts_truncate", err = DB_ERROR;); + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + trx->error_state = DB_SUCCESS; + + /* Update system table failed. Table in memory metadata + could be in an inconsistent state, mark the in-memory + table->corrupted to be true. In the long run, this should + be fixed by atomic truncate table */ + table->corrupted = true; + + ut_print_timestamp(stderr); + fputs(" InnoDB: Unable to assign a new identifier to table ", + stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n" + "InnoDB: after truncating it. 
Background processes" + " may corrupt the table!\n", stderr); + + /* Failed to update the table id, so drop the new + FTS auxiliary tables */ + if (has_internal_doc_id) { + ut_ad(trx->state == TRX_STATE_NOT_STARTED); + + table_id_t id = table->id; + + table->id = new_id; + + fts_drop_tables(trx, table); + + table->id = id; + + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + } + + err = DB_ERROR; + } else { + /* Drop the old FTS index */ + if (has_internal_doc_id) { + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + fts_drop_tables(trx, table); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + } + + DBUG_EXECUTE_IF("ib_truncate_crash_after_fts_drop", + DBUG_SUICIDE();); + + dict_table_change_id_in_cache(table, new_id); + + /* Reset the Doc ID in cache to 0 */ + if (has_internal_doc_id && table->fts->cache) { + table->fts->fts_status |= TABLE_DICT_LOCKED; + fts_update_next_doc_id(trx, table, NULL, 0); + fts_cache_clear(table->fts->cache); + fts_cache_init(table->fts->cache); + table->fts->fts_status &= ~TABLE_DICT_LOCKED; + } + } + + /* Reset auto-increment. */ + dict_table_autoinc_lock(table); + dict_table_autoinc_initialize(table, 1); + dict_table_autoinc_unlock(table); + + trx_commit_for_mysql(trx); + +funct_exit: + + if (table->memcached_sync_count == DICT_TABLE_IN_DDL) { + /* We need to set the memcached sync back to 0, unblock + memcached operationse. */ + table->memcached_sync_count = 0; + } + + row_mysql_unlock_data_dictionary(trx); + + dict_stats_update(table, DICT_STATS_EMPTY_TABLE); + + trx->op_info = ""; + + srv_wake_master_thread(); + + return(err); +} + +/*********************************************************************//** +Drops a table for MySQL. If the name of the dropped table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also stop the printing of monitor +output by the master thread. If the data dictionary was not already locked +by the transaction, the transaction will be committed. Otherwise, the +data dictionary will remain locked. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_drop_table_for_mysql( +/*=====================*/ + const char* name, /*!< in: table name */ + trx_t* trx, /*!< in: transaction handle */ + bool drop_db,/*!< in: true=dropping whole database */ + bool nonatomic) + /*!< in: whether it is permitted + to release and reacquire dict_operation_lock */ +{ + dberr_t err; + dict_foreign_t* foreign; + dict_table_t* table; + ibool print_msg; + ulint space_id; + char* filepath = NULL; + const char* tablename_minus_db; + char* tablename = NULL; + bool ibd_file_missing; + ulint namelen; + bool locked_dictionary = false; + pars_info_t* info = NULL; + mem_heap_t* heap = NULL; + + DBUG_ENTER("row_drop_table_for_mysql"); + + DBUG_PRINT("row_drop_table_for_mysql", ("table: %s", name)); + + ut_a(name != NULL); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); + + DBUG_RETURN(DB_ERROR); + } + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. 
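+	For example, dropping either test/innodb_monitor or
+	mydb/innodb_monitor turns the monitor output off.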
*/ + tablename_minus_db = strchr(name, '/'); + + if (tablename_minus_db) { + tablename_minus_db++; + } else { + /* Ancillary FTS tables don't have '/' characters. */ + tablename_minus_db = name; + } + + namelen = strlen(tablename_minus_db) + 1; + + if (namelen == sizeof S_innodb_monitor + && !memcmp(tablename_minus_db, S_innodb_monitor, + sizeof S_innodb_monitor)) { + + /* Table name equals "innodb_monitor": + stop monitor prints */ + + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_lock_monitor + && !memcmp(tablename_minus_db, S_innodb_lock_monitor, + sizeof S_innodb_lock_monitor)) { + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_tablespace_monitor + && !memcmp(tablename_minus_db, S_innodb_tablespace_monitor, + sizeof S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = FALSE; + } else if (namelen == sizeof S_innodb_table_monitor + && !memcmp(tablename_minus_db, S_innodb_table_monitor, + sizeof S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = FALSE; + } + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + trx->op_info = "dropping table"; + + /* This function is called recursively via fts_drop_tables(). */ + if (trx->state == TRX_STATE_NOT_STARTED) { + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + } + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + /* Prevent foreign key checks etc. while we are dropping the + table */ + + row_mysql_lock_data_dictionary(trx); + + locked_dictionary = true; + nonatomic = true; + } + + ut_ad(mutex_own(&(dict_sys->mutex))); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + table = dict_table_open_on_name( + name, TRUE, FALSE, + static_cast<dict_err_ignore_t>( + DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT)); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is" + " trying to drop it.\n" + "InnoDB: Have you copied the .frm file" + " of the table to the\n" + "InnoDB: MySQL database directory" + " from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + } + + /* Turn on this drop bit before we could release the dictionary + latch */ + table->to_be_dropped = true; + + if (nonatomic) { + /* This trx did not acquire any locks on dictionary + table records yet. Thus it is safe to release and + reacquire the data dictionary latches. */ + if (table->fts) { + ut_ad(!table->fts->add_wq); + ut_ad(lock_trx_has_sys_table_locks(trx) == 0); + + row_mysql_unlock_data_dictionary(trx); + fts_optimize_remove_table(table); + row_mysql_lock_data_dictionary(trx); + } + + /* Do not bother to deal with persistent stats for temp + tables since we know temp tables do not use persistent + stats. */ + if (!dict_table_is_temporary(table)) { + dict_stats_wait_bg_to_stop_using_table( + table, trx); + } + } + + /* make sure background stats thread is not running on the table */ + ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)); + + /* Delete the link file if used. 
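+	(That is, the .isl file recording a remote DATA DIRECTORY
+	location.)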
*/
+	if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+		fil_delete_link_file(name);
+	}
+
+	if (!dict_table_is_temporary(table)) {
+
+		dict_stats_recalc_pool_del(table);
+
+		/* Remove stats for this table and all of its indexes from the
+		persistent storage if it exists and if there are stats for this
+		table in there. This function creates its own trx and commits
+		it. */
+		char	errstr[1024];
+		err = dict_stats_drop_table(name, errstr, sizeof(errstr));
+
+		if (err != DB_SUCCESS) {
+			ib_logf(IB_LOG_LEVEL_WARN, "%s", errstr);
+		}
+	}
+
+	/* Move the table to the non-LRU list so that it isn't
+	considered for eviction. */
+
+	if (table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(table);
+	}
+
+	dict_table_close(table, TRUE, FALSE);
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	if (!srv_read_only_mode && trx->check_foreigns) {
+
+		for (dict_foreign_set::iterator it
+			= table->referenced_set.begin();
+		     it != table->referenced_set.end();
+		     ++it) {
+
+			foreign = *it;
+
+			const bool	ref_ok = drop_db
+				&& dict_tables_have_same_db(
+					name,
+					foreign->foreign_table_name_lookup);
+
+			if (foreign->foreign_table != table && !ref_ok) {
+
+				FILE*	ef = dict_foreign_err_file;
+
+				/* We only allow dropping a referenced table
+				if FOREIGN_KEY_CHECKS is set to 0 */
+
+				err = DB_CANNOT_DROP_CONSTRAINT;
+
+				mutex_enter(&dict_foreign_err_mutex);
+				rewind(ef);
+				ut_print_timestamp(ef);
+
+				fputs("  Cannot drop table ", ef);
+				ut_print_name(ef, trx, TRUE, name);
+				fputs("\n"
+				      "because it is referenced by ", ef);
+				ut_print_name(ef, trx, TRUE,
+					      foreign->foreign_table_name);
+				putc('\n', ef);
+				mutex_exit(&dict_foreign_err_mutex);
+
+				goto funct_exit;
+			}
+		}
+	}
+
+	/* TODO: could we replace the counter n_foreign_key_checks_running
+	with lock checks on the table? Acquire here an exclusive lock on the
+	table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
+	they can cope with the table having been dropped here? Foreign key
+	checks take an IS or IX lock on the table. */
+
+	if (table->n_foreign_key_checks_running > 0) {
+
+		const char*	save_tablename = table->name;
+		ibool		added;
+
+		added = row_add_table_to_background_drop_list(save_tablename);
+
+		if (added) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: You are trying to drop table ",
+			      stderr);
+			ut_print_name(stderr, trx, TRUE, save_tablename);
+			fputs("\n"
+			      "InnoDB: though there is a"
+			      " foreign key check running on it.\n"
+			      "InnoDB: Adding the table to"
+			      " the background drop queue.\n",
+			      stderr);
+
+			/* We return DB_SUCCESS to MySQL though the drop will
+			happen lazily later */
+
+			err = DB_SUCCESS;
+		} else {
+			/* The table is already in the background drop list */
+			err = DB_ERROR;
+		}
+
+		goto funct_exit;
+	}
+
+	/* Remove all locks that are on the table or its records. If there
+	are no references to the table but it has record locks, we release
+	the record locks unconditionally. One use case is:
+
+		CREATE TABLE t2 (PRIMARY KEY (a)) SELECT * FROM t1;
+
+	If, after the user transaction has done the SELECT, there is a
+	problem in completing the CREATE TABLE operation, MySQL will drop
+	the table. InnoDB will create a new background transaction to do the
+	actual drop, the trx instance that is passed to this function. To
+	preserve existing behaviour we remove the locks but ideally we
+	shouldn't have to. There should never be record locks on a table
+	that is going to be dropped.
*/ + + if (table->n_ref_count == 0) { + lock_remove_all_on_table(table, TRUE); + ut_a(table->n_rec_locks == 0); + } else if (table->n_ref_count > 0 || table->n_rec_locks > 0) { + ibool added; + + added = row_add_table_to_background_drop_list(table->name); + + if (added) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: MySQL is" + " trying to drop table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n" + "InnoDB: though there are still" + " open handles to it.\n" + "InnoDB: Adding the table to the" + " background drop queue.\n", + stderr); + + /* We return DB_SUCCESS to MySQL though the drop will + happen lazily later */ + err = DB_SUCCESS; + } else { + /* The table is already in the background drop list */ + err = DB_ERROR; + } + + goto funct_exit; + } + + /* The "to_be_dropped" marks table that is to be dropped, but + has not been dropped, instead, was put in the background drop + list due to being used by concurrent DML operations. Clear it + here since there are no longer any concurrent activities on it, + and it is free to be dropped */ + table->to_be_dropped = false; + + /* If we get this far then the table to be dropped must not have + any table or record locks on it. */ + + ut_a(!lock_table_has_locks(table)); + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->table_id = table->id; + case TRX_DICT_OP_TABLE: + break; + case TRX_DICT_OP_INDEX: + /* If the transaction was previously flagged as + TRX_DICT_OP_INDEX, we should be dropping auxiliary + tables for full-text indexes. */ + ut_ad(strstr(table->name, "/FTS_") != NULL); + } + + /* Mark all indexes unavailable in the data dictionary cache + before starting to drop the table. */ + + unsigned* page_no; + unsigned* page_nos; + heap = mem_heap_create( + 200 + UT_LIST_GET_LEN(table->indexes) * sizeof *page_nos); + tablename = mem_heap_strdup(heap, name); + + page_no = page_nos = static_cast<unsigned*>( + mem_heap_alloc( + heap, + UT_LIST_GET_LEN(table->indexes) * sizeof *page_no)); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_lock(dict_index_get_lock(index)); + /* Save the page numbers so that we can restore them + if the operation fails. */ + *page_no++ = index->page; + /* Mark the index unusable. */ + index->page = FIL_NULL; + rw_lock_x_unlock(dict_index_get_lock(index)); + } + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. 
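+	(Presumably via dict_drop_index_tree() in dict0crea.cc; this is
+	an assumption about the mechanism, which is not spelled out here.)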
*/ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "table_name", name); + + err = que_eval_sql(info, + "PROCEDURE DROP_TABLE_PROC () IS\n" + "sys_foreign_id CHAR;\n" + "table_id CHAR;\n" + "index_id CHAR;\n" + "foreign_id CHAR;\n" + "space_id INT;\n" + "found INT;\n" + + "DECLARE CURSOR cur_fk IS\n" + "SELECT ID FROM SYS_FOREIGN\n" + "WHERE FOR_NAME = :table_name\n" + "AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:table_name)\n" + "LOCK IN SHARE MODE;\n" + + "DECLARE CURSOR cur_idx IS\n" + "SELECT ID FROM SYS_INDEXES\n" + "WHERE TABLE_ID = table_id\n" + "LOCK IN SHARE MODE;\n" + + "BEGIN\n" + "SELECT ID INTO table_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " RETURN;\n" + "END IF;\n" + "SELECT SPACE INTO space_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name;\n" + "IF (SQL % NOTFOUND) THEN\n" + " RETURN;\n" + "END IF;\n" + "found := 1;\n" + "SELECT ID INTO sys_foreign_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = 'SYS_FOREIGN'\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (:table_name = 'SYS_FOREIGN') THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (:table_name = 'SYS_FOREIGN_COLS') THEN\n" + " found := 0;\n" + "END IF;\n" + "OPEN cur_fk;\n" + "WHILE found = 1 LOOP\n" + " FETCH cur_fk INTO foreign_id;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FOREIGN_COLS\n" + " WHERE ID = foreign_id;\n" + " DELETE FROM SYS_FOREIGN\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE cur_fk;\n" + "found := 1;\n" + "OPEN cur_idx;\n" + "WHILE found = 1 LOOP\n" + " FETCH cur_idx INTO index_id;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS\n" + " WHERE INDEX_ID = index_id;\n" + " DELETE FROM SYS_INDEXES\n" + " WHERE ID = index_id\n" + " AND TABLE_ID = table_id;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE cur_idx;\n" + "DELETE FROM SYS_TABLESPACES\n" + "WHERE SPACE = space_id;\n" + "DELETE FROM SYS_DATAFILES\n" + "WHERE SPACE = space_id;\n" + "DELETE FROM SYS_COLUMNS\n" + "WHERE TABLE_ID = table_id;\n" + "DELETE FROM SYS_TABLES\n" + "WHERE NAME = :table_name;\n" + "END;\n" + , FALSE, trx); + + switch (err) { + ibool is_temp; + + case DB_SUCCESS: + /* Clone the name, in case it has been allocated + from table->heap, which will be freed by + dict_table_remove_from_cache(table) below. */ + space_id = table->space; + ibd_file_missing = table->ibd_file_missing; + + is_temp = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY); + + /* If there is a temp path then the temp flag is set. + However, during recovery, we might have a temp flag but + not know the temp path */ + ut_a(table->dir_path_of_temp_table == NULL || is_temp); + if (dict_table_is_discarded(table) + || table->ibd_file_missing) { + /* Do not attempt to drop known-to-be-missing + tablespaces. */ + space_id = 0; + } + + /* We do not allow temporary tables with a remote path. 
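+		(Presumably enforced when the table is created, hence the
+		assertion below.)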
*/
+		ut_a(!(is_temp && DICT_TF_HAS_DATA_DIR(table->flags)));
+
+		if (space_id && DICT_TF_HAS_DATA_DIR(table->flags)) {
+			dict_get_and_save_data_dir_path(table, true);
+			ut_a(table->data_dir_path);
+
+			filepath = os_file_make_remote_pathname(
+				table->data_dir_path, table->name, "ibd");
+		} else if (table->dir_path_of_temp_table) {
+			filepath = fil_make_ibd_name(
+				table->dir_path_of_temp_table, true);
+		} else {
+			filepath = fil_make_ibd_name(tablename, false);
+		}
+
+		if (dict_table_has_fts_index(table)
+		    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			ut_ad(table->n_ref_count == 0);
+			ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+			err = fts_drop_tables(trx, table);
+
+			if (err != DB_SUCCESS) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr," InnoDB: Error: (%s) not "
+					"able to remove ancillary FTS tables "
+					"for table ", ut_strerr(err));
+				ut_print_name(stderr, trx, TRUE, tablename);
+				fputs("\n", stderr);
+
+				goto funct_exit;
+			}
+		}
+
+		/* The table->fts flag can be set on the table for which
+		the cluster index is being rebuilt. Such a table might not
+		have the DICT_TF2_FTS flag set. So keep this out of the
+		dict_table_has_fts_index condition above */
+		if (table->fts) {
+			/* Need to set TABLE_DICT_LOCKED bit, since
+			fts_que_graph_free_check_lock would try to acquire
+			dict mutex lock */
+			table->fts->fts_status |= TABLE_DICT_LOCKED;
+
+			fts_free(table);
+		}
+
+		dict_table_remove_from_cache(table);
+
+		if (dict_load_table(tablename, TRUE,
+				    DICT_ERR_IGNORE_NONE) != NULL) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: not able to remove table ",
+			      stderr);
+			ut_print_name(stderr, trx, TRUE, tablename);
+			fputs(" from the dictionary cache!\n", stderr);
+			err = DB_ERROR;
+		}
+
+		/* Do not drop possible .ibd tablespace if something went
+		wrong: we do not want to delete the user's valuable data */
+
+		/* Don't spam the log if we can't find the tablespace of
+		a temp table or if the tablespace has been discarded. */
+		print_msg = !(is_temp || ibd_file_missing);
+
+		if (err == DB_SUCCESS && space_id > TRX_SYS_SPACE) {
+			if (!is_temp
+			    && !fil_space_for_table_exists_in_mem(
+					space_id, tablename, FALSE,
+					print_msg, false, NULL, 0)) {
+				/* This might happen if we are dropping a
+				discarded tablespace */
+				err = DB_SUCCESS;
+
+				if (print_msg) {
+					char msg_tablename[MAX_FULL_NAME_LEN + 1];
+
+					innobase_format_name(
+						msg_tablename, sizeof(msg_tablename),
+						tablename, FALSE);
+
+					ib_logf(IB_LOG_LEVEL_INFO,
+						"Removed the table %s from "
+						"InnoDB's data dictionary",
+						msg_tablename);
+				}
+
+				/* Force a delete of any discarded
+				or temporary files. */
+
+				fil_delete_file(filepath);
+
+			} else if (fil_delete_tablespace(
+					space_id,
+					BUF_REMOVE_FLUSH_NO_WRITE)
+				   != DB_SUCCESS) {
+				fprintf(stderr,
+					"InnoDB: We have now removed the"
+					" InnoDB internal data dictionary"
+					" entry\n"
+					"InnoDB: of table ");
+				ut_print_name(stderr, trx, TRUE, tablename);
+				fprintf(stderr, ".\n");
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: Error: not able to"
+					" delete tablespace %lu of table ",
+					(ulong) space_id);
+				ut_print_name(stderr, trx, TRUE, tablename);
+				fputs("!\n", stderr);
+				err = DB_ERROR;
+			}
+		}
+
+		break;
+
+	case DB_OUT_OF_FILE_SPACE:
+		err = DB_MUST_GET_MORE_FILE_SPACE;
+
+		row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+		/* raise error */
+		ut_error;
+		break;
+
+	case DB_TOO_MANY_CONCURRENT_TRXS:
+		/* Cannot even find a free slot for
+		the undo log. We can directly exit here
+		and return the DB_TOO_MANY_CONCURRENT_TRXS
+		error.
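+		(In the code as written, control then falls through to the
+		default branch below.)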
*/ + + default: + /* This is some error we do not expect. Print + the error number and rollback transaction */ + ut_print_timestamp(stderr); + + fprintf(stderr, "InnoDB: unknown error code %lu" + " while dropping table:", (ulong) err); + ut_print_name(stderr, trx, TRUE, tablename); + fprintf(stderr, ".\n"); + + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + trx->error_state = DB_SUCCESS; + + /* Mark all indexes available in the data dictionary + cache again. */ + + page_no = page_nos; + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_lock(dict_index_get_lock(index)); + ut_a(index->page == FIL_NULL); + index->page = *page_no++; + rw_lock_x_unlock(dict_index_get_lock(index)); + } + } + +funct_exit: + if (heap) { + mem_heap_free(heap); + } + if (filepath) { + mem_free(filepath); + } + + if (locked_dictionary) { + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + } + + trx->op_info = ""; + + srv_wake_master_thread(); + + DBUG_RETURN(err); +} + +/*********************************************************************//** +Drop all temporary tables during crash recovery. */ +UNIV_INTERN +void +row_mysql_drop_temp_tables(void) +/*============================*/ +{ + trx_t* trx; + btr_pcur_t pcur; + mtr_t mtr; + mem_heap_t* heap; + + trx = trx_allocate_for_background(); + trx->op_info = "dropping temporary tables"; + row_mysql_lock_data_dictionary(trx); + + heap = mem_heap_create(200); + + mtr_start(&mtr); + + btr_pcur_open_at_index_side( + true, + dict_table_get_first_index(dict_sys->sys_tables), + BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); + + for (;;) { + const rec_t* rec; + const byte* field; + ulint len; + const char* table_name; + dict_table_t* table; + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + break; + } + + /* The high order bit of N_COLS is set unless + ROW_FORMAT=REDUNDANT. */ + rec = btr_pcur_get_rec(&pcur); + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + if (len != 4 + || !(mach_read_from_4(field) & DICT_N_COLS_COMPACT)) { + continue; + } + + /* Older versions of InnoDB, which only supported tables + in ROW_FORMAT=REDUNDANT could write garbage to + SYS_TABLES.MIX_LEN, where we now store the is_temp flag. + Above, we assumed is_temp=0 if ROW_FORMAT=REDUNDANT. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + if (len != 4 + || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) { + continue; + } + + /* This is a temporary table. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + if (len == UNIV_SQL_NULL || len == 0) { + /* Corrupted SYS_TABLES.NAME */ + continue; + } + + table_name = mem_heap_strdupl(heap, (const char*) field, len); + + btr_pcur_store_position(&pcur, &mtr); + btr_pcur_commit_specify_mtr(&pcur, &mtr); + + table = dict_table_get_low(table_name); + + if (table) { + row_drop_table_for_mysql(table_name, trx, FALSE); + trx_commit_for_mysql(trx); + } + + mtr_start(&mtr); + btr_pcur_restore_position(BTR_SEARCH_LEAF, + &pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + row_mysql_unlock_data_dictionary(trx); + trx_free_for_background(trx); +} + +/*******************************************************************//** +Drop all foreign keys in a database, see Bug#18942. 
+Called at the end of row_drop_database_for_mysql(). +@return error code or DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +drop_all_foreign_keys_in_db( +/*========================*/ + const char* name, /*!< in: database name which ends to '/' */ + trx_t* trx) /*!< in: transaction handle */ +{ + pars_info_t* pinfo; + dberr_t err; + + ut_a(name[strlen(name) - 1] == '/'); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "dbname", name); + +/** true if for_name is not prefixed with dbname */ +#define TABLE_NOT_IN_THIS_DB \ +"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname" + + err = que_eval_sql(pinfo, + "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n" + "foreign_id CHAR;\n" + "for_name CHAR;\n" + "found INT;\n" + "DECLARE CURSOR cur IS\n" + "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n" + "WHERE FOR_NAME >= :dbname\n" + "LOCK IN SHARE MODE\n" + "ORDER BY FOR_NAME;\n" + "BEGIN\n" + "found := 1;\n" + "OPEN cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH cur INTO foreign_id, for_name;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n" + " found := 0;\n" + " ELSIF (1=1) THEN\n" + " DELETE FROM SYS_FOREIGN_COLS\n" + " WHERE ID = foreign_id;\n" + " DELETE FROM SYS_FOREIGN\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE cur;\n" + "COMMIT WORK;\n" + "END;\n", + FALSE, /* do not reserve dict mutex, + we are already holding it */ + trx); + + return(err); +} + +/*********************************************************************//** +Drops a database for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_drop_database_for_mysql( +/*========================*/ + const char* name, /*!< in: database name which ends to '/' */ + trx_t* trx) /*!< in: transaction handle */ +{ + dict_table_t* table; + char* table_name; + dberr_t err = DB_SUCCESS; + ulint namelen = strlen(name); + + ut_a(name != NULL); + ut_a(name[namelen - 1] == '/'); + + trx->op_info = "dropping database"; + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + trx_start_if_not_started_xa(trx); +loop: + row_mysql_lock_data_dictionary(trx); + + while ((table_name = dict_get_first_table_name_in_db(name))) { + ut_a(memcmp(table_name, name, namelen) == 0); + + table = dict_table_open_on_name( + table_name, TRUE, FALSE, static_cast<dict_err_ignore_t>( + DICT_ERR_IGNORE_INDEX_ROOT + | DICT_ERR_IGNORE_CORRUPT)); + + if (!table) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot load table %s from InnoDB internal " + "data dictionary during drop database", + table_name); + mem_free(table_name); + err = DB_TABLE_NOT_FOUND; + break; + + } + + if (!row_is_mysql_tmp_table_name(table->name)) { + /* There could be orphan temp tables left from + interrupted alter table. Leave them, and handle + the rest.*/ + if (table->can_be_evicted) { + ib_logf(IB_LOG_LEVEL_WARN, + "Orphan table encountered during " + "DROP DATABASE. This is possible if " + "'%s.frm' was lost.", table->name); + } + + if (table->ibd_file_missing) { + ib_logf(IB_LOG_LEVEL_WARN, + "Missing %s.ibd file for table %s.", + table->name, table->name); + } + } + + dict_table_close(table, TRUE, FALSE); + + /* The dict_table_t object must not be accessed before + dict_table_open() or after dict_table_close(). But this is OK + if we are holding, the dict_sys->mutex. 
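*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
The TABLE_NOT_IN_THIS_DB predicate used by
drop_all_foreign_keys_in_db() above, restated in plain C++: a
SYS_FOREIGN row belongs to the database being dropped iff FOR_NAME
starts with the "dbname/" prefix (the name argument must end in '/',
as the ut_a() asserts). The cursor starts at FOR_NAME >= :dbname and
stops at the first row that no longer carries the prefix. */
#include <cstring>

static bool
for_name_in_db(
	const char*	for_name,	/* e.g. "test/child" */
	const char*	dbname)		/* e.g. "test/", trailing '/' */
{
	return(strncmp(for_name, dbname, strlen(dbname)) == 0);
}
/* --- end of editor's sketch --- */

/*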
*/ + ut_ad(mutex_own(&dict_sys->mutex)); + + /* Wait until MySQL does not have any queries running on + the table */ + + if (table->n_ref_count > 0) { + row_mysql_unlock_data_dictionary(trx); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: MySQL is trying to" + " drop database ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: though there are still" + " open handles to table ", stderr); + ut_print_name(stderr, trx, TRUE, table_name); + fputs(".\n", stderr); + + os_thread_sleep(1000000); + + mem_free(table_name); + + goto loop; + } + + err = row_drop_table_for_mysql(table_name, trx, TRUE); + trx_commit_for_mysql(trx); + + if (err != DB_SUCCESS) { + fputs("InnoDB: DROP DATABASE ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fprintf(stderr, " failed with error (%s) for table ", + ut_strerr(err)); + ut_print_name(stderr, trx, TRUE, table_name); + putc('\n', stderr); + mem_free(table_name); + break; + } + + mem_free(table_name); + } + + if (err == DB_SUCCESS) { + /* after dropping all tables try to drop all leftover + foreign keys in case orphaned ones exist */ + err = drop_all_foreign_keys_in_db(name, trx); + + if (err != DB_SUCCESS) { + fputs("InnoDB: DROP DATABASE ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fprintf(stderr, " failed with error %d while " + "dropping all foreign keys", err); + } + } + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. +@return true if temporary table */ +UNIV_INTERN __attribute__((warn_unused_result)) +bool +row_is_mysql_tmp_table_name( +/*========================*/ + const char* name) /*!< in: table name in the form + 'database/tablename' */ +{ + return(strstr(name, "/#sql") != NULL); + /* return(strstr(name, "/@0023sql") != NULL); */ +} + +/****************************************************************//** +Delete a single constraint. +@return error code or DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_delete_constraint_low( +/*======================*/ + const char* id, /*!< in: constraint id */ + trx_t* trx) /*!< in: transaction handle */ +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", id); + + return(que_eval_sql(info, + "PROCEDURE DELETE_CONSTRAINT () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n" + "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n" + "END;\n" + , FALSE, trx)); +} + +/****************************************************************//** +Delete a single constraint. +@return error code or DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_delete_constraint( +/*==================*/ + const char* id, /*!< in: constraint id */ + const char* database_name, /*!< in: database name, with the + trailing '/' */ + mem_heap_t* heap, /*!< in: memory heap */ + trx_t* trx) /*!< in: transaction handle */ +{ + dberr_t err; + + /* New format constraints have ids <databasename>/<constraintname>. */ + err = row_delete_constraint_low( + mem_heap_strcat(heap, database_name, id), trx); + + if ((err == DB_SUCCESS) && !strchr(id, '/')) { + /* Old format < 4.0.18 constraints have constraint ids + NUMBER_NUMBER. 
We only try deleting them if the + constraint name does not contain a '/' character, otherwise + deleting a new format constraint named 'foo/bar' from + database 'baz' would remove constraint 'bar' from database + 'foo', if it existed. */ + + err = row_delete_constraint_low(id, trx); + } + + return(err); +} + +/*********************************************************************//** +Renames a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: whether to commit trx */ +{ + dict_table_t* table = NULL; + ibool dict_locked = FALSE; + dberr_t err = DB_ERROR; + mem_heap_t* heap = NULL; + const char** constraints_to_drop = NULL; + ulint n_constraints_to_drop = 0; + ibool old_is_tmp, new_is_tmp; + pars_info_t* info = NULL; + int retry; + + ut_a(old_name != NULL); + ut_a(new_name != NULL); + ut_ad(trx->state == TRX_STATE_ACTIVE); + + if (srv_created_new_raw || srv_force_recovery) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw" + " is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + if(srv_force_recovery) { + err = DB_READ_ONLY; + } + + goto funct_exit; + } else if (row_mysql_is_system_table(new_name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL" + " system table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be" + " of the MyISAM type!\n", + new_name); + + goto funct_exit; + } + + trx->op_info = "renaming table"; + + old_is_tmp = row_is_mysql_tmp_table_name(old_name); + new_is_tmp = row_is_mysql_tmp_table_name(new_name); + + dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH; + + table = dict_table_open_on_name(old_name, dict_locked, FALSE, + DICT_ERR_IGNORE_NONE); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is" + " trying to rename the table.\n" + "InnoDB: Have you copied the .frm file" + " of the table to the\n" + "InnoDB: MySQL database directory" + " from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + + } else if (table->ibd_file_missing + && !dict_table_is_discarded(table)) { + + err = DB_TABLE_NOT_FOUND; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Table %s does not have an .ibd file in the database " + "directory. See " REFMAN "innodb-troubleshooting.html", + old_name); + + goto funct_exit; + + } else if (new_is_tmp) { + /* MySQL is doing an ALTER TABLE command and it renames the + original table to a temporary table name. We want to preserve + the original foreign key constraint definitions despite the + name change. An exception is those constraints for which + the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/ + + heap = mem_heap_create(100); + + err = dict_foreign_parse_drop_constraints( + heap, trx, table, &n_constraints_to_drop, + &constraints_to_drop); + + if (err != DB_SUCCESS) { + goto funct_exit; + } + } + + /* Is a foreign key check running on this table? 
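*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
The two constraint-id layouts tried by row_delete_constraint() above,
shown with a hypothetical print-only helper. New-format ids are
"<database>/<constraint>"; pre-4.0.18 ids are bare NUMBER_NUMBER
strings and are only tried when the id itself contains no '/', for
the reason given in the comment above. */
#include <cstdio>
#include <cstring>

static void
delete_constraint_sketch(
	const char*	database_name,	/* with trailing '/' */
	const char*	id)
{
	char	full_id[512];

	snprintf(full_id, sizeof(full_id), "%s%s", database_name, id);
	printf("DELETE FROM SYS_FOREIGN WHERE ID = '%s';\n", full_id);

	if (strchr(id, '/') == NULL) {
		/* old format: no database prefix was ever stored */
		printf("DELETE FROM SYS_FOREIGN WHERE ID = '%s';\n", id);
	}
}
/* --- end of editor's sketch --- */

/*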
*/ + for (retry = 0; retry < 100 + && table->n_foreign_key_checks_running > 0; ++retry) { + row_mysql_unlock_data_dictionary(trx); + os_thread_yield(); + row_mysql_lock_data_dictionary(trx); + } + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: in ALTER TABLE ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fprintf(stderr, "\n" + "InnoDB: a FOREIGN KEY check is running.\n" + "InnoDB: Cannot rename table.\n"); + err = DB_TABLE_IN_FK_CHECK; + goto funct_exit; + } + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data from system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES" + " SET NAME = :new_table_name\n" + " WHERE NAME = :old_table_name;\n" + "END;\n" + , FALSE, trx); + + /* SYS_TABLESPACES and SYS_DATAFILES track non-system tablespaces + which have space IDs > 0. */ + if (err == DB_SUCCESS + && table->space != TRX_SYS_SPACE + && !table->ibd_file_missing) { + /* Make a new pathname to update SYS_DATAFILES. */ + char* new_path = row_make_new_pathname(table, new_name); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "new_path_name", new_path); + pars_info_add_int4_literal(info, "space_id", table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :new_table_name\n" + " WHERE SPACE = :space_id;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :new_path_name\n" + " WHERE SPACE = :space_id;\n" + "END;\n" + , FALSE, trx); + + mem_free(new_path); + } + if (err != DB_SUCCESS) { + goto end; + } + + if (!new_is_tmp) { + /* Rename all constraints. */ + char new_table_name[MAX_TABLE_NAME_LEN] = ""; + char old_table_utf8[MAX_TABLE_NAME_LEN] = ""; + uint errors = 0; + + strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN); + innobase_convert_to_system_charset( + strchr(old_table_utf8, '/') + 1, + strchr(old_name, '/') +1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted from charset + my_charset_filename to UTF-8. This means that the + table name is already in UTF-8 (#mysql#50). */ + strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN); + } + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + pars_info_add_str_literal(info, "old_table_name_utf8", + old_table_utf8); + + strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN); + innobase_convert_to_system_charset( + strchr(new_table_name, '/') + 1, + strchr(new_name, '/') +1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted from charset + my_charset_filename to UTF-8. This means that the + table name is already in UTF-8 (#mysql#50). 
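*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
What a row_make_new_pathname()-style helper has to produce for the
SYS_DATAFILES update in RENAME_SPACE above, assuming the conventional
layout <dir>/<db>/<table>.ibd: keep <dir>, replace the "db/table" tail
with the new name. Error handling for malformed paths is omitted. */
#include <string>

static std::string
new_pathname_sketch(
	const std::string&	old_path,	/* ".../olddb/oldtab.ibd" */
	const std::string&	new_name)	/* "newdb/newtab" */
{
	/* back up over the "<db>/<table>.ibd" tail: the directory part
	ends at the second-to-last '/' (layout assumed well-formed) */
	std::string::size_type	last = old_path.rfind('/');
	std::string::size_type	tail = old_path.rfind('/', last - 1);

	return(old_path.substr(0, tail + 1) + new_name + ".ibd");
}
/* --- end of editor's sketch --- */

/*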
*/ + strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN); + } + + pars_info_add_str_literal(info, "new_table_utf8", new_table_name); + + err = que_eval_sql( + info, + "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n" + "gen_constr_prefix CHAR;\n" + "new_db_name CHAR;\n" + "foreign_id CHAR;\n" + "new_foreign_id CHAR;\n" + "old_db_name_len INT;\n" + "old_t_name_len INT;\n" + "new_db_name_len INT;\n" + "id_len INT;\n" + "offset INT;\n" + "found INT;\n" + "BEGIN\n" + "found := 1;\n" + "old_db_name_len := INSTR(:old_table_name, '/')-1;\n" + "new_db_name_len := INSTR(:new_table_name, '/')-1;\n" + "new_db_name := SUBSTR(:new_table_name, 0,\n" + " new_db_name_len);\n" + "old_t_name_len := LENGTH(:old_table_name);\n" + "gen_constr_prefix := CONCAT(:old_table_name_utf8,\n" + " '_ibfk_');\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = :old_table_name\n" + " AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:old_table_name)\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " UPDATE SYS_FOREIGN\n" + " SET FOR_NAME = :new_table_name\n" + " WHERE ID = foreign_id;\n" + " id_len := LENGTH(foreign_id);\n" + " IF (INSTR(foreign_id, '/') > 0) THEN\n" + " IF (INSTR(foreign_id,\n" + " gen_constr_prefix) > 0)\n" + " THEN\n" + " offset := INSTR(foreign_id, '_ibfk_') - 1;\n" + " new_foreign_id :=\n" + " CONCAT(:new_table_utf8,\n" + " SUBSTR(foreign_id, offset,\n" + " id_len - offset));\n" + " ELSE\n" + " new_foreign_id :=\n" + " CONCAT(new_db_name,\n" + " SUBSTR(foreign_id,\n" + " old_db_name_len,\n" + " id_len - old_db_name_len));\n" + " END IF;\n" + " UPDATE SYS_FOREIGN\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " UPDATE SYS_FOREIGN_COLS\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n" + "WHERE REF_NAME = :old_table_name\n" + " AND TO_BINARY(REF_NAME)\n" + " = TO_BINARY(:old_table_name);\n" + "END;\n" + , FALSE, trx); + + } else if (n_constraints_to_drop > 0) { + /* Drop some constraints of tmp tables. */ + + ulint db_name_len = dict_get_db_name_len(old_name) + 1; + char* db_name = mem_heap_strdupl(heap, old_name, + db_name_len); + ulint i; + + for (i = 0; i < n_constraints_to_drop; i++) { + err = row_delete_constraint(constraints_to_drop[i], + db_name, heap, trx); + + if (err != DB_SUCCESS) { + break; + } + } + } + + if (dict_table_has_fts_index(table) + && !dict_tables_have_same_db(old_name, new_name)) { + err = fts_rename_aux_tables(table, new_name, trx); + + if (err != DB_SUCCESS && (table->space != 0)) { + char* orig_name = table->name; + trx_t* trx_bg = trx_allocate_for_background(); + + /* If the first fts_rename fails, the trx would + be rolled back and committed, we can't use it any more, + so we have to start a new background trx here. */ + ut_a(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + trx_bg->op_info = "Revert the failing rename " + "for fts aux tables"; + trx_bg->dict_operation_lock_mode = RW_X_LATCH; + trx_start_for_ddl(trx_bg, TRX_DICT_OP_TABLE); + + /* If rename fails and table has its own tablespace, + we need to call fts_rename_aux_tables again to + revert the ibd file rename, which is not under the + control of trx. Also notice the parent table name + in cache is not changed yet. If the reverting fails, + the ibd data may be left in the new database, which + can be fixed only manually. 
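*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
The id rewrite performed by RENAME_CONSTRAINT_IDS above, restated in
C++ under the assumption that both names contain a '/' separator.
Generated names keep their "_ibfk_N" suffix but follow the table to
its new name; explicitly named constraints only have the database
prefix replaced. */
#include <string>

static std::string
rename_foreign_id_sketch(
	const std::string&	foreign_id,	/* "olddb/x_ibfk_3" */
	const std::string&	new_table)	/* "newdb/y" */
{
	std::string::size_type	pos = foreign_id.find("_ibfk_");

	if (pos != std::string::npos) {
		/* generated name: splice the suffix onto the new name */
		return(new_table + foreign_id.substr(pos));
	}

	/* user-supplied name: replace only the database component */
	std::string::size_type	slash = foreign_id.find('/');
	std::string::size_type	new_slash = new_table.find('/');

	return(new_table.substr(0, new_slash + 1)
	       + foreign_id.substr(slash + 1));
}
/* --- end of editor's sketch --- */

/*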
*/ + table->name = const_cast<char*>(new_name); + fts_rename_aux_tables(table, old_name, trx_bg); + table->name = orig_name; + + trx_bg->dict_operation_lock_mode = 0; + trx_commit_for_mysql(trx_bg); + trx_free_for_background(trx_bg); + } + } + +end: + if (err != DB_SUCCESS) { + if (err == DB_DUPLICATE_KEY) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error; possible reasons:\n" + "InnoDB: 1) Table rename would cause" + " two FOREIGN KEY constraints\n" + "InnoDB: to have the same internal name" + " in case-insensitive comparison.\n" + "InnoDB: 2) table ", stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs(" exists in the InnoDB internal data\n" + "InnoDB: dictionary though MySQL is" + " trying to rename table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" to it.\n" + "InnoDB: Have you deleted the .frm file" + " and not used DROP TABLE?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " REFMAN "innodb-troubleshooting.html\n" + "InnoDB: If table ", stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs(" is a temporary table #sql..., then" + " it can be that\n" + "InnoDB: there are still queries running" + " on the table, and it will be\n" + "InnoDB: dropped automatically when" + " the queries end.\n" + "InnoDB: You can drop the orphaned table" + " inside InnoDB by\n" + "InnoDB: creating an InnoDB table with" + " the same name in another\n" + "InnoDB: database and copying the .frm file" + " to the current database.\n" + "InnoDB: Then MySQL thinks the table exists," + " and DROP TABLE will\n" + "InnoDB: succeed.\n", stderr); + } + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + trx->error_state = DB_SUCCESS; + } else { + /* The following call will also rename the .ibd data file if + the table is stored in a single-table tablespace */ + + err = dict_table_rename_in_cache( + table, new_name, !new_is_tmp); + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + trx->error_state = DB_SUCCESS; + goto funct_exit; + } + + /* We only want to switch off some of the type checking in + an ALTER, not in a RENAME. 
*/ + + err = dict_load_foreigns( + new_name, NULL, + false, !old_is_tmp || trx->check_foreigns, + DICT_ERR_IGNORE_NONE); + + if (err != DB_SUCCESS) { + ut_print_timestamp(stderr); + + if (old_is_tmp) { + fputs(" InnoDB: Error: in ALTER TABLE ", + stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs("\n" + "InnoDB: has or is referenced" + " in foreign key constraints\n" + "InnoDB: which are not compatible" + " with the new table definition.\n", + stderr); + } else { + fputs(" InnoDB: Error: in RENAME TABLE" + " table ", + stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs("\n" + "InnoDB: is referenced in" + " foreign key constraints\n" + "InnoDB: which are not compatible" + " with the new table definition.\n", + stderr); + } + + ut_a(DB_SUCCESS == dict_table_rename_in_cache( + table, old_name, FALSE)); + trx->error_state = DB_SUCCESS; + trx_rollback_to_savepoint(trx, NULL); + trx->error_state = DB_SUCCESS; + } + } + +funct_exit: + if (table != NULL) { + dict_table_close(table, dict_locked, FALSE); + } + + if (commit) { + trx_commit_for_mysql(trx); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Checks that the index contains entries in an ascending order, unique +constraint is not broken, and calculates the number of index entries +in the read view of the current transaction. +@return true if ok */ +UNIV_INTERN +bool +row_check_index_for_mysql( +/*======================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct + in MySQL handle */ + const dict_index_t* index, /*!< in: index */ + ulint* n_rows) /*!< out: number of entries + seen in the consistent read */ +{ + dtuple_t* prev_entry = NULL; + ulint matched_fields; + ulint matched_bytes; + byte* buf; + ulint ret; + rec_t* rec; + bool is_ok = true; + int cmp; + ibool contains_null; + ulint i; + ulint cnt; + mem_heap_t* heap = NULL; + ulint n_ext; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + rec_offs_init(offsets_); + + *n_rows = 0; + + if (dict_index_is_clust(index)) { + /* The clustered index of a table is always available. + During online ALTER TABLE that rebuilds the table, the + clustered index in the old table will have + index->online_log pointing to the new table. All + indexes of the old table will remain valid and the new + table will be unaccessible to MySQL until the + completion of the ALTER TABLE. */ + } else if (dict_index_is_online_ddl(index) + || (index->type & DICT_FTS)) { + /* Full Text index are implemented by auxiliary tables, + not the B-tree. We also skip secondary indexes that are + being created online. */ + return(true); + } + + buf = static_cast<byte*>(mem_alloc(UNIV_PAGE_SIZE)); + heap = mem_heap_create(100); + + cnt = 1000; + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0); +loop: + /* Check thd->killed every 1,000 scanned rows */ + if (--cnt == 0) { + if (trx_is_interrupted(prebuilt->trx)) { + goto func_exit; + } + cnt = 1000; + } + + switch (ret) { + case DB_SUCCESS: + break; + default: + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: CHECK TABLE on ", stderr); + dict_index_name_print(stderr, prebuilt->trx, index); + fprintf(stderr, " returned %lu\n", ret); + /* fall through (this error is ignored by CHECK TABLE) */ + case DB_END_OF_INDEX: +func_exit: + mem_free(buf); + mem_heap_free(heap); + + return(is_ok); + } + + *n_rows = *n_rows + 1; + + /* row_search... 
returns the index record in buf, record origin offset + within buf stored in the first 4 bytes, because we have built a dummy + template */ + + rec = buf + mach_read_from_4(buf); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + if (prev_entry != NULL) { + matched_fields = 0; + matched_bytes = 0; + + cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets, + &matched_fields, + &matched_bytes); + contains_null = FALSE; + + /* In a unique secondary index we allow equal key values if + they contain SQL NULLs */ + + for (i = 0; + i < dict_index_get_n_ordering_defined_by_user(index); + i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(prev_entry, i))) { + + contains_null = TRUE; + break; + } + } + + if (cmp > 0) { + fputs("InnoDB: index records in a wrong order in ", + stderr); +not_ok: + dict_index_name_print(stderr, + prebuilt->trx, index); + fputs("\n" + "InnoDB: prev record ", stderr); + dtuple_print(stderr, prev_entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + is_ok = false; + } else if (dict_index_is_unique(index) + && !contains_null + && matched_fields + >= dict_index_get_n_ordering_defined_by_user( + index)) { + + fputs("InnoDB: duplicate key in ", stderr); + goto not_ok; + } + } + + { + mem_heap_t* tmp_heap = NULL; + + /* Empty the heap on each round. But preserve offsets[] + for the row_rec_to_index_entry() call, by copying them + into a separate memory heap when needed. */ + if (UNIV_UNLIKELY(offsets != offsets_)) { + ulint size = rec_offs_get_n_alloc(offsets) + * sizeof *offsets; + + tmp_heap = mem_heap_create(size); + + offsets = static_cast<ulint*>( + mem_heap_dup(tmp_heap, offsets, size)); + } + + mem_heap_empty(heap); + + prev_entry = row_rec_to_index_entry( + rec, index, offsets, &n_ext, heap); + + if (UNIV_LIKELY_NULL(tmp_heap)) { + mem_heap_free(tmp_heap); + } + } + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT); + + goto loop; +} + +/*********************************************************************//** +Determines if a table is a magic monitor table. 
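*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
The per-row verdict of the CHECK TABLE scan in
row_check_index_for_mysql() above, reduced to its two conditions.
cmp is the comparison of the previous entry against the current
record; matched_fields counts the leading key fields that compared
equal; n_user_fields is the number of user-defined ordering fields.
SQL NULLs compare as distinct for uniqueness, which is why
contains_null suppresses the duplicate verdict. */
static bool
check_row_ok_sketch(
	int		cmp,		/* prev entry vs current rec */
	bool		is_unique,	/* unique secondary index? */
	bool		contains_null,	/* any SQL NULL in prev key? */
	unsigned	matched_fields,
	unsigned	n_user_fields)
{
	if (cmp > 0) {
		return(false);	/* index records out of order */
	}

	if (is_unique && !contains_null
	    && matched_fields >= n_user_fields) {
		return(false);	/* duplicate key */
	}

	return(true);
}
/* --- end of editor's sketch --- */

/*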
+@return true if monitor table */ +UNIV_INTERN +bool +row_is_magic_monitor_table( +/*=======================*/ + const char* table_name) /*!< in: name of the table, in the + form database/table_name */ +{ + const char* name; /* table_name without database/ */ + ulint len; + + name = dict_remove_db_name(table_name); + len = strlen(name) + 1; + + return(STR_EQ(name, len, S_innodb_monitor) + || STR_EQ(name, len, S_innodb_lock_monitor) + || STR_EQ(name, len, S_innodb_tablespace_monitor) + || STR_EQ(name, len, S_innodb_table_monitor) +#ifdef UNIV_MEM_DEBUG + || STR_EQ(name, len, S_innodb_mem_validate) +#endif /* UNIV_MEM_DEBUG */ + ); +} + +/*********************************************************************//** +Initialize this module */ +UNIV_INTERN +void +row_mysql_init(void) +/*================*/ +{ + mutex_create( + row_drop_list_mutex_key, + &row_drop_list_mutex, SYNC_NO_ORDER_CHECK); + + UT_LIST_INIT(row_mysql_drop_list); + + row_mysql_drop_list_inited = TRUE; +} + +/*********************************************************************//** +Close this module */ +UNIV_INTERN +void +row_mysql_close(void) +/*================*/ +{ + ut_a(UT_LIST_GET_LEN(row_mysql_drop_list) == 0); + + mutex_free(&row_drop_list_mutex); + + row_mysql_drop_list_inited = FALSE; +} diff --git a/storage/xtradb/row/row0purge.cc b/storage/xtradb/row/row0purge.cc new file mode 100644 index 00000000000..8212a7b43e0 --- /dev/null +++ b/storage/xtradb/row/row0purge.cc @@ -0,0 +1,988 @@ +/***************************************************************************** + +Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0purge.cc +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#include "row0purge.h" + +#ifdef UNIV_NONINL +#include "row0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0vers.h" +#include "row0mysql.h" +#include "row0log.h" +#include "log0log.h" +#include "srv0mon.h" +#include "srv0start.h" + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. 
+If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/********************************************************************//** +Creates a purge node to a query graph. +@return own: purge node */ +UNIV_INTERN +purge_node_t* +row_purge_node_create( +/*==================*/ + que_thr_t* parent, /*!< in: parent node */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + purge_node_t* node; + + ut_ad(parent && heap); + + node = static_cast<purge_node_t*>( + mem_heap_zalloc(heap, sizeof(*node))); + + node->common.type = QUE_NODE_PURGE; + node->common.parent = parent; + node->done = TRUE; + node->heap = mem_heap_create(256); + + return(node); +} + +/***********************************************************//** +Repositions the pcur in the purge node on the clustered index record, +if found. +@return TRUE if the record was found */ +static +ibool +row_purge_reposition_pcur( +/*======================*/ + ulint mode, /*!< in: latching mode */ + purge_node_t* node, /*!< in: row purge node */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (node->found_clust) { + ibool found; + + found = btr_pcur_restore_position(mode, &node->pcur, mtr); + + return(found); + } else { + node->found_clust = row_search_on_row_ref( + &node->pcur, mode, node->table, node->ref, mtr); + + if (node->found_clust) { + btr_pcur_store_position(&node->pcur, mtr); + } + } + + return(node->found_clust); +} + +/***********************************************************//** +Removes a delete marked clustered index record if possible. +@retval true if the row was not found, or it was successfully removed +@retval false if the row was modified after the delete marking */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_purge_remove_clust_if_poss_low( +/*===============================*/ + purge_node_t* node, /*!< in/out: row purge node */ + ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + dict_index_t* index; + bool success = true; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint* offsets; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + index = dict_table_get_first_index(node->table); + + log_free_check(); + mtr_start(&mtr); + + if (!row_purge_reposition_pcur(mode, node, &mtr)) { + /* The record was already removed. */ + goto func_exit; + } + + rec = btr_pcur_get_rec(&node->pcur); + + offsets = rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap); + + if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) { + /* Someone else has modified the record later: do not remove */ + goto func_exit; + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete( + btr_pcur_get_btr_cur(&node->pcur), 0, &mtr); + } else { + dberr_t err; + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0, + RB_NONE, &mtr); + + switch (err) { + case DB_SUCCESS: + break; + case DB_OUT_OF_FILE_SPACE: + success = false; + break; + default: + ut_error; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + + return(success); +} + +/***********************************************************//** +Removes a clustered index record if it has not been modified after the delete +marking. 
+@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended because of running out +of file space. */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_purge_remove_clust_if_poss( +/*===========================*/ + purge_node_t* node) /*!< in/out: row purge node */ +{ + if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) { + return(true); + } + + for (ulint n_tries = 0; + n_tries < BTR_CUR_RETRY_DELETE_N_TIMES; + n_tries++) { + if (row_purge_remove_clust_if_poss_low( + node, BTR_MODIFY_TREE)) { + return(true); + } + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + } + + return(false); +} + +/***********************************************************//** +Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not delete marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. + +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch. +@return true if the secondary index record can be purged */ +UNIV_INTERN +bool +row_purge_poss_sec( +/*===============*/ + purge_node_t* node, /*!< in/out: row purge node */ + dict_index_t* index, /*!< in: secondary index */ + const dtuple_t* entry) /*!< in: secondary index entry */ +{ + bool can_delete; + mtr_t mtr; + + ut_ad(!dict_index_is_clust(index)); + mtr_start(&mtr); + + can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr) + || !row_vers_old_has_index_entry(TRUE, + btr_pcur_get_rec(&node->pcur), + &mtr, index, entry); + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + + return(can_delete); +} + +/*************************************************************** +Removes a secondary index entry if possible, by modifying the +index tree. Does not try to buffer the delete. +@return TRUE if success or if not found */ +static __attribute__((nonnull, warn_unused_result)) +ibool +row_purge_remove_sec_if_poss_tree( +/*==============================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success = TRUE; + dberr_t err; + mtr_t mtr; + enum row_search_result search_result; + + log_free_check(); + mtr_start(&mtr); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + mtr_x_lock(dict_index_get_lock(index), &mtr); + + if (dict_index_is_online_ddl(index)) { + /* Online secondary index creation will not + copy any delete-marked records. Therefore + there is nothing to be purged. We must also + skip the purge when a completed index is + dropped by rollback_inplace_alter_table(). */ + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. 
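*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
The leaf-then-tree retry pattern used by
row_purge_remove_clust_if_poss() above: attempt the cheap optimistic
(leaf-only) delete once, then fall back to the pessimistic tree
operation with a bounded number of retries. try_remove is a stand-in
for the two row_purge_remove_*_low() modes; the retry count and sleep
mirror BTR_CUR_RETRY_DELETE_N_TIMES and BTR_CUR_RETRY_SLEEP_TIME. */
static bool
remove_with_retries_sketch(
	bool	(*try_remove)(void* ctx, bool pessimistic),
	void*	ctx,
	unsigned n_retries,
	void	(*sleep_between)(void))
{
	if (try_remove(ctx, false)) {		/* BTR_MODIFY_LEAF */
		return(true);
	}

	for (unsigned i = 0; i < n_retries; i++) {
		if (try_remove(ctx, true)) {	/* BTR_MODIFY_TREE */
			return(true);
		}

		sleep_between();	/* let file space be freed */
	}

	return(false);	/* purge must be suspended */
}
/* --- end of editor's sketch --- */

/*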
*/ + ut_ad(!dict_index_is_online_ddl(index)); + } + + search_result = row_search_index_entry(index, entry, BTR_MODIFY_TREE, + &pcur, &mtr); + + switch (search_result) { + case ROW_NOT_FOUND: + /* Not found. This is a legitimate condition. In a + rollback, InnoDB will remove secondary recs that would + be purged anyway. Then the actual purge will not find + the secondary index record. Also, the purge itself is + eager: if it comes to consider a secondary index + record, and notices it does not need to exist in the + index, it will remove it. Then if/when the purge + comes to consider the secondary index record a second + time, it will not exist any more in the index. */ + + /* fputs("PURGE:........sec entry not found\n", stderr); */ + /* dtuple_print(stderr, entry); */ + goto func_exit; + case ROW_FOUND: + break; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + } + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* We should remove the index record if no later version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should do nothing. */ + + if (row_purge_poss_sec(node, index, entry)) { + /* Remove the index record, which should have been + marked for deletion. */ + if (!rec_get_deleted_flag(btr_cur_get_rec(btr_cur), + dict_table_is_comp(index->table))) { + fputs("InnoDB: tried to purge sec index entry not" + " marked for deletion in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, NULL, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_cur_get_rec(btr_cur), index); + putc('\n', stderr); + + ut_ad(0); + + goto func_exit; + } + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + RB_NONE, &mtr); + switch (UNIV_EXPECT(err, DB_SUCCESS)) { + case DB_SUCCESS: + break; + case DB_OUT_OF_FILE_SPACE: + success = FALSE; + break; + default: + ut_error; + } + } + +func_exit: + btr_pcur_close(&pcur); +func_exit_no_pcur: + mtr_commit(&mtr); + + return(success); +} + +/*************************************************************** +Removes a secondary index entry without modifying the index tree, +if possible. +@retval true if success or if not found +@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_purge_remove_sec_if_poss_leaf( +/*==============================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + mtr_t mtr; + btr_pcur_t pcur; + ulint mode; + enum row_search_result search_result; + bool success = true; + + log_free_check(); + + mtr_start(&mtr); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + mtr_s_lock(dict_index_get_lock(index), &mtr); + + if (dict_index_is_online_ddl(index)) { + /* Online secondary index creation will not + copy any delete-marked records. Therefore + there is nothing to be purged. We must also + skip the purge when a completed index is + dropped by rollback_inplace_alter_table(). 
*/ + goto func_exit_no_pcur; + } + + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED | BTR_DELETE; + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + + mode = BTR_MODIFY_LEAF | BTR_DELETE; + } + + /* Set the purge node for the call to row_purge_poss_sec(). */ + pcur.btr_cur.purge_node = node; + /* Set the query thread, so that ibuf_insert_low() will be + able to invoke thd_get_trx(). */ + pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node)); + + search_result = row_search_index_entry( + index, entry, mode, &pcur, &mtr); + + switch (search_result) { + case ROW_FOUND: + /* Before attempting to purge a record, check + if it is safe to do so. */ + if (row_purge_poss_sec(node, index, entry)) { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* Only delete-marked records should be purged. */ + if (!rec_get_deleted_flag( + btr_cur_get_rec(btr_cur), + dict_table_is_comp(index->table))) { + + fputs("InnoDB: tried to purge sec index" + " entry not marked for deletion in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, NULL, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_cur_get_rec(btr_cur), + index); + putc('\n', stderr); + + ut_ad(0); + + btr_pcur_close(&pcur); + + goto func_exit_no_pcur; + } + + if (!btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { + + /* The index entry could not be deleted. */ + success = false; + } + } + /* fall through (the index entry is still needed, + or the deletion succeeded) */ + case ROW_NOT_DELETED_REF: + /* The index entry is still needed. */ + case ROW_BUFFERED: + /* The deletion was buffered. */ + case ROW_NOT_FOUND: + /* The index entry does not exist, nothing to do. */ + btr_pcur_close(&pcur); + func_exit_no_pcur: + mtr_commit(&mtr); + return(success); + } + + ut_error; + return(FALSE); +} + +/***********************************************************//** +Removes a secondary index entry if possible. */ +UNIV_INLINE __attribute__((nonnull(1,2))) +void +row_purge_remove_sec_if_poss( +/*=========================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + ibool success; + ulint n_tries = 0; + + /* fputs("Purge: Removing secondary record\n", stderr); */ + + if (!entry) { + /* The node->row must have lacked some fields of this + index. This is possible when the undo log record was + written before this index was created. */ + return; + } + + if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) { + + return; + } +retry: + success = row_purge_remove_sec_if_poss_tree(node, index, entry); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/***********************************************************//** +Purges a delete marking of a record. 
+@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended because of +running out of file space */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_purge_del_mark( +/*===============*/ + purge_node_t* node) /*!< in/out: row purge node */ +{ + mem_heap_t* heap; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + /* skip corrupted secondary index */ + dict_table_skip_corrupt_index(node->index); + + if (!node->index) { + break; + } + + if (node->index->type != DICT_FTS) { + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, heap); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(row_purge_remove_clust_if_poss(node)); +} + +/***********************************************************//** +Purges an update of an existing record. Also purges an update of a delete +marked record if that record contained an externally stored field. */ +static +void +row_purge_upd_exist_or_extern_func( +/*===============================*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + purge_node_t* node, /*!< in: row purge node */ + trx_undo_rec_t* undo_rec) /*!< in: record to purge */ +{ + mem_heap_t* heap; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC + || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + + goto skip_secondaries; + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + dict_table_skip_corrupt_index(node->index); + + if (!node->index) { + break; + } + + if (row_upd_changes_ord_field_binary(node->index, node->update, + thr, NULL, NULL)) { + /* Build the older version of the index entry */ + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, heap); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + +skip_secondaries: + /* Free possible externally stored fields */ + for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { + + const upd_field_t* ufield + = upd_get_nth_field(node->update, i); + + if (dfield_is_ext(&ufield->new_val)) { + trx_rseg_t* rseg; + buf_block_t* block; + ulint internal_offset; + byte* data_field; + dict_index_t* index; + ibool is_insert; + ulint rseg_id; + ulint page_no; + ulint offset; + mtr_t mtr; + + /* We use the fact that new_val points to + undo_rec and get thus the offset of + dfield data inside the undo record. Then we + can calculate from node->roll_ptr the file + address of the new_val data */ + + internal_offset + = ((const byte*) + dfield_get_data(&ufield->new_val)) + - undo_rec; + + ut_a(internal_offset < UNIV_PAGE_SIZE); + + trx_undo_decode_roll_ptr(node->roll_ptr, + &is_insert, &rseg_id, + &page_no, &offset); + + rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id); + ut_a(rseg != NULL); + ut_a(rseg->id == rseg_id); + + mtr_start(&mtr); + + /* We have to acquire an X-latch to the clustered + index tree */ + + index = dict_table_get_first_index(node->table); + mtr_x_lock(dict_index_get_lock(index), &mtr); + + /* NOTE: we must also acquire an X-latch to the + root page of the tree. We will need it when we + free pages from the tree. 
If the tree is of height 1, + the tree X-latch does NOT protect the root page, + because it is also a leaf page. Since we will have a + latch on an undo log page, we would break the + latching order if we would only later latch the + root page of such a tree! */ + + btr_root_get(index, &mtr); + + block = buf_page_get( + rseg->space, 0, page_no, RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + data_field = buf_block_get_frame(block) + + offset + internal_offset; + + ut_a(dfield_get_len(&ufield->new_val) + >= BTR_EXTERN_FIELD_REF_SIZE); + btr_free_externally_stored_field( + index, + data_field + dfield_get_len(&ufield->new_val) + - BTR_EXTERN_FIELD_REF_SIZE, + NULL, NULL, NULL, 0, RB_NONE, &mtr); + mtr_commit(&mtr); + } + } +} + +#ifdef UNIV_DEBUG +# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \ + row_purge_upd_exist_or_extern_func(thr,node,undo_rec) +#else /* UNIV_DEBUG */ +# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \ + row_purge_upd_exist_or_extern_func(node,undo_rec) +#endif /* UNIV_DEBUG */ + +/***********************************************************//** +Parses the row reference and other info in a modify undo log record. +@return true if purge operation required */ +static +bool +row_purge_parse_undo_rec( +/*=====================*/ + purge_node_t* node, /*!< in: row undo node */ + trx_undo_rec_t* undo_rec, /*!< in: record to purge */ + bool* updated_extern, /*!< out: true if an externally + stored field was updated */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + trx_t* trx; + undo_no_t undo_no; + table_id_t table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint info_bits; + ulint type; + + ut_ad(node && thr); + + ptr = trx_undo_rec_get_pars( + undo_rec, &type, &node->cmpl_info, + updated_extern, &undo_no, &table_id); + + node->rec_type = type; + + if (type == TRX_UNDO_UPD_DEL_REC && !*updated_extern) { + + return(false); + } + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + node->table = NULL; + + /* Prevent DROP TABLE etc. from running when we are doing the purge + for this row */ + + rw_lock_s_lock_inline(&dict_operation_lock, 0, __FILE__, __LINE__); + + node->table = dict_table_open_on_id( + table_id, FALSE, DICT_TABLE_OP_NORMAL); + + if (node->table == NULL) { + /* The table has been dropped: no need to do purge */ + goto err_exit; + } + + if (node->table->ibd_file_missing) { + /* We skip purge of missing .ibd files */ + + dict_table_close(node->table, FALSE, FALSE); + + node->table = NULL; + + goto err_exit; + } + + clust_index = dict_table_get_first_index(node->table); + + if (clust_index == NULL) { + /* The table was corrupt in the data dictionary. + dict_set_corrupted() works on an index, and + we do not have an index to call it with. 
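*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
The debug-parameter pattern used by row_purge_upd_exist_or_extern()
above, in miniature: the caller always writes the extra argument, and
the wrapper macro discards it in release builds, so the release
function signature stays one parameter shorter. report()/report_func()
are hypothetical names. */
#ifdef UNIV_DEBUG
static void
report_func(const char* caller, int value)
{
	/* debug build: caller information is available */
	(void) caller;
	(void) value;
}
# define report(caller, value) report_func(caller, value)
#else
static void
report_func(int value)
{
	(void) value;
}
# define report(caller, value) report_func(value)
#endif
/* --- end of editor's sketch --- */

/*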
*/ +close_exit: + dict_table_close(node->table, FALSE, FALSE); +err_exit: + rw_lock_s_unlock(&dict_operation_lock); + return(false); + } + + if (type == TRX_UNDO_UPD_EXIST_REC + && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) + && !*updated_extern) { + + /* Purge requires no changes to indexes: we may return */ + goto close_exit; + } + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + trx = thr_get_trx(thr); + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, trx, + node->heap, &(node->update)); + + /* Read to the partial row the fields that occur in indexes */ + + if (!(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + ptr = trx_undo_rec_get_partial_row( + ptr, clust_index, &node->row, + type == TRX_UNDO_UPD_DEL_REC, + node->heap); + } + + return(true); +} + +/***********************************************************//** +Purges the parsed record. +@return true if purged, false if skipped */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_purge_record_func( +/*==================*/ + purge_node_t* node, /*!< in: row purge node */ + trx_undo_rec_t* undo_rec, /*!< in: record to purge */ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + bool updated_extern) /*!< in: whether external columns + were updated */ +{ + dict_index_t* clust_index; + bool purged = true; + + clust_index = dict_table_get_first_index(node->table); + + node->index = dict_table_get_next_index(clust_index); + + switch (node->rec_type) { + case TRX_UNDO_DEL_MARK_REC: + purged = row_purge_del_mark(node); + if (!purged) { + break; + } + MONITOR_INC(MONITOR_N_DEL_ROW_PURGE); + break; + default: + if (!updated_extern) { + break; + } + /* fall through */ + case TRX_UNDO_UPD_EXIST_REC: + row_purge_upd_exist_or_extern(thr, node, undo_rec); + MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN); + break; + } + + if (node->found_clust) { + btr_pcur_close(&node->pcur); + node->found_clust = FALSE; + } + + if (node->table != NULL) { + dict_table_close(node->table, FALSE, FALSE); + node->table = NULL; + } + + return(purged); +} + +#ifdef UNIV_DEBUG +# define row_purge_record(node,undo_rec,thr,updated_extern) \ + row_purge_record_func(node,undo_rec,thr,updated_extern) +#else /* UNIV_DEBUG */ +# define row_purge_record(node,undo_rec,thr,updated_extern) \ + row_purge_record_func(node,undo_rec,updated_extern) +#endif /* UNIV_DEBUG */ + +/***********************************************************//** +Fetches an undo log record and does the purge for the recorded operation. +If none left, or the current purge completed, returns the control to the +parent node, which is always a query thread node. */ +static __attribute__((nonnull)) +void +row_purge( +/*======*/ + purge_node_t* node, /*!< in: row purge node */ + trx_undo_rec_t* undo_rec, /*!< in: record to purge */ + que_thr_t* thr) /*!< in: query thread */ +{ + if (undo_rec != &trx_purge_dummy_rec) { + bool updated_extern; + + while (row_purge_parse_undo_rec( + node, undo_rec, &updated_extern, thr)) { + + bool purged = row_purge_record( + node, undo_rec, thr, updated_extern); + + rw_lock_s_unlock(&dict_operation_lock); + + if (purged + || srv_shutdown_state != SRV_SHUTDOWN_NONE) { + return; + } + + /* Retry the purge in a second. */ + os_thread_sleep(1000000); + } + } +} + +/***********************************************************//** +Reset the purge query thread. 
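*/

/* --- Editor's sketch, illustrative only, not part of this commit ---
The control flow of row_purge() above, stripped to its skeleton: keep
re-parsing and retrying a record that could not be purged (typically
for lack of file space), sleeping one second between attempts, unless
the server is shutting down. All four helpers are hypothetical
stand-ins for the parse/purge/shutdown/sleep calls in the real code. */
static void
purge_loop_sketch(
	bool	(*parse)(void),
	bool	(*purge_one)(void),
	bool	(*shutting_down)(void),
	void	(*sleep_one_second)(void))
{
	while (parse()) {
		if (purge_one() || shutting_down()) {
			return;
		}

		sleep_one_second();	/* retry the purge in a second */
	}
}
/* --- end of editor's sketch --- */

/*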
*/ +UNIV_INLINE +void +row_purge_end( +/*==========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + purge_node_t* node; + + ut_ad(thr); + + node = static_cast<purge_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + + thr->run_node = que_node_get_parent(node); + + node->undo_recs = NULL; + + node->done = TRUE; + + ut_a(thr->run_node != NULL); + + mem_heap_empty(node->heap); +} + +/***********************************************************//** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + purge_node_t* node; + + ut_ad(thr); + + node = static_cast<purge_node_t*>(thr->run_node); + + node->table = NULL; + node->row = NULL; + node->ref = NULL; + node->index = NULL; + node->update = NULL; + node->found_clust = FALSE; + node->rec_type = ULINT_UNDEFINED; + node->cmpl_info = ULINT_UNDEFINED; + + ut_a(!node->done); + + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + + if (!(node->undo_recs == NULL || ib_vector_is_empty(node->undo_recs))) { + trx_purge_rec_t*purge_rec; + + purge_rec = static_cast<trx_purge_rec_t*>( + ib_vector_pop(node->undo_recs)); + + node->roll_ptr = purge_rec->roll_ptr; + + row_purge(node, purge_rec->undo_rec, thr); + + if (ib_vector_is_empty(node->undo_recs)) { + row_purge_end(thr); + } else { + thr->run_node = node; + } + } else { + row_purge_end(thr); + } + + return(thr); +} diff --git a/storage/xtradb/row/row0quiesce.cc b/storage/xtradb/row/row0quiesce.cc new file mode 100644 index 00000000000..1d67d5a9717 --- /dev/null +++ b/storage/xtradb/row/row0quiesce.cc @@ -0,0 +1,703 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0quiesce.cc +Quiesce a tablespace. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0quiesce.h" +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0quiesce.ic" +#endif + +#include "ibuf0ibuf.h" +#include "srv0start.h" +#include "trx0purge.h" + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. 
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_index_fields( +/*===========================*/ + const dict_index_t* index, /*!< in: write the meta data for + this index */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte row[sizeof(ib_uint32_t) * 2]; + + for (ulint i = 0; i < index->n_fields; ++i) { + byte* ptr = row; + const dict_field_t* field = &index->fields[i]; + + mach_write_to_4(ptr, field->prefix_len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, field->fixed_len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_9", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index fields."); + + return(DB_IO_ERROR); + } + + /* Include the NUL byte in the length. */ + ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field->name) + 1); + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_10", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(field->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index column."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file index information. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_indexes( +/*======================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + { + byte row[sizeof(ib_uint32_t)]; + + /* Write the number of indexes in the table. */ + mach_write_to_4(row, UT_LIST_GET_LEN(table->indexes)); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_11", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index count."); + + return(DB_IO_ERROR); + } + } + + dberr_t err = DB_SUCCESS; + + /* Write the index meta data. 
*/ + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0 && err == DB_SUCCESS; + index = UT_LIST_GET_NEXT(indexes, index)) { + + byte* ptr; + byte row[sizeof(index_id_t) + + sizeof(ib_uint32_t) * 8]; + + ptr = row; + + ut_ad(sizeof(index_id_t) == 8); + mach_write_to_8(ptr, index->id); + ptr += sizeof(index_id_t); + + mach_write_to_4(ptr, index->space); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->page); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->type); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->trx_id_offset); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_user_defined_cols); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_uniq); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_nullable); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_fields); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_12", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index meta-data."); + + return(DB_IO_ERROR); + } + + /* Write the length of the index name. + NUL byte is included in the length. */ + ib_uint32_t len = static_cast<ib_uint32_t>(strlen(index->name) + 1); + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_1", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(index->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index name."); + + return(DB_IO_ERROR); + } + + err = row_quiesce_write_index_fields(index, file, thd); + } + + return(err); +} + +/*********************************************************************//** +Write the meta data (table columns) config file. Serialise the contents of +dict_col_t structure, along with the column name. All fields are serialized +as ib_uint32_t. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_table( +/*====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 7]; + + col = table->cols; + + for (ulint i = 0; i < table->n_cols; ++i, ++col) { + byte* ptr = row; + + mach_write_to_4(ptr, col->prtype); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->mtype); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->mbminmaxlen); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ind); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ord_part); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->max_prefix); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_2", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table column data."); + + return(DB_IO_ERROR); + } + + /* Write out the column name as [len, byte array]. The len + includes the NUL byte. */ + ib_uint32_t len; + const char* col_name; + + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + + /* Include the NUL byte in the length. 
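On disk, each index record written above is a fixed 40-byte block, one big-endian 64-bit index id followed by eight big-endian 32-bit fields in the order of the mach_write_to_4() calls, then the [length, NUL-terminated name] pair and the per-field records. A sketch of the matching reader side; the struct and helpers are illustrative, not the actual import code:

    #include <cstdint>

    struct IndexMeta {
        std::uint64_t id;
        std::uint32_t space, page, type, trx_id_offset;
        std::uint32_t n_user_defined_cols, n_uniq, n_nullable, n_fields;
    };

    static std::uint32_t read_be32(const unsigned char* p)
    {
        return(((std::uint32_t) p[0] << 24) | ((std::uint32_t) p[1] << 16)
               | ((std::uint32_t) p[2] << 8) | (std::uint32_t) p[3]);
    }

    /* Parses the 40-byte fixed part of one index record. */
    static IndexMeta parse_index_meta(const unsigned char* p)
    {
        IndexMeta m;

        m.id = ((std::uint64_t) read_be32(p) << 32) | read_be32(p + 4);
        p += 8;

        std::uint32_t* fields[] = {
            &m.space, &m.page, &m.type, &m.trx_id_offset,
            &m.n_user_defined_cols, &m.n_uniq, &m.n_nullable,
            &m.n_fields};

        for (std::uint32_t** f = fields; f != fields + 8; ++f) {
            **f = read_be32(p);
            p += 4;
        }

        return(m);
    }

    int main()
    {
        unsigned char buf[40] = {0};
        buf[39] = 2;    /* n_fields = 2 in the last big-endian u32 */
        return(parse_index_meta(buf).n_fields == 2 ? 0 : 1);
    }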
*/ + len = static_cast<ib_uint32_t>(strlen(col_name) + 1); + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_3", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(col_name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing column name."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file header. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_header( +/*=====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Write the meta-data version number. */ + mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing meta-data version number."); + + return(DB_IO_ERROR); + } + + /* Write the server hostname. */ + ib_uint32_t len; + const char* hostname = server_get_hostname(); + + /* Play it safe and check for NULL. */ + if (hostname == 0) { + static const char NullHostname[] = "Hostname unknown"; + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to determine server hostname."); + + hostname = NullHostname; + } + + /* The server hostname includes the NUL byte. */ + len = static_cast<ib_uint32_t>(strlen(hostname) + 1); + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(hostname, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing hostname."); + + return(DB_IO_ERROR); + } + + /* The table name includes the NUL byte. */ + ut_a(table->name != 0); + len = static_cast<ib_uint32_t>(strlen(table->name) + 1); + + /* Write the table name. */ + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(table->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table name."); + + return(DB_IO_ERROR); + } + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Write the next autoinc value. */ + mach_write_to_8(row, table->autoinc); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file));); + + if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table autoinc value."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + /* Write the system page size. */ + mach_write_to_4(ptr, UNIV_PAGE_SIZE); + ptr += sizeof(ib_uint32_t); + + /* Write the table->flags. */ + mach_write_to_4(ptr, table->flags); + ptr += sizeof(ib_uint32_t); + + /* Write the number of columns in the table. 
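Column and index names share one on-disk convention: a big-endian 32-bit length that counts the trailing NUL, followed by exactly that many bytes. Because the NUL is counted, a well-formed entry is never shorter than 2 bytes, which is what the ut_a(len > 1) assertions enforce. A sketch of a defensive reader for this convention; the helper names are illustrative:

    #include <cstdio>
    #include <string>
    #include <vector>

    static bool read_be32(std::FILE* f, unsigned long& v)
    {
        unsigned char p[4];

        if (std::fread(p, 1, 4, f) != 4) {
            return(false);
        }
        v = ((unsigned long) p[0] << 24) | ((unsigned long) p[1] << 16)
            | ((unsigned long) p[2] << 8) | (unsigned long) p[3];
        return(true);
    }

    static bool read_name(std::FILE* f, std::string& out)
    {
        unsigned long len;

        if (!read_be32(f, len) || len < 2) {    /* mirrors ut_a(len > 1) */
            return(false);
        }

        std::vector<char> buf(len);

        if (std::fread(buf.data(), 1, len, f) != len
            || buf[len - 1] != '\0') {          /* must be NUL-terminated */
            return(false);
        }

        out.assign(buf.data(), len - 1);
        return(true);
    }

    int main()
    {
        std::FILE* f = std::tmpfile();
        if (f == NULL) return(1);

        unsigned char hdr[4] = {0, 0, 0, 5};
        std::fwrite(hdr, 1, 4, f);              /* len = 5, includes NUL */
        std::fwrite("col1", 1, 5, f);
        std::rewind(f);

        std::string name;
        return(read_name(f, name) && name == "col1" ? 0 : 1);
    }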
*/ + mach_write_to_4(ptr, table->n_cols); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table meta-data."); + + return(DB_IO_ERROR); + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the table meta data after quiesce. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_cfg( +/*==================*/ + dict_table_t* table, /*!< in: write the meta data for + this table */ + THD* thd) /*!< in/out: session */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + ib_logf(IB_LOG_LEVEL_INFO, "Writing table metadata to '%s'", name); + + FILE* file = fopen(name, "w+b"); + + if (file == NULL) { + ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE, + name, errno, strerror(errno)); + + err = DB_IO_ERROR; + } else { + err = row_quiesce_write_header(table, file, thd); + + if (err == DB_SUCCESS) { + err = row_quiesce_write_table(table, file, thd); + } + + if (err == DB_SUCCESS) { + err = row_quiesce_write_indexes(table, file, thd); + } + + if (fflush(file) != 0) { + + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), "%s fflush() failed", + name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), msg); + } + + if (fclose(file) != 0) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), "%s fclose() failed", + name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), msg); + } + } + + return(err); +} + +/*********************************************************************//** +Check whether a table has an FTS index defined on it. +@return true if an FTS index exists on the table */ +static +bool +row_quiesce_table_has_fts_index( +/*============================*/ + const dict_table_t* table) /*!< in: quiesce this table */ +{ + bool exists = false; + + dict_mutex_enter_for_mysql(); + + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + exists = true; + break; + } + } + + dict_mutex_exit_for_mysql(); + + return(exists); +} + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. 
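Putting the three writers together, the .cfg file produced by row_quiesce_write_cfg() has a simple sequential layout. The outline below is a reading aid derived from the code above, not a format specification; all integers are big-endian and all name lengths count the trailing NUL:

    /* .cfg layout, in file order:

       row_quiesce_write_header():
         u32  version              IB_EXPORT_CFG_VERSION_V1
         u32  hostname_len;  byte hostname[hostname_len]
         u32  table_name_len; byte table_name[table_name_len]
         u64  autoinc
         u32  page_size            UNIV_PAGE_SIZE
         u32  flags                table->flags
         u32  n_cols

       row_quiesce_write_table(), n_cols times:
         u32  prtype, mtype, len, mbminmaxlen, ind, ord_part, max_prefix
         u32  name_len; byte name[name_len]

       row_quiesce_write_indexes():
         u32  n_indexes
         per index: u64 id; u32 space, page, type, trx_id_offset,
                    n_user_defined_cols, n_uniq, n_nullable, n_fields;
                    u32 name_len; byte name[name_len];
                    then n_fields records as in row_quiesce_write_index_fields()
    */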
*/ +UNIV_INTERN +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ut_a(trx->mysql_thd != 0); + ut_a(srv_n_purge_threads > 0); + ut_ad(!srv_read_only_mode); + + char table_name[MAX_FULL_NAME_LEN + 1]; + + ut_a(trx->mysql_thd != 0); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Sync to disk of '%s' started.", table_name); + + if (trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_stop(); + } + + ut_a(table->id > 0); + + for (ulint count = 0; + ibuf_contract_in_background(table->id, TRUE) != 0 + && !trx_is_interrupted(trx); + ++count) { + if (!(count % 20)) { + ib_logf(IB_LOG_LEVEL_INFO, + "Merging change buffer entries for '%s'", + table_name); + } + } + + if (!trx_is_interrupted(trx)) { + buf_LRU_flush_or_remove_pages( + table->space, BUF_REMOVE_FLUSH_WRITE, trx); + + if (trx_is_interrupted(trx)) { + + ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!"); + + } else if (row_quiesce_write_cfg(table, trx->mysql_thd) + != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_WARN, + "There was an error writing to the " + "meta data file"); + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "Table '%s' flushed to disk", table_name); + } + } else { + ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!"); + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Cleanup after table quiesce. */ +UNIV_INTERN +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ulint count = 0; + char table_name[MAX_FULL_NAME_LEN + 1]; + + ut_a(trx->mysql_thd != 0); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + /* We need to wait for the operation to complete if the + transaction has been killed. */ + + while (table->quiesce != QUIESCE_COMPLETE) { + + /* Print a warning after every minute. */ + if (!(count % 60)) { + ib_logf(IB_LOG_LEVEL_WARN, + "Waiting for quiesce of '%s' to complete", + table_name); + } + + /* Sleep for a second. */ + os_thread_sleep(1000000); + + ++count; + } + + /* Remove the .cfg file now that the user has resumed + normal operations. Otherwise it will cause problems when + the user tries to drop the database (remove directory). */ + char cfg_name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name)); + + os_file_delete_if_exists(innodb_file_data_key, cfg_name); + + ib_logf(IB_LOG_LEVEL_INFO, + "Deleting the meta-data file '%s'", cfg_name); + + if (trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_run(); + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. 
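The two halves pair up around the quiesce state: row_quiesce_table_start() stops purge, merges the table's change buffer entries, flushes the tablespace pages, writes the .cfg and sets QUIESCE_COMPLETE; row_quiesce_table_complete() then polls that state once per second, deletes the .cfg and restarts purge, so the table cycles QUIESCE_NONE, QUIESCE_START, QUIESCE_COMPLETE, back to QUIESCE_NONE. A condensed sketch of the polling wait, with an atomic flag standing in for table->quiesce and everything else illustrative:

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <thread>

    static void wait_for_quiesce(const std::atomic<bool>& complete,
                                 const char* table_name)
    {
        for (unsigned count = 0; !complete.load(); ++count) {
            /* Print a warning after every minute, as the source does. */
            if (!(count % 60)) {
                std::printf("Waiting for quiesce of '%s' to complete\n",
                            table_name);
            }

            /* Sleep for a second. */
            std::this_thread::sleep_for(std::chrono::seconds(1));
        }
    }

    int main()
    {
        std::atomic<bool> complete(false);

        std::thread worker([&complete]() {
            std::this_thread::sleep_for(std::chrono::seconds(2));
            complete = true;    /* stands in for QUIESCE_COMPLETE */
        });

        wait_for_quiesce(complete, "test/t1");
        worker.join();
    }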
*/ +UNIV_INTERN +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(srv_n_purge_threads > 0); + + if (srv_read_only_mode) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + return(DB_UNSUPPORTED); + + } else if (table->space == TRX_SYS_SPACE) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); + + return(DB_UNSUPPORTED); + } else if (row_quiesce_table_has_fts_index(table)) { + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on tables that have an FTS index. " + "FTS auxiliary tables will not be flushed."); + + } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + /* If this flag is set then the table may not have any active + FTS indexes but it will still have the auxiliary tables. */ + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on a table that had an FTS index, " + "created on a hidden column, the " + "auxiliary tables haven't been dropped as yet. " + "FTS auxiliary tables will not be flushed."); + } + + row_mysql_lock_data_dictionary(trx); + + dict_table_x_lock_indexes(table); + + switch (state) { + case QUIESCE_START: + ut_a(table->quiesce == QUIESCE_NONE); + break; + + case QUIESCE_COMPLETE: + ut_a(table->quiesce == QUIESCE_START); + break; + + case QUIESCE_NONE: + ut_a(table->quiesce == QUIESCE_COMPLETE); + break; + } + + table->quiesce = state; + + dict_table_x_unlock_indexes(table); + + row_mysql_unlock_data_dictionary(trx); + + return(DB_SUCCESS); +} + diff --git a/storage/xtradb/row/row0row.cc b/storage/xtradb/row/row0row.cc new file mode 100644 index 00000000000..be786f954fb --- /dev/null +++ b/storage/xtradb/row/row0row.cc @@ -0,0 +1,1252 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0row.cc +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" + +#ifdef UNIV_NONINL +#include "row0row.ic" +#endif + +#include "data0type.h" +#include "dict0dict.h" +#include "btr0btr.h" +#include "ha_prototypes.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "ut0mem.h" + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +UNIV_INTERN +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ +{ + dtuple_t* entry; + ulint entry_len; + ulint i; + + entry_len = dict_index_get_n_fields(index); + entry = dtuple_create(heap, entry_len); + + if (dict_index_is_univ(index)) { + dtuple_set_n_fields_cmp(entry, entry_len); + /* There may only be externally stored columns + in a clustered index B-tree of a user table. */ + ut_a(!ext); + } else { + dtuple_set_n_fields_cmp( + entry, dict_index_get_n_unique_in_tree(index)); + } + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = ind_field->col; + ulint col_no + = dict_col_get_no(col); + dfield_t* dfield + = dtuple_get_nth_field(entry, i); + const dfield_t* dfield2 + = dtuple_get_nth_field(row, col_no); + ulint len; + +#if DATA_MISSING != 0 +# error "DATA_MISSING != 0" +#endif + if (UNIV_UNLIKELY(dfield_get_type(dfield2)->mtype + == DATA_MISSING)) { + /* The field has not been initialized in the row. + This should be from trx_undo_rec_get_partial_row(). */ + return(NULL); + } + + len = dfield_get_len(dfield2); + + dfield_copy(dfield, dfield2); + + if (dfield_is_null(dfield)) { + continue; + } + + if (ind_field->prefix_len == 0 + && (!dfield_is_ext(dfield) + || dict_index_is_clust(index))) { + /* The dfield_copy() above suffices for + columns that are stored in-page, or for + clustered index record columns that are not + part of a column prefix in the PRIMARY KEY. */ + continue; + } + + /* If the column is stored externally (off-page) in + the clustered index, it must be an ordering field in + the secondary index. In the Antelope format, only + prefix-indexed columns may be stored off-page in the + clustered index record. In the Barracuda format, also + fully indexed long CHAR or VARCHAR columns may be + stored off-page. 
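row_build_index_entry_low() is, at heart, a column-picking loop: for each index field it copies the referenced row column into the entry, returns NULL as soon as a needed column is DATA_MISSING, and truncates prefix columns to the indexed length. A toy version over simplified types; note that the real code truncates in characters via dtype_get_at_most_n_mbchars(), not in bytes as below:

    #include <cstdio>
    #include <optional>
    #include <string>
    #include <vector>

    struct IndexField {
        std::size_t col_no;
        std::size_t prefix_len;     /* 0 = index the whole column */
    };

    static std::optional<std::vector<std::string>> build_entry(
        const std::vector<std::optional<std::string>>& row,
        const std::vector<IndexField>& fields)
    {
        std::vector<std::string> entry;

        for (const IndexField& f : fields) {
            if (!row[f.col_no]) {
                return(std::nullopt);   /* DATA_MISSING: give up */
            }

            std::string v = *row[f.col_no];

            if (f.prefix_len && v.size() > f.prefix_len) {
                v.resize(f.prefix_len); /* column prefix index */
            }

            entry.push_back(v);
        }

        return(entry);
    }

    int main()
    {
        std::vector<std::optional<std::string>> row = {"alpha", "beta"};
        auto entry = build_entry(row, {{1, 0}, {0, 3}});

        if (entry) {
            for (const std::string& v : *entry) {
                std::printf("%s\n", v.c_str());     /* beta, alp */
            }
        }
    }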
*/ + ut_ad(col->ord_part); + + if (ext) { + /* See if the column is stored externally. */ + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + return(NULL); + } + dfield_set_data(dfield, buf, len); + } + + if (ind_field->prefix_len == 0) { + /* In the Barracuda format + (ROW_FORMAT=DYNAMIC or + ROW_FORMAT=COMPRESSED), we can have a + secondary index on an entire column + that is stored off-page in the + clustered index. As this is not a + prefix index (prefix_len == 0), + include the entire off-page column in + the secondary index record. */ + continue; + } + } else if (dfield_is_ext(dfield)) { + /* This table is either in Antelope format + (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT) + or a purge record where the ordered part of + the field is not external. + In Antelope, the maximum column prefix + index length is 767 bytes, and the clustered + index record contains a 768-byte prefix of + each off-page column. */ + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + len -= BTR_EXTERN_FIELD_REF_SIZE; + dfield_set_len(dfield, len); + } + + /* If a column prefix index, take only the prefix. */ + if (ind_field->prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminmaxlen, + ind_field->prefix_len, len, + static_cast<char*>(dfield_get_data(dfield))); + dfield_set_len(dfield, len); + } + } + + return(entry); +} + +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! */ +UNIV_INTERN +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! 
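The Antelope branch above relies on fixed arithmetic: a locally stored off-page column consists of a 768-byte prefix followed by a 20-byte external field reference (BTR_EXTERN_FIELD_REF_SIZE), so the usable in-record length is len minus 20. A worked check of that adjustment; the constants mirror the source, the rest is illustrative:

    #include <cassert>
    #include <cstddef>

    const std::size_t BTR_EXTERN_FIELD_REF_SIZE = 20;
    const std::size_t REC_ANTELOPE_MAX_INDEX_COL_LEN = 768;

    int main()
    {
        /* 788 bytes stored locally: 768-byte prefix + 20-byte pointer. */
        std::size_t clust_len = REC_ANTELOPE_MAX_INDEX_COL_LEN
                              + BTR_EXTERN_FIELD_REF_SIZE;

        assert(clust_len >= BTR_EXTERN_FIELD_REF_SIZE); /* mirrors ut_a() */

        std::size_t usable = clust_len - BTR_EXTERN_FIELD_REF_SIZE;

        return(usable == 768 ? 0 : 1);
    }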
*/ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + const byte* copy; + dtuple_t* row; + ulint n_ext_cols; + ulint* ext_cols = NULL; /* remove warning */ + ulint len; + byte* buf; + ulint j; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_ad(index && rec && heap); + ut_ad(dict_index_is_clust(index)); + ut_ad(!mutex_own(&trx_sys->mutex)); + ut_ad(!col_map || col_table); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + if (rec_offs_any_null_extern(rec, offsets)) { + /* This condition can occur during crash recovery + before trx_rollback_active() has completed execution, + or when a concurrently executing + row_ins_index_entry_low() has committed the B-tree + mini-transaction but has not yet managed to restore + the cursor position for writing the big_rec. */ + ut_a(trx_undo_roll_ptr_is_insert( + row_get_rec_roll_ptr(rec, index, offsets))); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (type != ROW_COPY_POINTERS) { + /* Take a copy of rec to heap */ + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + copy = rec_copy(buf, rec, offsets); + } else { + copy = rec; + } + + n_ext_cols = rec_offs_n_extern(offsets); + if (n_ext_cols) { + ext_cols = static_cast<ulint*>( + mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols)); + } + + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(copy, index, const_cast<ulint*>(offsets)); + + if (!col_table) { + ut_ad(!col_map); + ut_ad(!add_cols); + col_table = index->table; + } + + if (add_cols) { + ut_ad(col_map); + row = dtuple_copy(add_cols, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(col_table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(col_table)); + dict_table_copy_types(row, col_table); + } + + dtuple_set_info_bits(row, rec_get_info_bits( + copy, rec_offs_comp(offsets))); + + j = 0; + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. 
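During an online table rebuild, row_build() can translate between the old and new table definitions through col_map: each old column number maps to its new position, and ULINT_UNDEFINED marks a column the rebuild dropped. A minimal sketch of that remapping; the sentinel constant here is an illustrative stand-in:

    #include <cstddef>
    #include <cstdio>
    #include <limits>
    #include <vector>

    const std::size_t UNDEFINED = std::numeric_limits<std::size_t>::max();

    int main()
    {
        /* Old table had 4 columns; column 2 was dropped, the rest kept. */
        std::vector<std::size_t> col_map = {0, 1, UNDEFINED, 2};

        for (std::size_t old_no = 0; old_no < col_map.size(); ++old_no) {
            std::size_t new_no = col_map[old_no];

            if (new_no == UNDEFINED) {
                continue;               /* dropped column: skip the field */
            }

            std::printf("old col %zu -> new col %zu\n", old_no, new_no);
        }
    }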
*/ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + const dict_col_t* col + = dict_field_get_col(ind_field); + ulint col_no + = dict_col_get_no(col); + + if (col_map) { + col_no = col_map[col_no]; + + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } + } + + dfield_t* dfield = dtuple_get_nth_field(row, col_no); + + const byte* field = rec_get_nth_field( + copy, offsets, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + + col = dict_table_get_nth_col(col_table, col_no); + + if (col->ord_part) { + /* We will have to fetch prefixes of + externally stored columns that are + referenced by column prefixes. */ + ext_cols[j++] = col_no; + } + } + } + + rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets)); + + ut_ad(dtuple_check_typed(row)); + + if (!ext) { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. + + During online table rebuild, + row_log_table_apply_delete_low() + may use a cache that was set up by + row_log_table_delete(). */ + + } else if (j) { + *ext = row_ext_create(j, ext_cols, index->table->flags, row, + heap); + } else { + *ext = NULL; + } + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(row); +} + +/*******************************************************************//** +Converts an index record to a typed data tuple. +@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + ulint i; + const byte* field; + ulint len; + ulint rec_len; + + ut_ad(rec && heap && index); + /* Because this function may be invoked by row0merge.cc + on a record whose header is in different format, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + ut_ad(n_ext); + *n_ext = 0; + + rec_len = rec_offs_n_fields(offsets); + + entry = dtuple_create(heap, rec_len); + + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + ut_ad(rec_len == dict_index_get_n_fields(index)); + + dict_index_copy_types(entry, index, rec_len); + + for (i = 0; i < rec_len; i++) { + + dfield = dtuple_get_nth_field(entry, i); + field = rec_get_nth_field(rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + (*n_ext)++; + } + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. 
+@return own: index entry built */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* entry; + byte* buf; + const rec_t* copy_rec; + + ut_ad(rec && heap && index); + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* Take a copy of rec to heap */ + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + copy_rec = rec_copy(buf, rec, offsets); + + rec_offs_make_valid(copy_rec, index, const_cast<ulint*>(offsets)); + entry = row_rec_to_index_entry_low( + copy_rec, index, offsets, n_ext, heap); + rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets)); + + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); + + return(entry); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +UNIV_INTERN +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dtuple_t* ref; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + byte* buf; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index && rec && heap); + ut_ad(!dict_index_is_clust(index)); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. 
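Building a row reference inverts the secondary index: for each of the clustered index's unique fields, dict_index_get_nth_field_pos() locates that column inside the secondary record, and when the primary key only indexes a prefix, the possibly longer secondary value is truncated back to the key's prefix length. A toy version of that mapping loop; byte truncation here, whereas InnoDB counts characters:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct ClustField {
        std::size_t sec_pos;        /* position inside the secondary record */
        std::size_t prefix_len;     /* 0 = the PK indexes the full column */
    };

    static std::vector<std::string> build_row_ref(
        const std::vector<std::string>& sec_rec,
        const std::vector<ClustField>& clust_key)
    {
        std::vector<std::string> ref;

        for (const ClustField& f : clust_key) {
            std::string v = sec_rec[f.sec_pos];

            if (f.prefix_len && v.size() > f.prefix_len) {
                v.resize(f.prefix_len); /* cut back to the PK prefix */
            }

            ref.push_back(v);
        }

        return(ref);
    }

    int main()
    {
        /* Secondary record (indexed_col, pk_col); the PK is a 4-byte
        prefix of column 1, so the longer stored value is trimmed. */
        std::vector<std::string> sec = {"zzz", "abcdefgh"};

        for (const std::string& v : build_row_ref(sec, {{1, 4}})) {
            std::printf("%s\n", v.c_str());     /* abcd */
        }
    }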
*/ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(ref); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! */ + const dict_index_t* index, /*!< in: secondary index */ + ulint* offsets,/*!< in: rec_get_offsets(rec, index) + or NULL */ + trx_t* trx) /*!< in: transaction */ +{ + const dict_index_t* clust_index; + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_a(ref); + ut_a(index); + ut_a(rec); + ut_ad(!dict_index_is_clust(index)); + + if (UNIV_UNLIKELY(!index->table)) { + fputs("InnoDB: table ", stderr); +notfound: + ut_print_name(stderr, trx, TRUE, index->table_name); + fputs(" for index ", stderr); + ut_print_name(stderr, trx, FALSE, index->name); + fputs(" not found\n", stderr); + ut_error; + } + + clust_index = dict_table_get_first_index(index->table); + + if (UNIV_UNLIKELY(!clust_index)) { + fputs("InnoDB: clust index for table ", stderr); + goto notfound; + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************//** +Searches the clustered index record for a row, if we have the row reference. 
+@return TRUE if found */ +UNIV_INTERN +ibool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ulint low_match; + rec_t* rec; + dict_index_t* index; + + ut_ad(dtuple_check_typed(ref)); + + index = dict_table_get_first_index(table); + + ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index)); + + btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr); + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_infimum(rec)) { + + return(FALSE); + } + + if (low_match != dtuple_get_n_fields(ref)) { + + return(FALSE); + } + + return(TRUE); +} + +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +UNIV_INTERN +rec_t* +row_get_clust_rec( +/*==============*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* ref; + dict_table_t* table; + btr_pcur_t pcur; + ibool found; + rec_t* clust_rec; + + ut_ad(!dict_index_is_clust(index)); + + table = index->table; + + heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap); + + found = row_search_on_row_ref(&pcur, mode, table, ref, mtr); + + clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL; + + mem_heap_free(heap); + + btr_pcur_close(&pcur); + + *clust_index = dict_table_get_first_index(table); + + return(clust_rec); +} + +/***************************************************************//** +Searches an index record. +@return whether the record was found or buffered */ +UNIV_INTERN +enum row_search_result +row_search_index_entry( +/*===================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry, /*!< in: index entry */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint n_fields; + ulint low_match; + rec_t* rec; + + ut_ad(dtuple_check_typed(entry)); + + btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr); + + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + ut_a(mode & BTR_DELETE); + return(ROW_NOT_DELETED_REF); + + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + return(ROW_BUFFERED); + + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + + n_fields = dtuple_get_n_fields(entry); + + if (page_rec_is_infimum(rec)) { + + return(ROW_NOT_FOUND); + } else if (low_match != n_fields) { + + return(ROW_NOT_FOUND); + } + + return(ROW_FOUND); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_INT using "prtype" and writes the result to "buf". 
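Both search helpers use the same "found" test: position the cursor with PAGE_CUR_LE on the greatest record not exceeding the tuple, then report a hit only if every field matched (low_match equals the tuple's field count); landing on the page infimum means the tuple sorts below everything. The same idea over a sorted array of ints, purely as an illustration:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> page = {10, 20, 30};   /* stand-in for index records */
        int entry = 25;

        /* Greatest record <= entry: PAGE_CUR_LE positioning. */
        std::vector<int>::iterator it
            = std::upper_bound(page.begin(), page.end(), entry);

        if (it == page.begin()) {
            std::puts("positioned on infimum: not found");
        } else if (*--it == entry) {
            /* Full match, i.e. low_match == n_fields. */
            std::puts("ROW_FOUND");
        } else {
            /* Partial positioning only (here on 20 < 25). */
            std::puts("ROW_NOT_FOUND");
        }
    }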
+If the data is in unknown format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). +@return number of bytes that were written */ +static +ulint +row_raw_format_int( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formatted in hex */ +{ + ulint ret; + + if (data_len <= sizeof(ib_uint64_t)) { + + ib_uint64_t value; + ibool unsigned_type = prtype & DATA_UNSIGNED; + + value = mach_read_int_type( + (const byte*) data, data_len, unsigned_type); + + ret = ut_snprintf( + buf, buf_size, + unsigned_type ? UINT64PF : INT64PF, value) + 1; + } else { + + *format_in_hex = TRUE; + ret = 0; + } + + return(ut_min(ret, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the +result to "buf". +If the data is in binary format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). +@return number of bytes that were written */ +static +ulint +row_raw_format_str( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formatted in hex */ +{ + ulint charset_coll; + + if (buf_size == 0) { + + return(0); + } + + /* we assume system_charset_info is UTF-8 */ + + charset_coll = dtype_get_charset_coll(prtype); + + if (UNIV_LIKELY(dtype_is_utf8(prtype))) { + + return(ut_str_sql_format(data, data_len, buf, buf_size)); + } + /* else */ + + if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) { + + *format_in_hex = TRUE; + return(0); + } + /* else */ + + return(innobase_raw_format(data, data_len, charset_coll, + buf, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). 
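The decoding done by mach_read_int_type() explains the vectors in test_row_raw_format_int() below: InnoDB stores integers big-endian, with signed values biased by 2^(8*len-1) (equivalently, with the sign bit flipped) so that byte-wise ordering matches numeric ordering. Hence "\x00" reads back as -128 for a signed single byte and "\xFF" as 127. An illustrative re-implementation, not InnoDB's function:

    #include <cstdint>
    #include <cstdio>

    static std::int64_t read_int(const unsigned char* p, unsigned len,
                                 bool is_unsigned)
    {
        std::uint64_t v = 0;

        for (unsigned i = 0; i < len; ++i) {
            v = (v << 8) | p[i];    /* big-endian */
        }

        if (is_unsigned) {
            return((std::int64_t) v);
        }

        /* Undo the sign bias; the unsigned wrap-around yields the right
        two's complement value when cast back. */
        return((std::int64_t)(v - (1ULL << (8 * len - 1))));
    }

    int main()
    {
        unsigned char zero = 0x00;
        unsigned char ff = 0xFF;

        std::printf("%lld\n", (long long) read_int(&zero, 1, false)); /* -128 */
        std::printf("%lld\n", (long long) read_int(&ff, 1, false));   /* 127 */
        std::printf("%lld\n", (long long) read_int(&ff, 1, true));    /* 255 */
    }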
+@return number of bytes that were written */ +UNIV_INTERN +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint mtype; + ulint prtype; + ulint ret; + ibool format_in_hex; + + if (buf_size == 0) { + + return(0); + } + + if (data_len == UNIV_SQL_NULL) { + + ret = ut_snprintf((char*) buf, buf_size, "NULL") + 1; + + return(ut_min(ret, buf_size)); + } + + mtype = dict_field->col->mtype; + prtype = dict_field->col->prtype; + + format_in_hex = FALSE; + + switch (mtype) { + case DATA_INT: + + ret = row_raw_format_int(data, data_len, prtype, + buf, buf_size, &format_in_hex); + if (format_in_hex) { + + goto format_in_hex; + } + break; + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + + ret = row_raw_format_str(data, data_len, prtype, + buf, buf_size, &format_in_hex); + if (format_in_hex) { + + goto format_in_hex; + } + + break; + /* XXX support more data types */ + default: + format_in_hex: + + if (UNIV_LIKELY(buf_size > 2)) { + + memcpy(buf, "0x", 2); + buf += 2; + buf_size -= 2; + ret = 2 + ut_raw_to_hex(data, data_len, + buf, buf_size); + } else { + + buf[0] = '\0'; + ret = 1; + } + } + + return(ret); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +#include "ut0dbg.h" + +void +test_row_raw_format_int() +{ + ulint ret; + char buf[128]; + ibool format_in_hex; + speedo_t speedo; + ulint i; + +#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\ + ret_expected, buf_expected, format_in_hex_expected)\ + do {\ + ibool ok = TRUE;\ + ulint i;\ + memset(buf, 'x', 10);\ + buf[10] = '\0';\ + format_in_hex = FALSE;\ + fprintf(stderr, "TESTING \"\\x");\ + for (i = 0; i < data_len; i++) {\ + fprintf(stderr, "%02hhX", data[i]);\ + }\ + fprintf(stderr, "\", %lu, %lu, %lu\n",\ + (ulint) data_len, (ulint) prtype,\ + (ulint) buf_size);\ + ret = row_raw_format_int(data, data_len, prtype,\ + buf, buf_size, &format_in_hex);\ + if (ret != ret_expected) {\ + fprintf(stderr, "expected ret %lu, got %lu\n",\ + (ulint) ret_expected, ret);\ + ok = FALSE;\ + }\ + if (strcmp((char*) buf, buf_expected) != 0) {\ + fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\ + buf_expected, buf);\ + ok = FALSE;\ + }\ + if (format_in_hex != format_in_hex_expected) {\ + fprintf(stderr, "expected format_in_hex %d, got %d\n",\ + (int) format_in_hex_expected,\ + (int) format_in_hex);\ + ok = FALSE;\ + }\ + if (ok) {\ + fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\ + (ulint) ret, buf, (int) format_in_hex);\ + } else {\ + return;\ + }\ + } while (0) + +#if 1 + /* min values for signed 1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, 0, + buf, sizeof(buf), 5, "-128", 0); + + CALL_AND_TEST("\x00\x00", 2, 0, + buf, sizeof(buf), 7, "-32768", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, 0, + buf, sizeof(buf), 9, "-8388608", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, 0, + buf, sizeof(buf), 12, "-2147483648", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0, + buf, sizeof(buf), 14, "-549755813888", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0, + buf, sizeof(buf), 17, "-140737488355328", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0, + buf, sizeof(buf), 19, "-36028797018963968", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0, + buf, sizeof(buf), 21, "-9223372036854775808", 0); + + /* min values for unsigned 1-8 byte integers */ + + 
CALL_AND_TEST("\x00", 1, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + /* max values for signed 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, 0, + buf, sizeof(buf), 4, "127", 0); + + CALL_AND_TEST("\xFF\xFF", 2, 0, + buf, sizeof(buf), 6, "32767", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, 0, + buf, sizeof(buf), 8, "8388607", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0, + buf, sizeof(buf), 11, "2147483647", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0, + buf, sizeof(buf), 13, "549755813887", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0, + buf, sizeof(buf), 16, "140737488355327", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0, + buf, sizeof(buf), 18, "36028797018963967", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0, + buf, sizeof(buf), 20, "9223372036854775807", 0); + + /* max values for unsigned 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED, + buf, sizeof(buf), 4, "255", 0); + + CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "65535", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED, + buf, sizeof(buf), 9, "16777215", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED, + buf, sizeof(buf), 11, "4294967295", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED, + buf, sizeof(buf), 14, "1099511627775", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED, + buf, sizeof(buf), 16, "281474976710655", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED, + buf, sizeof(buf), 18, "72057594037927935", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED, + buf, sizeof(buf), 21, "18446744073709551615", 0); + + /* some random values */ + + CALL_AND_TEST("\x52", 1, 0, + buf, sizeof(buf), 4, "-46", 0); + + CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED, + buf, sizeof(buf), 3, "14", 0); + + CALL_AND_TEST("\x62\xCE", 2, 0, + buf, sizeof(buf), 6, "-7474", 0); + + CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "10710", 0); + + CALL_AND_TEST("\x7F\xFF\x90", 3, 0, + buf, sizeof(buf), 5, "-112", 0); + + CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED, + buf, sizeof(buf), 6, "41238", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0, + buf, sizeof(buf), 3, "-9", 0); + + CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED, + buf, sizeof(buf), 3, "92", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0, + buf, sizeof(buf), 6, "-9117", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED, + buf, sizeof(buf), 6, "91234", 0); +#endif + + /* speed test */ + + speedo_reset(&speedo); + + for (i = 0; i < 1000000; i++) { + row_raw_format_int("\x23", 1, + 0, buf, sizeof(buf), + &format_in_hex); + row_raw_format_int("\x23", 1, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + 0, buf, sizeof(buf), + &format_in_hex); + 
row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + } + + speedo_show(&speedo); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/row/row0sel.cc b/storage/xtradb/row/row0sel.cc new file mode 100644 index 00000000000..7ed45ccd06d --- /dev/null +++ b/storage/xtradb/row/row0sel.cc @@ -0,0 +1,5390 @@ +/***************************************************************************** + +Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***************************************************//** +@file row/row0sel.cc +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" + +#ifdef UNIV_NONINL +#include "row0sel.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" +#include "read0read.h" +#include "buf0lru.h" +#include "ha_prototypes.h" +#include "srv0start.h" +#include "m_string.h" /* for my_sys.h */ +#include "my_sys.h" /* DEBUG_SYNC_C */ + +#include "my_compare.h" /* enum icp_result */ + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/********************************************************************//** +Returns TRUE if the user-defined column in a secondary index record +is alphabetically the same as the corresponding BLOB column in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! 
+@return TRUE if the columns are equal */ +static +ibool +row_sel_sec_rec_is_for_blob( +/*========================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of + a multi-byte character */ + const byte* clust_field, /*!< in: the locally stored part of + the clustered index column, including + the BLOB pointer; the clustered + index record must be covered by + a lock or a page latch to protect it + against deletion (rollback or purge) */ + ulint clust_len, /*!< in: length of clust_field */ + const byte* sec_field, /*!< in: column in secondary index */ + ulint sec_len, /*!< in: length of sec_field */ + ulint prefix_len, /*!< in: index column prefix length + in bytes */ + dict_table_t* table) /*!< in: table */ +{ + ulint len; + byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN]; + ulint zip_size = dict_tf_get_zip_size(table->flags); + + /* This function should never be invoked on an Antelope format + table, because they should always contain enough prefix in the + clustered index record. */ + ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B); + ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(prefix_len >= sec_len); + ut_ad(prefix_len > 0); + ut_a(prefix_len <= sizeof buf); + + if (UNIV_UNLIKELY + (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + recv_recovery_rollback_active() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return(FALSE); + } + + len = btr_copy_externally_stored_field_prefix(buf, prefix_len, + zip_size, + clust_field, clust_len); + + if (UNIV_UNLIKELY(len == 0)) { + /* The BLOB was being deleted as the server crashed. + There should not be any secondary index records + referring to this clustered index record, because + btr_free_externally_stored_field() is called after all + secondary index entries of the row have been purged. */ + return(FALSE); + } + + len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen, + prefix_len, len, (const char*) buf); + + return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len)); +} + +/********************************************************************//** +Returns TRUE if the user-defined column values in a secondary index record +are alphabetically the same as the corresponding columns in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! 
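The comparison above never fetches the whole BLOB: because the secondary column is at most prefix_len bytes long, copying a prefix_len-byte prefix of the external column and cutting it to the prefix's character count is enough to decide equality. A byte-level analogue; the real code truncates in characters and compares with the column's collation, this sketch is plain memcmp():

    #include <algorithm>
    #include <cstdio>
    #include <cstring>
    #include <string>

    static bool sec_rec_is_for_blob(const std::string& blob,
                                    const std::string& sec_field,
                                    std::size_t prefix_len)
    {
        /* Fetch at most prefix_len bytes of the off-page value. */
        std::string prefix = blob.substr(0, std::min(prefix_len, blob.size()));

        return(prefix.size() == sec_field.size()
               && std::memcmp(prefix.data(), sec_field.data(),
                              prefix.size()) == 0);
    }

    int main()
    {
        std::string blob(1000, 'x');    /* externally stored column value */

        std::printf("%d\n", sec_rec_is_for_blob(
                        blob, std::string(100, 'x'), 100));   /* 1 */
        std::printf("%d\n", sec_rec_is_for_blob(
                        blob, "xy", 100));                    /* 0 */
    }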
+@return TRUE if the secondary record is equal to the corresponding +fields in the clustered record, when compared with collation; +FALSE if not equal or if the clustered record has been marked for deletion */ +static +ibool +row_sel_sec_rec_is_for_clust_rec( +/*=============================*/ + const rec_t* sec_rec, /*!< in: secondary index record */ + dict_index_t* sec_index, /*!< in: secondary index */ + const rec_t* clust_rec, /*!< in: clustered index record; + must be protected by a lock or + a page latch against deletion + in rollback or purge */ + dict_index_t* clust_index) /*!< in: clustered index */ +{ + const byte* sec_field; + ulint sec_len; + const byte* clust_field; + ulint n; + ulint i; + mem_heap_t* heap = NULL; + ulint clust_offsets_[REC_OFFS_NORMAL_SIZE]; + ulint sec_offsets_[REC_OFFS_SMALL_SIZE]; + ulint* clust_offs = clust_offsets_; + ulint* sec_offs = sec_offsets_; + ibool is_equal = TRUE; + + rec_offs_init(clust_offsets_); + rec_offs_init(sec_offsets_); + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(clust_index->table))) { + + /* The clustered index record is delete-marked; + it is not visible in the read view. Besides, + if there are any externally stored columns, + some of them may have already been purged. */ + return(FALSE); + } + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + ULINT_UNDEFINED, &heap); + + n = dict_index_get_n_ordering_defined_by_user(sec_index); + + for (i = 0; i < n; i++) { + const dict_field_t* ifield; + const dict_col_t* col; + ulint clust_pos; + ulint clust_len; + ulint len; + + ifield = dict_index_get_nth_field(sec_index, i); + col = dict_field_get_col(ifield); + clust_pos = dict_col_get_clust_pos(col, clust_index); + + clust_field = rec_get_nth_field( + clust_rec, clust_offs, clust_pos, &clust_len); + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); + + len = clust_len; + + if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL + && sec_len != UNIV_SQL_NULL) { + + if (rec_offs_nth_extern(clust_offs, clust_pos)) { + len -= BTR_EXTERN_FIELD_REF_SIZE; + } + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminmaxlen, + ifield->prefix_len, len, (char*) clust_field); + + if (rec_offs_nth_extern(clust_offs, clust_pos) + && len < sec_len) { + if (!row_sel_sec_rec_is_for_blob( + col->mtype, col->prtype, + col->mbminmaxlen, + clust_field, clust_len, + sec_field, sec_len, + ifield->prefix_len, + clust_index->table)) { + goto inequal; + } + + continue; + } + } + + if (0 != cmp_data_data(col->mtype, col->prtype, + clust_field, len, + sec_field, sec_len)) { +inequal: + is_equal = FALSE; + goto func_exit; + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(is_equal); +} + +/*********************************************************************//** +Creates a select node struct. +@return own: select node struct */ +UNIV_INTERN +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + sel_node_t* node; + + node = static_cast<sel_node_t*>( + mem_heap_alloc(heap, sizeof(sel_node_t))); + + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->plans = NULL; + + return(node); +} + +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. 
*/ +UNIV_INTERN +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /*!< in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/*********************************************************************//** +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/*********************************************************************//** +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... */ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /*!< in: first variable in a list of + variables */ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + for (exp = node->select_list; + var != 0; + var = static_cast<sym_node_t*>(que_node_get_next(var))) { + + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + } +} + +/*********************************************************************//** +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /*!< in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + for (func_node = static_cast<func_node_t*>(node->select_list); + func_node != 0; + func_node = static_cast<func_node_t*>( + que_node_get_next(func_node))) { + + eval_node_set_int_val(func_node, 0); + } + + node->aggregate_already_fetched = FALSE; +} + +/*********************************************************************//** +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /*!< in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/*********************************************************************//** +Fetches the column values from a record. 
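+For each column in the list, the value is either copied into the query
+node (when column->copy_val is set, or when the field is stored
+externally) or made to point directly into the buffer page, in which
+case the page latch must be held for as long as the value is used
+(editor's note summarizing the code below).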
*/ +static +void +row_sel_fetch_columns( +/*==================*/ + dict_index_t* index, /*!< in: record index */ + const rec_t* rec, /*!< in: record in a clustered or non-clustered + index; must be protected by a page latch */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + sym_node_t* column) /*!< in: first column in a column list, or + NULL */ +{ + dfield_t* val; + ulint index_type; + ulint field_no; + const byte* data; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + index_type = SYM_CLUST_FIELD_NO; + } else { + index_type = SYM_SEC_FIELD_NO; + } + + while (column) { + mem_heap_t* heap = NULL; + ibool needs_copy; + + field_no = column->field_nos[index_type]; + + if (field_no != ULINT_UNDEFINED) { + + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, + field_no))) { + + /* Copy an externally stored field to the + temporary heap, if possible. */ + + heap = mem_heap_create(1); + + data = btr_rec_copy_externally_stored_field( + rec, offsets, + dict_table_zip_size(index->table), + field_no, &len, heap); + + /* data == NULL means that the + externally stored field was not + written yet. This record + should only be seen by + recv_recovery_rollback_active() or any + TRX_ISO_READ_UNCOMMITTED + transactions. The InnoDB SQL parser + (the sole caller of this function) + does not implement READ UNCOMMITTED, + and it is not involved during rollback. */ + ut_a(data); + ut_a(len != UNIV_SQL_NULL); + + needs_copy = TRUE; + } else { + data = rec_get_nth_field(rec, offsets, + field_no, &len); + + needs_copy = column->copy_val; + } + + if (needs_copy) { + eval_node_copy_and_alloc_val(column, data, + len); + } else { + val = que_node_get_val(column); + dfield_set_data(val, data, len); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Allocates a prefetch buffer for a column when prefetch is first time done. */ +static +void +sel_col_prefetch_buf_alloc( +/*=======================*/ + sym_node_t* column) /*!< in: symbol table node for a column */ +{ + sel_buf_t* sel_buf; + ulint i; + + ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL); + + column->prefetch_buf = static_cast<sel_buf_t*>( + mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t))); + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = column->prefetch_buf + i; + + sel_buf->data = NULL; + sel_buf->len = 0; + sel_buf->val_buf_size = 0; + } +} + +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +UNIV_INTERN +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */ +{ + sel_buf_t* sel_buf; + ulint i; + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = prefetch_buf + i; + + if (sel_buf->val_buf_size > 0) { + + mem_free(sel_buf->data); + } + } + + mem_free(prefetch_buf); +} + +/*********************************************************************//** +Pops the column values for a prefetched, cached row from the column prefetch +buffers and places them to the val fields in the column nodes. 
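+For illustration (editor's note): with n_rows_prefetched == 3 and
+first_prefetched == 0, three successive calls hand out buffer slots
+0, 1 and 2 in order, leaving n_rows_prefetched == 0 and
+first_prefetched == 3; only after the stack has been emptied in this
+way may sel_enqueue_prefetched_row() start refilling it from slot 0.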
 */
+static
+void
+sel_dequeue_prefetched_row(
+/*=======================*/
+	plan_t*	plan)	/*!< in: plan node for a table */
+{
+	sym_node_t*	column;
+	sel_buf_t*	sel_buf;
+	dfield_t*	val;
+	byte*		data;
+	ulint		len;
+	ulint		val_buf_size;
+
+	ut_ad(plan->n_rows_prefetched > 0);
+
+	column = UT_LIST_GET_FIRST(plan->columns);
+
+	while (column) {
+		val = que_node_get_val(column);
+
+		if (!column->copy_val) {
+			/* We did not really push any value for the
+			column */
+
+			ut_ad(!column->prefetch_buf);
+			ut_ad(que_node_get_val_buf_size(column) == 0);
+			ut_d(dfield_set_null(val));
+
+			goto next_col;
+		}
+
+		ut_ad(column->prefetch_buf);
+		ut_ad(!dfield_is_ext(val));
+
+		sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+		data = sel_buf->data;
+		len = sel_buf->len;
+		val_buf_size = sel_buf->val_buf_size;
+
+		/* We must keep track of the allocated memory for
+		column values to be able to free it later: therefore
+		we swap the values for sel_buf and val */
+
+		sel_buf->data = static_cast<byte*>(dfield_get_data(val));
+		sel_buf->len = dfield_get_len(val);
+		sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+		dfield_set_data(val, data, len);
+		que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+		column = UT_LIST_GET_NEXT(col_var_list, column);
+	}
+
+	plan->n_rows_prefetched--;
+
+	plan->first_prefetched++;
+}
+
+/*********************************************************************//**
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_enqueue_prefetched_row(
+/*=======================*/
+	plan_t*	plan)	/*!< in: plan node for a table */
+{
+	sym_node_t*	column;
+	sel_buf_t*	sel_buf;
+	dfield_t*	val;
+	byte*		data;
+	ulint		len;
+	ulint		pos;
+	ulint		val_buf_size;
+
+	if (plan->n_rows_prefetched == 0) {
+		pos = 0;
+		plan->first_prefetched = 0;
+	} else {
+		pos = plan->n_rows_prefetched;
+
+		/* We have the convention that pushing new rows starts only
+		after the prefetch stack has been emptied: */
+
+		ut_ad(plan->first_prefetched == 0);
+	}
+
+	plan->n_rows_prefetched++;
+
+	ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+	for (column = UT_LIST_GET_FIRST(plan->columns);
+	     column != 0;
+	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+		if (!column->copy_val) {
+			/* It makes no sense to push pointers to database
+			page fields when we do not keep a latch on the
+			page: such pointers would become stale as soon as
+			the page latch is released. */
+
+			continue;
+		}
+
+		if (!column->prefetch_buf) {
+			/* Allocate a new prefetch buffer */
+
+			sel_col_prefetch_buf_alloc(column);
+		}
+
+		sel_buf = column->prefetch_buf + pos;
+
+		val = que_node_get_val(column);
+
+		data = static_cast<byte*>(dfield_get_data(val));
+		len = dfield_get_len(val);
+		val_buf_size = que_node_get_val_buf_size(column);
+
+		/* We must keep track of the allocated memory for
+		column values to be able to free it later: therefore
+		we swap the values for sel_buf and val */
+
+		dfield_set_data(val, sel_buf->data, sel_buf->len);
+		que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+		sel_buf->data = data;
+		sel_buf->len = len;
+		sel_buf->val_buf_size = val_buf_size;
+	}
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_sel_build_prev_vers(
+/*====================*/
+	read_view_t*	read_view,	/*!< in: read view */
+	dict_index_t*	index,		/*!< in: clustered index of rec */
+	rec_t*		rec,		/*!< in: record in a clustered index */
+	ulint**		offsets,	/*!< in/out: offsets returned by
+					rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
+					the offsets are allocated */
+	mem_heap_t**	old_vers_heap,	/*!< out: old version heap to use */
+	rec_t**		old_vers,	/*!< out: old version, or NULL if the
+					record does not exist in the view:
+					i.e., it was freshly inserted
+					afterwards */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dberr_t	err;
+
+	if (*old_vers_heap) {
+		mem_heap_empty(*old_vers_heap);
+	} else {
+		*old_vers_heap = mem_heap_create(512);
+	}
+
+	err = row_vers_build_for_consistent_read(
+		rec, mtr, index, offsets, read_view, offset_heap,
+		*old_vers_heap, old_vers);
+	return(err);
+}
+
+/*********************************************************************//**
+Builds the last committed version of a clustered index record for a
+semi-consistent read. */
+static __attribute__((nonnull))
+void
+row_sel_build_committed_vers_for_mysql(
+/*===================================*/
+	dict_index_t*	clust_index,	/*!< in: clustered index */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
+	const rec_t*	rec,		/*!< in: record in a clustered index */
+	ulint**		offsets,	/*!< in/out: offsets returned by
+					rec_get_offsets(rec, clust_index) */
+	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
+					the offsets are allocated */
+	const rec_t**	old_vers,	/*!< out: old version, or NULL if the
+					record does not exist in the view:
+					i.e., it was freshly inserted
+					afterwards */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	if (prebuilt->old_vers_heap) {
+		mem_heap_empty(prebuilt->old_vers_heap);
+	} else {
+		prebuilt->old_vers_heap = mem_heap_create(
+			rec_offs_size(*offsets));
+	}
+
+	row_vers_build_for_semi_consistent_read(
+		rec, mtr, clust_index, offsets, offset_heap,
+		prebuilt->old_vers_heap, old_vers);
+}
+
+/*********************************************************************//**
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted.
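+For example (an editor's illustration): in an ascending scan driven by
+WHERE col >= 5 AND col <= 10, the condition col <= 10 is placed in
+plan->end_conds; as soon as a fetched row has col == 11 this function
+returns FALSE and the scan of the index segment stops.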
+@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + for (cond = UT_LIST_GET_FIRST(plan->end_conds); + cond != 0; + cond = UT_LIST_GET_NEXT(cond_list, cond)) { + + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(static_cast<sym_node_t*>(cond->args)); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + } + + return(TRUE); +} + +/*********************************************************************//** +Tests the other conditions. +@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_sel_get_clust_rec( +/*==================*/ + sel_node_t* node, /*!< in: select_node */ + plan_t* plan, /*!< in: plan node for table */ + rec_t* rec, /*!< in: record in a non-clustered index */ + que_thr_t* thr, /*!< in: query thread */ + rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + dberr_t err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + *out_rec = NULL; + + offsets = rec_get_offsets(rec, + btr_pcur_get_btr_cur(&plan->pcur)->index, + offsets, ULINT_UNDEFINED, &heap); + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); + + index = dict_table_get_first_index(plan->table); + + btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE, + BTR_SEARCH_LEAF, &plan->clust_pcur, + 0, mtr); + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(&(plan->clust_pcur)) + < dict_index_get_n_unique(index)) { + + ut_a(rec_get_deleted_flag(rec, + dict_table_is_comp(plan->table))); + ut_a(node->read_view); + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.cc + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. 
In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + goto func_exit; + } + + offsets = rec_get_offsets(clust_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation level + we lock only the record, i.e., next-key locking is + not used. */ + ulint lock_type; + trx_t* trx; + + trx = thr_get_trx(thr); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(&plan->clust_pcur), + clust_rec, index, offsets, + static_cast<enum lock_mode>(node->row_lock_mode), + lock_type, + thr); + + switch (err) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + /* Declare the variable uninitialized in Valgrind. + It should be set to DB_SUCCESS at func_exit. */ + UNIV_MEM_INVALID(&err, sizeof err); + break; + default: + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers( + node->read_view, index, clust_rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + goto func_exit; + } + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. */ + + if ((old_vers + || rec_get_deleted_flag(rec, dict_table_is_comp( + plan->table))) + && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index, + clust_rec, index)) { + goto func_exit; + } + } + + /* Fetch the columns needed in test conditions. The clustered + index record is protected by a page latch that was acquired + when plan->clust_pcur was positioned. The latch will not be + released until mtr_commit(mtr). */ + + ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets))); + row_sel_fetch_columns(index, clust_rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + *out_rec = clust_rec; +func_exit: + err = DB_SUCCESS; +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/*********************************************************************//** +Sets a lock on a record. 
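+A typical call site looks like this (an editor's sketch mirroring the
+use in row_sel() below):
+
+	err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+			       rec, index, offsets,
+			       node->row_lock_mode, lock_type, thr);
+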
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rec_lock(
+/*=============*/
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: record */
+	dict_index_t*		index,	/*!< in: index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint			mode,	/*!< in: lock mode */
+	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	trx_t*	trx;
+	dberr_t	err;
+
+	trx = thr_get_trx(thr);
+
+	if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
+		if (buf_LRU_buf_pool_running_out()) {
+
+			return(DB_LOCK_TABLE_FULL);
+		}
+	}
+
+	if (dict_index_is_clust(index)) {
+		err = lock_clust_rec_read_check_and_lock(
+			0, block, rec, index, offsets,
+			static_cast<enum lock_mode>(mode), type, thr);
+	} else {
+		err = lock_sec_rec_read_check_and_lock(
+			0, block, rec, index, offsets,
+			static_cast<enum lock_mode>(mode), type, thr);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+	plan_t*		plan,		/*!< in: table plan */
+	ibool		search_latch_locked,
+					/*!< in: TRUE if the thread currently
+					has the search latch locked in
+					s-mode */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dict_index_t*	index;
+	func_node_t*	cond;
+	que_node_t*	exp;
+	ulint		n_fields;
+	ulint		has_search_latch = 0;	/* RW_S_LATCH or 0 */
+	ulint		i;
+
+	if (search_latch_locked) {
+		has_search_latch = RW_S_LATCH;
+	}
+
+	index = plan->index;
+
+	/* Calculate the value of the search tuple: the exact match columns
+	get their expressions evaluated when we evaluate the right sides of
+	end_conds */
+
+	cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+	while (cond) {
+		eval_exp(que_node_get_next(cond->args));
+
+		cond = UT_LIST_GET_NEXT(cond_list, cond);
+	}
+
+	if (plan->tuple) {
+		n_fields = dtuple_get_n_fields(plan->tuple);
+
+		if (plan->n_exact_match < n_fields) {
+			/* There is a non-exact match field which must be
+			evaluated separately */
+
+			eval_exp(plan->tuple_exps[n_fields - 1]);
+		}
+
+		for (i = 0; i < n_fields; i++) {
+			exp = plan->tuple_exps[i];
+
+			dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+					 que_node_get_val(exp));
+		}
+
+		/* Open pcur to the index */
+
+		btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+					   BTR_SEARCH_LEAF, &plan->pcur,
+					   has_search_latch, mtr);
+	} else {
+		/* Open the cursor to the start or the end of the index
+		(FALSE: no init) */
+
+		btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
+					    &(plan->pcur), false, 0, mtr);
+	}
+
+	ut_ad(plan->n_rows_prefetched == 0);
+	ut_ad(plan->n_rows_fetched == 0);
+	ut_ad(plan->cursor_at_end == FALSE);
+
+	plan->pcur_is_open = TRUE;
+}
+
+/*********************************************************************//**
+Restores a stored pcur position to a table index.
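+The decision can be summarized as follows (editor's summary of the case
+analysis in the function body; '-' means the flag is irrelevant):
+
+	rel_position	equal	ascending	descending
+	BTR_PCUR_BEFORE	-	(not allowed)	FALSE
+	BTR_PCUR_AFTER	-	FALSE		TRUE
+	BTR_PCUR_ON	FALSE	TRUE		FALSE
+	BTR_PCUR_ON	TRUE	stored_cursor_rec_processed
+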
+@return TRUE if the cursor should be moved to the next record after we +return from this function (moved to the previous, in the case of a +descending cursor) without processing again the current cursor +record */ +static +ibool +row_sel_restore_pcur_pos( +/*=====================*/ + plan_t* plan, /*!< in: table plan */ + mtr_t* mtr) /*!< in: mtr */ +{ + ibool equal_position; + ulint relative_position; + + ut_ad(!plan->cursor_at_end); + + relative_position = btr_pcur_get_rel_pos(&(plan->pcur)); + + equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF, + &(plan->pcur), mtr); + + /* If the cursor is traveling upwards, and relative_position is + + (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock + yet on the successor of the page infimum; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + not yet processed the cursor record: no need to move the cursor to the + next record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we must move to the next record; + (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the next + record, else there is no need to move the cursor. */ + + if (plan->asc) { + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(FALSE); + } + + /* If the cursor is traveling downwards, and relative_position is + + (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on + the last record LESS than the successor of a page infimum; we have not + processed the cursor record: no need to move the cursor; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + processed the cursor record: we should move the cursor to the previous + record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we need not move to the previous + record; (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the previous + record, else there is no need to move the cursor. */ + + if (relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + + return(FALSE); + } + + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(FALSE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(TRUE); +} + +/*********************************************************************//** +Resets a plan cursor to a closed state. 
*/ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /*!< in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). +@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + sel_node_t* node, /*!< in: select node for a consistent read */ + plan_t* plan, /*!< in: plan for a unique search in clustered + index */ + ibool search_latch_locked, + /*!< in: whether the search holds + btr_search_latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ulint ret; + rec_offs_init(offsets_); + + index = plan->index; + + ut_ad(node->read_view); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); +#ifdef UNIV_SYNC_DEBUG + if (search_latch_locked) { + ut_ad(rw_lock_own(btr_search_get_latch(index), + RW_LOCK_SHARED)); + } +#endif /* UNIV_SYNC_DEBUG */ + + row_sel_open_pcur(plan, search_latch_locked, mtr); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (dict_index_is_clust(index)) { + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + ret = SEL_RETRY; + goto func_exit; + } + } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) { + + ret = SEL_RETRY; + goto func_exit; + } + + /* Test the deleted flag. */ + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + /* Fetch the columns needed in test conditions. The index + record is protected by a page latch that was acquired when + plan->pcur was positioned. The latch will not be released + until mtr_commit(mtr). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + plan->n_rows_fetched++; + ret = SEL_FOUND; +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(ret); +} + +/*********************************************************************//** +Performs a select step. 
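+The function is structured as two nested loops (editor's summary): an
+outer table_loop that advances node->fetch_table across the tables of a
+join, and an inner rec_loop that walks one index with plan->pcur; the
+PHASE 1..7 comments in the body mark the steps performed for each
+candidate record.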
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_sel( +/*====*/ + sel_node_t* node, /*!< in: select node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + ibool search_latch_locked; + ibool consistent_read; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + ulint found_flag; + dberr_t err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(thr->run_node == node); + + search_latch_locked = FALSE; + + if (node->read_view) { + /* In consistent reads, we try to do with the hash index and + not to use the buffer page get. This is to reduce memory bus + load resulting from semaphore operations. The search latch + will be s-locked when we access an index with a unique search + condition, but not locked when we access an index with a + less selective search condition. */ + + consistent_read = TRUE; + } else { + consistent_read = FALSE; + } + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_dequeue_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr_start(&mtr); + + if (consistent_read && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust + && !plan->table->big_rows) { + if (!search_latch_locked) { + rw_lock_s_lock(btr_search_get_latch(index)); + + search_latch_locked = TRUE; + } else if (rw_lock_get_writer(btr_search_get_latch(index)) + == RW_LOCK_WAIT_EX) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often + kept for some 10 searches before being released, + a waiting x-latch request would block other threads + from acquiring an s-latch for a long time, lowering + performance significantly in multiprocessors. 
*/ + + rw_lock_s_unlock(btr_search_get_latch(index)); + rw_lock_s_lock(btr_search_get_latch(index)); + } + + found_flag = row_sel_try_search_shortcut(node, plan, + search_latch_locked, + &mtr); + + if (found_flag == SEL_FOUND) { + + goto next_table; + + } else if (found_flag == SEL_EXHAUSTED) { + + goto table_exhausted; + } + + ut_ad(found_flag == SEL_RETRY); + + plan_reset_cursor(plan); + + mtr_commit(&mtr); + mtr_start(&mtr); + } + + if (search_latch_locked) { + rw_lock_s_unlock(btr_search_get_latch(index)); + + search_latch_locked = FALSE; + } + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + + row_sel_open_pcur(plan, search_latch_locked, &mtr); + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && !page_rec_is_supremum(rec)) { + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. */ + + if (!consistent_read) { + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation + level, we lock only the record, i.e., next-key + locking is not used. */ + + rec_t* next_rec = page_rec_get_next(rec); + ulint lock_type; + trx_t* trx; + + trx = thr_get_trx(thr); + + offsets = rec_get_offsets(next_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + <= TRX_ISO_READ_COMMITTED) { + + if (page_rec_is_supremum(next_rec)) { + + goto skip_lock; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur), + next_rec, index, offsets, + node->row_lock_mode, + lock_type, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + case DB_SUCCESS: + break; + default: + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + goto lock_wait_or_error; + } + } + } + +skip_lock: + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. 
*/ + + cost_counter++; + + goto next_rec; + } + + if (!consistent_read) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation level, + we lock only the record, i.e., next-key locking is + not used. */ + + ulint lock_type; + trx_t* trx; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + trx = thr_get_trx(thr); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + + if (page_rec_is_supremum(rec)) { + + goto next_rec; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur), + rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + if (page_rec_is_supremum(rec)) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (consistent_read) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (dict_index_is_clust(index)) { + + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers( + node->read_view, index, rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The record does not exist + in our read view. Skip it, but + first attempt to determine + whether the index segment we + are searching through has been + exhausted. */ + + offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* Fetch the columns needed in + test conditions. The clustered + index record is protected by a + page latch that was acquired + by row_sel_open_pcur() or + row_sel_restore_pcur_pos(). + The latch will not be released + until mtr_commit(mtr). 
*/ + + row_sel_fetch_columns( + index, rec, offsets, + UT_LIST_GET_FIRST( + plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, + node->read_view)) { + cons_read_requires_clust_rec = TRUE; + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions. The record is + protected by a page latch that was acquired by + row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch + will not be released until mtr_commit(mtr). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table)) + && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(consistent_read); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(plan->table))) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch + || plan->table->big_rows) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_enqueue_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_dequeue_prefetched_row(plan); + + goto next_table; + } + +next_rec: + ut_ad(!search_latch_locked); + + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. 
*/ + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&plan->pcur)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr_commit(&mtr); + + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + err = DB_SUCCESS; + goto func_exit; + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + +table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr_commit(&mtr); + + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_dequeue_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + err = DB_SUCCESS; + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + } else { + node->state = SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + } + + goto func_exit; + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. */ + + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_except_dict()); +#endif /* UNIV_SYNC_DEBUG */ + err = DB_SUCCESS; + goto func_exit; + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. 
 */
+
+	plan->stored_cursor_rec_processed = TRUE;
+
+	ut_ad(!search_latch_locked);
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+	mtr_has_extra_clust_latch = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+	goto table_loop;
+
+lock_wait_or_error:
+	/* See the note at stop_for_a_while: the same holds for this case */
+
+	ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+	ut_ad(!search_latch_locked);
+
+	plan->stored_cursor_rec_processed = FALSE;
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+func_exit:
+	if (search_latch_locked) {
+		rw_lock_s_unlock(btr_search_get_latch(index));
+	}
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	sel_node_t*	node;
+
+	ut_ad(thr);
+
+	node = static_cast<sel_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is the first time this node is executed (or execution
+	resumes after a wait for a table intention lock), set intention locks
+	on the tables or assign a read view */
+
+	if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+		node->state = SEL_NODE_OPEN;
+	}
+
+	if (node->state == SEL_NODE_OPEN) {
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		trx_start_if_not_started_xa(thr_get_trx(thr));
+
+		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+		if (node->consistent_read) {
+			/* Assign a read view for the query */
+			node->read_view = trx_assign_read_view(
+				thr_get_trx(thr));
+		} else {
+			sym_node_t*	table_node;
+			enum lock_mode	i_lock_mode;
+
+			if (node->set_x_locks) {
+				i_lock_mode = LOCK_IX;
+			} else {
+				i_lock_mode = LOCK_IS;
+			}
+
+			for (table_node = node->table_list;
+			     table_node != 0;
+			     table_node = static_cast<sym_node_t*>(
+					que_node_get_next(table_node))) {
+
+				dberr_t	err = lock_table(
+					0, table_node->table, i_lock_mode,
+					thr);
+
+				if (err != DB_SUCCESS) {
+					trx_t*	trx;
+
+					trx = thr_get_trx(thr);
+					trx->error_state = err;
+
+					return(NULL);
+				}
+			}
+		}
+
+		/* If this is an explicit cursor, copy stored procedure
+		variable values, so that the values cannot change between
+		fetches (currently, we copy them also for non-explicit
+		cursors) */
+
+		if (node->explicit_cursor
+		    && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+			row_sel_copy_input_variable_vals(node);
+		}
+
+		node->state = SEL_NODE_FETCH;
+		node->fetch_table = 0;
+
+		if (node->is_aggregate) {
+			/* Reset the aggregate total values */
+			sel_reset_aggregate_vals(node);
+		}
+	}
+
+	dberr_t	err = row_sel(node, thr);
+
+	/* NOTE! if queries are parallelized, the following assignment may
+	have problems; the assignment should be made only if thr is the
+	only top-level thr in the graph: */
+
+	thr->graph->last_sel_node = node;
+
+	if (err != DB_SUCCESS) {
+		thr_get_trx(thr)->error_state = err;
+
+		return(NULL);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor.
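+In the InnoDB SQL procedure language this node implements a statement of
+the form (editor's illustration; the cursor and variable names are
+hypothetical):
+
+	FETCH cur INTO a, b;
+
+The node is visited twice: coming from its parent it sends execution
+into the cursor definition to select one row, and coming back from the
+select node it assigns the values (or calls node->func) and returns to
+the parent.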
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* sel_node; + fetch_node_t* node; + + ut_ad(thr); + + node = static_cast<fetch_node_t*>(thr->run_node); + sel_node = node->cursor_def; + + ut_ad(que_node_get_type(node) == QUE_NODE_FETCH); + + if (thr->prev_node != que_node_get_parent(node)) { + + if (sel_node->state != SEL_NODE_NO_MORE_ROWS) { + + if (node->into_list) { + sel_assign_into_var_values(node->into_list, + sel_node); + } else { + ibool ret = (*node->func->func)( + sel_node, node->func->arg); + + if (!ret) { + sel_node->state + = SEL_NODE_NO_MORE_ROWS; + } + } + } + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + /* Make the fetch node the parent of the cursor definition for + the time of the fetch, so that execution knows to return to this + fetch node after a row has been selected or we know that there is + no row left */ + + sel_node->common.parent = node; + + if (sel_node->state == SEL_NODE_CLOSED) { + fprintf(stderr, + "InnoDB: Error: fetch called on a closed cursor\n"); + + thr_get_trx(thr)->error_state = DB_ERROR; + + return(NULL); + } + + thr->run_node = sel_node; + + return(thr); +} + +/****************************************************************//** +Sample callback function for fetch that prints each row. +@return always returns non-NULL */ +UNIV_INTERN +void* +row_fetch_print( +/*============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: not used */ +{ + que_node_t* exp; + ulint i = 0; + sel_node_t* node = static_cast<sel_node_t*>(row); + + UT_NOT_USED(user_arg); + + fprintf(stderr, "row_fetch_print: row %p\n", row); + + for (exp = node->select_list; + exp != 0; + exp = que_node_get_next(exp), i++) { + + dfield_t* dfield = que_node_get_val(exp); + const dtype_t* type = dfield_get_type(dfield); + + fprintf(stderr, " column %lu:\n", (ulong) i); + + dtype_print(type); + putc('\n', stderr); + + if (dfield_get_len(dfield) != UNIV_SQL_NULL) { + ut_print_buf(stderr, dfield_get_data(dfield), + dfield_get_len(dfield)); + putc('\n', stderr); + } else { + fputs(" <NULL>;\n", stderr); + } + } + + return((void*)42); +} + +/***********************************************************//** +Prints a row in a select result. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = static_cast<row_printf_node_t*>(thr->run_node); + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + fputs(" ::: ", stderr); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. 
The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. A counterpart of this function is +ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ +UNIV_INTERN +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len, /*!< in: MySQL key value length */ + trx_t* trx) /*!< in: transaction */ +{ + byte* original_buf = buf; + const byte* original_key_ptr = key_ptr; + dict_field_t* field; + dfield_t* dfield; + ulint data_offset; + ulint data_len; + ulint data_field_len; + ibool is_null; + const byte* key_end; + ulint n_fields = 0; + + /* For documentation of the key value storage format in MySQL, see + ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + field = dict_index_get_nth_field(index, 0); + + if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) { + /* A special case: we are looking for a position in the + generated clustered index which InnoDB automatically added + to a table with no primary key: the first and the only + ordering column is ROW_ID which InnoDB stored to the key_ptr + buffer. */ + + ut_a(key_len == DATA_ROW_ID_LEN); + + dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN); + + dtuple_set_n_fields(tuple, 1); + + return; + } + + while (key_ptr < key_end) { + + ulint type = dfield_get_type(dfield)->mtype; + ut_a(field->col->mtype == type); + + data_offset = 0; + is_null = FALSE; + + if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { + /* The first byte in the field tells if this is + an SQL NULL value */ + + data_offset = 1; + + if (*key_ptr != 0) { + dfield_set_null(dfield); + + is_null = TRUE; + } + } + + /* Calculate data length and data field total length */ + + if (type == DATA_BLOB) { + /* The key field is a column prefix of a BLOB or + TEXT */ + + ut_a(field->prefix_len > 0); + + /* MySQL stores the actual data length to the first 2 + bytes after the optional SQL NULL marker byte. The + storage format is little-endian, that is, the most + significant byte at a higher address. In UTF-8, MySQL + seems to reserve field->prefix_len bytes for + storing this field in the key value buffer, even + though the actual value only takes data_len bytes + from the start. */ + + data_len = key_ptr[data_offset] + + 256 * key_ptr[data_offset + 1]; + data_field_len = data_offset + 2 + field->prefix_len; + + data_offset += 2; + + /* Now that we know the length, we store the column + value like it would be a fixed char field */ + + } else if (field->prefix_len > 0) { + /* Looks like MySQL pads unused end bytes in the + prefix with space. 
Therefore, also in UTF-8, it is ok + to compare with a prefix containing full prefix_len + bytes, and no need to take at most prefix_len / 3 + UTF-8 characters from the start. + If the prefix is used as the upper end of a LIKE + 'abc%' query, then MySQL pads the end with chars + 0xff. TODO: in that case does it any harm to compare + with the full prefix_len bytes. How do characters + 0xff in UTF-8 behave? */ + + data_len = field->prefix_len; + data_field_len = data_offset + data_len; + } else { + data_len = dfield_get_type(dfield)->len; + data_field_len = data_offset + data_len; + } + + if (UNIV_UNLIKELY + (dtype_get_mysql_type(dfield_get_type(dfield)) + == DATA_MYSQL_TRUE_VARCHAR) + && UNIV_LIKELY(type != DATA_INT)) { + /* In a MySQL key value format, a true VARCHAR is + always preceded by 2 bytes of a length field. + dfield_get_type(dfield)->len returns the maximum + 'payload' len in bytes. That does not include the + 2 bytes that tell the actual data length. + + We added the check != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! */ + + data_len += 2; + data_field_len += 2; + } + + /* Storing may use at most data_len bytes of buf */ + + if (UNIV_LIKELY(!is_null)) { + buf = row_mysql_store_col_in_innobase_format( + dfield, buf, + FALSE, /* MySQL key value format col */ + key_ptr + data_offset, data_len, + dict_table_is_comp(index->table)); + ut_a(buf <= original_buf + buf_len); + } + + key_ptr += data_field_len; + + if (UNIV_UNLIKELY(key_ptr > key_end)) { + /* The last field in key was not a complete key field + but a prefix of it. + + Print a warning about this! HA_READ_PREFIX_LAST does + not currently work in InnoDB with partial-field key + value prefixes. Since MySQL currently uses a padding + trick to calculate LIKE 'abc%' type queries there + should never be partial-field prefixes in searches. */ + + ut_print_timestamp(stderr); + + fputs(" InnoDB: Warning: using a partial-field" + " key prefix in search.\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, ". Last data field length %lu bytes,\n" + "InnoDB: key ptr now exceeds" + " key end by %lu bytes.\n" + "InnoDB: Key value in the MySQL format:\n", + (ulong) data_field_len, + (ulong) (key_ptr - key_end)); + fflush(stderr); + ut_print_buf(stderr, original_key_ptr, key_len); + putc('\n', stderr); + + if (!is_null) { + ulint len = dfield_get_len(dfield); + dfield_set_len(dfield, len + - (ulint) (key_ptr - key_end)); + } + ut_ad(0); + } + + n_fields++; + field++; + dfield++; + } + + ut_a(buf <= original_buf + buf_len); + + /* We set the length of tuple to n_fields: we assume that the memory + area allocated for it is big enough (usually bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/**************************************************************//** +Stores the row id to the prebuilt struct. 
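+The row id is the 6-byte (DATA_ROW_ID_LEN) system column that InnoDB
+generates for tables without a user-defined primary key; it is copied
+into prebuilt->row_id so that a later positioned operation can find the
+same clustered index record again (editor's note).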
*/ +static +void +row_sel_store_row_id_to_prebuilt( +/*=============================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */ + const rec_t* index_rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index of the record */ + const ulint* offsets) /*!< in: rec_get_offsets + (index_rec, index) */ +{ + const byte* data; + ulint len; + + ut_ad(rec_offs_validate(index_rec, index, offsets)); + + data = rec_get_nth_field( + index_rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); + + if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) { + fprintf(stderr, + "InnoDB: Error: Row id field is" + " wrong length %lu in ", (ulong) len); + dict_index_name_print(stderr, prebuilt->trx, index); + fprintf(stderr, "\n" + "InnoDB: Field number %lu, record:\n", + (ulong) dict_index_get_sys_col_pos(index, + DATA_ROW_ID)); + rec_print_new(stderr, index_rec, offsets); + putc('\n', stderr); + ut_error; + } + + ut_memcpy(prebuilt->row_id, data, len); +} + +#ifdef UNIV_DEBUG +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len) +#else /* UNIV_DEBUG */ +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,src,len) +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */ +static __attribute__((nonnull)) +void +row_sel_field_store_in_mysql_format_func( +/*=====================================*/ + byte* dest, /*!< in/out: buffer where to store; NOTE + that BLOBs are not in themselves + stored here: the caller must allocate + and copy the BLOB into buffer before, + and pass the pointer to the BLOB in + 'data' */ + const mysql_row_templ_t* templ, + /*!< in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, + mbminlen, mbmaxlen */ +#ifdef UNIV_DEBUG + const dict_index_t* index, + /*!< in: InnoDB index */ + ulint field_no, + /*!< in: templ->rec_field_no or + templ->clust_rec_field_no or + templ->icp_rec_field_no */ +#endif /* UNIV_DEBUG */ + const byte* data, /*!< in: data to store */ + ulint len) /*!< in: length of the data */ +{ + byte* ptr; +#ifdef UNIV_DEBUG + const dict_field_t* field + = dict_index_get_nth_field(index, field_no); +#endif /* UNIV_DEBUG */ + + ut_ad(len != UNIV_SQL_NULL); + UNIV_MEM_ASSERT_RW(data, len); + UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len); + UNIV_MEM_INVALID(dest, templ->mysql_col_len); + + switch (templ->type) { + const byte* field_end; + byte* pad; + case DATA_INT: + /* Convert integer data from Innobase to a little-endian + format, sign bit restored to normal */ + + ptr = dest + len; + + for (;;) { + ptr--; + *ptr = *data; + if (ptr == dest) { + break; + } + data++; + } + + if (!templ->is_unsigned) { + dest[len - 1] = (byte) (dest[len - 1] ^ 128); + } + + ut_ad(templ->mysql_col_len == len); + break; + + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field_end = dest + templ->mysql_col_len; + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. 
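+
+			A minimal editorial sketch of the little-endian
+			length encoding assumed here (hypothetical inline
+			code, not part of the original file):
+
+				if (templ->mysql_length_bytes == 1) {
+					dest[0] = (byte) len;
+				} else {
+					dest[0] = (byte) (len & 0xFF);
+					dest[1] = (byte) (len >> 8);
+				}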
*/
+
+			dest = row_mysql_store_true_var_len(
+				dest, len, templ->mysql_length_bytes);
+			/* Copy the actual data. Leave the rest of the
+			buffer uninitialized. */
+			memcpy(dest, data, len);
+			break;
+		}
+
+		/* Copy the actual data */
+		ut_memcpy(dest, data, len);
+
+		/* Pad with trailing spaces. */
+
+		pad = dest + len;
+
+		ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+		/* We treat some Unicode charset strings specially. */
+		switch (templ->mbminlen) {
+		case 4:
+			/* InnoDB should never have stripped partial
+			UTF-32 characters. */
+			ut_a(!(len & 3));
+			break;
+		case 2:
+			/* A space char is two bytes,
+			0x0020 in UCS2 and UTF-16 */
+
+			if (UNIV_UNLIKELY(len & 1)) {
+				/* A 0x20 has been stripped from the column.
+				Pad it back. */
+
+				if (pad < field_end) {
+					*pad++ = 0x20;
+				}
+			}
+		}
+
+		row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
+		break;
+
+	case DATA_BLOB:
+		/* Store a pointer to the BLOB buffer to dest: the BLOB was
+		already copied to the buffer in row_sel_store_mysql_rec */
+
+		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+					 len);
+		break;
+
+	case DATA_MYSQL:
+		memcpy(dest, data, len);
+
+		ut_ad(templ->mysql_col_len >= len);
+		ut_ad(templ->mbmaxlen >= templ->mbminlen);
+
+		/* If field_no equals templ->icp_rec_field_no,
+		we are examining a row pointed to by "icp_rec_field_no".
+		There is a possibility that icp_rec_field_no refers to
+		a field in a secondary index while templ->rec_field_no
+		points to a field in the primary index. The lengths
+		should still be equal, unless the field pointed to
+		by icp_rec_field_no has a prefix */
+		ut_ad(templ->mbmaxlen > templ->mbminlen
+		      || templ->mysql_col_len == len
+		      || (field_no == templ->icp_rec_field_no
+			  && field->prefix_len > 0));
+
+		/* The following assertion would fail for old tables
+		containing UTF-8 ENUM columns due to Bug #9526. */
+		ut_ad(!templ->mbmaxlen
+		      || !(templ->mysql_col_len % templ->mbmaxlen));
+		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
+		      || (field_no == templ->icp_rec_field_no
+			  && field->prefix_len > 0));
+		ut_ad(!(field->prefix_len % templ->mbmaxlen));
+
+		if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
+			/* Pad with spaces. This undoes the stripping
+			done in row0mysql.cc, function
+			row_mysql_store_col_in_innobase_format(). */
+
+			memset(dest + len, 0x20, templ->mysql_col_len - len);
+		}
+		break;
+
+	default:
+#ifdef UNIV_DEBUG
+	case DATA_SYS_CHILD:
+	case DATA_SYS:
+		/* These column types should never be shipped to MySQL. */
+		ut_ad(0);
+
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+	case DATA_DECIMAL:
+		/* Above are the valid column types for MySQL data. */
+#endif /* UNIV_DEBUG */
+		ut_ad(field->prefix_len
+		      ? field->prefix_len == len
+		      : templ->mysql_col_len == len);
+		memcpy(dest, data, len);
+	}
+}
+
+#ifdef UNIV_DEBUG
+/** Convert a field from Innobase format to MySQL format. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+	row_sel_store_mysql_field_func(m,p,r,i,o,f,t)
+#else /* UNIV_DEBUG */
+/** Convert a field from Innobase format to MySQL format. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+	row_sel_store_mysql_field_func(m,p,r,o,f,t)
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Convert a field in the Innobase format to a field in the MySQL format.
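+(Editorial note, derived from the function body: returns TRUE on
+success, and FALSE only when an externally stored column could not be
+read because it had not been fully written yet, which can happen for
+records seen by READ UNCOMMITTED transactions or during recovery
+rollback.)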
*/ +static __attribute__((warn_unused_result)) +ibool +row_sel_store_mysql_field_func( +/*===========================*/ + byte* mysql_rec, /*!< out: record in the + MySQL format */ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */ + const rec_t* rec, /*!< in: InnoDB record; + must be protected by + a page latch */ +#ifdef UNIV_DEBUG + const dict_index_t* index, /*!< in: index of rec */ +#endif + const ulint* offsets, /*!< in: array returned by + rec_get_offsets() */ + ulint field_no, /*!< in: templ->rec_field_no or + templ->clust_rec_field_no or + templ->icp_rec_field_no */ + const mysql_row_templ_t*templ) /*!< in: row template */ +{ + const byte* data; + ulint len; + + ut_ad(prebuilt->default_rec); + ut_ad(templ); + ut_ad(templ >= prebuilt->mysql_template); + ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]); + ut_ad(field_no == templ->clust_rec_field_no + || field_no == templ->rec_field_no + || field_no == templ->icp_rec_field_no); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) { + + mem_heap_t* heap; + /* Copy an externally stored field to a temporary heap */ + + ut_a(!prebuilt->trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); +#endif + ut_ad(field_no == templ->clust_rec_field_no); + + if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) { + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + UNIV_PAGE_SIZE); + } + + heap = prebuilt->blob_heap; + } else { + heap = mem_heap_create(UNIV_PAGE_SIZE); + } + + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + + data = btr_rec_copy_externally_stored_field( + rec, offsets, + dict_table_zip_size(prebuilt->table), + field_no, &len, heap); + + if (UNIV_UNLIKELY(!data)) { + /* The externally stored field was not written + yet. This record should only be seen by + recv_recovery_rollback_active() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + + if (heap != prebuilt->blob_heap) { + mem_heap_free(heap); + } + + ut_a(prebuilt->trx->isolation_level + == TRX_ISO_READ_UNCOMMITTED); + return(FALSE); + } + + ut_a(len != UNIV_SQL_NULL); + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, index, field_no, data, len); + + if (heap != prebuilt->blob_heap) { + mem_heap_free(heap); + } + } else { + /* Field is stored in the row. */ + + data = rec_get_nth_field(rec, offsets, field_no, &len); + + if (len == UNIV_SQL_NULL) { + /* MySQL assumes that the field for an SQL + NULL value is set to the default value. */ + ut_ad(templ->mysql_null_bit_mask); + + UNIV_MEM_ASSERT_RW(prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; + memcpy(mysql_rec + templ->mysql_col_offset, + (const byte*) prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); + return(TRUE); + } + + if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) { + + /* It is a BLOB field locally stored in the + InnoDB record: we MUST copy its contents to + prebuilt->blob_heap here because + row_sel_field_store_in_mysql_format() stores a + pointer to the data, and the data passed to us + will be invalid as soon as the + mini-transaction is committed and the page + latch on the clustered index page is + released. 
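+
+			An editorial sketch of the hazard avoided by the
+			copy (hypothetical code, not in the original):
+
+				data = rec_get_nth_field(rec, offsets,
+							 field_no, &len);
+				mtr_commit(&mtr);
+				/* the page latch is now released: the
+				buffer page may be evicted or modified,
+				so 'data' must not be dereferenced */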
*/
+
+			if (prebuilt->blob_heap == NULL) {
+				prebuilt->blob_heap = mem_heap_create(
+					UNIV_PAGE_SIZE);
+			}
+
+			data = static_cast<byte*>(
+				mem_heap_dup(prebuilt->blob_heap, data, len));
+		}
+
+		row_sel_field_store_in_mysql_format(
+			mysql_rec + templ->mysql_col_offset,
+			templ, index, field_no, data, len);
+	}
+
+	ut_ad(len != UNIV_SQL_NULL);
+
+	if (templ->mysql_null_bit_mask) {
+		/* It is a nullable column with a non-NULL
+		value */
+		mysql_rec[templ->mysql_null_byte_offset]
+			&= ~(byte) templ->mysql_null_bit_mask;
+	}
+
+	return(TRUE);
+}
+
+/**************************************************************//**
+Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec; the other columns are left blank, since not all
+columns may be needed in the query.
+@return TRUE on success, FALSE if not all columns could be retrieved */
+static __attribute__((warn_unused_result))
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+	byte*		mysql_rec,	/*!< out: row in the MySQL format */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
+	const rec_t*	rec,		/*!< in: Innobase record in the index
+					which was described in prebuilt's
+					template, or in the clustered index;
+					must be protected by a page latch */
+	ibool		rec_clust,	/*!< in: TRUE if rec is in the
+					clustered index instead of
+					prebuilt->index */
+	const dict_index_t* index,	/*!< in: index of rec */
+	const ulint*	offsets)	/*!< in: array returned by
+					rec_get_offsets(rec) */
+{
+	ulint	i;
+
+	ut_ad(rec_clust || index == prebuilt->index);
+	ut_ad(!rec_clust || dict_index_is_clust(index));
+
+	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+		mem_heap_free(prebuilt->blob_heap);
+		prebuilt->blob_heap = NULL;
+	}
+
+	for (i = 0; i < prebuilt->n_template; i++) {
+		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+		const ulint	field_no
+			= rec_clust
+			? templ->clust_rec_field_no
+			: templ->rec_field_no;
+		/* We should never deliver column prefixes to MySQL,
+		except for evaluating innobase_index_cond(). */
+		ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
+		      == 0);
+
+		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+					       rec, index, offsets,
+					       field_no, templ)) {
+			return(FALSE);
+		}
+	}
+
+	/* FIXME: We only need to read the doc_id if an FTS indexed
+	column is being updated.
+	NOTE: the record must be a clustered index record.
Secondary index + might not have the Doc ID */ + if (dict_table_has_fts_index(prebuilt->table) + && dict_index_is_clust(index)) { + + prebuilt->fts_doc_id = fts_get_doc_id_from_rec( + prebuilt->table, rec, NULL); + } + + return(TRUE); +} + +/*********************************************************************//** +Builds a previous version of a clustered index record for a consistent read +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + read_view_t* read_view, /*!< in: read view */ + dict_index_t* clust_index, /*!< in: clustered index */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ + const rec_t* rec, /*!< in: record in a clustered index */ + ulint** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /*!< in: mtr */ +{ + dberr_t err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_consistent_read( + rec, mtr, clust_index, offsets, read_view, offset_heap, + prebuilt->old_vers_heap, old_vers); + return(err); +} + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_sel_get_clust_rec_for_mysql( +/*============================*/ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */ + dict_index_t* sec_index,/*!< in: secondary index where rec resides */ + const rec_t* rec, /*!< in: record in a non-clustered index; if + this is a locking read, then rec is not + allowed to be delete-marked, and that would + not make sense either */ + que_thr_t* thr, /*!< in: query thread */ + const rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + ulint** offsets,/*!< in: offsets returned by + rec_get_offsets(rec, sec_index); + out: offsets returned by + rec_get_offsets(out_rec, clust_index) */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + const rec_t* clust_rec; + rec_t* old_vers; + dberr_t err; + trx_t* trx; + + *out_rec = NULL; + trx = thr_get_trx(thr); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, + sec_index, *offsets, trx); + + clust_index = dict_table_get_first_index(sec_index->table); + + btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + &prebuilt->clust_pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(&prebuilt->clust_pcur); + + prebuilt->clust_pcur.trx_if_known = trx; + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || 
btr_pcur_get_low_match(&prebuilt->clust_pcur) + < dict_index_get_n_unique(clust_index)) { + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.cc + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + if (!rec_get_deleted_flag(rec, + dict_table_is_comp(sec_index->table)) + || prebuilt->select_lock_type != LOCK_NONE) { + ut_print_timestamp(stderr); + fputs(" InnoDB: error clustered record" + " for sec rec not found\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, sec_index); + fputs("\n" + "InnoDB: sec index record ", stderr); + rec_print(stderr, rec, sec_index); + fputs("\n" + "InnoDB: clust index record ", stderr); + rec_print(stderr, clust_rec, clust_index); + putc('\n', stderr); + trx_print(stderr, trx, 600); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + ut_ad(0); + } + + clust_rec = NULL; + + err = DB_SUCCESS; + goto func_exit; + } + + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + ULINT_UNDEFINED, offset_heap); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP type lock */ + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(&prebuilt->clust_pcur), + clust_rec, clust_index, *offsets, + static_cast<enum lock_mode>(prebuilt->select_lock_type), + LOCK_REC_NOT_GAP, + thr); + + switch (err) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + break; + default: + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + /* If the isolation level allows reading of uncommitted data, + then we never look for an earlier version */ + + if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && !lock_clust_rec_cons_read_sees( + clust_rec, clust_index, *offsets, + trx->read_view)) { + + /* The following call returns 'offsets' associated with + 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, prebuilt, + clust_rec, offsets, offset_heap, &old_vers, + mtr); + + if (err != DB_SUCCESS || old_vers == NULL) { + + goto err_exit; + } + + clust_rec = old_vers; + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. 
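+
+		An editorial illustration with assumed values: transaction
+		T1 opens a read view; T2 then updates a row's indexed
+		column from 5 to 7 and commits. T1's scan of the secondary
+		index may visit the new entry for 7, but rebuilding the
+		clustered record for T1's read view yields the old row with
+		value 5, which no longer matches that entry, so the check
+		below must drop the row.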
*/
+
+		if (clust_rec
+		    && (old_vers
+			|| trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+			|| rec_get_deleted_flag(rec, dict_table_is_comp(
+							sec_index->table)))
+		    && !row_sel_sec_rec_is_for_clust_rec(
+			    rec, sec_index, clust_rec, clust_index)) {
+			clust_rec = NULL;
+#ifdef UNIV_SEARCH_DEBUG
+		} else {
+			ut_a(clust_rec == NULL
+			     || row_sel_sec_rec_is_for_clust_rec(
+				     rec, sec_index, clust_rec, clust_index));
+#endif
+		}
+
+		err = DB_SUCCESS;
+	}
+
+func_exit:
+	*out_rec = clust_rec;
+
+	/* Store the current position if select_lock_type is not
+	LOCK_NONE or if we are scanning using InnoDB APIs */
+	if (prebuilt->select_lock_type != LOCK_NONE
+	    || prebuilt->innodb_api) {
+		/* We may use the cursor in update or in unlock_row():
+		store its position */
+
+		btr_pcur_store_position(&prebuilt->clust_pcur, mtr);
+	}
+
+err_exit:
+	return(err);
+}
+
+/********************************************************************//**
+Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been
+deleted; then we may have to move the cursor one step up or down.
+@return TRUE if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+	ibool*		same_user_rec,	/*!< out: TRUE if we were able to restore
+					the cursor on a user record with the
+					same ordering prefix in the
+					B-tree index */
+	ulint		latch_mode,	/*!< in: latch mode desired in
+					restoration */
+	btr_pcur_t*	pcur,		/*!< in: cursor whose position
+					has been stored */
+	ibool		moves_up,	/*!< in: TRUE if the cursor moves up
+					in the index */
+	mtr_t*		mtr)		/*!< in: mtr; CAUTION: may commit
+					mtr temporarily! */
+{
+	ibool	success;
+
+	success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+	*same_user_rec = success;
+
+	ut_ad(!success || pcur->rel_pos == BTR_PCUR_ON);
+#ifdef UNIV_DEBUG
+	if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
+		ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
+		      || pcur->rel_pos == BTR_PCUR_AFTER);
+	} else {
+		ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+		ut_ad((pcur->rel_pos == BTR_PCUR_ON)
+		      == btr_pcur_is_on_user_rec(pcur));
+	}
+#endif
+
+	/* The position may need to be adjusted for rel_pos and moves_up. */
+
+	switch (pcur->rel_pos) {
+	case BTR_PCUR_ON:
+		if (!success && moves_up) {
+next:
+			btr_pcur_move_to_next(pcur, mtr);
+			return(TRUE);
+		}
+		return(!success);
+	case BTR_PCUR_AFTER_LAST_IN_TREE:
+	case BTR_PCUR_BEFORE_FIRST_IN_TREE:
+		return(TRUE);
+	case BTR_PCUR_AFTER:
+		/* positioned on the record after pcur->old_rec. */
+		pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+prev:
+		if (btr_pcur_is_on_user_rec(pcur) && !moves_up) {
+			btr_pcur_move_to_prev(pcur, mtr);
+		}
+		return(TRUE);
+	case BTR_PCUR_BEFORE:
+		/* For non-optimistic restoration:
+		The position is now set to the record before pcur->old_rec.
+
+		For optimistic restoration:
+		The position also needs to take the previous search_mode into
+		consideration. */
+
+		switch (pcur->pos_state) {
+		case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
+			pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+			if (pcur->search_mode == PAGE_CUR_GE) {
+				/* Positioned during Greater or Equal search
+				with BTR_PCUR_BEFORE. Optimistic restore to
+				the same record. If scanning for lower, then
+				we must move to the previous record.
+ This can happen with: + HANDLER READ idx a = (const); + HANDLER READ idx PREV; */ + goto prev; + } + return(TRUE); + case BTR_PCUR_IS_POSITIONED: + if (moves_up && btr_pcur_is_on_user_rec(pcur)) { + goto next; + } + return(TRUE); + case BTR_PCUR_WAS_POSITIONED: + case BTR_PCUR_NOT_POSITIONED: + break; + } + } + ut_ad(0); + return(TRUE); +} + +/********************************************************************//** +Copies a cached field for MySQL from the fetch cache. */ +static +void +row_sel_copy_cached_field_for_mysql( +/*================================*/ + byte* buf, /*!< in/out: row buffer */ + const byte* cache, /*!< in: cached row */ + const mysql_row_templ_t*templ) /*!< in: column template */ +{ + ulint len; + + buf += templ->mysql_col_offset; + cache += templ->mysql_col_offset; + + UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR + && templ->type != DATA_INT) { + /* Check for != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! + Find the actual length of the true VARCHAR field. */ + row_mysql_read_true_varchar( + &len, cache, templ->mysql_length_bytes); + len += templ->mysql_length_bytes; + UNIV_MEM_INVALID(buf, templ->mysql_col_len); + } else { + len = templ->mysql_col_len; + } + + ut_memcpy(buf, cache, len); +} + +/********************************************************************//** +Pops a cached row for MySQL from the fetch cache. */ +UNIV_INLINE +void +row_sel_dequeue_cached_row_for_mysql( +/*=================================*/ + byte* buf, /*!< in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */ +{ + ulint i; + const mysql_row_templ_t*templ; + const byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len); + + UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len); + + cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first]; + + if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) { + /* Copy cache record field by field, don't touch fields that + are not covered by current key */ + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, templ); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) { + buf[templ->mysql_null_byte_offset] + ^= (buf[templ->mysql_null_byte_offset] + ^ cached_rec[templ->mysql_null_byte_offset]) + & (byte) templ->mysql_null_bit_mask; + } + } + } else if (prebuilt->mysql_prefix_len > 63) { + /* The record is long. Copy it field by field, in case + there are some long VARCHAR column of which only a + small length is being used. */ + UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len); + + /* First copy the NULL bits. */ + ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len); + /* Then copy the requested fields. */ + + for (i = 0; i < prebuilt->n_template; i++) { + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, prebuilt->mysql_template + i); + } + } else { + ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len); + } + + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/********************************************************************//** +Initialise the prefetch cache. 
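+(Editorial note on the layout set up below: each of the
+UT_ARR_SIZE(prebuilt->fetch_cache) buffers is carved out of a single
+allocation as
+
+	| 4-byte magic | mysql_row_len bytes of row | 4-byte magic |
+
+so that an overrun of a cached row can be caught via
+ROW_PREBUILT_FETCH_MAGIC_N.)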
*/ +UNIV_INLINE +void +row_sel_prefetch_cache_init( +/*========================*/ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + ulint i; + ulint sz; + byte* ptr; + + /* Reserve space for the magic number. */ + sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8); + ptr = static_cast<byte*>(mem_alloc(sz)); + + for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) { + + /* A user has reported memory corruption in these + buffers in Linux. Put magic numbers there to help + to track a possible bug. */ + + mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + + prebuilt->fetch_cache[i] = ptr; + ptr += prebuilt->mysql_row_len; + + mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + } +} + +/********************************************************************//** +Get the last fetch cache buffer from the queue. +@return pointer to buffer. */ +UNIV_INLINE +byte* +row_sel_fetch_last_buf( +/*===================*/ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + ut_ad(!prebuilt->templ_contains_blob); + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + ut_ad(prebuilt->n_fetch_cached == 0); + + row_sel_prefetch_cache_init(prebuilt); + } + + ut_ad(prebuilt->fetch_cache_first == 0); + UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached], + prebuilt->mysql_row_len); + + return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]); +} + +/********************************************************************//** +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_enqueue_cache_row_for_mysql( +/*================================*/ + byte* mysql_rec, /*!< in/out: MySQL record */ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + /* For non ICP code path the row should already exist in the + next fetch cache slot. */ + + if (prebuilt->idx_cond != NULL) { + byte* dest = row_sel_fetch_last_buf(prebuilt); + + ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len); + } + + ++prebuilt->n_fetch_cached; +} + +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). We assume that the search +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, +btr search latch has been locked in S-mode if AHI is enabled. 
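+An editorial gloss of the return codes, derived from the call site:
+SEL_FOUND means a visible matching record was found; SEL_EXHAUSTED
+means no matching visible record can exist, so the caller may report
+that the record was not found; SEL_RETRY means the shortcut could not
+be used and the caller must fall back to the ordinary search path.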
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut_for_mysql( +/*==================================*/ + const rec_t** out_rec,/*!< out: record if found */ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */ + ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + mtr_t* mtr) /*!< in: started mtr */ +{ + dict_index_t* index = prebuilt->index; + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = &prebuilt->pcur; + trx_t* trx = prebuilt->trx; + const rec_t* rec; + + ut_ad(dict_index_is_clust(index)); + ut_ad(!prebuilt->templ_contains_blob); + +#ifndef UNIV_SEARCH_DEBUG + ut_ad(trx->has_search_latch); + + btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, pcur, + RW_S_LATCH, + mtr); +#else /* UNIV_SEARCH_DEBUG */ + btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, pcur, + 0, + mtr); +#endif /* UNIV_SEARCH_DEBUG */ + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); + + if (!lock_clust_rec_cons_read_sees(rec, index, + *offsets, trx->read_view)) { + + return(SEL_RETRY); + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) { + + return(SEL_EXHAUSTED); + } + + *out_rec = rec; + + return(SEL_FOUND); +} + +/*********************************************************************//** +Check a pushed-down index condition. +@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */ +static +enum icp_result +row_search_idx_cond_check( +/*======================*/ + byte* mysql_rec, /*!< out: record + in MySQL format (invalid unless + prebuilt->idx_cond!=NULL and + we return ICP_MATCH) */ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + for the table handle */ + const rec_t* rec, /*!< in: InnoDB record */ + const ulint* offsets) /*!< in: rec_get_offsets() */ +{ + enum icp_result result; + ulint i; + + ut_ad(rec_offs_validate(rec, prebuilt->index, offsets)); + + if (!prebuilt->idx_cond) { + return(ICP_MATCH); + } + + MONITOR_INC(MONITOR_ICP_ATTEMPTS); + + /* Convert to MySQL format those fields that are needed for + evaluating the index condition. */ + + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { + mem_heap_empty(prebuilt->blob_heap); + } + + for (i = 0; i < prebuilt->idx_cond_n_cols; i++) { + const mysql_row_templ_t*templ = &prebuilt->mysql_template[i]; + + if (!row_sel_store_mysql_field(mysql_rec, prebuilt, + rec, prebuilt->index, offsets, + templ->icp_rec_field_no, + templ)) { + return(ICP_NO_MATCH); + } + } + + /* We assume that the index conditions on + case-insensitive columns are case-insensitive. The + case of such columns may be wrong in a secondary + index, if the case of the column has been updated in + the past, or a record has been deleted and a record + inserted in a different case. */ + result = innobase_index_cond(prebuilt->idx_cond); + switch (result) { + case ICP_MATCH: + /* Convert the remaining fields to MySQL format. 
+ If this is a secondary index record, we must defer + this until we have fetched the clustered index record. */ + if (!prebuilt->need_to_access_clustered + || dict_index_is_clust(prebuilt->index)) { + if (!row_sel_store_mysql_rec( + mysql_rec, prebuilt, rec, FALSE, + prebuilt->index, offsets)) { + ut_ad(dict_index_is_clust(prebuilt->index)); + return(ICP_NO_MATCH); + } + } + MONITOR_INC(MONITOR_ICP_MATCH); + return(result); + case ICP_NO_MATCH: + MONITOR_INC(MONITOR_ICP_NO_MATCH); + return(result); + case ICP_OUT_OF_RANGE: + MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE); + return(result); + } + + ut_error; + return(result); +} + +/********************************************************************//** +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be tried to the cursor! +@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ +UNIV_INTERN +dberr_t +row_search_for_mysql( +/*=================*/ + byte* buf, /*!< in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /*!< in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /*!< in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. */ +{ + dict_index_t* index = prebuilt->index; + ibool comp = dict_table_is_comp(index->table); + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = &prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + const rec_t* rec; + const rec_t* result_rec = NULL; + const rec_t* clust_rec; + dberr_t err = DB_SUCCESS; + ibool unique_search = FALSE; + ibool mtr_has_extra_clust_latch = FALSE; + ibool moves_up = FALSE; + ibool set_also_gap_locks = TRUE; + /* if the query is a plain locking SELECT, and the isolation level + is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ + ibool did_semi_consistent_read = FALSE; + /* if the returned record was locked and we did a semi-consistent + read (fetch the newest committed version), then this is set to + TRUE */ +#ifdef UNIV_SEARCH_DEBUG + ulint cnt = 0; +#endif /* UNIV_SEARCH_DEBUG */ + ulint next_offs; + ibool same_user_rec; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool table_lock_waited = FALSE; + byte* next_buf = 0; + + rec_offs_init(offsets_); + + ut_ad(index && pcur && search_tuple); + + /* We don't support FTS queries from the HANDLER interfaces, because + we implemented FTS as reversed inverted index with auxiliary tables. + So anything related to traditional index query would not apply to + it. 
*/ + if (index->type & DICT_FTS) { + return(DB_END_OF_INDEX); + } + + ut_ad(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_table_is_discarded(prebuilt->table)) { + + return(DB_TABLESPACE_DELETED); + + } else if (prebuilt->table->ibd_file_missing) { + + return(DB_TABLESPACE_NOT_FOUND); + + } else if (!prebuilt->index_usable) { + + return(DB_MISSING_HISTORY); + + } else if (dict_index_is_corrupted(index)) { + + return(DB_CORRUPTION); + + } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + +#if 0 + /* August 19, 2005 by Heikki: temporarily disable this error + print until the cursor lock count is done correctly. + See bugs #12263 and #12456!*/ + + if (trx->n_mysql_tables_in_use == 0 + && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) { + /* Note that if MySQL uses an InnoDB temp table that it + created inside LOCK TABLES, then n_mysql_tables_in_use can + be zero; in that case select_lock_type is set to LOCK_X in + ::start_stmt. */ + + fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n" + "InnoDB: but it has not locked" + " any tables in ::external_lock()!\n", + stderr); + trx_print(stderr, trx, 600); + fputc('\n', stderr); + } +#endif + +#if 0 + fprintf(stderr, "Match mode %lu\n search tuple ", + (ulong) match_mode); + dtuple_print(search_tuple); + fprintf(stderr, "N tables locked %lu\n", + (ulong) trx->mysql_n_tables_locked); +#endif + /* Reset the new record lock info if srv_locks_unsafe_for_binlog + is set or session is using a READ COMMITED isolation level. Then + we are able to remove the record locks set here on an individual + row. */ + prebuilt->new_rec_locks = 0; + + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + + if (UNIV_UNLIKELY(direction == 0)) { + trx->op_info = "starting index read"; + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + trx->op_info = "fetching rows"; + + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) { + if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) { + ut_error; + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! 
*/ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + + err = DB_SUCCESS; + goto func_exit; + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { + + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a + non-delete-marked matching record. + + Note that in a unique secondary index there may be different + delete-marked versions of a record where only the primary key + values differ: thus in a secondary index we must use next-key + locks when locking delete-marked records. */ + + if (match_mode == ROW_SEL_EXACT + && dict_index_is_unique(index) + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index) + && (dict_index_is_clust(index) + || !dtuple_contains_null(search_tuple))) { + + /* Note above that a UNIQUE secondary index can contain many + rows with the same key value if one of the columns is the SQL + null. A clustered index under MySQL can never contain null + columns because we demand that all the columns in primary key + are non-null. */ + + unique_search = TRUE; + + /* Even if the condition is unique, MySQL seems to try to + retrieve also a second row if a primary key contains more than + 1 column. Return immediately if this is not a HANDLER + command. */ + + if (UNIV_UNLIKELY(direction != 0 + && !prebuilt->used_in_HANDLER)) { + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + } + + mtr_start(&mtr); + + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (UNIV_UNLIKELY(direction == 0) + && unique_search + && dict_index_is_clust(index) + && !prebuilt->templ_contains_blob + && !prebuilt->used_in_HANDLER + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8) + && !prebuilt->innodb_api) { + + mode = PAGE_CUR_GE; + + if (trx->mysql_n_tables_locked == 0 + && prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view) { + + /* This is a SELECT query done as a consistent read, + and the read view has already been allocated: + let us try a search shortcut through the hash + index. + NOTE that we must also test that + mysql_n_tables_locked == 0, because this might + also be INSERT INTO ... SELECT ... or + CREATE TABLE ... SELECT ... . Our algorithm is + NOT prepared to inserts interleaved with the SELECT, + and if we try that, we can deadlock on the adaptive + hash index semaphore! 
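+			(An editorial reading of the deadlock risk: the
+			shortcut pins the AHI latch in S-mode for the whole
+			fetch, and an insert executed by the same statement
+			may request that latch in X-mode, so interleaving
+			the two could self-deadlock.)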
*/ + +#ifndef UNIV_SEARCH_DEBUG + ut_ad(!trx->has_search_latch); + rw_lock_s_lock(btr_search_get_latch(index)); + trx->has_search_latch = TRUE; +#endif + switch (row_sel_try_search_shortcut_for_mysql( + &rec, prebuilt, &offsets, &heap, + &mtr)) { + case SEL_FOUND: +#ifdef UNIV_SEARCH_DEBUG + ut_a(0 == cmp_dtuple_rec(search_tuple, + rec, offsets)); +#endif + /* At this point, rec is protected by + a page latch that was acquired by + row_sel_try_search_shortcut_for_mysql(). + The latch will not be released until + mtr_commit(&mtr). */ + ut_ad(!rec_get_deleted_flag(rec, comp)); + + if (prebuilt->idx_cond) { + switch (row_search_idx_cond_check( + buf, prebuilt, + rec, offsets)) { + case ICP_NO_MATCH: + case ICP_OUT_OF_RANGE: + goto shortcut_mismatch; + case ICP_MATCH: + goto shortcut_match; + } + } + + if (!row_sel_store_mysql_rec( + buf, prebuilt, + rec, FALSE, index, offsets)) { + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such + records do not exist. Such + records may only be accessed + at the READ UNCOMMITTED + isolation level or when + rolling back a recovered + transaction. Rollback happens + at a lower level, not here. */ + + /* Proceed as in case SEL_RETRY. */ + break; + } + + shortcut_match: + mtr_commit(&mtr); + + /* ut_print_name(stderr, index->name); + fputs(" shortcut\n", stderr); */ + + err = DB_SUCCESS; + goto release_search_latch; + + case SEL_EXHAUSTED: + shortcut_mismatch: + mtr_commit(&mtr); + + /* ut_print_name(stderr, index->name); + fputs(" record not found 2\n", stderr); */ + + err = DB_RECORD_NOT_FOUND; +release_search_latch: + rw_lock_s_unlock( + btr_search_get_latch(index)); + trx->has_search_latch = FALSE; + + /* NOTE that we do NOT store the cursor + position */ + goto func_exit; + + case SEL_RETRY: + break; + + default: + ut_ad(0); + } + + mtr_commit(&mtr); + mtr_start(&mtr); + + rw_lock_s_unlock(btr_search_get_latch(index)); + trx->has_search_latch = FALSE; + } + } + + /*-------------------------------------------------------------*/ + /* PHASE 3: Open or restore index cursor position */ + + ut_ad(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); +#endif + + /* The state of a running trx can only be changed by the + thread that is currently serving the transaction. Because we + are that thread, we can read trx->state without holding any + mutex. 
*/ + ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE); + + ut_ad(trx->state == TRX_STATE_NOT_STARTED + || trx->state == TRX_STATE_ACTIVE); + + ut_ad(prebuilt->sql_stat_start + || prebuilt->select_lock_type != LOCK_NONE + || trx->read_view); + + trx_start_if_not_started(trx); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE + && trx->mysql_thd != NULL + && thd_is_select(trx->mysql_thd)) { + /* It is a plain locking SELECT and the isolation + level is low: do not lock gaps */ + + set_also_gap_locks = FALSE; + } + + /* Note that if the search mode was GE or G, then the cursor + naturally moves upward (in fetch next) in alphabetical order, + otherwise downward */ + + if (UNIV_UNLIKELY(direction == 0)) { + if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) { + moves_up = TRUE; + } + } else if (direction == ROW_SEL_NEXT) { + moves_up = TRUE; + } + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + clust_index = dict_table_get_first_index(index->table); + + /* Do some start-of-statement preparations */ + + if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + + if (UNIV_UNLIKELY + (trx->read_view == NULL + && prebuilt->select_lock_type == LOCK_NONE)) { + + fputs("InnoDB: Error: MySQL is trying to" + " perform a consistent read\n" + "InnoDB: but the read view is not assigned!\n", + stderr); + trx_print(stderr, trx, 600); + fputc('\n', stderr); + ut_error; + } + } else if (prebuilt->select_lock_type == LOCK_NONE) { + /* This is a consistent read */ + /* Assign a read view for the query */ + + trx_assign_read_view(trx); + prebuilt->sql_stat_start = FALSE; + } else { +wait_table_again: + err = lock_table(0, index->table, + prebuilt->select_lock_type == LOCK_S + ? LOCK_IS : LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + table_lock_waited = TRUE; + goto lock_table_wait; + } + prebuilt->sql_stat_start = FALSE; + } + + /* Open or restore index cursor position */ + + if (UNIV_LIKELY(direction != 0)) { + ibool need_to_process = sel_restore_position_for_mysql( + &same_user_rec, BTR_SEARCH_LEAF, + pcur, moves_up, &mtr); + + if (UNIV_UNLIKELY(need_to_process)) { + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + /* We did a semi-consistent read, + but the record was removed in + the meantime. */ + prebuilt->row_read_type + = ROW_READ_TRY_SEMI_CONSISTENT; + } + } else if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_DID_SEMI_CONSISTENT)) { + + /* The cursor was positioned on the record + that we returned previously. If we need + to repeat a semi-consistent read as a + pessimistic locking read, the record + cannot be skipped. */ + + goto next_rec; + } + + } else if (dtuple_get_n_fields(search_tuple) > 0) { + + btr_pcur_open_with_no_init(index, search_tuple, mode, + BTR_SEARCH_LEAF, + pcur, 0, &mtr); + + pcur->trx_if_known = trx; + + rec = btr_pcur_get_rec(pcur); + + if (!moves_up + && !page_rec_is_supremum(rec) + && set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a gap lock on the next index record + to prevent phantoms in ORDER BY ... 
DESC queries */ + const rec_t* next_rec = page_rec_get_next_const(rec); + + offsets = rec_get_offsets(next_rec, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + next_rec, index, offsets, + prebuilt->select_lock_type, + LOCK_GAP, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side( + mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF, + pcur, false, 0, &mtr); + } + +rec_loop: + DEBUG_SYNC_C("row_search_rec_loop"); + if (trx_is_interrupted(trx)) { + btr_pcur_store_position(pcur, &mtr); + err = DB_INTERRUPTED; + goto normal_return; + } + + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + + rec = btr_pcur_get_rec(pcur); + + SRV_CORRUPT_TABLE_CHECK(rec, + { + err = DB_CORRUPTION; + goto lock_wait_or_error; + }); + + ut_ad(!!page_rec_is_comp(rec) == comp); +#ifdef UNIV_SEARCH_DEBUG + /* + fputs("Using ", stderr); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt, + page_get_page_no(page_align(rec))); + rec_print(stderr, rec, index); + printf("delete-mark: %lu\n", + rec_get_deleted_flag(rec, page_rec_is_comp(rec))); + */ +#endif /* UNIV_SEARCH_DEBUG */ + + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. */ + + goto next_rec; + } + + if (page_rec_is_supremum(rec)) { + + if (set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using a READ COMMITTED isolation + level we do not lock gaps. Supremum record is really + a gap and therefore we do not set locks there. 
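+			(Editorial gloss of the code below: under the
+			higher isolation levels the LOCK_ORDINARY taken on
+			the supremum acts as a next-key lock on the gap
+			preceding it, blocking inserts after the last user
+			record in that page range.)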
*/ + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + /* A page supremum record cannot be in the result set: skip + it now that we have placed a possible lock on it */ + + goto next_rec; + } + + /*-------------------------------------------------------------*/ + /* Do sanity checks in case our cursor has bumped into page + corruption */ + + if (comp) { + next_offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) { + + goto wrong_offs; + } + } else { + next_offs = rec_get_next_offs(rec, FALSE); + if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) { + + goto wrong_offs; + } + } + + if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) { + +wrong_offs: + if (srv_pass_corrupt_table && index->table->space != 0 && + index->table->space < SRV_LOG_SPACE_FIRST_ID) { + index->table->is_corrupt = TRUE; + fil_space_set_corrupt(index->table->space); + } + + if ((srv_force_recovery == 0 || moves_up == FALSE) + && srv_pass_corrupt_table <= 1) { + ut_print_timestamp(stderr); + buf_page_print(page_align(rec), 0, + BUF_PAGE_PRINT_NO_CRASH); + fprintf(stderr, + "\nInnoDB: rec address %p," + " buf block fix count %lu\n", + (void*) rec, (ulong) + btr_cur_get_block(btr_pcur_get_btr_cur(pcur)) + ->page.buf_fix_count); + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". Run CHECK TABLE. You may need to\n" + "InnoDB: restore from a backup, or" + " dump + drop + reimport the table.\n", + stderr); + ut_ad(0); + err = DB_CORRUPTION; + + goto lock_wait_or_error; + } else { + /* The user may be dumping a corrupt table. Jump + over the corruption to recover as much as possible. */ + + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the rest of the page.\n", + stderr); + + btr_pcur_move_to_last_on_page(pcur, &mtr); + + goto next_rec; + } + } + /*-------------------------------------------------------------*/ + + /* Calculate the 'offsets' associated with 'rec' */ + + ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX); + ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id); + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(srv_force_recovery > 0 + || (index->table->is_corrupt && + srv_pass_corrupt_table == 2))) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the record.\n", + stderr); + + goto next_rec; + } + } + + /* Note that we cannot trust the up_match value in the cursor at this + place because we can arrive here after moving the cursor! 
Thus + we have to recompare rec and search_tuple to determine if they + match enough. */ + + if (match_mode == ROW_SEL_EXACT) { + /* Test if the index record matches completely to search_tuple + in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + + /* fputs("Comparing rec and search tuple\n", stderr); */ + + if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) { + + if (set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level + <= TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set or this session is not + using a READ COMMITTED isolation level. */ + + err = sel_set_rec_lock( + btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + /* The found record was not a match, but may be used + as NEXT record (index_next). Set the relative position + to BTR_PCUR_BEFORE, to reflect that the position of + the persistent cursor is before the found/stored row + (pcur->old_rec). */ + ut_ad(pcur->rel_pos == BTR_PCUR_ON); + pcur->rel_pos = BTR_PCUR_BEFORE; + + err = DB_RECORD_NOT_FOUND; +#if 0 + ut_print_name(stderr, trx, FALSE, index->name); + fputs(" record not found 3\n", stderr); +#endif + + goto normal_return; + } + + } else if (match_mode == ROW_SEL_EXACT_PREFIX) { + + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) { + + if (set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level + <= TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set or this session is not + using a READ COMMITTED isolation level. */ + + err = sel_set_rec_lock( + btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + /* The found record was not a match, but may be used + as NEXT record (index_next). Set the relative position + to BTR_PCUR_BEFORE, to reflect that the position of + the persistent cursor is before the found/stored row + (pcur->old_rec). */ + ut_ad(pcur->rel_pos == BTR_PCUR_ON); + pcur->rel_pos = BTR_PCUR_BEFORE; + + err = DB_RECORD_NOT_FOUND; +#if 0 + ut_print_name(stderr, trx, FALSE, index->name); + fputs(" record not found 4\n", stderr); +#endif + + goto normal_return; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; note that delete + marked records are a special case in a unique search. If there + is a non-delete marked record, then it is enough to lock its + existence with LOCK_REC_NOT_GAP. */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using a READ COMMITED isolation + level we lock only the record, i.e., next-key locking is + not used. 
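+
+		An editorial summary of the lock-type choice below: the gap
+		is left unlocked (LOCK_REC_NOT_GAP) when gap locking is
+		disabled by the isolation level or the unsafe-binlog
+		setting, when a unique search found a non-delete-marked
+		record, or when an exact match on the full unique key is
+		found in the clustered index; otherwise a next-key lock
+		(LOCK_ORDINARY) is taken.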
*/ + + ulint lock_type; + + if (!set_also_gap_locks + || srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED + || (unique_search && !rec_get_deleted_flag(rec, comp))) { + + goto no_gap_lock; + } else { + lock_type = LOCK_ORDINARY; + } + + /* If we are doing a 'greater or equal than a primary key + value' search from a clustered index, and we find a record + that has that exact primary key value, then there is no need + to lock the gap before the record, because no insert in the + gap can be in our search range. That is, no phantom row can + appear that way. + + An example: if col1 is the primary key, the search is WHERE + col1 >= 100, and we find a record where col1 = 100, then no + need to lock the gap before that record. */ + + if (index == clust_index + && mode == PAGE_CUR_GE + && direction == 0 + && dtuple_get_n_fields_cmp(search_tuple) + == dict_index_get_n_unique(index) + && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) { +no_gap_lock: + lock_type = LOCK_REC_NOT_GAP; + } + + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr); + + switch (err) { + const rec_t* old_vers; + case DB_SUCCESS_LOCKED_REC: + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + <= TRX_ISO_READ_COMMITTED) { + /* Note that a record of + prebuilt->index was locked. */ + prebuilt->new_rec_locks = 1; + } + err = DB_SUCCESS; + case DB_SUCCESS: + break; + case DB_LOCK_WAIT: + /* Never unlock rows that were part of a conflict. */ + prebuilt->new_rec_locks = 0; + + if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_TRY_SEMI_CONSISTENT) + || unique_search + || index != clust_index) { + + goto lock_wait_or_error; + } + + /* The following call returns 'offsets' + associated with 'old_vers' */ + row_sel_build_committed_vers_for_mysql( + clust_index, prebuilt, rec, + &offsets, &heap, &old_vers, &mtr); + + /* Check whether it was a deadlock or not, if not + a deadlock and the transaction had to wait then + release the lock it is waiting on. */ + + err = lock_trx_handle_wait(trx); + + switch (err) { + case DB_SUCCESS: + /* The lock was granted while we were + searching for the last committed version. + Do a normal locking read. 
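+			(Editorial gloss: this DB_LOCK_WAIT branch
+			implements the semi-consistent read: rather than
+			waiting for the conflicting lock, the newest
+			committed version of the row is fetched so that the
+			upper layer can skip the row if it does not match
+			the search condition.)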
*/ + + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, + &heap); + goto locks_ok; + case DB_DEADLOCK: + goto lock_wait_or_error; + case DB_LOCK_WAIT: + err = DB_SUCCESS; + break; + default: + ut_error; + } + + if (old_vers == NULL) { + /* The row was not yet committed */ + + goto next_rec; + } + + did_semi_consistent_read = TRUE; + rec = old_vers; + break; + default: + + goto lock_wait_or_error; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { + + /* Fetch a previous version of the row if the current + one is not visible in the snapshot; if we have a very + high force recovery level set, we try to avoid crashes + by skipping this lookup */ + + if (UNIV_LIKELY(srv_force_recovery < 5) + && !lock_clust_rec_cons_read_sees( + rec, index, offsets, trx->read_view)) { + + rec_t* old_vers; + /* The following call returns 'offsets' + associated with 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, rec, &offsets, &heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + rec = old_vers; + } + } else { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. */ + + ut_ad(!dict_index_is_clust(index)); + + if (!lock_sec_rec_cons_read_sees( + rec, trx->read_view)) { + /* We should look at the clustered index. + However, as this is a non-locking read, + we can skip the clustered index lookup if + the condition does not match the secondary + index entry. */ + switch (row_search_idx_cond_check( + buf, prebuilt, rec, offsets)) { + case ICP_NO_MATCH: + goto next_rec; + case ICP_OUT_OF_RANGE: + err = DB_RECORD_NOT_FOUND; + goto idx_cond_failed; + case ICP_MATCH: + goto requires_clust_rec; + } + + ut_error; + } + } + } + +locks_ok: + /* NOTE that at this point rec can be an old version of a clustered + index record built for a consistent read. We cannot assume after this + point that rec is on a buffer pool page. Functions like + page_rec_is_comp() cannot be used! */ + + if (rec_get_deleted_flag(rec, comp)) { + + /* The record is delete-marked: we can skip it */ + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE + && !did_semi_consistent_read) { + + /* No need to keep a lock on a delete-marked record + if we do not want to use next-key locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + /* This is an optimization to skip setting the next key lock + on the record that follows this delete-marked record. This + optimization works because of the unique search criteria + which precludes the presence of a range lock between this + delete marked record and the record following it. + + For now this is applicable only to clustered indexes while + doing a unique search except for HANDLER queries because + HANDLER allows NEXT and PREV even in unique search on + clustered index. There is scope for further optimization + applicable to unique secondary indexes. 
Current behaviour is + to widen the scope of a lock on an already delete marked record + if the same record is deleted twice by the same transaction */ + if (index == clust_index && unique_search + && !prebuilt->used_in_HANDLER) { + + err = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + + goto next_rec; + } + + /* Check if the record matches the index condition. */ + switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) { + case ICP_NO_MATCH: + if (did_semi_consistent_read) { + row_unlock_for_mysql(prebuilt, TRUE); + } + goto next_rec; + case ICP_OUT_OF_RANGE: + err = DB_RECORD_NOT_FOUND; + goto idx_cond_failed; + case ICP_MATCH: + break; + } + + /* Get the clustered index record if needed, if we did not do the + search using the clustered index. */ + + if (index != clust_index && prebuilt->need_to_access_clustered) { + +requires_clust_rec: + ut_ad(index != clust_index); + /* We use a 'goto' to the preceding label if a consistent + read of a secondary index record requires us to look up old + versions of the associated clustered index record. */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_has_extra_clust_latch = TRUE; + + /* The following call returns 'offsets' associated with + 'clust_rec'. Note that 'clust_rec' can be an old version + built for a consistent read. */ + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, + &offsets, &heap, &mtr); + switch (err) { + case DB_SUCCESS: + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE); + + goto next_rec; + } + break; + case DB_SUCCESS_LOCKED_REC: + ut_a(clust_rec != NULL); + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + <= TRX_ISO_READ_COMMITTED) { + /* Note that the clustered index record + was locked. */ + prebuilt->new_rec_locks = 2; + } + err = DB_SUCCESS; + break; + default: + goto lock_wait_or_error; + } + + if (rec_get_deleted_flag(clust_rec, comp)) { + + /* The record is delete marked: we can skip it */ + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* No need to keep a lock on a delete-marked + record if we do not want to use next-key + locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + goto next_rec; + } + + result_rec = clust_rec; + ut_ad(rec_offs_validate(result_rec, clust_index, offsets)); + + if (prebuilt->idx_cond) { + /* Convert the record to MySQL format. We were + unable to do this in row_search_idx_cond_check(), + because the condition is on the secondary index + and the requested column is in the clustered index. + We convert all fields, including those that + may have been used in ICP, because the + secondary index may contain a column prefix + rather than the full column. Also, as noted + in Bug #56680, the column in the secondary + index may be in the wrong case, and the + authoritative case is in result_rec, the + appropriate version of the clustered index record. */ + if (!row_sel_store_mysql_rec( + buf, prebuilt, result_rec, + TRUE, clust_index, offsets)) { + goto next_rec; + } + } + } else { + result_rec = rec; + } + + /* We found a qualifying record 'result_rec'. At this point, + 'offsets' are associated with 'result_rec'. */ + + ut_ad(rec_offs_validate(result_rec, + result_rec != rec ? 
clust_index : index, + offsets)); + ut_ad(!rec_get_deleted_flag(result_rec, comp)); + + /* At this point, the clustered index record is protected + by a page latch that was acquired when pcur was positioned. + The latch will not be released until mtr_commit(&mtr). */ + + if ((match_mode == ROW_SEL_EXACT + || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD) + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->templ_contains_blob + && !prebuilt->clust_index_was_generated + && !prebuilt->used_in_HANDLER + && !prebuilt->innodb_api + && prebuilt->template_type + != ROW_MYSQL_DUMMY_TEMPLATE + && !prebuilt->in_fts_query) { + + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update, that is why we require ...lock_type == LOCK_NONE. + Since we keep space in prebuilt only for the BLOBs of + a single row, we cannot cache rows in the case there + are BLOBs in the fields to be fetched. In HANDLER we do + not cache rows because there the cursor is a scrollable + cursor. */ + + ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + /* We only convert from InnoDB row format to MySQL row + format when ICP is disabled. */ + + if (!prebuilt->idx_cond) { + + /* We use next_buf to track the allocation of buffers + where we store and enqueue the buffers for our + pre-fetch optimisation. + + If next_buf == 0 then we store the converted record + directly into the MySQL record buffer (buf). If it is + != 0 then we allocate a pre-fetch buffer and store the + converted record there. + + If the conversion fails and the MySQL record buffer + was not written to then we reset next_buf so that + we can re-use the MySQL record buffer in the next + iteration. */ + + next_buf = next_buf + ? row_sel_fetch_last_buf(prebuilt) : buf; + + if (!row_sel_store_mysql_rec( + next_buf, prebuilt, result_rec, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + + if (next_buf == buf) { + ut_a(prebuilt->n_fetch_cached == 0); + next_buf = 0; + } + + /* Only fresh inserts may contain incomplete + externally stored columns. Pretend that such + records do not exist. Such records may only be + accessed at the READ UNCOMMITTED isolation + level or when rolling back a recovered + transaction. Rollback happens at a lower + level, not here. */ + goto next_rec; + } + + if (next_buf != buf) { + row_sel_enqueue_cache_row_for_mysql( + next_buf, prebuilt); + } + } else { + row_sel_enqueue_cache_row_for_mysql(buf, prebuilt); + } + + if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) { + goto next_rec; + } + + } else { + if (UNIV_UNLIKELY + (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) { + /* CHECK TABLE: fetch the row */ + + if (result_rec != rec + && !prebuilt->need_to_access_clustered) { + /* We used 'offsets' for the clust + rec, recalculate them for 'rec' */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, + &heap); + result_rec = rec; + } + + memcpy(buf + 4, result_rec + - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + mach_write_to_4(buf, + rec_offs_extra_size(offsets) + 4); + } else if (!prebuilt->idx_cond && !prebuilt->innodb_api) { + /* The record was not yet converted to MySQL format. */ + if (!row_sel_store_mysql_rec( + buf, prebuilt, result_rec, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such records do + not exist. 
Such records may only be + accessed at the READ UNCOMMITTED + isolation level or when rolling back a + recovered transaction. Rollback + happens at a lower level, not here. */ + goto next_rec; + } + } + + if (prebuilt->clust_index_was_generated) { + row_sel_store_row_id_to_prebuilt( + prebuilt, result_rec, + result_rec == rec ? index : clust_index, + offsets); + } + } + + /* From this point on, 'offsets' are invalid. */ + + /* We have an optimization to save CPU time: if this is a consistent + read on a unique condition on the clustered index, then we do not + store the pcur position, because any fetch next or prev will anyway + return 'end of file'. Exceptions are locking reads and the MySQL + HANDLER command where the user can move the cursor with PREV or NEXT + even after a unique search. */ + + err = DB_SUCCESS; + +idx_cond_failed: + if (!unique_search + || !dict_index_is_clust(index) + || direction != 0 + || prebuilt->select_lock_type != LOCK_NONE + || prebuilt->used_in_HANDLER + || prebuilt->innodb_api) { + + /* Inside an update always store the cursor position */ + + btr_pcur_store_position(pcur, &mtr); + + if (prebuilt->innodb_api) { + prebuilt->innodb_api_rec = result_rec; + } + } + + goto normal_return; + +next_rec: + /* Reset the old and new "did semi-consistent read" flags. */ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = FALSE; + prebuilt->new_rec_locks = 0; + + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + + /* NOTE: For moves_up==FALSE, the mini-transaction will be + committed and restarted every time when switching b-tree + pages. For moves_up==TRUE in index condition pushdown, we can + scan an entire secondary index tree within a single + mini-transaction. As long as the prebuilt->idx_cond does not + match, we do not need to consult the clustered index or + return records to MySQL, and thus we can avoid repositioning + the cursor. What prevents us from buffer-fixing all leaf pages + within the mini-transaction is the btr_leaf_page_release() + call in btr_pcur_move_to_next_page(). Only the leaf page where + the cursor is positioned will remain buffer-fixed. */ + + if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) { + /* We must commit mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. */ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + mtr_start(&mtr); + if (sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, + pcur, moves_up, &mtr)) { +#ifdef UNIV_SEARCH_DEBUG + cnt++; +#endif /* UNIV_SEARCH_DEBUG */ + + goto rec_loop; + } + } + + if (moves_up) { + if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) { +not_moved: + btr_pcur_store_position(pcur, &mtr); + + if (match_mode != 0) { + err = DB_RECORD_NOT_FOUND; + } else { + err = DB_END_OF_INDEX; + } + + goto normal_return; + } + } else { + if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) { + goto not_moved; + } + } + +#ifdef UNIV_SEARCH_DEBUG + cnt++; +#endif /* UNIV_SEARCH_DEBUG */ + + goto rec_loop; + +lock_wait_or_error: + /* Reset the old and new "did semi-consistent read" flags. 
*/ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = FALSE; + + /*-------------------------------------------------------------*/ + + btr_pcur_store_position(pcur, &mtr); + +lock_table_wait: + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + trx->error_state = err; + + /* The following is a patch for MySQL */ + + que_thr_stop_for_mysql(thr); + + thr->lock_state = QUE_THR_LOCK_ROW; + + if (row_mysql_handle_errors(&err, trx, thr, NULL)) { + /* It was a lock wait, and it ended */ + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + mtr_start(&mtr); + + /* Table lock waited, go try to obtain table lock + again */ + if (table_lock_waited) { + table_lock_waited = FALSE; + + goto wait_table_again; + } + + sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level <= TRX_ISO_READ_COMMITTED) + && !same_user_rec) { + + /* Since we were not able to restore the cursor + on the same user record, we cannot use + row_unlock_for_mysql() to unlock any records, and + we must thus reset the new rec lock info. Since + in lock0lock.cc we have blocked the inheriting of gap + X-locks, we actually do not have any new record locks + set in this case. + + Note that if we were able to restore on the 'same' + user record, it is still possible that we were actually + waiting on a delete-marked record, and meanwhile + it was removed by purge and inserted again by some + other user. But that is no problem, because in + rec_loop we will again try to set a lock, and + new_rec_lock_info in trx will be right at the end. */ + + prebuilt->new_rec_locks = 0; + } + + mode = pcur->search_mode; + + goto rec_loop; + } + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + +#ifdef UNIV_SEARCH_DEBUG + /* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ +#endif /* UNIV_SEARCH_DEBUG */ + goto func_exit; + +normal_return: + /*-------------------------------------------------------------*/ + que_thr_stop_for_mysql_no_error(thr, trx); + + mtr_commit(&mtr); + + if (prebuilt->idx_cond != 0) { + + /* When ICP is active we don't write to the MySQL buffer + directly, only to buffers that are enqueued in the pre-fetch + queue. We need to dequeue the first buffer and copy the contents + to the record buffer that was passed in by MySQL. */ + + if (prebuilt->n_fetch_cached > 0) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + err = DB_SUCCESS; + } + + } else if (next_buf != 0) { + + /* We may or may not have enqueued some buffers to the + pre-fetch queue, but we definitely wrote to the record + buffer passed to use by MySQL. */ + + DEBUG_SYNC_C("row_search_cached_row"); + err = DB_SUCCESS; + } + +#ifdef UNIV_SEARCH_DEBUG + /* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ +#endif /* UNIV_SEARCH_DEBUG */ + +func_exit: + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Set or reset the "did semi-consistent read" flag on return. + The flag did_semi_consistent_read is set if and only if + the record being returned was fetched with a semi-consistent read. 
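+
+ For example, under READ COMMITTED an UPDATE that reaches a row
+ locked by another transaction may read the last committed version
+ of that row instead of waiting; the flag tells the caller that the
+ returned row came from such a read, so that a row which does not
+ match the WHERE condition can be skipped and unlocked.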
*/ + ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS + || !did_semi_consistent_read); + + if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) { + if (UNIV_UNLIKELY(did_semi_consistent_read)) { + prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + } + + ut_ad(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + + DEBUG_SYNC_C("innodb_row_search_for_mysql_exit"); + + return(err); +} + +/*******************************************************************//** +Checks if MySQL at the moment is allowed for this table to retrieve a +consistent read result, or store it to the query cache. +@return TRUE if storing or retrieving from the query cache is permitted */ +UNIV_INTERN +ibool +row_search_check_if_query_cache_permitted( +/*======================================*/ + trx_t* trx, /*!< in: transaction object */ + const char* norm_name) /*!< in: concatenation of database name, + '/' char, table name */ +{ + dict_table_t* table; + ibool ret = FALSE; + + /* Disable query cache altogether for all tables if recovered XA + transactions in prepared state exist. This is because we do not + restore the table locks for those transactions and we may wrongly + set ret=TRUE above if "lock_table_get_n_locks(table) == 0". See + "Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH + QUERY CACHE ENABLED". + Read trx_sys->n_prepared_recovered_trx without mutex protection, + not possible to end up with a torn read since n_prepared_recovered_trx + is word size. */ + if (trx_sys->n_prepared_recovered_trx > 0) { + + return(FALSE); + } + + table = dict_table_open_on_name(norm_name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); + + if (table == NULL) { + + return(FALSE); + } + + /* Start the transaction if it is not started yet */ + + trx_start_if_not_started(trx); + + /* If there are locks on the table or some trx has invalidated the + cache up to our trx id, then ret = FALSE. + We do not check what type locks there are on the table, though only + IX type locks actually would require ret = FALSE. */ + + if (lock_table_get_n_locks(table) == 0 + && trx->id >= table->query_cache_inv_trx_id) { + + ret = TRUE; + + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !trx->read_view) { + + trx->read_view = + read_view_open_now(trx->id, + trx->prebuilt_view); + trx->global_read_view = trx->read_view; + } + } + + dict_table_close(table, FALSE, FALSE); + + return(ret); +} + +/*******************************************************************//** +Read the AUTOINC column from the current row. If the value is less than +0 and the type is not unsigned then we reset the value to 0. 
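+For example, a signed INT column holding -42 is returned as 0, while an
+unsigned INT holding 42 is returned as 42.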
+@return value read from the column */ +static +ib_uint64_t +row_search_autoinc_read_column( +/*===========================*/ + dict_index_t* index, /*!< in: index to read from */ + const rec_t* rec, /*!< in: current rec */ + ulint col_no, /*!< in: column number */ + ulint mtype, /*!< in: column main type */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + ulint len; + const byte* data; + ib_uint64_t value; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, col_no + 1, &heap); + + if (rec_offs_nth_sql_null(offsets, col_no)) { + /* There is no non-NULL value in the auto-increment column. */ + value = 0; + goto func_exit; + } + + data = rec_get_nth_field(rec, offsets, col_no, &len); + + switch (mtype) { + case DATA_INT: + ut_a(len <= sizeof value); + value = mach_read_int_type(data, len, unsigned_type); + break; + + case DATA_FLOAT: + ut_a(len == sizeof(float)); + value = (ib_uint64_t) mach_float_read(data); + break; + + case DATA_DOUBLE: + ut_a(len == sizeof(double)); + value = (ib_uint64_t) mach_double_read(data); + break; + + default: + ut_error; + } + + if (!unsigned_type && (ib_int64_t) value < 0) { + value = 0; + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(value); +} + +/** Get the maximum and non-delete-marked record in an index. +@param[in] index index tree +@param[in,out] mtr mini-transaction (may be committed and restarted) +@return maximum record, page s-latched in mtr +@retval NULL if there are no records, or if all of them are delete-marked */ +static +const rec_t* +row_search_get_max_rec( + dict_index_t* index, + mtr_t* mtr) +{ + btr_pcur_t pcur; + const rec_t* rec; + /* Open at the high/right end (false), and init cursor */ + btr_pcur_open_at_index_side( + false, index, BTR_SEARCH_LEAF, &pcur, true, 0, mtr); + + do { + const page_t* page; + + page = btr_pcur_get_page(&pcur); + rec = page_find_rec_max_not_deleted(page); + + if (page_rec_is_user_rec(rec)) { + break; + } else { + rec = NULL; + } + btr_pcur_move_before_first_on_page(&pcur); + } while (btr_pcur_move_to_prev(&pcur, mtr)); + + btr_pcur_close(&pcur); + + return(rec); +} + +/*******************************************************************//** +Read the max AUTOINC value from an index. 
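+
+A minimal usage sketch (assuming an index whose first field is the
+AUTO_INCREMENT column, here called "id", as the check below requires):
+
+	ib_uint64_t	value;
+	dberr_t	err = row_search_max_autoinc(index, "id", &value);
+	// on DB_SUCCESS the next value to assign is typically value + 1
+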
+@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if +column name can't be found in index */ +UNIV_INTERN +dberr_t +row_search_max_autoinc( +/*===================*/ + dict_index_t* index, /*!< in: index to search */ + const char* col_name, /*!< in: name of autoinc column */ + ib_uint64_t* value) /*!< out: AUTOINC value read */ +{ + dict_field_t* dfield = dict_index_get_nth_field(index, 0); + dberr_t error = DB_SUCCESS; + *value = 0; + + if (strcmp(col_name, dfield->name) != 0) { + error = DB_RECORD_NOT_FOUND; + } else { + mtr_t mtr; + const rec_t* rec; + + mtr_start(&mtr); + + rec = row_search_get_max_rec(index, &mtr); + + if (rec != NULL) { + ibool unsigned_type = ( + dfield->col->prtype & DATA_UNSIGNED); + + *value = row_search_autoinc_read_column( + index, rec, 0, + dfield->col->mtype, unsigned_type); + } + + mtr_commit(&mtr); + } + + return(error); +} diff --git a/storage/xtradb/row/row0uins.cc b/storage/xtradb/row/row0uins.cc new file mode 100644 index 00000000000..849bf096492 --- /dev/null +++ b/storage/xtradb/row/row0uins.cc @@ -0,0 +1,475 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0uins.cc +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" + +#ifdef UNIV_NONINL +#include "row0uins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "row0log.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***************************************************************//** +Removes a clustered index record. The pcur in node was positioned on the +record, now it is detached. 
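+If the optimistic BTR_MODIFY_LEAF delete fails, the function retries with
+a pessimistic BTR_MODIFY_TREE descent, sleeping and retrying up to
+BTR_CUR_RETRY_DELETE_N_TIMES times when the tablespace runs out of space.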
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_clust_rec( +/*==========================*/ + undo_node_t* node) /*!< in: undo node */ +{ + btr_cur_t* btr_cur; + ibool success; + dberr_t err; + ulint n_tries = 0; + mtr_t mtr; + dict_index_t* index = node->pcur.btr_cur.index; + bool online; + + ut_ad(dict_index_is_clust(index)); + + mtr_start(&mtr); + + /* This is similar to row_undo_mod_clust(). The DDL thread may + already have copied this row from the log to the new table. + We must log the removal, so that the row will be correctly + purged. However, we can log the removal out of sync with the + B-tree modification. */ + + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->trx->dict_operation_lock_mode + != RW_X_LATCH); + ut_ad(node->table->id != DICT_INDEXES_ID); + mtr_s_lock(dict_index_get_lock(index), &mtr); + } + + success = btr_pcur_restore_position( + online + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF, &node->pcur, &mtr); + ut_a(success); + + btr_cur = btr_pcur_get_btr_cur(&node->pcur); + + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index) + == node->trx->id); + + if (online && dict_index_is_online_ddl(index)) { + const rec_t* rec = btr_cur_get_rec(btr_cur); + mem_heap_t* heap = NULL; + const ulint* offsets = rec_get_offsets( + rec, index, NULL, ULINT_UNDEFINED, &heap); + row_log_table_delete(rec, index, offsets, NULL); + mem_heap_free(heap); + } + + if (node->table->id == DICT_INDEXES_ID) { + ut_ad(!online); + ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Drop the index tree associated with the row in + SYS_INDEXES table: */ + + dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr); + + mtr_commit(&mtr); + + mtr_start(&mtr); + + success = btr_pcur_restore_position( + BTR_MODIFY_LEAF, &node->pcur, &mtr); + ut_a(success); + } + + if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { + err = DB_SUCCESS; + goto func_exit; + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); +retry: + /* If did not succeed, try pessimistic descent to tree */ + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_TREE, + &(node->pcur), &mtr); + ut_a(success); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + trx_is_recv(node->trx) + ? RB_RECOVERY + : RB_NORMAL, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + +func_exit: + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + trx_undo_rec_release(node->trx, node->undo_no); + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry if found. 
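+DB_FAIL means that the optimistic BTR_MODIFY_LEAF delete could not be
+performed (for example, because the page would need to be reorganized);
+the caller then retries with BTR_MODIFY_TREE.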
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_low( +/*========================*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry to remove */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + dberr_t err = DB_SUCCESS; + mtr_t mtr; + enum row_search_result search_result; + + log_free_check(); + + mtr_start(&mtr); + + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try(index, entry, 0)) { + goto func_exit_no_pcur; + } + + search_result = row_search_index_entry(index, entry, mode, + &pcur, &mtr); + + switch (search_result) { + case ROW_NOT_FOUND: + goto func_exit; + case ROW_FOUND: + break; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + } + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (mode != BTR_MODIFY_TREE) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr) + ? DB_SUCCESS : DB_FAIL; + } else { + /* No need to distinguish RB_RECOVERY here, because we + are deleting a secondary index record: the distinction + between RB_NORMAL and RB_RECOVERY only matters when + deleting a record that contains externally stored + columns. */ + ut_ad(!dict_index_is_clust(index)); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + RB_NORMAL, &mtr); + } +func_exit: + btr_pcur_close(&pcur); +func_exit_no_pcur: + mtr_commit(&mtr); + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry from the index if found. Tries first +optimistic, then pessimistic descent down the tree. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec( +/*====================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry to insert */ +{ + dberr_t err; + ulint n_tries = 0; + + /* Try first optimistic descent to the B-tree */ + + err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ +retry: + err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + return(err); +} + +/***********************************************************//** +Parses the row reference and other info in a fresh insert undo record. 
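+The undo record is expected to be of type TRX_UNDO_INSERT_REC; the row
+reference is rebuilt from the clustered index key fields stored in it, and
+node->table is left NULL when the table or its .ibd file is missing.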
*/ +static +void +row_undo_ins_parse_undo_rec( +/*========================*/ + undo_node_t* node, /*!< in/out: row undo node */ + ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */ +{ + dict_index_t* clust_index; + byte* ptr; + undo_no_t undo_no; + table_id_t table_id; + ulint type; + ulint dummy; + bool dummy_extern; + + ut_ad(node); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, + &dummy_extern, &undo_no, &table_id); + ut_ad(type == TRX_UNDO_INSERT_REC); + node->rec_type = type; + + node->update = NULL; + node->table = dict_table_open_on_id( + table_id, dict_locked, DICT_TABLE_OP_NORMAL); + + /* Skip the UNDO if we can't find the table or the .ibd file. */ + if (UNIV_UNLIKELY(node->table == NULL)) { + } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) { +close_table: + dict_table_close(node->table, dict_locked, FALSE); + node->table = NULL; + } else { + clust_index = dict_table_get_first_index(node->table); + + if (clust_index != NULL) { + trx_undo_rec_get_row_ref( + ptr, clust_index, &node->ref, node->heap); + + if (!row_undo_search_clust_to_pcur(node)) { + goto close_table; + } + + } else { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: table "); + ut_print_name(stderr, node->trx, TRUE, + node->table->name); + fprintf(stderr, " has no indexes, " + "ignoring the table\n"); + goto close_table; + } + } +} + +/***************************************************************//** +Removes secondary index records. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_rec( +/*========================*/ + undo_node_t* node) /*!< in/out: row undo node */ +{ + dberr_t err = DB_SUCCESS; + dict_index_t* index = node->index; + mem_heap_t* heap; + + heap = mem_heap_create(1024); + + while (index != NULL) { + dtuple_t* entry; + + if (index->type & DICT_FTS) { + dict_table_next_uncorrupted_index(index); + continue; + } + + /* An insert undo record TRX_UNDO_INSERT_REC will + always contain all fields of the index. It does not + matter if any indexes were created afterwards; all + index entries can be reconstructed from the row. */ + entry = row_build_index_entry( + node->row, node->ext, index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record, or a statement is being rolled + back because an error occurred while storing + off-page columns. + + Because secondary index entries are inserted + after the clustered index record, we may + assume that the secondary index record does + not exist. */ + } else { + err = row_undo_ins_remove_sec(index, entry); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto func_exit; + } + } + + mem_heap_empty(heap); + dict_table_next_uncorrupted_index(index); + } + +func_exit: + node->index = index; + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. 
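+For example, rolling back a fresh INSERT physically removes the new
+clustered index record and all of its secondary index entries right away,
+instead of delete marking them and leaving the removal to purge.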
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node) /*!< in: row undo node */ +{ + dberr_t err; + ibool dict_locked; + + ut_ad(node->state == UNDO_NODE_INSERT); + + dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH; + + row_undo_ins_parse_undo_rec(node, dict_locked); + + if (node->table == NULL) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } + + /* Iterate over all the indexes and undo the insert.*/ + + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + + dict_table_skip_corrupt_index(node->index); + + err = row_undo_ins_remove_sec_rec(node); + + if (err == DB_SUCCESS) { + + log_free_check(); + + if (node->table->id == DICT_INDEXES_ID) { + + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + } + + // FIXME: We need to update the dict_index_t::space and + // page number fields too. + err = row_undo_ins_remove_clust_rec(node); + + if (node->table->id == DICT_INDEXES_ID + && !dict_locked) { + + mutex_exit(&dict_sys->mutex); + } + } + + dict_table_close(node->table, dict_locked, FALSE); + + node->table = NULL; + + return(err); +} diff --git a/storage/xtradb/row/row0umod.cc b/storage/xtradb/row/row0umod.cc new file mode 100644 index 00000000000..29252c7834a --- /dev/null +++ b/storage/xtradb/row/row0umod.cc @@ -0,0 +1,1160 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0umod.cc +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#include "row0umod.h" + +#ifdef UNIV_NONINL +#include "row0umod.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "row0log.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "log0log.h" + +/* Considerations on undoing a modify operation. +(1) Undoing a delete marking: all index records should be found. Some of +them may have delete mark already FALSE, if the delete mark operation was +stopped underway, or if the undo operation ended prematurely because of a +system crash. +(2) Undoing an update of a delete unmarked record: the newer version of +an updated secondary index entry should be removed if no prior version +of the clustered index record requires its existence. Otherwise, it should +be delete marked. 
+(3) Undoing an update of a delete marked record. In this kind of update a +delete marked clustered index record was delete unmarked and possibly also +some of its fields were changed. Now, it is possible that the delete marked +version has become obsolete at the time the undo is started. */ + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***********************************************************//** +Undoes a modify in a clustered index record. +@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_clust_low( +/*===================*/ + undo_node_t* node, /*!< in: row undo node */ + ulint** offsets,/*!< out: rec_get_offsets() on the record */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t**rebuilt_old_pk, + /*!< out: row_log_table_get_pk() + before the update, or NULL if + the table is not being rebuilt online or + the PRIMARY KEY definition does not change */ + byte* sys, /*!< out: DB_TRX_ID, DB_ROLL_PTR + for row_log_table_delete() */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in: mtr; must be committed before + latching any further pages */ + ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + dberr_t err; +#ifdef UNIV_DEBUG + ibool success; +#endif /* UNIV_DEBUG */ + + pcur = &node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + +#ifdef UNIV_DEBUG + success = +#endif /* UNIV_DEBUG */ + btr_pcur_restore_position(mode, pcur, mtr); + + ut_ad(success); + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), + btr_cur_get_index(btr_cur)) + == thr_get_trx(thr)->id); + + if (mode != BTR_MODIFY_LEAF + && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) { + *rebuilt_old_pk = row_log_table_get_pk( + btr_cur_get_rec(btr_cur), + btr_cur_get_index(btr_cur), NULL, sys, &heap); + } else { + *rebuilt_old_pk = NULL; + } + + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); + + err = btr_cur_optimistic_update( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); + } else { + big_rec_t* dummy_big_rec; + + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, heap, + &dummy_big_rec, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + + ut_a(!dummy_big_rec); + } + + return(err); +} + +/***********************************************************//** +Purges a clustered index record after undo if possible. +This is attempted when the record was inserted by updating a +delete-marked record and there no longer exist transactions +that would see the delete-marked record. 
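+For example, after rolling back an UPDATE that had re-used a
+delete-marked record, the restored delete-marked version can be removed
+at once when no open read view can see it any more, instead of waiting
+for a later purge pass.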
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_remove_clust_low( +/*==========================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_cur_t* btr_cur; + dberr_t err; + ulint trx_id_offset; + + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + + /* Find out if the record has been purged already + or if we can remove it. */ + + if (!btr_pcur_restore_position(mode, &node->pcur, mtr) + || row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { + + return(DB_SUCCESS); + } + + btr_cur = btr_pcur_get_btr_cur(&node->pcur); + + trx_id_offset = btr_cur_get_index(btr_cur)->trx_id_offset; + + if (!trx_id_offset) { + mem_heap_t* heap = NULL; + ulint trx_id_col; + const ulint* offsets; + ulint len; + + trx_id_col = dict_index_get_sys_col_pos( + btr_cur_get_index(btr_cur), DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + offsets = rec_get_offsets( + btr_cur_get_rec(btr_cur), btr_cur_get_index(btr_cur), + NULL, trx_id_col + 1, &heap); + + trx_id_offset = rec_get_nth_field_offs( + offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + mem_heap_free(heap); + } + + if (trx_read_trx_id(btr_cur_get_rec(btr_cur) + trx_id_offset) + != node->new_trx_id) { + /* The record must have been purged and then replaced + with a different one. */ + return(DB_SUCCESS); + } + + /* We are about to remove an old, delete-marked version of the + record that may have been delete-marked by a different transaction + than the rolling-back one. */ + ut_ad(rec_get_deleted_flag(btr_cur_get_rec(btr_cur), + dict_table_is_comp(node->table))); + + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_delete(btr_cur, 0, mtr) + ? DB_SUCCESS + : DB_FAIL; + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* This operation is analogous to purge, we can free also + inherited externally stored fields */ + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + thr_is_recv(thr) + ? RB_RECOVERY_PURGE_REC + : RB_NONE, mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + + return(err); +} + +/***********************************************************//** +Undoes a modify in a clustered index record. Sets also the node state for the +next round of undo. 
+@return DB_SUCCESS or error code: we may run out of file space */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_clust( +/*===============*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t* pcur; + mtr_t mtr; + dberr_t err; + dict_index_t* index; + bool online; + + ut_ad(thr_get_trx(thr) == node->trx); + ut_ad(node->trx->dict_operation_lock_mode); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED) + || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + log_free_check(); + pcur = &node->pcur; + index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur)); + + mtr_start(&mtr); + + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH); + mtr_s_lock(dict_index_get_lock(index), &mtr); + } + + mem_heap_t* heap = mem_heap_create(1024); + mem_heap_t* offsets_heap = NULL; + ulint* offsets = NULL; + const dtuple_t* rebuilt_old_pk; + byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + + /* Try optimistic processing of the record, keeping changes within + the index page */ + + err = row_undo_mod_clust_low(node, &offsets, &offsets_heap, + heap, &rebuilt_old_pk, sys, + thr, &mtr, online + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF); + + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a pessimistic + descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_clust_low( + node, &offsets, &offsets_heap, + heap, &rebuilt_old_pk, sys, + thr, &mtr, BTR_MODIFY_TREE); + ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE); + } + + /* Online rebuild cannot be initiated while we are holding + dict_operation_lock and index->lock. (It can be aborted.) */ + ut_ad(online || !dict_index_is_online_ddl(index)); + + if (err == DB_SUCCESS && online) { +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + switch (node->rec_type) { + case TRX_UNDO_DEL_MARK_REC: + row_log_table_insert( + btr_pcur_get_rec(pcur), index, offsets); + break; + case TRX_UNDO_UPD_EXIST_REC: + row_log_table_update( + btr_pcur_get_rec(pcur), index, offsets, + rebuilt_old_pk); + break; + case TRX_UNDO_UPD_DEL_REC: + row_log_table_delete( + btr_pcur_get_rec(pcur), index, offsets, sys); + break; + default: + ut_ad(0); + break; + } + } + + ut_ad(rec_get_trx_id(btr_pcur_get_rec(pcur), index) + == node->new_trx_id); + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + mtr_start(&mtr); + + /* It is not necessary to call row_log_table, + because the record is delete-marked and would thus + be omitted from the rebuilt copy of the table. 
*/ + err = row_undo_mod_remove_clust_low( + node, thr, &mtr, BTR_MODIFY_LEAF); + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a + pessimistic descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_TREE); + + ut_ad(err == DB_SUCCESS + || err == DB_OUT_OF_FILE_SPACE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + } + + node->state = UNDO_NODE_FETCH_NEXT; + + trx_undo_rec_release(node->trx, node->undo_no); + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_or_remove_sec_low( +/*====================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry */ + ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has; + dberr_t err = DB_SUCCESS; + mtr_t mtr; + mtr_t mtr_vers; + enum row_search_result search_result; + + log_free_check(); + mtr_start(&mtr); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try(index, entry, 0)) { + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + search_result = row_search_index_entry(index, entry, mode, + &pcur, &mtr); + + switch (UNIV_EXPECT(search_result, ROW_FOUND)) { + case ROW_NOT_FOUND: + /* In crash recovery, the secondary index record may + be missing if the UPDATE did not have time to insert + the secondary index records before the crash. When we + are undoing that UPDATE in crash recovery, the record + may be missing. + + In normal processing, if an update ends in a deadlock + before it has inserted all updated secondary index + records, then the undo will not find those records. */ + goto func_exit; + case ROW_FOUND: + break; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + } + + /* We should remove the index record if no prior version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should delete mark the record. 
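+
+ For example, an older read view may still need to see the
+ pre-UPDATE value of the row through this secondary index entry;
+ in that case the entry is only delete marked here, and purge
+ removes it later.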
*/ + + mtr_start(&mtr_vers); + + success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur), + &mtr_vers); + ut_a(success); + + old_has = row_vers_old_has_index_entry(FALSE, + btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + if (old_has) { + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, &mtr); + ut_ad(err == DB_SUCCESS); + } else { + /* Remove the index record */ + + if (mode != BTR_MODIFY_TREE) { + success = btr_cur_optimistic_delete(btr_cur, 0, &mtr); + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + /* No need to distinguish RB_RECOVERY_PURGE here, + because we are deleting a secondary index record: + the distinction between RB_NORMAL and + RB_RECOVERY_PURGE only matters when deleting a + record that contains externally stored + columns. */ + ut_ad(!dict_index_is_clust(index)); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + RB_NORMAL, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + +func_exit: + btr_pcur_close(&pcur); +func_exit_no_pcur: + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +NOTE that if we updated the fields of a delete-marked secondary index record +so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot +return to the original values because we do not know them. But this should +not cause problems because in row0sel.cc, in queries we always retrieve the +clustered index record or an earlier version of it, if the secondary index +record through which we do the search is delete-marked. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_or_remove_sec( +/*================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + dberr_t err; + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_LEAF); + if (err == DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_TREE); + return(err); +} + +/***********************************************************//** +Delete unmarks a secondary index entry which must be found. It might not be +delete-marked at the moment, but it does not harm to unmark it anyway. We also +need to update the fields of the secondary index record if we updated its +fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. 
+@retval DB_SUCCESS on success +@retval DB_FAIL if BTR_MODIFY_TREE should be tried +@retval DB_OUT_OF_FILE_SPACE when running out of tablespace +@retval DB_DUPLICATE_KEY if the value was missing + and an insert would lead to a duplicate exists */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_unmark_sec_and_undo_update( +/*========================================*/ + ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + upd_t* update; + dberr_t err = DB_SUCCESS; + big_rec_t* dummy_big_rec; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + const ulint flags + = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG; + enum row_search_result search_result; + + ut_ad(trx->id); + + log_free_check(); + mtr_start(&mtr); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try(index, entry, trx->id)) { + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } + + search_result = row_search_index_entry(index, entry, mode, + &pcur, &mtr); + + switch (search_result) { + mem_heap_t* heap; + mem_heap_t* offsets_heap; + ulint* offsets; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + case ROW_NOT_FOUND: + if (*index->name != TEMP_INDEX_PREFIX) { + /* During online secondary index creation, it + is possible that MySQL is waiting for a + meta-data lock upgrade before invoking + ha_innobase::commit_inplace_alter_table() + while this ROLLBACK is executing. InnoDB has + finished building the index, but it does not + yet exist in MySQL. In this case, we suppress + the printout to the error log. */ + fputs("InnoDB: error in sec index entry del undo in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); + putc('\n', stderr); + trx_print(stderr, trx, 0); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + + ib_logf(IB_LOG_LEVEL_WARN, + "record in index %s was not found" + " on rollback, trying to insert", + index->name); + } + + if (btr_cur->up_match >= dict_index_get_n_unique(index) + || btr_cur->low_match >= dict_index_get_n_unique(index)) { + if (*index->name != TEMP_INDEX_PREFIX) { + ib_logf(IB_LOG_LEVEL_WARN, + "record in index %s was not found on" + " rollback, and a duplicate exists", + index->name); + } + err = DB_DUPLICATE_KEY; + break; + } + + /* Insert the missing record that we were trying to + delete-unmark. 
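+
+ (The entry can be missing when the UPDATE being rolled
+ back never got as far as re-inserting it, e.g. after a
+ crash or a deadlock part-way through the UPDATE; the
+ rollback then re-creates the old entry.)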
*/ + big_rec_t* big_rec; + rec_t* insert_rec; + offsets = NULL; + offsets_heap = NULL; + + err = btr_cur_optimistic_insert( + flags, btr_cur, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + ut_ad(!big_rec); + + if (err == DB_FAIL && mode == BTR_MODIFY_TREE) { + err = btr_cur_pessimistic_insert( + flags, btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + /* There are no off-page columns in + secondary indexes. */ + ut_ad(!big_rec); + } + + if (err == DB_SUCCESS) { + page_update_max_trx_id( + btr_cur_get_block(btr_cur), + btr_cur_get_page_zip(btr_cur), + trx->id, &mtr); + } + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + break; + case ROW_FOUND: + err = btr_cur_del_mark_set_sec_rec( + BTR_NO_LOCKING_FLAG, + btr_cur, FALSE, thr, &mtr); + ut_a(err == DB_SUCCESS); + heap = mem_heap_create( + sizeof(upd_t) + + dtuple_get_n_fields(entry) * sizeof(upd_field_t)); + offsets_heap = NULL; + offsets = rec_get_offsets( + btr_cur_get_rec(btr_cur), + index, NULL, ULINT_UNDEFINED, &offsets_heap); + update = row_upd_build_sec_rec_difference_binary( + btr_cur_get_rec(btr_cur), index, offsets, entry, heap); + if (upd_get_n_fields(update) == 0) { + + /* Do nothing */ + + } else if (mode != BTR_MODIFY_TREE) { + /* Try an optimistic updating of the record, keeping + changes within the page */ + + /* TODO: pass offsets, not &offsets */ + err = btr_cur_optimistic_update( + flags, btr_cur, &offsets, &offsets_heap, + update, 0, thr, thr_get_trx(thr)->id, &mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + err = btr_cur_pessimistic_update( + flags, btr_cur, &offsets, &offsets_heap, + heap, &dummy_big_rec, + update, 0, thr, thr_get_trx(thr)->id, &mtr); + ut_a(!dummy_big_rec); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + } + + btr_pcur_close(&pcur); +func_exit_no_pcur: + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Flags a secondary index corrupted. */ +static __attribute__((nonnull)) +void +row_undo_mod_sec_flag_corrupted( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_index_t* index) /*!< in: secondary index */ +{ + ut_ad(!dict_index_is_clust(index)); + + switch (trx->dict_operation_lock_mode) { + case RW_S_LATCH: + /* Because row_undo() is holding an S-latch + on the data dictionary during normal rollback, + we can only mark the index corrupted in the + data dictionary cache. TODO: fix this somehow.*/ + mutex_enter(&dict_sys->mutex); + dict_set_corrupted_index_cache_only(index, index->table); + mutex_exit(&dict_sys->mutex); + break; + default: + ut_ad(0); + /* fall through */ + case RW_X_LATCH: + /* This should be the rollback of a data dictionary + transaction. */ + dict_set_corrupted(index, trx, "rollback"); + } +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_DEL. 
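+Each secondary index entry is rebuilt from node->row and then delete
+marked or removed by row_undo_mod_del_mark_or_remove_sec().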
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_upd_del_sec( +/*=====================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dberr_t err = DB_SUCCESS; + + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node->undo_row); + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + dict_index_t* index = node->index; + dtuple_t* entry; + + if (index->type & DICT_FTS) { + dict_table_next_uncorrupted_index(node->index); + continue; + } + + /* During online index creation, + HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should + guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ + entry = row_build_index_entry( + node->row, node->ext, index, heap); + + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record. Because secondary index entries + are inserted after the clustered index record, + we may assume that the secondary index record + does not exist. However, this situation may + only occur during the rollback of incomplete + transactions. */ + ut_a(thr_is_recv(thr)); + } else { + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + break; + } + } + + mem_heap_empty(heap); + dict_table_next_uncorrupted_index(node->index); + } + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is DEL_MARK. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_sec( +/*======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dberr_t err = DB_SUCCESS; + + ut_ad(!node->undo_row); + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + dict_index_t* index = node->index; + dtuple_t* entry; + + if (index->type == DICT_FTS) { + dict_table_next_uncorrupted_index(node->index); + continue; + } + + /* During online index creation, + HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should + guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ + entry = row_build_index_entry( + node->row, node->ext, index, heap); + + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err == DB_DUPLICATE_KEY) { + row_undo_mod_sec_flag_corrupted( + thr_get_trx(thr), index); + err = DB_SUCCESS; + /* Do not return any error to the caller. The + duplicate will be reported by ALTER TABLE or + CREATE UNIQUE INDEX. Unfortunately we cannot + report the duplicate key value to the DDL + thread, because the altered_table object is + private to its call stack. 
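+Flagging the index corrupted above prevents the inconsistent index
+from being used until it is dropped or rebuilt.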
*/ + } else if (err != DB_SUCCESS) { + break; + } + + mem_heap_empty(heap); + dict_table_next_uncorrupted_index(node->index); + } + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_EXIST. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_upd_exist_sec( +/*=======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dberr_t err = DB_SUCCESS; + + if (node->index == NULL + || ((node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) { + /* No change in secondary indexes */ + + return(err); + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + dict_index_t* index = node->index; + dtuple_t* entry; + + if (index->type == DICT_FTS + || !row_upd_changes_ord_field_binary( + index, node->update, thr, node->row, node->ext)) { + dict_table_next_uncorrupted_index(node->index); + continue; + } + + /* Build the newest version of the index entry */ + entry = row_build_index_entry(node->row, node->ext, + index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The server must have crashed in + row_upd_clust_rec_by_insert() before + the updated externally stored columns (BLOBs) + of the new clustered index entry were written. */ + + /* The table must be in DYNAMIC or COMPRESSED + format. REDUNDANT and COMPACT formats + store a local 768-byte prefix of each + externally stored column. */ + ut_a(dict_table_get_format(index->table) + >= UNIV_FORMAT_B); + + /* This is only legitimate when + rolling back an incomplete transaction + after crash recovery. */ + ut_a(thr_get_trx(thr)->is_recovered); + + /* The server must have crashed before + completing the insert of the new + clustered index entry and before + inserting to the secondary indexes. + Because node->row was not yet written + to this index, we can ignore it. But + we must restore node->undo_row. */ + } else { + /* NOTE that if we updated the fields of a + delete-marked secondary index record so that + alphabetically they stayed the same, e.g., + 'abc' -> 'aBc', we cannot return to the + original values because we do not know them. + But this should not cause problems because + in row0sel.cc, in queries we always retrieve + the clustered index record or an earlier + version of it, if the secondary index record + through which we do the search is + delete-marked. */ + + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + if (err != DB_SUCCESS) { + break; + } + } + + mem_heap_empty(heap); + /* We may have to update the delete mark in the + secondary index record of the previous version of + the row. We also need to update the fields of + the secondary index record if we updated its fields + but alphabetically they stayed the same, e.g., + 'abc' -> 'aBc'. 
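+The entry below is therefore rebuilt from node->undo_row, the row image
+from before the update that is being rolled back.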
*/ + entry = row_build_index_entry(node->undo_row, + node->undo_ext, + index, heap); + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err == DB_DUPLICATE_KEY) { + row_undo_mod_sec_flag_corrupted( + thr_get_trx(thr), index); + err = DB_SUCCESS; + } else if (err != DB_SUCCESS) { + break; + } + + mem_heap_empty(heap); + dict_table_next_uncorrupted_index(node->index); + } + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Parses the row reference and other info in a modify undo log record. */ +static __attribute__((nonnull)) +void +row_undo_mod_parse_undo_rec( +/*========================*/ + undo_node_t* node, /*!< in: row undo node */ + ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */ +{ + dict_index_t* clust_index; + byte* ptr; + undo_no_t undo_no; + table_id_t table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + bool dummy_extern; + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + node->rec_type = type; + + node->table = dict_table_open_on_id( + table_id, dict_locked, DICT_TABLE_OP_NORMAL); + + /* TODO: other fixes associated with DROP TABLE + rollback in the + same table by another user */ + + if (node->table == NULL) { + /* Table was dropped */ + return; + } + + if (node->table->ibd_file_missing) { + dict_table_close(node->table, dict_locked, FALSE); + + /* We skip undo operations to missing .ibd files */ + node->table = NULL; + + return; + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, node->trx, + node->heap, &(node->update)); + node->new_trx_id = trx_id; + node->cmpl_info = cmpl_info; + + if (!row_undo_search_clust_to_pcur(node)) { + + dict_table_close(node->table, dict_locked, FALSE); + + node->table = NULL; + } +} + +/***********************************************************//** +Undoes a modify operation on a row of a table. 
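+The secondary indexes are processed first; the clustered index record is
+restored last, by row_undo_mod_clust().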
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ibool dict_locked; + + ut_ad(node && thr); + ut_ad(node->state == UNDO_NODE_MODIFY); + + dict_locked = thr_get_trx(thr)->dict_operation_lock_mode == RW_X_LATCH; + + ut_ad(thr_get_trx(thr) == node->trx); + + row_undo_mod_parse_undo_rec(node, dict_locked); + + if (node->table == NULL) { + /* It is already undone, or will be undone by another query + thread, or table was dropped */ + + trx_undo_rec_release(node->trx, node->undo_no); + node->state = UNDO_NODE_FETCH_NEXT; + + return(DB_SUCCESS); + } + + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + + /* Skip all corrupted secondary index */ + dict_table_skip_corrupt_index(node->index); + + switch (node->rec_type) { + case TRX_UNDO_UPD_EXIST_REC: + err = row_undo_mod_upd_exist_sec(node, thr); + break; + case TRX_UNDO_DEL_MARK_REC: + err = row_undo_mod_del_mark_sec(node, thr); + break; + case TRX_UNDO_UPD_DEL_REC: + err = row_undo_mod_upd_del_sec(node, thr); + break; + default: + ut_error; + err = DB_ERROR; + } + + if (err == DB_SUCCESS) { + + err = row_undo_mod_clust(node, thr); + } + + dict_table_close(node->table, dict_locked, FALSE); + + node->table = NULL; + + return(err); +} diff --git a/storage/xtradb/row/row0undo.cc b/storage/xtradb/row/row0undo.cc new file mode 100644 index 00000000000..9977a1e8f04 --- /dev/null +++ b/storage/xtradb/row/row0undo.cc @@ -0,0 +1,375 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0undo.cc +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#include "row0undo.h" + +#ifdef UNIV_NONINL +#include "row0undo.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0uins.h" +#include "row0umod.h" +#include "row0upd.h" +#include "row0mysql.h" +#include "srv0srv.h" + +/* How to undo row operations? +(1) For an insert, we have stored a prefix of the clustered index record +in the undo log. Using it, we look for the clustered record, and using +that we look for the records in the secondary indexes. The insert operation +may have been left incomplete, if the database crashed, for example. 
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to the undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that the row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of the B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked; set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither a secondary index field nor the clustered index
+ord field, then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This should be no problem.
+(2) If it updates a secondary index ord field but not the clustered one:
+then in the secondary index there are delete marked records, which differ
+in an ord field. No problem.
+(3) If it updates the clustered ord field but not the secondary, and the
+secondary index is unique: then the record in the secondary index is just
+updated at the clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge?
A record can be removed from the clustered index +if its linked list becomes empty, i.e., the row has been marked deleted +and its roll ptr points to the record in the undo log we are going through, +doing the purge. Similarly, during a rollback, a record can be removed +if the stored roll ptr in the undo log points to a trx already (being) purged, +or if the roll ptr is NULL, i.e., it was a fresh insert. */ + +/********************************************************************//** +Creates a row undo node to a query graph. +@return own: undo node */ +UNIV_INTERN +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + undo_node_t* undo; + + ut_ad(trx && parent && heap); + + undo = static_cast<undo_node_t*>( + mem_heap_alloc(heap, sizeof(undo_node_t))); + + undo->common.type = QUE_NODE_UNDO; + undo->common.parent = parent; + + undo->state = UNDO_NODE_FETCH_NEXT; + undo->trx = trx; + + btr_pcur_init(&(undo->pcur)); + + undo->heap = mem_heap_create(256); + + return(undo); +} + +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. +@return TRUE if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +UNIV_INTERN +ibool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node) /*!< in: row undo node */ +{ + dict_index_t* clust_index; + ibool found; + mtr_t mtr; + ibool ret; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + mtr_start(&mtr); + + clust_index = dict_table_get_first_index(node->table); + + found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF, + node->table, node->ref, &mtr); + + rec = btr_pcur_get_rec(&(node->pcur)); + + offsets = rec_get_offsets(rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + + if (!found || node->roll_ptr + != row_get_rec_roll_ptr(rec, clust_index, offsets)) { + + /* We must remove the reservation on the undo log record + BEFORE releasing the latch on the clustered index page: this + is to make sure that some thread will eventually undo the + modification corresponding to node->roll_ptr. */ + + /* fputs("--------------------undoing a previous version\n", + stderr); */ + + ret = FALSE; + } else { + row_ext_t** ext; + + if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) { + /* In DYNAMIC or COMPRESSED format, there is + no prefix of externally stored columns in the + clustered index record. Build a cache of + column prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. 
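+The prefix needed for rebuilding secondary index entries can then be
+read directly from the clustered index record itself.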
*/ + ext = NULL; + node->ext = NULL; + } + + node->row = row_build(ROW_COPY_DATA, clust_index, rec, + offsets, NULL, + NULL, NULL, ext, node->heap); + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + node->undo_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->undo_row, &node->undo_ext, + clust_index, node->update, node->heap); + } else { + node->undo_row = NULL; + node->undo_ext = NULL; + } + + btr_pcur_store_position(&(node->pcur), &mtr); + + ret = TRUE; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(ret); +} + +/***********************************************************//** +Fetches an undo log record and does the undo for the recorded operation. +If none left, or a partial rollback completed, returns control to the +parent node, which is always a query thread node. +@return DB_SUCCESS if operation successfully completed, else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_undo( +/*=====*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + trx_t* trx; + roll_ptr_t roll_ptr; + ibool locked_data_dict; + + ut_ad(node && thr); + + trx = node->trx; + + if (node->state == UNDO_NODE_FETCH_NEXT) { + + node->undo_rec = trx_roll_pop_top_rec_of_trx(trx, + trx->roll_limit, + &roll_ptr, + node->heap); + if (!node->undo_rec) { + /* Rollback completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + } + + /* Prevent DROP TABLE etc. while we are rolling back this row. + If we are doing a TABLE CREATE or some other dictionary operation, + then we already have dict_operation_lock locked in x-mode. Do not + try to lock again, because that would cause a hang. */ + + locked_data_dict = (trx->dict_operation_lock_mode == 0); + + if (locked_data_dict) { + + row_mysql_freeze_data_dictionary(trx); + } + + if (node->state == UNDO_NODE_INSERT) { + + err = row_undo_ins(node); + + node->state = UNDO_NODE_FETCH_NEXT; + } else { + ut_ad(node->state == UNDO_NODE_MODIFY); + err = row_undo_mod(node, thr); + } + + if (locked_data_dict) { + + row_mysql_unfreeze_data_dictionary(trx); + } + + /* Do some cleanup */ + btr_pcur_close(&(node->pcur)); + + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(err); +} + +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. 
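+It runs one undo node, stores any error in trx->error_state, and treats
+DB_OUT_OF_FILE_SPACE as fatal.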
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + undo_node_t* node; + trx_t* trx; + + ut_ad(thr); + + srv_inc_activity_count(); + + trx = thr_get_trx(thr); + + node = static_cast<undo_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); + + err = row_undo(node, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* SQL error detected */ + + fprintf(stderr, "InnoDB: Fatal error (%s) in rollback.\n", + ut_strerr(err)); + + if (err == DB_OUT_OF_FILE_SPACE) { + fprintf(stderr, + "InnoDB: Out of tablespace.\n" + "InnoDB: Consider increasing" + " your tablespace.\n"); + + exit(1); + } + + ut_error; + + return(NULL); + } + + return(thr); +} diff --git a/storage/xtradb/row/row0upd.cc b/storage/xtradb/row/row0upd.cc new file mode 100644 index 00000000000..0f189a52789 --- /dev/null +++ b/storage/xtradb/row/row0upd.cc @@ -0,0 +1,2711 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0upd.cc +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "m_string.h" /* for my_sys.h */ +#include "my_sys.h" /* DEBUG_SYNC_C */ +#include "row0upd.h" + +#ifdef UNIV_NONINL +#include "row0upd.ic" +#endif + +#include "ha_prototypes.h" +#include "dict0dict.h" +#include "trx0undo.h" +#include "rem0rec.h" +#ifndef UNIV_HOTBACKUP +#include "dict0boot.h" +#include "dict0crea.h" +#include "mach0data.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0ins.h" +#include "row0log.h" +#include "row0row.h" +#include "row0sel.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "pars0sym.h" +#include "eval0eval.h" +#include "buf0lru.h" +#include <algorithm> + +/* What kind of latch and lock can we assume when the control comes to + ------------------------------------------------------------------- +an update node? +-------------- +Efficiency of massive updates would require keeping an x-latch on a +clustered index page through many updates, and not setting an explicit +x-lock on clustered index records, as they anyway will get an implicit +x-lock when they are updated. A problem is that the read nodes in the +graph should know that they must keep the latch when passing the control +up to the update node, and not set any record lock on the record which +will be updated. Another problem occurs if the execution is stopped, +as the kernel switches to another query thread, or the transaction must +wait for a lock. 
Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+	Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for the previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged will the index records be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on a page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way of performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks, and we can assume
+that the index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+	dtuple_t*	entry,	/*!< in: old value of index entry */
+	dict_index_t*	index,	/*!< in: index of entry */
+	const upd_t*	update,	/*!< in: update vector for the row */
+	ulint		n);	/*!< in: how many first fields to check */
+
+
+/*********************************************************************//**
+Checks if the index is currently mentioned as a referenced index in a foreign
+key constraint.
+
+NOTE that since we do not hold dict_operation_lock when leaving the
+function, it may be that the referencing table has been dropped when
+we leave this function: this function is only for heuristic use!
+ +@return TRUE if referenced */ +static +ibool +row_upd_index_is_referenced( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + dict_table_t* table = index->table; + ibool froze_data_dict = FALSE; + ibool is_referenced = FALSE; + + if (table->referenced_set.empty()) { + return(FALSE); + } + + if (trx->dict_operation_lock_mode == 0) { + row_mysql_freeze_data_dictionary(trx); + froze_data_dict = TRUE; + } + + dict_foreign_set::iterator it + = std::find_if(table->referenced_set.begin(), + table->referenced_set.end(), + dict_foreign_with_index(index)); + + is_referenced = (it != table->referenced_set.end()); + + if (froze_data_dict) { + row_mysql_unfreeze_data_dictionary(trx); + } + + return(is_referenced); +} + +/*********************************************************************//** +Checks if possible foreign key constraints hold after a delete of the record +under pcur. + +NOTE that this function will temporarily commit mtr and lose the +pcur position! + +@return DB_SUCCESS or an error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd_check_references_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + trx_t* trx; + const rec_t* rec; + ulint n_ext; + dberr_t err; + ibool got_s_lock = FALSE; + + if (table->referenced_set.empty()) { + + return(DB_SUCCESS); + } + + trx = thr_get_trx(thr); + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(rec, index, offsets, &n_ext, heap); + + mtr_commit(mtr); + + DEBUG_SYNC_C("foreign_constraint_check_for_update"); + + mtr_start(mtr); + + if (trx->dict_operation_lock_mode == 0) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + +run_again: + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. */ + + if (foreign->referenced_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + dict_table_t* foreign_table = foreign->foreign_table; + + dict_table_t* ref_table = NULL; + + if (foreign_table == NULL) { + + ref_table = dict_table_open_on_name( + foreign->foreign_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); + } + + if (foreign_table) { + os_inc_counter(dict_sys->mutex, + foreign_table + ->n_foreign_key_checks_running); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects 'foreign' from + being dropped while the check is running. 
*/ + + err = row_ins_check_foreign_constraint( + FALSE, foreign, table, entry, thr); + + if (foreign_table) { + os_dec_counter(dict_sys->mutex, + foreign_table + ->n_foreign_key_checks_running); + } + + if (ref_table != NULL) { + dict_table_close(ref_table, FALSE, FALSE); + } + + /* Some table foreign key dropped, try again */ + if (err == DB_DICT_CHANGED) { + goto run_again; + } else if (err != DB_SUCCESS) { + goto func_exit; + } + } + } + + err = DB_SUCCESS; + +func_exit: + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + mem_heap_free(heap); + + DEBUG_SYNC_C("foreign_constraint_check_for_update_done"); + + return(err); +} + +/*********************************************************************//** +Creates an update node for a query graph. +@return own: update node */ +UNIV_INTERN +upd_node_t* +upd_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + upd_node_t* node; + + node = static_cast<upd_node_t*>( + mem_heap_alloc(heap, sizeof(upd_node_t))); + + node->common.type = QUE_NODE_UPDATE; + + node->state = UPD_NODE_UPDATE_CLUSTERED; + node->in_mysql_interface = FALSE; + + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + node->index = NULL; + node->update = NULL; + + node->foreign = NULL; + node->cascade_heap = NULL; + node->cascade_node = NULL; + + node->select = NULL; + + node->heap = mem_heap_create(128); + node->magic_n = UPD_NODE_MAGIC_N; + + node->cmpl_info = 0; + + return(node); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record in database +recovery. */ +UNIV_INTERN +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint pos, /*!< in: TRX_ID position in rec */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (page_zip) { + page_zip_write_trx_id_and_roll_ptr( + page_zip, rec, offsets, pos, trx_id, roll_ptr); + } else { + byte* field; + ulint len; + + field = rec_get_nth_field(rec, offsets, pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + trx_write_trx_id(field, trx_id); + trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr); + } +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Sets the trx id or roll ptr field of a clustered index entry. 
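+For example, a typical pair of calls when stamping a rebuilt clustered
+index entry might look like this (an illustrative sketch only; trx and
+roll_ptr stand for whatever transaction id and roll pointer are in
+scope, and this is not a verbatim call site from this file):
+
+	row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+	row_upd_index_entry_sys_field(entry, index, DATA_ROLL_PTR, roll_ptr);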
*/ +UNIV_INTERN +void +row_upd_index_entry_sys_field( +/*==========================*/ + dtuple_t* entry, /*!< in/out: index entry, where the memory + buffers for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */ + ib_uint64_t val) /*!< in: value to write */ +{ + dfield_t* dfield; + byte* field; + ulint pos; + + ut_ad(dict_index_is_clust(index)); + + pos = dict_index_get_sys_col_pos(index, type); + + dfield = dtuple_get_nth_field(entry, pos); + field = static_cast<byte*>(dfield_get_data(dfield)); + + if (type == DATA_TRX_ID) { + trx_write_trx_id(field, val); + } else { + ut_ad(type == DATA_ROLL_PTR); + trx_write_roll_ptr(field, val); + } +} + +/***********************************************************//** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. +@return TRUE if the update changes the size of some field in index or +the field is external in rec or update */ +UNIV_INTERN +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update) /*!< in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint old_len; + ulint new_len; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + new_len = dfield_get_len(new_val); + + if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) { + /* A bug fixed on Dec 31st, 2004: we looked at the + SQL NULL size from the wrong field! We may backport + this fix also to 4.0. The merge to 5.0 will be made + manually immediately after we commit this to 4.1. */ + + new_len = dict_col_get_sql_null_size( + dict_index_get_nth_col(index, + upd_field->field_no), + 0); + } + + old_len = rec_offs_nth_size(offsets, upd_field->field_no); + + if (rec_offs_comp(offsets) + && rec_offs_nth_sql_null(offsets, + upd_field->field_no)) { + /* Note that in the compact table format, for a + variable length field, an SQL NULL will use zero + bytes in the offset array at the start of the physical + record, but a zero-length value (empty string) will + use one byte! Thus, we cannot use update-in-place + if we update an SQL NULL varchar to an empty string! */ + + old_len = UNIV_SQL_NULL; + } + + if (dfield_is_ext(new_val) || old_len != new_len + || rec_offs_nth_extern(offsets, upd_field->field_no)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***********************************************************//** +Returns true if row update contains disowned external fields. +@return true if the update contains disowned external fields. 
*/ +UNIV_INTERN +bool +row_upd_changes_disowned_external( +/*==============================*/ + const upd_t* update) /*!< in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint new_len; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const byte* field_ref; + + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + new_len = dfield_get_len(new_val); + + if (!dfield_is_ext(new_val)) { + continue; + } + + ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE); + + field_ref = static_cast<const byte*>(dfield_get_data(new_val)) + + new_len - BTR_EXTERN_FIELD_REF_SIZE; + + if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) { + return(true); + } + } + + return(false); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Replaces the new column values stored in the update vector to the +record given. No field size changes are allowed. This function is +usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). */ +UNIV_INTERN +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /*!< in/out: record where replaced */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + page_zip_des_t* page_zip)/*!< in: compressed page with enough space + available, or NULL */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (rec_offs_comp(offsets)) { + rec_set_info_bits_new(rec, update->info_bits); + } else { + rec_set_info_bits_old(rec, update->info_bits); + } + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { +#ifdef UNIV_BLOB_DEBUG + btr_blob_dbg_t b; + const byte* field_ref = NULL; +#endif /* UNIV_BLOB_DEBUG */ + + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + ut_ad(!dfield_is_ext(new_val) == + !rec_offs_nth_extern(offsets, upd_field->field_no)); +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + ulint len; + field_ref = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + field_ref += len - BTR_EXTERN_FIELD_REF_SIZE; + + b.ref_page_no = page_get_page_no(page_align(rec)); + b.ref_heap_no = page_rec_get_heap_no(rec); + b.ref_field_no = i; + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + ut_a(b.ref_field_no >= index->n_uniq); + btr_blob_dbg_rbt_delete(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ + + rec_set_nth_field(rec, offsets, upd_field->field_no, + dfield_get_data(new_val), + dfield_get_len(new_val)); + +#ifdef UNIV_BLOB_DEBUG + if (dfield_is_ext(new_val)) { + b.blob_page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG); + b.del = rec_get_deleted_flag( + rec, rec_offs_comp(offsets)); + + btr_blob_dbg_rbt_insert(index, &b, "upd_in_place"); + } +#endif /* UNIV_BLOB_DEBUG */ + } + + if (page_zip) { + page_zip_write_rec(page_zip, rec, index, offsets, 0); + } +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Writes into the redo log the values of trx id and roll ptr and enough info +to 
determine their positions within a clustered index record. +@return new pointer to mlog */ +UNIV_INTERN +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + dict_index_t* index, /*!< in: clustered index */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ + byte* log_ptr,/*!< pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr __attribute__((unused))) /*!< in: mtr */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr); + + log_ptr += mach_write_compressed(log_ptr, + dict_index_get_sys_col_pos( + index, DATA_TRX_ID)); + + trx_write_roll_ptr(log_ptr, roll_ptr); + log_ptr += DATA_ROLL_PTR_LEN; + + log_ptr += mach_ull_write_compressed(log_ptr, trx_id); + + return(log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Parses the log data of system field values. +@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_parse_sys_vals( +/*===================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint* pos, /*!< out: TRX_ID position in record */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr)/*!< out: roll ptr */ +{ + ptr = mach_parse_compressed(ptr, end_ptr, pos); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + DATA_ROLL_PTR_LEN) { + + return(NULL); + } + + *roll_ptr = trx_read_roll_ptr(ptr); + ptr += DATA_ROLL_PTR_LEN; + + ptr = mach_ull_parse_compressed(ptr, end_ptr, trx_id); + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Writes to the redo log the new values of the fields occurring in the index. */ +UNIV_INTERN +void +row_upd_index_write_log( +/*====================*/ + const upd_t* update, /*!< in: update vector */ + byte* log_ptr,/*!< in: pointer to mlog buffer: must + contain at least MLOG_BUF_MARGIN bytes + of free space; the buffer is closed + within this function */ + mtr_t* mtr) /*!< in: mtr into whose log to write */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint len; + ulint n_fields; + byte* buf_end; + ulint i; + + n_fields = upd_get_n_fields(update); + + buf_end = log_ptr + MLOG_BUF_MARGIN; + + mach_write_to_1(log_ptr, update->info_bits); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, n_fields); + + for (i = 0; i < n_fields; i++) { + +#if MLOG_BUF_MARGIN <= 30 +# error "MLOG_BUF_MARGIN <= 30" +#endif + + if (log_ptr + 30 > buf_end) { + mlog_close(mtr, log_ptr); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + + len = dfield_get_len(new_val); + + log_ptr += mach_write_compressed(log_ptr, upd_field->field_no); + log_ptr += mach_write_compressed(log_ptr, len); + + if (len != UNIV_SQL_NULL) { + if (log_ptr + len < buf_end) { + memcpy(log_ptr, dfield_get_data(new_val), len); + + log_ptr += len; + } else { + mlog_close(mtr, log_ptr); + + mlog_catenate_string( + mtr, + static_cast<byte*>( + dfield_get_data(new_val)), + len); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + } + } + + mlog_close(mtr, log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Parses the log data written by row_upd_index_write_log. 
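+The encoding written by that function is: one byte of info bits, the
+compressed number of fields, and, per field, the compressed field
+number, the compressed length, and the data bytes when the length is
+not UNIV_SQL_NULL.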
+@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_index_parse( +/*================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + mem_heap_t* heap, /*!< in: memory heap where update vector is + built */ + upd_t** update_out)/*!< out: update vector */ +{ + upd_t* update; + upd_field_t* upd_field; + dfield_t* new_val; + ulint len; + ulint n_fields; + ulint info_bits; + ulint i; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_bits = mach_read_from_1(ptr); + ptr++; + ptr = mach_parse_compressed(ptr, end_ptr, &n_fields); + + if (ptr == NULL) { + + return(NULL); + } + + update = upd_create(n_fields, heap); + update->info_bits = info_bits; + + for (i = 0; i < n_fields; i++) { + ulint field_no; + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + + ptr = mach_parse_compressed(ptr, end_ptr, &field_no); + + if (ptr == NULL) { + + return(NULL); + } + + upd_field->field_no = field_no; + + ptr = mach_parse_compressed(ptr, end_ptr, &len); + + if (ptr == NULL) { + + return(NULL); + } + + if (len != UNIV_SQL_NULL) { + + if (end_ptr < ptr + len) { + + return(NULL); + } + + dfield_set_data(new_val, + mem_heap_dup(heap, ptr, len), len); + ptr += len; + } else { + dfield_set_null(new_val); + } + } + + *update_out = update; + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! +@return own: update vector of differing fields */ +UNIV_INTERN +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const dtuple_t* entry, /*!< in: entry to insert */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint i; + + /* This function is used only for a secondary index */ + ut_a(!dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry)); + ut_ad(!rec_offs_any_extern(offsets)); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE that it may be that len != dfield_get_len(dfield) if we + are updating in a character set and collation where strings of + different length can be equal in an alphabetical comparison, + and also in the case where we have a column prefix index + and the last characters in the index field are spaces; the + latter case probably caused the assertion failures reported at + row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */ + + /* NOTE: we compare the fields as binary strings! 
+ (No collation) */ + + if (!dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, NULL); + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + +/***************************************************************//** +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! +@return own: update vector of differing fields, excluding roll ptr and +trx id */ +UNIV_INTERN +const upd_t* +row_upd_build_difference_binary( +/*============================*/ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* entry, /*!< in: entry to insert */ + const rec_t* rec, /*!< in: clustered index record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */ + bool no_sys, /*!< in: skip the system columns + DB_TRX_ID and DB_ROLL_PTR */ + trx_t* trx, /*!< in: transaction */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint trx_id_pos; + ulint i; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + /* This function is used only for a clustered index */ + ut_a(dict_index_is_clust(index)); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + ut_ad(dict_index_get_sys_col_pos(index, DATA_ROLL_PTR) + == trx_id_pos + 1); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + + if (no_sys && (i == trx_id_pos || i == trx_id_pos + 1)) { + + continue; + } + + if (!dfield_is_ext(dfield) + != !rec_offs_nth_extern(offsets, i) + || !dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, trx); + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + +/***********************************************************//** +Fetch a prefix of an externally stored column. This is similar +to row_ext_lookup(), but the row_ext_t holds the old values +of the column and must not be poisoned with the new values. +@return BLOB prefix */ +static +byte* +row_upd_ext_fetch( +/*==============*/ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part */ + ulint local_len, /*!< in: length of data, in bytes */ + ulint zip_size, /*!< in: nonzero=compressed BLOB + page size, zero for uncompressed + BLOBs */ + ulint* len, /*!< in: length of prefix to fetch; + out: fetched length of the prefix */ + mem_heap_t* heap) /*!< in: heap where to allocate */ +{ + byte* buf = static_cast<byte*>(mem_heap_alloc(heap, *len)); + + *len = btr_copy_externally_stored_field_prefix( + buf, *len, zip_size, data, local_len); + + /* We should never update records containing a half-deleted BLOB. 
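+The assertion below enforces this: a fetched prefix length of zero would
+mean that the BLOB pages had already been freed.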
*/ + ut_a(*len); + + return(buf); +} + +/***********************************************************//** +Replaces the new column value stored in the update vector in +the given index entry field. */ +static +void +row_upd_index_replace_new_col_val( +/*==============================*/ + dfield_t* dfield, /*!< in/out: data field + of the index entry */ + const dict_field_t* field, /*!< in: index field */ + const dict_col_t* col, /*!< in: field->col */ + const upd_field_t* uf, /*!< in: update field */ + mem_heap_t* heap, /*!< in: memory heap for allocating + and copying the new value */ + ulint zip_size)/*!< in: compressed page + size of the table, or 0 */ +{ + ulint len; + const byte* data; + + dfield_copy_data(dfield, &uf->new_val); + + if (dfield_is_null(dfield)) { + return; + } + + len = dfield_get_len(dfield); + data = static_cast<const byte*>(dfield_get_data(dfield)); + + if (field->prefix_len > 0) { + ibool fetch_ext = dfield_is_ext(dfield) + && len < (ulint) field->prefix_len + + BTR_EXTERN_FIELD_REF_SIZE; + + if (fetch_ext) { + ulint l = len; + + len = field->prefix_len; + + data = row_upd_ext_fetch(data, l, zip_size, + &len, heap); + } + + len = dtype_get_at_most_n_mbchars(col->prtype, + col->mbminmaxlen, + field->prefix_len, len, + (const char*) data); + + dfield_set_data(dfield, data, len); + + if (!fetch_ext) { + dfield_dup(dfield, heap); + } + + return; + } + + switch (uf->orig_len) { + byte* buf; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. In the undo log, + InnoDB writes a longer prefix of externally + stored columns, so that column prefixes + in secondary indexes can be reconstructed. */ + dfield_set_data(dfield, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(dfield); + /* fall through */ + case 0: + dfield_dup(dfield, heap); + break; + default: + /* Reconstruct the original locally + stored part of the column. The data + will have to be copied. */ + ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE); + buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len)); + + /* Copy the locally stored prefix. */ + memcpy(buf, data, + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE); + + /* Copy the BLOB pointer. */ + memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + dfield_set_data(dfield, buf, uf->orig_len); + dfield_set_ext(dfield); + break; + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals_index_pos( +/*=========================================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the index so + that the field number in an upd_field is the + index position */ + ibool order_only, + /*!< in: if TRUE, limit the replacement to + ordering fields of index; note that this + does not work for non-clustered indexes. 
*/ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ +{ + ulint i; + ulint n_fields; + const ulint zip_size = dict_table_zip_size(index->table); + + ut_ad(index); + + dtuple_set_info_bits(entry, update->info_bits); + + if (order_only) { + n_fields = dict_index_get_n_unique(index); + } else { + n_fields = dict_index_get_n_fields(index); + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + uf = upd_get_field_by_field_no(update, i); + + if (uf) { + row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size); + } + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the + CLUSTERED index so that the field number in + an upd_field is the clustered index position */ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ +{ + ulint i; + const dict_index_t* clust_index + = dict_table_get_first_index(index->table); + const ulint zip_size + = dict_table_zip_size(index->table); + + dtuple_set_info_bits(entry, update->info_bits); + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + uf = upd_get_field_by_field_no( + update, dict_col_get_clust_pos(col, clust_index)); + + if (uf) { + row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size); + } + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector. 
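+ Informal sketch of the loop below: for each table column, look
+ up its position in the clustered index, copy the matching
+ upd_field->new_val into the row, and remember every externally
+ stored column that is part of some ordering, so that
+ row_ext_create() can cache its prefix afterwards.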
*/ +UNIV_INTERN +void +row_upd_replace( +/*============*/ + dtuple_t* row, /*!< in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /*!< out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: an update vector built for the + clustered index */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint col_no; + ulint i; + ulint n_cols; + ulint n_ext_cols; + ulint* ext_cols; + const dict_table_t* table; + + ut_ad(row); + ut_ad(ext); + ut_ad(index); + ut_ad(dict_index_is_clust(index)); + ut_ad(update); + ut_ad(heap); + + n_cols = dtuple_get_n_fields(row); + table = index->table; + ut_ad(n_cols == dict_table_get_n_cols(table)); + + ext_cols = static_cast<ulint*>( + mem_heap_alloc(heap, n_cols * sizeof *ext_cols)); + + n_ext_cols = 0; + + dtuple_set_info_bits(row, update->info_bits); + + for (col_no = 0; col_no < n_cols; col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + const ulint clust_pos + = dict_col_get_clust_pos(col, index); + dfield_t* dfield; + + if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) { + + continue; + } + + dfield = dtuple_get_nth_field(row, col_no); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + + if (upd_field->field_no != clust_pos) { + + continue; + } + + dfield_copy_data(dfield, &upd_field->new_val); + break; + } + + if (dfield_is_ext(dfield) && col->ord_part) { + ext_cols[n_ext_cols++] = col_no; + } + } + + if (n_ext_cols) { + *ext = row_ext_create(n_ext_cols, ext_cols, table->flags, row, + heap); + } else { + *ext = NULL; + } +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. + +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +UNIV_INTERN +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! 
+ */
*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext) /*!< NULL, or prefixes of the externally + stored columns in the old row */ +{ + ulint n_unique; + ulint i; + const dict_index_t* clust_index; + + ut_ad(index); + ut_ad(update); + ut_ad(thr); + ut_ad(thr->graph); + ut_ad(thr->graph->trx); + + n_unique = dict_index_get_n_unique(index); + + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n_unique; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_no; + const upd_field_t* upd_field; + const dfield_t* dfield; + dfield_t dfield_ext; + ulint dfield_len; + const byte* buf; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_no = dict_col_get_no(col); + + upd_field = upd_get_field_by_field_no( + update, dict_col_get_clust_pos(col, clust_index)); + + if (upd_field == NULL) { + continue; + } + + if (row == NULL) { + ut_ad(ext == NULL); + return(TRUE); + } + + dfield = dtuple_get_nth_field(row, col_no); + + /* This treatment of column prefix indexes is loosely + based on row_build_index_entry(). */ + + if (UNIV_LIKELY(ind_field->prefix_len == 0) + || dfield_is_null(dfield)) { + /* do nothing special */ + } else if (ext) { + /* Silence a compiler warning without + silencing a Valgrind error. */ + dfield_len = 0; + UNIV_MEM_INVALID(&dfield_len, sizeof dfield_len); + /* See if the column is stored externally. */ + buf = row_ext_lookup(ext, col_no, &dfield_len); + + ut_ad(col->ord_part); + + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + /* The externally stored field + was not written yet. This + record should only be seen by + recv_recovery_rollback_active(), + when the server had crashed before + storing the field. */ + ut_ad(thr->graph->trx->is_recovered); + ut_ad(trx_is_recv(thr->graph->trx)); + return(TRUE); + } + + goto copy_dfield; + } + } else if (dfield_is_ext(dfield)) { + dfield_len = dfield_get_len(dfield); + ut_a(dfield_len > BTR_EXTERN_FIELD_REF_SIZE); + dfield_len -= BTR_EXTERN_FIELD_REF_SIZE; + ut_a(dict_index_is_clust(index) + || ind_field->prefix_len <= dfield_len); + + buf = static_cast<byte*>(dfield_get_data(dfield)); +copy_dfield: + ut_a(dfield_len > 0); + dfield_copy(&dfield_ext, dfield); + dfield_set_data(&dfield_ext, buf, dfield_len); + dfield = &dfield_ext; + } + + if (!dfield_datas_are_binary_equal( + dfield, &upd_field->new_val, + ind_field->prefix_len)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +NOTE: we compare the fields as binary strings! 
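+ This is a conservative test: it inspects only the clustered
+ index and col->ord_part, which is set for a column that is an
+ ordering field in any index, so it may report TRUE for an
+ index that is in fact unaffected.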
+@return TRUE if update vector may change an ordering field in an index +record */ +UNIV_INTERN +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update) /*!< in: update vector for the row */ +{ + upd_field_t* upd_field; + dict_index_t* index; + ulint i; + + index = dict_table_get_first_index(table); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (dict_field_get_col(dict_index_get_nth_field( + index, upd_field->field_no)) + ->ord_part) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. +@return whether the Doc ID column is changed */ +UNIV_INTERN +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ +{ + ulint col_no; + dict_index_t* clust_index; + fts_t* fts = table->fts; + + clust_index = dict_table_get_first_index(table); + + /* Convert from index-specific column number to table-global + column number. */ + col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no); + + return(col_no == fts->doc_col); +} +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +UNIV_INTERN +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ +{ + ulint col_no; + dict_index_t* clust_index; + fts_t* fts = table->fts; + + clust_index = dict_table_get_first_index(table); + + /* Convert from index-specific column number to table-global + column number. */ + col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no); + + return(dict_table_is_fts_column(fts->indexes, col_no)); +} + +/***********************************************************//** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. 
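+ Hypothetical example: for a FOREIGN KEY over (a, b) we have
+ n == 2, and an update vector that assigns a new binary value
+ to b at its clustered index position makes this function
+ return TRUE.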
+@return TRUE if changes */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + dtuple_t* entry, /*!< in: index entry */ + dict_index_t* index, /*!< in: index of entry */ + const upd_t* update, /*!< in: update vector for the row */ + ulint n) /*!< in: how many first fields to check */ +{ + ulint n_upd_fields; + ulint i, j; + dict_index_t* clust_index; + + ut_ad(update && index); + ut_ad(n <= dict_index_get_n_fields(index)); + + n_upd_fields = upd_get_n_fields(update); + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_pos; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col, clust_index); + + ut_a(ind_field->prefix_len == 0); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field_t* upd_field + = upd_get_nth_field(update, j); + + if (col_pos == upd_field->field_no + && !dfield_datas_are_binary_equal( + dtuple_get_nth_field(entry, i), + &upd_field->new_val, 0)) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/*********************************************************************//** +Copies the column values from a record. */ +UNIV_INLINE +void +row_upd_copy_columns( +/*=================*/ + rec_t* rec, /*!< in: record in a clustered index */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + sym_node_t* column) /*!< in: first column in a column list, or + NULL */ +{ + byte* data; + ulint len; + + while (column) { + data = rec_get_nth_field(rec, offsets, + column->field_nos[SYM_CLUST_FIELD_NO], + &len); + eval_node_copy_and_alloc_val(column, data, len); + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Calculates the new values for fields to update. Note that row_upd_copy_columns +must have been called first. */ +UNIV_INLINE +void +row_upd_eval_new_vals( +/*==================*/ + upd_t* update) /*!< in/out: update vector */ +{ + que_node_t* exp; + upd_field_t* upd_field; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + exp = upd_field->exp; + + eval_exp(exp); + + dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp)); + } +} + +/***********************************************************//** +Stores to the heap the row on which the node->pcur is positioned. */ +static +void +row_upd_store_row( +/*==============*/ + upd_node_t* node) /*!< in: row update node */ +{ + dict_index_t* clust_index; + rec_t* rec; + mem_heap_t* heap = NULL; + row_ext_t** ext; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); + + if (node->row != NULL) { + mem_heap_empty(node->heap); + } + + clust_index = dict_table_get_first_index(node->table); + + rec = btr_pcur_get_rec(node->pcur); + + offsets = rec_get_offsets(rec, clust_index, offsets_, + ULINT_UNDEFINED, &heap); + + if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) { + /* In DYNAMIC or COMPRESSED format, there is no prefix + of externally stored columns in the clustered index + record. Build a cache of column prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored column. + No cache is needed. 
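+ (Any column prefix index, at most 767 bytes in these formats,
+ can be served from the local prefix already present in the
+ record, so node->ext may remain NULL.)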
*/ + ext = NULL; + node->ext = NULL; + } + + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, + NULL, NULL, NULL, ext, node->heap); + if (node->is_delete) { + node->upd_row = NULL; + node->upd_ext = NULL; + } else { + node->upd_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->upd_row, &node->upd_ext, + clust_index, node->update, node->heap); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***********************************************************//** +Updates a secondary index entry of a row. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd_sec_index_entry( +/*====================*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mtr_t mtr; + const rec_t* rec; + btr_pcur_t pcur; + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + btr_cur_t* btr_cur; + ibool referenced; + dberr_t err = DB_SUCCESS; + trx_t* trx = thr_get_trx(thr); + ulint mode; + enum row_search_result search_result; + + ut_ad(trx->id); + + index = node->index; + + referenced = row_upd_index_is_referenced(index, trx); + + heap = mem_heap_create(1024); + + /* Build old index entry */ + entry = row_build_index_entry(node->row, node->ext, index, heap); + ut_a(entry); + + log_free_check(); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!trx->ddl) { + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, + "before_row_upd_sec_index_entry"); + } +#endif /* UNIV_DEBUG */ + + mtr_start(&mtr); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + + mtr_s_lock(dict_index_get_lock(index), &mtr); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + /* This is a normal index. Do not log anything. + Perform the update on the index tree directly. */ + break; + case ONLINE_INDEX_CREATION: + /* Log a DELETE and optionally INSERT. */ + row_log_online_op(index, entry, 0); + + if (!node->is_delete) { + mem_heap_empty(heap); + entry = row_build_index_entry( + node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + row_log_online_op(index, entry, trx->id); + } + /* fall through */ + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + mtr_commit(&mtr); + goto func_exit; + } + + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + mode = referenced + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + | BTR_DELETE_MARK; + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + mode = referenced + ? BTR_MODIFY_LEAF + : BTR_MODIFY_LEAF | BTR_DELETE_MARK; + } + + /* Set the query thread, so that ibuf_insert_low() will be + able to invoke thd_get_trx(). */ + btr_pcur_get_btr_cur(&pcur)->thr = thr; + + search_result = row_search_index_entry(index, entry, + UNIV_UNLIKELY(trx->fake_changes) + ? 
BTR_SEARCH_LEAF + : (btr_latch_mode)mode, + &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + rec = btr_cur_get_rec(btr_cur); + + switch (search_result) { + case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */ + ut_error; + break; + case ROW_BUFFERED: + /* Entry was delete marked already. */ + break; + + case ROW_NOT_FOUND: + if (*index->name == TEMP_INDEX_PREFIX) { + /* When online CREATE INDEX copied the update + that we already made to the clustered index, + and completed the secondary index creation + before we got here, the old secondary index + record would not exist. The CREATE INDEX + should be waiting for a MySQL meta-data lock + upgrade at least until this UPDATE + returns. After that point, the + TEMP_INDEX_PREFIX would be dropped from the + index name in commit_inplace_alter_table(). */ + break; + } + + fputs("InnoDB: error in sec index entry update in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + putc('\n', stderr); + trx_print(stderr, trx, 0); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + ut_ad(0); + break; + case ROW_FOUND: + /* Delete mark the old index record; it can already be + delete marked if we return after a lock wait in + row_ins_sec_index_entry() below */ + if (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table))) { + err = btr_cur_del_mark_set_sec_rec( + 0, btr_cur, TRUE, thr, &mtr); + + if (err == DB_SUCCESS && referenced) { + + ulint* offsets; + + offsets = rec_get_offsets( + rec, index, NULL, ULINT_UNDEFINED, + &heap); + + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + } + } + break; + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (node->is_delete || err != DB_SUCCESS) { + + goto func_exit; + } + + mem_heap_empty(heap); + + /* Build a new index entry */ + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + /* Insert new index entry */ + err = row_ins_sec_index_entry(index, entry, thr); + +func_exit: + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Updates the secondary index record if it is changed in the row update or +deletes it if this is a delete. 
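+ The update of a secondary index entry is implemented as a
+ delete-mark of the old entry followed by an insert of the new
+ one; see row_upd_sec_index_entry() above.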
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd_sec_step( +/*=============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC) + || (node->state == UPD_NODE_UPDATE_SOME_SEC)); + ut_ad(!dict_index_is_clust(node->index)); + + if (node->state == UPD_NODE_UPDATE_ALL_SEC + || row_upd_changes_ord_field_binary(node->index, node->update, + thr, node->row, node->ext)) { + return(row_upd_sec_index_entry(node, thr)); + } + + return(DB_SUCCESS); +} + +#ifdef UNIV_DEBUG +# define row_upd_clust_rec_by_insert_inherit(rec,offsets,entry,update) \ + row_upd_clust_rec_by_insert_inherit_func(rec,offsets,entry,update) +#else /* UNIV_DEBUG */ +# define row_upd_clust_rec_by_insert_inherit(rec,offsets,entry,update) \ + row_upd_clust_rec_by_insert_inherit_func(entry,update) +#endif /* UNIV_DEBUG */ +/*******************************************************************//** +Mark non-updated off-page columns inherited when the primary key is +updated. We must mark them as inherited in entry, so that they are not +freed in a rollback. A limited version of this function used to be +called btr_cur_mark_dtuple_inherited_extern(). +@return TRUE if any columns were inherited */ +static __attribute__((warn_unused_result)) +ibool +row_upd_clust_rec_by_insert_inherit_func( +/*=====================================*/ +#ifdef UNIV_DEBUG + const rec_t* rec, /*!< in: old record, or NULL */ + const ulint* offsets,/*!< in: rec_get_offsets(rec), or NULL */ +#endif /* UNIV_DEBUG */ + dtuple_t* entry, /*!< in/out: updated entry to be + inserted into the clustered index */ + const upd_t* update) /*!< in: update vector */ +{ + ibool inherit = FALSE; + ulint i; + + ut_ad(!rec == !offsets); + ut_ad(!rec || rec_offs_any_extern(offsets)); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + byte* data; + ulint len; + + ut_ad(!offsets + || !rec_offs_nth_extern(offsets, i) + == !dfield_is_ext(dfield) + || upd_get_field_by_field_no(update, i)); + if (!dfield_is_ext(dfield) + || upd_get_field_by_field_no(update, i)) { + continue; + } + +#ifdef UNIV_DEBUG + if (UNIV_LIKELY(rec != NULL)) { + const byte* rec_data + = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len == dfield_get_len(dfield)); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + + rec_data += len - BTR_EXTERN_FIELD_REF_SIZE; + + /* The pointer must not be zero. */ + ut_ad(memcmp(rec_data, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* The BLOB must be owned. */ + ut_ad(!(rec_data[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG */ + + len = dfield_get_len(dfield); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + data += len - BTR_EXTERN_FIELD_REF_SIZE; + /* The pointer must not be zero. */ + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + data[BTR_EXTERN_LEN] &= ~BTR_EXTERN_OWNER_FLAG; + data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG; + /* The BTR_EXTERN_INHERITED_FLAG only matters in + rollback. Purge will always free the extern fields of + a delete-marked row. 
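+ Informal sketch of the flag semantics: if the insert of the
+ new entry is rolled back, the inherited flag tells rollback
+ that the BLOB still belongs to the old, delete-marked record
+ and must not be freed together with the new entry.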
*/ + + inherit = TRUE; + } + + return(inherit); +} + +/***********************************************************//** +Marks the clustered index record deleted and inserts the updated version +of the record to the index. This function should be used when the ordering +fields of the clustered index record change. This should be quite rare in +database applications. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_rec_by_insert( +/*========================*/ + upd_node_t* node, /*!< in/out: row update node */ + dict_index_t* index, /*!< in: clustered index of the record */ + que_thr_t* thr, /*!< in: query thread */ + ibool referenced,/*!< in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /*!< in/out: mtr; gets committed here */ +{ + mem_heap_t* heap; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + trx_t* trx; + dict_table_t* table; + dtuple_t* entry; + dberr_t err; + ibool change_ownership = FALSE; + rec_t* rec; + ulint* offsets = NULL; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + + trx = thr_get_trx(thr); + table = node->table; + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + heap = mem_heap_create(1000); + + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); + + switch (node->state) { + default: + ut_error; + case UPD_NODE_INSERT_BLOB: + /* A lock wait occurred in row_ins_clust_index_entry() in + the previous invocation of this function. Mark the + off-page columns in the entry inherited. */ + + if (UNIV_LIKELY(!trx->fake_changes)) { + change_ownership = row_upd_clust_rec_by_insert_inherit( + NULL, NULL, entry, node->update); + ut_a(change_ownership); + } + /* fall through */ + case UPD_NODE_INSERT_CLUSTERED: + /* A lock wait occurred in row_ins_clust_index_entry() in + the previous invocation of this function. */ + break; + case UPD_NODE_UPDATE_CLUSTERED: + /* This is the first invocation of the function where + we update the primary key. Delete-mark the old record + in the clustered index and prepare to insert a new entry. */ + rec = btr_cur_get_rec(btr_cur); + offsets = rec_get_offsets(rec, index, NULL, + ULINT_UNDEFINED, &heap); + ut_ad(page_rec_is_user_rec(rec)); + + err = btr_cur_del_mark_set_clust_rec( + btr_cur_get_block(btr_cur), rec, index, offsets, + thr, mtr); + if (err != DB_SUCCESS) { +err_exit: + mtr_commit(mtr); + mem_heap_free(heap); + return(err); + } + + /* If the the new row inherits externally stored + fields (off-page columns a.k.a. BLOBs) from the + delete-marked old record, mark them disowned by the + old record and owned by the new entry. */ + + if (rec_offs_any_extern(offsets) + && UNIV_LIKELY(!(trx->fake_changes))) { + change_ownership = row_upd_clust_rec_by_insert_inherit( + rec, offsets, entry, node->update); + + if (change_ownership) { + /* The blobs are disowned here, expecting the + insert down below to inherit them. But if the + insert fails, then this disown will be undone + when the operation is rolled back. */ + btr_cur_disown_inherited_fields( + btr_cur_get_page_zip(btr_cur), + rec, index, offsets, node->update, mtr); + } + } + + if (referenced) { + /* NOTE that the following call loses + the position of pcur ! 
*/ + + err = row_upd_check_references_constraints( + node, pcur, table, index, offsets, thr, mtr); + + if (err != DB_SUCCESS) { + goto err_exit; + } + } + } + + mtr_commit(mtr); + + err = row_ins_clust_index_entry( + index, entry, thr, + node->upd_ext ? node->upd_ext->n_ext : 0); + node->state = change_ownership + ? UPD_NODE_INSERT_BLOB + : UPD_NODE_INSERT_CLUSTERED; + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Updates a clustered index record of a row when the ordering fields do +not change. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_rec( +/*==============*/ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in: rec_get_offsets() on node->pcur */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap, can be emptied */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; gets committed here */ +{ + mem_heap_t* heap = NULL; + big_rec_t* big_rec = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + dberr_t err; + const dtuple_t* rebuilt_old_pk = NULL; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(btr_cur_get_index(btr_cur) == index); + ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur), + dict_table_is_comp(index->table))); + ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets)); + + if (dict_index_is_online_ddl(index)) { + rebuilt_old_pk = row_log_table_get_pk( + btr_cur_get_rec(btr_cur), index, offsets, NULL, &heap); + } + + /* Try optimistic updating of the record, keeping changes within + the page; we do not check locks because we assume the x-lock on the + record to update */ + + if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { + err = btr_cur_update_in_place( + BTR_NO_LOCKING_FLAG, btr_cur, + offsets, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } else { + err = btr_cur_optimistic_update( + BTR_NO_LOCKING_FLAG, btr_cur, + &offsets, offsets_heap, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } + + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_update(btr_cur_get_rec(btr_cur), + index, offsets, rebuilt_old_pk); + } + + mtr_commit(mtr); + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + + goto func_exit; + } + + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + goto func_exit; + } + /* We may have to modify the tree structure: do a pessimistic descent + down the index tree */ + + mtr_start(mtr); + + /* NOTE: this transaction has an s-lock or x-lock on the record and + therefore other transactions cannot modify the record when we have no + latch on the page. In addition, we assume that other query threads of + the same transaction do not modify the record in the meantime. + Therefore we can assert that the restoration of the cursor succeeds. */ + + ut_a(btr_pcur_restore_position( + UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes) + ? 
BTR_SEARCH_TREE : BTR_MODIFY_TREE, + pcur, mtr)); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table))); + + if (!heap) { + heap = mem_heap_create(1024); + } + + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, + &offsets, offsets_heap, heap, &big_rec, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); + if (big_rec && UNIV_LIKELY(!(thr_get_trx(thr)->fake_changes))) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. Allocate + pages for big_rec in the mtr that + modified the B-tree, but be sure to skip + any pages that were freed in mtr. We will + write out the big_rec pages before + committing the B-tree mini-transaction. If + the system crashes so that crash recovery + will not replay the mtr_commit(&mtr), the + big_rec pages will be left orphaned until + the pages are allocated for something else. + + TODO: If the allocation extends the tablespace, it + will not be redo logged, in either mini-transaction. + Tablespace extension should be redo-logged in the + big_rec mini-transaction, so that recovery will not + fail when the big_rec was written to the extended + portion of the file, in case the file was somehow + truncated in the crash. */ + + DEBUG_SYNC_C("before_row_upd_extern"); + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(btr_cur), + btr_cur_get_rec(btr_cur), offsets, + big_rec, mtr, BTR_STORE_UPDATE); + DEBUG_SYNC_C("after_row_upd_extern"); + /* If writing big_rec fails (for example, because of + DB_OUT_OF_FILE_SPACE), the record will be corrupted. + Even if we did not update any externally stored + columns, our update could cause the record to grow so + that a non-updated column was selected for external + storage. This non-update would not have been written + to the undo log, and thus the record cannot be rolled + back. + + However, because we have not executed mtr_commit(mtr) + yet, the update will not be replayed in crash + recovery, and the following assertion failure will + effectively "roll back" the operation. */ + ut_a(err == DB_SUCCESS); + } + + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_update(btr_cur_get_rec(btr_cur), + index, offsets, rebuilt_old_pk); + } + + mtr_commit(mtr); +func_exit: + if (heap) { + mem_heap_free(heap); + } + + if (big_rec) { + dtuple_big_rec_free(big_rec); + } + + return(err); +} + +/***********************************************************//** +Delete marks a clustered index record. 
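+ The row is stored first (see row_upd_store_row() below)
+ because the old clustered index record is still needed for
+ building the secondary index entries that must be delete
+ marked as well.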
+@return DB_SUCCESS if operation successfully completed, else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd_del_mark_clust_rec( +/*=======================*/ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in/out: rec_get_offsets() for the + record under the cursor */ + que_thr_t* thr, /*!< in: query thread */ + ibool referenced, + /*!< in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /*!< in: mtr; gets committed here */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + dberr_t err; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + ut_ad(node->is_delete); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Store row because we have to build also the secondary index + entries */ + + row_upd_store_row(node); + + /* Mark the clustered index record deleted; we do not have to check + locks, because we assume that we have an x-lock on the record */ + + err = btr_cur_del_mark_set_clust_rec( + btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur), + index, offsets, thr, mtr); + if (err == DB_SUCCESS && referenced) { + /* NOTE that the following call loses the position of pcur ! */ + + err = row_upd_check_references_constraints( + node, pcur, index->table, index, offsets, thr, mtr); + } + + mtr_commit(mtr); + + return(err); +} + +/***********************************************************//** +Updates the clustered index record. +@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT +in case of a lock wait, else error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_step( +/*===============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + ibool success; + dberr_t err; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + ibool referenced; + rec_offs_init(offsets_); + + index = dict_table_get_first_index(node->table); + + referenced = row_upd_index_is_referenced(index, thr_get_trx(thr)); + + pcur = node->pcur; + + /* We have to restore the cursor to its position */ + + mtr_start(&mtr); + + /* If the restoration does not succeed, then the same + transaction has deleted the record on which the cursor was, + and that is an SQL error. If the restoration succeeds, it may + still be that the same transaction has successively deleted + and inserted a record with the same ordering fields, but in + that case we know that the transaction has at least an + implicit x-lock on the record. */ + + ut_a(pcur->rel_pos == BTR_PCUR_ON); + + ulint mode; + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. 
*/ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "innodb_row_upd_clust_step_enter"); + } +#endif /* UNIV_DEBUG */ + + if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + mode = BTR_SEARCH_LEAF; + } else if (dict_index_is_online_ddl(index)) { + ut_ad(node->table->id != DICT_INDEXES_ID); + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + mode = BTR_MODIFY_LEAF; + } + + success = btr_pcur_restore_position(mode, pcur, &mtr); + + if (!success) { + err = DB_RECORD_NOT_FOUND; + + mtr_commit(&mtr); + + return(err); + } + + /* If this is a row in SYS_INDEXES table of the data dictionary, + then we have to free the file segments of the index tree associated + with the index */ + + if (node->is_delete && node->table->id == DICT_INDEXES_ID) { + + ut_ad(!dict_index_is_online_ddl(index)); + + dict_drop_index_tree(btr_pcur_get_rec(pcur), &mtr); + + mtr_commit(&mtr); + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, + &mtr); + if (!success) { + err = DB_ERROR; + + mtr_commit(&mtr); + + return(err); + } + } + + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + if (!node->has_clust_rec_x_lock) { + err = lock_clust_rec_modify_check_and_lock( + 0, btr_pcur_get_block(pcur), + rec, index, offsets, thr); + if (err != DB_SUCCESS) { + mtr_commit(&mtr); + goto exit_func; + } + } + + ut_ad(lock_trx_has_rec_x_lock(thr_get_trx(thr), index->table, + btr_pcur_get_block(pcur), + page_rec_get_heap_no(rec))); + + /* NOTE: the following function calls will also commit mtr */ + + if (node->is_delete) { + err = row_upd_del_mark_clust_rec( + node, index, offsets, thr, referenced, &mtr); + + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; + node->index = dict_table_get_next_index(index); + } + + goto exit_func; + } + + /* If the update is made for MySQL, we already have the update vector + ready, else we have to do some evaluation: */ + + if (UNIV_UNLIKELY(!node->in_mysql_interface)) { + /* Copy the necessary columns from clust_rec and calculate the + new values to set */ + row_upd_copy_columns(rec, offsets, + UT_LIST_GET_FIRST(node->columns)); + row_upd_eval_new_vals(node->update); + } + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + + err = row_upd_clust_rec( + node, index, offsets, &heap, thr, &mtr); + goto exit_func; + } + + row_upd_store_row(node); + + if (row_upd_changes_ord_field_binary(index, node->update, thr, + node->row, node->ext)) { + + /* Update causes an ordering field (ordering fields within + the B-tree) of the clustered index record to change: perform + the update by delete marking and inserting. + + TODO! What to do to the 'Halloween problem', where an update + moves the record forward in index so that it is again + updated when the cursor arrives there? Solution: the + read operation must check the undo record undo number when + choosing records to update. MySQL solves now the problem + externally! 
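+ A hypothetical instance: UPDATE t SET k = k + 10, scanning an
+ index on k, could move each updated record ahead of the cursor
+ and meet it again, unless the reader skips rows whose undo
+ number shows they were already updated by this same statement.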
*/ + + err = row_upd_clust_rec_by_insert( + node, index, thr, referenced, &mtr); + + if (err != DB_SUCCESS) { + + goto exit_func; + } + + node->state = UPD_NODE_UPDATE_ALL_SEC; + } else { + err = row_upd_clust_rec( + node, index, offsets, &heap, thr, &mtr); + + if (err != DB_SUCCESS) { + + goto exit_func; + } + + node->state = UPD_NODE_UPDATE_SOME_SEC; + } + + node->index = dict_table_get_next_index(index); + +exit_func: + if (heap) { + mem_heap_free(heap); + } + return(err); +} + +/***********************************************************//** +Updates the affected index records of a row. When the control is transferred +to this node, we assume that we have a persistent cursor which was on a +record, and the position of the cursor is stored in the cursor. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_upd( +/*====*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + + ut_ad(node && thr); + + if (UNIV_LIKELY(node->in_mysql_interface)) { + + /* We do not get the cmpl_info value from the MySQL + interpreter: we must calculate it on the fly: */ + + if (node->is_delete + || row_upd_changes_some_index_ord_field_binary( + node->table, node->update)) { + node->cmpl_info = 0; + } else { + node->cmpl_info = UPD_NODE_NO_ORD_CHANGE; + } + } + + switch (node->state) { + case UPD_NODE_UPDATE_CLUSTERED: + case UPD_NODE_INSERT_CLUSTERED: + case UPD_NODE_INSERT_BLOB: + log_free_check(); + err = row_upd_clust_step(node, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + } + + if (node->index == NULL + || (!node->is_delete + && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) { + + return(DB_SUCCESS); + } + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_upd_clust"); + } +#endif /* UNIV_DEBUG */ + + DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;); + + do { + /* Skip corrupted index */ + dict_table_skip_corrupt_index(node->index); + + if (!node->index) { + break; + } + + if (node->index->type != DICT_FTS) { + err = row_upd_sec_step(node, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } while (node->index != NULL); + + ut_ad(err == DB_SUCCESS); + + /* Do some cleanup */ + + if (node->row != NULL) { + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + mem_heap_empty(node->heap); + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + return(err); +} + +/***********************************************************//** +Updates a row in a table. This is a high-level function used in SQL execution +graphs. 
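+ row_upd_step() drives the select node to fetch the rows of a
+ searched update and calls row_upd(), which performs the
+ clustered index step and then walks the secondary indexes; on
+ a lock wait the node state allows the call to be resumed where
+ it left off.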
+@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + upd_node_t* node; + sel_node_t* sel_node; + que_node_t* parent; + dberr_t err = DB_SUCCESS; + trx_t* trx; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started_xa(trx); + + node = static_cast<upd_node_t*>(thr->run_node); + + sel_node = node->select; + + parent = que_node_get_parent(node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + if (thr->prev_node == parent) { + node->state = UPD_NODE_SET_IX_LOCK; + } + + if (node->state == UPD_NODE_SET_IX_LOCK) { + + if (!node->has_clust_rec_x_lock) { + /* It may be that the current session has not yet + started its transaction, or it has been committed: */ + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + if (node->searched_update) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to update */ + + thr->run_node = sel_node; + + return(thr); + } + } + + /* sel_node is NULL if we are in the MySQL interface */ + + if (sel_node && (sel_node->state != SEL_NODE_FETCH)) { + + if (!node->searched_update) { + /* An explicit cursor should be positioned on a row + to update */ + + ut_error; + + err = DB_ERROR; + + goto error_handling; + } + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to update, or the select node performed the + updates directly in-place */ + + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_upd(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->searched_update) { + /* Fetch next row to update */ + + thr->run_node = sel_node; + } else { + /* It was an explicit cursor update */ + + thr->run_node = parent; + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + return(thr); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/row/row0vers.cc b/storage/xtradb/row/row0vers.cc new file mode 100644 index 00000000000..9f1fc13ee09 --- /dev/null +++ b/storage/xtradb/row/row0vers.cc @@ -0,0 +1,770 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0vers.cc +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0vers.h" + +#ifdef UNIV_NONINL +#include "row0vers.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "lock0lock.h" + +/*****************************************************************//** +Finds out if an active transaction has inserted or modified a secondary +index record. +@return 0 if committed, else the active transaction id; +NOTE that this function can return false positives but never false +negatives. The caller must confirm all positive results by calling +trx_is_active() while holding lock_sys->mutex. */ +UNIV_INLINE +trx_id_t +row_vers_impl_x_locked_low( +/*=======================*/ + const rec_t* clust_rec, /*!< in: clustered index record */ + dict_index_t* clust_index, /*!< in: the clustered index */ + const rec_t* rec, /*!< in: secondary index record */ + dict_index_t* index, /*!< in: the secondary index */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + trx_id_t trx_id; + ibool corrupt; + ulint comp; + ulint rec_del; + const rec_t* version; + rec_t* prev_version = NULL; + ulint* clust_offsets; + mem_heap_t* heap; + + DBUG_ENTER("row_vers_impl_x_locked_low"); + + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(1024); + + clust_offsets = rec_get_offsets( + clust_rec, clust_index, NULL, ULINT_UNDEFINED, &heap); + + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); + corrupt = FALSE; + + if (!trx_rw_is_active(trx_id, &corrupt)) { + /* The transaction that modified or inserted clust_rec is no + longer active, or it is corrupt: no implicit lock on rec */ + if (corrupt) { + lock_report_trx_id_insanity( + trx_id, clust_rec, clust_index, clust_offsets, + trx_sys_get_max_trx_id()); + } + mem_heap_free(heap); + DBUG_RETURN(0); + } + + comp = page_rec_is_comp(rec); + ut_ad(index->table == clust_index->table); + ut_ad(!!comp == dict_table_is_comp(index->table)); + ut_ad(!comp == !page_rec_is_comp(clust_rec)); + + rec_del = rec_get_deleted_flag(rec, comp); + + /* We look up if some earlier version, which was modified by + the trx_id transaction, of the clustered index record would + require rec to be in a different state (delete marked or + unmarked, or have different field values, or not existing). If + there is such a version, then rec was modified by the trx_id + transaction, and it has an implicit x-lock on rec. Note that + if clust_rec itself would require rec to be in a different + state, then the trx_id transaction has not yet had time to + modify rec, and does not necessarily have an implicit x-lock + on rec. 
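+ Illustrative scenario (hypothetical): the trx_id transaction
+ delete-marked the secondary index record rec. Walking back
+ from clust_rec, the first older version whose index entry is
+ binary-equal to rec but whose delete mark differs shows that
+ the trx_id transaction changed the state of rec, and hence
+ holds an implicit x-lock on it.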
*/ + + for (version = clust_rec;; version = prev_version) { + row_ext_t* ext; + const dtuple_t* row; + dtuple_t* entry; + ulint vers_del; + trx_id_t prev_trx_id; + mem_heap_t* old_heap = heap; + + /* We keep the semaphore in mtr on the clust_rec page, so + that no other transaction can update it and get an + implicit x-lock on rec until mtr_commit(mtr). */ + + heap = mem_heap_create(1024); + + trx_undo_prev_version_build( + clust_rec, mtr, version, clust_index, clust_offsets, + heap, &prev_version); + + /* The oldest visible clustered index version must not be + delete-marked, because we never start a transaction by + inserting a delete-marked record. */ + ut_ad(prev_version + || !rec_get_deleted_flag(version, comp) + || !trx_rw_is_active(trx_id, NULL)); + + /* Free version and clust_offsets. */ + mem_heap_free(old_heap); + + if (prev_version == NULL) { + + /* We reached the oldest visible version without + finding an older version of clust_rec that would + match the secondary index record. If the secondary + index record is not delete marked, then clust_rec + is considered the correct match of the secondary + index record and hence holds the implicit lock. */ + + if (rec_del) { + /* The secondary index record is del marked. + So, the implicit lock holder of clust_rec + did not modify the secondary index record yet, + and is not holding an implicit lock on it. + + This assumes that whenever a row is inserted + or updated, the leaf page record always is + created with a clear delete-mark flag. + (We never insert a delete-marked record.) */ + trx_id = 0; + } + + break; + } + + clust_offsets = rec_get_offsets( + prev_version, clust_index, NULL, ULINT_UNDEFINED, + &heap); + + vers_del = rec_get_deleted_flag(prev_version, comp); + + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); + + /* The stack of versions is locked by mtr. Thus, it + is safe to fetch the prefixes for externally stored + columns. */ + + row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, + clust_offsets, + NULL, NULL, NULL, &ext, heap); + + entry = row_build_index_entry(row, ext, index, heap); + + /* entry may be NULL if a record was inserted in place + of a deleted record, and the BLOB pointers of the new + record were not initialized yet. But in that case, + prev_version should be NULL. */ + + ut_a(entry != NULL); + + /* If we get here, we know that the trx_id transaction + modified prev_version. Let us check if prev_version + would require rec to be in a different state. */ + + /* The previous version of clust_rec must be + accessible, because clust_rec was not a fresh insert. + There is no guarantee that the transaction is still + active. */ + + /* We check if entry and rec are identified in the alphabetical + ordering */ + + if (!trx_rw_is_active(trx_id, &corrupt)) { + /* Transaction no longer active: no implicit + x-lock. This situation should only be possible + because we are not holding lock_sys->mutex. */ + ut_ad(!lock_mutex_own()); + if (corrupt) { + lock_report_trx_id_insanity( + trx_id, + prev_version, clust_index, + clust_offsets, + trx_sys_get_max_trx_id()); + } + trx_id = 0; + break; + } else if (0 == cmp_dtuple_rec(entry, rec, offsets)) { + /* The delete marks of rec and prev_version should be + equal for rec to be in the state required by + prev_version */ + + if (rec_del != vers_del) { + + break; + } + + /* It is possible that the row was updated so that the + secondary index record remained the same in + alphabetical ordering, but the field values changed + still. 
For example, 'abc' -> 'ABC'. Check also that. */ + + dtuple_set_types_binary( + entry, dtuple_get_n_fields(entry)); + + if (0 != cmp_dtuple_rec(entry, rec, offsets)) { + + break; + } + + } else if (!rec_del) { + /* The delete mark should be set in rec for it to be + in the state required by prev_version */ + + break; + } + + if (trx_id != prev_trx_id) { + /* prev_version was the first version modified by + the trx_id transaction: no implicit x-lock */ + + trx_id = 0; + break; + } + } + + DBUG_PRINT("info", ("Implicit lock is held by trx:%lu", + static_cast<unsigned long>(trx_id))); + + mem_heap_free(heap); + DBUG_RETURN(trx_id); +} + +/*****************************************************************//** +Finds out if an active transaction has inserted or modified a secondary +index record. +@return 0 if committed, else the active transaction id; +NOTE that this function can return false positives but never false +negatives. The caller must confirm all positive results by calling +trx_is_active() while holding lock_sys->mutex. */ +UNIV_INTERN +trx_id_t +row_vers_impl_x_locked( +/*===================*/ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: the secondary index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + dict_index_t* clust_index; + const rec_t* clust_rec; + trx_id_t trx_id; + mtr_t mtr; + + ut_ad(!lock_mutex_own()); + ut_ad(!mutex_own(&trx_sys->mutex)); + + mtr_start(&mtr); + + /* Search for the clustered index record. The latch on the + page of clust_rec locks the top of the stack of versions. The + bottom of the version stack is not locked; oldest versions may + disappear by the fact that transactions may be committed and + collected by the purge. This is not a problem, because we are + only interested in active transactions. */ + + clust_rec = row_get_clust_rec( + BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr); + + if (UNIV_UNLIKELY(!clust_rec)) { + /* In a rare case it is possible that no clust rec is found + for a secondary index record: if in row0umod.cc + row_undo_mod_remove_clust_low() we have already removed the + clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case there cannot be + any implicit lock on the secondary index record, because + an active transaction which has modified the secondary index + record has also modified the clustered index record. And in + a rollback we always undo the modifications to secondary index + records before the clustered index record. */ + + trx_id = 0; + } else { + trx_id = row_vers_impl_x_locked_low( + clust_rec, clust_index, rec, index, offsets, &mtr); + } + + mtr_commit(&mtr); + + return(trx_id); +} + +/*****************************************************************//** +Finds out if we must preserve a delete marked earlier version of a clustered +index record, because it is >= the purge view. 
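+ In other words, TRUE means that trx_id is not yet visible to
+ the purge view, so some consistent read may still need the
+ delete-marked version.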
+@return TRUE if earlier version should be preserved */ +UNIV_INTERN +ibool +row_vers_must_preserve_del_marked( +/*==============================*/ + trx_id_t trx_id, /*!< in: transaction id in the version */ + mtr_t* mtr) /*!< in: mtr holding the latch on the + clustered index record; it will also + hold the latch on purge_view */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + mtr_s_lock(&(purge_sys->latch), mtr); + + return(!read_view_sees_trx_id(purge_sys->view, trx_id)); +} + +/*****************************************************************//** +Finds out if a version of the record, where the version >= the current +purge view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry and ientry are identified in +the alphabetical ordering; exactly in this case we return TRUE. +@return TRUE if earlier version should have */ +UNIV_INTERN +ibool +row_vers_old_has_index_entry( +/*=========================*/ + ibool also_curr,/*!< in: TRUE if also rec is included in the + versions to search; otherwise only versions + prior to it are searched */ + const rec_t* rec, /*!< in: record in the clustered index; the + caller must have a latch on the page */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /*!< in: the secondary index */ + const dtuple_t* ientry) /*!< in: the secondary index entry */ +{ + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + ulint* clust_offsets; + mem_heap_t* heap; + mem_heap_t* heap2; + const dtuple_t* row; + const dtuple_t* entry; + ulint comp; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + clust_index = dict_table_get_first_index(index->table); + + comp = page_rec_is_comp(rec); + ut_ad(!dict_table_is_comp(index->table) == !comp); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); + + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row_ext_t* ext; + + /* The top of the stack of versions is locked by the + mtr holding a latch on the page containing the + clustered index record. The bottom of the stack is + locked by the fact that the purge_sys->view must + 'overtake' any read view of an active transaction. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, + NULL, NULL, NULL, &ext, heap); + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset BLOB + pointers. This must be a freshly inserted record. If + this is called from + row_purge_remove_sec_if_poss_low(), the thread will + hold latches on the clustered index and the secondary + index. Because the insert works in three steps: + + (1) insert the record to clustered index + (2) store the BLOBs and update BLOB pointers + (3) insert records to secondary indexes + + the purge thread can safely ignore freshly inserted + records and delete the secondary index record. The + thread that inserted the new record will be inserting + the secondary index records. 
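+ (entry == NULL can thus only be observed between steps (1)
+ and (2), before the BLOB pointers have been written.)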
*/ + + /* NOTE that we cannot do the comparison as binary + fields because the row is maybe being modified so that + the clustered index record has already been updated to + a different binary value in a char field, but the + collation identifies the old and new value anyway! */ + if (entry && !dtuple_coll_cmp(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + } + + version = rec; + + for (;;) { + heap2 = heap; + heap = mem_heap_create(1024); + trx_undo_prev_version_build(rec, mtr, version, + clust_index, clust_offsets, + heap, &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (!prev_version) { + /* Versions end here */ + + mem_heap_free(heap); + + return(FALSE); + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, ULINT_UNDEFINED, &heap); + + if (!rec_get_deleted_flag(prev_version, comp)) { + row_ext_t* ext; + + /* The stack of versions is locked by mtr. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, + NULL, NULL, NULL, &ext, heap); + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset + BLOB pointers. This must be a freshly + inserted record that we can safely ignore. + For the justification, see the comments after + the previous row_build_index_entry() call. */ + + /* NOTE that we cannot do the comparison as binary + fields because maybe the secondary index record has + already been updated to a different binary value in + a char field, but the collation identifies the old + and new value anyway! */ + + if (entry && !dtuple_coll_cmp(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + } + + version = prev_version; + } +} + +/*****************************************************************//** +Constructs the version of a clustered index record which a consistent +read should see. We assume that the trx id stored in rec is such that +the consistent read should not see rec in its present version. 
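+
+In outline (a summary sketch of the loop below): starting from rec,
+repeatedly call trx_undo_prev_version_build() to materialize the
+previous version from the undo log, until read_view_sees_trx_id(view,
+trx_id) holds for that version's trx id; that version is copied into
+in_heap and returned, while DB_MISSING_HISTORY is returned if the
+needed undo history has already been purged.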
+@return DB_SUCCESS or DB_MISSING_HISTORY */ +UNIV_INTERN +dberr_t +row_vers_build_for_consistent_read( +/*===============================*/ + const rec_t* rec, /*!< in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this records */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec */ + dict_index_t* index, /*!< in: the clustered index */ + ulint** offsets,/*!< in/out: offsets returned by + rec_get_offsets(rec, index) */ + read_view_t* view, /*!< in: the consistent read view */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/*!< in: memory heap from which the memory for + *old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + rec_t** old_vers)/*!< out, own: old version, or NULL + if the history is missing or the record + does not exist in the view, that is, + it was freshly inserted afterwards */ +{ + const rec_t* version; + rec_t* prev_version; + trx_id_t trx_id; + mem_heap_t* heap = NULL; + byte* buf; + dberr_t err; + + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(rec_offs_validate(rec, index, *offsets)); + + trx_id = row_get_rec_trx_id(rec, index, *offsets); + + ut_ad(!read_view_sees_trx_id(view, trx_id)); + + version = rec; + + for (;;) { + mem_heap_t* heap2 = heap; + trx_undo_rec_t* undo_rec; + roll_ptr_t roll_ptr; + undo_no_t undo_no; + heap = mem_heap_create(1024); + + /* If we have high-granularity consistent read view and + creating transaction of the view is the same as trx_id in + the record we see this record only in the case when + undo_no of the record is < undo_no in the view. */ + + if (view->type == VIEW_HIGH_GRANULARITY + && view->creator_trx_id == trx_id) { + + roll_ptr = row_get_rec_roll_ptr(version, index, + *offsets); + undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); + undo_no = trx_undo_rec_get_undo_no(undo_rec); + mem_heap_empty(heap); + + if (view->undo_no > undo_no) { + /* The view already sees this version: we can + copy it to in_heap and return */ + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern( + version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + buf = static_cast<byte*>(mem_heap_alloc( + in_heap, rec_offs_size(*offsets))); + + *old_vers = rec_copy(buf, version, *offsets); + rec_offs_make_valid(*old_vers, index, + *offsets); + err = DB_SUCCESS; + break; + } + } + + err = trx_undo_prev_version_build(rec, mtr, version, index, + *offsets, heap, + &prev_version) + ? 
DB_SUCCESS : DB_MISSING_HISTORY; + if (heap2) { + mem_heap_free(heap2); /* free version */ + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + break; + } + + *offsets = rec_get_offsets(prev_version, index, *offsets, + ULINT_UNDEFINED, offset_heap); + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + trx_id = row_get_rec_trx_id(prev_version, index, *offsets); + + if (read_view_sees_trx_id(view, trx_id)) { + + /* The view already sees this version: we can copy + it to in_heap and return */ + + buf = static_cast<byte*>( + mem_heap_alloc( + in_heap, rec_offs_size(*offsets))); + + *old_vers = rec_copy(buf, prev_version, *offsets); + rec_offs_make_valid(*old_vers, index, *offsets); + break; + } + + version = prev_version; + }/* for (;;) */ + + mem_heap_free(heap); + + return(err); +} + +/*****************************************************************//** +Constructs the last committed version of a clustered index record, +which should be seen by a semi-consistent read. */ +UNIV_INTERN +void +row_vers_build_for_semi_consistent_read( +/*====================================*/ + const rec_t* rec, /*!< in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this records */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec */ + dict_index_t* index, /*!< in: the clustered index */ + ulint** offsets,/*!< in/out: offsets returned by + rec_get_offsets(rec, index) */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/*!< in: memory heap from which the memory for + *old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + const rec_t** old_vers)/*!< out: rec, old version, or NULL if the + record does not exist in the view, that is, + it was freshly inserted afterwards */ +{ + const rec_t* version; + mem_heap_t* heap = NULL; + byte* buf; + trx_id_t rec_trx_id = 0; + + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(rec_offs_validate(rec, index, *offsets)); + + version = rec; + + for (;;) { + trx_id_t* version_trx_descr; + mem_heap_t* heap2; + rec_t* prev_version; + trx_id_t version_trx_id; + + version_trx_id = row_get_rec_trx_id(version, index, *offsets); + if (rec == version) { + rec_trx_id = version_trx_id; + } + + mutex_enter(&trx_sys->mutex); + version_trx_descr = trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + version_trx_id); + /* Because version_trx is a read-write transaction, + its state cannot change from or to NOT_STARTED while + we are holding the trx_sys->mutex. It may change from + ACTIVE to PREPARED or COMMITTED. */ + mutex_exit(&trx_sys->mutex); + + if (!version_trx_descr) { +committed_version_trx: + /* We found a version that belongs to a + committed transaction: return it. 
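+
+(For context, a sketch of the intended use; the call site is assumed,
+not shown here: under a semi-consistent read an UPDATE evaluates its
+WHERE clause against the version returned here instead of blocking on
+the row lock,
+
+	row_vers_build_for_semi_consistent_read(rec, ..., &old_vers);
+	if (old_vers == NULL) skip the row;
+	else evaluate the condition on old_vers;
+
+and only locks rows that still match.)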
*/ + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (rec == version) { + *old_vers = rec; + break; + } + + /* We assume that a rolled-back transaction stays in + TRX_STATE_ACTIVE state until all the changes have been + rolled back and the transaction is removed from + the global list of transactions. */ + + if (rec_trx_id == version_trx_id) { + /* The transaction was committed while + we searched for earlier versions. + Return the current version as a + semi-consistent read. */ + + version = rec; + *offsets = rec_get_offsets(version, + index, *offsets, + ULINT_UNDEFINED, + offset_heap); + } + + buf = static_cast<byte*>( + mem_heap_alloc( + in_heap, rec_offs_size(*offsets))); + + *old_vers = rec_copy(buf, version, *offsets); + rec_offs_make_valid(*old_vers, index, *offsets); + break; + } + + DEBUG_SYNC_C("after_row_vers_check_trx_active"); + + heap2 = heap; + heap = mem_heap_create(1024); + + if (!trx_undo_prev_version_build(rec, mtr, version, index, + *offsets, heap, + &prev_version)) { + mem_heap_free(heap); + heap = heap2; + heap2 = NULL; + goto committed_version_trx; + } + + if (heap2) { + mem_heap_free(heap2); /* free version */ + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + break; + } + + version = prev_version; + *offsets = rec_get_offsets(version, index, *offsets, + ULINT_UNDEFINED, offset_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + }/* for (;;) */ + + if (heap) { + mem_heap_free(heap); + } +} diff --git a/storage/xtradb/srv/srv0conc.cc b/storage/xtradb/srv/srv0conc.cc new file mode 100644 index 00000000000..6c15753246a --- /dev/null +++ b/storage/xtradb/srv/srv0conc.cc @@ -0,0 +1,618 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file srv/srv0conc.cc + +InnoDB concurrency manager + +Created 2011/04/18 Sunny Bains +*******************************************************/ + +#include "srv0srv.h" +#include "sync0sync.h" +#include "btr0types.h" +#include "trx0trx.h" + +#include "mysql/plugin.h" + +/** Number of times a thread is allowed to enter InnoDB within the same +SQL query after it has once got the ticket. */ +UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500; + +#ifdef HAVE_ATOMIC_BUILTINS +/** Maximum sleep delay (in micro-seconds), value of 0 disables it. */ +UNIV_INTERN ulong srv_adaptive_max_sleep_delay = 150000; +#endif /* HAVE_ATOMIC_BUILTINS */ + +UNIV_INTERN ulong srv_thread_sleep_delay = 10000; + + +/** We are prepared for a situation that we have this many threads waiting for +a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the +value. */ + +UNIV_INTERN ulint srv_max_n_threads = 0; + +/** The following controls how many threads we let inside InnoDB concurrently: +threads waiting for locks are not counted into the number because otherwise +we could get a deadlock. Value of 0 will disable the concurrency check. */ + +UNIV_INTERN ulong srv_thread_concurrency = 0; + +#ifndef HAVE_ATOMIC_BUILTINS + +/** This mutex protects srv_conc data structures */ +static os_fast_mutex_t srv_conc_mutex; + +/** Concurrency list node */ +typedef UT_LIST_NODE_T(struct srv_conc_slot_t) srv_conc_node_t; + +/** Slot for a thread waiting in the concurrency control queue. */ +struct srv_conc_slot_t{ + os_event_t event; /*!< event to wait */ + ibool reserved; /*!< TRUE if slot + reserved */ + ibool wait_ended; /*!< TRUE when another thread has + already set the event and the thread + in this slot is free to proceed; but + reserved may still be TRUE at that + point */ + srv_conc_node_t srv_conc_queue; /*!< queue node */ +}; + +/** Queue of threads waiting to get in */ +typedef UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue_t; + +static srv_conc_queue_t srv_conc_queue; + +/** Array of wait slots */ +static srv_conc_slot_t* srv_conc_slots; + +#if defined(UNIV_PFS_MUTEX) +/* Key to register srv_conc_mutex_key with performance schema */ +UNIV_INTERN mysql_pfs_key_t srv_conc_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#endif /* !HAVE_ATOMIC_BUILTINS */ + +/** Variables tracking the active and waiting threads. */ +struct srv_conc_t { + char pad[64 - (sizeof(ulint) + sizeof(lint))]; + + /** Number of transactions that have declared_to_be_inside_innodb set. + It used to be a non-error for this value to drop below zero temporarily. + This is no longer true. We'll, however, keep the lint datatype to add + assertions to catch any corner cases that we may have missed. */ + + volatile lint n_active; + + /** Number of OS threads waiting in the FIFO for permission to + enter InnoDB */ + volatile lint n_waiting; +}; + +/* Control variables for tracking concurrency. 
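+
+A note on srv_conc_t above: the pad[] member, sized as
+64 - (sizeof(ulint) + sizeof(lint)), keeps n_active and n_waiting,
+which many threads update atomically, away from whatever data the
+linker places immediately before them (64-byte cache lines assumed),
+so that the hot counters do not false-share a cache line.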
*/ +static srv_conc_t srv_conc; + +/*********************************************************************//** +Initialise the concurrency management data structures */ +void +srv_conc_init(void) +/*===============*/ +{ +#ifndef HAVE_ATOMIC_BUILTINS + ulint i; + + /* Init the server concurrency restriction data structures */ + + os_fast_mutex_init(srv_conc_mutex_key, &srv_conc_mutex); + + UT_LIST_INIT(srv_conc_queue); + + srv_conc_slots = static_cast<srv_conc_slot_t*>( + mem_zalloc(OS_THREAD_MAX_N * sizeof(*srv_conc_slots))); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + srv_conc_slot_t* conc_slot = &srv_conc_slots[i]; + + conc_slot->event = os_event_create(); + ut_a(conc_slot->event); + } +#endif /* !HAVE_ATOMIC_BUILTINS */ +} + +/*********************************************************************//** +Free the concurrency management data structures */ +void +srv_conc_free(void) +/*===============*/ +{ +#ifndef HAVE_ATOMIC_BUILTINS + os_fast_mutex_free(&srv_conc_mutex); + mem_free(srv_conc_slots); + srv_conc_slots = NULL; +#endif /* !HAVE_ATOMIC_BUILTINS */ +} + +#ifdef HAVE_ATOMIC_BUILTINS +/*********************************************************************//** +Note that a user thread is entering InnoDB. */ +static +void +srv_enter_innodb_with_tickets( +/*==========================*/ + trx_t* trx) /*!< in/out: transaction that wants + to enter InnoDB */ +{ + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter; +} + +/*********************************************************************//** +Handle the scheduling of a user thread that wants to enter InnoDB. Setting +srv_adaptive_max_sleep_delay > 0 switches the adaptive sleep calibration to +ON. When set, we want to wait in the queue for as little time as possible. +However, very short waits will result in a lot of context switches and that +is also not desirable. When threads need to sleep multiple times we increment +os_thread_sleep_delay by one. When we see threads getting a slot without +waiting and there are no other threads waiting in the queue, we try and reduce +the wait as much as we can. Currently we reduce it by half each time. If the +thread only had to wait for one turn before it was able to enter InnoDB we +decrement it by one. This is to try and keep the sleep time stable around the +"optimum" sleep time. */ +static +void +srv_conc_enter_innodb_with_atomics( +/*===============================*/ + trx_t* trx) /*!< in/out: transaction that wants + to enter InnoDB */ +{ + ulint n_sleeps = 0; + ibool notified_mysql = FALSE; + + ut_a(!trx->declared_to_be_inside_innodb); + + for (;;) { + ulint sleep_in_us; + + if (srv_conc.n_active < (lint) srv_thread_concurrency) { + ulint n_active; + + /* Check if there are any free tickets. */ + n_active = os_atomic_increment_lint( + &srv_conc.n_active, 1); + + if (n_active <= srv_thread_concurrency) { + + srv_enter_innodb_with_tickets(trx); + + if (notified_mysql) { + + (void) os_atomic_decrement_lint( + &srv_conc.n_waiting, 1); + + thd_wait_end(trx->mysql_thd); + } + + if (srv_adaptive_max_sleep_delay > 0) { + if (srv_thread_sleep_delay > 20 + && n_sleeps == 1) { + + --srv_thread_sleep_delay; + } + + if (srv_conc.n_waiting == 0) { + srv_thread_sleep_delay >>= 1; + } + } + + return; + } + + /* Since there were no free seats, we relinquish + the overbooked ticket. 
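+
+This is the optimistic half of a classic increment-then-check
+admission scheme; in outline (a sketch of the surrounding code):
+
+	n = os_atomic_increment_lint(&srv_conc.n_active, 1);
+	if (n <= srv_thread_concurrency)  ->  admitted, keep the ticket
+	otherwise  ->  os_atomic_decrement_lint() and keep waiting
+
+Two racing threads may both increment past the limit; both then back
+off, so n_active can transiently exceed srv_thread_concurrency but
+never stays above it.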
*/ + + (void) os_atomic_decrement_lint( + &srv_conc.n_active, 1); + } + + if (!notified_mysql) { + (void) os_atomic_increment_lint( + &srv_conc.n_waiting, 1); + + /* Release possible search system latch this + thread has */ + + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK); + + notified_mysql = TRUE; + } + + trx->op_info = "sleeping before entering InnoDB"; + + sleep_in_us = srv_thread_sleep_delay; + + /* Guard against overflow when adaptive sleep delay is on. */ + + if (srv_adaptive_max_sleep_delay > 0 + && sleep_in_us > srv_adaptive_max_sleep_delay) { + + sleep_in_us = srv_adaptive_max_sleep_delay; + srv_thread_sleep_delay = static_cast<ulong>(sleep_in_us); + } + + os_thread_sleep(sleep_in_us); + trx->innodb_que_wait_timer += sleep_in_us; + + trx->op_info = ""; + + ++n_sleeps; + + if (srv_adaptive_max_sleep_delay > 0 && n_sleeps > 1) { + ++srv_thread_sleep_delay; + } + } +} + +/*********************************************************************//** +Note that a user thread is leaving InnoDB code. */ +static +void +srv_conc_exit_innodb_with_atomics( +/*==============================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + trx->n_tickets_to_enter_innodb = 0; + trx->declared_to_be_inside_innodb = FALSE; + + (void) os_atomic_decrement_lint(&srv_conc.n_active, 1); +} +#else +/*********************************************************************//** +Note that a user thread is leaving InnoDB code. */ +static +void +srv_conc_exit_innodb_without_atomics( +/*=================================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + srv_conc_slot_t* slot; + + os_fast_mutex_lock(&srv_conc_mutex); + + ut_ad(srv_conc.n_active > 0); + srv_conc.n_active--; + trx->declared_to_be_inside_innodb = FALSE; + trx->n_tickets_to_enter_innodb = 0; + + slot = NULL; + + if (srv_conc.n_active < (lint) srv_thread_concurrency) { + /* Look for a slot where a thread is waiting and no other + thread has yet released the thread */ + + for (slot = UT_LIST_GET_FIRST(srv_conc_queue); + slot != NULL && slot->wait_ended == TRUE; + slot = UT_LIST_GET_NEXT(srv_conc_queue, slot)) { + + /* No op */ + } + + if (slot != NULL) { + slot->wait_ended = TRUE; + + /* We increment the count on behalf of the released + thread */ + + srv_conc.n_active++; + } + } + + os_fast_mutex_unlock(&srv_conc_mutex); + + if (slot != NULL) { + os_event_set(slot->event); + } +} + +/*********************************************************************//** +Handle the scheduling of a user thread that wants to enter InnoDB. 
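+
+This is the fallback for platforms without atomic builtins; in
+summary (the steps of the code that follows): take srv_conc_mutex;
+enter at once if a ticket is free; otherwise sleep once to absorb
+short spikes, then reserve a slot in srv_conc_slots, append it to
+srv_conc_queue and block on the slot's event until a leaving thread
+hands over its ticket in srv_conc_exit_innodb_without_atomics().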
*/ +static +void +srv_conc_enter_innodb_without_atomics( +/*==================================*/ + trx_t* trx) /*!< in/out: transaction that wants + to enter InnoDB */ +{ + ulint i; + srv_conc_slot_t* slot = NULL; + ibool has_slept = FALSE; + ib_uint64_t start_time = 0L; + ib_uint64_t finish_time = 0L; + ulint sec; + ulint ms; + + os_fast_mutex_lock(&srv_conc_mutex); +retry: + if (UNIV_UNLIKELY(trx->declared_to_be_inside_innodb)) { + os_fast_mutex_unlock(&srv_conc_mutex); + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: trying to declare trx" + " to enter InnoDB, but\n" + "InnoDB: it already is declared.\n", stderr); + trx_print(stderr, trx, 0); + putc('\n', stderr); + return; + } + + ut_ad(srv_conc.n_active >= 0); + + if (srv_conc.n_active < (lint) srv_thread_concurrency) { + + srv_conc.n_active++; + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter; + + os_fast_mutex_unlock(&srv_conc_mutex); + + return; + } + + /* If the transaction is not holding resources, let it sleep + for srv_thread_sleep_delay microseconds, and try again then */ + + if (!has_slept && !trx->has_search_latch + && NULL == UT_LIST_GET_FIRST(trx->lock.trx_locks)) { + + has_slept = TRUE; /* We let it sleep only once to avoid + starvation */ + + srv_conc.n_waiting++; + + os_fast_mutex_unlock(&srv_conc_mutex); + + trx->op_info = "sleeping before joining InnoDB queue"; + + /* Peter Zaitsev suggested that we take the sleep away + altogether. But the sleep may be good in pathological + situations of lots of thread switches. Simply put some + threads aside for a while to reduce the number of thread + switches. */ + if (srv_thread_sleep_delay > 0) { + os_thread_sleep(srv_thread_sleep_delay); + trx->innodb_que_wait_timer += srv_thread_sleep_delay; + } + + trx->op_info = ""; + + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc.n_waiting--; + + goto retry; + } + + /* Too many threads inside: put the current thread to a queue */ + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + slot = srv_conc_slots + i; + + if (!slot->reserved) { + + break; + } + } + + if (i == OS_THREAD_MAX_N) { + /* Could not find a free wait slot, we must let the + thread enter */ + + srv_conc.n_active++; + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = 0; + + os_fast_mutex_unlock(&srv_conc_mutex); + + return; + } + + /* Release possible search system latch this thread has */ + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + /* Add to the queue */ + slot->reserved = TRUE; + slot->wait_ended = FALSE; + + UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot); + + os_event_reset(slot->event); + + srv_conc.n_waiting++; + + os_fast_mutex_unlock(&srv_conc_mutex); + + /* Go to wait for the event; when a thread leaves InnoDB it will + release this thread */ + + ut_ad(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + start_time = (ib_uint64_t)sec * 1000000 + ms; + } else { + start_time = 0; + } + + trx->op_info = "waiting in InnoDB queue"; + + thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK); + + os_event_wait(slot->event); + thd_wait_end(trx->mysql_thd); + + trx->op_info = ""; + + if (UNIV_UNLIKELY(start_time != 0)) { + ut_usectime(&sec, &ms); + finish_time = (ib_uint64_t)sec * 1000000 + ms; + trx->innodb_que_wait_timer += (ulint)(finish_time - start_time); + } + +
os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc.n_waiting--; + + /* NOTE that the thread which released this thread already + incremented the thread counter on behalf of this thread */ + + slot->reserved = FALSE; + + UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot); + + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter; + + os_fast_mutex_unlock(&srv_conc_mutex); +} +#endif /* HAVE_ATOMIC_BUILTINS */ + +/*********************************************************************//** +Puts an OS thread to wait if there are too many concurrent threads +(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */ +UNIV_INTERN +void +srv_conc_enter_innodb( +/*==================*/ + trx_t* trx) /*!< in: transaction object associated with the + thread */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + +#ifdef HAVE_ATOMIC_BUILTINS + srv_conc_enter_innodb_with_atomics(trx); +#else + srv_conc_enter_innodb_without_atomics(trx); +#endif /* HAVE_ATOMIC_BUILTINS */ +} + +/*********************************************************************//** +This lets a thread enter InnoDB regardless of the number of threads inside +InnoDB. This must be called when a thread ends a lock wait. */ +UNIV_INTERN +void +srv_conc_force_enter_innodb( +/*========================*/ + trx_t* trx) /*!< in: transaction object associated with the + thread */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!srv_thread_concurrency) { + + return; + } + + ut_ad(srv_conc.n_active >= 0); + +#ifdef HAVE_ATOMIC_BUILTINS + (void) os_atomic_increment_lint(&srv_conc.n_active, 1); +#else + os_fast_mutex_lock(&srv_conc_mutex); + ++srv_conc.n_active; + os_fast_mutex_unlock(&srv_conc_mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + trx->n_tickets_to_enter_innodb = 1; + trx->declared_to_be_inside_innodb = TRUE; +} + +/*********************************************************************//** +This must be called when a thread exits InnoDB in a lock wait or at the +end of an SQL statement. */ +UNIV_INTERN +void +srv_conc_force_exit_innodb( +/*=======================*/ + trx_t* trx) /*!< in: transaction object associated with the + thread */ +{ + if ((trx->mysql_thd != NULL + && thd_is_replication_slave_thread(trx->mysql_thd)) + || trx->declared_to_be_inside_innodb == FALSE) { + + return; + } + +#ifdef HAVE_ATOMIC_BUILTINS + srv_conc_exit_innodb_with_atomics(trx); +#else + srv_conc_exit_innodb_without_atomics(trx); +#endif /* HAVE_ATOMIC_BUILTINS */ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ +} + +/*********************************************************************//** +Get the count of threads waiting inside InnoDB. */ +UNIV_INTERN +ulint +srv_conc_get_waiting_threads(void) +/*==============================*/ +{ + return(srv_conc.n_waiting); +} + +/*********************************************************************//** +Get the count of threads active inside InnoDB. 
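+
+Like srv_conc_get_waiting_threads() above, this is an unsynchronized
+snapshot of a volatile counter; a monitoring caller doing, say,
+
+	ulint n = srv_conc_get_active_threads();
+
+must treat n as advisory, since it may be stale by the time it is
+used.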
*/ +UNIV_INTERN +ulint +srv_conc_get_active_threads(void) +/*==============================*/ +{ + return(srv_conc.n_active); +} diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc new file mode 100644 index 00000000000..64417b1e5fb --- /dev/null +++ b/storage/xtradb/srv/srv0mon.cc @@ -0,0 +1,1930 @@ +/***************************************************************************** + +Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file srv/srv0mon.cc +Database monitor counter interfaces + +Created 12/9/2009 Jimmy Yang +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +#include "os0file.h" +#include "mach0data.h" +#include "srv0mon.h" +#include "srv0srv.h" +#include "buf0buf.h" +#include "trx0sys.h" +#include "trx0rseg.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#ifdef UNIV_NONINL +#include "srv0mon.ic" +#endif + +/* Macro to standardize the counter names for counters in the +"monitor_buf_page" module as they have very structured defines */ +#define MONITOR_BUF_PAGE(name, description, code, op, op_code) \ + {"buffer_page_" op "_" name, "buffer_page_io", \ + "Number of " description " Pages " op, \ + MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START, \ + MONITOR_##code##_##op_code} + +#define MONITOR_BUF_PAGE_READ(name, description, code) \ + MONITOR_BUF_PAGE(name, description, code, "read", PAGE_READ) + +#define MONITOR_BUF_PAGE_WRITTEN(name, description, code) \ + MONITOR_BUF_PAGE(name, description, code, "written", PAGE_WRITTEN) + + +/** This array defines basic static information of monitor counters, +including each monitor's name, module it belongs to, a short +description and its property/type and corresponding monitor_id. +Please note: If you add a monitor here, please add its corresponding +monitor_id to "enum monitor_id_value" structure in srv0mon.h file. */ + +static monitor_info_t innodb_counter_info[] = +{ + /* A dummy item to mark the module start, this is + to accommodate the default value (0) set for the + global variables with the control system.
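+
+(As a worked example of the MONITOR_BUF_PAGE_* macros defined above,
+shown here for illustration only:
+
+	MONITOR_BUF_PAGE_READ("index_leaf", "Index Leaf", INDEX_LEAF)
+
+expands to the entry
+
+	{"buffer_page_read_index_leaf", "buffer_page_io",
+	"Number of Index Leaf Pages read",
+	MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START,
+	MONITOR_INDEX_LEAF_PAGE_READ}
+
+so all buffer page I/O counters follow one naming pattern.)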
*/ + {"module_start", "module_start", "module_start", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_DEFAULT_START}, + + /* ========== Counters for Server Metadata ========== */ + {"module_metadata", "metadata", "Server Metadata", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_METADATA}, + + {"metadata_table_handles_opened", "metadata", + "Number of table handles opened", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TABLE_OPEN}, + + {"metadata_table_handles_closed", "metadata", + "Number of table handles closed", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TABLE_CLOSE}, + + {"metadata_table_reference_count", "metadata", + "Table reference counter", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TABLE_REFERENCE}, + + {"metadata_mem_pool_size", "metadata", + "Size of a memory pool InnoDB uses to store data dictionary" + " and internal data structures in bytes", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_META_MEM_POOL}, + + /* ========== Counters for Lock Module ========== */ + {"module_lock", "lock", "Lock Module", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_LOCK}, + + {"lock_deadlocks", "lock", "Number of deadlocks", + MONITOR_DEFAULT_ON, + MONITOR_DEFAULT_START, MONITOR_DEADLOCK}, + + {"lock_timeouts", "lock", "Number of lock timeouts", + MONITOR_DEFAULT_ON, + MONITOR_DEFAULT_START, MONITOR_TIMEOUT}, + + {"lock_rec_lock_waits", "lock", + "Number of times enqueued into record lock wait queue", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_LOCKREC_WAIT}, + + {"lock_table_lock_waits", "lock", + "Number of times enqueued into table lock wait queue", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TABLELOCK_WAIT}, + + {"lock_rec_lock_requests", "lock", + "Number of record locks requested", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK_REQ}, + + {"lock_rec_lock_created", "lock", "Number of record locks created", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_RECLOCK_CREATED}, + + {"lock_rec_lock_removed", "lock", + "Number of record locks removed from the lock queue", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_RECLOCK_REMOVED}, + + {"lock_rec_locks", "lock", + "Current number of record locks on tables", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK}, + + {"lock_table_lock_created", "lock", "Number of table locks created", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TABLELOCK_CREATED}, + + {"lock_table_lock_removed", "lock", + "Number of table locks removed from the lock queue", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TABLELOCK_REMOVED}, + + {"lock_table_locks", "lock", + "Current number of table locks on tables", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_NUM_TABLELOCK}, + + {"lock_row_lock_current_waits", "lock", + "Number of row locks currently being waited for" + " (innodb_row_lock_current_waits)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT}, + + {"lock_row_lock_time", "lock", + "Time spent in acquiring row locks, in milliseconds" + " (innodb_row_lock_time)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_WAIT_TIME}, + + {"lock_row_lock_time_max", "lock", + "The maximum time to acquire a row lock, in milliseconds" + " (innodb_row_lock_time_max)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), 
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_MAX_WAIT_TIME}, + + {"lock_row_lock_waits", "lock", + "Number of times a row lock had to be waited for" + " (innodb_row_lock_waits)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_WAIT}, + + {"lock_row_lock_time_avg", "lock", + "The average time to acquire a row lock, in milliseconds" + " (innodb_row_lock_time_avg)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_AVG_WAIT_TIME}, + + /* ========== Counters for Buffer Manager and I/O ========== */ + {"module_buffer", "buffer", "Buffer Manager Module", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_BUFFER}, + + {"buffer_pool_size", "server", + "Server buffer pool size (all buffer pools) in bytes", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUFFER_POOL_SIZE}, + + {"buffer_pool_reads", "buffer", + "Number of reads directly from disk (innodb_buffer_pool_reads)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READS}, + + {"buffer_pool_read_requests", "buffer", + "Number of logical read requests (innodb_buffer_pool_read_requests)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_REQUESTS}, + + {"buffer_pool_write_requests", "buffer", + "Number of write requests (innodb_buffer_pool_write_requests)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WRITE_REQUEST}, + + {"buffer_pool_wait_free", "buffer", + "Number of times waited for free buffer" + " (innodb_buffer_pool_wait_free)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WAIT_FREE}, + + {"buffer_pool_read_ahead", "buffer", + "Number of pages read as read ahead (innodb_buffer_pool_read_ahead)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD}, + + {"buffer_pool_read_ahead_evicted", "buffer", + "Read-ahead pages evicted without being accessed" + " (innodb_buffer_pool_read_ahead_evicted)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED}, + + {"buffer_pool_pages_total", "buffer", + "Total buffer pool size in pages (innodb_buffer_pool_pages_total)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_TOTAL}, + + {"buffer_pool_pages_misc", "buffer", + "Buffer pages for misc use such as row locks or the adaptive" + " hash index (innodb_buffer_pool_pages_misc)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_MISC}, + + {"buffer_pool_pages_data", "buffer", + "Buffer pages containing data (innodb_buffer_pool_pages_data)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DATA}, + + {"buffer_pool_bytes_data", "buffer", + "Buffer bytes containing data (innodb_buffer_pool_bytes_data)", + static_cast<monitor_type_t>( + MONITOR_EXISTING 
| MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DATA}, + + {"buffer_pool_pages_dirty", "buffer", + "Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DIRTY}, + + {"buffer_pool_bytes_dirty", "buffer", + "Buffer bytes currently dirty (innodb_buffer_pool_bytes_dirty)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DIRTY}, + + {"buffer_pool_pages_free", "buffer", + "Buffer pages currently free (innodb_buffer_pool_pages_free)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_FREE}, + + {"buffer_pages_created", "buffer", + "Number of pages created (innodb_pages_created)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_CREATED}, + + {"buffer_pages_written", "buffer", + "Number of pages written (innodb_pages_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN}, + + {"buffer_pages_read", "buffer", + "Number of pages read (innodb_pages_read)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ}, + + {"buffer_data_reads", "buffer", + "Amount of data read in bytes (innodb_data_reads)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_READ}, + + {"buffer_data_written", "buffer", + "Amount of data written in bytes (innodb_data_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_WRITTEN}, + + /* Cumulative counter for scanning in flush batches */ + {"buffer_flush_batch_scanned", "buffer", + "Total pages scanned as part of flush batch", + MONITOR_SET_OWNER, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED}, + + {"buffer_flush_batch_num_scan", "buffer", + "Number of times buffer flush list flush is called", + MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL}, + + {"buffer_flush_batch_scanned_per_call", "buffer", + "Pages scanned per flush batch scan", + MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL}, + + {"buffer_flush_batch_rescan", "buffer", + "Number of times rescan of flush list forced", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_HP_RESCAN}, + + /* Cumulative counter for pages flushed in flush batches */ + {"buffer_flush_batch_total_pages", "buffer", + "Total pages flushed as part of flush batch", + MONITOR_SET_OWNER, MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_TOTAL_PAGE}, + + {"buffer_flush_batches", "buffer", + "Number of flush batches", + MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT}, + + {"buffer_flush_batch_pages", "buffer", + "Pages queued as a flush batch", + MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_PAGES}, + + /* Cumulative counter for flush batches because of neighbor */ + {"buffer_flush_neighbor_total_pages", "buffer", + "Total neighbors flushed as part of neighbor flush", + MONITOR_SET_OWNER, MONITOR_FLUSH_NEIGHBOR_COUNT, + 
MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE}, + + {"buffer_flush_neighbor", "buffer", + "Number of times neighbors flushing is invoked", + MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT}, + + {"buffer_flush_neighbor_pages", "buffer", + "Pages queued as a neighbor batch", + MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_PAGES}, + + {"buffer_flush_n_to_flush_requested", "buffer", + "Number of pages requested for flushing.", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_REQUESTED}, + + {"buffer_flush_avg_page_rate", "buffer", + "Average number of pages at which flushing is happening", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_PAGE_RATE}, + + {"buffer_flush_lsn_avg_rate", "buffer", + "Average redo generation rate", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_LSN_AVG_RATE}, + + {"buffer_flush_pct_for_dirty", "buffer", + "Percent of IO capacity used to avoid max dirty page limit", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_DIRTY}, + + {"buffer_flush_pct_for_lsn", "buffer", + "Percent of IO capacity used to avoid reusable redo space limit", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_LSN}, + + {"buffer_flush_sync_waits", "buffer", + "Number of times a wait happens due to sync flushing", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_FLUSH_SYNC_WAITS}, + + /* Cumulative counter for flush batches for adaptive flushing */ + {"buffer_flush_adaptive_total_pages", "buffer", + "Total pages flushed as part of adaptive flushing", + MONITOR_SET_OWNER, MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE}, + + {"buffer_flush_adaptive", "buffer", + "Number of adaptive batches", + MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT}, + + {"buffer_flush_adaptive_pages", "buffer", + "Pages queued as an adaptive batch", + MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_PAGES}, + + /* Cumulative counter for flush batches because of sync */ + {"buffer_flush_sync_total_pages", "buffer", + "Total pages flushed as part of sync batches", + MONITOR_SET_OWNER, MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_TOTAL_PAGE}, + + {"buffer_flush_sync", "buffer", + "Number of sync batches", + MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT}, + + {"buffer_flush_sync_pages", "buffer", + "Pages queued as a sync batch", + MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_PAGES}, + + /* Cumulative counter for flush batches because of background */ + {"buffer_flush_background_total_pages", "buffer", + "Total pages flushed as part of background batches", + MONITOR_SET_OWNER, MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE}, + + {"buffer_flush_background", "buffer", + "Number of background batches", + MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT}, + + {"buffer_flush_background_pages", "buffer", + "Pages queued as a background batch", + MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_PAGES}, + + /* Cumulative counter for LRU batch scan */ + {"buffer_LRU_batch_scanned", "buffer", + "Total pages scanned as part of LRU batch", + MONITOR_SET_OWNER, MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED}, + + {"buffer_LRU_batch_num_scan", "buffer", + "Number of times LRU batch is called", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED, + 
MONITOR_LRU_BATCH_SCANNED_NUM_CALL}, + + {"buffer_LRU_batch_scanned_per_call", "buffer", + "Pages scanned per LRU batch call", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_PER_CALL}, + + /* Cumulative counter for LRU batch pages flushed */ + {"buffer_LRU_batch_total_pages", "buffer", + "Total pages flushed as part of LRU batches", + MONITOR_SET_OWNER, MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_TOTAL_PAGE}, + + {"buffer_LRU_batches", "buffer", + "Number of LRU batches", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT}, + + {"buffer_LRU_batch_pages", "buffer", + "Pages queued as an LRU batch", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_PAGES}, + + /* Cumulative counter for single page LRU scans */ + {"buffer_LRU_single_flush_scanned", "buffer", + "Total pages scanned as part of single page LRU flush", + MONITOR_SET_OWNER, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, + MONITOR_LRU_SINGLE_FLUSH_SCANNED}, + + {"buffer_LRU_single_flush_num_scan", "buffer", + "Number of times single page LRU flush is called", + MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL}, + + {"buffer_LRU_single_flush_scanned_per_call", "buffer", + "Page scanned per single LRU flush", + MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL}, + + {"buffer_LRU_single_flush_failure_count", "Buffer", + "Number of times attempt to flush a single page from LRU failed", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT}, + + {"buffer_LRU_get_free_search", "Buffer", + "Number of searches performed for a clean page", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_SEARCH}, + + /* Cumulative counter for LRU search scans */ + {"buffer_LRU_search_scanned", "buffer", + "Total pages scanned as part of LRU search", + MONITOR_SET_OWNER, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED}, + + {"buffer_LRU_search_num_scan", "buffer", + "Number of times LRU search is performed", + MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL}, + + {"buffer_LRU_search_scanned_per_call", "buffer", + "Page scanned per single LRU search", + MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL}, + + /* Cumulative counter for LRU unzip search scans */ + {"buffer_LRU_unzip_search_scanned", "buffer", + "Total pages scanned as part of LRU unzip search", + MONITOR_SET_OWNER, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED}, + + {"buffer_LRU_unzip_search_num_scan", "buffer", + "Number of times LRU unzip search is performed", + MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL}, + + {"buffer_LRU_unzip_search_scanned_per_call", "buffer", + "Page scanned per single LRU unzip search", + MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL}, + + /* ========== Counters for Buffer Page I/O ========== */ + {"module_buffer_page", "buffer_page_io", "Buffer Page I/O Module", + static_cast<monitor_type_t>( + MONITOR_MODULE | MONITOR_GROUP_MODULE), + MONITOR_DEFAULT_START, MONITOR_MODULE_BUF_PAGE}, + + MONITOR_BUF_PAGE_READ("index_leaf","Index Leaf", INDEX_LEAF), + + MONITOR_BUF_PAGE_READ("index_non_leaf","Index Non-leaf", + INDEX_NON_LEAF), + + MONITOR_BUF_PAGE_READ("index_ibuf_leaf", "Insert Buffer Index Leaf", + INDEX_IBUF_LEAF), + + 
MONITOR_BUF_PAGE_READ("index_ibuf_non_leaf", + "Insert Buffer Index Non-Leaf", + INDEX_IBUF_NON_LEAF), + + MONITOR_BUF_PAGE_READ("undo_log", "Undo Log", UNDO_LOG), + + MONITOR_BUF_PAGE_READ("index_inode", "Index Inode", INODE), + + MONITOR_BUF_PAGE_READ("ibuf_free_list", "Insert Buffer Free List", + IBUF_FREELIST), + + MONITOR_BUF_PAGE_READ("ibuf_bitmap", "Insert Buffer Bitmap", + IBUF_BITMAP), + + MONITOR_BUF_PAGE_READ("system_page", "System", SYSTEM), + + MONITOR_BUF_PAGE_READ("trx_system", "Transaction System", TRX_SYSTEM), + + MONITOR_BUF_PAGE_READ("fsp_hdr", "File Space Header", FSP_HDR), + + MONITOR_BUF_PAGE_READ("xdes", "Extent Descriptor", XDES), + + MONITOR_BUF_PAGE_READ("blob", "Uncompressed BLOB", BLOB), + + MONITOR_BUF_PAGE_READ("zblob", "First Compressed BLOB", ZBLOB), + + MONITOR_BUF_PAGE_READ("zblob2", "Subsequent Compressed BLOB", ZBLOB2), + + MONITOR_BUF_PAGE_READ("other", "other/unknown (old version of InnoDB)", + OTHER), + + MONITOR_BUF_PAGE_WRITTEN("index_leaf","Index Leaf", INDEX_LEAF), + + MONITOR_BUF_PAGE_WRITTEN("index_non_leaf","Index Non-leaf", + INDEX_NON_LEAF), + + MONITOR_BUF_PAGE_WRITTEN("index_ibuf_leaf", "Insert Buffer Index Leaf", + INDEX_IBUF_LEAF), + + MONITOR_BUF_PAGE_WRITTEN("index_ibuf_non_leaf", + "Insert Buffer Index Non-Leaf", + INDEX_IBUF_NON_LEAF), + + MONITOR_BUF_PAGE_WRITTEN("undo_log", "Undo Log", UNDO_LOG), + + MONITOR_BUF_PAGE_WRITTEN("index_inode", "Index Inode", INODE), + + MONITOR_BUF_PAGE_WRITTEN("ibuf_free_list", "Insert Buffer Free List", + IBUF_FREELIST), + + MONITOR_BUF_PAGE_WRITTEN("ibuf_bitmap", "Insert Buffer Bitmap", + IBUF_BITMAP), + + MONITOR_BUF_PAGE_WRITTEN("system_page", "System", SYSTEM), + + MONITOR_BUF_PAGE_WRITTEN("trx_system", "Transaction System", + TRX_SYSTEM), + + MONITOR_BUF_PAGE_WRITTEN("fsp_hdr", "File Space Header", FSP_HDR), + + MONITOR_BUF_PAGE_WRITTEN("xdes", "Extent Descriptor", XDES), + + MONITOR_BUF_PAGE_WRITTEN("blob", "Uncompressed BLOB", BLOB), + + MONITOR_BUF_PAGE_WRITTEN("zblob", "First Compressed BLOB", ZBLOB), + + MONITOR_BUF_PAGE_WRITTEN("zblob2", "Subsequent Compressed BLOB", + ZBLOB2), + + MONITOR_BUF_PAGE_WRITTEN("other", "other/unknown (old version InnoDB)", + OTHER), + + /* ========== Counters for OS level operations ========== */ + {"module_os", "os", "OS Level Operation", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_OS}, + + {"os_data_reads", "os", + "Number of reads initiated (innodb_data_reads)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_READ}, + + {"os_data_writes", "os", + "Number of writes initiated (innodb_data_writes)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_WRITE}, + + {"os_data_fsyncs", "os", + "Number of fsync() calls (innodb_data_fsyncs)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FSYNC}, + + {"os_pending_reads", "os", "Number of reads pending", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OS_PENDING_READS}, + + {"os_pending_writes", "os", "Number of writes pending", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OS_PENDING_WRITES}, + + {"os_log_bytes_written", "os", + "Bytes of log written (innodb_os_log_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_WRITTEN}, + + {"os_log_fsyncs", "os", + "Number of fsync log writes (innodb_os_log_fsyncs)", + 
static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_FSYNC}, + + {"os_log_pending_fsyncs", "os", + "Number of pending fsync write (innodb_os_log_pending_fsyncs)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_FSYNC}, + + {"os_log_pending_writes", "os", + "Number of pending log file writes (innodb_os_log_pending_writes)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_WRITES}, + + /* ========== Counters for Transaction Module ========== */ + {"module_trx", "transaction", "Transaction Manager", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_TRX}, + + {"trx_rw_commits", "transaction", "Number of read-write transactions " + "committed", + MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RW_COMMIT}, + + {"trx_ro_commits", "transaction", "Number of read-only transactions " + "committed", + MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RO_COMMIT}, + + {"trx_nl_ro_commits", "transaction", "Number of non-locking " + "auto-commit read-only transactions committed", + MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_NL_RO_COMMIT}, + + {"trx_commits_insert_update", "transaction", + "Number of transactions committed with inserts and updates", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TRX_COMMIT_UNDO}, + + {"trx_rollbacks", "transaction", + "Number of transactions rolled back", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK}, + + {"trx_rollbacks_savepoint", "transaction", + "Number of transactions rolled back to savepoint", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_SAVEPOINT}, + + {"trx_rollback_active", "transaction", + "Number of resurrected active transactions rolled back", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_ACTIVE}, + + {"trx_active_transactions", "transaction", + "Number of active transactions", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_TRX_ACTIVE}, + + {"trx_rseg_history_len", "transaction", + "Length of the TRX_RSEG_HISTORY list", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_RSEG_HISTORY_LEN}, + + {"trx_undo_slots_used", "transaction", "Number of undo slots used", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_USED}, + + {"trx_undo_slots_cached", "transaction", + "Number of undo slots cached", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_CACHED}, + + {"trx_rseg_current_size", "transaction", + "Current rollback segment size in pages", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_RSEG_CUR_SIZE}, + + /* ========== Counters for Purge Module ========== */ + {"module_purge", "purge", "Purge Module", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_PURGE}, + + {"purge_del_mark_records", "purge", + "Number of delete-marked rows purged", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_N_DEL_ROW_PURGE}, + + {"purge_upd_exist_or_extern_records", "purge", + "Number of purges on updates of existing records and " + " updates on delete marked record with externally stored field", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_N_UPD_EXIST_EXTERN}, + + {"purge_invoked", "purge", + "Number of times purge was invoked", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PURGE_INVOKED}, + + {"purge_undo_log_pages", 
"purge", + "Number of undo log pages handled by the purge", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PURGE_N_PAGE_HANDLED}, + + {"purge_dml_delay_usec", "purge", + "Microseconds DML to be delayed due to purge lagging", + MONITOR_DISPLAY_CURRENT, + MONITOR_DEFAULT_START, MONITOR_DML_PURGE_DELAY}, + + {"purge_stop_count", "purge", + "Number of times purge was stopped", + MONITOR_DISPLAY_CURRENT, + MONITOR_DEFAULT_START, MONITOR_PURGE_STOP_COUNT}, + + {"purge_resume_count", "purge", + "Number of times purge was resumed", + MONITOR_DISPLAY_CURRENT, + MONITOR_DEFAULT_START, MONITOR_PURGE_RESUME_COUNT}, + + /* ========== Counters for Recovery Module ========== */ + {"module_log", "recovery", "Recovery Module", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_RECOVERY}, + + {"log_checkpoints", "recovery", "Number of checkpoints", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_NUM_CHECKPOINT}, + + {"log_lsn_last_flush", "recovery", "LSN of Last flush", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_FLUSHDISK}, + + {"log_lsn_last_checkpoint", "recovery", "LSN at last checkpoint", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CHECKPOINT}, + + {"log_lsn_current", "recovery", "Current LSN value", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CURRENT}, + + {"log_lsn_checkpoint_age", "recovery", + "Current LSN value minus LSN at last checkpoint", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_LSN_CHECKPOINT_AGE}, + + {"log_lsn_buf_pool_oldest", "recovery", + "The oldest modified block LSN in the buffer pool", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_OLDEST_LSN}, + + {"log_max_modified_age_async", "recovery", + "Maximum LSN difference; when exceeded, start asynchronous preflush", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC}, + + {"log_max_modified_age_sync", "recovery", + "Maximum LSN difference; when exceeded, start synchronous preflush", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_SYNC}, + + {"log_pending_log_writes", "recovery", "Pending log writes", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PENDING_LOG_WRITE}, + + {"log_pending_checkpoint_writes", "recovery", "Pending checkpoints", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PENDING_CHECKPOINT_WRITE}, + + {"log_num_log_io", "recovery", "Number of log I/Os", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_LOG_IO}, + + {"log_waits", "recovery", + "Number of log waits due to small log buffer (innodb_log_waits)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WAITS}, + + {"log_write_requests", "recovery", + "Number of log write requests (innodb_log_write_requests)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITE_REQUEST}, + + {"log_writes", "recovery", + "Number of log writes (innodb_log_writes)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITES}, + + /* ========== Counters for Page Compression ========== */ + {"module_compress", "compression", "Page 
Compression Info", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_PAGE}, + + {"compress_pages_compressed", "compression", + "Number of pages compressed", MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PAGE_COMPRESS}, + + {"compress_pages_decompressed", "compression", + "Number of pages decompressed", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PAGE_DECOMPRESS}, + + {"compression_pad_increments", "compression", + "Number of times padding is incremented to avoid compression failures", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PAD_INCREMENTS}, + + {"compression_pad_decrements", "compression", + "Number of times padding is decremented due to good compressibility", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + + /* ========== Counters for Index ========== */ + {"module_index", "index", "Index Manager", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX}, + + {"index_page_splits", "index", "Number of index page splits", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT}, + + {"index_page_merge_attempts", "index", + "Number of index page merge attempts", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE_ATTEMPTS}, + + {"index_page_merge_successful", "index", + "Number of successful index page merges", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE_SUCCESSFUL}, + + {"index_page_reorg_attempts", "index", + "Number of index page reorganization attempts", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_INDEX_REORG_ATTEMPTS}, + + {"index_page_reorg_successful", "index", + "Number of successful index page reorganizations", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_INDEX_REORG_SUCCESSFUL}, + + {"index_page_discards", "index", "Number of index pages discarded", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_INDEX_DISCARD}, + + /* ========== Counters for Adaptive Hash Index ========== */ + {"module_adaptive_hash", "adaptive_hash_index", "Adpative Hash Index", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_ADAPTIVE_HASH}, + + {"adaptive_hash_searches", "adaptive_hash_index", + "Number of successful searches using Adaptive Hash Index", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH}, + + {"adaptive_hash_searches_btree", "adaptive_hash_index", + "Number of searches using B-tree on an index search", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE}, + + {"adaptive_hash_pages_added", "adaptive_hash_index", + "Number of index pages on which the Adaptive Hash Index is built", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_ADDED}, + + {"adaptive_hash_pages_removed", "adaptive_hash_index", + "Number of index pages whose corresponding Adaptive Hash Index" + " entries were removed", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_REMOVED}, + + {"adaptive_hash_rows_added", "adaptive_hash_index", + "Number of Adaptive Hash Index rows added", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_ADDED}, + + {"adaptive_hash_rows_removed", "adaptive_hash_index", + "Number of Adaptive Hash Index rows removed", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVED}, + + {"adaptive_hash_rows_deleted_no_hash_entry", "adaptive_hash_index", + "Number of rows deleted that did not have corresponding Adaptive Hash" + " Index entries", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND}, + + 
{"adaptive_hash_rows_updated", "adaptive_hash_index", + "Number of Adaptive Hash Index rows updated", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_UPDATED}, + + /* ========== Counters for tablespace ========== */ + {"module_file", "file_system", "Tablespace and File System Manager", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_FIL_SYSTEM}, + + {"file_num_open_files", "file_system", + "Number of files currently open (innodb_num_open_files)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_N_FILE_OPENED}, + + /* ========== Counters for Change Buffer ========== */ + {"module_ibuf_system", "change_buffer", "InnoDB Change Buffer", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_IBUF_SYSTEM}, + + {"ibuf_merges_insert", "change_buffer", + "Number of inserted records merged by change buffering", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_INSERT}, + + {"ibuf_merges_delete_mark", "change_buffer", + "Number of deleted records merged by change buffering", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DELETE}, + + {"ibuf_merges_delete", "change_buffer", + "Number of purge records merged by change buffering", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_PURGE}, + + {"ibuf_merges_discard_insert", "change_buffer", + "Number of insert merged operations discarded", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT}, + + {"ibuf_merges_discard_delete_mark", "change_buffer", + "Number of deleted merged operations discarded", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE}, + + {"ibuf_merges_discard_delete", "change_buffer", + "Number of purge merged operations discarded", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE}, + + {"ibuf_merges", "change_buffer", "Number of change buffer merges", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGES}, + + {"ibuf_size", "change_buffer", "Change buffer size in pages", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_SIZE}, + + /* ========== Counters for server operations ========== */ + {"module_innodb", "innodb", + "Counter for general InnoDB server wide operations and properties", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_SERVER}, + + {"innodb_master_thread_sleeps", "server", + "Number of times (seconds) master thread sleeps", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_MASTER_THREAD_SLEEP}, + + {"innodb_activity_count", "server", "Current server activity count", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_SERVER_ACTIVITY}, + + {"innodb_master_active_loops", "server", + "Number of times master thread performs its tasks when" + " server is active", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_MASTER_ACTIVE_LOOPS}, + + {"innodb_master_idle_loops", "server", + "Number of times master thread performs its tasks when server is 
idle", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_MASTER_IDLE_LOOPS}, + + {"innodb_background_drop_table_usec", "server", + "Time (in microseconds) spent to process drop table list", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND}, + + {"innodb_ibuf_merge_usec", "server", + "Time (in microseconds) spent to process change buffer merge", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_SRV_IBUF_MERGE_MICROSECOND}, + + {"innodb_log_flush_usec", "server", + "Time (in microseconds) spent to flush log records", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_SRV_LOG_FLUSH_MICROSECOND}, + + {"innodb_mem_validate_usec", "server", + "Time (in microseconds) spent to do memory validation", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_SRV_MEM_VALIDATE_MICROSECOND}, + + {"innodb_master_purge_usec", "server", + "Time (in microseconds) spent by master thread to purge records", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_SRV_PURGE_MICROSECOND}, + + {"innodb_dict_lru_usec", "server", + "Time (in microseconds) spent to process DICT LRU list", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_MICROSECOND}, + + {"innodb_checkpoint_usec", "server", + "Time (in microseconds) spent by master thread to do checkpoint", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_SRV_CHECKPOINT_MICROSECOND}, + + {"innodb_dblwr_writes", "server", + "Number of doublewrite operations that have been performed" + " (innodb_dblwr_writes)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_WRITES}, + + {"innodb_dblwr_pages_written", "server", + "Number of pages that have been written for doublewrite operations" + " (innodb_dblwr_pages_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN}, + + {"innodb_page_size", "server", + "InnoDB page size in bytes (innodb_page_size)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT), + MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_PAGE_SIZE}, + + {"innodb_rwlock_s_spin_waits", "server", + "Number of rwlock spin waits due to shared latch request", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_WAITS}, + + {"innodb_rwlock_x_spin_waits", "server", + "Number of rwlock spin waits due to exclusive latch request", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_WAITS}, + + {"innodb_rwlock_s_spin_rounds", "server", + "Number of rwlock spin loop rounds due to shared latch request", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS}, + + {"innodb_rwlock_x_spin_rounds", "server", + "Number of rwlock spin loop rounds due to exclusive latch request", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS}, + + {"innodb_rwlock_s_os_waits", "server", + "Number of OS waits due to shared latch request", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_OS_WAITS}, + + {"innodb_rwlock_x_os_waits", "server", + "Number of OS waits due to exclusive latch request", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, 
MONITOR_OVLD_RWLOCK_X_OS_WAITS}, + + /* ========== Counters for DML operations ========== */ + {"module_dml", "dml", "Statistics for DMLs", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_DML_STATS}, + + {"dml_reads", "dml", "Number of rows read", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_READ}, + + {"dml_inserts", "dml", "Number of rows inserted", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_INSERTED}, + + {"dml_deletes", "dml", "Number of rows deleted", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_DELETED}, + + {"dml_updates", "dml", "Number of rows updated", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_UPDTATED}, + + /* ========== Counters for DDL operations ========== */ + {"module_ddl", "ddl", "Statistics for DDLs", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_DDL_STATS}, + + {"ddl_background_drop_indexes", "ddl", + "Number of indexes waiting to be dropped after failed index creation", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_INDEX}, + + {"ddl_background_drop_tables", "ddl", + "Number of tables in background drop table list", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_TABLE}, + + {"ddl_online_create_index", "ddl", + "Number of indexes being created online", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ONLINE_CREATE_INDEX}, + + {"ddl_pending_alter_table", "ddl", + "Number of ALTER TABLE, CREATE INDEX, DROP INDEX in progress", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_PENDING_ALTER_TABLE}, + + /* ===== Counters for ICP (Index Condition Pushdown) Module ===== */ + {"module_icp", "icp", "Index Condition Pushdown", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_MODULE_ICP}, + + {"icp_attempts", "icp", + "Number of attempts for index push-down condition checks", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ICP_ATTEMPTS}, + + {"icp_no_match", "icp", "Index push-down condition does not match", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ICP_NO_MATCH}, + + {"icp_out_of_range", "icp", "Index push-down condition out of range", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ICP_OUT_OF_RANGE}, + + {"icp_match", "icp", "Index push-down condition matches", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_ICP_MATCH}, + + /* ========== To turn on/off reset all counters ========== */ + {"all", "All Counters", "Turn on/off and reset all counters", + MONITOR_MODULE, + MONITOR_DEFAULT_START, MONITOR_ALL_COUNTER} +}; + +/* The "innodb_counter_value" array stores actual counter values */ +UNIV_INTERN monitor_value_t innodb_counter_value[NUM_MONITOR]; + +/* monitor_set_tbl is used to record and determine whether a monitor +has been turned on/off. */ +UNIV_INTERN ulint monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT + - 1) / NUM_BITS_ULINT]; + +#ifndef HAVE_ATOMIC_BUILTINS_64 +/** Mutex protecting atomic operations on platforms that lack +built-in operations for atomic memory access */ +ib_mutex_t monitor_mutex; + +/** Key to register monitor_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t monitor_mutex_key; + +/****************************************************************//** +Initialize the monitor subsystem. 
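monitor_set_tbl above packs one on/off bit per counter into an array of ulint words, sized with the usual round-up division, and srv_mon_set_module_control() below flips those bits for every entry from a module marker up to the next MONITOR_MODULE entry. A minimal model of that bitmap follows; the real MONITOR_ON()/MONITOR_IS_ON() macros are defined in srv0mon.h, so the bit layout here is an assumption for illustration only.

    #include <cstdio>

    typedef unsigned long ulint;

    static const ulint NUM_BITS_ULINT = 8 * sizeof(ulint);
    static const ulint NUM_MONITOR = 200; // placeholder counter count

    // Sized exactly like monitor_set_tbl above: round-up division packs
    // one bit per counter into ulint words.
    static ulint set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];

    static void monitor_on(ulint id) {
        set_tbl[id / NUM_BITS_ULINT] |= ulint(1) << (id % NUM_BITS_ULINT);
    }

    static void monitor_off(ulint id) {
        set_tbl[id / NUM_BITS_ULINT] &= ~(ulint(1) << (id % NUM_BITS_ULINT));
    }

    static bool monitor_is_on(ulint id) {
        return (set_tbl[id / NUM_BITS_ULINT] >> (id % NUM_BITS_ULINT)) & 1;
    }

    int main() {
        monitor_on(130);  // counter 130 lives in word 2 on a 64-bit ulint
        std::printf("on: %d\n", (int) monitor_is_on(130)); // 1
        monitor_off(130);
        std::printf("on: %d\n", (int) monitor_is_on(130)); // 0
        return 0;
    }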
*/ +UNIV_INTERN +void +srv_mon_create(void) +/*================*/ +{ + mutex_create(monitor_mutex_key, &monitor_mutex, SYNC_ANY_LATCH); +} +/****************************************************************//** +Close the monitor subsystem. */ +UNIV_INTERN +void +srv_mon_free(void) +/*==============*/ +{ + mutex_free(&monitor_mutex); +} +#endif /* !HAVE_ATOMIC_BUILTINS_64 */ + +/****************************************************************//** +Get a monitor's "monitor_info" by its monitor id (index into the +innodb_counter_info array). +@return Pointer to the corresponding monitor_info_t, or NULL if no such +monitor */ +UNIV_INTERN +monitor_info_t* +srv_mon_get_info( +/*=============*/ + monitor_id_t monitor_id) /*!< id indexing into the + innodb_counter_info array */ +{ + ut_a(monitor_id < NUM_MONITOR); + + return((monitor_id < NUM_MONITOR) + ? &innodb_counter_info[monitor_id] + : NULL); +} + +/****************************************************************//** +Get a monitor's name by its monitor id (index into the +innodb_counter_info array). +@return corresponding monitor name, or NULL if no such +monitor */ +UNIV_INTERN +const char* +srv_mon_get_name( +/*=============*/ + monitor_id_t monitor_id) /*!< id index into the + innodb_counter_info array */ +{ + ut_a(monitor_id < NUM_MONITOR); + + return((monitor_id < NUM_MONITOR) + ? innodb_counter_info[monitor_id].monitor_name + : NULL); +} + +/****************************************************************//** +Turn on/off or reset the monitor counters in a module. If module_id +is MONITOR_ALL_COUNTER then turn on all monitor counters. A counter +is not turned on again if it has already been turned on. */ +UNIV_INTERN +void +srv_mon_set_module_control( +/*=======================*/ + monitor_id_t module_id, /*!< in: Module ID as in + monitor_counter_id. If it is + set to MONITOR_ALL_COUNTER, this means + we shall turn on all the counters */ + mon_option_t set_option) /*!< in: Turn on/off or reset the + counter */ +{ + ulint ix; + ulint start_id; + ibool set_current_module = FALSE; + + ut_a(module_id <= NUM_MONITOR); + ut_a(UT_ARR_SIZE(innodb_counter_info) == NUM_MONITOR); + + /* The module_id must be an ID of MONITOR_MODULE type */ + ut_a(innodb_counter_info[module_id].monitor_type & MONITOR_MODULE); + + /* Start with the first monitor in the module. If module_id + is MONITOR_ALL_COUNTER, this means we need to turn on all + monitor counters. */ + if (module_id == MONITOR_ALL_COUNTER) { + start_id = 1; + } else if (innodb_counter_info[module_id].monitor_type + & MONITOR_GROUP_MODULE) { + /* Counters in this module are set as a group together + and cannot be turned on/off individually. Need to set + the on/off bit in the module counter */ + start_id = module_id; + set_current_module = TRUE; + + } else { + start_id = module_id + 1; + } + + for (ix = start_id; ix < NUM_MONITOR; ix++) { + /* If we hit the next module counter, we will + continue if we want to turn on all monitor counters, + and break if we just turn on the counters in the + current module. */ + if (innodb_counter_info[ix].monitor_type & MONITOR_MODULE) { + + if (set_current_module) { + /* Continue to set on/off bit on current + module */ + set_current_module = FALSE; + } else if (module_id == MONITOR_ALL_COUNTER) { + continue; + } else { + /* Hitting the next module, stop */ + break; + } + } + + /* Cannot turn on a monitor that has already been turned on.
The user + should be aware that some counters are already on before + turning them on again (which could reset the counter value) */ + if (MONITOR_IS_ON(ix) && (set_option == MONITOR_TURN_ON)) { + fprintf(stderr, "Monitor '%s' is already enabled.\n", + srv_mon_get_name((monitor_id_t) ix)); + continue; + } + + /* For some existing counters (server status variables), + we will get the counter value at the start/stop time + to calculate the actual value during that time. */ + if (innodb_counter_info[ix].monitor_type & MONITOR_EXISTING) { + srv_mon_process_existing_counter( + static_cast<monitor_id_t>(ix), set_option); + } + + /* We currently support 4 operations on the monitor counters: + turn on, turn off, reset, and reset all. */ + switch (set_option) { + case MONITOR_TURN_ON: + MONITOR_ON(ix); + MONITOR_INIT(ix); + MONITOR_SET_START(ix); + break; + + case MONITOR_TURN_OFF: + MONITOR_OFF(ix); + MONITOR_SET_OFF(ix); + break; + + case MONITOR_RESET_VALUE: + srv_mon_reset(static_cast<monitor_id_t>(ix)); + break; + + case MONITOR_RESET_ALL_VALUE: + srv_mon_reset_all(static_cast<monitor_id_t>(ix)); + break; + + default: + ut_error; + } + } +} + +/****************************************************************//** +Get the transaction system's rollback segment size in pages +@return size in pages */ +static +ulint +srv_mon_get_rseg_size(void) +/*=======================*/ +{ + ulint i; + ulint value = 0; + + /* rseg_array is a static array, so we can go through it without + mutex protection. In addition, we provide only an estimate of the + total rollback segment size, and to avoid mutex contention we + don't acquire the rseg->mutex */ + for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { + const trx_rseg_t* rseg = trx_sys->rseg_array[i]; + + if (rseg != NULL) { + value += rseg->curr_size; + } + } + + return(value); +} + +/****************************************************************//** +This function consolidates some existing server counters used +by "system status variables". These existing system variables do not have +a mechanism to start/stop and reset the counters, so we simulate these +controls by remembering the corresponding counter values when the +corresponding monitors are turned on/off/reset, and do the appropriate +arithmetic to deduce the actual value. Please also refer to +srv_export_innodb_status() for related global counters used by +the existing status variables. */ +UNIV_INTERN +void +srv_mon_process_existing_counter( +/*=============================*/ + monitor_id_t monitor_id, /*!< in: the monitor's ID as in + monitor_counter_id */ + mon_option_t set_option) /*!< in: Turn on/off or reset the + counter */ +{ + mon_type_t value; + monitor_info_t* monitor_info; + ibool update_min = FALSE; + buf_pool_stat_t stat; + buf_pools_list_size_t buf_pools_list_size; + ulint LRU_len; + ulint free_len; + ulint flush_list_len; + + monitor_info = srv_mon_get_info(monitor_id); + + ut_a(monitor_info->monitor_type & MONITOR_EXISTING); + ut_a(monitor_id < NUM_MONITOR); + + /* Get the value from corresponding global variable */ + switch (monitor_id) { + case MONITOR_OVLD_META_MEM_POOL: + value = srv_mem_pool_size; + break; + + /* export_vars.innodb_buffer_pool_reads.
Num Reads from + disk (page not in buffer) */ + case MONITOR_OVLD_BUF_POOL_READS: + value = srv_stats.buf_pool_reads; + break; + + /* innodb_buffer_pool_read_requests, the number of logical + read requests */ + case MONITOR_OVLD_BUF_POOL_READ_REQUESTS: + buf_get_total_stat(&stat); + value = stat.n_page_gets; + break; + + /* innodb_buffer_pool_write_requests, the number of + write request */ + case MONITOR_OVLD_BUF_POOL_WRITE_REQUEST: + value = srv_stats.buf_pool_write_requests; + break; + + /* innodb_buffer_pool_wait_free */ + case MONITOR_OVLD_BUF_POOL_WAIT_FREE: + value = srv_stats.buf_pool_wait_free; + break; + + /* innodb_buffer_pool_read_ahead */ + case MONITOR_OVLD_BUF_POOL_READ_AHEAD: + buf_get_total_stat(&stat); + value = stat.n_ra_pages_read; + break; + + /* innodb_buffer_pool_read_ahead_evicted */ + case MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED: + buf_get_total_stat(&stat); + value = stat.n_ra_pages_evicted; + break; + + /* innodb_buffer_pool_pages_total */ + case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL: + value = buf_pool_get_n_pages(); + break; + + /* innodb_buffer_pool_pages_misc */ + case MONITOR_OVLD_BUF_POOL_PAGE_MISC: + buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); + value = buf_pool_get_n_pages() - LRU_len - free_len; + break; + + /* innodb_buffer_pool_pages_data */ + case MONITOR_OVLD_BUF_POOL_PAGES_DATA: + buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); + value = LRU_len; + break; + + /* innodb_buffer_pool_bytes_data */ + case MONITOR_OVLD_BUF_POOL_BYTES_DATA: + buf_get_total_list_size_in_bytes(&buf_pools_list_size); + value = buf_pools_list_size.LRU_bytes + + buf_pools_list_size.unzip_LRU_bytes; + break; + + /* innodb_buffer_pool_pages_dirty */ + case MONITOR_OVLD_BUF_POOL_PAGES_DIRTY: + buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); + value = flush_list_len; + break; + + /* innodb_buffer_pool_bytes_dirty */ + case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY: + buf_get_total_list_size_in_bytes(&buf_pools_list_size); + value = buf_pools_list_size.flush_list_bytes; + break; + + /* innodb_buffer_pool_pages_free */ + case MONITOR_OVLD_BUF_POOL_PAGES_FREE: + buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); + value = free_len; + break; + + /* innodb_pages_created, the number of pages created */ + case MONITOR_OVLD_PAGE_CREATED: + buf_get_total_stat(&stat); + value = stat.n_pages_created; + break; + + /* innodb_pages_written, the number of page written */ + case MONITOR_OVLD_PAGES_WRITTEN: + buf_get_total_stat(&stat); + value = stat.n_pages_written; + break; + + /* innodb_pages_read */ + case MONITOR_OVLD_PAGES_READ: + buf_get_total_stat(&stat); + value = stat.n_pages_read; + break; + + /* innodb_data_reads, the total number of data reads */ + case MONITOR_OVLD_BYTE_READ: + value = srv_stats.data_read; + break; + + /* innodb_data_writes, the total number of data writes. */ + case MONITOR_OVLD_BYTE_WRITTEN: + value = srv_stats.data_written; + break; + + /* innodb_data_reads, the total number of data reads. */ + case MONITOR_OVLD_OS_FILE_READ: + value = os_n_file_reads; + break; + + /* innodb_data_writes, the total number of data writes*/ + case MONITOR_OVLD_OS_FILE_WRITE: + value = os_n_file_writes; + break; + + /* innodb_data_fsyncs, number of fsync() operations so far. 
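The buffer pool cases above derive innodb_buffer_pool_pages_misc by subtraction: pages that are neither on the LRU list nor on the free list are in "misc" use (for example by the adaptive hash index or the lock system). A worked example of that accounting, with made-up numbers:

    #include <cstdio>

    int main() {
        // Made-up numbers for a 128MB pool of 16K pages.
        unsigned long n_pages  = 8192; // buf_pool_get_n_pages()
        unsigned long LRU_len  = 7000; // pages holding data (LRU list)
        unsigned long free_len = 900;  // unused pages (free list)
        // Same subtraction as the MONITOR_OVLD_BUF_POOL_PAGE_MISC case:
        unsigned long misc = n_pages - LRU_len - free_len;
        std::printf("pages_misc = %lu\n", misc); // 292
        return 0;
    }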
*/ + case MONITOR_OVLD_OS_FSYNC: + value = os_n_fsyncs; + break; + + /* innodb_os_log_written */ + case MONITOR_OVLD_OS_LOG_WRITTEN: + value = (mon_type_t) srv_stats.os_log_written; + break; + + /* innodb_os_log_fsyncs */ + case MONITOR_OVLD_OS_LOG_FSYNC: + value = fil_n_log_flushes; + break; + + /* innodb_os_log_pending_fsyncs */ + case MONITOR_OVLD_OS_LOG_PENDING_FSYNC: + value = fil_n_pending_log_flushes; + update_min = TRUE; + break; + + /* innodb_os_log_pending_writes */ + case MONITOR_OVLD_OS_LOG_PENDING_WRITES: + value = srv_stats.os_log_pending_writes; + update_min = TRUE; + break; + + /* innodb_log_waits */ + case MONITOR_OVLD_LOG_WAITS: + value = srv_stats.log_waits; + break; + + /* innodb_log_write_requests */ + case MONITOR_OVLD_LOG_WRITE_REQUEST: + value = srv_stats.log_write_requests; + break; + + /* innodb_log_writes */ + case MONITOR_OVLD_LOG_WRITES: + value = srv_stats.log_writes; + break; + + /* innodb_dblwr_writes */ + case MONITOR_OVLD_SRV_DBLWR_WRITES: + value = srv_stats.dblwr_writes; + break; + + /* innodb_dblwr_pages_written */ + case MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN: + value = srv_stats.dblwr_pages_written; + break; + + /* innodb_page_size */ + case MONITOR_OVLD_SRV_PAGE_SIZE: + value = UNIV_PAGE_SIZE; + break; + + case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS: + value = rw_lock_stats.rw_s_spin_wait_count; + break; + + case MONITOR_OVLD_RWLOCK_X_SPIN_WAITS: + value = rw_lock_stats.rw_x_spin_wait_count; + break; + + case MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS: + value = rw_lock_stats.rw_s_spin_round_count; + break; + + case MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS: + value = rw_lock_stats.rw_x_spin_round_count; + break; + + case MONITOR_OVLD_RWLOCK_S_OS_WAITS: + value = rw_lock_stats.rw_s_os_wait_count; + break; + + case MONITOR_OVLD_RWLOCK_X_OS_WAITS: + value = rw_lock_stats.rw_x_os_wait_count; + break; + + case MONITOR_OVLD_BUFFER_POOL_SIZE: + value = srv_buf_pool_size; + break; + + /* innodb_rows_read */ + case MONITOR_OLVD_ROW_READ: + value = srv_stats.n_rows_read; + break; + + /* innodb_rows_inserted */ + case MONITOR_OLVD_ROW_INSERTED: + value = srv_stats.n_rows_inserted; + break; + + /* innodb_rows_deleted */ + case MONITOR_OLVD_ROW_DELETED: + value = srv_stats.n_rows_deleted; + break; + + /* innodb_rows_updated */ + case MONITOR_OLVD_ROW_UPDTATED: + value = srv_stats.n_rows_updated; + break; + + /* innodb_row_lock_current_waits */ + case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT: + value = srv_stats.n_lock_wait_current_count; + break; + + /* innodb_row_lock_time */ + case MONITOR_OVLD_LOCK_WAIT_TIME: + value = srv_stats.n_lock_wait_time / 1000; + break; + + /* innodb_row_lock_time_max */ + case MONITOR_OVLD_LOCK_MAX_WAIT_TIME: + value = lock_sys->n_lock_max_wait_time / 1000; + break; + + /* innodb_row_lock_time_avg */ + case MONITOR_OVLD_LOCK_AVG_WAIT_TIME: + if (srv_stats.n_lock_wait_count > 0) { + value = srv_stats.n_lock_wait_time / 1000 + / srv_stats.n_lock_wait_count; + } else { + value = 0; + } + break; + + /* innodb_row_lock_waits */ + case MONITOR_OVLD_ROW_LOCK_WAIT: + value = srv_stats.n_lock_wait_count; + break; + + case MONITOR_RSEG_HISTORY_LEN: + value = trx_sys->rseg_history_len; + break; + + case MONITOR_RSEG_CUR_SIZE: + value = srv_mon_get_rseg_size(); + break; + + case MONITOR_OVLD_N_FILE_OPENED: + value = fil_n_file_opened; + break; + + case MONITOR_OVLD_IBUF_MERGE_INSERT: + value = ibuf->n_merged_ops[IBUF_OP_INSERT]; + break; + + case MONITOR_OVLD_IBUF_MERGE_DELETE: + value = ibuf->n_merged_ops[IBUF_OP_DELETE_MARK]; + break; + + case 
MONITOR_OVLD_IBUF_MERGE_PURGE: + value = ibuf->n_merged_ops[IBUF_OP_DELETE]; + break; + + case MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT: + value = ibuf->n_discarded_ops[IBUF_OP_INSERT]; + break; + + case MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE: + value = ibuf->n_discarded_ops[IBUF_OP_DELETE_MARK]; + break; + + case MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE: + value = ibuf->n_discarded_ops[IBUF_OP_DELETE]; + break; + + case MONITOR_OVLD_IBUF_MERGES: + value = ibuf->n_merges; + break; + + case MONITOR_OVLD_IBUF_SIZE: + value = ibuf->size; + break; + + case MONITOR_OVLD_SERVER_ACTIVITY: + value = srv_get_activity_count(); + break; + + case MONITOR_OVLD_LSN_FLUSHDISK: + value = (mon_type_t) log_sys->flushed_to_disk_lsn; + break; + + case MONITOR_OVLD_LSN_CURRENT: + value = (mon_type_t) log_sys->lsn; + break; + + case MONITOR_OVLD_BUF_OLDEST_LSN: + value = (mon_type_t) buf_pool_get_oldest_modification(); + break; + + case MONITOR_OVLD_LSN_CHECKPOINT: + value = (mon_type_t) log_sys->last_checkpoint_lsn; + break; + + case MONITOR_OVLD_MAX_AGE_ASYNC: + value = log_sys->max_modified_age_async; + break; + + case MONITOR_OVLD_MAX_AGE_SYNC: + value = log_sys->max_modified_age_sync; + break; + + case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH: + value = btr_cur_n_sea; + break; + + case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE: + value = btr_cur_n_non_sea; + break; + + default: + ut_error; + } + + switch (set_option) { + case MONITOR_TURN_ON: + /* Save the initial counter value in the mon_start_value + field */ + MONITOR_SAVE_START(monitor_id, value); + return; + + case MONITOR_TURN_OFF: + /* Save the counter value to mon_last_value when we + turn off the monitor but have not yet reset it. Note that the + counter has not yet been set to off in the bitmap + table for a normal turn-off. We need to check the + counter status (on/off) to avoid resetting the value + of an already-off counter */ + if (MONITOR_IS_ON(monitor_id)) { + srv_mon_process_existing_counter(monitor_id, + MONITOR_GET_VALUE); + MONITOR_SAVE_LAST(monitor_id); + } + return; + + case MONITOR_GET_VALUE: + if (MONITOR_IS_ON(monitor_id)) { + + /* If the MONITOR_DISPLAY_CURRENT bit is on, we + only record the current value, rather than an + incremental value over a period. Most + counters of this type are resource-related + counters, such as the number of buffer pages. */ + if (monitor_info->monitor_type + & MONITOR_DISPLAY_CURRENT) { + MONITOR_SET(monitor_id, value); + } else { + /* Most status counters are monotonically + increasing, so there is no need to update their + minimum values. Only do so + if "update_min" is set to TRUE */ + MONITOR_SET_DIFF(monitor_id, value); + + if (update_min + && (MONITOR_VALUE(monitor_id) + < MONITOR_MIN_VALUE(monitor_id))) { + MONITOR_MIN_VALUE(monitor_id) = + MONITOR_VALUE(monitor_id); + } + } + } + return; + + case MONITOR_RESET_VALUE: + if (!MONITOR_IS_ON(monitor_id)) { + MONITOR_LAST_VALUE(monitor_id) = 0; + } + return; + + /* Nothing special to do for the reset-all operation on these + existing counters */ + case MONITOR_RESET_ALL_VALUE: + return; + } +} + +/*************************************************************//** +Reset a monitor, create a new baseline with the current monitor +value; a numeric sketch of this bookkeeping follows.
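The start/stop/reset handling above simulates a controllable counter on top of an ever-growing global status variable. A minimal numeric model, assuming MONITOR_SET_DIFF reports the global value minus the saved start value and the accumulated reset baseline (the exact macro definitions are in srv0mon.h, so this is an illustration of the described bookkeeping, not the real macros):

    #include <cstdio>

    int main() {
        long global = 1000; // e.g. srv_stats.log_writes at turn-on time
        long start  = global; // what MONITOR_SAVE_START remembers
        long reset  = 0;      // baseline accumulated by resets

        global += 250;        // the server does some work
        std::printf("shown: %ld\n", global - start - reset); // 250

        reset += global - start - reset; // srv_mon_reset(): new baseline
        global += 40;
        std::printf("after reset: %ld\n", global - start - reset); // 40
        return 0;
    }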
This baseline is recorded by MONITOR_VALUE_RESET(monitor) */ +UNIV_INTERN +void +srv_mon_reset( +/*==========*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + ibool monitor_was_on; + + monitor_was_on = MONITOR_IS_ON(monitor); + + if (monitor_was_on) { + /* Temporarily turn off the counter for the resetting + operation */ + MONITOR_OFF(monitor); + } + + /* Before resetting the current monitor value, first + calculate and set the max/min value since monitor + start */ + srv_mon_calc_max_since_start(monitor); + srv_mon_calc_min_since_start(monitor); + + /* Monitors with MONITOR_DISPLAY_CURRENT bit + are not incremental, no need to remember + the reset value. */ + if (innodb_counter_info[monitor].monitor_type + & MONITOR_DISPLAY_CURRENT) { + MONITOR_VALUE_RESET(monitor) = 0; + } else { + /* Remember the new baseline */ + MONITOR_VALUE_RESET(monitor) = MONITOR_VALUE_RESET(monitor) + + MONITOR_VALUE(monitor); + } + + /* Reset the counter value */ + MONITOR_VALUE(monitor) = 0; + MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; + MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; + + MONITOR_FIELD((monitor), mon_reset_time) = time(NULL); + + if (monitor_was_on) { + MONITOR_ON(monitor); + } +} + +/*************************************************************//** +Turn on monitor counters that are marked as default ON. */ +UNIV_INTERN +void +srv_mon_default_on(void) +/*====================*/ +{ + ulint ix; + + for (ix = 0; ix < NUM_MONITOR; ix++) { + if (innodb_counter_info[ix].monitor_type + & MONITOR_DEFAULT_ON) { + /* Turn on monitor counters that are default on */ + MONITOR_ON(ix); + MONITOR_INIT(ix); + MONITOR_SET_START(ix); + } + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc new file mode 100644 index 00000000000..806c3aea70a --- /dev/null +++ b/storage/xtradb/srv/srv0srv.cc @@ -0,0 +1,3511 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, 2009 Google Inc. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file srv/srv0srv.cc +The database server main program + +Created 10/8/1995 Heikki Tuuri +*******************************************************/ + +/* Dummy comment */ +#include "srv0srv.h" + +#include "ut0mem.h" +#include "ut0ut.h" +#include "os0proc.h" +#include "mem0mem.h" +#include "mem0pool.h" +#include "sync0sync.h" +#include "que0que.h" +#include "log0online.h" +#include "log0recv.h" +#include "pars0pars.h" +#include "usr0sess.h" +#include "lock0lock.h" +#include "trx0purge.h" +#include "ibuf0ibuf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "btr0sea.h" +#include "dict0load.h" +#include "dict0boot.h" +#include "dict0stats_bg.h" /* dict_stats_event */ +#include "srv0start.h" +#include "row0mysql.h" +#include "ha_prototypes.h" +#include "trx0i_s.h" +#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */ +#include "srv0mon.h" +#include "ut0crc32.h" +#include "os0file.h" + +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" + +/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */ +ibool innobase_thd_is_idle(const void* thd); +ib_int64_t innobase_thd_get_start_time(const void* thd); +void innobase_thd_kill(ulong thd_id); +ulong innobase_thd_get_thread_id(const void* thd); + +/* prototypes for new functions added to ha_innodb.cc */ +ibool innobase_get_slow_log(); + +/* The following is the maximum allowed duration of a lock wait. */ +UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600; + +/**/ +UNIV_INTERN lint srv_kill_idle_transaction = 0; + +/* How much data manipulation language (DML) statements need to be delayed, +in microseconds, in order to reduce the lagging of the purge thread. */ +UNIV_INTERN ulint srv_dml_needed_delay = 0; + +UNIV_INTERN ibool srv_monitor_active = FALSE; +UNIV_INTERN ibool srv_error_monitor_active = FALSE; + +UNIV_INTERN ibool srv_buf_dump_thread_active = FALSE; + +UNIV_INTERN ibool srv_dict_stats_thread_active = FALSE; + +UNIV_INTERN const char* srv_main_thread_op_info = ""; + +/** Prefix used by MySQL to indicate pre-5.1 table name encoding */ +const char srv_mysql50_table_name_prefix[10] = "#mysql50#"; + +/* Server parameters which are read from the initfile */ + +/* The following three are dir paths which are catenated before file +names, where the file name itself may also contain a path */ + +UNIV_INTERN char* srv_data_home = NULL; + +/** Rollback files directory, can be absolute. */ +UNIV_INTERN char* srv_undo_dir = NULL; + +/** The number of tablespaces to use for rollback segments. */ +UNIV_INTERN ulong srv_undo_tablespaces = 8; + +/** The number of UNDO tablespaces that are open and ready to use. */ +UNIV_INTERN ulint srv_undo_tablespaces_open = 8; + +/* The number of rollback segments to use */ +UNIV_INTERN ulong srv_undo_logs = 1; + +#ifdef UNIV_LOG_ARCHIVE +UNIV_INTERN char* srv_arch_dir = NULL; +UNIV_INTERN ulong srv_log_arch_expire_sec = 0; +#endif /* UNIV_LOG_ARCHIVE */ + +/** Set if InnoDB must operate in read-only mode. We don't do any +recovery and open all tables in RO mode instead of RW mode. We don't +sync the max trx id to disk either. 
*/ +UNIV_INTERN my_bool srv_read_only_mode; +/** store each table created by a user in its own file; data +dictionary tables are in the system tablespace 0 */ +UNIV_INTERN my_bool srv_file_per_table; +/** The file format to use on new *.ibd files. */ +UNIV_INTERN ulint srv_file_format = 0; +/** Whether to check file format during startup. A value of +UNIV_FORMAT_MAX + 1 means no checking i.e. FALSE. The default is to +set it to the highest format we support. */ +UNIV_INTERN ulint srv_max_file_format_at_startup = UNIV_FORMAT_MAX; + +#if UNIV_FORMAT_A +# error "UNIV_FORMAT_A must be 0!" +#endif + +/** Place locks on records only, i.e. do not use next-key locking, except +for duplicate key checking and foreign key checking */ +UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE; +/** Sort buffer size in index creation */ +UNIV_INTERN ulong srv_sort_buf_size = 1048576; +/** Maximum modification log file size for online index creation */ +UNIV_INTERN unsigned long long srv_online_max_size; + +/* If this flag is TRUE, then we will use the native aio of the +OS (provided we compiled Innobase with it), otherwise we will +use the simulated aio we build below with threads. +Currently we support native aio on Windows and Linux */ +UNIV_INTERN my_bool srv_use_native_aio = TRUE; + +#ifdef __WIN__ +/* Windows native condition variables. We use runtime loading / function +pointers, because they are not available on Windows Server 2003 and +Windows XP/2000. + +We use condition variables for events on Windows if possible, even though +os_event closely resembles the Windows kernel event object API-wise. The reason is +performance: kernel objects are heavyweight and WaitForSingleObject() is a +performance killer causing the calling thread to context switch. Besides, Innodb +preallocates a large number (often millions) of os_events. With kernel event +objects it takes a big chunk out of the non-paged pool, which is better suited +for tasks like IO than for storing idle event objects. */ +UNIV_INTERN ibool srv_use_native_conditions = FALSE; +#endif /* __WIN__ */ + +UNIV_INTERN ulint srv_n_data_files = 0; +UNIV_INTERN char** srv_data_file_names = NULL; +/* size in database pages */ +UNIV_INTERN ulint* srv_data_file_sizes = NULL; + +UNIV_INTERN my_bool srv_track_changed_pages = FALSE; + +UNIV_INTERN ulonglong srv_max_bitmap_file_size = 100 * 1024 * 1024; + +UNIV_INTERN ulonglong srv_max_changed_pages = 0; + +/** When TRUE, fake change transactions take S rather than X row locks. + When FALSE, row locks are not taken at all. */ +UNIV_INTERN my_bool srv_fake_changes_locks = TRUE; + +/* if TRUE, then we auto-extend the last data file */ +UNIV_INTERN ibool srv_auto_extend_last_data_file = FALSE; +/* if != 0, this limits the maximum size to which auto-extending may grow +the last data file */ +UNIV_INTERN ulint srv_last_file_size_max = 0; +/* If the last data file is auto-extended, we add this +many pages to it at a time */ +UNIV_INTERN ulong srv_auto_extend_increment = 8; +UNIV_INTERN ulint* srv_data_file_is_raw_partition = NULL; + +/* If the following is TRUE we do not allow inserts etc.
This protects +the user from forgetting the 'newraw' keyword to my.cnf */ + +UNIV_INTERN ibool srv_created_new_raw = FALSE; + +UNIV_INTERN char* srv_log_group_home_dir = NULL; + +UNIV_INTERN ulong srv_n_log_files = SRV_N_LOG_FILES_MAX; +/* size in database pages */ +UNIV_INTERN ib_uint64_t srv_log_file_size = IB_UINT64_MAX; +UNIV_INTERN ib_uint64_t srv_log_file_size_requested; +/* size in database pages */ +UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX; +UNIV_INTERN uint srv_flush_log_at_timeout = 1; +UNIV_INTERN ulong srv_page_size = UNIV_PAGE_SIZE_DEF; +UNIV_INTERN ulong srv_page_size_shift = UNIV_PAGE_SIZE_SHIFT_DEF; +UNIV_INTERN char srv_use_global_flush_log_at_trx_commit = TRUE; + +/* Try to flush dirty pages so as to avoid IO bursts at +the checkpoints. */ +UNIV_INTERN char srv_adaptive_flushing = TRUE; + +UNIV_INTERN ulint srv_show_locks_held = 10; +UNIV_INTERN ulint srv_show_verbose_locks = 0; + +/** Maximum number of times allowed to conditionally acquire +mutex before switching to blocking wait on the mutex */ +#define MAX_MUTEX_NOWAIT 20 + +/** Check whether the number of failed nonblocking mutex +acquisition attempts exceeds maximum allowed value. If so, +srv_printf_innodb_monitor() will request mutex acquisition +with mutex_enter(), which will wait until it gets the mutex. */ +#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT) + +/** The sort order table of the MySQL latin1_swedish_ci character set +collation */ +UNIV_INTERN const byte* srv_latin1_ordering; + +/* use os/external memory allocator */ +UNIV_INTERN my_bool srv_use_sys_malloc = TRUE; +/* requested size in kilobytes */ +UNIV_INTERN ulint srv_buf_pool_size = ULINT_MAX; +/* force virtual page preallocation (prefault) */ +UNIV_INTERN my_bool srv_buf_pool_populate = FALSE; +/* requested number of buffer pool instances */ +UNIV_INTERN ulint srv_buf_pool_instances = 1; +/* number of locks to protect buf_pool->page_hash */ +UNIV_INTERN ulong srv_n_page_hash_locks = 16; +/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ +UNIV_INTERN ulong srv_LRU_scan_depth = 1024; +/** whether or not to flush neighbors of a block */ +UNIV_INTERN ulong srv_flush_neighbors = 1; +/* previously requested size */ +UNIV_INTERN ulint srv_buf_pool_old_size; +/* current size in kilobytes */ +UNIV_INTERN ulint srv_buf_pool_curr_size = 0; +/* size in bytes */ +UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX; +UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX; + +/** Query thread preflush algorithm */ +UNIV_INTERN ulint srv_foreground_preflush + = SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF; + +/** The maximum time limit for a single LRU tail flush iteration by the page +cleaner thread */ +UNIV_INTERN ulint srv_cleaner_max_lru_time = 1000; + +/** The maximum time limit for a single flush list flush iteration by the page +cleaner thread */ +UNIV_INTERN ulint srv_cleaner_max_flush_time = 1000; + +/** Page cleaner flush list flush batches are further divided into this chunk +size */ +UNIV_INTERN ulint srv_cleaner_flush_chunk_size = 100; + +/** Page cleaner LRU list flush batches are further divided into this chunk +size */ +UNIV_INTERN ulint srv_cleaner_lru_chunk_size = 100; + +/** If free list length is lower than this percentage of srv_LRU_scan_depth, +page cleaner LRU flushes will issue flush batches to the same instance in a +row */ +UNIV_INTERN ulint srv_cleaner_free_list_lwm = 10; + +/** If TRUE, page cleaner heuristics use evicted instead of flushed page counts +for its heuristics */ +UNIV_INTERN my_bool 
srv_cleaner_eviction_factor = FALSE; + +/** Page cleaner LSN age factor formula option */ +UNIV_INTERN ulong srv_cleaner_lsn_age_factor + = SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT; + +/** Empty free list for a query thread handling algorithm option */ +UNIV_INTERN ulong srv_empty_free_list_algorithm + = SRV_EMPTY_FREE_LIST_BACKOFF; + +/* This parameter is deprecated. Use srv_n_io_[read|write]_threads +instead. */ +UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX; +UNIV_INTERN ulint srv_n_read_io_threads = ULINT_MAX; +UNIV_INTERN ulint srv_n_write_io_threads = ULINT_MAX; + +/* Switch to enable random read ahead. */ +UNIV_INTERN my_bool srv_random_read_ahead = FALSE; + +/* The log block size */ +UNIV_INTERN ulint srv_log_block_size = 0; + +/* User settable value of the number of pages that must be present +in the buffer cache and accessed sequentially for InnoDB to trigger a +readahead request. */ +UNIV_INTERN ulong srv_read_ahead_threshold = 56; + +#ifdef UNIV_LOG_ARCHIVE +UNIV_INTERN ibool srv_log_archive_on = FALSE; +UNIV_INTERN ibool srv_archive_recovery = 0; +UNIV_INTERN ib_uint64_t srv_archive_recovery_limit_lsn; +#endif /* UNIV_LOG_ARCHIVE */ + +/* This parameter is used to throttle the number of insert buffers that are +merged in a batch. By increasing this parameter on a faster disk you can +possibly reduce the number of I/O operations performed to complete the +merge operation. The value of this parameter is used as is by the +background loop when the system is idle (low load), on a busy system +the parameter is scaled down by a factor of 4, this is to avoid putting +a heavier load on the I/O sub system. */ + +UNIV_INTERN ulong srv_insert_buffer_batch_size = 20; + +UNIV_INTERN char* srv_file_flush_method_str = NULL; +UNIV_INTERN ulint srv_unix_file_flush_method = SRV_UNIX_FSYNC; +UNIV_INTERN ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; + +UNIV_INTERN ulint srv_max_n_open_files = 300; + +/* Number of IO operations per second the server can do */ +UNIV_INTERN ulong srv_io_capacity = 200; +UNIV_INTERN ulong srv_max_io_capacity = 400; + +/* The InnoDB main thread tries to keep the ratio of modified pages +in the buffer pool to all database pages in the buffer pool smaller than +the following number. But it is not guaranteed that the value stays below +that during a time of heavy update/insert activity. */ + +UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75; +UNIV_INTERN ulong srv_max_dirty_pages_pct_lwm = 50; + +/* This is the percentage of log capacity at which adaptive flushing, +if enabled, will kick in. */ +UNIV_INTERN ulong srv_adaptive_flushing_lwm = 10; + +/* Number of iterations over which adaptive flushing is averaged. 
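srv_adaptive_flushing_lwm above is expressed as a percentage of redo log capacity. A back-of-envelope example of where that low-water mark lands; the log file sizes are assumptions for illustration only, and the real flushing code computes the usable capacity with additional safety margins:

    #include <cstdio>

    int main() {
        // Assumed redo log configuration: 2 files x 512MB.
        unsigned long long log_file_size = 512ULL << 20;
        unsigned long long n_log_files   = 2;
        unsigned long long lwm_pct       = 10; // srv_adaptive_flushing_lwm

        unsigned long long capacity  = n_log_files * log_file_size;
        unsigned long long threshold = capacity * lwm_pct / 100;
        std::printf("adaptive flushing starts near %llu MB of checkpoint age\n",
                    threshold >> 20); // ~102 MB
        return 0;
    }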
*/ +UNIV_INTERN ulong srv_flushing_avg_loops = 30; + +/* The tid of the cleaner thread */ +UNIV_INTERN os_tid_t srv_cleaner_tid; + +/* The tid of the LRU manager thread */ +UNIV_INTERN os_tid_t srv_lru_manager_tid; + +/* The tids of the purge threads */ +UNIV_INTERN os_tid_t srv_purge_tids[SRV_MAX_N_PURGE_THREADS]; + +/* The tids of the I/O threads */ +UNIV_INTERN os_tid_t srv_io_tids[SRV_MAX_N_IO_THREADS]; + +/* The tid of the master thread */ +UNIV_INTERN os_tid_t srv_master_tid; + +/* The relative scheduling priority of the cleaner and LRU manager threads */ +UNIV_INTERN ulint srv_sched_priority_cleaner = 19; + +/* The relative scheduling priority of the purge threads */ +UNIV_INTERN ulint srv_sched_priority_purge = 19; + +/* The relative scheduling priority of the I/O threads */ +UNIV_INTERN ulint srv_sched_priority_io = 19; + +/* The relative scheduling priority of the master thread */ +UNIV_INTERN ulint srv_sched_priority_master = 19; + +/* The relative priority of the current thread. If 0, low priority; if 1, high +priority. */ +UNIV_INTERN UNIV_THREAD_LOCAL ulint srv_current_thread_priority = 0; + +/* The relative priority of the purge coordinator and worker threads. */ +UNIV_INTERN my_bool srv_purge_thread_priority = FALSE; + +/* The relative priority of the I/O threads. */ +UNIV_INTERN my_bool srv_io_thread_priority = FALSE; + +/* The relative priority of the cleaner thread. */ +UNIV_INTERN my_bool srv_cleaner_thread_priority = FALSE; + +/* The relative priority of the master thread. */ +UNIV_INTERN my_bool srv_master_thread_priority = FALSE; + +/* The number of purge threads to use.*/ +UNIV_INTERN ulong srv_n_purge_threads = 1; + +/* the number of pages to purge in one batch */ +UNIV_INTERN ulong srv_purge_batch_size = 20; + +/* Internal setting for "innodb_stats_method". Decides how InnoDB treats +NULL value when collecting statistics. By default, it is set to +SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */ +UNIV_INTERN ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL; + +UNIV_INTERN srv_stats_t srv_stats; + +/* structure to pass status variables to MySQL */ +UNIV_INTERN export_var_t export_vars; + +/** Normally 0. When nonzero, skip some phases of crash recovery, +starting from SRV_FORCE_IGNORE_CORRUPT, so that data can be recovered +by SELECT or mysqldump. When this is nonzero, we do not allow any user +modifications to the data. */ +UNIV_INTERN ulong srv_force_recovery; +#ifndef DBUG_OFF +/** Inject a crash at different steps of the recovery process. +This is for testing and debugging only. */ +UNIV_INTERN ulong srv_force_recovery_crash; +#endif /* !DBUG_OFF */ + +/** Print all user-level transactions deadlocks to mysqld stderr */ + +UNIV_INTERN my_bool srv_print_all_deadlocks = FALSE; + +/** Enable INFORMATION_SCHEMA.innodb_cmp_per_index */ +UNIV_INTERN my_bool srv_cmp_per_index_enabled = FALSE; + +/* If the following is set to 1 then we do not run purge and insert buffer +merge to completion before shutdown. If it is set to 2, do not even flush the +buffer pool to data files at the shutdown: we effectively 'crash' +InnoDB (but lose no committed transactions). */ +UNIV_INTERN ulint srv_fast_shutdown = 0; + +/* Generate a innodb_status.<pid> file */ +UNIV_INTERN ibool srv_innodb_status = FALSE; + +/* When estimating number of different key values in an index, sample +this many index pages, there are 2 ways to calculate statistics: +* persistent stats that are calculated by ANALYZE TABLE and saved + in the innodb database. 
+* quick transient stats, which are used if persistent stats for the given + table/index are not found in the innodb database */ +UNIV_INTERN unsigned long long srv_stats_transient_sample_pages = 8; +UNIV_INTERN my_bool srv_stats_persistent = TRUE; +UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20; +UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE; + +UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE; +UNIV_INTERN ibool srv_use_atomic_writes = FALSE; +#ifdef HAVE_POSIX_FALLOCATE +UNIV_INTERN ibool srv_use_posix_fallocate = FALSE; +#endif + +/** The doublewrite buffer is 2MB in size, i.e. it can hold 128 16K pages. +The following parameter is the size of the buffer that is used for +batch flushing, i.e. LRU flushing and flush_list flushing. The rest +of the pages are used for single page flushing. */ +UNIV_INTERN ulong srv_doublewrite_batch_size = 120; + +UNIV_INTERN ulong srv_replication_delay = 0; + +UNIV_INTERN ulint srv_pass_corrupt_table = 0; /* 0:disable 1:enable */ + +UNIV_INTERN ulint srv_log_checksum_algorithm = + SRV_CHECKSUM_ALGORITHM_INNODB; + +/*-------------------------------------------*/ +UNIV_INTERN ulong srv_n_spin_wait_rounds = 30; +UNIV_INTERN ulong srv_spin_wait_delay = 6; +UNIV_INTERN ibool srv_priority_boost = TRUE; + +#ifdef UNIV_DEBUG +UNIV_INTERN ibool srv_print_thread_releases = FALSE; +UNIV_INTERN ibool srv_print_lock_waits = FALSE; +UNIV_INTERN ibool srv_print_buf_io = FALSE; +UNIV_INTERN ibool srv_print_log_io = FALSE; +UNIV_INTERN ibool srv_print_latch_waits = FALSE; +#endif /* UNIV_DEBUG */ + +static ulint srv_n_rows_inserted_old = 0; +static ulint srv_n_rows_updated_old = 0; +static ulint srv_n_rows_deleted_old = 0; +static ulint srv_n_rows_read_old = 0; + +UNIV_INTERN ulint srv_truncated_status_writes = 0; +UNIV_INTERN ulint srv_available_undo_logs = 0; + +/* Ensure status variables are on separate cache lines */ + +#define CACHE_LINE_SIZE 64 +#define CACHE_ALIGNED __attribute__ ((aligned (CACHE_LINE_SIZE))) + +UNIV_INTERN byte +counters_pad_start[CACHE_LINE_SIZE] __attribute__((unused)) = {0}; + +UNIV_INTERN ulint srv_read_views_memory CACHE_ALIGNED = 0; +UNIV_INTERN ulint srv_descriptors_memory CACHE_ALIGNED = 0; + +UNIV_INTERN byte +counters_pad_end[CACHE_LINE_SIZE] __attribute__((unused)) = {0}; + +/* Set the following to 0 if you want InnoDB to write messages on +stderr on startup/shutdown. */ +UNIV_INTERN ibool srv_print_verbose_log = TRUE; +UNIV_INTERN my_bool srv_print_innodb_monitor = FALSE; +UNIV_INTERN my_bool srv_print_innodb_lock_monitor = FALSE; +UNIV_INTERN ibool srv_print_innodb_tablespace_monitor = FALSE; +UNIV_INTERN ibool srv_print_innodb_table_monitor = FALSE; + +/* Array of English strings describing the current state of an +i/o handler thread */ + +UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS]; +UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; + +UNIV_INTERN time_t srv_last_monitor_time; + +UNIV_INTERN ib_mutex_t srv_innodb_monitor_mutex; + +/* Mutex for locking srv_monitor_file.
Not created if srv_read_only_mode */ +UNIV_INTERN ib_mutex_t srv_monitor_file_mutex; + +#ifdef UNIV_PFS_MUTEX +# ifndef HAVE_ATOMIC_BUILTINS +/* Key to register server_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t server_mutex_key; +# endif /* !HAVE_ATOMIC_BUILTINS */ +/** Key to register srv_innodb_monitor_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t srv_innodb_monitor_mutex_key; +/** Key to register srv_monitor_file_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t srv_monitor_file_mutex_key; +/** Key to register srv_dict_tmpfile_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t srv_dict_tmpfile_mutex_key; +/** Key to register srv_misc_tmpfile_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t srv_misc_tmpfile_mutex_key; +/** Key to register srv_sys_t::mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t srv_sys_mutex_key; +/** Key to register srv_sys_t::tasks_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t srv_sys_tasks_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/** Temporary file for innodb monitor output */ +UNIV_INTERN FILE* srv_monitor_file; +/** Mutex for locking srv_dict_tmpfile. Not created if srv_read_only_mode. +This mutex has a very high rank; threads reserving it should not +be holding any InnoDB latches. */ +UNIV_INTERN ib_mutex_t srv_dict_tmpfile_mutex; +/** Temporary file for output from the data dictionary */ +UNIV_INTERN FILE* srv_dict_tmpfile; +/** Mutex for locking srv_misc_tmpfile. Not created if srv_read_only_mode. +This mutex has a very low rank; threads reserving it should not +acquire any further latches or sleep before releasing this one. */ +UNIV_INTERN ib_mutex_t srv_misc_tmpfile_mutex; +/** Temporary file for miscellaneous diagnostic output */ +UNIV_INTERN FILE* srv_misc_tmpfile; + +UNIV_INTERN ulint srv_main_thread_process_no = 0; +UNIV_INTERN ulint srv_main_thread_id = 0; + +/* The following counts are used by the srv_master_thread. */ + +/** Iterations of the loop bounded by 'srv_active' label. */ +static ulint srv_main_active_loops = 0; +/** Iterations of the loop bounded by the 'srv_idle' label. */ +static ulint srv_main_idle_loops = 0; +/** Iterations of the loop bounded by the 'srv_shutdown' label. */ +static ulint srv_main_shutdown_loops = 0; +/** Log writes involving flush. */ +static ulint srv_log_writes_and_flush = 0; + +/* This is only ever touched by the master thread. It records the +time when the last flush of the log files happened. The master +thread ensures that we flush the log files at least once per +second. */ +static time_t srv_last_log_flush_time; + +/* Interval in seconds at which various tasks are performed by the +master thread when the server is active. In order to balance the workload, +we should try to keep intervals such that they are not multiples of +each other. For example, if we have intervals for various tasks +defined as 5, 10, 15, 60, then all tasks will be performed when +current_time % 60 == 0 and no tasks will be performed when +current_time % 5 != 0. */ + +# define SRV_MASTER_CHECKPOINT_INTERVAL (7) +# define SRV_MASTER_PURGE_INTERVAL (10) +#ifdef MEM_PERIODIC_CHECK +# define SRV_MASTER_MEM_VALIDATE_INTERVAL (13) +#endif /* MEM_PERIODIC_CHECK */ +# define SRV_MASTER_DICT_LRU_INTERVAL (47) + +/** Acquire the system_mutex. */ +#define srv_sys_mutex_enter() do { \ + mutex_enter(&srv_sys->mutex); \ +} while (0) + +/** Test if the system mutex is owned.
*/ +#define srv_sys_mutex_own() (mutex_own(&srv_sys->mutex) \ + && !srv_read_only_mode) + +/** Release the system mutex. */ +#define srv_sys_mutex_exit() do { \ + mutex_exit(&srv_sys->mutex); \ +} while (0) + +#define fetch_lock_wait_timeout(trx) \ + ((trx)->lock.allowed_to_wait \ + ? thd_lock_wait_timeout((trx)->mysql_thd) \ + : 0) + +/* + IMPLEMENTATION OF THE SERVER MAIN PROGRAM + ========================================= + +There is the following analogy between this database +server and an operating system kernel: + +DB concept equivalent OS concept +---------- --------------------- +transaction -- process; + +query thread -- thread; + +lock -- semaphore; + +kernel -- kernel; + +query thread execution: +(a) without lock mutex +reserved -- process executing in user mode; +(b) with lock mutex reserved + -- process executing in kernel mode; + +The server has several background threads all running at the same +priority as user threads. It periodically checks if there is anything +happening in the server which requires intervention of the master +thread. Such situations may be, for example, when flushing of dirty +blocks is needed in the buffer pool or old versions of database rows +have to be cleaned away (purged). The user can also configure separate +dedicated purge thread(s), in which case the master thread does not +do any purging. + +The threads which we call user threads serve the queries of the MySQL +server. They run at normal priority. + +When there is no activity in the system, the master thread also +suspends itself to wait for an event, making the server totally silent. + +There is still one complication in our server design. If a +background utility thread obtains a resource (e.g., a mutex) needed by a user +thread, and there is also some other user activity in the system, +the user thread may have to wait indefinitely for the +resource, as the OS does not schedule a background thread if +there is some other runnable user thread. This problem is called +priority inversion in real-time programming. + +One solution to the priority inversion problem would be to keep a record +of which thread owns which resource and, in the above case, boost the +priority of the background thread so that it will be scheduled and it +can release the resource. This solution is called priority inheritance +in real-time programming. A drawback of this solution is that the overhead +of acquiring a mutex increases slightly, maybe 0.2 microseconds on a 100 +MHz Pentium, because the thread has to call os_thread_get_curr_id. This may +be compared to the 0.5 microsecond overhead for a mutex lock-unlock pair. Note +that the thread cannot store the information in the resource, say a mutex, +itself, because competing threads could wipe out the information if it is +stored before acquiring the mutex, and if it is stored afterwards, the +information is outdated for the time of one machine instruction, at least. +(To be precise, the information could be stored in the lock_word of the mutex +if the machine supports atomic swap.) + +The above solution with priority inheritance may become relevant in the +future; currently we do not implement any priority twiddling solution. +Our general aim is to reduce the contention on all mutexes by making +them more fine-grained. + +The thread table contains information on the current status of each +thread existing in the system, and also the event semaphores used in +suspending the master thread and utility threads when they have nothing +to do.
+/** The server system struct */
+struct srv_sys_t{
+	ib_mutex_t	tasks_mutex;		/*!< variable protecting the
+						tasks queue */
+	UT_LIST_BASE_NODE_T(que_thr_t)
+			tasks;			/*!< task queue */
+
+	ib_mutex_t	mutex;			/*!< variable protecting the
+						fields below. */
+	ulint		n_sys_threads;		/*!< size of the sys_threads
+						array */
+
+	srv_slot_t*	sys_threads;		/*!< server thread table */
+
+	ulint		n_threads_active[SRV_MASTER + 1];
+						/*!< number of threads active
+						in a thread class */
+
+	srv_stats_t::ulint_ctr_1_t
+			activity_count;		/*!< For tracking server
+						activity */
+};
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+UNIV_INTERN ib_mutex_t	server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+static srv_sys_t*	srv_sys	= NULL;
+
+/** Event to signal the monitor thread. */
+UNIV_INTERN os_event_t	srv_monitor_event;
+
+/** Event to signal the error thread */
+UNIV_INTERN os_event_t	srv_error_event;
+
+/** Event to signal the buffer pool dump/load thread */
+UNIV_INTERN os_event_t	srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+UNIV_INTERN char*	srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+UNIV_INTERN char	srv_buffer_pool_dump_at_shutdown = FALSE;
+UNIV_INTERN char	srv_buffer_pool_load_at_startup = FALSE;
+
+/** Slot index in the srv_sys->sys_threads array for the purge thread. */
+static const ulint	SRV_PURGE_SLOT	= 1;
+
+/** Slot index in the srv_sys->sys_threads array for the master thread. */
+static const ulint	SRV_MASTER_SLOT = 0;
+
+UNIV_INTERN os_event_t	srv_checkpoint_completed_event;
+
+UNIV_INTERN os_event_t	srv_redo_log_tracked_event;
+
+UNIV_INTERN bool	srv_redo_log_thread_started = false;
+
+/*********************************************************************//**
+Prints counters for work done by srv_master_thread. */
+static
+void
+srv_print_master_thread_info(
+/*=========================*/
+	FILE	*file)	/* in: output stream */
+{
+	fprintf(file, "srv_master_thread loops: %lu srv_active, "
+		"%lu srv_shutdown, %lu srv_idle\n",
+		srv_main_active_loops,
+		srv_main_shutdown_loops,
+		srv_main_idle_loops);
+	fprintf(file, "srv_master_thread log flush and writes: %lu\n",
+		srv_log_writes_and_flush);
+}
+
+/*********************************************************************//**
+Sets the info describing an i/o thread's current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+	ulint		i,	/*!< in: the 'segment' of the i/o thread */
+	const char*	str)	/*!< in: constant char string describing the
+				state */
+{
+	ut_a(i < SRV_MAX_N_IO_THREADS);
+
+	srv_io_thread_op_info[i] = str;
+}
+
+/*********************************************************************//**
+Resets the info describing an i/o thread's current state. */
+UNIV_INTERN
+void
+srv_reset_io_thread_op_info()
+/*=========================*/
+{
+	for (ulint i = 0; i < UT_ARR_SIZE(srv_io_thread_op_info); ++i) {
+		srv_io_thread_op_info[i] = "not started yet";
+	}
+}
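+/* Illustrative use (a sketch; the exact call sites and state strings live
+in the I/O code, e.g. the AIO handler threads): each handler thread
+publishes a human-readable state string which SHOW ENGINE INNODB STATUS
+then prints in its FILE I/O section:
+
+	srv_set_io_thread_op_info(segment, "waiting for i/o requests");
+	... perform the i/o ...
+	srv_set_io_thread_op_info(segment, "doing file i/o");
+*/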
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Validates the type of a thread table slot.
+@return TRUE if ok */
+static
+ibool
+srv_thread_type_validate(
+/*=====================*/
+	srv_thread_type	type)	/*!< in: thread type */
+{
+	switch (type) {
+	case SRV_NONE:
+		break;
+	case SRV_WORKER:
+	case SRV_PURGE:
+	case SRV_MASTER:
+		return(TRUE);
+	}
+	ut_error;
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the type of a thread table slot.
+@return thread type */
+static
+srv_thread_type
+srv_slot_get_type(
+/*==============*/
+	const srv_slot_t*	slot)	/*!< in: thread slot */
+{
+	srv_thread_type	type = slot->type;
+	ut_ad(srv_thread_type_validate(type));
+	return(type);
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current thread.
+@return reserved slot */
+static
+srv_slot_t*
+srv_reserve_slot(
+/*=============*/
+	srv_thread_type	type)	/*!< in: type of the thread */
+{
+	srv_slot_t*	slot = 0;
+
+	srv_sys_mutex_enter();
+
+	ut_ad(srv_thread_type_validate(type));
+
+	switch (type) {
+	case SRV_MASTER:
+		slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+		break;
+
+	case SRV_PURGE:
+		slot = &srv_sys->sys_threads[SRV_PURGE_SLOT];
+		break;
+
+	case SRV_WORKER:
+		/* Find an empty slot, skip the master and purge slots. */
+		for (slot = &srv_sys->sys_threads[2];
+		     slot->in_use;
+		     ++slot) {
+
+			ut_a(slot < &srv_sys->sys_threads[
+			     srv_sys->n_sys_threads]);
+		}
+		break;
+
+	case SRV_NONE:
+		ut_error;
+	}
+
+	ut_a(!slot->in_use);
+
+	slot->in_use = TRUE;
+	slot->suspended = FALSE;
+	slot->type = type;
+
+	ut_ad(srv_slot_get_type(slot) == type);
+
+	++srv_sys->n_threads_active[type];
+
+	srv_sys_mutex_exit();
+
+	return(slot);
+}
+
+/*********************************************************************//**
+Suspends the calling thread to wait for the event in its thread slot.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread_low(
+/*===================*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+
+	ut_ad(!srv_read_only_mode);
+	ut_ad(srv_sys_mutex_own());
+
+	ut_ad(slot->in_use);
+
+	srv_thread_type	type = srv_slot_get_type(slot);
+
+	switch (type) {
+	case SRV_NONE:
+		ut_error;
+
+	case SRV_MASTER:
+		/* We have only one master thread and it
+		should be the first entry always. */
+		ut_a(srv_sys->n_threads_active[type] == 1);
+		break;
+
+	case SRV_PURGE:
+		/* We have only one purge coordinator thread
+		and it should be the second entry always. */
+		ut_a(srv_sys->n_threads_active[type] == 1);
+		break;
+
+	case SRV_WORKER:
+		ut_a(srv_n_purge_threads > 1);
+		ut_a(srv_sys->n_threads_active[type] > 0);
+		break;
+	}
+
+	ut_a(!slot->suspended);
+	slot->suspended = TRUE;
+
+	ut_a(srv_sys->n_threads_active[type] > 0);
+
+	srv_sys->n_threads_active[type]--;
+
+	return(os_event_reset(slot->event));
+}
+
+/*********************************************************************//**
+Suspends the calling thread to wait for the event in its thread slot.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread(
+/*===============*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+	srv_sys_mutex_enter();
+
+	ib_int64_t	sig_count = srv_suspend_thread_low(slot);
+
+	srv_sys_mutex_exit();
+
+	return(sig_count);
+}
+
+/*********************************************************************//**
+Releases threads of the given type from suspension in the thread table.
+NOTE! This function acquires the system mutex internally; the caller
+must not be holding it.
+@return number of threads released: this may be less than n if not + enough threads were suspended at the moment. */ +UNIV_INTERN +ulint +srv_release_threads( +/*================*/ + srv_thread_type type, /*!< in: thread type */ + ulint n) /*!< in: number of threads to release */ +{ + ulint i; + ulint count = 0; + + ut_ad(srv_thread_type_validate(type)); + ut_ad(n > 0); + + srv_sys_mutex_enter(); + + for (i = 0; i < srv_sys->n_sys_threads; i++) { + srv_slot_t* slot; + + slot = &srv_sys->sys_threads[i]; + + if (slot->in_use + && srv_slot_get_type(slot) == type + && slot->suspended) { + + switch (type) { + case SRV_NONE: + ut_error; + + case SRV_MASTER: + /* We have only one master thread and it + should be the first entry always. */ + ut_a(n == 1); + ut_a(i == SRV_MASTER_SLOT); + ut_a(srv_sys->n_threads_active[type] == 0); + break; + + case SRV_PURGE: + /* We have only one purge coordinator thread + and it should be the second entry always. */ + ut_a(n == 1); + ut_a(i == SRV_PURGE_SLOT); + ut_a(srv_n_purge_threads > 0); + ut_a(srv_sys->n_threads_active[type] == 0); + break; + + case SRV_WORKER: + ut_a(srv_n_purge_threads > 1); + ut_a(srv_sys->n_threads_active[type] + < srv_n_purge_threads - 1); + break; + } + + slot->suspended = FALSE; + + ++srv_sys->n_threads_active[type]; + + os_event_set(slot->event); + + if (++count == n) { + break; + } + } + } + + srv_sys_mutex_exit(); + + return(count); +} + +/*********************************************************************//** +Release a thread's slot. */ +static +void +srv_free_slot( +/*==========*/ + srv_slot_t* slot) /*!< in/out: thread slot */ +{ + srv_sys_mutex_enter(); + + if (!slot->suspended) { + /* Mark the thread as inactive. */ + srv_suspend_thread_low(slot); + } + + /* Free the slot for reuse. */ + ut_ad(slot->in_use); + slot->in_use = FALSE; + + srv_sys_mutex_exit(); +} + +/*********************************************************************//** +Initializes the server. */ +UNIV_INTERN +void +srv_init(void) +/*==========*/ +{ + ulint n_sys_threads = 0; + ulint srv_sys_sz = sizeof(*srv_sys); + +#ifndef HAVE_ATOMIC_BUILTINS + mutex_create(server_mutex_key, &server_mutex, SYNC_ANY_LATCH); +#endif /* !HAVE_ATOMIC_BUILTINS */ + + mutex_create(srv_innodb_monitor_mutex_key, + &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK); + + if (!srv_read_only_mode) { + + /* Number of purge threads + master thread */ + n_sys_threads = srv_n_purge_threads + 1; + + srv_sys_sz += n_sys_threads * sizeof(*srv_sys->sys_threads); + } + + srv_sys = static_cast<srv_sys_t*>(mem_zalloc(srv_sys_sz)); + + srv_sys->n_sys_threads = n_sys_threads; + + if (!srv_read_only_mode) { + + mutex_create(srv_sys_mutex_key, &srv_sys->mutex, SYNC_THREADS); + + mutex_create(srv_sys_tasks_mutex_key, + &srv_sys->tasks_mutex, SYNC_ANY_LATCH); + + srv_sys->sys_threads = (srv_slot_t*) &srv_sys[1]; + + for (ulint i = 0; i < srv_sys->n_sys_threads; ++i) { + srv_slot_t* slot = &srv_sys->sys_threads[i]; + + slot->event = os_event_create(); + + ut_a(slot->event); + } + + srv_error_event = os_event_create(); + + srv_monitor_event = os_event_create(); + + srv_buf_dump_event = os_event_create(); + + srv_checkpoint_completed_event = os_event_create(); + + if (srv_track_changed_pages) { + srv_redo_log_tracked_event = os_event_create(); + os_event_set(srv_redo_log_tracked_event); + } + + UT_LIST_INIT(srv_sys->tasks); + } + + /* page_zip_stat_per_index_mutex is acquired from: + 1. page_zip_compress() (after SYNC_FSP) + 2. page_zip_decompress() + 3. 
i_s_cmp_per_index_fill_low() (where SYNC_DICT is acquired)
+	4. innodb_cmp_per_index_update(), no other latches.
+	Since we do not acquire any other latches while holding this mutex,
+	it can have a very low level. We pick SYNC_ANY_LATCH for it. */
+
+	mutex_create(
+		page_zip_stat_per_index_mutex_key,
+		&page_zip_stat_per_index_mutex, SYNC_ANY_LATCH);
+
+	/* Create dummy indexes for infimum and supremum records */
+
+	dict_ind_init();
+
+	srv_conc_init();
+
+	/* Initialize some INFORMATION SCHEMA internal structures */
+	trx_i_s_cache_init(trx_i_s_cache);
+
+	ut_crc32_init();
+
+	dict_mem_init();
+}
+
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void)
+/*==========*/
+{
+	srv_conc_free();
+
+	/* The mutexes srv_sys->mutex and srv_sys->tasks_mutex should have
+	been freed by sync_close() already. */
+	mem_free(srv_sys);
+	srv_sys = NULL;
+
+	trx_i_s_cache_free(trx_i_s_cache);
+
+	if (!srv_read_only_mode) {
+		os_event_free(srv_buf_dump_event);
+		srv_buf_dump_event = NULL;
+	}
+}
+
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void)
+/*==================*/
+{
+	ut_mem_init();
+	/* Reset the system variables in the recovery module. */
+	recv_sys_var_init();
+	os_sync_init();
+	sync_init();
+	mem_init(srv_mem_pool_size);
+	que_init();
+	row_mysql_init();
+}
+
+/*********************************************************************//**
+Normalizes init parameter values to use the units we use inside InnoDB. */
+static
+void
+srv_normalize_init_values(void)
+/*===========================*/
+{
+	ulint	n;
+	ulint	i;
+
+	n = srv_n_data_files;
+
+	for (i = 0; i < n; i++) {
+		srv_data_file_sizes[i] = srv_data_file_sizes[i]
+			* ((1024 * 1024) / UNIV_PAGE_SIZE);
+	}
+
+	srv_last_file_size_max = srv_last_file_size_max
+		* ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+	srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
+
+	srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
+
+	srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
+}
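+/* A worked example (illustrative): the data file sizes above arrive in
+megabytes and are converted to pages. With the default UNIV_PAGE_SIZE of
+16384 bytes, (1024 * 1024) / UNIV_PAGE_SIZE == 64, so a 100 MB data file
+becomes 100 * 64 == 6400 pages. The log sizes, in contrast, arrive in
+bytes and are simply divided by the page size. */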
+/*********************************************************************//**
+Boots the InnoDB server. */
+UNIV_INTERN
+void
+srv_boot(void)
+/*==========*/
+{
+	/* Transform the init parameter values given by MySQL to
+	use the units we use inside InnoDB: */
+
+	srv_normalize_init_values();
+
+	/* Initialize synchronization primitives, memory management, and
+	thread local storage */
+
+	srv_general_init();
+
+	/* Initialize this module */
+
+	srv_init();
+	srv_mon_create();
+}
+
+/******************************************************************//**
+Refreshes the values used to calculate per-second averages. */
+static
+void
+srv_refresh_innodb_monitor_stats(void)
+/*==================================*/
+{
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	srv_last_monitor_time = time(NULL);
+
+	os_aio_refresh_stats();
+
+	btr_cur_n_sea_old = btr_cur_n_sea;
+	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+	log_refresh_stats();
+
+	buf_refresh_io_stats_all();
+
+	srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
+	srv_n_rows_updated_old = srv_stats.n_rows_updated;
+	srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
+	srv_n_rows_read_old = srv_stats.n_rows_read;
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for the
+				lock_sys_t::mutex */
+	ulint*	trx_start_pos,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end)	/*!< out: file position of the end of
+				the list of active transactions */
+{
+	double	time_elapsed;
+	time_t	current_time;
+	ulint	n_reserved;
+	ibool	ret;
+
+	ulong	btr_search_sys_constant;
+	ulong	btr_search_sys_variable;
+	ulint	lock_sys_subtotal;
+	ulint	recv_sys_subtotal;
+
+	ulint	i;
+	trx_t*	trx;
+
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	current_time = time(NULL);
+
+	/* We add 0.001 seconds to time_elapsed to prevent division
+	by zero if two users happen to call SHOW ENGINE INNODB STATUS at the
+	same time */
+
+	time_elapsed = difftime(current_time, srv_last_monitor_time)
+		+ 0.001;
+
+	srv_last_monitor_time = time(NULL);
+
+	fputs("\n=====================================\n", file);
+
+	ut_print_timestamp(file);
+	fprintf(file,
+		" INNODB MONITOR OUTPUT\n"
+		"=====================================\n"
+		"Per second averages calculated from the last %lu seconds\n",
+		(ulong) time_elapsed);
+
+	fputs("-----------------\n"
+	      "BACKGROUND THREAD\n"
+	      "-----------------\n", file);
+	srv_print_master_thread_info(file);
+
+	fputs("----------\n"
+	      "SEMAPHORES\n"
+	      "----------\n", file);
+	sync_print(file);
+
+	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
+	order level in sync0sync.h, while dict_foreign_err_mutex has a very
+	low level 135. Therefore we can reserve the latter mutex here without
+	a danger of a deadlock of threads. */
+
+	mutex_enter(&dict_foreign_err_mutex);
+
+	if (!srv_read_only_mode && ftell(dict_foreign_err_file) != 0L) {
+		fputs("------------------------\n"
+		      "LATEST FOREIGN KEY ERROR\n"
+		      "------------------------\n", file);
+		ut_copy_file(file, dict_foreign_err_file);
+	}
+
+	mutex_exit(&dict_foreign_err_mutex);
+
+	/* Only if lock_print_info_summary() proceeds correctly do we call
+	lock_print_info_all_transactions() below to print all the lock
+	information. IMPORTANT NOTE: This function acquires the lock mutex
+	on success. */
+	ret = lock_print_info_summary(file, nowait);
+
+	if (ret) {
+		if (trx_start_pos) {
+			long	t = ftell(file);
+			if (t < 0) {
+				*trx_start_pos = ULINT_UNDEFINED;
+			} else {
+				*trx_start_pos = (ulint) t;
+			}
+		}
+
+		/* NOTE: If we get here then we have the lock mutex. This
+		function will release the lock mutex that we acquired when
+		we called the lock_print_info_summary() function earlier.
*/ + + lock_print_info_all_transactions(file); + + if (trx_end) { + long t = ftell(file); + if (t < 0) { + *trx_end = ULINT_UNDEFINED; + } else { + *trx_end = (ulint) t; + } + } + } + + fputs("--------\n" + "FILE I/O\n" + "--------\n", file); + os_aio_print(file); + + fputs("-------------------------------------\n" + "INSERT BUFFER AND ADAPTIVE HASH INDEX\n" + "-------------------------------------\n", file); + ibuf_print(file); + + + fprintf(file, + "%.2f hash searches/s, %.2f non-hash searches/s\n", + (btr_cur_n_sea - btr_cur_n_sea_old) + / time_elapsed, + (btr_cur_n_non_sea - btr_cur_n_non_sea_old) + / time_elapsed); + btr_cur_n_sea_old = btr_cur_n_sea; + btr_cur_n_non_sea_old = btr_cur_n_non_sea; + + fputs("---\n" + "LOG\n" + "---\n", file); + log_print(file); + + fputs("----------------------\n" + "BUFFER POOL AND MEMORY\n" + "----------------------\n", file); + fprintf(file, + "Total memory allocated " ULINTPF + "; in additional pool allocated " ULINTPF "\n", + ut_total_allocated_memory, + mem_pool_get_reserved(mem_comm_pool)); + + fprintf(file, + "Total memory allocated by read views " ULINTPF "\n", + os_atomic_increment_lint(&srv_read_views_memory, 0)); + + /* Calculate AHI constant and variable memory allocations */ + + btr_search_sys_constant = 0; + btr_search_sys_variable = 0; + + ut_ad(btr_search_sys->hash_tables); + + for (i = 0; i < btr_search_index_num; i++) { + hash_table_t* ht = btr_search_sys->hash_tables[i]; + + ut_ad(ht); + ut_ad(ht->heap); + + /* Multiple mutexes/heaps are currently never used for adaptive + hash index tables. */ + ut_ad(!ht->n_sync_obj); + ut_ad(!ht->heaps); + + btr_search_sys_variable += mem_heap_get_size(ht->heap); + btr_search_sys_constant += ht->n_cells * sizeof(hash_cell_t); + } + + lock_sys_subtotal = 0; + if (trx_sys) { + mutex_enter(&trx_sys->mutex); + trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); + while (trx) { + lock_sys_subtotal + += ((trx->lock.lock_heap) + ? mem_heap_get_size(trx->lock.lock_heap) + : 0); + trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); + } + mutex_exit(&trx_sys->mutex); + } + + recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash) + ? mem_heap_get_size(recv_sys->heap) : 0); + + fprintf(file, + "Internal hash tables (constant factor + variable factor)\n" + " Adaptive hash index %lu \t(%lu + " ULINTPF ")\n" + " Page hash %lu (buffer pool 0 only)\n" + " Dictionary cache %lu \t(%lu + " ULINTPF ")\n" + " File system %lu \t(%lu + " ULINTPF ")\n" + " Lock system %lu \t(%lu + " ULINTPF ")\n" + " Recovery system %lu \t(%lu + " ULINTPF ")\n", + + btr_search_sys_constant + btr_search_sys_variable, + btr_search_sys_constant, + btr_search_sys_variable, + + (ulong) (buf_pool_from_array(0)->page_hash->n_cells * sizeof(hash_cell_t)), + + (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells + + dict_sys->table_id_hash->n_cells + ) * sizeof(hash_cell_t) + + dict_sys->size) : 0), + (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells + + dict_sys->table_id_hash->n_cells + ) * sizeof(hash_cell_t)) : 0), + dict_sys ? (dict_sys->size) : 0, + + (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t) + + fil_system_hash_nodes()), + (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)), + fil_system_hash_nodes(), + + (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0) + + lock_sys_subtotal), + (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0), + lock_sys_subtotal, + + (ulong) (((recv_sys && recv_sys->addr_hash) + ? 
(recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0) + + recv_sys_subtotal), + (ulong) ((recv_sys && recv_sys->addr_hash) + ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0), + recv_sys_subtotal); + + fprintf(file, "Dictionary memory allocated " ULINTPF "\n", + dict_sys->size); + + buf_print_io(file); + + fputs("--------------\n" + "ROW OPERATIONS\n" + "--------------\n", file); + fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n", + (long) srv_conc_get_active_threads(), + srv_conc_get_waiting_threads()); + + mutex_enter(&trx_sys->mutex); + + fprintf(file, "%lu read views open inside InnoDB\n", + UT_LIST_GET_LEN(trx_sys->view_list)); + + fprintf(file, "%lu RW transactions active inside InnoDB\n", + UT_LIST_GET_LEN(trx_sys->rw_trx_list)); + + fprintf(file, "%lu RO transactions active inside InnoDB\n", + UT_LIST_GET_LEN(trx_sys->ro_trx_list)); + + fprintf(file, "%lu out of %lu descriptors used\n", + trx_sys->descr_n_used, trx_sys->descr_n_max); + + if (UT_LIST_GET_LEN(trx_sys->view_list)) { + read_view_t* view = UT_LIST_GET_LAST(trx_sys->view_list); + + if (view) { + fprintf(file, "---OLDEST VIEW---\n"); + read_view_print(file, view); + fprintf(file, "-----------------\n"); + } + } + + mutex_exit(&trx_sys->mutex); + + n_reserved = fil_space_get_n_reserved_extents(0); + if (n_reserved > 0) { + fprintf(file, + "%lu tablespace extents now reserved for" + " B-tree split operations\n", + (ulong) n_reserved); + } + +#ifdef UNIV_LINUX + fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n", + (ulong) srv_main_thread_process_no, + (ulong) srv_main_thread_id, + srv_main_thread_op_info); +#else + fprintf(file, "Main thread id %lu, state: %s\n", + (ulong) srv_main_thread_id, + srv_main_thread_op_info); +#endif + fprintf(file, + "Number of rows inserted " ULINTPF + ", updated " ULINTPF ", deleted " ULINTPF + ", read " ULINTPF "\n", + (ulint) srv_stats.n_rows_inserted, + (ulint) srv_stats.n_rows_updated, + (ulint) srv_stats.n_rows_deleted, + (ulint) srv_stats.n_rows_read); + fprintf(file, + "%.2f inserts/s, %.2f updates/s," + " %.2f deletes/s, %.2f reads/s\n", + ((ulint) srv_stats.n_rows_inserted - srv_n_rows_inserted_old) + / time_elapsed, + ((ulint) srv_stats.n_rows_updated - srv_n_rows_updated_old) + / time_elapsed, + ((ulint) srv_stats.n_rows_deleted - srv_n_rows_deleted_old) + / time_elapsed, + ((ulint) srv_stats.n_rows_read - srv_n_rows_read_old) + / time_elapsed); + + srv_n_rows_inserted_old = srv_stats.n_rows_inserted; + srv_n_rows_updated_old = srv_stats.n_rows_updated; + srv_n_rows_deleted_old = srv_stats.n_rows_deleted; + srv_n_rows_read_old = srv_stats.n_rows_read; + + fputs("----------------------------\n" + "END OF INNODB MONITOR OUTPUT\n" + "============================\n", file); + mutex_exit(&srv_innodb_monitor_mutex); + fflush(file); + + return(ret); +} + +/******************************************************************//** +Function to pass InnoDB status variables to MySQL */ +UNIV_INTERN +void +srv_export_innodb_status(void) +/*==========================*/ +{ + buf_pool_stat_t stat; + buf_pools_list_size_t buf_pools_list_size; + ulint LRU_len; + ulint free_len; + ulint flush_list_len; + ulint mem_adaptive_hash, mem_dictionary; + read_view_t* oldest_view; + ulint i; + + buf_get_total_stat(&stat); + buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len); + buf_get_total_list_size_in_bytes(&buf_pools_list_size); + + mem_adaptive_hash = 0; + + ut_ad(btr_search_sys->hash_tables); + + for (i = 0; i < btr_search_index_num; i++) { + 
hash_table_t* ht = btr_search_sys->hash_tables[i]; + + ut_ad(ht); + ut_ad(ht->heap); + /* Multiple mutexes/heaps are currently never used for adaptive + hash index tables. */ + ut_ad(!ht->n_sync_obj); + ut_ad(!ht->heaps); + + mem_adaptive_hash += mem_heap_get_size(ht->heap); + mem_adaptive_hash += ht->n_cells * sizeof(hash_cell_t); + } + + mem_dictionary = (dict_sys ? ((dict_sys->table_hash->n_cells + + dict_sys->table_id_hash->n_cells + ) * sizeof(hash_cell_t) + + dict_sys->size) : 0); + + mutex_enter(&srv_innodb_monitor_mutex); + + export_vars.innodb_data_pending_reads = + os_n_pending_reads; + + export_vars.innodb_data_pending_writes = + os_n_pending_writes; + + export_vars.innodb_data_pending_fsyncs = + fil_n_pending_log_flushes + + fil_n_pending_tablespace_flushes; + export_vars.innodb_adaptive_hash_hash_searches + = btr_cur_n_sea; + export_vars.innodb_adaptive_hash_non_hash_searches + = btr_cur_n_non_sea; + export_vars.innodb_background_log_sync + = srv_log_writes_and_flush; + + export_vars.innodb_data_fsyncs = os_n_fsyncs; + + export_vars.innodb_data_read = srv_stats.data_read; + + export_vars.innodb_data_reads = os_n_file_reads; + + export_vars.innodb_data_writes = os_n_file_writes; + + export_vars.innodb_data_written = srv_stats.data_written; + + export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets; + + export_vars.innodb_buffer_pool_write_requests = + srv_stats.buf_pool_write_requests; + + export_vars.innodb_buffer_pool_wait_free = + srv_stats.buf_pool_wait_free; + + export_vars.innodb_buffer_pool_pages_flushed = + srv_stats.buf_pool_flushed; + + export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads; + + export_vars.innodb_buffer_pool_read_ahead_rnd = + stat.n_ra_pages_read_rnd; + + export_vars.innodb_buffer_pool_read_ahead = + stat.n_ra_pages_read; + + export_vars.innodb_buffer_pool_read_ahead_evicted = + stat.n_ra_pages_evicted; + + export_vars.innodb_buffer_pool_pages_LRU_flushed = + stat.buf_lru_flush_page_count; + + export_vars.innodb_buffer_pool_pages_data = LRU_len; + + export_vars.innodb_buffer_pool_bytes_data = + buf_pools_list_size.LRU_bytes + + buf_pools_list_size.unzip_LRU_bytes; + + export_vars.innodb_buffer_pool_pages_dirty = flush_list_len; + + export_vars.innodb_buffer_pool_bytes_dirty = + buf_pools_list_size.flush_list_bytes; + + export_vars.innodb_buffer_pool_pages_free = free_len; + + export_vars.innodb_deadlocks = srv_stats.lock_deadlock_count; + +#ifdef UNIV_DEBUG + export_vars.innodb_buffer_pool_pages_latched = + buf_get_latched_pages_number(); +#endif /* UNIV_DEBUG */ + export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages(); + + export_vars.innodb_buffer_pool_pages_misc = + buf_pool_get_n_pages() - LRU_len - free_len; + + export_vars.innodb_buffer_pool_pages_made_young + = stat.n_pages_made_young; + export_vars.innodb_buffer_pool_pages_made_not_young + = stat.n_pages_not_made_young; + export_vars.innodb_buffer_pool_pages_old = 0; + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool = buf_pool_from_array(i); + export_vars.innodb_buffer_pool_pages_old + += buf_pool->LRU_old_len; + } + export_vars.innodb_checkpoint_age + = (log_sys->lsn - log_sys->last_checkpoint_lsn); + export_vars.innodb_checkpoint_max_age + = log_sys->max_checkpoint_age; + export_vars.innodb_history_list_length + = trx_sys->rseg_history_len; + ibuf_export_ibuf_status( + &export_vars.innodb_ibuf_size, + &export_vars.innodb_ibuf_free_list, + &export_vars.innodb_ibuf_segment_size, + &export_vars.innodb_ibuf_merges, + 
&export_vars.innodb_ibuf_merged_inserts, + &export_vars.innodb_ibuf_merged_delete_marks, + &export_vars.innodb_ibuf_merged_deletes, + &export_vars.innodb_ibuf_discarded_inserts, + &export_vars.innodb_ibuf_discarded_delete_marks, + &export_vars.innodb_ibuf_discarded_deletes); + export_vars.innodb_lsn_current + = log_sys->lsn; + export_vars.innodb_lsn_flushed + = log_sys->flushed_to_disk_lsn; + export_vars.innodb_lsn_last_checkpoint + = log_sys->last_checkpoint_lsn; + export_vars.innodb_master_thread_active_loops + = srv_main_active_loops; + export_vars.innodb_master_thread_idle_loops + = srv_main_idle_loops; + export_vars.innodb_max_trx_id + = trx_sys->max_trx_id; + export_vars.innodb_mem_adaptive_hash + = mem_adaptive_hash; + export_vars.innodb_mem_dictionary + = mem_dictionary; + export_vars.innodb_mem_total + = ut_total_allocated_memory; + export_vars.innodb_mutex_os_waits + = mutex_os_wait_count; + export_vars.innodb_mutex_spin_rounds + = mutex_spin_round_count; + export_vars.innodb_mutex_spin_waits + = mutex_spin_wait_count; + export_vars.innodb_s_lock_os_waits + = rw_lock_stats.rw_s_os_wait_count; + export_vars.innodb_s_lock_spin_rounds + = rw_lock_stats.rw_s_spin_round_count; + export_vars.innodb_s_lock_spin_waits + = rw_lock_stats.rw_s_spin_wait_count; + export_vars.innodb_x_lock_os_waits + = rw_lock_stats.rw_x_os_wait_count; + export_vars.innodb_x_lock_spin_rounds + = rw_lock_stats.rw_x_spin_round_count; + export_vars.innodb_x_lock_spin_waits + = rw_lock_stats.rw_x_spin_wait_count; + + oldest_view = UT_LIST_GET_LAST(trx_sys->view_list); + export_vars.innodb_oldest_view_low_limit_trx_id + = oldest_view ? oldest_view->low_limit_id : 0; + + export_vars.innodb_purge_trx_id = purge_sys->limit.trx_no; + export_vars.innodb_purge_undo_no = purge_sys->limit.undo_no; + export_vars.innodb_current_row_locks + = lock_sys->rec_num; + +#ifdef HAVE_ATOMIC_BUILTINS + export_vars.innodb_have_atomic_builtins = 1; +#else + export_vars.innodb_have_atomic_builtins = 0; +#endif + export_vars.innodb_page_size = UNIV_PAGE_SIZE; + + export_vars.innodb_log_waits = srv_stats.log_waits; + + export_vars.innodb_os_log_written = srv_stats.os_log_written; + + export_vars.innodb_os_log_fsyncs = fil_n_log_flushes; + + export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes; + + export_vars.innodb_os_log_pending_writes = + srv_stats.os_log_pending_writes; + + export_vars.innodb_log_write_requests = srv_stats.log_write_requests; + + export_vars.innodb_log_writes = srv_stats.log_writes; + + export_vars.innodb_dblwr_pages_written = + srv_stats.dblwr_pages_written; + + export_vars.innodb_dblwr_writes = srv_stats.dblwr_writes; + + export_vars.innodb_pages_created = stat.n_pages_created; + + export_vars.innodb_pages_read = stat.n_pages_read; + + export_vars.innodb_pages_written = stat.n_pages_written; + + export_vars.innodb_row_lock_waits = srv_stats.n_lock_wait_count; + + export_vars.innodb_row_lock_current_waits = + srv_stats.n_lock_wait_current_count; + + export_vars.innodb_row_lock_time = srv_stats.n_lock_wait_time / 1000; + + if (srv_stats.n_lock_wait_count > 0) { + + export_vars.innodb_row_lock_time_avg = (ulint) + (srv_stats.n_lock_wait_time + / 1000 / srv_stats.n_lock_wait_count); + + } else { + export_vars.innodb_row_lock_time_avg = 0; + } + + export_vars.innodb_row_lock_time_max = + lock_sys->n_lock_max_wait_time / 1000; + + export_vars.innodb_rows_read = srv_stats.n_rows_read; + + export_vars.innodb_rows_inserted = srv_stats.n_rows_inserted; + + export_vars.innodb_rows_updated = 
srv_stats.n_rows_updated;
+
+	export_vars.innodb_rows_deleted = srv_stats.n_rows_deleted;
+
+	export_vars.innodb_num_open_files = fil_n_file_opened;
+
+	export_vars.innodb_truncated_status_writes =
+		srv_truncated_status_writes;
+
+	export_vars.innodb_available_undo_logs = srv_available_undo_logs;
+	export_vars.innodb_read_views_memory
+		= os_atomic_increment_lint(&srv_read_views_memory, 0);
+	export_vars.innodb_descriptors_memory
+		= os_atomic_increment_lint(&srv_descriptors_memory, 0);
+
+#ifdef UNIV_DEBUG
+	rw_lock_s_lock(&purge_sys->latch);
+	trx_id_t	done_trx_no = purge_sys->done.trx_no;
+	trx_id_t	up_limit_id = purge_sys->view
+		? purge_sys->view->up_limit_id
+		: 0;
+	rw_lock_s_unlock(&purge_sys->latch);
+
+	mutex_enter(&trx_sys->mutex);
+	trx_id_t	max_trx_id = trx_sys->rw_max_trx_id;
+	mutex_exit(&trx_sys->mutex);
+
+	if (!done_trx_no || max_trx_id < done_trx_no - 1) {
+		export_vars.innodb_purge_trx_id_age = 0;
+	} else {
+		export_vars.innodb_purge_trx_id_age =
+			(ulint) (max_trx_id - done_trx_no + 1);
+	}
+
+	if (!up_limit_id
+	    || max_trx_id < up_limit_id) {
+		export_vars.innodb_purge_view_trx_id_age = 0;
+	} else {
+		export_vars.innodb_purge_view_trx_id_age =
+			(ulint) (max_trx_id - up_limit_id);
+	}
+#endif /* UNIV_DEBUG */
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	ib_int64_t	sig_count;
+	double		time_elapsed;
+	time_t		current_time;
+	time_t		last_table_monitor_time;
+	time_t		last_tablespace_monitor_time;
+	time_t		last_monitor_time;
+	ulint		mutex_skipped;
+	ibool		last_srv_print_monitor;
+
+	ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Monitor thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_monitor_thread_key);
+#endif /* UNIV_PFS_THREAD */
+	srv_monitor_active = TRUE;
+
+	UT_NOT_USED(arg);
+	srv_last_monitor_time = ut_time();
+	last_table_monitor_time = ut_time();
+	last_tablespace_monitor_time = ut_time();
+	last_monitor_time = ut_time();
+	mutex_skipped = 0;
+	last_srv_print_monitor = srv_print_innodb_monitor;
+loop:
+	/* Wake up every 5 seconds to see if we need to print
+	monitor information or if signalled at shutdown. */
+
+	sig_count = os_event_reset(srv_monitor_event);
+
+	os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
+
+	current_time = ut_time();
+
+	time_elapsed = difftime(current_time, last_monitor_time);
+
+	if (time_elapsed > 15) {
+		last_monitor_time = ut_time();
+
+		if (srv_print_innodb_monitor) {
+			/* Reset the mutex_skipped counter every time
+			srv_print_innodb_monitor changes. This is to
+			ensure we will not be blocked by lock_sys->mutex
+			for short duration information printing,
+			such as that requested by
+			sync_array_print_long_waits() */
+			if (!last_srv_print_monitor) {
+				mutex_skipped = 0;
+				last_srv_print_monitor = TRUE;
+			}
+
+			if (!srv_printf_innodb_monitor(stderr,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				/* Reset the counter */
+				mutex_skipped = 0;
+			}
+		} else {
+			last_srv_print_monitor = FALSE;
+		}
+
+
+		/* We don't create the temp files or associated
+		mutexes in read-only mode */
+
+		if (!srv_read_only_mode && srv_innodb_status) {
+			mutex_enter(&srv_monitor_file_mutex);
+			rewind(srv_monitor_file);
+			if (!srv_printf_innodb_monitor(srv_monitor_file,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				mutex_skipped = 0;
+			}
+
+			os_file_set_eof(srv_monitor_file);
+			mutex_exit(&srv_monitor_file_mutex);
+		}
+
+		if (srv_print_innodb_tablespace_monitor
+		    && difftime(current_time,
+				last_tablespace_monitor_time) > 60) {
+			last_tablespace_monitor_time = ut_time();
+
+			fputs("========================"
+			      "========================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "========================"
+			      "========================\n",
+			      stderr);
+
+			fsp_print(0);
+			fputs("Validating tablespace\n", stderr);
+			fsp_validate(0);
+			fputs("Validation ok\n"
+			      "---------------------------------------\n"
+			      "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "=======================================\n",
+			      stderr);
+		}
+
+		if (srv_print_innodb_table_monitor
+		    && difftime(current_time, last_table_monitor_time) > 60) {
+
+			last_table_monitor_time = ut_time();
+
+			fprintf(stderr, "Warning: %s\n",
+				DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+
+			fputs("===========================================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLE MONITOR OUTPUT\n"
+			      "===========================================\n",
+			      stderr);
+			dict_print();
+
+			fputs("-----------------------------------\n"
+			      "END OF INNODB TABLE MONITOR OUTPUT\n"
+			      "==================================\n",
+			      stderr);
+
+			fprintf(stderr, "Warning: %s\n",
+				DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+		}
+	}
+
+	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+		goto exit_func;
+	}
+
+	goto loop;
+
+exit_func:
+	srv_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
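+/* How this thread is driven (a sketch, not a verbatim call site): the
+startup code creates it with os_thread_create(), after which the thread
+sleeps up to 5 seconds per iteration in os_event_wait_time_low().
+Anything wanting an immediate wakeup (e.g. at shutdown) simply does:
+
+	os_event_set(srv_monitor_event);
+
+as srv_any_background_threads_are_active() below does. */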
+/*********************************************************************//**
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	/* number of successive fatal timeouts observed */
+	ulint		fatal_cnt	= 0;
+	lsn_t		old_lsn;
+	lsn_t		new_lsn;
+	ib_int64_t	sig_count;
+	/* longest waiting thread for a semaphore */
+	os_thread_id_t	waiter		= os_thread_get_curr_id();
+	os_thread_id_t	old_waiter	= waiter;
+	/* the semaphore that is being waited for */
+	const void*	sema		= NULL;
+	const void*	old_sema	= NULL;
+
+	ut_ad(!srv_read_only_mode);
+
+	old_lsn = srv_start_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Error monitor thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_error_monitor_thread_key);
+#endif /* UNIV_PFS_THREAD */
+	srv_error_monitor_active = TRUE;
+
+loop:
+	/* Try to track a strange bug reported by Harald Fuchs and others,
+	where the lsn seems to decrease at times */
+
+	new_lsn = log_get_lsn();
+
+	if (new_lsn < old_lsn) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: old log sequence number " LSN_PF
+			" was greater\n"
+			"InnoDB: than the new log sequence number " LSN_PF "!\n"
+			"InnoDB: Please submit a bug report"
+			" to http://bugs.mysql.com\n",
+			old_lsn, new_lsn);
+		ut_ad(0);
+	}
+
+	old_lsn = new_lsn;
+
+	if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+		/* We refresh the InnoDB Monitor values so that the averages
+		are printed from at most the last 60 seconds */
+
+		srv_refresh_innodb_monitor_stats();
+	}
+
+	/* Update the statistics collected for deciding the LRU
+	eviction policy. */
+	buf_LRU_stat_update();
+
+	/* In case mutex_exit is not a memory barrier, it is
+	theoretically possible some threads are left waiting though
+	the semaphore is already released. Wake up those threads: */
+
+	sync_arr_wake_threads_if_sema_free();
+
+	if (sync_array_print_long_waits(&waiter, &sema)
+	    && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
+		fatal_cnt++;
+		if (fatal_cnt > 10) {
+
+			fprintf(stderr,
+				"InnoDB: Error: semaphore wait has lasted"
+				" > %lu seconds\n"
+				"InnoDB: We intentionally crash the server,"
+				" because it appears to be hung.\n",
+				(ulong) srv_fatal_semaphore_wait_threshold);
+
+			ut_error;
+		}
+	} else {
+		fatal_cnt = 0;
+		old_waiter = waiter;
+		old_sema = sema;
+	}
+
+	if (srv_kill_idle_transaction && trx_sys) {
+		trx_t*	trx;
+		time_t	now;
+rescan_idle:
+		now = time(NULL);
+		mutex_enter(&trx_sys->mutex);
+		trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+		while (trx) {
+			if (!trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+			    && trx_state_eq(trx, TRX_STATE_ACTIVE)
+			    && trx->mysql_thd
+			    && innobase_thd_is_idle(trx->mysql_thd)) {
+				ib_int64_t	start_time = innobase_thd_get_start_time(trx->mysql_thd);
+				ulong		thd_id = innobase_thd_get_thread_id(trx->mysql_thd);
+
+				if (trx->last_stmt_start != start_time) {
+					trx->idle_start = now;
+					trx->last_stmt_start = start_time;
+				} else if (difftime(now, trx->idle_start)
+					   > srv_kill_idle_transaction) {
+					/* kill the session */
+					mutex_exit(&trx_sys->mutex);
+					innobase_thd_kill(thd_id);
+					goto rescan_idle;
+				}
+			}
+			trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+		}
+		mutex_exit(&trx_sys->mutex);
+	}
+
+	/* Flush stderr so that a database user gets the output
+	to a possible MySQL error file */
+
+	fflush(stderr);
+
+	sig_count = os_event_reset(srv_error_event);
+
+	os_event_wait_time_low(srv_error_event, 1000000, sig_count);
+
+	if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
+
+		goto loop;
+	}
+
+	srv_error_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/******************************************************************//**
+Increment the server activity count. */
+UNIV_INTERN
+void
+srv_inc_activity_count(void)
+/*========================*/
+{
+	srv_sys->activity_count.inc();
+}
+
+/**********************************************************************//**
+Check whether any background thread is active. If so, return the thread
+type.
+@return SRV_NONE if all are suspended or have exited, thread
+type if any are still active. */
+UNIV_INTERN
+srv_thread_type
+srv_get_active_thread_type(void)
+/*============================*/
+{
+	srv_thread_type ret = SRV_NONE;
+
+	if (srv_read_only_mode) {
+		return(SRV_NONE);
+	}
+
+	srv_sys_mutex_enter();
+
+	for (ulint i = SRV_WORKER; i <= SRV_MASTER; ++i) {
+		if (srv_sys->n_threads_active[i] != 0) {
+			ret = static_cast<srv_thread_type>(i);
+			break;
+		}
+	}
+
+	srv_sys_mutex_exit();
+
+	/* Check only on shutdown. */
+	if (ret == SRV_NONE
+	    && srv_shutdown_state != SRV_SHUTDOWN_NONE
+	    && trx_purge_state() != PURGE_STATE_DISABLED
+	    && trx_purge_state() != PURGE_STATE_EXIT) {
+
+		ret = SRV_PURGE;
+	}
+
+	return(ret);
+}
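+/* Shutdown code can poll the function above until all background threads
+have quiesced; a minimal sketch (the real wait loop in the startup and
+shutdown code is more elaborate):
+
+	while (srv_get_active_thread_type() != SRV_NONE) {
+		os_thread_sleep(100000);	// sleep 0.1 sec between polls
+	}
+*/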
+/**********************************************************************//**
+Check whether any background threads are active. If so, print which
+thread is active. Send the threads a wakeup signal.
+@return name of thread that is active or NULL */
+UNIV_INTERN
+const char*
+srv_any_background_threads_are_active(void)
+/*=======================================*/
+{
+	const char*	thread_active = NULL;
+
+	if (srv_read_only_mode) {
+		return(NULL);
+	} else if (srv_error_monitor_active) {
+		thread_active = "srv_error_monitor_thread";
+	} else if (lock_sys->timeout_thread_active) {
+		thread_active = "srv_lock_timeout thread";
+	} else if (srv_monitor_active) {
+		thread_active = "srv_monitor_thread";
+	} else if (srv_buf_dump_thread_active) {
+		thread_active = "buf_dump_thread";
+	} else if (srv_dict_stats_thread_active) {
+		thread_active = "dict_stats_thread";
+	}
+
+	os_event_set(srv_error_event);
+	os_event_set(srv_monitor_event);
+	os_event_set(srv_buf_dump_event);
+	os_event_set(lock_sys->timeout_event);
+	os_event_set(dict_stats_event);
+
+	return(thread_active);
+}
+
+/******************************************************************//**
+A thread which follows the redo log and outputs the changed page bitmap.
+@return a dummy value */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_redo_log_follow_thread)(
+/*=======================================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by
+						os_thread_create */
+{
+	ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Redo log follower thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_log_tracking_thread_key);
+#endif
+
+	my_thread_init();
+	srv_redo_log_thread_started = true;
+
+	do {
+		os_event_wait(srv_checkpoint_completed_event);
+		os_event_reset(srv_checkpoint_completed_event);
+
+#ifdef UNIV_DEBUG
+		if (!srv_track_changed_pages) {
+			continue;
+		}
+#endif
+
+		if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
+			if (!log_online_follow_redo_log()) {
+				/* TODO: sync with I_S log tracking status? */
+				ib_logf(IB_LOG_LEVEL_ERROR,
+					"log tracking bitmap write failed, "
+					"stopping log tracking thread!\n");
+				break;
+			}
+			os_event_set(srv_redo_log_tracked_event);
+		}
+
+	} while (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE);
+
+	srv_track_changed_pages = FALSE;
+	log_online_read_shutdown();
+	os_event_set(srv_redo_log_tracked_event);
+	srv_redo_log_thread_started = false; /* Defensive, not required */
+
+	my_thread_end();
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
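+/* The redo log follower above is the consumer side of a simple event
+handshake (a sketch; the producer side lives in the checkpoint code):
+
+	// producer, after a checkpoint completes:
+	os_event_set(srv_checkpoint_completed_event);
+
+	// consumers that need an up-to-date changed page bitmap:
+	os_event_wait(srv_redo_log_tracked_event);
+*/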
" + "Purge archived logs are not available\n", + srv_arch_dir); + /* failed to open directory */ + return(DB_ERROR); + } + } else { + /* log archive directory is not specified */ + return(DB_ERROR); + } + + dirnamelen = strlen(srv_arch_dir); + + memcpy(archived_log_filename, srv_arch_dir, dirnamelen); + if (dirnamelen && + archived_log_filename[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + archived_log_filename[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + memset(&fileinfo, 0, sizeof(fileinfo)); + while(!os_file_readdir_next_file(srv_arch_dir, dir, + &fileinfo) ) { + if (strncmp(fileinfo.name, + IB_ARCHIVED_LOGS_PREFIX, IB_ARCHIVED_LOGS_PREFIX_LEN)) { + continue; + } + if (dirnamelen + strlen(fileinfo.name) + 2 > OS_FILE_MAX_PATH) + continue; + + snprintf(archived_log_filename + dirnamelen, OS_FILE_MAX_PATH, + "%s", fileinfo.name); + + if (before_no) { + ib_uint64_t log_file_no = strtoull(fileinfo.name + + IB_ARCHIVED_LOGS_PREFIX_LEN, + NULL, 10); + if (log_file_no == 0 || before_no <= log_file_no) { + continue; + } + } else { + fileinfo.mtime = 0; + if (os_file_get_status(archived_log_filename, + &fileinfo, false) != DB_SUCCESS || + fileinfo.mtime == 0) { + continue; + } + + if (before_date == 0 || fileinfo.mtime > before_date) { + continue; + } + } + + /* We are going to delete archived file. Acquire log_sys->mutex + to make sure that we are the only who try to delete file. This + also prevents log system from using this file. Do not delete + file if it is currently in progress of writting or have + pending IO. This is enforced by checking: + 1. fil_space_contains_node. + 2. group->archived_offset % group->file_size != 0, i.e. + there is archive in progress and we are going to delete it. + This covers 3 cases: + a. Usual case when we have one archive in progress, + both 1 and 2 are TRUE + b. When we have more then 1 archive in fil_space, + this can happen when flushed LSN range crosses file + boundary + c. When we have empty fil_space, but existing file will be + opened once archiving operation is requested. This usually + happens on startup. + */ + + mutex_enter(&log_sys->mutex); + + log_archived_file_name_gen(namegen, sizeof(namegen), + group->id, group->archived_file_no); + + if (fil_space_contains_node(group->archive_space_id, + archived_log_filename) || + (group->archived_offset % group->file_size != 0 && + strcmp(namegen, archived_log_filename) == 0)) { + + mutex_exit(&log_sys->mutex); + continue; + } + + if (!os_file_delete_if_exists(innodb_file_data_key, + archived_log_filename)) { + + ib_logf(IB_LOG_LEVEL_WARN, + "can't delete archived log file %s.\n", + archived_log_filename); + + mutex_exit(&log_sys->mutex); + os_file_closedir(dir); + + return(DB_ERROR); + } + + mutex_exit(&log_sys->mutex); + } + + os_file_closedir(dir); + + return(DB_SUCCESS); +} + +/*******************************************************************//** +Tells the InnoDB server that there has been activity in the database +and wakes up the master thread if it is suspended (not sleeping). Used +in the MySQL interface. Note that there is a small chance that the master +thread stays suspended (we do not protect our operation with the +srv_sys_t->mutex, for performance reasons). 
+/*******************************************************************//**
+Tells the InnoDB server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the
+srv_sys_t->mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	ut_ad(!srv_sys_mutex_own());
+
+	srv_inc_activity_count();
+
+	if (srv_sys->n_threads_active[SRV_MASTER] == 0) {
+		srv_slot_t*	slot;
+
+		srv_sys_mutex_enter();
+
+		slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+
+		/* Only if the master thread has been started. */
+
+		if (slot->in_use) {
+			ut_a(srv_slot_get_type(slot) == SRV_MASTER);
+
+			if (slot->suspended) {
+
+				slot->suspended = FALSE;
+
+				++srv_sys->n_threads_active[SRV_MASTER];
+
+				os_event_set(slot->event);
+			}
+		}
+
+		srv_sys_mutex_exit();
+	}
+}
+
+/*******************************************************************//**
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping). Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our check with the srv_sys_t::mutex and the
+purge_sys->latch, for performance reasons). */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void)
+/*=====================================*/
+{
+	ut_ad(!srv_sys_mutex_own());
+
+	if (purge_sys->state == PURGE_STATE_RUN
+	    && srv_sys->n_threads_active[SRV_PURGE] == 0) {
+
+		srv_release_threads(SRV_PURGE, 1);
+	}
+}
+
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+	ut_ad(!srv_sys_mutex_own());
+
+	srv_inc_activity_count();
+
+	srv_release_threads(SRV_MASTER, 1);
+}
+
+/*******************************************************************//**
+Get the current server activity count. We don't hold srv_sys::mutex while
+reading this value, as it is only used in heuristics.
+@return activity count. */
+UNIV_INTERN
+ulint
+srv_get_activity_count(void)
+/*========================*/
+{
+	return(srv_sys->activity_count);
+}
+
+/*******************************************************************//**
+Check if there has been any activity.
+@return FALSE if no change in activity counter. */
+UNIV_INTERN
+ibool
+srv_check_activity(
+/*===============*/
+	ulint		old_activity_count)	/*!< in: old activity count */
+{
+	return(srv_sys->activity_count != old_activity_count);
+}
+
+/********************************************************************//**
+The master thread is tasked to ensure that a flush of the log file
+happens once every second in the background. This is to ensure that no
+more than one second of transactions is lost in case of a crash when
+innodb_flush_log_at_trx_commit != 1 */
+static
+void
+srv_sync_log_buffer_in_background(void)
+/*===================================*/
+{
+	time_t	current_time = time(NULL);
+
+	srv_main_thread_op_info = "flushing log";
+	if (difftime(current_time, srv_last_log_flush_time)
+	    >= srv_flush_log_at_timeout) {
+		log_buffer_sync_in_background(TRUE);
+		srv_last_log_flush_time = current_time;
+		srv_log_writes_and_flush++;
+	}
+}
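+/* An illustrative scenario: with innodb_flush_log_at_trx_commit = 2 the
+log is written to the OS at commit but fsync'ed only by the function
+above, so with srv_flush_log_at_timeout (the innodb_flush_log_at_timeout
+setting) at its default of 1 second, an OS crash or power failure loses
+at most about one second of committed transactions; a crash of only the
+mysqld process loses none, because the writes already reached the OS. */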
+/********************************************************************//**
+Make room in the table cache by evicting an unused table.
+@return number of tables evicted. */
+static
+ulint
+srv_master_evict_from_table_cache(
+/*==============================*/
+	ulint	pct_check)	/*!< in: max percent to check */
+{
+	ulint	n_tables_evicted = 0;
+
+	rw_lock_x_lock(&dict_operation_lock);
+
+	dict_mutex_enter_for_mysql();
+
+	n_tables_evicted = dict_make_room_in_cache(
+		innobase_get_table_cache_size(), pct_check);
+
+	dict_mutex_exit_for_mysql();
+
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	return(n_tables_evicted);
+}
+
+/*********************************************************************//**
+This function prints a progress message approximately every 60 seconds
+during server shutdown, for any activities that the master thread is
+still pending on. */
+static
+void
+srv_shutdown_print_master_pending(
+/*==============================*/
+	ib_time_t*	last_print_time,	/*!< last time the function
+						printed the message */
+	ulint		n_tables_to_drop,	/*!< number of tables to
+						be dropped */
+	ulint		n_bytes_merged)		/*!< number of change buffer
+						bytes just merged */
+{
+	ib_time_t	current_time;
+	double		time_elapsed;
+
+	current_time = ut_time();
+	time_elapsed = ut_difftime(current_time, *last_print_time);
+
+	if (time_elapsed > 60) {
+		*last_print_time = ut_time();
+
+		if (n_tables_to_drop) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Waiting for "
+				"%lu table(s) to be dropped\n",
+				(ulong) n_tables_to_drop);
+		}
+
+		/* Check change buffer merge; we only wait for a change
+		buffer merge if it is a slow shutdown */
+		if (!srv_fast_shutdown && n_bytes_merged) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Waiting for change "
+				"buffer merge to complete\n"
+				"  InnoDB: number of bytes of change buffer "
+				"just merged:  %lu\n",
+				n_bytes_merged);
+		}
+	}
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do when the
+server is active. There are two categories of tasks. The first consists
+of tasks that are performed at each invocation of this function.
+We assume that this function is called roughly every second when the
+server is active. The second consists of tasks that are performed at
+some interval, e.g. purge and dict_LRU cleanup. */
+static
+void
+srv_master_do_active_tasks(void)
+/*============================*/
+{
+	ib_time_t	cur_time = ut_time();
+	ullint		counter_time = ut_time_us(NULL);
+
+	/* First do the tasks that we are supposed to do at each
+	invocation of this function. */
+
+	++srv_main_active_loops;
+
+	MONITOR_INC(MONITOR_MASTER_ACTIVE_LOOPS);
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there are no longer any SELECT
+	queries on them. */
+	srv_main_thread_op_info = "doing background drop tables";
+	row_drop_tables_for_mysql_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* make sure that there is enough reusable space in the redo
+	log files */
+	srv_main_thread_op_info = "checking free log space";
+	log_free_check();
+
+	/* Do an ibuf merge */
+	srv_main_thread_op_info = "doing insert buffer merge";
+	counter_time = ut_time_us(NULL);
+	ibuf_contract_in_background(0, FALSE);
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
+
+	/* Flush logs if needed */
+	srv_main_thread_op_info = "flushing log";
+	srv_sync_log_buffer_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+	/* Now see if various tasks that are performed at defined
+	intervals need to be performed. */
+
+#ifdef MEM_PERIODIC_CHECK
+	/* Check magic numbers of every allocated mem block once in
+	SRV_MASTER_MEM_VALIDATE_INTERVAL seconds */
+	if (cur_time % SRV_MASTER_MEM_VALIDATE_INTERVAL == 0) {
+		mem_validate_all_blocks();
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_MEM_VALIDATE_MICROSECOND, counter_time);
+	}
+#endif
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	if (cur_time % SRV_MASTER_DICT_LRU_INTERVAL == 0) {
+		srv_main_thread_op_info = "enforcing dict cache limit";
+		srv_master_evict_from_table_cache(50);
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+	}
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* Make a new checkpoint */
+	if (cur_time % SRV_MASTER_CHECKPOINT_INTERVAL == 0) {
+		srv_main_thread_op_info = "making checkpoint";
+		log_checkpoint(TRUE, FALSE);
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_CHECKPOINT_MICROSECOND, counter_time);
+	}
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do whenever the
+server is idle. We do check the server state during this function, and
+if the server has entered the shutdown phase we may return from
+the function without completing the required tasks.
+Note that the server can move to the active state while we are executing
+this function, but we don't check for that as we are supposed to perform
+more or less the same tasks when the server is active. */
+static
+void
+srv_master_do_idle_tasks(void)
+/*==========================*/
+{
+	ullint	counter_time;
+
+	++srv_main_idle_loops;
+
+	MONITOR_INC(MONITOR_MASTER_IDLE_LOOPS);
+
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there are no longer any SELECT
+	queries on them. */
*/ + counter_time = ut_time_us(NULL); + srv_main_thread_op_info = "doing background drop tables"; + row_drop_tables_for_mysql_in_background(); + MONITOR_INC_TIME_IN_MICRO_SECS( + MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, + counter_time); + + if (srv_shutdown_state > 0) { + return; + } + + /* make sure that there is enough reusable space in the redo + log files */ + srv_main_thread_op_info = "checking free log space"; + log_free_check(); + + /* Do an ibuf merge */ + counter_time = ut_time_us(NULL); + srv_main_thread_op_info = "doing insert buffer merge"; + ibuf_contract_in_background(0, TRUE); + MONITOR_INC_TIME_IN_MICRO_SECS( + MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time); + + if (srv_shutdown_state > 0) { + return; + } + + srv_main_thread_op_info = "enforcing dict cache limit"; + srv_master_evict_from_table_cache(100); + MONITOR_INC_TIME_IN_MICRO_SECS( + MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time); + + /* Flush logs if needed */ + srv_sync_log_buffer_in_background(); + MONITOR_INC_TIME_IN_MICRO_SECS( + MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time); + + if (srv_shutdown_state > 0) { + return; + } + + /* Make a new checkpoint */ + srv_main_thread_op_info = "making checkpoint"; + log_checkpoint(TRUE, FALSE); + MONITOR_INC_TIME_IN_MICRO_SECS(MONITOR_SRV_CHECKPOINT_MICROSECOND, + counter_time); + + if (srv_shutdown_state > 0) { + return; + } + + if (srv_log_arch_expire_sec) { + srv_main_thread_op_info = "purging archived logs"; + purge_archived_logs(ut_time() - srv_log_arch_expire_sec, + 0); + } +} + +/*********************************************************************//** +Perform the tasks during shutdown. The tasks that we do at shutdown +depend on srv_fast_shutdown: +2 => very fast shutdown => do no book keeping +1 => normal shutdown => clear drop table queue and make checkpoint +0 => slow shutdown => in addition to above do complete purge and ibuf +merge +@return TRUE if some work was done. FALSE otherwise */ +static +ibool +srv_master_do_shutdown_tasks( +/*=========================*/ + ib_time_t* last_print_time)/*!< last time the function + print the message */ +{ + ulint n_bytes_merged = 0; + ulint n_tables_to_drop = 0; + + ut_ad(!srv_read_only_mode); + + ++srv_main_shutdown_loops; + + ut_a(srv_shutdown_state > 0); + + /* In very fast shutdown none of the following is necessary */ + if (srv_fast_shutdown == 2) { + return(FALSE); + } + + /* ALTER TABLE in MySQL requires on Unix that the table handler + can drop tables lazily after there no longer are SELECT + queries to them. 
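+
+	Illustrative summary of the srv_fast_shutdown policy documented
+	above; drop_tables(), ibuf_merge() and checkpoint() are
+	hypothetical shorthands for the calls made in this function:
+
+	    switch (srv_fast_shutdown) {
+	    case 2:                    // very fast: no bookkeeping at all
+	        return(FALSE);
+	    case 1:                    // normal: drop queue + checkpoint
+	        drop_tables();
+	        break;
+	    case 0:                    // slow: additionally merge the
+	        drop_tables();         // change buffer (full purge is
+	        ibuf_merge();          // handled by the purge threads)
+	        break;
+	    }
+	    checkpoint();
+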
*/ + srv_main_thread_op_info = "doing background drop tables"; + n_tables_to_drop = row_drop_tables_for_mysql_in_background(); + + /* make sure that there is enough reusable space in the redo + log files */ + srv_main_thread_op_info = "checking free log space"; + log_free_check(); + + /* In case of normal shutdown we don't do ibuf merge or purge */ + if (srv_fast_shutdown == 1) { + goto func_exit; + } + + /* Do an ibuf merge */ + srv_main_thread_op_info = "doing insert buffer merge"; + n_bytes_merged = ibuf_contract_in_background(0, TRUE); + + /* Flush logs if needed */ + srv_sync_log_buffer_in_background(); + +func_exit: + /* Make a new checkpoint about once in 10 seconds */ + srv_main_thread_op_info = "making checkpoint"; + log_checkpoint(TRUE, FALSE); + + /* Print progress message every 60 seconds during shutdown */ + if (srv_shutdown_state > 0 && srv_print_verbose_log) { + srv_shutdown_print_master_pending( + last_print_time, n_tables_to_drop, n_bytes_merged); + } + + return(n_bytes_merged || n_tables_to_drop); +} + +/*********************************************************************//** +Puts master thread to sleep. At this point we are using polling to +service various activities. Master thread sleeps for one second before +checking the state of the server again */ +static +void +srv_master_sleep(void) +/*==================*/ +{ + srv_main_thread_op_info = "sleeping"; + os_thread_sleep(1000000); + srv_main_thread_op_info = ""; +} + +/*********************************************************************//** +The master thread controlling the server. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_master_thread)( +/*==============================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + srv_slot_t* slot; + ulint old_activity_count = srv_get_activity_count(); + ib_time_t last_print_time; + + ut_ad(!srv_read_only_mode); + + srv_master_tid = os_thread_get_tid(); + + os_thread_set_priority(srv_master_tid, srv_sched_priority_master); + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Master thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(srv_master_thread_key); +#endif /* UNIV_PFS_THREAD */ + + srv_main_thread_process_no = os_proc_get_number(); + srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); + + slot = srv_reserve_slot(SRV_MASTER); + ut_a(slot == srv_sys->sys_threads); + + last_print_time = ut_time(); +loop: + if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) { + goto suspend_thread; + } + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + + srv_master_sleep(); + + MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP); + + srv_current_thread_priority = srv_master_thread_priority; + + if (srv_check_activity(old_activity_count)) { + old_activity_count = srv_get_activity_count(); + srv_master_do_active_tasks(); + } else { + srv_master_do_idle_tasks(); + } + } + + while (srv_master_do_shutdown_tasks(&last_print_time)) { + + /* Shouldn't loop here in case of very fast shutdown */ + ut_ad(srv_fast_shutdown < 2); + } + +suspend_thread: + srv_main_thread_op_info = "suspending"; + + srv_suspend_thread(slot); + + /* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql() + waits for database activity to die down when converting < 4.1.x + databases, and relies on this string being exactly as it is. InnoDB + manual also mentions this string in several places. 
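+
+	(Illustrative aside: the thread parks on slot->event just below;
+	it is resumed when another thread sets that event, for example on
+	new server activity or at shutdown, and then re-enters the loop
+	above via the goto.)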
*/ + srv_main_thread_op_info = "waiting for server activity"; + + os_event_wait(slot->event); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_thread_exit(NULL); + } + + goto loop; + + OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */ +} + +/*********************************************************************//** +Check if purge should stop. +@return true if it should shutdown. */ +static +bool +srv_purge_should_exit( +/*==============*/ + ulint n_purged) /*!< in: pages purged in last batch */ +{ + switch (srv_shutdown_state) { + case SRV_SHUTDOWN_NONE: + /* Normal operation. */ + break; + + case SRV_SHUTDOWN_CLEANUP: + case SRV_SHUTDOWN_EXIT_THREADS: + /* Exit unless slow shutdown requested or all done. */ + return(srv_fast_shutdown != 0 || n_purged == 0); + + case SRV_SHUTDOWN_LAST_PHASE: + case SRV_SHUTDOWN_FLUSH_PHASE: + ut_error; + } + + return(false); +} + +/*********************************************************************//** +Fetch and execute a task from the work queue. +@return true if a task was executed */ +static +bool +srv_task_execute(void) +/*==================*/ +{ + que_thr_t* thr = NULL; + + ut_ad(!srv_read_only_mode); + ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); + + mutex_enter(&srv_sys->tasks_mutex); + + if (UT_LIST_GET_LEN(srv_sys->tasks) > 0) { + + thr = UT_LIST_GET_FIRST(srv_sys->tasks); + + ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE); + + UT_LIST_REMOVE(queue, srv_sys->tasks, thr); + } + + mutex_exit(&srv_sys->tasks_mutex); + + if (thr != NULL) { + + que_run_threads(thr); + + os_atomic_inc_ulint( + &purge_sys->bh_mutex, &purge_sys->n_completed, 1); + + srv_inc_activity_count(); + } + + return(thr != NULL); +} + +static ulint purge_tid_i = 0; + +/*********************************************************************//** +Worker thread that reads tasks from the work queue and executes them. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_worker_thread)( +/*==============================*/ + void* arg __attribute__((unused))) /*!< in: a dummy parameter + required by os_thread_create */ +{ + srv_slot_t* slot; + ulint tid_i = os_atomic_increment_ulint(&purge_tid_i, 1); + + ut_ad(tid_i < srv_n_purge_threads); + ut_ad(!srv_read_only_mode); + ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); + + srv_purge_tids[tid_i] = os_thread_get_tid(); + os_thread_set_priority(srv_purge_tids[tid_i], + srv_sched_priority_purge); + +#ifdef UNIV_DEBUG_THREAD_CREATION + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: worker thread starting, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + slot = srv_reserve_slot(SRV_WORKER); + + ut_a(srv_n_purge_threads > 1); + + srv_sys_mutex_enter(); + + ut_a(srv_sys->n_threads_active[SRV_WORKER] < srv_n_purge_threads); + + srv_sys_mutex_exit(); + + /* We need to ensure that the worker threads exit after the + purge coordinator thread. Otherwise the purge coordinaor can + end up waiting forever in trx_purge_wait_for_workers_to_complete() */ + + do { + srv_suspend_thread(slot); + + os_event_wait(slot->event); + + srv_current_thread_priority = srv_purge_thread_priority; + + if (srv_task_execute()) { + + /* If there are tasks in the queue, wakeup + the purge coordinator thread. */ + + srv_wake_purge_thread_if_not_active(); + } + + /* Note: we are checking the state without holding the + purge_sys->latch here. 
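+	A stale read appears to be benign here: PURGE_STATE_EXIT is set
+	under the latch before the worker threads are released at
+	shutdown, so at worst a worker performs one extra suspend/wake
+	round before it observes the exit state.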
*/ + } while (purge_sys->state != PURGE_STATE_EXIT); + + srv_free_slot(slot); + + rw_lock_x_lock(&purge_sys->latch); + + ut_a(!purge_sys->running); + ut_a(purge_sys->state == PURGE_STATE_EXIT); + ut_a(srv_shutdown_state > SRV_SHUTDOWN_NONE); + + rw_lock_x_unlock(&purge_sys->latch); + +#ifdef UNIV_DEBUG_THREAD_CREATION + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Purge worker thread exiting, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */ +} + +/*********************************************************************//** +Do the actual purge operation. +@return length of history list before the last purge batch. */ +static +ulint +srv_do_purge( +/*=========*/ + ulint n_threads, /*!< in: number of threads to use */ + ulint* n_total_purged) /*!< in/out: total pages purged */ +{ + ulint n_pages_purged; + + static ulint count = 0; + static ulint n_use_threads = 0; + static ulint rseg_history_len = 0; + ulint old_activity_count = srv_get_activity_count(); + + ut_a(n_threads > 0); + ut_ad(!srv_read_only_mode); + + /* Purge until there are no more records to purge and there is + no change in configuration or server state. If the user has + configured more than one purge thread then we treat that as a + pool of threads and only use the extra threads if purge can't + keep up with updates. */ + + if (n_use_threads == 0) { + n_use_threads = n_threads; + } + + do { + srv_current_thread_priority = srv_purge_thread_priority; + + if (trx_sys->rseg_history_len > rseg_history_len + || (srv_max_purge_lag > 0 + && rseg_history_len > srv_max_purge_lag)) { + + /* History length is now longer than what it was + when we took the last snapshot. Use more threads. */ + + if (n_use_threads < n_threads) { + ++n_use_threads; + } + + } else if (srv_check_activity(old_activity_count) + && n_use_threads > 1) { + + /* History length same or smaller since last snapshot, + use fewer threads. */ + + --n_use_threads; + + old_activity_count = srv_get_activity_count(); + } + + /* Ensure that the purge threads are less than what + was configured. */ + + ut_a(n_use_threads > 0); + ut_a(n_use_threads <= n_threads); + + /* Take a snapshot of the history list before purge. */ + if ((rseg_history_len = trx_sys->rseg_history_len) == 0) { + break; + } + + n_pages_purged = trx_purge( + n_use_threads, srv_purge_batch_size, false); + + if (!(count++ % TRX_SYS_N_RSEGS)) { + /* Force a truncate of the history list. */ + n_pages_purged += trx_purge( + 1, srv_purge_batch_size, true); + } + + *n_total_purged += n_pages_purged; + + } while (!srv_purge_should_exit(n_pages_purged) && n_pages_purged > 0); + + return(rseg_history_len); +} + +/*********************************************************************//** +Suspend the purge coordinator thread. */ +static +void +srv_purge_coordinator_suspend( +/*==========================*/ + srv_slot_t* slot, /*!< in/out: Purge coordinator + thread slot */ + ulint rseg_history_len) /*!< in: history list length + before last purge */ +{ + ut_ad(!srv_read_only_mode); + ut_a(slot->type == SRV_PURGE); + + bool stop = false; + + /** Maximum wait time on the purge event, in micro-seconds. 
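+	That is, the SRV_PURGE_MAX_TIMEOUT defined below is
+	10000 us = 10 ms. An illustrative condensation of the timed-wait
+	pattern used in this function:
+
+	    sig_count = srv_suspend_thread(slot);
+	    ret = os_event_wait_time_low(slot->event,
+	                                 SRV_PURGE_MAX_TIMEOUT, sig_count);
+	    if (ret == OS_SYNC_TIME_EXCEEDED) {
+	        // no wakeup arrived; decide whether to keep waiting
+	    }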
*/ + static const ulint SRV_PURGE_MAX_TIMEOUT = 10000; + + ib_int64_t sig_count = srv_suspend_thread(slot); + + do { + ulint ret; + + rw_lock_x_lock(&purge_sys->latch); + + purge_sys->running = false; + + rw_lock_x_unlock(&purge_sys->latch); + + /* We don't wait right away on the the non-timed wait because + we want to signal the thread that wants to suspend purge. */ + + if (stop) { + os_event_wait_low(slot->event, sig_count); + ret = 0; + } else if (rseg_history_len <= trx_sys->rseg_history_len) { + ret = os_event_wait_time_low( + slot->event, SRV_PURGE_MAX_TIMEOUT, sig_count); + } else { + /* We don't want to waste time waiting, if the + history list increased by the time we got here, + unless purge has been stopped. */ + ret = 0; + } + + srv_sys_mutex_enter(); + + /* The thread can be in state !suspended after the timeout + but before this check if another thread sent a wakeup signal. */ + + if (slot->suspended) { + slot->suspended = FALSE; + ++srv_sys->n_threads_active[slot->type]; + ut_a(srv_sys->n_threads_active[slot->type] == 1); + } + + srv_sys_mutex_exit(); + + sig_count = srv_suspend_thread(slot); + + rw_lock_x_lock(&purge_sys->latch); + + stop = (srv_shutdown_state == SRV_SHUTDOWN_NONE + && purge_sys->state == PURGE_STATE_STOP); + + if (!stop) { + ut_a(purge_sys->n_stop == 0); + purge_sys->running = true; + } else { + ut_a(purge_sys->n_stop > 0); + + /* Signal that we are suspended. */ + os_event_set(purge_sys->event); + } + + rw_lock_x_unlock(&purge_sys->latch); + + if (ret == OS_SYNC_TIME_EXCEEDED) { + + /* No new records added since wait started then simply + wait for new records. The magic number 5000 is an + approximation for the case where we have cached UNDO + log records which prevent truncate of the UNDO + segments. */ + + if (rseg_history_len == trx_sys->rseg_history_len + && trx_sys->rseg_history_len < 5000) { + + stop = true; + } + } + + } while (stop); + + srv_sys_mutex_enter(); + + if (slot->suspended) { + slot->suspended = FALSE; + ++srv_sys->n_threads_active[slot->type]; + ut_a(srv_sys->n_threads_active[slot->type] == 1); + } + + srv_sys_mutex_exit(); +} + +/*********************************************************************//** +Purge coordinator thread that schedules the purge tasks. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_purge_coordinator_thread)( +/*=========================================*/ + void* arg __attribute__((unused))) /*!< in: a dummy parameter + required by os_thread_create */ +{ + srv_slot_t* slot; + ulint n_total_purged = ULINT_UNDEFINED; + + ut_ad(!srv_read_only_mode); + ut_a(srv_n_purge_threads >= 1); + ut_a(trx_purge_state() == PURGE_STATE_INIT); + ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); + + srv_purge_tids[0] = os_thread_get_tid(); + os_thread_set_priority(srv_purge_tids[0], srv_sched_priority_purge); + + rw_lock_x_lock(&purge_sys->latch); + + purge_sys->running = true; + purge_sys->state = PURGE_STATE_RUN; + + rw_lock_x_unlock(&purge_sys->latch); + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(srv_purge_thread_key); +#endif /* UNIV_PFS_THREAD */ + +#ifdef UNIV_DEBUG_THREAD_CREATION + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Purge coordinator thread created, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + slot = srv_reserve_slot(SRV_PURGE); + + ulint rseg_history_len = trx_sys->rseg_history_len; + + do { + /* If there are no records to purge or the last + purge didn't purge any records then wait for activity. 
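+
+	In short (an illustrative condensation of the test below): the
+	coordinator suspends itself whenever
+
+	    purge_sys->state == PURGE_STATE_STOP || n_total_purged == 0
+
+	holds outside of shutdown, i.e. when purge was explicitly stopped
+	or the previous pass made no progress.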
*/ + + if (srv_shutdown_state == SRV_SHUTDOWN_NONE + && (purge_sys->state == PURGE_STATE_STOP + || n_total_purged == 0)) { + + srv_purge_coordinator_suspend(slot, rseg_history_len); + } + + if (srv_purge_should_exit(n_total_purged)) { + ut_a(!slot->suspended); + break; + } + + n_total_purged = 0; + + srv_current_thread_priority = srv_purge_thread_priority; + + rseg_history_len = srv_do_purge( + srv_n_purge_threads, &n_total_purged); + + srv_inc_activity_count(); + + } while (!srv_purge_should_exit(n_total_purged)); + + /* Ensure that we don't jump out of the loop unless the + exit condition is satisfied. */ + + ut_a(srv_purge_should_exit(n_total_purged)); + + ulint n_pages_purged = ULINT_MAX; + + /* Ensure that all records are purged if it is not a fast shutdown. + This covers the case where a record can be added after we exit the + loop above. */ + while (srv_fast_shutdown == 0 && n_pages_purged > 0) { + n_pages_purged = trx_purge(1, srv_purge_batch_size, false); + } + + /* Force a truncate of the history list. */ + n_pages_purged = trx_purge(1, srv_purge_batch_size, true); + ut_a(n_pages_purged == 0 || srv_fast_shutdown != 0); + + /* The task queue should always be empty, independent of fast + shutdown state. */ + ut_a(srv_get_task_queue_length() == 0); + + srv_free_slot(slot); + + /* Note that we are shutting down. */ + rw_lock_x_lock(&purge_sys->latch); + + purge_sys->state = PURGE_STATE_EXIT; + + purge_sys->running = false; + + rw_lock_x_unlock(&purge_sys->latch); + +#ifdef UNIV_DEBUG_THREAD_CREATION + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Purge coordinator exiting, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif /* UNIV_DEBUG_THREAD_CREATION */ + + /* Ensure that all the worker threads quit. */ + if (srv_n_purge_threads > 1) { + srv_release_threads(SRV_WORKER, srv_n_purge_threads - 1); + } + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */ +} + +/**********************************************************************//** +Enqueues a task to server task queue and releases a worker thread, if there +is a suspended one. */ +UNIV_INTERN +void +srv_que_task_enqueue_low( +/*=====================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(!srv_read_only_mode); + mutex_enter(&srv_sys->tasks_mutex); + + UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr); + + mutex_exit(&srv_sys->tasks_mutex); + + srv_release_threads(SRV_WORKER, 1); +} + +/**********************************************************************//** +Get count of tasks in the queue. +@return number of tasks in queue */ +UNIV_INTERN +ulint +srv_get_task_queue_length(void) +/*===========================*/ +{ + ulint n_tasks; + + ut_ad(!srv_read_only_mode); + + mutex_enter(&srv_sys->tasks_mutex); + + n_tasks = UT_LIST_GET_LEN(srv_sys->tasks); + + mutex_exit(&srv_sys->tasks_mutex); + + return(n_tasks); +} + +/**********************************************************************//** +Wakeup the purge threads. 
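+An illustrative recap of the wakeup order used below: the coordinator
+is released first, then the remaining worker slots:
+
+    srv_release_threads(SRV_PURGE, 1);
+    srv_release_threads(SRV_WORKER, srv_n_purge_threads - 1);
+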
*/ +UNIV_INTERN +void +srv_purge_wakeup(void) +/*==================*/ +{ + ut_ad(!srv_read_only_mode); + + if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { + + srv_release_threads(SRV_PURGE, 1); + + if (srv_n_purge_threads > 1) { + ulint n_workers = srv_n_purge_threads - 1; + + srv_release_threads(SRV_WORKER, n_workers); + } + } +} + diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc new file mode 100644 index 00000000000..4e0e03a6491 --- /dev/null +++ b/storage/xtradb/srv/srv0start.cc @@ -0,0 +1,3284 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file srv/srv0start.cc +Starts the InnoDB database server + +Created 2/16/1996 Heikki Tuuri +*************************************************************************/ + +#include "mysqld.h" +#include "pars0pars.h" +#include "row0ftsort.h" +#include "ut0mem.h" +#include "mem0mem.h" +#include "data0data.h" +#include "data0type.h" +#include "dict0dict.h" +#include "buf0buf.h" +#include "buf0dump.h" +#include "os0file.h" +#include "os0thread.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#include "rem0rec.h" +#include "mtr0mtr.h" +#include "log0log.h" +#include "log0online.h" +#include "log0recv.h" +#include "page0page.h" +#include "page0cur.h" +#include "trx0trx.h" +#include "trx0sys.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "rem0rec.h" +#include "ibuf0ibuf.h" +#include "srv0start.h" +#include "srv0srv.h" +#ifndef UNIV_HOTBACKUP +# include "trx0rseg.h" +# include "os0proc.h" +# include "sync0sync.h" +# include "buf0flu.h" +# include "buf0rea.h" +# include "dict0boot.h" +# include "dict0load.h" +# include "dict0stats_bg.h" +# include "que0que.h" +# include "usr0sess.h" +# include "lock0lock.h" +# include "trx0roll.h" +# include "trx0purge.h" +# include "lock0lock.h" +# include "pars0pars.h" +# include "btr0sea.h" +# include "rem0cmp.h" +# include "dict0crea.h" +# include "row0ins.h" +# include "row0sel.h" +# include "row0upd.h" +# include "row0row.h" +# include "row0mysql.h" +# include "btr0pcur.h" +# include "os0sync.h" +# include "zlib.h" +# include "ut0crc32.h" + +/** Log sequence number immediately after startup */ +UNIV_INTERN lsn_t srv_start_lsn; +/** Log sequence number at shutdown */ +UNIV_INTERN lsn_t srv_shutdown_lsn; + +#ifdef HAVE_DARWIN_THREADS +# include <sys/utsname.h> +/** TRUE if the F_FULLFSYNC option is available */ +UNIV_INTERN ibool srv_have_fullfsync = FALSE; +#endif + +/** TRUE if a raw partition is in use */ +UNIV_INTERN ibool srv_start_raw_disk_in_use = FALSE; + +/** TRUE if the server is being started, before rolling back any +incomplete transactions */ +UNIV_INTERN ibool srv_startup_is_before_trx_rollback_phase = FALSE; +/** TRUE if the server is being started */ +UNIV_INTERN ibool srv_is_being_started = FALSE; +/** TRUE if the server was successfully started */ +UNIV_INTERN ibool srv_was_started = FALSE; +/** TRUE if innobase_start_or_create_for_mysql() has been called */ +static ibool srv_start_has_been_called = FALSE; + +/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to +SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ +UNIV_INTERN enum srv_shutdown_state srv_shutdown_state = SRV_SHUTDOWN_NONE; + +/** Files comprising the system tablespace */ +static os_file_t files[1000]; + +/** io_handler_thread parameters for thread identification */ +static ulint n[SRV_MAX_N_IO_THREADS]; +/** io_handler_thread identifiers, 32 is the maximum number of purge threads. 
+The extra elements at the end are allocated as follows: +SRV_MAX_N_IO_THREADS + 1: srv_master_thread +SRV_MAX_N_IO_THREADS + 2: lock_wait_timeout_thread +SRV_MAX_N_IO_THREADS + 3: srv_error_monitor_thread +SRV_MAX_N_IO_THREADS + 4: srv_monitor_thread +SRV_MAX_N_IO_THREADS + 5: srv_redo_log_follow_thread +SRV_MAX_N_IO_THREADS + 6: srv_purge_coordinator_thread +SRV_MAX_N_IO_THREADS + 7: srv_worker_thread +... +SRV_MAX_N_IO_THREADS + 7 + srv_n_purge_threads - 1: srv_worker_thread */ +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7 + + SRV_MAX_N_PURGE_THREADS]; + +/** We use this mutex to test the return value of pthread_mutex_trylock + on successful locking. HP-UX does NOT return 0, though Linux et al do. */ +static os_fast_mutex_t srv_os_test_mutex; + +/** Name of srv_monitor_file */ +static char* srv_monitor_file_name; +#endif /* !UNIV_HOTBACKUP */ + +/** Default undo tablespace size in UNIV_PAGEs count (10MB). */ +static const ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES = + ((1024 * 1024) * 10) / UNIV_PAGE_SIZE_DEF; + +/** */ +#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD +#define SRV_MAX_N_PENDING_SYNC_IOS 100 + +#ifdef UNIV_PFS_THREAD +/* Keys to register InnoDB threads with performance schema */ +UNIV_INTERN mysql_pfs_key_t io_handler_thread_key; +UNIV_INTERN mysql_pfs_key_t srv_lock_timeout_thread_key; +UNIV_INTERN mysql_pfs_key_t srv_error_monitor_thread_key; +UNIV_INTERN mysql_pfs_key_t srv_monitor_thread_key; +UNIV_INTERN mysql_pfs_key_t srv_master_thread_key; +UNIV_INTERN mysql_pfs_key_t srv_purge_thread_key; +UNIV_INTERN mysql_pfs_key_t srv_log_tracking_thread_key; +#endif /* UNIV_PFS_THREAD */ + +/*********************************************************************//** +Convert a numeric string that optionally ends in G or M or K, to a number +containing megabytes. +@return next character in string */ +static +char* +srv_parse_megabytes( +/*================*/ + char* str, /*!< in: string containing a quantity in bytes */ + ulint* megs) /*!< out: the number in megabytes */ +{ + char* endp; + ulint size; + + size = strtoul(str, &endp, 10); + + str = endp; + + switch (*str) { + case 'G': case 'g': + size *= 1024; + /* fall through */ + case 'M': case 'm': + str++; + break; + case 'K': case 'k': + size /= 1024; + str++; + break; + default: + size /= 1024 * 1024; + break; + } + + *megs = size; + return(str); +} + +/*********************************************************************//** +Check if a file can be opened in read-write mode. +@return true if it doesn't exist or can be opened in rw mode. */ +static +bool +srv_file_check_mode( +/*================*/ + const char* name) /*!< in: filename to check */ +{ + os_file_stat_t stat; + + memset(&stat, 0x0, sizeof(stat)); + + dberr_t err = os_file_get_status(name, &stat, true); + + if (err == DB_FAIL) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "os_file_get_status() failed on '%s'. Can't determine " + "file permissions", name); + + return(false); + + } else if (err == DB_SUCCESS) { + + /* Note: stat.rw_perm is only valid of files */ + + if (stat.type == OS_FILE_TYPE_FILE + || stat.type == OS_FILE_TYPE_BLOCK) { + if (!stat.rw_perm) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "%s can't be opened in %s mode", + name, + srv_read_only_mode + ? "read" : "read-write"); + + return(false); + } + } else { + /* Not a regular file, bail out. */ + + ib_logf(IB_LOG_LEVEL_ERROR, + "'%s' not a regular file.", name); + + return(false); + } + } else { + + /* This is OK. If the file create fails on RO media, there + is nothing we can do. 
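+
+		An illustrative recap of the srv_file_check_mode()
+		outcomes as coded above:
+
+		    DB_NOT_FOUND                   -> true  (may be created)
+		    DB_SUCCESS, writable file      -> true
+		    DB_SUCCESS, non-writable file  -> false (error logged)
+		    DB_SUCCESS, not a regular file -> false (error logged)
+		    DB_FAIL                        -> false (perms unknown)
+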
*/ + + ut_a(err == DB_NOT_FOUND); + } + + return(true); +} + +/*********************************************************************//** +Reads the data files and their sizes from a character string given in +the .cnf file. +@return TRUE if ok, FALSE on parse error */ +UNIV_INTERN +ibool +srv_parse_data_file_paths_and_sizes( +/*================================*/ + char* str) /*!< in/out: the data file path string */ +{ + char* input_str; + char* path; + ulint size; + ulint i = 0; + + srv_auto_extend_last_data_file = FALSE; + srv_last_file_size_max = 0; + srv_data_file_names = NULL; + srv_data_file_sizes = NULL; + srv_data_file_is_raw_partition = NULL; + + input_str = str; + + /* First calculate the number of data files and check syntax: + path:size[M | G];path:size[M | G]... . Note that a Windows path may + contain a drive name and a ':'. */ + + while (*str != '\0') { + path = str; + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == '\0') { + return(FALSE); + } + + str++; + + str = srv_parse_megabytes(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = srv_parse_megabytes(str, &size); + } + + if (*str != '\0') { + + return(FALSE); + } + } + + if (strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + str += 3; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + str += 3; + } + + if (size == 0) { + return(FALSE); + } + + i++; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + + return(FALSE); + } + } + + if (i == 0) { + /* If innodb_data_file_path was defined it must contain + at least one data file definition */ + + return(FALSE); + } + + srv_data_file_names = static_cast<char**>( + malloc(i * sizeof *srv_data_file_names)); + + srv_data_file_sizes = static_cast<ulint*>( + malloc(i * sizeof *srv_data_file_sizes)); + + srv_data_file_is_raw_partition = static_cast<ulint*>( + malloc(i * sizeof *srv_data_file_is_raw_partition)); + + srv_n_data_files = i; + + /* Then store the actual values to our arrays */ + + str = input_str; + i = 0; + + while (*str != '\0') { + path = str; + + /* Note that we must step over the ':' in a Windows path; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == ':') { + /* Make path a null-terminated string */ + *str = '\0'; + str++; + } + + str = srv_parse_megabytes(str, &size); + + srv_data_file_names[i] = path; + srv_data_file_sizes[i] = size; + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + srv_auto_extend_last_data_file = TRUE; + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = srv_parse_megabytes( + str, &srv_last_file_size_max); + } + + if (*str != '\0') { + + return(FALSE); + } + } + + (srv_data_file_is_raw_partition)[i] = 0; + + if (strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + str += 3; + (srv_data_file_is_raw_partition)[i] = SRV_NEW_RAW; + } + + if (*str == 'r' && *(str + 1) == 
'a' && *(str + 2) == 'w') { + str += 3; + + if ((srv_data_file_is_raw_partition)[i] == 0) { + (srv_data_file_is_raw_partition)[i] = SRV_OLD_RAW; + } + } + + i++; + + if (*str == ';') { + str++; + } + } + + return(TRUE); +} + +/*********************************************************************//** +Frees the memory allocated by srv_parse_data_file_paths_and_sizes() +and srv_parse_log_group_home_dirs(). */ +UNIV_INTERN +void +srv_free_paths_and_sizes(void) +/*==========================*/ +{ + free(srv_data_file_names); + srv_data_file_names = NULL; + free(srv_data_file_sizes); + srv_data_file_sizes = NULL; + free(srv_data_file_is_raw_partition); + srv_data_file_is_raw_partition = NULL; +} + +#ifndef UNIV_HOTBACKUP + +static ulint io_tid_i = 0; + +/********************************************************************//** +I/o-handler thread function. +@return OS_THREAD_DUMMY_RETURN */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(io_handler_thread)( +/*==============================*/ + void* arg) /*!< in: pointer to the number of the segment in + the aio array */ +{ + ulint segment; + ulint tid_i = os_atomic_increment_ulint(&io_tid_i, 1) - 1; + + ut_ad(tid_i < srv_n_file_io_threads); + + segment = *((ulint*) arg); + + srv_io_tids[tid_i] = os_thread_get_tid(); + os_thread_set_priority(srv_io_tids[tid_i], srv_sched_priority_io); + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment, + os_thread_pf(os_thread_get_curr_id())); +#endif + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(io_handler_thread_key); +#endif /* UNIV_PFS_THREAD */ + + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + srv_current_thread_priority = srv_io_thread_priority; + fil_aio_wait(segment); + } + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. + The thread actually never comes here because it is exited in an + os_event_wait(). */ + + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Normalizes a directory path for Windows: converts slashes to backslashes. */ +UNIV_INTERN +void +srv_normalize_path_for_win( +/*=======================*/ + char* str __attribute__((unused))) /*!< in/out: null-terminated + character string */ +{ +#ifdef __WIN__ + for (; *str; str++) { + + if (*str == '/') { + *str = '\\'; + } + } +#endif +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Creates a log file. 
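+(Aside, unrelated to this helper: a worked example for the
+innodb_data_file_path grammar handled by the two parsing functions
+above; illustrative only, and sizes are in megabytes at parse time:
+
+    ibdata1:1G;ibdata2:512M:autoextend:max:2G
+      -> srv_data_file_names = { "ibdata1", "ibdata2" }
+      -> srv_data_file_sizes = { 1024, 512 }
+      -> last file auto-extends, capped at srv_last_file_size_max = 2048)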
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +create_log_file( +/*============*/ + os_file_t* file, /*!< out: file handle */ + const char* name) /*!< in: log file name */ +{ + ibool ret; + + *file = os_file_create( + innodb_file_log_key, name, + OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, + OS_LOG_FILE, &ret); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name); + return(DB_ERROR); + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Setting log file %s size to %lu MB", + name, (ulong) srv_log_file_size + >> (20 - UNIV_PAGE_SIZE_SHIFT)); + + ret = os_file_set_size(name, *file, + (os_offset_t) srv_log_file_size + << UNIV_PAGE_SIZE_SHIFT); + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, "Cannot set log file" + " %s to size %lu MB", name, (ulong) srv_log_file_size + >> (20 - UNIV_PAGE_SIZE_SHIFT)); + return(DB_ERROR); + } + + ret = os_file_close(*file); + ut_a(ret); + + return(DB_SUCCESS); +} + +/** Initial number of the first redo log file */ +#define INIT_LOG_FILE0 (SRV_N_LOG_FILES_MAX + 1) + +#ifdef DBUG_OFF +# define RECOVERY_CRASH(x) do {} while(0) +#else +# define RECOVERY_CRASH(x) do { \ + if (srv_force_recovery_crash == x) { \ + fprintf(stderr, "innodb_force_recovery_crash=%lu\n", \ + srv_force_recovery_crash); \ + fflush(stderr); \ + exit(3); \ + } \ +} while (0) +#endif + +/*********************************************************************//** +Creates all log files. +@return DB_SUCCESS or error code */ +static +dberr_t +create_log_files( +/*=============*/ + bool create_new_db, /*!< in: TRUE if new database is being + created */ + char* logfilename, /*!< in/out: buffer for log file name */ + size_t dirnamelen, /*!< in: length of the directory path */ + lsn_t lsn, /*!< in: FIL_PAGE_FILE_FLUSH_LSN value */ + char*& logfile0) /*!< out: name of the first log file */ +{ + if (srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create log files in read-only mode"); + return(DB_READ_ONLY); + } + + /* We prevent system tablespace creation with existing files in + data directory. So we do not delete log files when creating new system + tablespace */ + if (!create_new_db) { + /* Remove any old log files. */ + for (unsigned i = 0; i <= INIT_LOG_FILE0; i++) { + sprintf(logfilename + dirnamelen, "ib_logfile%u", i); + + /* Ignore errors about non-existent files or files + that cannot be removed. The create_log_file() will + return an error when the file exists. */ +#ifdef __WIN__ + DeleteFile((LPCTSTR) logfilename); +#else + unlink(logfilename); +#endif + /* Crashing after deleting the first + file should be recoverable. The buffer + pool was clean, and we can simply create + all log files from the scratch. */ + RECOVERY_CRASH(6); + } + } + + ut_ad(!buf_pool_check_no_pending_io()); + + RECOVERY_CRASH(7); + + for (unsigned i = 0; i < srv_n_log_files; i++) { + sprintf(logfilename + dirnamelen, + "ib_logfile%u", i ? i : INIT_LOG_FILE0); + + dberr_t err = create_log_file(&files[i], logfilename); + + if (err != DB_SUCCESS) { + return(err); + } + } + + RECOVERY_CRASH(8); + + /* We did not create the first log file initially as + ib_logfile0, so that crash recovery cannot find it until it + has been completed and renamed. 
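+
+	In outline, the crash-safe creation protocol implemented here
+	(illustrative; INIT_LOG_FILE0 is SRV_N_LOG_FILES_MAX + 1):
+
+	    1. create all log files, the first one under the temporary
+	       name ib_logfile<INIT_LOG_FILE0>
+	    2. write a checkpoint
+	    3. rename the temporary file to ib_logfile0
+
+	Until step 3 completes, recovery cannot find ib_logfile0 and thus
+	knows that log file creation never finished.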
*/ + sprintf(logfilename + dirnamelen, "ib_logfile%u", INIT_LOG_FILE0); + + fil_space_create( + logfilename, SRV_LOG_SPACE_FIRST_ID, + fsp_flags_set_page_size(0, UNIV_PAGE_SIZE), + FIL_LOG); + ut_a(fil_validate()); + + logfile0 = fil_node_create( + logfilename, (ulint) srv_log_file_size, + SRV_LOG_SPACE_FIRST_ID, FALSE); + ut_a(logfile0); + + for (unsigned i = 1; i < srv_n_log_files; i++) { + sprintf(logfilename + dirnamelen, "ib_logfile%u", i); + + if (!fil_node_create( + logfilename, + (ulint) srv_log_file_size, + SRV_LOG_SPACE_FIRST_ID, FALSE)) { + ut_error; + } + } + +#ifdef UNIV_LOG_ARCHIVE + /* Create the file space object for archived logs. */ + fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1, + 0, FIL_LOG); +#endif + log_group_init(0, srv_n_log_files, + srv_log_file_size * UNIV_PAGE_SIZE, + SRV_LOG_SPACE_FIRST_ID, + SRV_LOG_SPACE_FIRST_ID + 1); + + fil_open_log_and_system_tablespace_files(); + + /* Create a log checkpoint. */ + mutex_enter(&log_sys->mutex); + ut_d(recv_no_log_write = FALSE); + recv_reset_logs( +#ifdef UNIV_LOG_ARCHIVE + UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no, + TRUE, +#endif + lsn); + mutex_exit(&log_sys->mutex); + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Renames the first log file. */ +static +void +create_log_files_rename( +/*====================*/ + char* logfilename, /*!< in/out: buffer for log file name */ + size_t dirnamelen, /*!< in: length of the directory path */ + lsn_t lsn, /*!< in: FIL_PAGE_FILE_FLUSH_LSN value */ + char* logfile0) /*!< in/out: name of the first log file */ +{ + /* If innodb_flush_method=O_DSYNC, + we need to explicitly flush the log buffers. */ + fil_flush(SRV_LOG_SPACE_FIRST_ID); + /* Close the log files, so that we can rename + the first one. */ + fil_close_log_files(false); + + /* Rename the first log file, now that a log + checkpoint has been created. */ + sprintf(logfilename + dirnamelen, "ib_logfile%u", 0); + + RECOVERY_CRASH(9); + + ib_logf(IB_LOG_LEVEL_INFO, + "Renaming log file %s to %s", logfile0, logfilename); + + mutex_enter(&log_sys->mutex); + ut_ad(strlen(logfile0) == 2 + strlen(logfilename)); + ibool success = os_file_rename( + innodb_file_log_key, logfile0, logfilename); + ut_a(success); + + RECOVERY_CRASH(10); + + /* Replace the first file with ib_logfile0. */ + strcpy(logfile0, logfilename); + mutex_exit(&log_sys->mutex); + + fil_open_log_and_system_tablespace_files(); + + ib_logf(IB_LOG_LEVEL_WARN, "New log files created, LSN=" LSN_PF, lsn); +} + +/*********************************************************************//** +Opens a log file. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +open_log_file( +/*==========*/ + os_file_t* file, /*!< out: file handle */ + const char* name, /*!< in: log file name */ + os_offset_t* size) /*!< out: file size */ +{ + ibool ret; + + *file = os_file_create(innodb_file_log_key, name, + OS_FILE_OPEN, OS_FILE_AIO, + OS_LOG_FILE, &ret); + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); + return(DB_ERROR); + } + + *size = os_file_get_size(*file); + + ret = os_file_close(*file); + ut_a(ret); + return(DB_SUCCESS); +} + +/*********************************************************************//** +Creates or opens database data files and closes them. 
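+(Aside: a worked example of the page-to-size arithmetic used by
+create_log_file() above, assuming the default UNIV_PAGE_SIZE_SHIFT of
+14, i.e. 16 KiB pages:
+
+    bytes = srv_log_file_size << 14          // pages -> bytes
+    MB    = srv_log_file_size >> (20 - 14)   // 64 pages per megabyte
+
+so a log file of 3072 pages is 48 MB.)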
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +open_or_create_data_files( +/*======================*/ + ibool* create_new_db, /*!< out: TRUE if new database should be + created */ +#ifdef UNIV_LOG_ARCHIVE + lsn_t* min_arch_log_no,/*!< out: min of archived log + numbers in data files */ + lsn_t* max_arch_log_no,/*!< out: max of archived log + numbers in data files */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t* min_flushed_lsn,/*!< out: min of flushed lsn + values in data files */ + lsn_t* max_flushed_lsn,/*!< out: max of flushed lsn + values in data files */ + ulint* sum_of_new_sizes)/*!< out: sum of sizes of the + new files added */ +{ + ibool ret; + ulint i; + ibool one_opened = FALSE; + ibool one_created = FALSE; + os_offset_t size; + ulint flags; + ulint space; + ulint rounded_size_pages; + char name[10000]; + + if (srv_n_data_files >= 1000) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Can only have < 1000 data files, you have " + "defined %lu", (ulong) srv_n_data_files); + + return(DB_ERROR); + } + + *sum_of_new_sizes = 0; + + *create_new_db = FALSE; + + srv_normalize_path_for_win(srv_data_home); + + for (i = 0; i < srv_n_data_files; i++) { + ulint dirnamelen; + + srv_normalize_path_for_win(srv_data_file_names[i]); + dirnamelen = strlen(srv_data_home); + + ut_a(dirnamelen + strlen(srv_data_file_names[i]) + < (sizeof name) - 1); + + memcpy(name, srv_data_home, dirnamelen); + + /* Add a path separator if needed. */ + if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + name[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + strcpy(name + dirnamelen, srv_data_file_names[i]); + + /* Note: It will return true if the file doesn' exist. */ + + if (!srv_file_check_mode(name)) { + + return(DB_FAIL); + + } else if (srv_data_file_is_raw_partition[i] == 0) { + + /* First we try to create the file: if it already + exists, ret will get value FALSE */ + + files[i] = os_file_create( + innodb_file_data_key, name, OS_FILE_CREATE, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + + if (srv_read_only_mode) { + + if (ret) { + goto size_check; + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "Opening %s failed!", name); + + return(DB_ERROR); + + } else if (!ret + && os_file_get_last_error(false) + != OS_FILE_ALREADY_EXISTS +#ifdef UNIV_AIX + /* AIX 5.1 after security patch ML7 may have + errno set to 0 here, which causes our + function to return 100; work around that + AIX problem */ + && os_file_get_last_error(false) != 100 +#endif /* UNIV_AIX */ + ) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Creating or opening %s failed!", + name); + + return(DB_ERROR); + } + + } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { + + ut_a(!srv_read_only_mode); + + /* The partition is opened, not created; then it is + written over */ + + srv_start_raw_disk_in_use = TRUE; + srv_created_new_raw = TRUE; + + files[i] = os_file_create( + innodb_file_data_key, name, OS_FILE_OPEN_RAW, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Error in opening %s", name); + + return(DB_ERROR); + } + } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + srv_start_raw_disk_in_use = TRUE; + + ret = FALSE; + } else { + ut_a(0); + } + + if (ret == FALSE) { + const char* check_msg; + /* We open the data file */ + + if (one_created) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Data files can only be added at " + "the end of a tablespace, but " + "data file %s existed beforehand.", + name); + return(DB_ERROR); + } + if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + 
ut_a(!srv_read_only_mode); + files[i] = os_file_create( + innodb_file_data_key, + name, OS_FILE_OPEN_RAW, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + } else if (i == 0) { + files[i] = os_file_create( + innodb_file_data_key, + name, OS_FILE_OPEN_RETRY, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + } else { + files[i] = os_file_create( + innodb_file_data_key, + name, OS_FILE_OPEN, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + } + + if (!ret) { + + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Can't open '%s'", name); + + return(DB_ERROR); + } + + if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + + goto skip_size_check; + } + +size_check: + size = os_file_get_size(files[i]); + ut_a(size != (os_offset_t) -1); + /* Round size downward to megabytes */ + + rounded_size_pages = (ulint) + (size >> UNIV_PAGE_SIZE_SHIFT); + + if (i == srv_n_data_files - 1 + && srv_auto_extend_last_data_file) { + + if (srv_data_file_sizes[i] > rounded_size_pages + || (srv_last_file_size_max > 0 + && srv_last_file_size_max + < rounded_size_pages)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "auto-extending " + "data file %s is " + "of a different size " + "%lu pages (rounded " + "down to MB) than specified " + "in the .cnf file: " + "initial %lu pages, " + "max %lu (relevant if " + "non-zero) pages!", + name, + (ulong) rounded_size_pages, + (ulong) srv_data_file_sizes[i], + (ulong) + srv_last_file_size_max); + + return(DB_ERROR); + } + + srv_data_file_sizes[i] = rounded_size_pages; + } + + if (rounded_size_pages != srv_data_file_sizes[i]) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Data file %s is of a different " + "size %lu pages (rounded down to MB) " + "than specified in the .cnf file " + "%lu pages!", + name, + (ulong) rounded_size_pages, + (ulong) srv_data_file_sizes[i]); + + return(DB_ERROR); + } +skip_size_check: + + /* This is the earliest location where we can load + the double write buffer. */ + if (i == 0) { + buf_dblwr_init_or_load_pages( + files[i], srv_data_file_names[i], true); + } + + bool retry = true; +check_first_page: + check_msg = fil_read_first_page( + files[i], one_opened, &flags, &space, + min_flushed_lsn, max_flushed_lsn); + + if (check_msg) { + + if (retry) { + fsp_open_info fsp; + const ulint page_no = 0; + + retry = false; + fsp.id = 0; + fsp.filepath = srv_data_file_names[i]; + fsp.file = files[i]; + + if (fil_user_tablespace_restore_page( + &fsp, page_no)) { + goto check_first_page; + } + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "%s in data file %s", + check_msg, name); + return(DB_ERROR); + } + + /* The first file of the system tablespace must + have space ID = TRX_SYS_SPACE. The FSP_SPACE_ID + field in files greater than ibdata1 are unreliable. */ + ut_a(one_opened || space == TRX_SYS_SPACE); + + /* Check the flags for the first system tablespace + file only. 
*/ + if (!one_opened + && UNIV_PAGE_SIZE + != fsp_flags_get_page_size(flags)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Data file \"%s\" uses page size %lu," + "but the start-up parameter " + "is --innodb-page-size=%lu", + name, + fsp_flags_get_page_size(flags), + UNIV_PAGE_SIZE); + + return(DB_ERROR); + } + + one_opened = TRUE; + } else if (!srv_read_only_mode) { + /* We created the data file and now write it full of + zeros */ + + one_created = TRUE; + + if (i > 0) { + ib_logf(IB_LOG_LEVEL_INFO, + "Data file %s did not" + " exist: new to be created", + name); + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "The first specified " + "data file %s did not exist: " + "a new database to be created!", + name); + + *create_new_db = TRUE; + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Setting file %s size to %lu MB", + name, + (ulong) (srv_data_file_sizes[i] + >> (20 - UNIV_PAGE_SIZE_SHIFT))); + + ib_logf(IB_LOG_LEVEL_INFO, + "Database physically writes the" + " file full: wait..."); + + ret = os_file_set_size( + name, files[i], + (os_offset_t) srv_data_file_sizes[i] + << UNIV_PAGE_SIZE_SHIFT); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Error in creating %s: " + "probably out of disk space", + name); + + return(DB_ERROR); + } + + *sum_of_new_sizes += srv_data_file_sizes[i]; + } + + ret = os_file_close(files[i]); + ut_a(ret); + + if (i == 0) { + flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE); + fil_space_create(name, 0, flags, FIL_TABLESPACE); + } + + ut_a(fil_validate()); + + if (!fil_node_create(name, srv_data_file_sizes[i], 0, + srv_data_file_is_raw_partition[i] != 0)) { + return(DB_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Create undo tablespace. +@return DB_SUCCESS or error code */ +static +dberr_t +srv_undo_tablespace_create( +/*=======================*/ + const char* name, /*!< in: tablespace name */ + ulint size) /*!< in: tablespace size in pages */ +{ + os_file_t fh; + ibool ret; + dberr_t err = DB_SUCCESS; + + os_file_create_subdirs_if_needed(name); + + fh = os_file_create( + innodb_file_data_key, + name, + srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + + if (srv_read_only_mode && ret) { + ib_logf(IB_LOG_LEVEL_INFO, + "%s opened in read-only mode", name); + } else if (ret == FALSE) { + if (os_file_get_last_error(false) != OS_FILE_ALREADY_EXISTS +#ifdef UNIV_AIX + /* AIX 5.1 after security patch ML7 may have + errno set to 0 here, which causes our function + to return 100; work around that AIX problem */ + && os_file_get_last_error(false) != 100 +#endif /* UNIV_AIX */ + ) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Can't create UNDO tablespace %s", name); + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Creating system tablespace with" + " existing undo tablespaces is not" + " supported. 
Please delete all undo" + " tablespaces before creating new" + " system tablespace."); + } + err = DB_ERROR; + } else { + ut_a(!srv_read_only_mode); + + /* We created the data file and now write it full of zeros */ + + ib_logf(IB_LOG_LEVEL_INFO, + "Data file %s did not exist: new to be created", + name); + + ib_logf(IB_LOG_LEVEL_INFO, + "Setting file %s size to %lu MB", + name, size >> (20 - UNIV_PAGE_SIZE_SHIFT)); + + ib_logf(IB_LOG_LEVEL_INFO, + "Database physically writes the file full: wait..."); + + ret = os_file_set_size(name, fh, size << UNIV_PAGE_SIZE_SHIFT); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_INFO, + "Error in creating %s: probably out of " + "disk space", name); + + err = DB_ERROR; + } + + os_file_close(fh); + } + + return(err); +} + +/*********************************************************************//** +Open an undo tablespace. +@return DB_SUCCESS or error code */ +static +dberr_t +srv_undo_tablespace_open( +/*=====================*/ + const char* name, /*!< in: tablespace name */ + ulint space) /*!< in: tablespace id */ +{ + os_file_t fh; + dberr_t err = DB_ERROR; + ibool ret; + ulint flags; + + if (!srv_file_check_mode(name)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "UNDO tablespaces must be %s!", + srv_read_only_mode ? "writable" : "readable"); + + return(DB_ERROR); + } + + fh = os_file_create( + innodb_file_data_key, name, + OS_FILE_OPEN_RETRY + | OS_FILE_ON_ERROR_NO_EXIT + | OS_FILE_ON_ERROR_SILENT, + OS_FILE_NORMAL, + OS_DATA_FILE, + &ret); + + /* If the file open was successful then load the tablespace. */ + + if (ret) { + os_offset_t size; + + size = os_file_get_size(fh); + ut_a(size != (os_offset_t) -1); + + ret = os_file_close(fh); + ut_a(ret); + + /* Load the tablespace into InnoDB's internal + data structures. */ + + /* We set the biggest space id to the undo tablespace + because InnoDB hasn't opened any other tablespace apart + from the system tablespace. */ + + fil_set_max_space_id_if_bigger(space); + + /* Set the compressed page size to 0 (non-compressed) */ + flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE); + fil_space_create(name, space, flags, FIL_TABLESPACE); + + ut_a(fil_validate()); + + os_offset_t n_pages = size / UNIV_PAGE_SIZE; + + /* On 64 bit Windows ulint can be 32 bit and os_offset_t + is 64 bit. It is OK to cast the n_pages to ulint because + the unit has been scaled to pages and they are always + 32 bit. */ + if (fil_node_create(name, (ulint) n_pages, space, FALSE)) { + err = DB_SUCCESS; + } + } + + return(err); +} + +/******************************************************************** +Opens the configured number of undo tablespaces. +@return DB_SUCCESS or error code */ +static +dberr_t +srv_undo_tablespaces_init( +/*======================*/ + ibool create_new_db, /*!< in: TRUE if new db being + created */ + const ulint n_conf_tablespaces, /*!< in: configured undo + tablespaces */ + ulint* n_opened) /*!< out: number of UNDO + tablespaces successfully + discovered and opened */ +{ + ulint i; + dberr_t err = DB_SUCCESS; + ulint prev_space_id = 0; + ulint n_undo_tablespaces; + ulint undo_tablespace_ids[TRX_SYS_N_RSEGS + 1]; + + *n_opened = 0; + + ut_a(n_conf_tablespaces <= TRX_SYS_N_RSEGS); + + memset(undo_tablespace_ids, 0x0, sizeof(undo_tablespace_ids)); + + /* Create the undo spaces only if we are creating a new + instance. We don't allow creating of new undo tablespaces + in an existing instance (yet). This restriction exists because + we check in several places for SYSTEM tablespaces to be less than + the min of user defined tablespace ids. 
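+	(For orientation: the undo files created by the loop below are
+	named via the format "%s%cundo%03lu", i.e. <srv_undo_dir>/undo001,
+	undo002, and so on, with undo space ids starting from 1.)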
Once we implement saving + the location of the undo tablespaces and their space ids this + restriction will/should be lifted. */ + + for (i = 0; create_new_db && i < n_conf_tablespaces; ++i) { + char name[OS_FILE_MAX_PATH]; + + ut_snprintf( + name, sizeof(name), + "%s%cundo%03lu", + srv_undo_dir, SRV_PATH_SEPARATOR, i + 1); + + /* Undo space ids start from 1. */ + err = srv_undo_tablespace_create( + name, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES); + + if (err != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not create undo tablespace '%s'.", + name); + + return(err); + } + } + + /* Get the tablespace ids of all the undo segments excluding + the system tablespace (0). If we are creating a new instance then + we build the undo_tablespace_ids ourselves since they don't + already exist. */ + + if (!create_new_db) { + n_undo_tablespaces = trx_rseg_get_n_undo_tablespaces( + undo_tablespace_ids); + } else { + n_undo_tablespaces = n_conf_tablespaces; + + for (i = 1; i <= n_undo_tablespaces; ++i) { + undo_tablespace_ids[i - 1] = i; + } + + undo_tablespace_ids[i] = ULINT_UNDEFINED; + } + + /* Open all the undo tablespaces that are currently in use. If we + fail to open any of these it is a fatal error. The tablespace ids + should be contiguous. It is a fatal error because they are required + for recovery and are referenced by the UNDO logs (a.k.a RBS). */ + + for (i = 0; i < n_undo_tablespaces; ++i) { + char name[OS_FILE_MAX_PATH]; + + ut_snprintf( + name, sizeof(name), + "%s%cundo%03lu", + srv_undo_dir, SRV_PATH_SEPARATOR, + undo_tablespace_ids[i]); + + /* Should be no gaps in undo tablespace ids. */ + ut_a(prev_space_id + 1 == undo_tablespace_ids[i]); + + /* The system space id should not be in this array. */ + ut_a(undo_tablespace_ids[i] != 0); + ut_a(undo_tablespace_ids[i] != ULINT_UNDEFINED); + + /* Undo space ids start from 1. */ + + err = srv_undo_tablespace_open(name, undo_tablespace_ids[i]); + + if (err != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to open undo tablespace '%s'.", name); + + return(err); + } + + prev_space_id = undo_tablespace_ids[i]; + + ++*n_opened; + } + + /* Open any extra unused undo tablespaces. These must be contiguous. + We stop at the first failure. These are undo tablespaces that are + not in use and therefore not required by recovery. We only check + that there are no gaps. */ + + for (i = prev_space_id + 1; i < TRX_SYS_N_RSEGS; ++i) { + char name[OS_FILE_MAX_PATH]; + + ut_snprintf( + name, sizeof(name), + "%s%cundo%03lu", srv_undo_dir, SRV_PATH_SEPARATOR, i); + + /* Undo space ids start from 1. */ + err = srv_undo_tablespace_open(name, i); + + if (err != DB_SUCCESS) { + break; + } + + ++n_undo_tablespaces; + + ++*n_opened; + } + + /* If the user says that there are fewer than what we find we + tolerate that discrepancy but not the inverse. Because there could + be unused undo tablespaces for future use. */ + + if (n_conf_tablespaces > n_undo_tablespaces) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Expected to open %lu undo " + "tablespaces but was able\n", + n_conf_tablespaces); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: to find only %lu undo " + "tablespaces.\n", n_undo_tablespaces); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Set the " + "innodb_undo_tablespaces parameter to " + "the\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: correct value and retry. Suggested " + "value is %lu\n", n_undo_tablespaces); + + return(err != DB_SUCCESS ? 
err : DB_ERROR); + + } else if (n_undo_tablespaces > 0) { + + ib_logf(IB_LOG_LEVEL_INFO, "Opened %lu undo tablespaces", + n_undo_tablespaces); + + if (n_conf_tablespaces == 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Using the system tablespace for all UNDO " + "logging because innodb_undo_tablespaces=0"); + } + } + + if (create_new_db) { + mtr_t mtr; + + mtr_start(&mtr); + + /* The undo log tablespace */ + for (i = 1; i <= n_undo_tablespaces; ++i) { + + fsp_header_init( + i, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr); + } + + mtr_commit(&mtr); + } + + return(DB_SUCCESS); +} + +/******************************************************************** +Wait for the purge thread(s) to start up. */ +static +void +srv_start_wait_for_purge_to_start() +/*===============================*/ +{ + /* Wait for the purge coordinator and master thread to startup. */ + + purge_state_t state = trx_purge_state(); + + ut_a(state != PURGE_STATE_DISABLED); + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND + && state == PURGE_STATE_INIT) { + + switch (state = trx_purge_state()) { + case PURGE_STATE_RUN: + case PURGE_STATE_STOP: + break; + + case PURGE_STATE_INIT: + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for purge to start"); + + os_thread_sleep(50000); + break; + + case PURGE_STATE_EXIT: + case PURGE_STATE_DISABLED: + ut_error; + } + } +} + +/*********************************************************************//** +Initializes the log tracking subsystem and starts its thread. */ +static +void +init_log_online(void) +/*=================*/ +{ + if (UNIV_UNLIKELY(srv_force_recovery > 0 || srv_read_only_mode)) { + srv_track_changed_pages = FALSE; + return; + } + + if (srv_track_changed_pages) { + + log_online_read_init(); + + /* Create the thread that follows the redo log to output the + changed page bitmap */ + os_thread_create(&srv_redo_log_follow_thread, NULL, + thread_ids + 5 + SRV_MAX_N_IO_THREADS); + } +} + +/******************************************************************** +Starts InnoDB and creates a new database if database files +are not found and the user wants. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +innobase_start_or_create_for_mysql(void) +/*====================================*/ +{ + ibool create_new_db; + lsn_t min_flushed_lsn; + lsn_t max_flushed_lsn; +#ifdef UNIV_LOG_ARCHIVE + lsn_t min_arch_log_no = LSN_MAX; + lsn_t max_arch_log_no = LSN_MAX; +#endif /* UNIV_LOG_ARCHIVE */ + ulint sum_of_new_sizes; + ulint sum_of_data_file_sizes; + ulint tablespace_size_in_header; + dberr_t err; + unsigned i; + ulint srv_n_log_files_found = srv_n_log_files; + ulint io_limit; + mtr_t mtr; + ib_bh_t* ib_bh; + ulint n_recovered_trx; + char logfilename[10000]; + char* logfile0 = NULL; + size_t dirnamelen; + + if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) { + srv_read_only_mode = true; + } + + if (srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_INFO, "Started in read only mode"); + } + +#ifdef HAVE_DARWIN_THREADS +# ifdef F_FULLFSYNC + /* This executable has been compiled on Mac OS X 10.3 or later. + Assume that F_FULLFSYNC is available at run-time. */ + srv_have_fullfsync = TRUE; +# else /* F_FULLFSYNC */ + /* This executable has been compiled on Mac OS X 10.2 + or earlier. Determine if the executable is running + on Mac OS X 10.3 or later. 
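+	(Darwin kernel release 7.x corresponds to Mac OS X 10.3, which
+	is why the strcmp() against "7." below serves as the "10.3 or
+	later" test.)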
*/ + struct utsname utsname; + if (uname(&utsname)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: cannot determine Mac OS X version!\n", stderr); + } else { + srv_have_fullfsync = strcmp(utsname.release, "7.") >= 0; + } + if (!srv_have_fullfsync) { + ut_print_timestamp(stderr); + fputs(" InnoDB: On Mac OS X, fsync() may be " + "broken on internal drives,\n", stderr); + ut_print_timestamp(stderr); + fputs(" InnoDB: making transactions unsafe!\n", stderr); + } +# endif /* F_FULLFSYNC */ +#endif /* HAVE_DARWIN_THREADS */ + + ib_logf(IB_LOG_LEVEL_INFO, + "Using %s to ref count buffer pool pages", +#ifdef PAGE_ATOMIC_REF_COUNT + "atomics" +#else + "mutexes" +#endif /* PAGE_ATOMIC_REF_COUNT */ + ); + + + if (sizeof(ulint) != sizeof(void*)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: size of InnoDB's ulint is %lu, " + "but size of void*\n", (ulong) sizeof(ulint)); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: is %lu. The sizes should be the same " + "so that on a 64-bit\n", + (ulong) sizeof(void*)); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: platforms you can allocate more than 4 GB " + "of memory.\n"); + } + +#ifdef UNIV_DEBUG + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n"); +#endif + +#ifdef UNIV_IBUF_DEBUG + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n"); +# ifdef UNIV_IBUF_COUNT_DEBUG + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on " + "!!!!!!!!!\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n"); +# endif +#endif + +#ifdef UNIV_BLOB_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n" + "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n"); +#endif /* UNIV_BLOB_DEBUG */ + +#ifdef UNIV_SYNC_DEBUG + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n"); +#endif + +#ifdef UNIV_SEARCH_DEBUG + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n"); +#endif + +#ifdef UNIV_LOG_LSN_DEBUG + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!\n"); +#endif /* UNIV_LOG_LSN_DEBUG */ +#ifdef UNIV_MEM_DEBUG + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n"); +#endif + + if (srv_use_sys_malloc) { + ib_logf(IB_LOG_LEVEL_INFO, + "The InnoDB memory heap is disabled"); + } + +#if defined(COMPILER_HINTS_ENABLED) + ib_logf(IB_LOG_LEVEL_INFO, + " InnoDB: Compiler hints enabled."); +#endif /* defined(COMPILER_HINTS_ENABLED) */ + + ib_logf(IB_LOG_LEVEL_INFO, + "" IB_ATOMICS_STARTUP_MSG ""); + + ib_logf(IB_LOG_LEVEL_INFO, + "" IB_MEMORY_BARRIER_STARTUP_MSG ""); + +#ifndef HAVE_MEMORY_BARRIER +#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64 || defined __WIN__ +#else + ib_logf(IB_LOG_LEVEL_WARN, + "MySQL was built without a memory barrier capability on this" + " architecture, which might allow a mutex/rw_lock violation" + " under high thread concurrency. 
This may cause a hang."); +#endif /* IA32 or AMD64 */ +#endif /* HAVE_MEMORY_BARRIER */ + + ib_logf(IB_LOG_LEVEL_INFO, + "Compressed tables use zlib " ZLIB_VERSION +#ifdef UNIV_ZIP_DEBUG + " with validation" +#endif /* UNIV_ZIP_DEBUG */ + ); +#ifdef UNIV_ZIP_COPY + ib_logf(IB_LOG_LEVEL_INFO, "and extra copying"); +#endif /* UNIV_ZIP_COPY */ + + + /* Since InnoDB does not currently clean up all its internal data + structures in MySQL Embedded Server Library server_end(), we + print an error message if someone tries to start up InnoDB a + second time during the process lifetime. */ + + if (srv_start_has_been_called) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: startup called second time " + "during the process\n"); + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: lifetime. In the MySQL Embedded " + "Server Library you\n"); + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: cannot call server_init() more " + "than once during the\n"); + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: process lifetime.\n"); + } + + srv_start_has_been_called = TRUE; + +#ifdef UNIV_DEBUG + log_do_write = TRUE; +#endif /* UNIV_DEBUG */ + /* yydebug = TRUE; */ + + srv_is_being_started = TRUE; + srv_startup_is_before_trx_rollback_phase = TRUE; + +#ifdef __WIN__ + switch (os_get_os_version()) { + case OS_WIN95: + case OS_WIN31: + case OS_WINNT: + /* On Win 95, 98, ME, Win32 subsystem for Windows 3.1, + and NT use simulated aio. In NT Windows provides async i/o, + but when run in conjunction with InnoDB Hot Backup, it seemed + to corrupt the data files. */ + + srv_use_native_aio = FALSE; + break; + + case OS_WIN2000: + case OS_WINXP: + /* On 2000 and XP, async IO is available. */ + srv_use_native_aio = TRUE; + break; + + default: + /* Vista and later have both async IO and condition variables */ + srv_use_native_aio = TRUE; + srv_use_native_conditions = TRUE; + break; + } + +#elif defined(LINUX_NATIVE_AIO) + + if (srv_use_native_aio) { + ib_logf(IB_LOG_LEVEL_INFO, "Using Linux native AIO"); + } +#else + /* Currently native AIO is supported only on windows and linux + and that also when the support is compiled in. In all other + cases, we ignore the setting of innodb_use_native_aio. 
*/ + srv_use_native_aio = FALSE; +#endif /* __WIN__ */ + + if (srv_file_flush_method_str == NULL) { + /* These are the default options */ + + srv_unix_file_flush_method = SRV_UNIX_FSYNC; + + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#ifndef __WIN__ + } else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) { + srv_unix_file_flush_method = SRV_UNIX_FSYNC; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) { + srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) { + srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) { + srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT_NO_FSYNC")) { + srv_unix_file_flush_method = SRV_UNIX_O_DIRECT_NO_FSYNC; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { + srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) { + srv_unix_file_flush_method = SRV_UNIX_NOSYNC; +#else + } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) { + srv_win_file_flush_method = SRV_WIN_IO_NORMAL; + srv_use_native_aio = FALSE; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; + srv_use_native_aio = FALSE; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, + "async_unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#endif /* __WIN__ */ + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unrecognized value %s for innodb_flush_method", + srv_file_flush_method_str); + return(DB_ERROR); + } + + /* Note that the call srv_boot() also changes the values of + some variables to the units used by InnoDB internally */ + + /* Set the maximum number of threads which can wait for a semaphore + inside InnoDB: this is the 'sync wait array' size, as well as the + maximum number of threads that can wait in the 'srv_conc array' for + their time to enter InnoDB. */ + +#define BUF_POOL_SIZE_THRESHOLD (1024 * 1024 * 1024) + srv_max_n_threads = 1 /* io_ibuf_thread */ + + 1 /* io_log_thread */ + + 1 /* lock_wait_timeout_thread */ + + 1 /* srv_error_monitor_thread */ + + 1 /* srv_monitor_thread */ + + 1 /* srv_master_thread */ + + 1 /* srv_redo_log_follow_thread */ + + 1 /* srv_purge_coordinator_thread */ + + 1 /* buf_dump_thread */ + + 1 /* dict_stats_thread */ + + 1 /* fts_optimize_thread */ + + 1 /* recv_writer_thread */ + + 1 /* buf_flush_page_cleaner_thread */ + + 1 /* trx_rollback_or_clean_all_recovered */ + + 128 /* added as margin, for use of + InnoDB Memcached etc. */ + + max_connections + + srv_n_read_io_threads + + srv_n_write_io_threads + + srv_n_purge_threads + /* FTS Parallel Sort */ + + fts_sort_pll_degree * FTS_NUM_AUX_INDEX + * max_connections; + + if (srv_buf_pool_size < BUF_POOL_SIZE_THRESHOLD) { + /* If buffer pool is less than 1 GB, + use only one buffer pool instance */ + srv_buf_pool_instances = 1; + } + + srv_boot(); + + ib_logf(IB_LOG_LEVEL_INFO, + "%s CPU crc32 instructions", + ut_crc32_sse2_enabled ? 
"Using" : "Not using"); + + if (!srv_read_only_mode) { + + mutex_create(srv_monitor_file_mutex_key, + &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK); + + if (srv_innodb_status) { + + srv_monitor_file_name = static_cast<char*>( + mem_alloc( + strlen(fil_path_to_mysql_datadir) + + 20 + sizeof "/innodb_status.")); + + sprintf(srv_monitor_file_name, "%s/innodb_status.%lu", + fil_path_to_mysql_datadir, + os_proc_get_number()); + + srv_monitor_file = fopen(srv_monitor_file_name, "w+"); + + if (!srv_monitor_file) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create %s: %s", + srv_monitor_file_name, + strerror(errno)); + + return(DB_ERROR); + } + } else { + srv_monitor_file_name = NULL; + srv_monitor_file = os_file_create_tmpfile(); + + if (!srv_monitor_file) { + return(DB_ERROR); + } + } + + mutex_create(srv_dict_tmpfile_mutex_key, + &srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION); + + srv_dict_tmpfile = os_file_create_tmpfile(); + + if (!srv_dict_tmpfile) { + return(DB_ERROR); + } + + mutex_create(srv_misc_tmpfile_mutex_key, + &srv_misc_tmpfile_mutex, SYNC_ANY_LATCH); + + srv_misc_tmpfile = os_file_create_tmpfile(); + + if (!srv_misc_tmpfile) { + return(DB_ERROR); + } + } + + /* If user has set the value of innodb_file_io_threads then + we'll emit a message telling the user that this parameter + is now deprecated. */ + if (srv_n_file_io_threads != 4) { + ib_logf(IB_LOG_LEVEL_WARN, + "innodb_file_io_threads is deprecated. Please use " + "innodb_read_io_threads and innodb_write_io_threads " + "instead"); + } + + /* Now overwrite the value on srv_n_file_io_threads */ + srv_n_file_io_threads = srv_n_read_io_threads; + + if (!srv_read_only_mode) { + /* Add the log and ibuf IO threads. */ + srv_n_file_io_threads += 2; + srv_n_file_io_threads += srv_n_write_io_threads; + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "Disabling background IO write threads."); + + srv_n_write_io_threads = 0; + } + + ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS); + + io_limit = 8 * SRV_N_PENDING_IOS_PER_THREAD; + + /* On Windows when using native aio the number of aio requests + that a thread can handle at a given time is limited to 32 + i.e.: SRV_N_PENDING_IOS_PER_THREAD */ +# ifdef __WIN__ + if (srv_use_native_aio) { + io_limit = SRV_N_PENDING_IOS_PER_THREAD; + } +# endif /* __WIN__ */ + + if (!os_aio_init(io_limit, + srv_n_read_io_threads, + srv_n_write_io_threads, + SRV_MAX_N_PENDING_SYNC_IOS)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Fatal : Cannot initialize AIO sub-system"); + + return(DB_ERROR); + } + + fil_init(srv_file_per_table ? 50000 : 5000, srv_max_n_open_files); + + double size; + char unit; + + if (srv_buf_pool_size >= 1024 * 1024 * 1024) { + size = ((double) srv_buf_pool_size) / (1024 * 1024 * 1024); + unit = 'G'; + } else { + size = ((double) srv_buf_pool_size) / (1024 * 1024); + unit = 'M'; + } + + /* Print time to initialize the buffer pool */ + ib_logf(IB_LOG_LEVEL_INFO, + "Initializing buffer pool, size = %.1f%c", size, unit); + + err = buf_pool_init(srv_buf_pool_size, (ibool) srv_buf_pool_populate, + srv_buf_pool_instances); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot allocate memory for the buffer pool"); + + return(DB_ERROR); + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Completed initialization of buffer pool"); + +#ifdef UNIV_DEBUG + /* We have observed deadlocks with a 5MB buffer pool but + the actual lower limit could very well be a little higher. 
*/ + + if (srv_buf_pool_size <= 5 * 1024 * 1024) { + + ib_logf(IB_LOG_LEVEL_INFO, + "Small buffer pool size (%luM), the flst_validate() " + "debug function can cause a deadlock if the " + "buffer pool fills up.", + srv_buf_pool_size / 1024 / 1024); + } +#endif /* UNIV_DEBUG */ + + fsp_init(); + log_init(); + + lock_sys_create(srv_lock_table_size); + + /* Create i/o-handler threads: */ + + for (i = 0; i < srv_n_file_io_threads; ++i) { + + n[i] = i; + + os_thread_create(io_handler_thread, n + i, thread_ids + i); + } + + if (srv_n_log_files * srv_log_file_size * UNIV_PAGE_SIZE + >= 512ULL * 1024ULL * 1024ULL * 1024ULL) { + /* log_block_convert_lsn_to_no() limits the returned block + number to 1G and given that OS_FILE_LOG_BLOCK_SIZE is 512 + bytes, then we have a limit of 512 GB. If that limit is to + be raised, then log_block_convert_lsn_to_no() must be + modified. */ + ib_logf(IB_LOG_LEVEL_ERROR, + "Combined size of log files must be < 512 GB"); + + return(DB_ERROR); + } + + if (srv_n_log_files * srv_log_file_size >= ULINT_MAX) { + /* fil_io() takes ulint as an argument and we are passing + (next_offset / UNIV_PAGE_SIZE) to it in log_group_write_buf(). + So (next_offset / UNIV_PAGE_SIZE) must be less than ULINT_MAX. + So next_offset must be < ULINT_MAX * UNIV_PAGE_SIZE. This + means that we are limited to ULINT_MAX * UNIV_PAGE_SIZE which + is 64 TB on 32 bit systems. */ + fprintf(stderr, + " InnoDB: Error: combined size of log files" + " must be < %lu GB\n", + ULINT_MAX / 1073741824 * UNIV_PAGE_SIZE); + + return(DB_ERROR); + } + + sum_of_new_sizes = 0; + + for (i = 0; i < srv_n_data_files; i++) { +#ifndef __WIN__ + if (sizeof(off_t) < 5 + && srv_data_file_sizes[i] + >= (ulint) (1 << (32 - UNIV_PAGE_SIZE_SHIFT))) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: file size must be < 4 GB" + " with this MySQL binary\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: and operating system combination," + " in some OS's < 2 GB\n"); + + return(DB_ERROR); + } +#endif + sum_of_new_sizes += srv_data_file_sizes[i]; + } + + if (sum_of_new_sizes < 10485760 / UNIV_PAGE_SIZE) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespace size must be at least 10 MB"); + + return(DB_ERROR); + } + + recv_sys_create(); + recv_sys_init(buf_pool_get_curr_size()); + + err = open_or_create_data_files(&create_new_db, +#ifdef UNIV_LOG_ARCHIVE + &min_arch_log_no, &max_arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + &min_flushed_lsn, &max_flushed_lsn, + &sum_of_new_sizes); + if (err == DB_FAIL) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "The system tablespace must be writable!"); + + return(DB_ERROR); + + } else if (err != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not open or create the system tablespace. If " + "you tried to add new data files to the system " + "tablespace, and it failed here, you should now " + "edit innodb_data_file_path in my.cnf back to what " + "it was, and remove the new ibdata files InnoDB " + "created in this failed attempt. InnoDB only wrote " + "those files full of zeros, but did not yet use " + "them in any way. But be careful: do not remove " + "old data files which contain your precious data!"); + + return(err); + } + +#ifdef UNIV_LOG_ARCHIVE + srv_normalize_path_for_win(srv_arch_dir); +#endif /* UNIV_LOG_ARCHIVE */ + + dirnamelen = strlen(srv_log_group_home_dir); + ut_a(dirnamelen < (sizeof logfilename) - 10 - sizeof "ib_logfile"); + memcpy(logfilename, srv_log_group_home_dir, dirnamelen); + + /* Add a path separator if needed. 
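+	(For example, a srv_log_group_home_dir of "/var/lib/mysql"
+	becomes "/var/lib/mysql/" here, so that "ib_logfile0",
+	"ib_logfile1", ... can be written directly at
+	logfilename + dirnamelen below.)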
*/ + if (dirnamelen && logfilename[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + logfilename[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + srv_log_file_size_requested = srv_log_file_size; + + if (create_new_db) { + bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL); + ut_a(success); + + min_flushed_lsn = max_flushed_lsn = log_get_lsn(); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + err = create_log_files(create_new_db, logfilename, dirnamelen, + max_flushed_lsn, logfile0); + + if (err != DB_SUCCESS) { + return(err); + } + } else { + for (i = 0; i < SRV_N_LOG_FILES_MAX; i++) { + os_offset_t size; + os_file_stat_t stat_info; + + sprintf(logfilename + dirnamelen, + "ib_logfile%u", i); + + err = os_file_get_status( + logfilename, &stat_info, false); + + if (err == DB_NOT_FOUND) { + if (i == 0) { + if (max_flushed_lsn + != min_flushed_lsn) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create" + " log files because" + " data files are" + " corrupt or" + " not in sync" + " with each other"); + return(DB_ERROR); + } + + if (max_flushed_lsn < (lsn_t) 1000) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create" + " log files because" + " data files are" + " corrupt or the" + " database was not" + " shut down cleanly" + " after creating" + " the data files."); + return(DB_ERROR); + } + + err = create_log_files( + create_new_db, logfilename, + dirnamelen, max_flushed_lsn, + logfile0); + + if (err != DB_SUCCESS) { + return(err); + } + + create_log_files_rename( + logfilename, dirnamelen, + max_flushed_lsn, logfile0); + + /* Suppress the message about + crash recovery. */ + max_flushed_lsn = min_flushed_lsn + = log_get_lsn(); + goto files_checked; + } else if (i < 2) { + /* must have at least 2 log files */ + ib_logf(IB_LOG_LEVEL_ERROR, + "Only one log file found."); + return(err); + } + + /* opened all files */ + break; + } + + if (!srv_file_check_mode(logfilename)) { + return(DB_ERROR); + } + + err = open_log_file(&files[i], logfilename, &size); + + if (err != DB_SUCCESS) { + return(err); + } + + ut_a(size != (os_offset_t) -1); + + if (size & ((1 << UNIV_PAGE_SIZE_SHIFT) - 1)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Log file %s size " + UINT64PF " is not a multiple of" + " innodb_page_size", + logfilename, size); + return(DB_ERROR); + } + + size >>= UNIV_PAGE_SIZE_SHIFT; + + if (i == 0) { + srv_log_file_size = size; + } else if (size != srv_log_file_size) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Log file %s is" + " of different size " UINT64PF " bytes" + " than other log" + " files " UINT64PF " bytes!", + logfilename, + size << UNIV_PAGE_SIZE_SHIFT, + (os_offset_t) srv_log_file_size + << UNIV_PAGE_SIZE_SHIFT); + return(DB_ERROR); + } + } + + srv_n_log_files_found = i; + + /* Create the in-memory file space objects. */ + + sprintf(logfilename + dirnamelen, "ib_logfile%u", 0); + + fil_space_create(logfilename, + SRV_LOG_SPACE_FIRST_ID, + fsp_flags_set_page_size(0, UNIV_PAGE_SIZE), + FIL_LOG); + + ut_a(fil_validate()); + + /* srv_log_file_size is measured in pages; if page size is 16KB, + then we have a limit of 64TB on 32 bit systems */ + ut_a(srv_log_file_size <= ULINT_MAX); + + for (unsigned j = 0; j < i; j++) { + sprintf(logfilename + dirnamelen, "ib_logfile%u", j); + + if (!fil_node_create(logfilename, + (ulint) srv_log_file_size, + SRV_LOG_SPACE_FIRST_ID, FALSE)) { + return(DB_ERROR); + } + } + +#ifdef UNIV_LOG_ARCHIVE + /* Create the file space object for archived logs. Under + MySQL, no archiving ever done. 
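+	*/
+
+	/* A worked example of the page-alignment check applied to each
+	log file above, assuming the default UNIV_PAGE_SIZE_SHIFT of 14
+	(16 KB pages); the 48 MB value is illustrative: */
+#if 0
+	os_offset_t	sz = 48 * 1024 * 1024;	/* a 48 MB ib_logfile */
+
+	/* the low 14 bits must be zero, i.e. the size must be a
+	multiple of the page size ... */
+	ut_a((sz & ((1 << UNIV_PAGE_SIZE_SHIFT) - 1)) == 0);
+
+	/* ... before the size is scaled from bytes to pages */
+	sz >>= UNIV_PAGE_SIZE_SHIFT;		/* sz == 3072 pages */
+#endif
+
+	/*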
*/ + fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1, + 0, FIL_LOG); +#endif /* UNIV_LOG_ARCHIVE */ + log_group_init(0, i, srv_log_file_size * UNIV_PAGE_SIZE, + SRV_LOG_SPACE_FIRST_ID, + SRV_LOG_SPACE_FIRST_ID + 1); + } + +files_checked: + /* Open all log files and data files in the system + tablespace: we keep them open until database + shutdown */ + + fil_open_log_and_system_tablespace_files(); + + err = srv_undo_tablespaces_init( + create_new_db, + srv_undo_tablespaces, + &srv_undo_tablespaces_open); + + /* If the force recovery is set very high then we carry on regardless + of all errors. Basically this is fingers crossed mode. */ + + if (err != DB_SUCCESS + && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + + return(err); + } + + /* Initialize objects used by dict stats gathering thread, which + can also be used by recovery if it tries to drop some table */ + if (!srv_read_only_mode) { + dict_stats_thread_init(); + } + + trx_sys_file_format_init(); + + trx_sys_create(); + + if (create_new_db) { + + ut_a(!srv_read_only_mode); + init_log_online(); + + mtr_start(&mtr); + + fsp_header_init(0, sum_of_new_sizes, &mtr); + + mtr_commit(&mtr); + + /* To maintain backward compatibility we create only + the first rollback segment before the double write buffer. + All the remaining rollback segments will be created later, + after the double write buffer has been created. */ + trx_sys_create_sys_pages(); + + ib_bh = trx_sys_init_at_db_start(); + n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + + /* The purge system needs to create the purge view and + therefore requires that the trx_sys is inited. */ + + trx_purge_sys_create(srv_n_purge_threads, ib_bh); + + err = dict_create(); + + if (err != DB_SUCCESS) { + return(err); + } + + srv_startup_is_before_trx_rollback_phase = FALSE; + + bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL); + ut_a(success); + + min_flushed_lsn = max_flushed_lsn = log_get_lsn(); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + /* Stamp the LSN to the data files. */ + fil_write_flushed_lsn_to_data_files(max_flushed_lsn, 0); + + fil_flush_file_spaces(FIL_TABLESPACE); + + create_log_files_rename(logfilename, dirnamelen, + max_flushed_lsn, logfile0); +#ifdef UNIV_LOG_ARCHIVE + } else if (srv_archive_recovery) { + + ib_logf(IB_LOG_LEVEL_INFO, + " Starting archive recovery from a backup..."); + + err = recv_recovery_from_archive_start( + min_flushed_lsn, srv_archive_recovery_limit_lsn, + min_arch_log_no); + if (err != DB_SUCCESS) { + + return(DB_ERROR); + } + /* Since ibuf init is in dict_boot, and ibuf is needed + in any disk i/o, first call dict_boot */ + + err = dict_boot(); + + if (err != DB_SUCCESS) { + return(err); + } + + ib_bh = trx_sys_init_at_db_start(); + n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + + /* The purge system needs to create the purge view and + therefore requires that the trx_sys is inited. */ + + trx_purge_sys_create(srv_n_purge_threads, ib_bh); + + srv_startup_is_before_trx_rollback_phase = FALSE; + + recv_recovery_from_archive_finish(); +#endif /* UNIV_LOG_ARCHIVE */ + } else { + + /* Check if we support the max format that is stamped + on the system tablespace. + Note: We are NOT allowed to make any modifications to + the TRX_SYS_PAGE_NO page before recovery because this + page also contains the max_trx_id etc. important system + variables that are required for recovery. We need to + ensure that we return the system to a state where normal + recovery is guaranteed to work. 
We do this by + invalidating the buffer cache, this will force the + reread of the page and restoration to its last known + consistent state, this is REQUIRED for the recovery + process to work. */ + err = trx_sys_file_format_max_check( + srv_max_file_format_at_startup); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Invalidate the buffer pool to ensure that we reread + the page that we read above, during recovery. + Note that this is not as heavy weight as it seems. At + this point there will be only ONE page in the buf_LRU + and there must be no page in the buf_flush list. */ + buf_pool_invalidate(); + + /* We always try to do a recovery, even if the database had + been shut down normally: this is the normal startup path */ + + err = recv_recovery_from_checkpoint_start( + LOG_CHECKPOINT, LSN_MAX, + min_flushed_lsn, max_flushed_lsn); + + if (err != DB_SUCCESS) { + + return(DB_ERROR); + } + + init_log_online(); + + /* Since the insert buffer init is in dict_boot, and the + insert buffer is needed in any disk i/o, first we call + dict_boot(). Note that trx_sys_init_at_db_start() only needs + to access space 0, and the insert buffer at this stage already + works for space 0. */ + + err = dict_boot(); + + if (err != DB_SUCCESS) { + return(err); + } + + ib_bh = trx_sys_init_at_db_start(); + n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + + /* The purge system needs to create the purge view and + therefore requires that the trx_sys is inited. */ + + trx_purge_sys_create(srv_n_purge_threads, ib_bh); + + /* recv_recovery_from_checkpoint_finish needs trx lists which + are initialized in trx_sys_init_at_db_start(). */ + + recv_recovery_from_checkpoint_finish(); + + if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) { + /* The following call is necessary for the insert + buffer to work with multiple tablespaces. We must + know the mapping between space id's and .ibd file + names. + + In a crash recovery, we check that the info in data + dictionary is consistent with what we already know + about space id's from the call of + fil_load_single_table_tablespaces(). + + In a normal startup, we create the space objects for + every table in the InnoDB data dictionary that has + an .ibd file. + + We also determine the maximum tablespace id used. */ + dict_check_t dict_check; + + if (recv_needed_recovery) { + dict_check = DICT_CHECK_ALL_LOADED; + } else if (n_recovered_trx) { + dict_check = DICT_CHECK_SOME_LOADED; + } else { + dict_check = DICT_CHECK_NONE_LOADED; + } + + dict_check_tablespaces_and_store_max_id(dict_check); + } + + if (!srv_force_recovery + && !recv_sys->found_corrupt_log + && (srv_log_file_size_requested != srv_log_file_size + || srv_n_log_files_found != srv_n_log_files)) { + /* Prepare to replace the redo log files. */ + + if (srv_read_only_mode) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot resize log files " + "in read-only mode."); + return(DB_READ_ONLY); + } + + /* Clean the buffer pool. */ + bool success = buf_flush_list( + ULINT_MAX, LSN_MAX, NULL); + ut_a(success); + + RECOVERY_CRASH(1); + + min_flushed_lsn = max_flushed_lsn = log_get_lsn(); + + ib_logf(IB_LOG_LEVEL_WARN, + "Resizing redo log from %u*%u to %u*%u pages" + ", LSN=" LSN_PF, + (unsigned) i, + (unsigned) srv_log_file_size, + (unsigned) srv_n_log_files, + (unsigned) srv_log_file_size_requested, + max_flushed_lsn); + + buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); + + RECOVERY_CRASH(2); + + /* Flush the old log files. 
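+	(The overall resize sequence, for orientation: flush all dirty
+	pages; flush the redo log and stamp its LSN into the data file
+	headers; close and free the old ib_logfile* files; create new
+	files at the requested size; finally rename the temporary
+	first log file to ib_logfile0.  The RECOVERY_CRASH() markers
+	let tests kill the server between any two of these steps.)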
*/ + log_buffer_flush_to_disk(); + /* If innodb_flush_method=O_DSYNC, + we need to explicitly flush the log buffers. */ + fil_flush(SRV_LOG_SPACE_FIRST_ID); + + ut_ad(max_flushed_lsn == log_get_lsn()); + + /* Prohibit redo log writes from any other + threads until creating a log checkpoint at the + end of create_log_files(). */ + ut_d(recv_no_log_write = TRUE); + ut_ad(!buf_pool_check_no_pending_io()); + + RECOVERY_CRASH(3); + + /* Stamp the LSN to the data files. */ + fil_write_flushed_lsn_to_data_files( + max_flushed_lsn, 0); + + fil_flush_file_spaces(FIL_TABLESPACE); + + RECOVERY_CRASH(4); + + /* Close and free the redo log files, so that + we can replace them. */ + fil_close_log_files(true); + + RECOVERY_CRASH(5); + + /* Free the old log file space. */ + log_group_close_all(); + + ib_logf(IB_LOG_LEVEL_WARN, + "Starting to delete and rewrite log files."); + + srv_log_file_size = srv_log_file_size_requested; + + err = create_log_files(create_new_db, logfilename, + dirnamelen, max_flushed_lsn, + logfile0); + + if (err != DB_SUCCESS) { + return(err); + } + + /* create_log_files() can increase system lsn that is + why FIL_PAGE_FILE_FLUSH_LSN have to be updated */ + min_flushed_lsn = max_flushed_lsn = log_get_lsn(); + fil_write_flushed_lsn_to_data_files(min_flushed_lsn, 0); + fil_flush_file_spaces(FIL_TABLESPACE); + + create_log_files_rename(logfilename, dirnamelen, + log_get_lsn(), logfile0); + } + + srv_startup_is_before_trx_rollback_phase = FALSE; + recv_recovery_rollback_active(); + + /* It is possible that file_format tag has never + been set. In this case we initialize it to minimum + value. Important to note that we can do it ONLY after + we have finished the recovery process so that the + image of TRX_SYS_PAGE_NO is not stale. */ + trx_sys_file_format_tag_init(); + } + + if (!create_new_db && sum_of_new_sizes > 0) { + /* New data file(s) were added */ + mtr_start(&mtr); + + fsp_header_inc_size(0, sum_of_new_sizes, &mtr); + + mtr_commit(&mtr); + + /* Immediately write the log record about increased tablespace + size to disk, so that it is durable even if mysqld would crash + quickly */ + + log_buffer_flush_to_disk(); + } + +#ifdef UNIV_LOG_ARCHIVE + /* Archiving is always off under MySQL */ + if (!srv_log_archive_on) { + ut_a(DB_SUCCESS == log_archive_noarchivelog()); + } else { + bool start_archive; + + mutex_enter(&(log_sys->mutex)); + + start_archive = FALSE; + + if (log_sys->archiving_state == LOG_ARCH_OFF) { + start_archive = TRUE; + } + + mutex_exit(&(log_sys->mutex)); + + if (start_archive) { + ut_a(DB_SUCCESS == log_archive_archivelog()); + } + } +#endif /* UNIV_LOG_ARCHIVE */ + + /* fprintf(stderr, "Max allowed record size %lu\n", + page_get_free_space_of_empty() / 2); */ + + if (buf_dblwr == NULL) { + /* Create the doublewrite buffer to a new tablespace */ + + buf_dblwr_create(); + } + + /* Here the double write buffer has already been created and so + any new rollback segments will be allocated after the double + write buffer. The default segment should already exist. + We create the new segments only if it's a new database or + the database was shutdown cleanly. */ + + /* Note: When creating the extra rollback segments during an upgrade + we violate the latching order, even if the change buffer is empty. + We make an exception in sync0sync.cc and check srv_is_being_started + for that violation. It cannot create a deadlock because we are still + running in single threaded mode essentially. Only the IO threads + should be running at this stage. 
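+	(TRX_SYS_N_RSEGS is 128, so at most 128 rollback segments can
+	exist.  Rollback segment 0 always resides in the system
+	tablespace; trx_sys_create_rsegs() spreads the remaining ones
+	round-robin across the undo tablespaces, when any are
+	configured.)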
*/ + + ut_a(srv_undo_logs > 0); + ut_a(srv_undo_logs <= TRX_SYS_N_RSEGS); + + /* The number of rsegs that exist in InnoDB is given by status + variable srv_available_undo_logs. The number of rsegs to use can + be set using the dynamic global variable srv_undo_logs. */ + + srv_available_undo_logs = trx_sys_create_rsegs( + srv_undo_tablespaces, srv_undo_logs); + + if (srv_available_undo_logs == ULINT_UNDEFINED) { + /* Can only happen if server is read only. */ + ut_a(srv_read_only_mode); + srv_undo_logs = ULONG_UNDEFINED; + } + + if (!srv_read_only_mode) { + /* Create the thread which watches the timeouts + for lock waits */ + os_thread_create( + lock_wait_timeout_thread, + NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS); + + /* Create the thread which warns of long semaphore waits */ + os_thread_create( + srv_error_monitor_thread, + NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS); + + /* Create the thread which prints InnoDB monitor info */ + os_thread_create( + srv_monitor_thread, + NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS); + } + + /* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */ + err = dict_create_or_check_foreign_constraint_tables(); + if (err != DB_SUCCESS) { + return(err); + } + + /* Create the SYS_TABLESPACES system table */ + err = dict_create_or_check_sys_tablespace(); + if (err != DB_SUCCESS) { + return(err); + } + + srv_is_being_started = FALSE; + + ut_a(trx_purge_state() == PURGE_STATE_INIT); + + /* Create the master thread which does purge and other utility + operations */ + + if (!srv_read_only_mode) { + + os_thread_create( + srv_master_thread, + NULL, thread_ids + (1 + SRV_MAX_N_IO_THREADS)); + } + + if (!srv_read_only_mode + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) { + + os_thread_create( + srv_purge_coordinator_thread, + NULL, thread_ids + 6 + SRV_MAX_N_IO_THREADS); + + ut_a(UT_ARR_SIZE(thread_ids) + > 6 + srv_n_purge_threads + SRV_MAX_N_IO_THREADS); + + /* We've already created the purge coordinator thread above. */ + for (i = 1; i < srv_n_purge_threads; ++i) { + os_thread_create( + srv_worker_thread, NULL, + thread_ids + 6 + i + SRV_MAX_N_IO_THREADS); + } + + srv_start_wait_for_purge_to_start(); + + } else { + purge_sys->state = PURGE_STATE_DISABLED; + } + + if (!srv_read_only_mode) { + os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); + } + os_thread_create(buf_flush_lru_manager_thread, NULL, NULL); + +#ifdef UNIV_DEBUG + /* buf_debug_prints = TRUE; */ +#endif /* UNIV_DEBUG */ + sum_of_data_file_sizes = 0; + + for (i = 0; i < srv_n_data_files; i++) { + sum_of_data_file_sizes += srv_data_file_sizes[i]; + } + + tablespace_size_in_header = fsp_header_get_tablespace_size(); + + if (!srv_read_only_mode + && !srv_auto_extend_last_data_file + && sum_of_data_file_sizes != tablespace_size_in_header) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: tablespace size" + " stored in header is %lu pages, but\n", + (ulong) tablespace_size_in_header); + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: the sum of data file sizes is %lu pages\n", + (ulong) sum_of_data_file_sizes); + + if (srv_force_recovery == 0 + && sum_of_data_file_sizes < tablespace_size_in_header) { + /* This is a fatal error, the tail of a tablespace is + missing */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot start InnoDB." + " The tail of the system tablespace is\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: missing. 
Have you edited" + " innodb_data_file_path in my.cnf in an\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: inappropriate way, removing" + " ibdata files from there?\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: You can set innodb_force_recovery=1" + " in my.cnf to force\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: a startup if you are trying" + " to recover a badly corrupt database.\n"); + + return(DB_ERROR); + } + } + + if (!srv_read_only_mode + && srv_auto_extend_last_data_file + && sum_of_data_file_sizes < tablespace_size_in_header) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: tablespace size stored in header" + " is %lu pages, but\n", + (ulong) tablespace_size_in_header); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: the sum of data file sizes" + " is only %lu pages\n", + (ulong) sum_of_data_file_sizes); + + if (srv_force_recovery == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot start InnoDB. The tail of" + " the system tablespace is\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: missing. Have you edited" + " innodb_data_file_path in my.cnf in an\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: inappropriate way, removing" + " ibdata files from there?\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: You can set innodb_force_recovery=1" + " in my.cnf to force\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: a startup if you are trying to" + " recover a badly corrupt database.\n"); + + return(DB_ERROR); + } + } + + /* Check that os_fast_mutexes work as expected */ + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &srv_os_test_mutex); + + if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: pthread_mutex_trylock returns" + " an unexpected value on\n"); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: success! Cannot continue.\n"); + exit(1); + } + + os_fast_mutex_unlock(&srv_os_test_mutex); + + os_fast_mutex_lock(&srv_os_test_mutex); + + os_fast_mutex_unlock(&srv_os_test_mutex); + + os_fast_mutex_free(&srv_os_test_mutex); + + if (!srv_file_per_table && srv_pass_corrupt_table) { + fprintf(stderr, "InnoDB: Warning:" + " The option innodb_file_per_table is disabled," + " so using the option innodb_pass_corrupt_table doesn't make sense.\n"); + } + + if (srv_print_verbose_log) { + ib_logf(IB_LOG_LEVEL_INFO, + " Percona XtraDB (http://www.percona.com) %s started; " + "log sequence number " LSN_PF "", + INNODB_VERSION_STR, srv_start_lsn); + } + + if (srv_force_recovery > 0) { + ib_logf(IB_LOG_LEVEL_INFO, + "!!! innodb_force_recovery is set to %lu !!!", + (ulong) srv_force_recovery); + } + + if (srv_force_recovery == 0) { + /* In the insert buffer we may have even bigger tablespace + id's, because we may have dropped those tablespaces, but + insert buffer merge has not had time to clean the records from + the ibuf tree. */ + + ibuf_update_max_tablespace_id(); + } + + if (!srv_read_only_mode) { + /* Create the buffer pool dump/load thread */ + os_thread_create(buf_dump_thread, NULL, NULL); + + /* Create the dict stats gathering thread */ + os_thread_create(dict_stats_thread, NULL, NULL); + + /* Create the thread that will optimize the FTS sub-system. 
*/ + fts_optimize_init(); + } + + srv_was_started = TRUE; + + return(DB_SUCCESS); +} + +#if 0 +/******************************************************************** +Sync all FTS cache before shutdown */ +static +void +srv_fts_close(void) +/*===============*/ +{ + dict_table_t* table; + + for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + table; table = UT_LIST_GET_NEXT(table_LRU, table)) { + fts_t* fts = table->fts; + + if (fts != NULL) { + fts_sync_table(table); + } + } + + for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); + table; table = UT_LIST_GET_NEXT(table_LRU, table)) { + fts_t* fts = table->fts; + + if (fts != NULL) { + fts_sync_table(table); + } + } +} +#endif + +/****************************************************************//** +Shuts down the InnoDB database. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +innobase_shutdown_for_mysql(void) +/*=============================*/ +{ + ulint i; + + if (!srv_was_started) { + if (srv_is_being_started) { + ib_logf(IB_LOG_LEVEL_WARN, + "Shutting down an improperly started, " + "or created database!"); + } + + return(DB_SUCCESS); + } + + if (!srv_read_only_mode) { + /* Shutdown the FTS optimize sub system. */ + fts_optimize_start_shutdown(); + + fts_optimize_end(); + } + + /* 1. Flush the buffer pool to disk, write the current lsn to + the tablespace header(s), and copy all log data to archive. + The step 1 is the real InnoDB shutdown. The remaining steps 2 - ... + just free data structures after the shutdown. */ + + logs_empty_and_mark_files_at_shutdown(); + + if (srv_conc_get_active_threads() != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Query counter shows %ld queries still " + "inside InnoDB at shutdown", + srv_conc_get_active_threads()); + } + + /* 2. Make all threads created by InnoDB to exit */ + + srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS; + + /* All threads end up waiting for certain events. Put those events + to the signaled state. Then the threads will exit themselves after + os_event_wait(). */ + + for (i = 0; i < 1000; i++) { + /* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM + HERE OR EARLIER */ + + if (!srv_read_only_mode) { + /* a. Let the lock timeout thread exit */ + os_event_set(lock_sys->timeout_event); + + /* b. srv error monitor thread exits automatically, + no need to do anything here */ + + /* c. We wake the master thread so that it exits */ + srv_wake_master_thread(); + + /* d. Wakeup purge threads. */ + srv_purge_wakeup(); + } + + /* e. Exit the i/o threads */ + + os_aio_wake_all_threads_at_shutdown(); + + /* f. dict_stats_thread is signaled from + logs_empty_and_mark_files_at_shutdown() and should have + already quit or is quitting right now. */ + + os_mutex_enter(os_sync_mutex); + + if (os_thread_count == 0) { + /* All the threads have exited or are just exiting; + NOTE that the threads may not have completed their + exit yet. Should we use pthread_join() to make sure + they have exited? If we did, we would have to + remove the pthread_detach() from + os_thread_exit(). Now we just sleep 0.1 + seconds and hope that is enough! 
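+	(The enclosing loop retries up to 1000 times, so shutdown
+	waits at most 1000 * 0.1 s = 100 seconds for straggler threads
+	before the warning below is printed.)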
*/ + + os_mutex_exit(os_sync_mutex); + + os_thread_sleep(100000); + + break; + } + + os_mutex_exit(os_sync_mutex); + + os_thread_sleep(100000); + } + + if (i == 1000) { + ib_logf(IB_LOG_LEVEL_WARN, + "%lu threads created by InnoDB" + " had not exited at shutdown!", + (ulong) os_thread_count); + } + + if (srv_monitor_file) { + fclose(srv_monitor_file); + srv_monitor_file = 0; + if (srv_monitor_file_name) { + unlink(srv_monitor_file_name); + mem_free(srv_monitor_file_name); + } + } + + if (srv_dict_tmpfile) { + fclose(srv_dict_tmpfile); + srv_dict_tmpfile = 0; + } + + if (srv_misc_tmpfile) { + fclose(srv_misc_tmpfile); + srv_misc_tmpfile = 0; + } + + if (!srv_read_only_mode) { + dict_stats_thread_deinit(); + } + + /* This must be disabled before closing the buffer pool + and closing the data dictionary. */ + btr_search_disable(); + + ibuf_close(); + log_shutdown(); + lock_sys_close(); + trx_sys_file_format_close(); + trx_sys_close(); + + /* We don't create these mutexes in RO mode because we don't create + the temp files that the cover. */ + if (!srv_read_only_mode) { + mutex_free(&srv_monitor_file_mutex); + mutex_free(&srv_dict_tmpfile_mutex); + mutex_free(&srv_misc_tmpfile_mutex); + } + + dict_close(); + btr_search_sys_free(); + + /* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside + them */ + os_aio_free(); + que_close(); + row_mysql_close(); + srv_mon_free(); + sync_close(); + srv_free(); + fil_close(); + + /* 4. Free the os_conc_mutex and all os_events and os_mutexes */ + + os_sync_free(); + + /* 5. Free all allocated memory */ + + pars_lexer_close(); + log_mem_free(); + buf_pool_free(srv_buf_pool_instances); + mem_close(); + + /* ut_free_all_mem() frees all allocated memory not freed yet + in shutdown, and it will also free the ut_list_mutex, so it + should be the last one for all operation */ + ut_free_all_mem(); + + if (os_thread_count != 0 + || os_event_count != 0 + || os_mutex_count != 0 + || os_fast_mutex_count != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Some resources were not cleaned up in shutdown: " + "threads %lu, events %lu, os_mutexes %lu, " + "os_fast_mutexes %lu", + (ulong) os_thread_count, (ulong) os_event_count, + (ulong) os_mutex_count, (ulong) os_fast_mutex_count); + } + + if (dict_foreign_err_file) { + fclose(dict_foreign_err_file); + } + + if (srv_print_verbose_log) { + ib_logf(IB_LOG_LEVEL_INFO, + "Shutdown completed; log sequence number " LSN_PF "", + srv_shutdown_lsn); + } + + srv_was_started = FALSE; + srv_start_has_been_called = FALSE; + + return(DB_SUCCESS); +} +#endif /* !UNIV_HOTBACKUP */ + + +/******************************************************************** +Signal all per-table background threads to shutdown, and wait for them to do +so. */ +UNIV_INTERN +void +srv_shutdown_table_bg_threads(void) +/*===============================*/ +{ + dict_table_t* table; + dict_table_t* first; + dict_table_t* last = NULL; + + mutex_enter(&dict_sys->mutex); + + /* Signal all threads that they should stop. */ + table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + first = table; + while (table) { + dict_table_t* next; + fts_t* fts = table->fts; + + if (fts != NULL) { + fts_start_shutdown(table, fts); + } + + next = UT_LIST_GET_NEXT(table_LRU, table); + + if (!next) { + last = table; + } + + table = next; + } + + /* We must release dict_sys->mutex here; if we hold on to it in the + loop below, we will deadlock if any of the background threads try to + acquire it (for example, the FTS thread by calling que_eval_sql). 
+ + Releasing it here and going through dict_sys->table_LRU without + holding it is safe because: + + a) MySQL only starts the shutdown procedure after all client + threads have been disconnected and no new ones are accepted, so no + new tables are added or old ones dropped. + + b) Despite its name, the list is not LRU, and the order stays + fixed. + + To safeguard against the above assumptions ever changing, we store + the first and last items in the list above, and then check that + they've stayed the same below. */ + + mutex_exit(&dict_sys->mutex); + + /* Wait for the threads of each table to stop. This is not inside + the above loop, because by signaling all the threads first we can + overlap their shutting down delays. */ + table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + ut_a(first == table); + while (table) { + dict_table_t* next; + fts_t* fts = table->fts; + + if (fts != NULL) { + fts_shutdown(table, fts); + } + + next = UT_LIST_GET_NEXT(table_LRU, table); + + if (table == last) { + ut_a(!next); + } + + table = next; + } +} + +/*****************************************************************//** +Get the meta-data filename from the table name. */ +UNIV_INTERN +void +srv_get_meta_data_filename( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + char* filename, /*!< out: filename */ + ulint max_len) /*!< in: filename max length */ +{ + ulint len; + char* path; + char* suffix; + static const ulint suffix_len = strlen(".cfg"); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, false); + ut_a(table->data_dir_path); + + path = os_file_make_remote_pathname( + table->data_dir_path, table->name, "cfg"); + } else { + path = fil_make_ibd_name(table->name, false); + } + + ut_a(path); + len = ut_strlen(path); + ut_a(max_len >= len); + + suffix = path + (len - suffix_len); + if (strncmp(suffix, ".cfg", suffix_len) == 0) { + strcpy(filename, path); + } else { + ut_ad(strncmp(suffix, ".ibd", suffix_len) == 0); + + strncpy(filename, path, len - suffix_len); + suffix = filename + (len - suffix_len); + strcpy(suffix, ".cfg"); + } + + mem_free(path); + + srv_normalize_path_for_win(filename); +} diff --git a/storage/xtradb/sync/sync0arr.cc b/storage/xtradb/sync/sync0arr.cc new file mode 100644 index 00000000000..c941a59ba0b --- /dev/null +++ b/storage/xtradb/sync/sync0arr.cc @@ -0,0 +1,1225 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file sync/sync0arr.cc +The wait array used in synchronization primitives + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0arr.h" +#ifdef UNIV_NONINL +#include "sync0arr.ic" +#endif + +#include "sync0sync.h" +#include "sync0rw.h" +#include "os0sync.h" +#include "os0file.h" +#include "lock0lock.h" +#include "srv0srv.h" +#include "ha_prototypes.h" + +/* + WAIT ARRAY + ========== + +The wait array consists of cells each of which has an +an operating system event object created for it. The threads +waiting for a mutex, for example, can reserve a cell +in the array and suspend themselves to wait for the event +to become signaled. When using the wait array, remember to make +sure that some thread holding the synchronization object +will eventually know that there is a waiter in the array and +signal the object, to prevent infinite wait. +Why we chose to implement a wait array? First, to make +mutexes fast, we had to code our own implementation of them, +which only in usually uncommon cases resorts to using +slow operating system primitives. Then we had the choice of +assigning a unique OS event for each mutex, which would +be simpler, or using a global wait array. In some operating systems, +the global wait array solution is more efficient and flexible, +because we can do with a very small number of OS events, +say 200. In NT 3.51, allocating events seems to be a quadratic +algorithm, because 10 000 events are created fast, but +100 000 events takes a couple of minutes to create. + +As of 5.0.30 the above mentioned design is changed. Since now +OS can handle millions of wait events efficiently, we no longer +have this concept of each cell of wait array having one event. +Instead, now the event that a thread wants to wait on is embedded +in the wait object (mutex or rw_lock). We still keep the global +wait array for the sake of diagnostics and also to avoid infinite +wait The error_monitor thread scans the global wait array to signal +any waiting threads who have missed the signal. */ + +/** A cell where an individual thread may wait suspended +until a resource is released. The suspending is implemented +using an operating system event semaphore. */ +struct sync_cell_t { + void* wait_object; /*!< pointer to the object the + thread is waiting for; if NULL + the cell is free for use */ + void* old_wait_mutex; /*!< the latest regular or priority + wait mutex in cell */ + void* old_wait_rw_lock; + /*!< the latest regular or priority + wait rw-lock in cell */ + ulint request_type; /*!< lock type requested on the + object */ + const char* file; /*!< in debug version file where + requested */ + ulint line; /*!< in debug version line where + requested */ + os_thread_id_t thread; /*!< thread id of this waiting + thread */ + ibool waiting; /*!< TRUE if the thread has already + called sync_array_event_wait + on this cell */ + ib_int64_t signal_count; /*!< We capture the signal_count + of the wait_object when we + reset the event. This value is + then passed on to os_event_wait + and we wait only if the event + has not been signalled in the + period between the reset and + wait call. 
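+				*/
+
+/* A sketch of the reset/wait protocol that signal_count supports;
+'obj' stands for any mutex or rw-lock with an embedded event and is a
+placeholder, not real code: */
+#if 0
+	ib_int64_t	count = os_event_reset(obj->event);
+
+	/* ... re-test the lock word here; if it is still held ... */
+
+	os_event_wait_low(obj->event, count);
+
+	/* os_event_wait_low() returns immediately if the event was
+	signalled after the reset above, which closes the race window
+	between the re-test and the wait. */
+#endif
+
+/*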
*/ + time_t reservation_time;/*!< time when the thread reserved + the wait cell */ +}; + +/* NOTE: It is allowed for a thread to wait +for an event allocated for the array without owning the +protecting mutex (depending on the case: OS or database mutex), but +all changes (set or reset) to the state of the event must be made +while owning the mutex. */ + +/** Synchronization array */ +struct sync_array_t { + ulint n_reserved; /*!< number of currently reserved + cells in the wait array */ + ulint n_cells; /*!< number of cells in the + wait array */ + sync_cell_t* array; /*!< pointer to wait array */ + ib_mutex_t mutex; /*!< possible database mutex + protecting this data structure */ + os_ib_mutex_t os_mutex; /*!< Possible operating system mutex + protecting the data structure. + As this data structure is used in + constructing the database mutex, + to prevent infinite recursion + in implementation, we fall back to + an OS mutex. */ + ulint res_count; /*!< count of cell reservations + since creation of the array */ +}; + +/** User configured sync array size */ +UNIV_INTERN ulong srv_sync_array_size = 32; + +/** Locally stored copy of srv_sync_array_size */ +static ulint sync_array_size; + +/** The global array of wait cells for implementation of the database's own +mutexes and read-write locks */ +static sync_array_t** sync_wait_array; + +/** count of how many times an object has been signalled */ +static ulint sg_count; + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +This function is called only in the debug version. Detects a deadlock +of one or more threads because of waits of semaphores. +@return TRUE if deadlock detected */ +static +ibool +sync_array_detect_deadlock( +/*=======================*/ + sync_array_t* arr, /*!< in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /*!< in: cell where recursive search started */ + sync_cell_t* cell, /*!< in: cell to search */ + ulint depth); /*!< in: recursion depth */ +#endif /* UNIV_SYNC_DEBUG */ + +/*****************************************************************//** +Gets the nth cell in array. +@return cell */ +static +sync_cell_t* +sync_array_get_nth_cell( +/*====================*/ + sync_array_t* arr, /*!< in: sync array */ + ulint n) /*!< in: index */ +{ + ut_a(arr); + ut_a(n < arr->n_cells); + + return(arr->array + n); +} + +/******************************************************************//** +Reserves the mutex semaphore protecting a sync array. */ +static +void +sync_array_enter( +/*=============*/ + sync_array_t* arr) /*!< in: sync wait array */ +{ + os_mutex_enter(arr->os_mutex); +} + +/******************************************************************//** +Releases the mutex semaphore protecting a sync array. */ +static +void +sync_array_exit( +/*============*/ + sync_array_t* arr) /*!< in: sync wait array */ +{ + os_mutex_exit(arr->os_mutex); +} + +/*******************************************************************//** +Creates a synchronization wait array. It is protected by a mutex +which is automatically reserved when the functions operating on it +are called. 
+@return own: created wait array */ +static +sync_array_t* +sync_array_create( +/*==============*/ + ulint n_cells) /*!< in: number of cells in the array + to create */ +{ + ulint sz; + sync_array_t* arr; + + ut_a(n_cells > 0); + + /* Allocate memory for the data structures */ + arr = static_cast<sync_array_t*>(ut_malloc(sizeof(*arr))); + memset(arr, 0x0, sizeof(*arr)); + + sz = sizeof(sync_cell_t) * n_cells; + arr->array = static_cast<sync_cell_t*>(ut_malloc(sz)); + memset(arr->array, 0x0, sz); + + arr->n_cells = n_cells; + + /* Then create the mutex to protect the wait array complex */ + arr->os_mutex = os_mutex_create(); + + return(arr); +} + +/******************************************************************//** +Frees the resources in a wait array. */ +static +void +sync_array_free( +/*============*/ + sync_array_t* arr) /*!< in, own: sync wait array */ +{ + ut_a(arr->n_reserved == 0); + + sync_array_validate(arr); + + /* Release the mutex protecting the wait array complex */ + + os_mutex_free(arr->os_mutex); + + ut_free(arr->array); + ut_free(arr); +} + +/********************************************************************//** +Validates the integrity of the wait array. Checks +that the number of reserved cells equals the count variable. */ +UNIV_INTERN +void +sync_array_validate( +/*================*/ + sync_array_t* arr) /*!< in: sync wait array */ +{ + ulint i; + sync_cell_t* cell; + ulint count = 0; + + sync_array_enter(arr); + + for (i = 0; i < arr->n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + if (cell->wait_object != NULL) { + count++; + } + } + + ut_a(count == arr->n_reserved); + + sync_array_exit(arr); +} + +/*******************************************************************//** +Returns the event that the thread owning the cell waits for. */ +static +os_event_t +sync_cell_get_event( +/*================*/ + sync_cell_t* cell) /*!< in: non-empty sync array cell */ +{ + ulint type = cell->request_type; + + if (type == SYNC_MUTEX) { + return(((ib_mutex_t*) cell->wait_object)->event); + } else if (type == SYNC_PRIO_MUTEX) { + return(((ib_prio_mutex_t*) cell->wait_object) + ->high_priority_event); + } else if (type == RW_LOCK_WAIT_EX) { + return(((rw_lock_t*) cell->wait_object)->wait_ex_event); + } else if (type == PRIO_RW_LOCK_SHARED) { + return(((prio_rw_lock_t *) cell->wait_object) + ->high_priority_s_event); + } else if (type == PRIO_RW_LOCK_EX) { + return(((prio_rw_lock_t *) cell->wait_object) + ->high_priority_x_event); + } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */ + ut_ad(type == RW_LOCK_SHARED || type == RW_LOCK_EX); + return(((rw_lock_t*) cell->wait_object)->event); + } +} + +/******************************************************************//** +Reserves a wait array cell for waiting for an object. +The event of the cell is reset to nonsignalled state. +@return true if free cell is found, otherwise false */ +UNIV_INTERN +bool +sync_array_reserve_cell( +/*====================*/ + sync_array_t* arr, /*!< in: wait array */ + void* object, /*!< in: pointer to the object to wait for */ + ulint type, /*!< in: lock request type */ + const char* file, /*!< in: file where requested */ + ulint line, /*!< in: line where requested */ + ulint* index) /*!< out: index of the reserved cell */ +{ + sync_cell_t* cell; + os_event_t event; + ulint i; + + ut_a(object); + ut_a(index); + + sync_array_enter(arr); + + arr->res_count++; + + /* Reserve a new cell. 
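+	*/
+
+	/* A sketch of the caller-side protocol around this function;
+	'mutex' stands for a hypothetical ib_mutex_t that the caller
+	failed to acquire by spinning: */
+#if 0
+	ulint	index;
+
+	if (sync_array_reserve_cell(arr, mutex, SYNC_MUTEX,
+				    __FILE__, __LINE__, &index)) {
+		/* re-test the mutex once more before blocking; the
+		cell is freed inside sync_array_wait_event() when the
+		wait ends */
+		sync_array_wait_event(arr, index);
+	}
+#endif
+
+	/* Linear scan for a free cell: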
*/ + for (i = 0; i < arr->n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object == NULL) { + + cell->waiting = FALSE; + cell->wait_object = object; + + if (type == SYNC_MUTEX || type == SYNC_PRIO_MUTEX) { + cell->old_wait_mutex = object; + } else { + cell->old_wait_rw_lock = object; + } + + cell->request_type = type; + + cell->file = file; + cell->line = line; + + arr->n_reserved++; + + *index = i; + + sync_array_exit(arr); + + /* Make sure the event is reset and also store + the value of signal_count at which the event + was reset. */ + event = sync_cell_get_event(cell); + cell->signal_count = os_event_reset(event); + + cell->reservation_time = ut_time(); + + cell->thread = os_thread_get_curr_id(); + + return(true); + } + } + + /* No free cell found */ + return false; +} + +/******************************************************************//** +This function should be called when a thread starts to wait on +a wait array cell. In the debug version this function checks +if the wait for a semaphore will result in a deadlock, in which +case prints info and asserts. */ +UNIV_INTERN +void +sync_array_wait_event( +/*==================*/ + sync_array_t* arr, /*!< in: wait array */ + ulint index) /*!< in: index of the reserved cell */ +{ + sync_cell_t* cell; + os_event_t event; + + ut_a(arr); + + sync_array_enter(arr); + + cell = sync_array_get_nth_cell(arr, index); + + ut_a(cell->wait_object); + ut_a(!cell->waiting); + ut_ad(os_thread_get_curr_id() == cell->thread); + + event = sync_cell_get_event(cell); + cell->waiting = TRUE; + +#ifdef UNIV_SYNC_DEBUG + + /* We use simple enter to the mutex below, because if + we cannot acquire it at once, mutex_enter would call + recursively sync_array routines, leading to trouble. + rw_lock_debug_mutex freezes the debug lists. */ + + rw_lock_debug_mutex_enter(); + + if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) { + + fputs("########################################\n", stderr); + ut_error; + } + + rw_lock_debug_mutex_exit(); +#endif + sync_array_exit(arr); + + os_event_wait_low(event, cell->signal_count); + + sync_array_free_cell(arr, index); +} + +/******************************************************************//** +Reports info of a wait array cell. 
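
Pulled out of the InnoDB specifics, the reserve/re-check/wait dance of
sync_array_reserve_cell() and sync_array_wait_event() above has this shape
(a sketch only, reusing event_t from the earlier example; latch_t and its
lock_word convention are hypothetical):

    #include <stdatomic.h>
    #include <stdbool.h>

    typedef struct {
        atomic_int lock_word;   /* 1 = free, 0 = held (hypothetical) */
        event_t    event;       /* event_t from the sketch above */
    } latch_t;

    static bool try_acquire(latch_t *l)
    {
        int expected = 1;
        return atomic_compare_exchange_strong(&l->lock_word, &expected, 0);
    }

    static void latch_acquire(latch_t *l)
    {
        while (!try_acquire(l)) {
            /* 1) Reserve: remember the event generation *before* the
                  final check (sync_array_reserve_cell, os_event_reset). */
            unsigned long gen = event_reset(&l->event);

            /* 2) Re-check: a release between (1) and (3) bumps the
                  generation, so the wait below returns immediately. */
            if (try_acquire(l))
                return;                   /* sync_array_free_cell */

            /* 3) Sleep (sync_array_wait_event). */
            event_wait(&l->event, gen);
        }
    }

    static void latch_release(latch_t *l)
    {
        atomic_store(&l->lock_word, 1);
        event_set(&l->event);             /* wake any waiters */
    }

Step (2) is why the event is reset inside the reservation, before the
caller's final availability check, rather than after it.
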
*/ +static +void +sync_array_cell_print( +/*==================*/ + FILE* file, /*!< in: file where to print */ + sync_cell_t* cell) /*!< in: sync cell */ +{ + ib_mutex_t* mutex; + ib_prio_mutex_t* prio_mutex; + rw_lock_t* rwlock; + prio_rw_lock_t* prio_rwlock = NULL; + ulint type; + ulint writer; + + type = cell->request_type; + + fprintf(file, + "--Thread %lu has waited at %s line %lu" + " for %#.5g seconds the semaphore:\n", + (ulong) os_thread_pf(cell->thread), + innobase_basename(cell->file), (ulong) cell->line, + difftime(time(NULL), cell->reservation_time)); + + if (type == SYNC_MUTEX || type == SYNC_PRIO_MUTEX) { + /* We use old_wait_mutex in case the cell has already + been freed meanwhile */ + if (type == SYNC_MUTEX) { + + mutex = static_cast<ib_mutex_t*>(cell->old_wait_mutex); + } else { + + prio_mutex = static_cast<ib_prio_mutex_t*> + (cell->old_wait_mutex); + mutex = &prio_mutex->base_mutex; + } + + + fprintf(file, + "Mutex at %p '%s', lock var %lu\n" +#ifdef UNIV_SYNC_DEBUG + "Last time reserved in file %s line %lu, " +#endif /* UNIV_SYNC_DEBUG */ + "waiters flag %lu\n", + (void*) mutex, mutex->cmutex_name, + (ulong) mutex->lock_word, +#ifdef UNIV_SYNC_DEBUG + mutex->file_name, (ulong) mutex->line, +#endif /* UNIV_SYNC_DEBUG */ + (ulong) mutex->waiters); + + if (type == SYNC_PRIO_MUTEX) { + + fprintf(file, + "high-priority waiters count %lu\n", + (ulong) prio_mutex->high_priority_waiters); + } + + } else if (type == RW_LOCK_EX + || type == RW_LOCK_WAIT_EX + || type == RW_LOCK_SHARED + || type == PRIO_RW_LOCK_SHARED + || type == PRIO_RW_LOCK_EX) { + + fputs((type == RW_LOCK_EX || type == PRIO_RW_LOCK_EX) + ? "X-lock on" + : type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on" + : "S-lock on", file); + + /* Currently we are unable to tell high priority + RW_LOCK_WAIT_EX waiter from a regular priority one. Assume + it's a regular one. */ + if (type == RW_LOCK_EX || type == RW_LOCK_WAIT_EX + || type == RW_LOCK_SHARED) { + + rwlock = static_cast<rw_lock_t *> + (cell->old_wait_rw_lock); + } else { + + prio_rwlock = static_cast<prio_rw_lock_t *> + (cell->old_wait_rw_lock); + rwlock = &prio_rwlock->base_lock; + } + + fprintf(file, + " RW-latch at %p '%s'\n", + (void*) rwlock, rwlock->lock_name); + writer = rw_lock_get_writer(rwlock); + if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has" + " reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), + writer == RW_LOCK_EX + ? " exclusive\n" + : " wait exclusive\n"); + } + + fprintf(file, + "number of readers %lu, waiters flag %lu, " + "lock_word: %lx\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", + (ulong) rw_lock_get_reader_count(rwlock), + (ulong) rwlock->waiters, + rwlock->lock_word, + innobase_basename(rwlock->last_s_file_name), + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, + (ulong) rwlock->last_x_line); + if (prio_rwlock) { + fprintf(file, "high priority S waiters count %lu, " + "high priority X waiters count %lu, " + "wait-exclusive waiter is " + "high priority if exists: %lu\n", + prio_rwlock->high_priority_s_waiters, + prio_rwlock->high_priority_x_waiters, + prio_rwlock->high_priority_wait_ex_waiter); + } + } else { + ut_error; + } + + if (!cell->waiting) { + fputs("wait has ended\n", file); + } +} + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Looks for a cell with the given thread id. 
+@return pointer to cell or NULL if not found */ +static +sync_cell_t* +sync_array_find_thread( +/*===================*/ + sync_array_t* arr, /*!< in: wait array */ + os_thread_id_t thread) /*!< in: thread id */ +{ + ulint i; + sync_cell_t* cell; + + for (i = 0; i < arr->n_cells; i++) { + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL + && os_thread_eq(cell->thread, thread)) { + + return(cell); /* Found */ + } + } + + return(NULL); /* Not found */ +} + +/******************************************************************//** +Recursion step for deadlock detection. +@return TRUE if deadlock detected */ +static +ibool +sync_array_deadlock_step( +/*=====================*/ + sync_array_t* arr, /*!< in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /*!< in: cell where recursive search + started */ + os_thread_id_t thread, /*!< in: thread to look at */ + ulint pass, /*!< in: pass value */ + ulint depth) /*!< in: recursion depth */ +{ + sync_cell_t* new_cell; + + if (pass != 0) { + /* If pass != 0, then we do not know which threads are + responsible of releasing the lock, and no deadlock can + be detected. */ + + return(FALSE); + } + + new_cell = sync_array_find_thread(arr, thread); + + if (new_cell == start) { + /* Deadlock */ + fputs("########################################\n" + "DEADLOCK of threads detected!\n", stderr); + + return(TRUE); + + } else if (new_cell) { + return(sync_array_detect_deadlock( + arr, start, new_cell, depth + 1)); + } + return(FALSE); +} + +/******************************************************************//** +This function is called only in the debug version. Detects a deadlock +of one or more threads because of waits of semaphores. +@return TRUE if deadlock detected */ +static +ibool +sync_array_detect_deadlock( +/*=======================*/ + sync_array_t* arr, /*!< in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /*!< in: cell where recursive search started */ + sync_cell_t* cell, /*!< in: cell to search */ + ulint depth) /*!< in: recursion depth */ +{ + ib_mutex_t* mutex; + rw_lock_t* lock; + os_thread_id_t thread; + ibool ret; + rw_lock_debug_t*debug; + + ut_a(arr); + ut_a(start); + ut_a(cell); + ut_ad(cell->wait_object); + ut_ad(os_thread_get_curr_id() == start->thread); + ut_ad(depth < 100); + + depth++; + + if (!cell->waiting) { + + return(FALSE); /* No deadlock here */ + } + + if (cell->request_type == SYNC_MUTEX + || cell->request_type == SYNC_PRIO_MUTEX) { + + if (cell->request_type == SYNC_MUTEX) { + mutex = static_cast<ib_mutex_t*>(cell->wait_object); + } else { + mutex = &(static_cast<ib_prio_mutex_t*>( + cell->wait_object))->base_mutex; + } + + if (mutex_get_lock_word(mutex) != 0) { + + thread = mutex->thread_id; + + /* Note that mutex->thread_id above may be + also OS_THREAD_ID_UNDEFINED, because the + thread which held the mutex maybe has not + yet updated the value, or it has already + released the mutex: in this case no deadlock + can occur, as the wait array cannot contain + a thread with ID_UNDEFINED value. 
*/ + + ret = sync_array_deadlock_step(arr, start, thread, 0, + depth); + if (ret) { + fprintf(stderr, + "Mutex %p owned by thread %lu file %s line %lu\n", + mutex, (ulong) os_thread_pf(mutex->thread_id), + mutex->file_name, (ulong) mutex->line); + sync_array_cell_print(stderr, cell); + + return(TRUE); + } + } + + return(FALSE); /* No deadlock */ + + } else if (cell->request_type == RW_LOCK_EX + || cell->request_type == PRIO_RW_LOCK_EX + || cell->request_type == RW_LOCK_WAIT_EX) { + + lock = static_cast<rw_lock_t*>(cell->wait_object); + + for (debug = UT_LIST_GET_FIRST(lock->debug_list); + debug != 0; + debug = UT_LIST_GET_NEXT(list, debug)) { + + thread = debug->thread_id; + + if (((debug->lock_type == RW_LOCK_EX) + && !os_thread_eq(thread, cell->thread)) + || ((debug->lock_type == RW_LOCK_WAIT_EX) + && !os_thread_eq(thread, cell->thread)) + || (debug->lock_type == RW_LOCK_SHARED)) { + + /* The (wait) x-lock request can block + infinitely only if someone (can be also cell + thread) is holding s-lock, or someone + (cannot be cell thread) (wait) x-lock, and + he is blocked by start thread */ + + ret = sync_array_deadlock_step( + arr, start, thread, debug->pass, + depth); + if (ret) { +print: + fprintf(stderr, "rw-lock %p ", + (void*) lock); + sync_array_cell_print(stderr, cell); + rw_lock_debug_print(stderr, debug); + return(TRUE); + } + } + } + + return(FALSE); + + } else if (cell->request_type == RW_LOCK_SHARED + || cell->request_type == PRIO_RW_LOCK_SHARED) { + + lock = static_cast<rw_lock_t*>(cell->wait_object); + + for (debug = UT_LIST_GET_FIRST(lock->debug_list); + debug != 0; + debug = UT_LIST_GET_NEXT(list, debug)) { + + thread = debug->thread_id; + + if ((debug->lock_type == RW_LOCK_EX) + || (debug->lock_type == RW_LOCK_WAIT_EX)) { + + /* The s-lock request can block infinitely + only if someone (can also be cell thread) is + holding (wait) x-lock, and he is blocked by + start thread */ + + ret = sync_array_deadlock_step( + arr, start, thread, debug->pass, + depth); + if (ret) { + goto print; + } + } + } + + return(FALSE); + + } else { + ut_error; + } + + return(TRUE); /* Execution never reaches this line: for compiler + fooling only */ +} +#endif /* UNIV_SYNC_DEBUG */ + +/******************************************************************//** +Determines if we can wake up the thread waiting for a sempahore. */ +static +ibool +sync_arr_cell_can_wake_up( +/*======================*/ + sync_cell_t* cell) /*!< in: cell to search */ +{ + ib_mutex_t* mutex; + rw_lock_t* lock; + + if (cell->request_type == SYNC_MUTEX + || cell->request_type == SYNC_PRIO_MUTEX) { + + if (cell->request_type == SYNC_MUTEX) { + mutex = static_cast<ib_mutex_t*>(cell->wait_object); + } else { + mutex = &(static_cast<ib_prio_mutex_t*>( + cell->wait_object))->base_mutex; + } + + os_rmb; + if (mutex_get_lock_word(mutex) == 0) { + + return(TRUE); + } + + } else if (cell->request_type == RW_LOCK_EX + || cell->request_type == PRIO_RW_LOCK_EX) { + + lock = static_cast<rw_lock_t*>(cell->wait_object); + + os_rmb; + if (lock->lock_word > 0) { + /* Either unlocked or only read locked. 
*/ + + return(TRUE); + } + + } else if (cell->request_type == RW_LOCK_WAIT_EX) { + + lock = static_cast<rw_lock_t*>(cell->wait_object); + + /* lock_word == 0 means all readers have left */ + os_rmb; + if (lock->lock_word == 0) { + + return(TRUE); + } + } else if (cell->request_type == RW_LOCK_SHARED + || cell->request_type == PRIO_RW_LOCK_SHARED) { + lock = static_cast<rw_lock_t*>(cell->wait_object); + + /* lock_word > 0 means no writer or reserved writer */ + os_rmb; + if (lock->lock_word > 0) { + + return(TRUE); + } + } else { + + ut_error; + } + + return(FALSE); +} + +/******************************************************************//** +Frees the cell. NOTE! sync_array_wait_event frees the cell +automatically! */ +UNIV_INTERN +void +sync_array_free_cell( +/*=================*/ + sync_array_t* arr, /*!< in: wait array */ + ulint index) /*!< in: index of the cell in array */ +{ + sync_cell_t* cell; + + sync_array_enter(arr); + + cell = sync_array_get_nth_cell(arr, index); + + ut_a(cell->wait_object != NULL); + + cell->waiting = FALSE; + cell->wait_object = NULL; + cell->signal_count = 0; + + ut_a(arr->n_reserved > 0); + arr->n_reserved--; + + sync_array_exit(arr); +} + +/**********************************************************************//** +Increments the signalled count. */ +UNIV_INTERN +void +sync_array_object_signalled(void) +/*=============================*/ +{ +#ifdef HAVE_ATOMIC_BUILTINS + (void) os_atomic_increment_ulint(&sg_count, 1); +#else + ++sg_count; +#endif /* HAVE_ATOMIC_BUILTINS */ +} + +/**********************************************************************//** +If the wakeup algorithm does not work perfectly at semaphore relases, +this function will do the waking (see the comment in mutex_exit). This +function should be called about every 1 second in the server. + +Note that there's a race condition between this thread and mutex_exit +changing the lock_word and calling signal_object, so sometimes this finds +threads to wake up even when nothing has gone wrong. */ +static +void +sync_array_wake_threads_if_sema_free_low( +/*=====================================*/ + sync_array_t* arr) /* in/out: wait array */ +{ + ulint i = 0; + ulint count; + + sync_array_enter(arr); + + for (count = 0; count < arr->n_reserved; ++i) { + sync_cell_t* cell; + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL) { + + count++; + + if (sync_arr_cell_can_wake_up(cell)) { + os_event_t event; + + event = sync_cell_get_event(cell); + + os_event_set(event); + } + } + } + + sync_array_exit(arr); +} + +/**********************************************************************//** +If the wakeup algorithm does not work perfectly at semaphore relases, +this function will do the waking (see the comment in mutex_exit). This +function should be called about every 1 second in the server. + +Note that there's a race condition between this thread and mutex_exit +changing the lock_word and calling signal_object, so sometimes this finds +threads to wake up even when nothing has gone wrong. */ +UNIV_INTERN +void +sync_arr_wake_threads_if_sema_free(void) +/*====================================*/ +{ + ulint i; + + for (i = 0; i < sync_array_size; ++i) { + + sync_array_wake_threads_if_sema_free_low( + sync_wait_array[i]); + } +} + +/**********************************************************************//** +Prints warnings of long semaphore waits to stderr. 
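
In UNIV_SYNC_DEBUG builds, sync_array_detect_deadlock() above is a
depth-limited walk of the wait-for graph: the waiting cell points at the
semaphore's holder, the holder may itself be waiting in another cell, and
arriving back at the starting cell proves a cycle. Reduced to its skeleton
(the holder_of encoding is invented for the example):

    #include <stdbool.h>

    enum { MAX_DEPTH = 100 };   /* mirrors ut_ad(depth < 100) above */

    /* holder_of[t] is the thread whose semaphore t is waiting for,
       or -1 if t is not waiting (the real code derives this from
       mutex->thread_id or the rw-lock debug_list). */
    static bool deadlock_step(const int *holder_of, int start, int t,
                              int depth)
    {
        if (t < 0 || depth > MAX_DEPTH)
            return false;            /* not waiting, or gave up */
        if (t == start)
            return true;             /* walked back to start: cycle */
        return deadlock_step(holder_of, start, holder_of[t], depth + 1);
    }

    static bool detects_deadlock(const int *holder_of, int start)
    {
        return deadlock_step(holder_of, start, holder_of[start], 1);
    }
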
+@return TRUE if fatal semaphore wait threshold was exceeded */ +static +ibool +sync_array_print_long_waits_low( +/*============================*/ + sync_array_t* arr, /*!< in: sync array instance */ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema, /*!< out: longest-waited-for semaphore */ + ibool* noticed)/*!< out: TRUE if long wait noticed */ +{ + ulint i; + ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; + ibool fatal = FALSE; + double longest_diff = 0; + + /* For huge tables, skip the check during CHECK TABLE etc... */ + if (fatal_timeout > SRV_SEMAPHORE_WAIT_EXTENSION) { + return(FALSE); + } + +#ifdef UNIV_DEBUG_VALGRIND + /* Increase the timeouts if running under valgrind because it executes + extremely slowly. UNIV_DEBUG_VALGRIND does not necessary mean that + we are running under valgrind but we have no better way to tell. + See Bug#58432 innodb.innodb_bug56143 fails under valgrind + for an example */ +# define SYNC_ARRAY_TIMEOUT 2400 + fatal_timeout *= 10; +#else +# define SYNC_ARRAY_TIMEOUT 240 +#endif + + for (i = 0; i < arr->n_cells; i++) { + + double diff; + sync_cell_t* cell; + void* wait_object; + + cell = sync_array_get_nth_cell(arr, i); + + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + diff = difftime(time(NULL), cell->reservation_time); + + if (diff > SYNC_ARRAY_TIMEOUT) { + fputs("InnoDB: Warning: a long semaphore wait:\n", + stderr); + sync_array_cell_print(stderr, cell); + *noticed = TRUE; + } + + if (diff > fatal_timeout) { + fatal = TRUE; + } + + if (diff > longest_diff) { + longest_diff = diff; + *sema = wait_object; + *waiter = cell->thread; + } + } + +#undef SYNC_ARRAY_TIMEOUT + + return(fatal); +} + +/**********************************************************************//** +Prints warnings of long semaphore waits to stderr. +@return TRUE if fatal semaphore wait threshold was exceeded */ +UNIV_INTERN +ibool +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ +{ + ulint i; + ibool fatal = FALSE; + ibool noticed = FALSE; + + for (i = 0; i < sync_array_size; ++i) { + + sync_array_t* arr = sync_wait_array[i]; + + sync_array_enter(arr); + + if (sync_array_print_long_waits_low( + arr, waiter, sema, ¬iced)) { + + fatal = TRUE; + } + + sync_array_exit(arr); + } + + if (noticed) { + ibool old_val; + + fprintf(stderr, + "InnoDB: ###### Starts InnoDB Monitor" + " for 30 secs to print diagnostic info:\n"); + + old_val = srv_print_innodb_monitor; + + /* If some crucial semaphore is reserved, then also the InnoDB + Monitor can hang, and we do not get diagnostics. Since in + many cases an InnoDB hang is caused by a pwrite() or a pread() + call hanging inside the operating system, let us print right + now the values of pending calls of these. */ + + fprintf(stderr, + "InnoDB: Pending preads %lu, pwrites %lu\n", + (ulong) os_file_n_pending_preads, + (ulong) os_file_n_pending_pwrites); + + srv_print_innodb_monitor = TRUE; + os_event_set(lock_sys->timeout_event); + + os_thread_sleep(30000000); + + srv_print_innodb_monitor = static_cast<my_bool>(old_val); + fprintf(stderr, + "InnoDB: ###### Diagnostic info printed" + " to the standard error stream\n"); + } + + return(fatal); +} + +/**********************************************************************//** +Prints info of the wait array. 
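
The threshold logic above boils down to one difftime() comparison per
reserved cell, with a warn level and a fatal level. A sketch (the constants
are illustrative; the real warn level is the SYNC_ARRAY_TIMEOUT macro):

    #include <stdio.h>
    #include <stdbool.h>
    #include <time.h>

    #define WARN_SECS 240.0     /* cf. SYNC_ARRAY_TIMEOUT */

    /* Returns true if the wait exceeded the fatal threshold; also
       tracks the longest wait seen so far, as the out-parameters of
       sync_array_print_long_waits_low() do. */
    static bool check_wait(time_t reserved_at, double fatal_secs,
                           double *longest)
    {
        double diff = difftime(time(NULL), reserved_at);

        if (diff > WARN_SECS)
            fprintf(stderr,
                    "Warning: a long semaphore wait: %.0f s\n", diff);
        if (diff > *longest)
            *longest = diff;
        return diff > fatal_secs;
    }
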
*/ +static +void +sync_array_print_info_low( +/*======================*/ + FILE* file, /*!< in: file where to print */ + sync_array_t* arr) /*!< in: wait array */ +{ + ulint i; + ulint count = 0; + + fprintf(file, + "OS WAIT ARRAY INFO: reservation count %ld\n", + (long) arr->res_count); + + for (i = 0; count < arr->n_reserved; ++i) { + sync_cell_t* cell; + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL) { + count++; + sync_array_cell_print(file, cell); + } + } +} + +/**********************************************************************//** +Prints info of the wait array. */ +static +void +sync_array_print_info( +/*==================*/ + FILE* file, /*!< in: file where to print */ + sync_array_t* arr) /*!< in: wait array */ +{ + sync_array_enter(arr); + + sync_array_print_info_low(file, arr); + + sync_array_exit(arr); +} + +/**********************************************************************//** +Create the primary system wait array(s), they are protected by an OS mutex */ +UNIV_INTERN +void +sync_array_init( +/*============*/ + ulint n_threads) /*!< in: Number of slots to + create in all arrays */ +{ + ulint i; + ulint n_slots; + + ut_a(sync_wait_array == NULL); + ut_a(srv_sync_array_size > 0); + ut_a(n_threads > 0); + + sync_array_size = srv_sync_array_size; + + /* We have to use ut_malloc() because the mutex infrastructure + hasn't been initialised yet. It is required by mem_alloc() and + the heap functions. */ + + sync_wait_array = static_cast<sync_array_t**>( + ut_malloc(sizeof(*sync_wait_array) * sync_array_size)); + + n_slots = 1 + (n_threads - 1) / sync_array_size; + + for (i = 0; i < sync_array_size; ++i) { + + sync_wait_array[i] = sync_array_create(n_slots); + } +} + +/**********************************************************************//** +Close sync array wait sub-system. */ +UNIV_INTERN +void +sync_array_close(void) +/*==================*/ +{ + ulint i; + + for (i = 0; i < sync_array_size; ++i) { + sync_array_free(sync_wait_array[i]); + } + + ut_free(sync_wait_array); + sync_wait_array = NULL; +} + +/**********************************************************************//** +Print info about the sync array(s). */ +UNIV_INTERN +void +sync_array_print( +/*=============*/ + FILE* file) /*!< in/out: Print to this stream */ +{ + ulint i; + + for (i = 0; i < sync_array_size; ++i) { + sync_array_print_info(file, sync_wait_array[i]); + } + + fprintf(file, + "OS WAIT ARRAY INFO: signal count %ld\n", (long) sg_count); + +} + +/**********************************************************************//** +Get an instance of the sync wait array. */ +UNIV_INTERN +sync_array_t* +sync_array_get(void) +/*================*/ +{ + ulint i; + static ulint count; + +#ifdef HAVE_ATOMIC_BUILTINS + i = os_atomic_increment_ulint(&count, 1); +#else + i = count++; +#endif /* HAVE_ATOMIC_BUILTINS */ + + return(sync_wait_array[i % sync_array_size]); +} diff --git a/storage/xtradb/sync/sync0rw.cc b/storage/xtradb/sync/sync0rw.cc new file mode 100644 index 00000000000..a72730e1877 --- /dev/null +++ b/storage/xtradb/sync/sync0rw.cc @@ -0,0 +1,1295 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. 
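
Two details of the array bookkeeping just shown are worth isolating:
sync_array_init() sizes each instance with a ceiling division, so the
instances together always hold at least n_threads cells, and
sync_array_get() spreads callers across instances with an atomic ticket so
that unrelated waiters rarely contend on the same array mutex. A sketch
(names invented):

    #include <stdatomic.h>

    /* ceil(n_threads / n_arrays); requires n_threads > 0, as the
       ut_a() in sync_array_init() enforces. */
    static unsigned long slots_per_array(unsigned long n_threads,
                                         unsigned long n_arrays)
    {
        return 1 + (n_threads - 1) / n_arrays;
    }

    static atomic_ulong ticket;

    static unsigned long pick_array(unsigned long n_arrays)
    {
        return atomic_fetch_add(&ticket, 1) % n_arrays;
    }
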
The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file sync/sync0rw.cc +The read-write lock (for thread synchronization) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0rw.h" +#ifdef UNIV_NONINL +#include "sync0rw.ic" +#include "sync0arr.ic" +#endif + +#include "os0thread.h" +#include "mem0mem.h" +#include "srv0srv.h" +#include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */ +#include "ha_prototypes.h" + +/* + IMPLEMENTATION OF THE RW_LOCK + ============================= +The status of a rw_lock is held in lock_word. The initial value of lock_word is +X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR +for each x-lock. This describes the lock state for each value of lock_word: + +lock_word == X_LOCK_DECR: Unlocked. +0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers. + (X_LOCK_DECR - lock_word) is the + number of readers that hold the lock. +lock_word == 0: Write locked +-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer. + (-lock_word) is the number of readers + that hold the lock. +lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been + decremented by X_LOCK_DECR for the first lock + and the first recursive lock, then by 1 for + each recursive lock thereafter. + So the number of locks is: + (lock_copy == 0) ? 1 : 2 - (lock_copy + X_LOCK_DECR) + +The lock_word is always read and updated atomically and consistently, so that +it always represents the state of the lock, and the state of the lock changes +with a single atomic operation. This lock_word holds all of the information +that a thread needs in order to determine if it is eligible to gain the lock +or if it must spin or sleep. The one exception to this is that writer_thread +must be verified before recursive write locks: to solve this scenario, we make +writer_thread readable by all threads, but only writeable by the x-lock holder. + +The other members of the lock obey the following rules to remain consistent: + +recursive: This and the writer_thread field together control the + behaviour of recursive x-locking. + lock->recursive must be FALSE in following states: + 1) The writer_thread contains garbage i.e.: the + lock has just been initialized. + 2) The lock is not x-held and there is no + x-waiter waiting on WAIT_EX event. + 3) The lock is x-held or there is an x-waiter + waiting on WAIT_EX event but the 'pass' value + is non-zero. + lock->recursive is TRUE iff: + 1) The lock is x-held or there is an x-waiter + waiting on WAIT_EX event and the 'pass' value + is zero. 
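
The lock_word state table above translates mechanically into a decoder like
the following (the enum names and helper are illustrative; X_LOCK_DECR here
stands in for the constant defined in sync0rw.h):

    #define X_LOCK_DECR 0x00100000L     /* illustrative stand-in */

    enum rw_state {
        RW_UNLOCKED,
        RW_READ_LOCKED,              /* no waiting writer   */
        RW_WRITE_LOCKED,
        RW_READ_LOCKED_WAIT_X,       /* plus waiting writer */
        RW_WRITE_LOCKED_RECURSIVE
    };

    static enum rw_state decode(long lock_word, long *n_readers)
    {
        *n_readers = 0;
        if (lock_word == X_LOCK_DECR)
            return RW_UNLOCKED;
        if (lock_word > 0) {
            *n_readers = X_LOCK_DECR - lock_word;
            return RW_READ_LOCKED;
        }
        if (lock_word == 0)
            return RW_WRITE_LOCKED;
        if (lock_word > -X_LOCK_DECR) {
            *n_readers = -lock_word;
            return RW_READ_LOCKED_WAIT_X;
        }
        return RW_WRITE_LOCKED_RECURSIVE;
    }
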
+ This flag must be set after the writer_thread field + has been updated with a memory ordering barrier. + It is unset before the lock_word has been incremented. +writer_thread: Is used only in recursive x-locking. Can only be safely + read iff lock->recursive flag is TRUE. + This field is uninitialized at lock creation time and + is updated atomically when x-lock is acquired or when + move_ownership is called. A thread is only allowed to + set the value of this field to it's thread_id i.e.: a + thread cannot set writer_thread to some other thread's + id. +waiters: May be set to 1 anytime, but to avoid unnecessary wake-up + signals, it should only be set to 1 when there are threads + waiting on event. Must be 1 when a writer starts waiting to + ensure the current x-locking thread sends a wake-up signal + during unlock. May only be reset to 0 immediately before a + a wake-up signal is sent to event. On most platforms, a + memory barrier is required after waiters is set, and before + verifying lock_word is still held, to ensure some unlocker + really does see the flags new value. +event: Threads wait on event for read or writer lock when another + thread has an x-lock or an x-lock reservation (wait_ex). A + thread may only wait on event after performing the following + actions in order: + (1) Record the counter value of event (with os_event_reset). + (2) Set waiters to 1. + (3) Verify lock_word <= 0. + (1) must come before (2) to ensure signal is not missed. + (2) must come before (3) to ensure a signal is sent. + These restrictions force the above ordering. + Immediately before sending the wake-up signal, we should: + (1) Verify lock_word == X_LOCK_DECR (unlocked) + (2) Reset waiters to 0. +wait_ex_event: A thread may only wait on the wait_ex_event after it has + performed the following actions in order: + (1) Decrement lock_word by X_LOCK_DECR. + (2) Record counter value of wait_ex_event (os_event_reset, + called from sync_array_reserve_cell). + (3) Verify that lock_word < 0. + (1) must come first to ensures no other threads become reader + or next writer, and notifies unlocker that signal must be sent. + (2) must come before (3) to ensure the signal is not missed. + These restrictions force the above ordering. + Immediately before sending the wake-up signal, we should: + Verify lock_word == 0 (waiting thread holds x_lock) +*/ + +UNIV_INTERN rw_lock_stats_t rw_lock_stats; + +/* The global list of rw-locks */ +UNIV_INTERN rw_lock_list_t rw_lock_list; +UNIV_INTERN ib_mutex_t rw_lock_list_mutex; + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t rw_lock_list_mutex_key; +UNIV_INTERN mysql_pfs_key_t rw_lock_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_SYNC_DEBUG +/* The global mutex which protects debug info lists of all rw-locks. +To modify the debug info list of an rw-lock, this mutex has to be +acquired in addition to the mutex protecting the lock. */ + +UNIV_INTERN ib_mutex_t rw_lock_debug_mutex; + +# ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t rw_lock_debug_mutex_key; +# endif + +/* If deadlock detection does not get immediately the mutex, +it may wait for this event */ +UNIV_INTERN os_event_t rw_lock_debug_event; +/* This is set to TRUE, if there may be waiters for the event */ +UNIV_INTERN ibool rw_lock_debug_waiters; + +/******************************************************************//** +Creates a debug info struct. 
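
The waiters/event rules above are the classic reset-then-recheck handshake;
stripped of the rw-lock specifics it looks like this (a sketch reusing
event_t from the first example; seq_cst atomics provide the memory barrier
the comment calls for after setting waiters):

    #include <stdatomic.h>

    static atomic_long lock_word_;  /* stand-in for lock->lock_word */
    static atomic_int  waiters_;    /* stand-in for lock->waiters   */
    static event_t     lock_event_;

    static void wait_until_lockable(void)
    {
        for (;;) {
            unsigned long gen = event_reset(&lock_event_);  /* (1) */
            atomic_store(&waiters_, 1);                     /* (2) */
            if (atomic_load(&lock_word_) > 0)               /* (3) */
                return;         /* free again; retry the fast path */
            event_wait(&lock_event_, gen);
        }
    }

    /* Unlocker side: make the lock free, then signal iff someone
       advertised themselves as waiting. */
    static void signal_after_unlock(long x_lock_decr)
    {
        atomic_store(&lock_word_, x_lock_decr);  /* now unlocked */
        if (atomic_exchange(&waiters_, 0))
            event_set(&lock_event_);
    }
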
*/ +static +rw_lock_debug_t* +rw_lock_debug_create(void); +/*======================*/ +/******************************************************************//** +Frees a debug info struct. */ +static +void +rw_lock_debug_free( +/*===============*/ + rw_lock_debug_t* info); + +/******************************************************************//** +Creates a debug info struct. +@return own: debug info struct */ +static +rw_lock_debug_t* +rw_lock_debug_create(void) +/*======================*/ +{ + return((rw_lock_debug_t*) mem_alloc(sizeof(rw_lock_debug_t))); +} + +/******************************************************************//** +Frees a debug info struct. */ +static +void +rw_lock_debug_free( +/*===============*/ + rw_lock_debug_t* info) +{ + mem_free(info); +} +#endif /* UNIV_SYNC_DEBUG */ + +/******************************************************************//** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +rw_lock_create_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name) /*!< in: mutex name */ +{ + /* If this is the very first time a synchronization object is + created, then the following call initializes the sync system. */ + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_create(rw_lock_mutex_key, rw_lock_get_mutex(lock), + SYNC_NO_ORDER_CHECK); + + ut_d(lock->mutex.cfile_name = cfile_name); + ut_d(lock->mutex.cline = cline); + + lock->mutex.cmutex_name = cmutex_name; + ut_d(lock->mutex.ib_mutex_type = 1); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ +# ifdef UNIV_DEBUG + UT_NOT_USED(cfile_name); + UT_NOT_USED(cline); +# endif +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + lock->lock_word = X_LOCK_DECR; + lock->waiters = 0; + + /* We set this value to signify that lock->writer_thread + contains garbage at initialization and cannot be used for + recursive x-locking. */ + lock->recursive = FALSE; + /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. */ + memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread); + UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread); + +#ifdef UNIV_SYNC_DEBUG + UT_LIST_INIT(lock->debug_list); + + lock->level = level; +#endif /* UNIV_SYNC_DEBUG */ + + ut_d(lock->magic_n = RW_LOCK_MAGIC_N); + + lock->lock_name = cmutex_name; + + lock->count_os_wait = 0; + lock->last_s_file_name = "not yet reserved"; + lock->last_x_file_name = "not yet reserved"; + lock->last_s_line = 0; + lock->last_x_line = 0; + lock->event = os_event_create(); + lock->wait_ex_event = os_event_create(); + + mutex_enter(&rw_lock_list_mutex); + + ut_ad(UT_LIST_GET_FIRST(rw_lock_list) == NULL + || UT_LIST_GET_FIRST(rw_lock_list)->magic_n == RW_LOCK_MAGIC_N); + + UT_LIST_ADD_FIRST(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); +} + +/******************************************************************//** +Creates, or rather, initializes a priority rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. 
Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +rw_lock_create_func( +/*================*/ + prio_rw_lock_t* lock, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name) /*!< in: mutex name */ +{ + rw_lock_create_func(&lock->base_lock, +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + level, +# endif + cfile_name, + cline, +#endif + cmutex_name); + lock->high_priority_s_waiters = 0; + lock->high_priority_s_event = os_event_create(); + lock->high_priority_x_waiters = 0; + lock->high_priority_x_event = os_event_create(); + lock->high_priority_wait_ex_waiter = 0; +} + +/******************************************************************//** +Calling this function is obligatory only if the memory buffer containing +the rw-lock is freed. Removes an rw-lock object from the global list. The +rw-lock is checked to be in the non-locked state. */ +UNIV_INTERN +void +rw_lock_free_func( +/*==============*/ + rw_lock_t* lock) /*!< in: rw-lock */ +{ +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + ib_mutex_t* mutex; +#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */ + + os_rmb; + ut_ad(rw_lock_validate(lock)); + ut_a(lock->lock_word == X_LOCK_DECR); + + mutex_enter(&rw_lock_list_mutex); + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex = rw_lock_get_mutex(lock); +#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */ + + os_event_free(lock->event); + + os_event_free(lock->wait_ex_event); + + ut_ad(UT_LIST_GET_PREV(list, lock) == NULL + || UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); + ut_ad(UT_LIST_GET_NEXT(list, lock) == NULL + || UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N); + + UT_LIST_REMOVE(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); + + ut_d(lock->magic_n = 0); + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + /* We have merely removed the rw_lock from the list, the memory + has not been freed. Therefore the pointer to mutex is valid. */ + mutex_free(mutex); +#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +Calling this function is obligatory only if the memory buffer containing +the priority rw-lock is freed. Removes an rw-lock object from the global list. +The rw-lock is checked to be in the non-locked state. */ +UNIV_INTERN +void +rw_lock_free_func( +/*==============*/ + prio_rw_lock_t* lock) /*!< in: rw-lock */ +{ + os_event_free(lock->high_priority_s_event); + os_event_free(lock->high_priority_x_event); + rw_lock_free_func(&lock->base_lock); +} + +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks that the rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. 
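
In other words, both rw_lock_create_func() variants initialise a lock
embedded in caller-owned memory, and the matching free is needed only when
that memory itself goes away. The usage pattern, with pthreads standing in
for the InnoDB lock (struct and function names invented):

    #include <pthread.h>
    #include <stdlib.h>

    struct dict_object {
        pthread_rwlock_t latch;   /* embedded, like rw_lock_t */
        int              payload;
    };

    static struct dict_object *dict_object_new(void)
    {
        struct dict_object *o = malloc(sizeof(*o));
        pthread_rwlock_init(&o->latch, NULL);  /* rw_lock_create() */
        return o;
    }

    static void dict_object_delete(struct dict_object *o)
    {
        pthread_rwlock_destroy(&o->latch);     /* rw_lock_free()   */
        free(o);
    }
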
+@return TRUE */ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + rw_lock_t* lock) /*!< in: rw-lock */ +{ + ulint waiters; + lint lock_word; + + ut_ad(lock); + + waiters = rw_lock_get_waiters(lock); + lock_word = lock->lock_word; + + ut_ad(lock->magic_n == RW_LOCK_MAGIC_N); + ut_ad(waiters == 0 || waiters == 1); + ut_ad(lock_word > -(2 * X_LOCK_DECR)); + ut_ad(lock_word <= X_LOCK_DECR); + + return(TRUE); +} + +/******************************************************************//** +Checks that the priority rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. +@return TRUE */ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + prio_rw_lock_t* lock) /*!< in: rw-lock */ +{ + return(rw_lock_validate(&lock->base_lock)); +} + +#endif /* UNIV_DEBUG */ + +/******************************************************************//** +Lock a regular or priority rw-lock in shared mode for the current thread. If +the rw-lock is locked in exclusive mode, or there is an exclusive lock request +waiting, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), +waiting for the lock, before suspending the thread. */ +UNIV_INTERN +void +rw_lock_s_lock_spin( +/*================*/ + void* _lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock + will be passed to another thread to unlock */ + bool priority_lock, + /*!< in: whether the lock is a priority lock */ + bool high_priority, + /*!< in: whether we are acquiring a priority + lock with high priority */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + ulint index; /* index of the reserved wait cell */ + ulint i = 0; /* spin round count */ + sync_array_t* sync_arr; + size_t counter_index; + rw_lock_t* lock = (rw_lock_t *) _lock; + + /* We reuse the thread id to index into the counter, cache + it here for efficiency. */ + + counter_index = (size_t) os_thread_get_curr_id(); + + ut_ad(rw_lock_validate(lock)); + + rw_lock_stats.rw_s_spin_wait_count.add(counter_index, 1); +lock_loop: + + if (!rw_lock_higher_prio_waiters_exist(priority_lock, high_priority, + lock)) { + + /* Spin waiting for the writer field to become free */ + os_rmb; + while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); + } + + i++; + } + + if (i >= SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread " ULINTPF " spin wait rw-s-lock at %p" + " '%s' rnds " ULINTPF "\n", + os_thread_pf(os_thread_get_curr_id()), + (void*) lock, lock->lock_name, i); + } + } else { + + /* In case of higher priority waiters already present, perform + only this part of the spinning code path. */ + os_thread_yield(); + } + + /* We try once again to obtain the lock */ + if (!rw_lock_higher_prio_waiters_exist(priority_lock, high_priority, + lock) + && (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line))) { + rw_lock_stats.rw_s_spin_round_count.add(counter_index, i); + + return; /* Success */ + } else { + + prio_rw_lock_t* prio_rw_lock = NULL; + + if (i > 0 && i < SYNC_SPIN_ROUNDS) { + goto lock_loop; + } + + rw_lock_stats.rw_s_spin_round_count.add(counter_index, i); + + sync_arr = sync_array_get_and_reserve_cell(lock, + high_priority + ? PRIO_RW_LOCK_SHARED + : RW_LOCK_SHARED, + file_name, + line, &index); + + /* Set waiters before checking lock_word to ensure wake-up + signal is sent. 
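
The spin phase of rw_lock_s_lock_spin() above reduces to a bounded busy-wait
followed by a yield before the thread commits to the wait array. A sketch
(SPIN_ROUNDS and cpu_pause() are illustrative stand-ins for
SYNC_SPIN_ROUNDS and ut_delay()):

    #include <sched.h>
    #include <stdatomic.h>

    enum { SPIN_ROUNDS = 30 };

    static void cpu_pause(void)
    {
        /* e.g. __builtin_ia32_pause() on x86; left empty here */
    }

    /* Returns the rounds spent, which the real code feeds into
       rw_lock_stats.rw_s_spin_round_count. */
    static int spin_for_s_lockable(const atomic_long *lock_word)
    {
        int i = 0;
        while (i < SPIN_ROUNDS && atomic_load(lock_word) <= 0) {
            cpu_pause();
            i++;
        }
        if (i >= SPIN_ROUNDS)
            sched_yield();  /* give the holder a chance to run */
        return i;
    }
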
This may lead to some unnecessary signals. */ + if (high_priority) { + + prio_rw_lock = reinterpret_cast<prio_rw_lock_t *> + (_lock); + os_atomic_increment_ulint( + &prio_rw_lock->high_priority_s_waiters, + 1); + } else { + + rw_lock_set_waiter_flag(lock); + } + + if (!rw_lock_higher_prio_waiters_exist(priority_lock, + high_priority, lock) + && (TRUE == rw_lock_s_lock_low(lock, pass, + file_name, line))) { + sync_array_free_cell(sync_arr, index); + if (prio_rw_lock) { + + os_atomic_decrement_ulint( + &prio_rw_lock->high_priority_s_waiters, + 1); + } + return; /* Success */ + } + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread " ULINTPF " OS wait rw-s-lock at %p" + " '%s'\n", + os_thread_pf(os_thread_get_curr_id()), + (void*) lock, lock->lock_name); + } + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_lock_stats.rw_s_os_wait_count.add(counter_index, 1); + + sync_array_wait_event(sync_arr, index); + + if (prio_rw_lock) { + + os_atomic_decrement_ulint( + &prio_rw_lock->high_priority_s_waiters, + 1); + } + + i = 0; + goto lock_loop; + } +} + +/******************************************************************//** +This function is used in the insert buffer to move the ownership of an +x-latch on a buffer frame to the current thread. The x-latch was set by +the buffer read operation and it protected the buffer frame while the +read was done. The ownership is moved because we want that the current +thread is able to acquire a second x-latch which is stored in an mtr. +This, in turn, is needed to pass the debug checks of index page +operations. */ +UNIV_INTERN +void +rw_lock_x_lock_move_ownership( +/*==========================*/ + rw_lock_t* lock) /*!< in: lock which was x-locked in the + buffer read */ +{ + ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX)); + + rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); +} + +/******************************************************************//** +Function for the next writer to call. Waits for readers to exit. +The caller must have already decremented lock_word by X_LOCK_DECR. */ +UNIV_INLINE +void +rw_lock_x_lock_wait( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + bool high_priority, + /*!< in: if true, the rw lock is a priority + lock and is being acquired with high + priority */ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ +#endif + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + ulint index; + ulint i = 0; + sync_array_t* sync_arr; + size_t counter_index; + prio_rw_lock_t* prio_rw_lock = NULL; + + /* We reuse the thread id to index into the counter, cache + it here for efficiency. 
*/ + + counter_index = (size_t) os_thread_get_curr_id(); + + os_rmb; + ut_ad(lock->lock_word <= 0); + + if (high_priority) { + + prio_rw_lock = reinterpret_cast<prio_rw_lock_t *>(lock); + prio_rw_lock->high_priority_wait_ex_waiter = 1; + } + + while (lock->lock_word < 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + if(i < SYNC_SPIN_ROUNDS) { + i++; + os_rmb; + continue; + } + + /* If there is still a reader, then go to sleep.*/ + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); + + sync_arr = sync_array_get_and_reserve_cell(lock, + RW_LOCK_WAIT_EX, + file_name, + line, &index); + + i = 0; + + /* Check lock_word to ensure wake-up isn't missed.*/ + if (lock->lock_word < 0) { + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1); + + /* Add debug info as it is needed to detect possible + deadlock. We must add info for WAIT_EX thread for + deadlock detection to work properly. */ +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, + file_name, line); +#endif + + sync_array_wait_event(sync_arr, index); +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info( + lock, pass, RW_LOCK_WAIT_EX); +#endif + /* It is possible to wake when lock_word < 0. + We must pass the while-loop check to proceed.*/ + } else { + sync_array_free_cell(sync_arr, index); + } + } + + if (prio_rw_lock) { + + prio_rw_lock->high_priority_wait_ex_waiter = 0; + } + + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); +} + +/******************************************************************//** +Low-level function for acquiring an exclusive lock. +@return FALSE if did not succeed, TRUE if success. */ +UNIV_INLINE +ibool +rw_lock_x_lock_low( +/*===============*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + bool high_priority, + /*!< in: if true, the rw lock is a priority + lock and is being acquired with high + priority */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) { + + /* lock->recursive also tells us if the writer_thread + field is stale or active. As we are going to write + our own thread id in that field it must be that the + current writer_thread value is not active. */ + ut_a(!lock->recursive); + + /* Decrement occurred: we are writer or next-writer. */ + rw_lock_set_writer_id_and_recursion_flag( + lock, pass ? FALSE : TRUE); + + rw_lock_x_lock_wait(lock, high_priority, +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + file_name, line); + + } else { + os_thread_id_t thread_id = os_thread_get_curr_id(); + + if (!pass) { + os_rmb; + } + + /* Decrement failed: relock or failed lock */ + if (!pass && lock->recursive + && os_thread_eq(lock->writer_thread, thread_id)) { + /* Relock */ + if (lock->lock_word == 0) { + lock->lock_word -= X_LOCK_DECR; + } else { + --lock->lock_word; + } + + } else { + /* Another thread locked before us */ + return(FALSE); + } + } +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, line); +#endif + lock->last_x_file_name = file_name; + lock->last_x_line = (unsigned int) line; + + return(TRUE); +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread. 
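
rw_lock_x_lock_low() above hinges on one atomic decrement: subtracting
X_LOCK_DECR from a positive lock_word claims the next-writer slot, and the
only other way in is a recursive relock by the thread that already holds
the lock. A condensed sketch (no pass values and no wait_ex phase; XDECR
and xlock_t are invented for the example):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    #define XDECR 0x00100000L

    typedef struct {
        atomic_long lock_word;
        bool        recursive;      /* writer_thread valid iff true */
        pthread_t   writer_thread;
    } xlock_t;

    static bool x_lock_low(xlock_t *l)
    {
        long w = atomic_load(&l->lock_word);

        /* Fast path: claim next-writer while no writer has. */
        while (w > 0) {
            if (atomic_compare_exchange_weak(&l->lock_word, &w,
                                             w - XDECR)) {
                l->writer_thread = pthread_self();
                l->recursive = true;
                /* the real code now parks in rw_lock_x_lock_wait()
                   until lock_word reaches 0 (all readers gone) */
                return true;
            }
        }

        /* Relock path: only the owning writer may stack a lock. */
        if (l->recursive
            && pthread_equal(l->writer_thread, pthread_self())) {
            long cur = atomic_load(&l->lock_word);
            /* only the owner mutates lock_word on this path */
            atomic_store(&l->lock_word,
                         cur == 0 ? cur - XDECR : cur - 1);
            return true;
        }
        return false;   /* another thread beat us to it */
    }
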
If the rw-lock is locked +in shared or exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock before suspending the thread. If the same thread has an x-lock +on the rw-lock, locking succeed, with the following exception: if pass != 0, +only a single x-lock may be taken on the lock. NOTE: If the same thread has +an s-lock, locking does not succeed! */ +UNIV_INTERN +void +rw_lock_x_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line, /*!< in: line where requested */ + bool priority_lock, + /*!< in: whether the lock is a priority lock */ + bool high_priority) + /*!< in: whether we are acquiring a priority + lock with high priority */ +{ + ulint i; /*!< spin round count */ + ulint index; /*!< index of the reserved wait cell */ + sync_array_t* sync_arr; + ibool spinning = FALSE; + size_t counter_index; + prio_rw_lock_t* prio_lock = NULL; + + /* We reuse the thread id to index into the counter, cache + it here for efficiency. */ + + counter_index = (size_t) os_thread_get_curr_id(); + + ut_ad(rw_lock_validate(lock)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + i = 0; + + ut_ad(priority_lock || !high_priority); + +lock_loop: + + if (!rw_lock_higher_prio_waiters_exist(priority_lock, high_priority, + lock) + && rw_lock_x_lock_low(lock, high_priority, pass, + file_name, line)) { + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); + + return; /* Locking succeeded */ + + } else if (!rw_lock_higher_prio_waiters_exist(priority_lock, + high_priority, lock)) { + + if (!spinning) { + spinning = TRUE; + + rw_lock_stats.rw_x_spin_wait_count.add( + counter_index, 1); + } + + /* Spin waiting for the lock_word to become free */ + os_rmb; + while (i < SYNC_SPIN_ROUNDS + && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); + } + + i++; + } + if (i >= SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } else { + goto lock_loop; + } + } else { + + /* In case we skipped spinning because of higher-priority + waiters already waiting, perform only this bit of the spinning + code path. */ + os_thread_yield(); + } + + if (spinning) { + + rw_lock_stats.rw_x_spin_round_count.add(counter_index, i); + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread " ULINTPF " spin wait rw-x-lock at %p" + " '%s' rnds " ULINTPF "\n", + os_thread_pf(os_thread_get_curr_id()), + (void*) lock,lock->lock_name, i); + } + } + + sync_arr = sync_array_get_and_reserve_cell(lock, + high_priority + ? PRIO_RW_LOCK_EX + : RW_LOCK_EX, + file_name, line, &index); + + /* Waiters must be set before checking lock_word, to ensure signal + is sent. This could lead to a few unnecessary wake-up signals. 
*/ + if (high_priority) { + + prio_lock = reinterpret_cast<prio_rw_lock_t *>(lock); + os_atomic_increment_ulint(&prio_lock->high_priority_x_waiters, + 1); + } else { + rw_lock_set_waiter_flag(lock); + } + + if (rw_lock_x_lock_low(lock, high_priority, pass, file_name, line)) { + sync_array_free_cell(sync_arr, index); + if (prio_lock) { + + os_atomic_decrement_ulint( + &prio_lock->high_priority_x_waiters, + 1); + } + return; /* Locking succeeded */ + } + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread " ULINTPF " OS wait for rw-x-lock at %p" + " '%s'\n", + os_thread_pf(os_thread_get_curr_id()), (void*) lock, + lock->lock_name); + } + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1); + + sync_array_wait_event(sync_arr, index); + + if (prio_lock) { + + os_atomic_decrement_ulint(&prio_lock->high_priority_x_waiters, + 1); + } + + i = 0; + goto lock_loop; +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock a priority +rw-lock in exclusive mode for the current thread. If the rw-lock is locked +in shared or exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock, before suspending the thread. If the same thread has an x-lock +on the rw-lock, locking succeed, with the following exception: if pass != 0, +only a single x-lock may be taken on the lock. NOTE: If the same thread has +an s-lock, locking does not succeed! */ +UNIV_INTERN +void +rw_lock_x_lock_func( +/*================*/ + prio_rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + rw_lock_x_lock_func(&lock->base_lock, pass, file_name, line, true, + srv_current_thread_priority > 0); +} + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Acquires the debug mutex. We cannot use the mutex defined in sync0sync, +because the debug mutex is also acquired in sync0arr while holding the OS +mutex protecting the sync array, and the ordinary mutex_enter might +recursively call routines in sync0arr, leading to a deadlock on the OS +mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_enter(void) +/*===========================*/ +{ +loop: + if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { + return; + } + + os_event_reset(rw_lock_debug_event); + + rw_lock_debug_waiters = TRUE; + + if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { + return; + } + + os_event_wait(rw_lock_debug_event); + + goto loop; +} + +/******************************************************************//** +Releases the debug mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_exit(void) +/*==========================*/ +{ + mutex_exit(&rw_lock_debug_mutex); + + if (rw_lock_debug_waiters) { + rw_lock_debug_waiters = FALSE; + os_event_set(rw_lock_debug_event); + } +} + +/******************************************************************//** +Inserts the debug information for an rw-lock. 
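
rw_lock_debug_mutex_enter() above cannot simply call mutex_enter(), since a
collision would recurse into sync0arr; instead it loops over a try-lock with
an event fallback. The same pattern on POSIX primitives, reusing event_t
from the first sketch (static names invented; assume the event fields are
initialised):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t dbg_mutex = PTHREAD_MUTEX_INITIALIZER;
    static event_t         dbg_event;
    static volatile bool   dbg_waiters;

    static void dbg_enter(void)
    {
        for (;;) {
            if (pthread_mutex_trylock(&dbg_mutex) == 0)
                return;

            unsigned long gen = event_reset(&dbg_event);
            dbg_waiters = true;

            /* Retry once after advertising ourselves, so an exit
               racing with the reset above cannot strand us. */
            if (pthread_mutex_trylock(&dbg_mutex) == 0)
                return;

            event_wait(&dbg_event, gen);
        }
    }

    static void dbg_exit(void)
    {
        pthread_mutex_unlock(&dbg_mutex);
        if (dbg_waiters) {
            dbg_waiters = false;
            event_set(&dbg_event);
        }
    }
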
*/ +UNIV_INTERN +void +rw_lock_add_debug_info( +/*===================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type, /*!< in: lock type */ + const char* file_name, /*!< in: file where requested */ + ulint line) /*!< in: line where requested */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + ut_ad(file_name); + + info = rw_lock_debug_create(); + + rw_lock_debug_mutex_enter(); + + info->file_name = file_name; + info->line = line; + info->lock_type = lock_type; + info->thread_id = os_thread_get_curr_id(); + info->pass = pass; + + UT_LIST_ADD_FIRST(list, lock->debug_list, info); + + rw_lock_debug_mutex_exit(); + + if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { + sync_thread_add_level(lock, lock->level, + lock_type == RW_LOCK_EX + && lock->lock_word < 0); + } +} + +/******************************************************************//** +Removes a debug information struct for an rw-lock. */ +UNIV_INTERN +void +rw_lock_remove_debug_info( +/*======================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type) /*!< in: lock type */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + + if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { + sync_thread_reset_level(lock); + } + + rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + + while (info != NULL) { + if ((pass == info->pass) + && ((pass != 0) + || os_thread_eq(info->thread_id, + os_thread_get_curr_id())) + && (info->lock_type == lock_type)) { + + /* Found! */ + UT_LIST_REMOVE(list, lock->debug_list, info); + rw_lock_debug_mutex_exit(); + + rw_lock_debug_free(info); + + return; + } + + info = UT_LIST_GET_NEXT(list, info); + } + + ut_error; +} +#endif /* UNIV_SYNC_DEBUG */ + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Checks if the thread has locked the rw-lock in the specified mode, with +the pass value == 0. +@return TRUE if locked */ +UNIV_INTERN +ibool +rw_lock_own( +/*========*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + + rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + + while (info != NULL) { + + if (os_thread_eq(info->thread_id, os_thread_get_curr_id()) + && (info->pass == 0) + && (info->lock_type == lock_type)) { + + rw_lock_debug_mutex_exit(); + /* Found! */ + + return(TRUE); + } + + info = UT_LIST_GET_NEXT(list, info); + } + rw_lock_debug_mutex_exit(); + + return(FALSE); +} + +/******************************************************************//** +Checks if the thread has locked the priority rw-lock in the specified mode, +with the pass value == 0. */ +UNIV_INTERN +ibool +rw_lock_own( +/*========*/ + prio_rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +{ + return(rw_lock_own(&lock->base_lock, lock_type)); +} + +#endif /* UNIV_SYNC_DEBUG */ + +/******************************************************************//** +Checks if somebody has locked the rw-lock in the specified mode. 
+@return TRUE if locked */ +UNIV_INTERN +ibool +rw_lock_is_locked( +/*==============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +{ + ibool ret = FALSE; + + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + + if (lock_type == RW_LOCK_SHARED) { + if (rw_lock_get_reader_count(lock) > 0) { + ret = TRUE; + } + } else if (lock_type == RW_LOCK_EX) { + if (rw_lock_get_writer(lock) == RW_LOCK_EX) { + ret = TRUE; + } + } else { + ut_error; + } + + return(ret); +} + +#ifdef UNIV_SYNC_DEBUG +/***************************************************************//** +Prints debug info of currently locked rw-locks. */ +UNIV_INTERN +void +rw_lock_list_print_info( +/*====================*/ + FILE* file) /*!< in: file where to print */ +{ + rw_lock_t* lock; + ulint count = 0; + rw_lock_debug_t* info; + + mutex_enter(&rw_lock_list_mutex); + + fputs("-------------\n" + "RW-LATCH INFO\n" + "-------------\n", file); + + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { + + count++; + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_enter(&(lock->mutex)); +#endif + if (lock->lock_word != X_LOCK_DECR) { + + fprintf(file, "RW-LOCK: %p ", (void*) lock); + + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", file); + } else { + putc('\n', file); + } + + rw_lock_debug_mutex_enter(); + info = UT_LIST_GET_FIRST(lock->debug_list); + while (info != NULL) { + rw_lock_debug_print(file, info); + info = UT_LIST_GET_NEXT(list, info); + } + rw_lock_debug_mutex_exit(); + } +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_exit(&(lock->mutex)); +#endif + + lock = UT_LIST_GET_NEXT(list, lock); + } + + fprintf(file, "Total number of rw-locks %ld\n", count); + mutex_exit(&rw_lock_list_mutex); +} + +/***************************************************************//** +Prints debug info of an rw-lock. */ +UNIV_INTERN +void +rw_lock_print( +/*==========*/ + rw_lock_t* lock) /*!< in: rw-lock */ +{ + rw_lock_debug_t* info; + + fprintf(stderr, + "-------------\n" + "RW-LATCH INFO\n" + "RW-LATCH: %p ", (void*) lock); + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + /* We used to acquire lock->mutex here, but it would cause a + recursive call to sync_thread_add_level() if UNIV_SYNC_DEBUG + is defined. Since this function is only invoked from + sync_thread_levels_g(), let us choose the smaller evil: + performing dirty reads instead of causing bogus deadlocks or + assertion failures. */ +#endif + if (lock->lock_word != X_LOCK_DECR) { + + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", stderr); + } else { + putc('\n', stderr); + } + + rw_lock_debug_mutex_enter(); + info = UT_LIST_GET_FIRST(lock->debug_list); + while (info != NULL) { + rw_lock_debug_print(stderr, info); + info = UT_LIST_GET_NEXT(list, info); + } + rw_lock_debug_mutex_exit(); + } +} + +/*********************************************************************//** +Prints info of a debug struct. 
*/ +UNIV_INTERN +void +rw_lock_debug_print( +/*================*/ + FILE* f, /*!< in: output stream */ + rw_lock_debug_t* info) /*!< in: debug struct */ +{ + ulint rwt; + + rwt = info->lock_type; + + fprintf(f, "Locked: thread %lu file %s line %lu ", + (ulong) os_thread_pf(info->thread_id), info->file_name, + (ulong) info->line); + if (rwt == RW_LOCK_SHARED) { + fputs("S-LOCK", f); + } else if (rwt == RW_LOCK_EX) { + fputs("X-LOCK", f); + } else if (rwt == RW_LOCK_WAIT_EX) { + fputs("WAIT X-LOCK", f); + } else { + ut_error; + } + if (info->pass != 0) { + fprintf(f, " pass value %lu", (ulong) info->pass); + } + putc('\n', f); +} + +/***************************************************************//** +Returns the number of currently locked rw-locks. Works only in the debug +version. +@return number of locked rw-locks */ +UNIV_INTERN +ulint +rw_lock_n_locked(void) +/*==================*/ +{ + rw_lock_t* lock; + ulint count = 0; + + mutex_enter(&rw_lock_list_mutex); + + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { + + if (lock->lock_word != X_LOCK_DECR) { + count++; + } + + lock = UT_LIST_GET_NEXT(list, lock); + } + + mutex_exit(&rw_lock_list_mutex); + + return(count); +} +#endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/xtradb/sync/sync0sync.cc b/storage/xtradb/sync/sync0sync.cc new file mode 100644 index 00000000000..35e96908f6a --- /dev/null +++ b/storage/xtradb/sync/sync0sync.cc @@ -0,0 +1,1740 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0sync.cc
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#ifdef UNIV_NONINL
+#include "sync0sync.ic"
+#include "sync0arr.ic"
+#endif
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "btr0types.h"
+#include "buf0types.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#ifdef UNIV_SYNC_DEBUG
+# include "srv0start.h" /* srv_is_being_started */
+#endif /* UNIV_SYNC_DEBUG */
+#include "ha_prototypes.h"
+
+/*
+ REASONS FOR IMPLEMENTING THE SPIN LOCK MUTEX
+ ============================================
+
+Semaphore operations in operating systems are slow: Solaris on a 1993 Sparc
+takes 3 microseconds (us) for a lock-unlock pair and Windows NT on a 1995
+Pentium takes 20 microseconds for a lock-unlock pair. Therefore, we have to
+implement our own efficient spin lock mutex. Future operating systems may
+provide efficient spin locks, but we cannot count on that.
+
+Another reason for implementing a spin lock is that on multiprocessor systems
+it can be more efficient for a processor to run a loop waiting for the
+semaphore to be released than to switch to a different thread. A thread switch
+takes 25 us on both platforms mentioned above. See Gray and Reuter's book
+Transaction Processing for background.
+
+How long should the spin loop last before suspending the thread? On a
+uniprocessor, spinning does not help at all, because if the thread owning the
+mutex is not executing, it cannot be released. Spinning actually wastes
+resources.
+
+On a multiprocessor, we do not know if the thread owning the mutex is
+executing or not. Thus it would make sense to spin as long as the operation
+guarded by the mutex would typically last, assuming that the thread is
+executing. If the mutex is not released by that time, we may assume that the
+thread owning the mutex is not executing and suspend the waiting thread.
+
+A typical operation (where no i/o is involved) guarded by a mutex or a
+read-write lock may last 1 - 20 us on the current Pentium platform. The
+longest operations are the binary searches on an index node.
+
+We conclude that the best choice is to set the spin time at 20 us. Then the
+system should work well on a multiprocessor. On a uniprocessor we have to
+make sure that thread switches due to mutex collisions are not frequent,
+i.e., they do not happen every 100 us or so, because that wastes too many
+resources. If the thread switches are not frequent, the 20 us wasted in the
+spin loop is not too much.
+
+Empirical studies on the effect of spin time should be done for different
+platforms.
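In code, the 20 us budget above becomes a bounded, read-only spin followed by giving the CPU away. A portable sketch of that policy (SPIN_ROUNDS and the bare yield are illustrative stand-ins for SYNC_SPIN_ROUNDS and the sync-array wait described later):

#include <atomic>
#include <thread>

static const int SPIN_ROUNDS = 30; /* stand-in for SYNC_SPIN_ROUNDS */

struct spin_mutex {
    std::atomic<int> word{0}; /* 0 = free, 1 = held */

    void lock() {
        for (;;) {
            int i = 0;
            /* Spin by reading only; acquisition itself is the atomic
            exchange below, so plain loads suffice here and stay in
            the local cache. */
            while (word.load(std::memory_order_relaxed) != 0
                   && i < SPIN_ROUNDS) {
                ++i;
            }
            /* Atomic test-and-set: turning 0 into 1 wins the lock. */
            if (word.exchange(1, std::memory_order_acquire) == 0) {
                return;
            }
            /* Spin budget exhausted: yield rather than burn the CPU.
            InnoDB instead suspends on an event in the wait array. */
            std::this_thread::yield();
        }
    }

    void unlock() {
        word.store(0, std::memory_order_release);
    }
};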
+
+
+ IMPLEMENTATION OF THE MUTEX
+ ===========================
+
+For background, see Curt Schimmel's book on Unix implementation on modern
+architectures. The key points in the implementation are atomicity and
+serialization of memory accesses. The test-and-set instruction (XCHG in
+Pentium) must be atomic. As new processors may have weak memory models, also
+serialization of memory references may be necessary. The successor of Pentium,
+P6, has at least one mode where the memory model is weak. As far as we know,
+in Pentium all memory accesses are serialized in the program order and we do
+not have to worry about the memory model. On other processors there are
+special machine instructions called a fence, memory barrier, or storage
+barrier (STBAR in Sparc), which can be used to serialize the memory accesses
+to happen in program order relative to the fence instruction.
+
+Leslie Lamport has devised a "bakery algorithm" to implement a mutex without
+the atomic test-and-set, but his algorithm should be modified for weak memory
+models. We do not use Lamport's algorithm, because we guess it is slower than
+the atomic test-and-set.
+
+Our mutex implementation works as follows: First, we perform the atomic
+test-and-set instruction on the memory word. If the test returns zero, we
+know we got the lock first. If the test returns non-zero, some other thread
+was quicker and got the lock: then we spin in a loop reading the memory word,
+waiting for it to become zero. It is wise to just read the word in the loop,
+not perform numerous test-and-set instructions, because they generate memory
+traffic between the cache and the main memory. The read loop can just access
+the cache, saving bus bandwidth.
+
+If we cannot acquire the mutex lock in the specified time, we reserve a cell
+in the wait array and set the waiters byte in the mutex to 1. To avoid a race
+condition, after setting the waiters byte and before suspending the waiting
+thread, we still have to check that the mutex is reserved, because it may
+have happened that the thread which was holding the mutex has just released
+it and did not see the waiters byte set to 1, a case which would lead the
+other thread to an infinite wait.
+
+LEMMA 1: After a thread resets the event of a mutex (or rw_lock), some
+=======
+thread will eventually call os_event_set() on that particular event.
+Thus no infinite wait is possible in this case.
+
+Proof: After making the reservation the thread sets the waiters field in the
+mutex to 1. Then it checks that the mutex is still reserved by some thread,
+or it reserves the mutex for itself. In any case, some thread (which may be
+also some earlier thread, not necessarily the one currently holding the mutex)
+will set the waiters field to 0 in mutex_exit, and then call
+os_event_set() with the mutex as an argument.
+Q.E.D.
+
+LEMMA 2: If an os_event_set() call is made after some thread has called
+=======
+the os_event_reset() and before it starts waiting on that event, the call
+will not be lost to the second thread. This is true even if there is an
+intervening call to os_event_reset() by another thread.
+Thus no infinite wait is possible in this case.
+
+Proof (non-windows platforms): os_event_reset() returns a monotonically
+increasing value of signal_count. This value is increased at every
+call of os_event_set(). If thread A has called os_event_reset() followed
+by thread B calling os_event_set() and then some other thread C calling
+os_event_reset(), the is_set flag of the event will be set to FALSE;
+but now if thread A calls os_event_wait_low() with the signal_count
+value returned from the earlier call of os_event_reset(), it will
+return immediately without waiting.
+Q.E.D.
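The non-windows proof hinges on os_event_reset() returning a monotonically increasing signal_count that os_event_wait_low() later compares against. A condition-variable sketch of that protocol (the member names mirror the InnoDB ones, but this is an illustration, not the real os0sync implementation):

#include <condition_variable>
#include <cstdint>
#include <mutex>

struct os_event {
    std::mutex              m;
    std::condition_variable cv;
    bool                    is_set = false; /* reset state initially */
    int64_t                 signal_count = 0;

    /* Returns the signal_count seen at reset time; a later wait
    carrying this value cannot miss an intervening set(). */
    int64_t reset() {
        std::lock_guard<std::mutex> g(m);
        is_set = false;
        return signal_count;
    }

    void set() {
        std::lock_guard<std::mutex> g(m);
        is_set = true;
        ++signal_count; /* monotonically increasing */
        cv.notify_all();
    }

    /* Like os_event_wait_low(): returns immediately if any set()
    happened since the reset() that produced reset_count, even if
    another thread reset the event again in between. */
    void wait_low(int64_t reset_count) {
        std::unique_lock<std::mutex> g(m);
        cv.wait(g, [&] { return is_set || signal_count != reset_count; });
    }
};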
+
+Proof (windows): If there is a writer thread which is forced to wait for
+the lock, it may be able to set the state of rw_lock to RW_LOCK_WAIT_EX.
+The design of rw_lock ensures that there is one and only one thread
+that is able to change the state to RW_LOCK_WAIT_EX and this thread is
+guaranteed to acquire the lock after it is released by the current
+holders and before any other waiter gets the lock.
+On windows this thread waits on a separate event, i.e., wait_ex_event.
+Since only one thread can wait on this event there is no chance
+of this event getting reset before the writer starts waiting on it.
+Therefore, this thread is guaranteed to catch the os_event_set()
+signalled unconditionally at the release of the lock.
+Q.E.D. */
+
+/* Number of spin waits on mutexes: for performance monitoring */
+
+/** The number of iterations in the mutex_spin_wait() spin loop.
+Intended for performance monitoring. */
+UNIV_INTERN ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_round_count;
+/** The number of mutex_spin_wait() calls. Intended for
+performance monitoring. */
+UNIV_INTERN ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_wait_count;
+/** The number of OS waits in mutex_spin_wait(). Intended for
+performance monitoring. */
+UNIV_INTERN ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_os_wait_count;
+/** The number of mutex_exit() calls. Intended for performance
+monitoring. */
+UNIV_INTERN ib_int64_t mutex_exit_count;
+
+/** This variable is set to TRUE when sync_init is called */
+UNIV_INTERN ibool sync_initialized = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+/** An acquired mutex or rw-lock and its level in the latching order */
+struct sync_level_t;
+/** Mutexes or rw-locks held by a thread */
+struct sync_thread_t;
+
+/** The latch levels currently owned by threads are stored in this data
+structure; the size of this array is OS_THREAD_MAX_N */
+
+UNIV_INTERN sync_thread_t* sync_thread_level_arrays;
+
+/** Mutex protecting sync_thread_level_arrays */
+UNIV_INTERN ib_mutex_t sync_thread_mutex;
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t sync_thread_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Global list of database mutexes (not OS mutexes) created. */
+UNIV_INTERN ut_list_base_node_t  mutex_list;
+
+/** Global list of priority mutexes. A subset of mutex_list */
+UNIV_INTERN UT_LIST_BASE_NODE_T(ib_prio_mutex_t) prio_mutex_list;
+
+/** Mutex protecting the mutex_list and prio_mutex_list variables */
+UNIV_INTERN ib_mutex_t mutex_list_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t mutex_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+UNIV_INTERN ibool sync_order_checks_on = FALSE;
+
+/** Number of slots reserved for each OS thread in the sync level array */
+static const ulint SYNC_THREAD_N_LEVELS = 10000;
+
+/** Array for tracking sync levels per thread.
*/ +struct sync_arr_t { + ulint in_use; /*!< Number of active cells */ + ulint n_elems; /*!< Number of elements in the array */ + ulint max_elems; /*!< Maximum elements */ + ulint next_free; /*!< ULINT_UNDEFINED or index of next + free slot */ + sync_level_t* elems; /*!< Array elements */ +}; + +/** Mutexes or rw-locks held by a thread */ +struct sync_thread_t{ + os_thread_id_t id; /*!< OS thread id */ + sync_arr_t* levels; /*!< level array for this thread; if + this is NULL this slot is unused */ +}; + +/** An acquired mutex or rw-lock and its level in the latching order */ +struct sync_level_t{ + void* latch; /*!< pointer to a mutex or an + rw-lock; NULL means that + the slot is empty */ + ulint level; /*!< level of the latch in the + latching order. This field is + overloaded to serve as a node in a + linked list of free nodes too. When + latch == NULL then this will contain + the ordinal value of the next free + element */ +}; +#endif /* UNIV_SYNC_DEBUG */ + +/******************************************************************//** +Creates, or rather, initializes a mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + ib_mutex_t* mutex, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline, /*!< in: file line where created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name) /*!< in: mutex name */ +{ +#if defined(HAVE_ATOMIC_BUILTINS) + mutex_reset_lock_word(mutex); +#else + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mutex->os_fast_mutex); + mutex->lock_word = 0; +#endif + mutex->event = os_event_create(); + mutex_set_waiters(mutex, 0); +#ifdef UNIV_DEBUG + mutex->magic_n = MUTEX_MAGIC_N; +#endif /* UNIV_DEBUG */ +#ifdef UNIV_SYNC_DEBUG + mutex->line = 0; + mutex->file_name = "not yet reserved"; + mutex->level = level; +#endif /* UNIV_SYNC_DEBUG */ +#ifdef UNIV_DEBUG + mutex->cfile_name = cfile_name; + mutex->cline = cline; +#endif /* UNIV_DEBUG */ + mutex->count_os_wait = 0; + mutex->cmutex_name= cmutex_name; + + /* Check that lock_word is aligned; this is important on Intel */ + ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0); + + /* NOTE! The very first mutexes are not put to the mutex list */ + + if ((mutex == &mutex_list_mutex) +#ifdef UNIV_SYNC_DEBUG + || (mutex == &sync_thread_mutex) +#endif /* UNIV_SYNC_DEBUG */ + ) { + + return; + } + + mutex_enter(&mutex_list_mutex); + + ut_ad(UT_LIST_GET_LEN(mutex_list) == 0 + || UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N); + + UT_LIST_ADD_FIRST(list, mutex_list, mutex); + + mutex_exit(&mutex_list_mutex); +} + +/******************************************************************//** +Creates, or rather, initializes a priority mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. 
*/ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + ib_prio_mutex_t* mutex, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name, /*!< in: file name where + created */ + ulint cline, /*!< in: file line where + created */ +#endif /* UNIV_DEBUG */ + const char* cmutex_name) /*!< in: mutex name */ +{ + mutex_create_func(&mutex->base_mutex, +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + level, +#endif /* UNIV_SYNC_DEBUG */ + cfile_name, + cline, +#endif /* UNIV_DEBUG */ + cmutex_name); + mutex->high_priority_waiters = 0; + mutex->high_priority_event = os_event_create(); + + mutex_enter(&mutex_list_mutex); + UT_LIST_ADD_FIRST(list, prio_mutex_list, mutex); + mutex_exit(&mutex_list_mutex); +} + +/******************************************************************//** +NOTE! Use the corresponding macro mutex_free(), not directly this function! +Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a mutex object from the mutex list. The mutex +is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free_func( +/*============*/ + ib_mutex_t* mutex) /*!< in: mutex */ +{ + ut_ad(mutex_validate(mutex)); + ut_a(mutex_get_lock_word(mutex) == 0); + ut_a(mutex_get_waiters(mutex) == 0); + +#ifdef UNIV_MEM_DEBUG + if (mutex == &mem_hash_mutex) { + ut_ad(UT_LIST_GET_LEN(mutex_list) == 1); + ut_ad(UT_LIST_GET_FIRST(mutex_list) == &mem_hash_mutex); + UT_LIST_REMOVE(list, mutex_list, mutex); + goto func_exit; + } +#endif /* UNIV_MEM_DEBUG */ + + if (mutex != &mutex_list_mutex +#ifdef UNIV_SYNC_DEBUG + && mutex != &sync_thread_mutex +#endif /* UNIV_SYNC_DEBUG */ + ) { + + mutex_enter(&mutex_list_mutex); + + ut_ad(!UT_LIST_GET_PREV(list, mutex) + || UT_LIST_GET_PREV(list, mutex)->magic_n + == MUTEX_MAGIC_N); + ut_ad(!UT_LIST_GET_NEXT(list, mutex) + || UT_LIST_GET_NEXT(list, mutex)->magic_n + == MUTEX_MAGIC_N); + + UT_LIST_REMOVE(list, mutex_list, mutex); + + mutex_exit(&mutex_list_mutex); + } + + os_event_free(mutex->event); +#ifdef UNIV_MEM_DEBUG +func_exit: +#endif /* UNIV_MEM_DEBUG */ +#if !defined(HAVE_ATOMIC_BUILTINS) + os_fast_mutex_free(&(mutex->os_fast_mutex)); +#endif + /* If we free the mutex protecting the mutex list (freeing is + not necessary), we have to reset the magic number AFTER removing + it from the list. */ +#ifdef UNIV_DEBUG + mutex->magic_n = 0; +#endif /* UNIV_DEBUG */ + return; +} + +/******************************************************************//** +NOTE! Use the corresponding macro mutex_free(), not directly this function! +Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a priority mutex object from the mutex list. The +mutex is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free_func( +/*============*/ + ib_prio_mutex_t* mutex) /*!< in: mutex */ +{ + mutex_enter(&mutex_list_mutex); + UT_LIST_REMOVE(list, prio_mutex_list, mutex); + mutex_exit(&mutex_list_mutex); + + ut_a(mutex->high_priority_waiters == 0); + os_event_free(mutex->high_priority_event); + mutex_free_func(&mutex->base_mutex); +} + +/********************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. 
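For illustration, the registration pattern of mutex_create_func() and mutex_free_func() above: every lock joins a global list guarded by a dedicated mutex, except that guard itself, which is never registered because doing so would recurse into the list lock during its own creation. A sketch with hypothetical names (lock_registry is not an InnoDB type):

#include <mutex>
#include <vector>

struct lock_registry {
    std::mutex         list_mutex; /* role of mutex_list_mutex */
    std::vector<void*> locks;      /* role of mutex_list */

    void on_create(void* lock) {
        /* The list's own guard mutex is never put on the list. */
        if (lock == static_cast<void*>(&list_mutex)) {
            return;
        }
        std::lock_guard<std::mutex> g(list_mutex);
        locks.push_back(lock);
    }

    void on_free(void* lock) {
        std::lock_guard<std::mutex> g(list_mutex);
        for (size_t i = 0; i < locks.size(); ++i) {
            if (locks[i] == lock) {
                locks.erase(locks.begin() + i);
                return;
            }
        }
    }
};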
+@return 0 if succeed, 1 if not */ +UNIV_INTERN +ulint +mutex_enter_nowait_func( +/*====================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name __attribute__((unused)), + /*!< in: file name where mutex + requested */ + ulint line __attribute__((unused))) + /*!< in: line where requested */ +{ + ut_ad(mutex_validate(mutex)); + + if (!ib_mutex_test_and_set(mutex)) { + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + + return(0); /* Succeeded! */ + } + + return(1); +} + +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks that the mutex has been initialized. +@return TRUE */ +UNIV_INTERN +ibool +mutex_validate( +/*===========*/ + const ib_mutex_t* mutex) /*!< in: mutex */ +{ + ut_a(mutex); + ut_a(mutex->magic_n == MUTEX_MAGIC_N); + + return(TRUE); +} + +/******************************************************************//** +Checks that the current thread owns the mutex. Works only in the debug +version. +@return TRUE if owns */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + const ib_mutex_t* mutex) /*!< in: mutex */ +{ + ut_ad(mutex_validate(mutex)); + + return(mutex_get_lock_word(mutex) == 1 + && os_thread_eq(mutex->thread_id, os_thread_get_curr_id())); +} + +/******************************************************************//** +Checks that the current thread owns the priority mutex. Works only +in the debug version. +@return TRUE if owns */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + const ib_prio_mutex_t* mutex) /*!< in: priority mutex */ +{ + return mutex_own(&mutex->base_mutex); +} + +#endif /* UNIV_DEBUG */ + +/******************************************************************//** +Sets the waiters field in a mutex. */ +UNIV_INTERN +void +mutex_set_waiters( +/*==============*/ + ib_mutex_t* mutex, /*!< in: mutex */ + ulint n) /*!< in: value to set */ +{ + volatile ulint* ptr; /* declared volatile to ensure that + the value is stored to memory */ + ut_ad(mutex); + + ptr = &(mutex->waiters); + + *ptr = n; /* Here we assume that the write of a single + word in memory is atomic */ + os_wmb; +} + +/******************************************************************//** +Reserves a mutex or a priority mutex for the current thread. If the mutex is +reserved, the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), +waiting for the mutex before suspending the thread. */ +UNIV_INTERN +void +mutex_spin_wait( +/*============*/ + void* _mutex, /*!< in: pointer to mutex */ + bool high_priority, /*!< in: whether the mutex is a + priority mutex with high priority + specified */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line) /*!< in: line where requested */ +{ + ulint i; /* spin round count */ + ulint index; /* index of the reserved wait cell */ + sync_array_t* sync_arr; + size_t counter_index; + /* The typecast below is performed for some of the priority mutexes + too, when !high_priority. This exploits the fact that regular mutex is + a prefix of the priority mutex in memory. */ + ib_mutex_t* mutex = (ib_mutex_t *) _mutex; + ib_prio_mutex_t* prio_mutex = NULL; + + counter_index = (size_t) os_thread_get_curr_id(); + + ut_ad(mutex); + + /* This update is not thread safe, but we don't mind if the count + isn't exact. Moved out of ifdef that follows because we are willing + to sacrifice the cost of counting this as the data is valuable. + Count the number of calls to mutex_spin_wait. 
*/ + mutex_spin_wait_count.add(counter_index, 1); + +mutex_loop: + + i = 0; + + /* Spin waiting for the lock word to become zero. Note that we do + not have to assume that the read access to the lock word is atomic, + as the actual locking is always committed with atomic test-and-set. + In reality, however, all processors probably have an atomic read of + a memory word. */ + +spin_loop: + os_rmb; + while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + + i++; + } + + if (i >= SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } + + mutex_spin_round_count.add(counter_index, i); + + if (ib_mutex_test_and_set(mutex) == 0) { + /* Succeeded! */ + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + return; + } + + /* We may end up with a situation where lock_word is 0 but the OS + fast mutex is still reserved. On FreeBSD the OS does not seem to + schedule a thread which is constantly calling pthread_mutex_trylock + (in ib_mutex_test_and_set implementation). Then we could end up + spinning here indefinitely. The following 'i++' stops this infinite + spin. */ + + i++; + + if (i < SYNC_SPIN_ROUNDS) { + goto spin_loop; + } + + sync_arr = sync_array_get_and_reserve_cell(mutex, + high_priority + ? SYNC_PRIO_MUTEX + : SYNC_MUTEX, + file_name, line, &index); + + /* The memory order of the array reservation and the change in the + waiters field is important: when we suspend a thread, we first + reserve the cell and then set waiters field to 1. When threads are + released in mutex_exit, the waiters field is first set to zero and + then the event is set to the signaled state. */ + + if (high_priority) { + + prio_mutex = reinterpret_cast<ib_prio_mutex_t *>(_mutex); + os_atomic_increment_ulint(&prio_mutex->high_priority_waiters, + 1); + } else { + + mutex_set_waiters(mutex, 1); + } + + /* Try to reserve still a few times */ + for (i = 0; i < 4; i++) { + if (ib_mutex_test_and_set(mutex) == 0) { + /* Succeeded! Free the reserved wait cell */ + + sync_array_free_cell(sync_arr, index); + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + + if (prio_mutex) { + os_atomic_decrement_ulint( + &prio_mutex->high_priority_waiters, + 1); + } + return; + + /* Note that in this case we leave the waiters field + set to 1. We cannot reset it to zero, as we do not + know if there are other waiters. */ + } + } + + /* Now we know that there has been some thread holding the mutex + after the change in the wait array and the waiters field was made. + Now there is no risk of infinite wait on the event. */ + + mutex_os_wait_count.add(counter_index, 1); + + mutex->count_os_wait++; + + sync_array_wait_event(sync_arr, index); + + if (prio_mutex) { + + os_atomic_decrement_ulint(&prio_mutex->high_priority_waiters, + 1); + } + + goto mutex_loop; +} + +/******************************************************************//** +Releases the threads waiting in the primary wait array for this mutex. */ +UNIV_INTERN +void +mutex_signal_object( +/*================*/ + ib_mutex_t* mutex) /*!< in: mutex */ +{ + mutex_set_waiters(mutex, 0); + + /* The memory order of resetting the waiters field and + signaling the object is important. See LEMMA 1 above. 
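mutex_signal_object() just below clears the waiters field first and sets the event second, while the enter path above sets waiters first and re-checks the lock second. The handshake, reduced to its ordering skeleton, with std::atomic and a condition variable standing in for the lock word, the waiters byte and the os_event (a sketch, not the InnoDB primitives):

#include <atomic>
#include <condition_variable>
#include <mutex>

static std::atomic<int>        lock_word{0}; /* 0 = free, 1 = held */
static std::atomic<int>        waiters{0};   /* 1 = someone may sleep */
static std::mutex              ev_m;
static std::condition_variable ev_cv;
static std::atomic<long>       ev_count{0};  /* event generation */

void slow_path_enter()
{
    for (;;) {
        if (lock_word.exchange(1) == 0) return; /* got the lock */

        long gen = ev_count.load();             /* "reset" the event */
        waiters.store(1);                       /* 1: announce */
        if (lock_word.exchange(1) == 0) return; /* 2: mandatory re-check */

        std::unique_lock<std::mutex> g(ev_m);   /* 3: now safe to sleep */
        ev_cv.wait(g, [&] { return ev_count.load() != gen; });
    }
}

void exit_path()
{
    lock_word.store(0);
    if (waiters.exchange(0) != 0) { /* clear waiters FIRST... */
        std::lock_guard<std::mutex> g(ev_m);
        ++ev_count;                 /* ...THEN signal the event */
        ev_cv.notify_all();
    }
}

If either side swapped its two steps, a release could slip between a waiter's last check and its sleep, which is exactly the infinite wait that LEMMA 1 rules out.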
*/ + os_event_set(mutex->event); + sync_array_object_signalled(); +} + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Sets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_set_debug_info( +/*=================*/ + ib_mutex_t* mutex, /*!< in: mutex */ + const char* file_name, /*!< in: file where requested */ + ulint line) /*!< in: line where requested */ +{ + ut_ad(mutex); + ut_ad(file_name); + + sync_thread_add_level(mutex, mutex->level, FALSE); + + mutex->file_name = file_name; + mutex->line = line; +} + +/******************************************************************//** +Gets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_get_debug_info( +/*=================*/ + ib_mutex_t* mutex, /*!< in: mutex */ + const char** file_name, /*!< out: file where requested */ + ulint* line, /*!< out: line where requested */ + os_thread_id_t* thread_id) /*!< out: id of the thread which owns + the mutex */ +{ + ut_ad(mutex); + + *file_name = mutex->file_name; + *line = mutex->line; + *thread_id = mutex->thread_id; +} + +/******************************************************************//** +Prints debug info of currently reserved mutexes. */ +static +void +mutex_list_print_info( +/*==================*/ + FILE* file) /*!< in: file where to print */ +{ + ib_mutex_t* mutex; + const char* file_name; + ulint line; + os_thread_id_t thread_id; + ulint count = 0; + + fputs("----------\n" + "MUTEX INFO\n" + "----------\n", file); + + mutex_enter(&mutex_list_mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex != NULL) { + count++; + + if (mutex_get_lock_word(mutex) != 0) { + mutex_get_debug_info(mutex, &file_name, &line, + &thread_id); + fprintf(file, + "Locked mutex: addr %p thread %ld" + " file %s line %ld\n", + (void*) mutex, os_thread_pf(thread_id), + file_name, line); + } + + mutex = UT_LIST_GET_NEXT(list, mutex); + } + + fprintf(file, "Total number of mutexes %ld\n", count); + + mutex_exit(&mutex_list_mutex); +} + +/******************************************************************//** +Counts currently reserved mutexes. Works only in the debug version. +@return number of reserved mutexes */ +UNIV_INTERN +ulint +mutex_n_reserved(void) +/*==================*/ +{ + ib_mutex_t* mutex; + ulint count = 0; + + mutex_enter(&mutex_list_mutex); + + for (mutex = UT_LIST_GET_FIRST(mutex_list); + mutex != NULL; + mutex = UT_LIST_GET_NEXT(list, mutex)) { + + if (mutex_get_lock_word(mutex) != 0) { + + count++; + } + } + + mutex_exit(&mutex_list_mutex); + + ut_a(count >= 1); + + /* Subtract one, because this function itself was holding + one mutex (mutex_list_mutex) */ + + return(count - 1); +} + +/******************************************************************//** +Returns TRUE if no mutex or rw-lock is currently locked. Works only in +the debug version. +@return TRUE if no mutexes and rw-locks reserved */ +UNIV_INTERN +ibool +sync_all_freed(void) +/*================*/ +{ + return(mutex_n_reserved() + rw_lock_n_locked() == 0); +} + +/******************************************************************//** +Looks for the thread slot for the calling thread. 
+@return pointer to thread slot, NULL if not found */ +static +sync_thread_t* +sync_thread_level_arrays_find_slot(void) +/*====================================*/ + +{ + ulint i; + os_thread_id_t id; + + id = os_thread_get_curr_id(); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + sync_thread_t* slot; + + slot = &sync_thread_level_arrays[i]; + + if (slot->levels && os_thread_eq(slot->id, id)) { + + return(slot); + } + } + + return(NULL); +} + +/******************************************************************//** +Looks for an unused thread slot. +@return pointer to thread slot */ +static +sync_thread_t* +sync_thread_level_arrays_find_free(void) +/*====================================*/ + +{ + ulint i; + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + sync_thread_t* slot; + + slot = &sync_thread_level_arrays[i]; + + if (slot->levels == NULL) { + + return(slot); + } + } + + return(NULL); +} + +/******************************************************************//** +Print warning. */ +static +void +sync_print_warning( +/*===============*/ + const sync_level_t* slot) /*!< in: slot for which to + print warning */ +{ + ib_mutex_t* mutex; + + mutex = static_cast<ib_mutex_t*>(slot->latch); + + if (mutex->magic_n == MUTEX_MAGIC_N) { + fprintf(stderr, + "Mutex '%s'\n", + mutex->cmutex_name); + + if (mutex_get_lock_word(mutex) != 0) { + ulint line; + const char* file_name; + os_thread_id_t thread_id; + + mutex_get_debug_info( + mutex, &file_name, &line, &thread_id); + + fprintf(stderr, + "InnoDB: Locked mutex:" + " addr %p thread %ld file %s line %ld\n", + (void*) mutex, os_thread_pf(thread_id), + file_name, (ulong) line); + } else { + fputs("Not locked\n", stderr); + } + } else { + rw_lock_t* lock; + + lock = static_cast<rw_lock_t*>(slot->latch); + + rw_lock_print(lock); + } +} + +/******************************************************************//** +Checks if all the level values stored in the level array are greater than +the given limit. +@return TRUE if all greater */ +static +ibool +sync_thread_levels_g( +/*=================*/ + sync_arr_t* arr, /*!< in: pointer to level array for an OS + thread */ + ulint limit, /*!< in: level limit */ + ulint warn) /*!< in: TRUE=display a diagnostic message */ +{ + ulint i; + + for (i = 0; i < arr->n_elems; i++) { + const sync_level_t* slot; + + slot = &arr->elems[i]; + + if (slot->latch != NULL && slot->level <= limit) { + if (warn) { + fprintf(stderr, + "InnoDB: sync levels should be" + " > %lu but a level is %lu\n", + (ulong) limit, (ulong) slot->level); + + sync_print_warning(slot); + } + + return(FALSE); + } + } + + return(TRUE); +} + +/******************************************************************//** +Checks if the level value is stored in the level array. +@return slot if found or NULL */ +static +const sync_level_t* +sync_thread_levels_contain( +/*=======================*/ + sync_arr_t* arr, /*!< in: pointer to level array for an OS + thread */ + ulint level) /*!< in: level */ +{ + ulint i; + + for (i = 0; i < arr->n_elems; i++) { + const sync_level_t* slot; + + slot = &arr->elems[i]; + + if (slot->latch != NULL && slot->level == level) { + + return(slot); + } + } + + return(NULL); +} + +/******************************************************************//** +Checks if the level array for the current thread contains a +mutex or rw-latch at the specified level. 
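Both helpers above enforce one invariant: a thread may take a new latch only when everything it already holds sits strictly higher in the latching order, modulo the explicitly listed exceptions. Stripped of the exceptions, the checker reduces to the following sketch (levels and names are illustrative; InnoDB uses the SYNC_* constants and per-thread slot arrays rather than thread_local):

#include <cassert>
#include <vector>

struct held_latch {
    const void* latch;
    unsigned    level;
};

thread_local std::vector<held_latch> held; /* this thread's latches */

/* Like sync_thread_levels_g(): every held level must exceed limit. */
static bool levels_g(unsigned limit)
{
    for (const held_latch& h : held) {
        if (h.level <= limit) {
            return false;
        }
    }
    return true;
}

/* Like sync_thread_add_level() for the common case. */
static void add_level(const void* latch, unsigned level)
{
    assert(levels_g(level)); /* ut_a()/ut_error in the real code */
    held.push_back({latch, level});
}

/* Like sync_thread_reset_level(). */
static void reset_level(const void* latch)
{
    for (size_t i = 0; i < held.size(); ++i) {
        if (held[i].latch == latch) {
            held.erase(held.begin() + i);
            return;
        }
    }
}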
+@return a matching latch, or NULL if not found */ +UNIV_INTERN +void* +sync_thread_levels_contains( +/*========================*/ + ulint level) /*!< in: latching order level + (SYNC_DICT, ...)*/ +{ + ulint i; + sync_arr_t* arr; + sync_thread_t* thread_slot; + + if (!sync_order_checks_on) { + + return(NULL); + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + mutex_exit(&sync_thread_mutex); + + return(NULL); + } + + arr = thread_slot->levels; + + for (i = 0; i < arr->n_elems; i++) { + sync_level_t* slot; + + slot = &arr->elems[i]; + + if (slot->latch != NULL && slot->level == level) { + + mutex_exit(&sync_thread_mutex); + return(slot->latch); + } + } + + mutex_exit(&sync_thread_mutex); + + return(NULL); +} + +/******************************************************************//** +Checks that the level array for the current thread is empty. +@return a latch, or NULL if empty except the exceptions specified below */ +UNIV_INTERN +void* +sync_thread_levels_nonempty_gen( +/*============================*/ + ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is + allowed to be owned by the thread */ +{ + ulint i; + sync_arr_t* arr; + sync_thread_t* thread_slot; + + if (!sync_order_checks_on) { + + return(NULL); + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + mutex_exit(&sync_thread_mutex); + + return(NULL); + } + + arr = thread_slot->levels; + + for (i = 0; i < arr->n_elems; ++i) { + const sync_level_t* slot; + + slot = &arr->elems[i]; + + if (slot->latch != NULL + && (!dict_mutex_allowed + || (slot->level != SYNC_DICT + && slot->level != SYNC_DICT_OPERATION + && slot->level != SYNC_FTS_CACHE))) { + + mutex_exit(&sync_thread_mutex); + ut_error; + + return(slot->latch); + } + } + + mutex_exit(&sync_thread_mutex); + + return(NULL); +} + +/******************************************************************//** +Checks if the level array for the current thread is empty, +except for the btr_search_latch. +@return a latch, or NULL if empty except the exceptions specified below */ +UNIV_INTERN +void* +sync_thread_levels_nonempty_trx( +/*============================*/ + ibool has_search_latch) + /*!< in: TRUE if and only if the thread + is supposed to hold btr_search_latch */ +{ + ulint i; + sync_arr_t* arr; + sync_thread_t* thread_slot; + + if (!sync_order_checks_on) { + + return(NULL); + } + + ut_a(!has_search_latch + || sync_thread_levels_contains(SYNC_SEARCH_SYS)); + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + mutex_exit(&sync_thread_mutex); + + return(NULL); + } + + arr = thread_slot->levels; + + for (i = 0; i < arr->n_elems; ++i) { + const sync_level_t* slot; + + slot = &arr->elems[i]; + + if (slot->latch != NULL + && (!has_search_latch + || slot->level != SYNC_SEARCH_SYS)) { + + mutex_exit(&sync_thread_mutex); + ut_error; + + return(slot->latch); + } + } + + mutex_exit(&sync_thread_mutex); + + return(NULL); +} + +/******************************************************************//** +Adds a latch and its level in the thread level array. Allocates the memory +for the array if called first time for this OS thread. Makes the checks +against other latch levels stored in the array for this thread. 
*/ +UNIV_INTERN +void +sync_thread_add_level( +/*==================*/ + void* latch, /*!< in: pointer to a mutex or an rw-lock */ + ulint level, /*!< in: level in the latching order; if + SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /*!< in: TRUE if re-entering an x-lock */ +{ + ulint i; + sync_level_t* slot; + sync_arr_t* array; + sync_thread_t* thread_slot; + + if (!sync_order_checks_on) { + + return; + } + + if ((latch == (void*) &sync_thread_mutex) + || (latch == (void*) &mutex_list_mutex) + || (latch == (void*) &rw_lock_debug_mutex) + || (latch == (void*) &rw_lock_list_mutex)) { + + return; + } + + if (level == SYNC_LEVEL_VARYING) { + + return; + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + ulint sz; + + sz = sizeof(*array) + + (sizeof(*array->elems) * SYNC_THREAD_N_LEVELS); + + /* We have to allocate the level array for a new thread */ + array = static_cast<sync_arr_t*>(calloc(sz, sizeof(char))); + ut_a(array != NULL); + + array->next_free = ULINT_UNDEFINED; + array->max_elems = SYNC_THREAD_N_LEVELS; + array->elems = (sync_level_t*) &array[1]; + + thread_slot = sync_thread_level_arrays_find_free(); + + thread_slot->levels = array; + thread_slot->id = os_thread_get_curr_id(); + } + + array = thread_slot->levels; + + if (relock) { + goto levels_ok; + } + + /* NOTE that there is a problem with _NODE and _LEAF levels: if the + B-tree height changes, then a leaf can change to an internal node + or the other way around. We do not know at present if this can cause + unnecessary assertion failures below. */ + + switch (level) { + case SYNC_NO_ORDER_CHECK: + case SYNC_EXTERN_STORAGE: + case SYNC_TREE_NODE_FROM_HASH: + /* Do no order checking */ + break; + case SYNC_TRX_SYS_HEADER: + if (srv_is_being_started) { + /* This is violated during trx_sys_create_rsegs() + when creating additional rollback segments when + upgrading in innobase_start_or_create_for_mysql(). */ + break; + } + case SYNC_MEM_POOL: + case SYNC_MEM_HASH: + case SYNC_RECV: + case SYNC_FTS_BG_THREADS: + case SYNC_WORK_QUEUE: + case SYNC_FTS_TOKENIZE: + case SYNC_FTS_OPTIMIZE: + case SYNC_FTS_CACHE: + case SYNC_FTS_CACHE_INIT: + case SYNC_LOG_ONLINE: + case SYNC_LOG: + case SYNC_LOG_FLUSH_ORDER: + case SYNC_ANY_LATCH: + case SYNC_FILE_FORMAT_TAG: + case SYNC_DOUBLEWRITE: + case SYNC_THREADS: + case SYNC_LOCK_SYS: + case SYNC_LOCK_WAIT_SYS: + case SYNC_TRX_SYS: + case SYNC_IBUF_BITMAP_MUTEX: + case SYNC_RSEG: + case SYNC_TRX_UNDO: + case SYNC_PURGE_LATCH: + case SYNC_PURGE_QUEUE: + case SYNC_DICT_AUTOINC_MUTEX: + case SYNC_DICT_OPERATION: + case SYNC_DICT_HEADER: + case SYNC_TRX_I_S_RWLOCK: + case SYNC_TRX_I_S_LAST_READ: + case SYNC_IBUF_MUTEX: + case SYNC_INDEX_ONLINE_LOG: + case SYNC_STATS_AUTO_RECALC: + if (!sync_thread_levels_g(array, level, TRUE)) { + fprintf(stderr, + "InnoDB: sync_thread_levels_g(array, %lu)" + " does not hold!\n", level); + ut_error; + } + break; + case SYNC_TRX: + /* Either the thread must own the lock_sys->mutex, or + it is allowed to own only ONE trx->mutex. 
*/ + if (!sync_thread_levels_g(array, level, FALSE)) { + ut_a(sync_thread_levels_g(array, level - 1, TRUE)); + ut_a(sync_thread_levels_contain(array, SYNC_LOCK_SYS)); + } + break; + case SYNC_SEARCH_SYS: { + /* Verify the lock order inside the split btr_search_latch + array */ + bool found_current = false; + for (ulint i = 0; i < btr_search_index_num; i++) { + if (&btr_search_latch_arr[i] == latch) { + found_current = true; + } else if (found_current) { + ut_ad(!rw_lock_own(&btr_search_latch_arr[i], + RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch_arr[i], + RW_LOCK_EX)); + } + } + ut_ad(found_current); + + /* fallthrough */ + } + case SYNC_BUF_FLUSH_LIST: + case SYNC_BUF_LRU_LIST: + case SYNC_BUF_FREE_LIST: + case SYNC_BUF_ZIP_FREE: + case SYNC_BUF_ZIP_HASH: + case SYNC_BUF_FLUSH_STATE: + /* We can have multiple mutexes of this type therefore we + can only check whether the greater than condition holds. */ + if (!sync_thread_levels_g(array, level-1, TRUE)) { + fprintf(stderr, + "InnoDB: sync_thread_levels_g(array, %lu)" + " does not hold!\n", level-1); + ut_error; + } + break; + + + case SYNC_BUF_PAGE_HASH: + /* Multiple page_hash locks are only allowed during + buf_validate. */ + /* Fall through */ + + case SYNC_BUF_BLOCK: + if (!sync_thread_levels_g(array, level, FALSE)) { + ut_a(sync_thread_levels_g(array, level - 1, TRUE)); + } + break; + case SYNC_REC_LOCK: + if (sync_thread_levels_contain(array, SYNC_LOCK_SYS)) { + ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK - 1, + TRUE)); + } else { + ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK, TRUE)); + } + break; + case SYNC_IBUF_BITMAP: + /* Either the thread must own the master mutex to all + the bitmap pages, or it is allowed to latch only ONE + bitmap page. */ + if (sync_thread_levels_contain(array, + SYNC_IBUF_BITMAP_MUTEX)) { + ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP - 1, + TRUE)); + } else { + /* This is violated during trx_sys_create_rsegs() + when creating additional rollback segments when + upgrading in innobase_start_or_create_for_mysql(). */ + ut_a(srv_is_being_started + || sync_thread_levels_g(array, SYNC_IBUF_BITMAP, + TRUE)); + } + break; + case SYNC_FSP_PAGE: + ut_a(sync_thread_levels_contain(array, SYNC_FSP)); + break; + case SYNC_FSP: + ut_a(sync_thread_levels_contain(array, SYNC_FSP) + || sync_thread_levels_g(array, SYNC_FSP, TRUE)); + break; + case SYNC_TRX_UNDO_PAGE: + /* Purge is allowed to read in as many UNDO pages as it likes, + there was a bogus rule here earlier that forced the caller to + acquire the purge_sys_t::mutex. The purge mutex did not really + protect anything because it was only ever acquired by the + single purge thread. The purge thread can read the UNDO pages + without any covering mutex. 
*/ + + ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO) + || sync_thread_levels_contain(array, SYNC_RSEG) + || sync_thread_levels_g(array, level - 1, TRUE)); + break; + case SYNC_RSEG_HEADER: + ut_a(sync_thread_levels_contain(array, SYNC_RSEG)); + break; + case SYNC_RSEG_HEADER_NEW: + ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE)); + break; + case SYNC_TREE_NODE: + ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE) + || sync_thread_levels_contain(array, SYNC_DICT_OPERATION) + || sync_thread_levels_g(array, SYNC_TREE_NODE - 1, TRUE)); + break; + case SYNC_TREE_NODE_NEW: + ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE)); + break; + case SYNC_INDEX_TREE: + ut_a(sync_thread_levels_g(array, SYNC_TREE_NODE - 1, TRUE)); + break; + case SYNC_IBUF_TREE_NODE: + ut_a(sync_thread_levels_contain(array, SYNC_IBUF_INDEX_TREE) + || sync_thread_levels_g(array, SYNC_IBUF_TREE_NODE - 1, + TRUE)); + break; + case SYNC_IBUF_TREE_NODE_NEW: + /* ibuf_add_free_page() allocates new pages for the + change buffer while only holding the tablespace + x-latch. These pre-allocated new pages may only be + taken in use while holding ibuf_mutex, in + btr_page_alloc_for_ibuf(). */ + ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX) + || sync_thread_levels_contain(array, SYNC_FSP)); + break; + case SYNC_IBUF_INDEX_TREE: + if (sync_thread_levels_contain(array, SYNC_FSP)) { + ut_a(sync_thread_levels_g(array, level - 1, TRUE)); + } else { + ut_a(sync_thread_levels_g( + array, SYNC_IBUF_TREE_NODE - 1, TRUE)); + } + break; + case SYNC_IBUF_PESS_INSERT_MUTEX: + ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE)); + ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + break; + case SYNC_IBUF_HEADER: + ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE)); + ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + ut_a(!sync_thread_levels_contain(array, + SYNC_IBUF_PESS_INSERT_MUTEX)); + break; + case SYNC_DICT: +#ifdef UNIV_DEBUG + ut_a(buf_debug_prints + || sync_thread_levels_g(array, SYNC_DICT, TRUE)); +#else /* UNIV_DEBUG */ + ut_a(sync_thread_levels_g(array, SYNC_DICT, TRUE)); +#endif /* UNIV_DEBUG */ + break; + default: + ut_error; + } + +levels_ok: + if (array->next_free == ULINT_UNDEFINED) { + ut_a(array->n_elems < array->max_elems); + + i = array->n_elems++; + } else { + i = array->next_free; + array->next_free = array->elems[i].level; + } + + ut_a(i < array->n_elems); + ut_a(i != ULINT_UNDEFINED); + + ++array->in_use; + + slot = &array->elems[i]; + + ut_a(slot->latch == NULL); + + slot->latch = latch; + slot->level = level; + + mutex_exit(&sync_thread_mutex); +} + +/******************************************************************//** +Removes a latch from the thread level array if it is found there. 
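The levels_ok tail of sync_thread_add_level() above and the removal path documented next share a small allocator trick: a vacated slot's level field is recycled to hold the index of the next free slot, so the free slots form an intrusive list headed by next_free. A compact sketch of that allocator (fixed capacity, illustrative):

#include <cassert>
#include <cstddef>

static const size_t UNDEF = (size_t) -1; /* like ULINT_UNDEFINED */

struct slot {
    const void* latch; /* NULL means the slot is free */
    size_t      level; /* latch level, or next-free index when free */
};

struct slot_array {
    slot   elems[16];
    size_t n_elems   = 0;     /* high-water mark */
    size_t next_free = UNDEF; /* head of the free list */
    size_t in_use    = 0;

    size_t alloc() {
        size_t i;
        if (next_free == UNDEF) {
            assert(n_elems < 16);
            i = n_elems++;            /* grow into fresh space */
        } else {
            i = next_free;            /* pop the free list */
            next_free = elems[i].level;
        }
        ++in_use;
        return i;
    }

    void free_slot(size_t i) {
        elems[i].latch = NULL;
        elems[i].level = next_free;   /* push onto the free list */
        next_free = i;
        if (--in_use == 0) {          /* all idle: restart from 0 */
            n_elems = 0;
            next_free = UNDEF;
        }
    }
};

Resetting n_elems and next_free once in_use reaches zero keeps later scans over n_elems short, as the comment in sync_thread_reset_level() notes.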
+@return TRUE if found in the array; it is no error if the latch is +not found, as we presently are not able to determine the level for +every latch reservation the program does */ +UNIV_INTERN +ibool +sync_thread_reset_level( +/*====================*/ + void* latch) /*!< in: pointer to a mutex or an rw-lock */ +{ + sync_arr_t* array; + sync_thread_t* thread_slot; + ulint i; + + if (!sync_order_checks_on) { + + return(FALSE); + } + + if ((latch == (void*) &sync_thread_mutex) + || (latch == (void*) &mutex_list_mutex) + || (latch == (void*) &rw_lock_debug_mutex) + || (latch == (void*) &rw_lock_list_mutex)) { + + return(FALSE); + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + ut_error; + + mutex_exit(&sync_thread_mutex); + return(FALSE); + } + + array = thread_slot->levels; + + for (i = 0; i < array->n_elems; i++) { + sync_level_t* slot; + + slot = &array->elems[i]; + + if (slot->latch != latch) { + continue; + } + + slot->latch = NULL; + + /* Update the free slot list. See comment in sync_level_t + for the level field. */ + slot->level = array->next_free; + array->next_free = i; + + ut_a(array->in_use >= 1); + --array->in_use; + + /* If all cells are idle then reset the free + list. The assumption is that this will save + time when we need to scan up to n_elems. */ + + if (array->in_use == 0) { + array->n_elems = 0; + array->next_free = ULINT_UNDEFINED; + } + + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + + if (((ib_mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) { + rw_lock_t* rw_lock; + + rw_lock = (rw_lock_t*) latch; + + if (rw_lock->level == SYNC_LEVEL_VARYING) { + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + } + + ut_error; + + mutex_exit(&sync_thread_mutex); + + return(FALSE); +} +#endif /* UNIV_SYNC_DEBUG */ + +/******************************************************************//** +Initializes the synchronization data structures. */ +UNIV_INTERN +void +sync_init(void) +/*===========*/ +{ + ut_a(sync_initialized == FALSE); + + sync_initialized = TRUE; + + sync_array_init(OS_THREAD_MAX_N); + +#ifdef UNIV_SYNC_DEBUG + /* Create the thread latch level array where the latch levels + are stored for each OS thread */ + + sync_thread_level_arrays = static_cast<sync_thread_t*>( + calloc(sizeof(sync_thread_t), OS_THREAD_MAX_N)); + + ut_a(sync_thread_level_arrays != NULL); + +#endif /* UNIV_SYNC_DEBUG */ + /* Init the mutex list and create the mutex to protect it. */ + + UT_LIST_INIT(mutex_list); + UT_LIST_INIT(prio_mutex_list); + mutex_create(mutex_list_mutex_key, &mutex_list_mutex, + SYNC_NO_ORDER_CHECK); +#ifdef UNIV_SYNC_DEBUG + mutex_create(sync_thread_mutex_key, &sync_thread_mutex, + SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + /* Init the rw-lock list and create the mutex to protect it. */ + + UT_LIST_INIT(rw_lock_list); + mutex_create(rw_lock_list_mutex_key, &rw_lock_list_mutex, + SYNC_NO_ORDER_CHECK); + +#ifdef UNIV_SYNC_DEBUG + mutex_create(rw_lock_debug_mutex_key, &rw_lock_debug_mutex, + SYNC_NO_ORDER_CHECK); + + rw_lock_debug_event = os_event_create(); + rw_lock_debug_waiters = FALSE; +#endif /* UNIV_SYNC_DEBUG */ +} + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Frees all debug memory. 
*/ +static +void +sync_thread_level_arrays_free(void) +/*===============================*/ + +{ + ulint i; + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + sync_thread_t* slot; + + slot = &sync_thread_level_arrays[i]; + + /* If this slot was allocated then free the slot memory too. */ + if (slot->levels != NULL) { + free(slot->levels); + slot->levels = NULL; + } + } + + free(sync_thread_level_arrays); + sync_thread_level_arrays = NULL; +} +#endif /* UNIV_SYNC_DEBUG */ + +/******************************************************************//** +Frees the resources in InnoDB's own synchronization data structures. Use +os_sync_free() after calling this. */ +UNIV_INTERN +void +sync_close(void) +/*===========*/ +{ + ib_mutex_t* mutex; + ib_prio_mutex_t* prio_mutex; + + sync_array_close(); + + for (prio_mutex = UT_LIST_GET_FIRST(prio_mutex_list); prio_mutex;) { + mutex_free(prio_mutex); + prio_mutex = UT_LIST_GET_FIRST(prio_mutex_list); + } + + for (mutex = UT_LIST_GET_FIRST(mutex_list); + mutex != NULL; + /* No op */) { + +#ifdef UNIV_MEM_DEBUG + if (mutex == &mem_hash_mutex) { + mutex = UT_LIST_GET_NEXT(list, mutex); + continue; + } +#endif /* UNIV_MEM_DEBUG */ + + mutex_free(mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); + } + + mutex_free(&mutex_list_mutex); +#ifdef UNIV_SYNC_DEBUG + mutex_free(&sync_thread_mutex); + + /* Switch latching order checks on in sync0sync.cc */ + sync_order_checks_on = FALSE; + + sync_thread_level_arrays_free(); +#endif /* UNIV_SYNC_DEBUG */ + + sync_initialized = FALSE; +} + +/*******************************************************************//** +Prints wait info of the sync system. */ +UNIV_INTERN +void +sync_print_wait_info( +/*=================*/ + FILE* file) /*!< in: file where to print */ +{ + fprintf(file, + "Mutex spin waits " UINT64PF ", rounds " UINT64PF ", " + "OS waits " UINT64PF "\n" + "RW-shared spins " UINT64PF ", rounds " UINT64PF ", " + "OS waits " UINT64PF "\n" + "RW-excl spins " UINT64PF ", rounds " UINT64PF ", " + "OS waits " UINT64PF "\n", + (ib_uint64_t) mutex_spin_wait_count, + (ib_uint64_t) mutex_spin_round_count, + (ib_uint64_t) mutex_os_wait_count, + (ib_uint64_t) rw_lock_stats.rw_s_spin_wait_count, + (ib_uint64_t) rw_lock_stats.rw_s_spin_round_count, + (ib_uint64_t) rw_lock_stats.rw_s_os_wait_count, + (ib_uint64_t) rw_lock_stats.rw_x_spin_wait_count, + (ib_uint64_t) rw_lock_stats.rw_x_spin_round_count, + (ib_uint64_t) rw_lock_stats.rw_x_os_wait_count); + + fprintf(file, + "Spin rounds per wait: %.2f mutex, %.2f RW-shared, " + "%.2f RW-excl\n", + (double) mutex_spin_round_count / + (mutex_spin_wait_count ? mutex_spin_wait_count : 1), + (double) rw_lock_stats.rw_s_spin_round_count / + (rw_lock_stats.rw_s_spin_wait_count + ? rw_lock_stats.rw_s_spin_wait_count : 1), + (double) rw_lock_stats.rw_x_spin_round_count / + (rw_lock_stats.rw_x_spin_wait_count + ? rw_lock_stats.rw_x_spin_wait_count : 1)); +} + +/*******************************************************************//** +Prints info of the sync system. 
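sync_print_wait_info() above derives a rounds-per-wait ratio from the raw counters, guarding the divisor so that a zero wait count does not divide by zero. The computation in isolation (a trivial sketch):

#include <cstdint>
#include <cstdio>

static double rounds_per_wait(int64_t rounds, int64_t waits)
{
    return (double) rounds / (waits ? waits : 1);
}

int main()
{
    printf("%.2f\n", rounds_per_wait(90000, 3000)); /* 30.00 */
    printf("%.2f\n", rounds_per_wait(0, 0));        /* 0.00, no division by zero */
    return 0;
}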
*/ +UNIV_INTERN +void +sync_print( +/*=======*/ + FILE* file) /*!< in: file where to print */ +{ +#ifdef UNIV_SYNC_DEBUG + mutex_list_print_info(file); + + rw_lock_list_print_info(file); +#endif /* UNIV_SYNC_DEBUG */ + + sync_array_print(file); + + sync_print_wait_info(file); +} diff --git a/storage/xtradb/trx/trx0i_s.cc b/storage/xtradb/trx/trx0i_s.cc new file mode 100644 index 00000000000..794ee432ca4 --- /dev/null +++ b/storage/xtradb/trx/trx0i_s.cc @@ -0,0 +1,1688 @@ +/***************************************************************************** + +Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0i_s.cc +INFORMATION SCHEMA innodb_trx, innodb_locks and +innodb_lock_waits tables fetch code. + +The code below fetches information needed to fill those +3 dynamic tables and uploads it into a "transactions +table cache" for later retrieval. + +Created July 17, 2007 Vasil Dimov +*******************************************************/ + +/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels: + The includes "univ.i" -> "my_global.h" cause a different path + to be taken further down with pthread functions and types, + so they must come first. + From the symptoms, this is related to bug#46587 in the MySQL bug DB. +*/ +#include "univ.i" + +#include <mysql/plugin.h> + +#include "buf0buf.h" +#include "dict0dict.h" +#include "ha0storage.h" +#include "ha_prototypes.h" +#include "hash0hash.h" +#include "lock0iter.h" +#include "lock0lock.h" +#include "mem0mem.h" +#include "page0page.h" +#include "rem0rec.h" +#include "row0row.h" +#include "srv0srv.h" +#include "sync0rw.h" +#include "sync0sync.h" +#include "sync0types.h" +#include "trx0i_s.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "ut0mem.h" +#include "ut0ut.h" + +/** Initial number of rows in the table cache */ +#define TABLE_CACHE_INITIAL_ROWSNUM 1024 + +/** @brief The maximum number of chunks to allocate for a table cache. + +The rows of a table cache are stored in a set of chunks. When a new +row is added a new chunk is allocated if necessary. Assuming that the +first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each +subsequent is N/2 where N is the number of rows we have allocated till +now, then 39th chunk would accommodate 1677416425 rows and all chunks +would accommodate 3354832851 rows. */ +#define MEM_CHUNKS_IN_TABLE_CACHE 39 + +/** The following are some testing auxiliary macros. Do not enable them +in a production environment. */ +/* @{ */ + +#if 0 +/** If this is enabled then lock folds will always be different +resulting in equal rows being put in a different cells of the hash +table. 
Checking for duplicates will be flawed because different +fold will be calculated when a row is searched in the hash table. */ +#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT +#endif + +#if 0 +/** This effectively kills the search-for-duplicate-before-adding-a-row +function, but searching in the hash is still performed. It will always +be assumed that lock is not present and insertion will be performed in +the hash table. */ +#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T +#endif + +#if 0 +/** This aggressively repeats adding each row many times. Depending on +the above settings this may be noop or may result in lots of rows being +added. */ +#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES +#endif + +#if 0 +/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash +table search is not performed at all. */ +#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS +#endif + +#if 0 +/** Do not insert each row into the hash table, duplicates may appear +if this is enabled, also if this is enabled searching into the hash is +noop because it will be empty. */ +#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE +#endif +/* @} */ + +/** Memory limit passed to ha_storage_put_memlim(). +@param cache hash storage +@return maximum allowed allocation size */ +#define MAX_ALLOWED_FOR_STORAGE(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd) + +/** Memory limit in table_cache_create_empty_row(). +@param cache hash storage +@return maximum allowed allocation size */ +#define MAX_ALLOWED_FOR_ALLOC(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd \ + - ha_storage_get_size((cache)->storage)) + +/** Memory for each table in the intermediate buffer is allocated in +separate chunks. These chunks are considered to be concatenated to +represent one flat array of rows. */ +struct i_s_mem_chunk_t { + ulint offset; /*!< offset, in number of rows */ + ulint rows_allocd; /*!< the size of this chunk, in number + of rows */ + void* base; /*!< start of the chunk */ +}; + +/** This represents one table's cache. 
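The chunk array above implements the growth rule quoted for MEM_CHUNKS_IN_TABLE_CACHE: the first chunk holds TABLE_CACHE_INITIAL_ROWSNUM rows and each later chunk holds half of everything allocated so far, so total capacity grows by roughly 1.5x per chunk. A few lines of arithmetic make the progression concrete (a sketch; exact figures also depend on mem_alloc2() rounding):

#include <cstdint>
#include <cstdio>

int main()
{
    uint64_t total = 1024; /* TABLE_CACHE_INITIAL_ROWSNUM */

    printf("chunk  1: %8llu rows, total %8llu\n",
           1024ULL, (unsigned long long) total);

    for (int i = 2; i <= 10; i++) {
        uint64_t chunk = total / 2; /* new = old + old / 2 */
        total += chunk;
        printf("chunk %2d: %8llu rows, total %8llu\n",
               i, (unsigned long long) chunk,
               (unsigned long long) total);
    }
    /* Growing by 1.5x per chunk, a few dozen chunks reach the
    billions of rows quoted in the comment above. */
    return 0;
}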
*/ +struct i_s_table_cache_t { + ulint rows_used; /*!< number of used rows */ + ulint rows_allocd; /*!< number of allocated rows */ + ulint row_size; /*!< size of a single row */ + i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of + memory chunks that stores the + rows */ +}; + +/** This structure describes the intermediate buffer */ +struct trx_i_s_cache_t { + rw_lock_t rw_lock; /*!< read-write lock protecting + the rest of this structure */ + ullint last_read; /*!< last time the cache was read; + measured in microseconds since + epoch */ + ib_mutex_t last_read_mutex;/*!< mutex protecting the + last_read member - it is updated + inside a shared lock of the + rw_lock member */ + i_s_table_cache_t innodb_trx; /*!< innodb_trx table */ + i_s_table_cache_t innodb_locks; /*!< innodb_locks table */ + i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */ +/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */ +#define LOCKS_HASH_CELLS_NUM 10000 + hash_table_t* locks_hash; /*!< hash table used to eliminate + duplicate entries in the + innodb_locks table */ +/** Initial size of the cache storage */ +#define CACHE_STORAGE_INITIAL_SIZE 1024 +/** Number of hash cells in the cache storage */ +#define CACHE_STORAGE_HASH_CELLS 2048 + ha_storage_t* storage; /*!< storage for external volatile + data that may become unavailable + when we release + lock_sys->mutex or trx_sys->mutex */ + ulint mem_allocd; /*!< the amount of memory + allocated with mem_alloc*() */ + ibool is_truncated; /*!< this is TRUE if the memory + limit was hit and thus the data + in the cache is truncated */ +}; + +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +static trx_i_s_cache_t trx_i_s_cache_static; +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +UNIV_INTERN trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static; + +/* Key to register the lock/mutex with performance schema */ +#ifdef UNIV_PFS_RWLOCK +UNIV_INTERN mysql_pfs_key_t trx_i_s_cache_lock_key; +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t cache_last_read_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/*******************************************************************//** +For a record lock that is in waiting state retrieves the only bit that +is set, for a table lock returns ULINT_UNDEFINED. +@return record number within the heap */ +static +ulint +wait_lock_get_heap_no( +/*==================*/ + const lock_t* lock) /*!< in: lock */ +{ + ulint ret; + + switch (lock_get_type(lock)) { + case LOCK_REC: + ret = lock_rec_find_set_bit(lock); + ut_a(ret != ULINT_UNDEFINED); + break; + case LOCK_TABLE: + ret = ULINT_UNDEFINED; + break; + default: + ut_error; + } + + return(ret); +} + +/*******************************************************************//** +Initializes the members of a table cache. 
*/ +static +void +table_cache_init( +/*=============*/ + i_s_table_cache_t* table_cache, /*!< out: table cache */ + size_t row_size) /*!< in: the size of a + row */ +{ + ulint i; + + table_cache->rows_used = 0; + table_cache->rows_allocd = 0; + table_cache->row_size = row_size; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + /* the memory is actually allocated in + table_cache_create_empty_row() */ + table_cache->chunks[i].base = NULL; + } +} + +/*******************************************************************//** +Frees a table cache. */ +static +void +table_cache_free( +/*=============*/ + i_s_table_cache_t* table_cache) /*!< in/out: table cache */ +{ + ulint i; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + /* the memory is actually allocated in + table_cache_create_empty_row() */ + if (table_cache->chunks[i].base) { + mem_free(table_cache->chunks[i].base); + table_cache->chunks[i].base = NULL; + } + } +} + +/*******************************************************************//** +Returns an empty row from a table cache. The row is allocated if no more +empty rows are available. The number of used rows is incremented. +If the memory limit is hit then NULL is returned and nothing is +allocated. +@return empty row, or NULL if out of memory */ +static +void* +table_cache_create_empty_row( +/*=========================*/ + i_s_table_cache_t* table_cache, /*!< in/out: table cache */ + trx_i_s_cache_t* cache) /*!< in/out: cache to record + how many bytes are + allocated */ +{ + ulint i; + void* row; + + ut_a(table_cache->rows_used <= table_cache->rows_allocd); + + if (table_cache->rows_used == table_cache->rows_allocd) { + + /* rows_used == rows_allocd means that new chunk needs + to be allocated: either no more empty rows in the + last allocated chunk or nothing has been allocated yet + (rows_num == rows_allocd == 0); */ + + i_s_mem_chunk_t* chunk; + ulint req_bytes; + ulint got_bytes; + ulint req_rows; + ulint got_rows; + + /* find the first not allocated chunk */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].base == NULL) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + have been allocated :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + /* allocate the chunk we just found */ + + if (i == 0) { + + /* first chunk, nothing is allocated yet */ + req_rows = TABLE_CACHE_INITIAL_ROWSNUM; + } else { + + /* Memory is increased by the formula + new = old + old / 2; We are trying not to be + aggressive here (= using the common new = old * 2) + because the allocated memory will not be freed + until InnoDB exit (it is reused). So it is better + to once allocate the memory in more steps, but + have less unused/wasted memory than to use less + steps in allocation (which is done once in a + lifetime) but end up with lots of unused/wasted + memory. 
*/ + req_rows = table_cache->rows_allocd / 2; + } + req_bytes = req_rows * table_cache->row_size; + + if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) { + + return(NULL); + } + + chunk = &table_cache->chunks[i]; + + chunk->base = mem_alloc2(req_bytes, &got_bytes); + + got_rows = got_bytes / table_cache->row_size; + + cache->mem_allocd += got_bytes; + +#if 0 + printf("allocating chunk %d req bytes=%lu, got bytes=%lu, " + "row size=%lu, " + "req rows=%lu, got rows=%lu\n", + i, req_bytes, got_bytes, + table_cache->row_size, + req_rows, got_rows); +#endif + + chunk->rows_allocd = got_rows; + + table_cache->rows_allocd += got_rows; + + /* adjust the offset of the next chunk */ + if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) { + + table_cache->chunks[i + 1].offset + = chunk->offset + chunk->rows_allocd; + } + + /* return the first empty row in the newly allocated + chunk */ + row = chunk->base; + } else { + + char* chunk_start; + ulint offset; + + /* there is an empty row, no need to allocate new + chunks */ + + /* find the first chunk that contains allocated but + empty/unused rows */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd + > table_cache->rows_used) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + are full, but + table_cache->rows_used != table_cache->rows_allocd means + exactly the opposite - there are allocated but + empty/unused rows :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + chunk_start = (char*) table_cache->chunks[i].base; + offset = table_cache->rows_used + - table_cache->chunks[i].offset; + + row = chunk_start + offset * table_cache->row_size; + } + + table_cache->rows_used++; + + return(row); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Validates a row in the locks cache. +@return TRUE if valid */ +static +ibool +i_s_locks_row_validate( +/*===================*/ + const i_s_locks_row_t* row) /*!< in: row to validate */ +{ + ut_ad(row->lock_trx_id != 0); + ut_ad(row->lock_mode != NULL); + ut_ad(row->lock_type != NULL); + ut_ad(row->lock_table != NULL); + ut_ad(row->lock_table_id != 0); + + if (row->lock_space == ULINT_UNDEFINED) { + /* table lock */ + ut_ad(!strcmp("TABLE", row->lock_type)); + ut_ad(row->lock_index == NULL); + ut_ad(row->lock_data == NULL); + ut_ad(row->lock_page == ULINT_UNDEFINED); + ut_ad(row->lock_rec == ULINT_UNDEFINED); + } else { + /* record lock */ + ut_ad(!strcmp("RECORD", row->lock_type)); + ut_ad(row->lock_index != NULL); + /* row->lock_data == NULL if buf_page_try_get() == NULL */ + ut_ad(row->lock_page != ULINT_UNDEFINED); + ut_ad(row->lock_rec != ULINT_UNDEFINED); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Fills i_s_trx_row_t object. +If memory can not be allocated then FALSE is returned. 
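
The "new = old + old / 2" growth policy of table_cache_create_empty_row() above, in isolation; the initial row count is assumed (the real constant is TABLE_CACHE_INITIAL_ROWSNUM):

    #include <stdio.h>

    int
    main(void)
    {
            unsigned long rows_allocd = 0; /* total rows over all chunks */
            unsigned long offset = 0;      /* where the next chunk starts */
            int i;

            for (i = 0; i < 8; i++) {
                    /* first chunk gets the initial size, later ones old / 2 */
                    unsigned long req = (i == 0) ? 1024 : rows_allocd / 2;

                    printf("chunk %d: offset=%lu rows=%lu\n", i, offset, req);

                    offset += req;
                    rows_allocd += req;    /* new = old + old / 2 */
            }

            /* totals grow 1024, 1536, 2304, 3456, 5184, ... */
            return(0);
    }
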
+@return FALSE if allocation fails */ +static +ibool +fill_trx_row( +/*=========*/ + i_s_trx_row_t* row, /*!< out: result object + that's filled */ + const trx_t* trx, /*!< in: transaction to + get data from */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + corresponding row in + innodb_locks if trx is + waiting or NULL if trx + is not waiting */ + trx_i_s_cache_t* cache) /*!< in/out: cache into + which to copy volatile + strings */ +{ + const char* stmt; + size_t stmt_len; + const char* s; + + ut_ad(lock_mutex_own()); + + row->trx_id = trx->id; + row->trx_started = (ib_time_t) trx->start_time; + row->trx_state = trx_get_que_state_str(trx); + row->requested_lock_row = requested_lock_row; + ut_ad(requested_lock_row == NULL + || i_s_locks_row_validate(requested_lock_row)); + + if (trx->lock.wait_lock != NULL) { + + ut_a(requested_lock_row != NULL); + row->trx_wait_started = (ib_time_t) trx->lock.wait_started; + } else { + ut_a(requested_lock_row == NULL); + row->trx_wait_started = 0; + } + + row->trx_weight = (ullint) TRX_WEIGHT(trx); + + if (trx->mysql_thd == NULL) { + /* For internal transactions e.g., purge and transactions + being recovered at startup there is no associated MySQL + thread data structure. */ + row->trx_mysql_thread_id = 0; + row->trx_query = NULL; + goto thd_done; + } + + row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd); + + stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len); + + if (stmt != NULL) { + char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1]; + + if (stmt_len > TRX_I_S_TRX_QUERY_MAX_LEN) { + stmt_len = TRX_I_S_TRX_QUERY_MAX_LEN; + } + + memcpy(query, stmt, stmt_len); + query[stmt_len] = '\0'; + + row->trx_query = static_cast<const char*>( + ha_storage_put_memlim( + cache->storage, query, stmt_len + 1, + MAX_ALLOWED_FOR_STORAGE(cache))); + + row->trx_query_cs = innobase_get_charset(trx->mysql_thd); + + if (row->trx_query == NULL) { + + return(FALSE); + } + } else { + + row->trx_query = NULL; + } + +thd_done: + s = trx->op_info; + + if (s != NULL && s[0] != '\0') { + + TRX_I_S_STRING_COPY(s, row->trx_operation_state, + TRX_I_S_TRX_OP_STATE_MAX_LEN, cache); + + if (row->trx_operation_state == NULL) { + + return(FALSE); + } + } else { + + row->trx_operation_state = NULL; + } + + row->trx_tables_in_use = trx->n_mysql_tables_in_use; + + row->trx_tables_locked = trx->mysql_n_tables_locked; + + /* These are protected by both trx->mutex or lock_sys->mutex, + or just lock_sys->mutex. For reading, it suffices to hold + lock_sys->mutex. 
*/ + + row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks); + + row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap); + + row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock); + + row->trx_rows_modified = trx->undo_no; + + row->trx_concurrency_tickets = trx->n_tickets_to_enter_innodb; + + switch (trx->isolation_level) { + case TRX_ISO_READ_UNCOMMITTED: + row->trx_isolation_level = "READ UNCOMMITTED"; + break; + case TRX_ISO_READ_COMMITTED: + row->trx_isolation_level = "READ COMMITTED"; + break; + case TRX_ISO_REPEATABLE_READ: + row->trx_isolation_level = "REPEATABLE READ"; + break; + case TRX_ISO_SERIALIZABLE: + row->trx_isolation_level = "SERIALIZABLE"; + break; + /* Should not happen as TRX_ISO_READ_COMMITTED is default */ + default: + row->trx_isolation_level = "UNKNOWN"; + } + + row->trx_unique_checks = (ibool) trx->check_unique_secondary; + + row->trx_foreign_key_checks = (ibool) trx->check_foreigns; + + s = trx->detailed_error; + + if (s != NULL && s[0] != '\0') { + + TRX_I_S_STRING_COPY(s, + row->trx_foreign_key_error, + TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache); + + if (row->trx_foreign_key_error == NULL) { + + return(FALSE); + } + } else { + row->trx_foreign_key_error = NULL; + } + + row->trx_has_search_latch = (ibool) trx->has_search_latch; + + row->trx_search_latch_timeout = trx->search_latch_timeout; + + row->trx_is_read_only = trx->read_only; + + row->trx_is_autocommit_non_locking = trx_is_autocommit_non_locking(trx); + + return(TRUE); +} + +/*******************************************************************//** +Format the nth field of "rec" and put it in "buf". The result is always +NUL-terminated. Returns the number of bytes that were written to "buf" +(including the terminating NUL). +@return end of the result */ +static +ulint +put_nth_field( +/*==========*/ + char* buf, /*!< out: buffer */ + ulint buf_size,/*!< in: buffer size in bytes */ + ulint n, /*!< in: number of field */ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record */ + const ulint* offsets)/*!< in: record offsets, returned + by rec_get_offsets() */ +{ + const byte* data; + ulint data_len; + dict_field_t* dict_field; + ulint ret; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (buf_size == 0) { + + return(0); + } + + ret = 0; + + if (n > 0) { + /* we must append ", " before the actual data */ + + if (buf_size < 3) { + + buf[0] = '\0'; + return(1); + } + + memcpy(buf, ", ", 3); + + buf += 2; + buf_size -= 2; + ret += 2; + } + + /* now buf_size >= 1 */ + + data = rec_get_nth_field(rec, offsets, n, &data_len); + + dict_field = dict_index_get_nth_field(index, n); + + ret += row_raw_format((const char*) data, data_len, + dict_field, buf, buf_size); + + return(ret); +} + +/*******************************************************************//** +Fills the "lock_data" member of i_s_locks_row_t object. +If memory can not be allocated then FALSE is returned. 
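
The query-copy step in fill_trx_row() above truncates the statement to a fixed maximum and always NUL-terminates it before interning it in the cache storage. The same step stand-alone, with an assumed (unrealistically small) maximum in place of TRX_I_S_TRX_QUERY_MAX_LEN:

    #include <stdio.h>
    #include <string.h>

    #define QUERY_MAX_LEN_SKETCH 16 /* stands in for TRX_I_S_TRX_QUERY_MAX_LEN */

    int
    main(void)
    {
            const char* stmt = "SELECT * FROM a_rather_long_table_name";
            size_t stmt_len = strlen(stmt);
            char query[QUERY_MAX_LEN_SKETCH + 1];

            if (stmt_len > QUERY_MAX_LEN_SKETCH) {
                    stmt_len = QUERY_MAX_LEN_SKETCH;
            }
            memcpy(query, stmt, stmt_len);
            query[stmt_len] = '\0';

            printf("stored: \"%s\"\n", query); /* stored: "SELECT * FROM a_" */
            return(0);
    }
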
+@return FALSE if allocation fails */ +static +ibool +fill_lock_data( +/*===========*/ + const char** lock_data,/*!< out: "lock_data" to fill */ + const lock_t* lock, /*!< in: lock used to find the data */ + ulint heap_no,/*!< in: rec num used to find the data */ + trx_i_s_cache_t* cache) /*!< in/out: cache where to store + volatile data */ +{ + mtr_t mtr; + + const buf_block_t* block; + const page_t* page; + const rec_t* rec; + + ut_a(lock_get_type(lock) == LOCK_REC); + + mtr_start(&mtr); + + block = buf_page_try_get(lock_rec_get_space_id(lock), + lock_rec_get_page_no(lock), + &mtr); + + if (block == NULL) { + + *lock_data = NULL; + + mtr_commit(&mtr); + + return(TRUE); + } + + page = (const page_t*) buf_block_get_frame(block); + + rec = page_find_rec_with_heap_no(page, heap_no); + + if (page_rec_is_infimum(rec)) { + + *lock_data = ha_storage_put_str_memlim( + cache->storage, "infimum pseudo-record", + MAX_ALLOWED_FOR_STORAGE(cache)); + } else if (page_rec_is_supremum(rec)) { + + *lock_data = ha_storage_put_str_memlim( + cache->storage, "supremum pseudo-record", + MAX_ALLOWED_FOR_STORAGE(cache)); + } else { + + const dict_index_t* index; + ulint n_fields; + mem_heap_t* heap; + ulint offsets_onstack[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + char buf[TRX_I_S_LOCK_DATA_MAX_LEN]; + ulint buf_used; + ulint i; + + rec_offs_init(offsets_onstack); + offsets = offsets_onstack; + + index = lock_rec_get_index(lock); + + n_fields = dict_index_get_n_unique(index); + + ut_a(n_fields > 0); + + heap = NULL; + offsets = rec_get_offsets(rec, index, offsets, n_fields, + &heap); + + /* format and store the data */ + + buf_used = 0; + for (i = 0; i < n_fields; i++) { + + buf_used += put_nth_field( + buf + buf_used, sizeof(buf) - buf_used, + i, index, rec, offsets) - 1; + } + + *lock_data = (const char*) ha_storage_put_memlim( + cache->storage, buf, buf_used + 1, + MAX_ALLOWED_FOR_STORAGE(cache)); + + if (UNIV_UNLIKELY(heap != NULL)) { + + /* this means that rec_get_offsets() has created a new + heap and has stored offsets in it; check that this is + really the case and free the heap */ + ut_a(offsets != offsets_onstack); + mem_heap_free(heap); + } + } + + mtr_commit(&mtr); + + if (*lock_data == NULL) { + + return(FALSE); + } + + return(TRUE); +} + +/*******************************************************************//** +Fills i_s_locks_row_t object. Returns its first argument. +If memory can not be allocated then FALSE is returned. 
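
put_nth_field() above returns the byte count including the terminating NUL, and the accumulation loop in fill_lock_data() advances by (ret - 1), so each subsequent piece overwrites the previous NUL. The idiom stand-alone; put_piece() is a hypothetical stand-in for put_nth_field()/row_raw_format() and assumes the pieces fit:

    #include <stdio.h>
    #include <string.h>

    /* returns bytes written including the NUL, like put_nth_field() */
    static unsigned long
    put_piece(char* buf, unsigned long buf_size, int n, const char* data)
    {
            unsigned long ret = 0;

            if (buf_size == 0) return(0);
            if (n > 0) {
                    if (buf_size < 3) { buf[0] = '\0'; return(1); }
                    memcpy(buf, ", ", 3);      /* ", " separator, like n > 0 above */
                    buf += 2; buf_size -= 2; ret += 2;
            }
            ret += (unsigned long) snprintf(buf, buf_size, "%s", data) + 1;
            return(ret);
    }

    int
    main(void)
    {
            char buf[64];
            const char* fields[] = {"1", "abc", "42"};
            unsigned long used = 0;
            int i;

            for (i = 0; i < 3; i++) {
                    /* -1: the next piece starts on top of the old NUL */
                    used += put_piece(buf + used, sizeof(buf) - used,
                                      i, fields[i]) - 1;
            }
            printf("%s\n", buf); /* prints: 1, abc, 42 */
            return(0);
    }
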
+@return FALSE if allocation fails */ +static +ibool +fill_locks_row( +/*===========*/ + i_s_locks_row_t* row, /*!< out: result object that's filled */ + const lock_t* lock, /*!< in: lock to get data from */ + ulint heap_no,/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ + trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy + volatile strings */ +{ + row->lock_trx_id = lock_get_trx_id(lock); + row->lock_mode = lock_get_mode_str(lock); + row->lock_type = lock_get_type_str(lock); + + row->lock_table = ha_storage_put_str_memlim( + cache->storage, lock_get_table_name(lock), + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_table == NULL) { + + return(FALSE); + } + + switch (lock_get_type(lock)) { + case LOCK_REC: + row->lock_index = ha_storage_put_str_memlim( + cache->storage, lock_rec_get_index_name(lock), + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_index == NULL) { + + return(FALSE); + } + + row->lock_space = lock_rec_get_space_id(lock); + row->lock_page = lock_rec_get_page_no(lock); + row->lock_rec = heap_no; + + if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) { + + /* memory could not be allocated */ + return(FALSE); + } + + break; + case LOCK_TABLE: + row->lock_index = NULL; + + row->lock_space = ULINT_UNDEFINED; + row->lock_page = ULINT_UNDEFINED; + row->lock_rec = ULINT_UNDEFINED; + + row->lock_data = NULL; + + break; + default: + ut_error; + } + + row->lock_table_id = lock_get_table_id(lock); + + row->hash_chain.value = row; + ut_ad(i_s_locks_row_validate(row)); + + return(TRUE); +} + +/*******************************************************************//** +Fills i_s_lock_waits_row_t object. Returns its first argument. +@return result object that's filled */ +static +i_s_lock_waits_row_t* +fill_lock_waits_row( +/*================*/ + i_s_lock_waits_row_t* row, /*!< out: result object + that's filled */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + ut_ad(i_s_locks_row_validate(requested_lock_row)); + ut_ad(i_s_locks_row_validate(blocking_lock_row)); + + row->requested_lock_row = requested_lock_row; + row->blocking_lock_row = blocking_lock_row; + + return(row); +} + +/*******************************************************************//** +Calculates a hash fold for a lock. For a record lock the fold is +calculated from 4 elements, which uniquely identify a lock at a given +point in time: transaction id, space id, page number, record number. +For a table lock the fold is table's id. +@return fold */ +static +ulint +fold_lock( +/*======*/ + const lock_t* lock, /*!< in: lock object to fold */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ +#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT + static ulint fold = 0; + + return(fold++); +#else + ulint ret; + + switch (lock_get_type(lock)) { + case LOCK_REC: + ut_a(heap_no != ULINT_UNDEFINED); + + ret = ut_fold_ulint_pair((ulint) lock_get_trx_id(lock), + lock_rec_get_space_id(lock)); + + ret = ut_fold_ulint_pair(ret, + lock_rec_get_page_no(lock)); + + ret = ut_fold_ulint_pair(ret, heap_no); + + break; + case LOCK_TABLE: + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. 
*/ + ut_a(heap_no == ULINT_UNDEFINED); + + ret = (ulint) lock_get_table_id(lock); + + break; + default: + ut_error; + } + + return(ret); +#endif +} + +/*******************************************************************//** +Checks whether i_s_locks_row_t object represents a lock_t object. +@return TRUE if they match */ +static +ibool +locks_row_eq_lock( +/*==============*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + const lock_t* lock, /*!< in: lock object */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + ut_ad(i_s_locks_row_validate(row)); +#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T + return(0); +#else + switch (lock_get_type(lock)) { + case LOCK_REC: + ut_a(heap_no != ULINT_UNDEFINED); + + return(row->lock_trx_id == lock_get_trx_id(lock) + && row->lock_space == lock_rec_get_space_id(lock) + && row->lock_page == lock_rec_get_page_no(lock) + && row->lock_rec == heap_no); + + case LOCK_TABLE: + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. */ + ut_a(heap_no == ULINT_UNDEFINED); + + return(row->lock_trx_id == lock_get_trx_id(lock) + && row->lock_table_id == lock_get_table_id(lock)); + + default: + ut_error; + return(FALSE); + } +#endif +} + +/*******************************************************************//** +Searches for a row in the innodb_locks cache that has a specified id. +This happens in O(1) time since a hash table is used. Returns pointer to +the row or NULL if none is found. +@return row or NULL */ +static +i_s_locks_row_t* +search_innodb_locks( +/*================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + const lock_t* lock, /*!< in: lock to search for */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + i_s_hash_chain_t* hash_chain; + + HASH_SEARCH( + /* hash_chain->"next" */ + next, + /* the hash table */ + cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* the type of the next variable */ + i_s_hash_chain_t*, + /* auxiliary variable */ + hash_chain, + /* assertion on every traversed item */ + ut_ad(i_s_locks_row_validate(hash_chain->value)), + /* this determines if we have found the lock */ + locks_row_eq_lock(hash_chain->value, lock, heap_no)); + + if (hash_chain == NULL) { + + return(NULL); + } + /* else */ + + return(hash_chain->value); +} + +/*******************************************************************//** +Adds new element to the locks cache, enlarging it if necessary. +Returns a pointer to the added row. If the row is already present then +no row is added and a pointer to the existing row is returned. +If row can not be allocated then NULL is returned. 
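
The record-lock fold above chains the four identifying values through ut_fold_ulint_pair(); together with locks_row_eq_lock() this is what gives search_innodb_locks() its O(1) duplicate check. The shape of the composition, with a hypothetical mixer in place of ut_fold_ulint_pair():

    #include <stdio.h>

    static unsigned long
    fold_pair_sketch(unsigned long a, unsigned long b)
    {
            return((a * 0x9E3779B1UL) ^ b); /* placeholder mixer, not InnoDB's */
    }

    int
    main(void)
    {
            unsigned long trx_id = 1501, space = 0, page = 3, heap_no = 2;
            unsigned long fold;

            /* record lock: chain trx id, space, page, record number */
            fold = fold_pair_sketch(trx_id, space);
            fold = fold_pair_sketch(fold, page);
            fold = fold_pair_sketch(fold, heap_no);

            printf("record-lock fold: %lu\n", fold);
            return(0);
    }
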
+@return row */ +static +i_s_locks_row_t* +add_lock_to_cache( +/*==============*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const lock_t* lock, /*!< in: the element to add */ + ulint heap_no)/*!< in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + i_s_locks_row_t* dst_row; + +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + ulint i; + for (i = 0; i < 10000; i++) { +#endif +#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS + /* quit if this lock is already present */ + dst_row = search_innodb_locks(cache, lock, heap_no); + if (dst_row != NULL) { + + ut_ad(i_s_locks_row_validate(dst_row)); + return(dst_row); + } +#endif + + dst_row = (i_s_locks_row_t*) + table_cache_create_empty_row(&cache->innodb_locks, cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(NULL); + } + + if (!fill_locks_row(dst_row, lock, heap_no, cache)) { + + /* memory could not be allocated */ + cache->innodb_locks.rows_used--; + return(NULL); + } + +#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE + HASH_INSERT( + /* the type used in the hash chain */ + i_s_hash_chain_t, + /* hash_chain->"next" */ + next, + /* the hash table */ + cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* add this data to the hash */ + &dst_row->hash_chain); +#endif +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + } /* for()-loop */ +#endif + + ut_ad(i_s_locks_row_validate(dst_row)); + return(dst_row); +} + +/*******************************************************************//** +Adds new pair of locks to the lock waits cache. +If memory can not be allocated then FALSE is returned. +@return FALSE if allocation fails */ +static +ibool +add_lock_wait_to_cache( +/*===================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + i_s_lock_waits_row_t* dst_row; + + dst_row = (i_s_lock_waits_row_t*) + table_cache_create_empty_row(&cache->innodb_lock_waits, + cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(FALSE); + } + + fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row); + + return(TRUE); +} + +/*******************************************************************//** +Adds transaction's relevant (important) locks to cache. +If the transaction is waiting, then the wait lock is added to +innodb_locks and a pointer to the added row is returned in +requested_lock_row, otherwise requested_lock_row is set to NULL. +If rows can not be allocated then FALSE is returned and the value of +requested_lock_row is undefined. +@return FALSE if allocation fails */ +static +ibool +add_trx_relevant_locks_to_cache( +/*============================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const trx_t* trx, /*!< in: transaction */ + i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the + requested lock row, or NULL or + undefined */ +{ + ut_ad(lock_mutex_own()); + + /* If transaction is waiting we add the wait lock and all locks + from another transactions that are blocking the wait lock. 
*/ + if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + const lock_t* curr_lock; + ulint wait_lock_heap_no; + i_s_locks_row_t* blocking_lock_row; + lock_queue_iterator_t iter; + + ut_a(trx->lock.wait_lock != NULL); + + wait_lock_heap_no + = wait_lock_get_heap_no(trx->lock.wait_lock); + + /* add the requested lock */ + *requested_lock_row + = add_lock_to_cache(cache, trx->lock.wait_lock, + wait_lock_heap_no); + + /* memory could not be allocated */ + if (*requested_lock_row == NULL) { + + return(FALSE); + } + + /* then iterate over the locks before the wait lock and + add the ones that are blocking it */ + + lock_queue_iterator_reset(&iter, trx->lock.wait_lock, + ULINT_UNDEFINED); + + for (curr_lock = lock_queue_iterator_get_prev(&iter); + curr_lock != NULL; + curr_lock = lock_queue_iterator_get_prev(&iter)) { + + if (lock_has_to_wait(trx->lock.wait_lock, + curr_lock)) { + + /* add the lock that is + blocking trx->lock.wait_lock */ + blocking_lock_row + = add_lock_to_cache( + cache, curr_lock, + /* heap_no is the same + for the wait and waited + locks */ + wait_lock_heap_no); + + /* memory could not be allocated */ + if (blocking_lock_row == NULL) { + + return(FALSE); + } + + /* add the relation between both locks + to innodb_lock_waits */ + if (!add_lock_wait_to_cache( + cache, *requested_lock_row, + blocking_lock_row)) { + + /* memory could not be allocated */ + return(FALSE); + } + } + } + } else { + + *requested_lock_row = NULL; + } + + return(TRUE); +} + +/** The minimum time that a cache must not be updated after it has been +read for the last time; measured in microseconds. We use this technique +to ensure that SELECTs which join several INFORMATION SCHEMA tables read +the same version of the cache. */ +#define CACHE_MIN_IDLE_TIME_US 100000 /* 0.1 sec */ + +/*******************************************************************//** +Checks if the cache can safely be updated. +@return TRUE if can be updated */ +static +ibool +can_cache_be_updated( +/*=================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + ullint now; + + /* Here we read cache->last_read without acquiring its mutex + because last_read is only updated when a shared rw lock on the + whole cache is being held (see trx_i_s_cache_end_read()) and + we are currently holding an exclusive rw lock on the cache. + So it is not possible for last_read to be updated while we are + reading it. */ + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + now = ut_time_us(NULL); + if (now - cache->last_read > CACHE_MIN_IDLE_TIME_US) { + + return(TRUE); + } + + return(FALSE); +} + +/*******************************************************************//** +Declare a cache empty, preparing it to be filled up. Not all resources +are freed because they can be reused. */ +static +void +trx_i_s_cache_clear( +/*================*/ + trx_i_s_cache_t* cache) /*!< out: cache to clear */ +{ + cache->innodb_trx.rows_used = 0; + cache->innodb_locks.rows_used = 0; + cache->innodb_lock_waits.rows_used = 0; + + hash_table_clear(cache->locks_hash); + + ha_storage_empty(&cache->storage); +} + +/*******************************************************************//** +Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the +table cache buffer. Cache must be locked for write. 
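
The loop in add_trx_relevant_locks_to_cache() above walks the wait lock's queue backwards and emits one (requested, blocking) pair per conflicting earlier lock, reusing the waiter's heap_no for both sides. The same pass over a toy queue, with a deliberately simplified stand-in for lock_has_to_wait():

    #include <stdio.h>

    struct lock_sketch { int trx; int exclusive; };

    int
    main(void)
    {
            /* queue order = grant order; the last entry is the waiter */
            struct lock_sketch queue[] = {{1, 0}, {2, 1}, {3, 0}};
            int waiter = 2; /* index of the waiting lock */
            int i;

            for (i = waiter - 1; i >= 0; i--) {
                    /* crude conflict test standing in for lock_has_to_wait() */
                    if (queue[i].exclusive || queue[waiter].exclusive) {
                            printf("trx %d waits for trx %d\n",
                                   queue[waiter].trx, queue[i].trx);
                    }
            }
            /* prints only: trx 3 waits for trx 2 */
            return(0);
    }
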
*/ +static +void +fetch_data_into_cache_low( +/*======================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + ibool only_ac_nl, /*!< in: only select non-locking + autocommit transactions */ + trx_list_t* trx_list) /*!< in: trx list */ +{ + const trx_t* trx; + + ut_ad(trx_list == &trx_sys->rw_trx_list + || trx_list == &trx_sys->ro_trx_list + || trx_list == &trx_sys->mysql_trx_list); + + ut_ad(only_ac_nl == (trx_list == &trx_sys->mysql_trx_list)); + + /* Iterate over the transaction list and add each one + to innodb_trx's cache. We also add all locks that are relevant + to each transaction into innodb_locks' and innodb_lock_waits' + caches. */ + + for (trx = UT_LIST_GET_FIRST(*trx_list); + trx != NULL; + trx = + (trx_list == &trx_sys->mysql_trx_list + ? UT_LIST_GET_NEXT(mysql_trx_list, trx) + : UT_LIST_GET_NEXT(trx_list, trx))) { + + i_s_trx_row_t* trx_row; + i_s_locks_row_t* requested_lock_row; + + if (trx->state == TRX_STATE_NOT_STARTED + || (only_ac_nl && !trx_is_autocommit_non_locking(trx))) { + + continue; + } + + assert_trx_nonlocking_or_in_list(trx); + + ut_ad(trx->in_ro_trx_list + == (trx_list == &trx_sys->ro_trx_list)); + + ut_ad(trx->in_rw_trx_list + == (trx_list == &trx_sys->rw_trx_list)); + + if (!add_trx_relevant_locks_to_cache(cache, trx, + &requested_lock_row)) { + + cache->is_truncated = TRUE; + return; + } + + trx_row = (i_s_trx_row_t*) + table_cache_create_empty_row(&cache->innodb_trx, + cache); + + /* memory could not be allocated */ + if (trx_row == NULL) { + + cache->is_truncated = TRUE; + return; + } + + if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) { + + /* memory could not be allocated */ + cache->innodb_trx.rows_used--; + cache->is_truncated = TRUE; + return; + } + } +} + +/*******************************************************************//** +Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the +table cache buffer. Cache must be locked for write. */ +static +void +fetch_data_into_cache( +/*==================*/ + trx_i_s_cache_t* cache) /*!< in/out: cache */ +{ + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + + trx_i_s_cache_clear(cache); + + fetch_data_into_cache_low(cache, FALSE, &trx_sys->rw_trx_list); + fetch_data_into_cache_low(cache, FALSE, &trx_sys->ro_trx_list); + + /* Only select autocommit non-locking selects because they can + only be on the MySQL transaction list (TRUE). */ + fetch_data_into_cache_low(cache, TRUE, &trx_sys->mysql_trx_list); + + cache->is_truncated = FALSE; +} + +/*******************************************************************//** +Update the transactions cache if it has not been read for some time. +Called from handler/i_s.cc. +@return 0 - fetched, 1 - not */ +UNIV_INTERN +int +trx_i_s_possibly_fetch_data_into_cache( +/*===================================*/ + trx_i_s_cache_t* cache) /*!< in/out: cache */ +{ + if (!can_cache_be_updated(cache)) { + + return(1); + } + + /* We need to read trx_sys and record/table lock queues */ + + lock_mutex_enter(); + + mutex_enter(&trx_sys->mutex); + + fetch_data_into_cache(cache); + + mutex_exit(&trx_sys->mutex); + + lock_mutex_exit(); + + return(0); +} + +/*******************************************************************//** +Returns TRUE if the data in the cache is truncated due to the memory +limit posed by TRX_I_S_MEM_LIMIT. 
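
trx_i_s_possibly_fetch_data_into_cache() above declines to rebuild unless can_cache_be_updated() sees at least CACHE_MIN_IDLE_TIME_US of idle time since the last read, so a SELECT joining several INFORMATION SCHEMA tables keeps reading one snapshot. The freshness test in isolation:

    #include <stdio.h>

    #define MIN_IDLE_US 100000ULL /* 0.1 s, as in CACHE_MIN_IDLE_TIME_US */

    static int
    can_update(unsigned long long now_us, unsigned long long last_read_us)
    {
            return(now_us - last_read_us > MIN_IDLE_US);
    }

    int
    main(void)
    {
            printf("%d\n", can_update(1000000, 950000)); /* 0: read too recently */
            printf("%d\n", can_update(1200000, 950000)); /* 1: stale, may rebuild */
            return(0);
    }
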
+@return TRUE if truncated */ +UNIV_INTERN +ibool +trx_i_s_cache_is_truncated( +/*=======================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + return(cache->is_truncated); +} + +/*******************************************************************//** +Initialize INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_init( +/*===============*/ + trx_i_s_cache_t* cache) /*!< out: cache to init */ +{ + /* The latching is done in the following order: + acquire trx_i_s_cache_t::rw_lock, X + acquire lock mutex + release lock mutex + release trx_i_s_cache_t::rw_lock + acquire trx_i_s_cache_t::rw_lock, S + acquire trx_i_s_cache_t::last_read_mutex + release trx_i_s_cache_t::last_read_mutex + release trx_i_s_cache_t::rw_lock */ + + rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock, + SYNC_TRX_I_S_RWLOCK); + + cache->last_read = 0; + + mutex_create(cache_last_read_mutex_key, + &cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ); + + table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t)); + table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t)); + table_cache_init(&cache->innodb_lock_waits, + sizeof(i_s_lock_waits_row_t)); + + cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM); + + cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE, + CACHE_STORAGE_HASH_CELLS); + + cache->mem_allocd = 0; + + cache->is_truncated = FALSE; +} + +/*******************************************************************//** +Free the INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_free( +/*===============*/ + trx_i_s_cache_t* cache) /*!< in, own: cache to free */ +{ + hash_table_free(cache->locks_hash); + ha_storage_free(cache->storage); + table_cache_free(&cache->innodb_trx); + table_cache_free(&cache->innodb_locks); + table_cache_free(&cache->innodb_lock_waits); + memset(cache, 0, sizeof *cache); +} + +/*******************************************************************//** +Issue a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_read( +/*=====================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + rw_lock_s_lock(&cache->rw_lock); +} + +/*******************************************************************//** +Release a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_read( +/*===================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + ullint now; + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED)); +#endif + + /* update cache last read time */ + now = ut_time_us(NULL); + mutex_enter(&cache->last_read_mutex); + cache->last_read = now; + mutex_exit(&cache->last_read_mutex); + + rw_lock_s_unlock(&cache->rw_lock); +} + +/*******************************************************************//** +Issue an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_write( +/*======================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + rw_lock_x_lock(&cache->rw_lock); +} + +/*******************************************************************//** +Release an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_write( +/*====================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + rw_lock_x_unlock(&cache->rw_lock); +} + +/*******************************************************************//** +Selects a INFORMATION SCHEMA table cache from the whole cache. 
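
Assumed shape of the call sequence for the latch functions above (the real caller lives in handler/i_s.cc; the stubs below replace the cache and latches only so the sketch is self-contained and compilable):

    #include <stdio.h>

    struct cache_stub { int fresh; int rows; };

    static void start_write(struct cache_stub* c) { (void) c; } /* X latch */
    static void end_write(struct cache_stub* c)   { (void) c; }
    static void start_read(struct cache_stub* c)  { (void) c; } /* S latch */
    static void end_read(struct cache_stub* c)    { (void) c; }

    /* 0 - fetched, 1 - not, mirroring the real return convention */
    static int
    possibly_fetch(struct cache_stub* c)
    {
            if (c->fresh) return(1); /* read too recently: keep as is */
            c->rows = 42;            /* rebuild from the trx/lock lists */
            return(0);
    }

    int
    main(void)
    {
            struct cache_stub cache = {0, 0};

            start_write(&cache);     /* refresh under the exclusive latch */
            possibly_fetch(&cache);
            end_write(&cache);

            start_read(&cache);      /* then copy rows out under the shared latch */
            printf("rows visible: %d\n", cache.rows);
            end_read(&cache);
            return(0);
    }
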
+@return table cache */ +static +i_s_table_cache_t* +cache_select_table( +/*===============*/ + trx_i_s_cache_t* cache, /*!< in: whole cache */ + enum i_s_table table) /*!< in: which table */ +{ + i_s_table_cache_t* table_cache; + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED) + || rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + switch (table) { + case I_S_INNODB_TRX: + table_cache = &cache->innodb_trx; + break; + case I_S_INNODB_LOCKS: + table_cache = &cache->innodb_locks; + break; + case I_S_INNODB_LOCK_WAITS: + table_cache = &cache->innodb_lock_waits; + break; + default: + ut_error; + } + + return(table_cache); +} + +/*******************************************************************//** +Retrieves the number of used rows in the cache for a given +INFORMATION SCHEMA table. +@return number of rows */ +UNIV_INTERN +ulint +trx_i_s_cache_get_rows_used( +/*========================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table) /*!< in: which table */ +{ + i_s_table_cache_t* table_cache; + + table_cache = cache_select_table(cache, table); + + return(table_cache->rows_used); +} + +/*******************************************************************//** +Retrieves the nth row (zero-based) in the cache for a given +INFORMATION SCHEMA table. +@return row */ +UNIV_INTERN +void* +trx_i_s_cache_get_nth_row( +/*======================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table, /*!< in: which table */ + ulint n) /*!< in: row number */ +{ + i_s_table_cache_t* table_cache; + ulint i; + void* row; + + table_cache = cache_select_table(cache, table); + + ut_a(n < table_cache->rows_used); + + row = NULL; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd > n) { + + row = (char*) table_cache->chunks[i].base + + (n - table_cache->chunks[i].offset) + * table_cache->row_size; + break; + } + } + + ut_a(row != NULL); + + return(row); +} + +/*******************************************************************//** +Crafts a lock id string from a i_s_locks_row_t object. Returns its +second argument. This function aborts if there is not enough space in +lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you +want to be 100% sure that it will not abort. 
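
The chunk arithmetic of trx_i_s_cache_get_nth_row() above, stand-alone: chunks are logically concatenated, so row n lives in the first chunk whose offset + rows_allocd exceeds n (chunk sizes follow the growth sequence sketched earlier):

    #include <stdio.h>

    struct chunk_sketch { unsigned long offset, rows_allocd; };

    int
    main(void)
    {
            struct chunk_sketch chunks[] = {
                    {0, 1024}, {1024, 512}, {1536, 768}
            };
            unsigned long n = 1300; /* flat row index we want */
            int i;

            for (i = 0; i < 3; i++) {
                    if (chunks[i].offset + chunks[i].rows_allocd > n) {
                            printf("row %lu -> chunk %d, slot %lu\n",
                                   n, i, n - chunks[i].offset);
                            break;
                    }
            }
            /* prints: row 1300 -> chunk 1, slot 276 */
            return(0);
    }
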
+@return resulting lock id */ +UNIV_INTERN +char* +trx_i_s_create_lock_id( +/*===================*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + char* lock_id,/*!< out: resulting lock_id */ + ulint lock_id_size)/*!< in: size of the lock id + buffer */ +{ + int res_len; + + /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */ + + if (row->lock_space != ULINT_UNDEFINED) { + /* record lock */ + res_len = ut_snprintf(lock_id, lock_id_size, + TRX_ID_FMT ":%lu:%lu:%lu", + row->lock_trx_id, row->lock_space, + row->lock_page, row->lock_rec); + } else { + /* table lock */ + res_len = ut_snprintf(lock_id, lock_id_size, + TRX_ID_FMT":" UINT64PF, + row->lock_trx_id, + row->lock_table_id); + } + + /* the typecast is safe because snprintf(3) never returns + negative result */ + ut_a(res_len >= 0); + ut_a((ulint) res_len < lock_id_size); + + return(lock_id); +} + +UNIV_INTERN +void +trx_i_s_get_lock_sys_memory_usage(ulint *constant, ulint *variable) +{ + trx_t* trx; + + *constant = lock_sys->rec_hash->n_cells * sizeof(hash_cell_t); + *variable = 0; + + if (trx_sys) { + mutex_enter(&trx_sys->mutex); + trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); + while (trx) { + *variable += ((trx->lock.lock_heap) ? mem_heap_get_size(trx->lock.lock_heap) : 0); + trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); + } + mutex_exit(&trx_sys->mutex); + } + +} diff --git a/storage/xtradb/trx/trx0purge.cc b/storage/xtradb/trx/trx0purge.cc new file mode 100644 index 00000000000..9d9fe73de6e --- /dev/null +++ b/storage/xtradb/trx/trx0purge.cc @@ -0,0 +1,1410 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0purge.cc +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0purge.h" + +#ifdef UNIV_NONINL +#include "trx0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "read0read.h" +#include "fut0fut.h" +#include "que0que.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "os0thread.h" +#include "srv0mon.h" +#include "mtr0log.h" + +/** Maximum allowable purge history length. <=0 means 'infinite'. */ +UNIV_INTERN ulong srv_max_purge_lag = 0; + +/** Max DML user threads delay in micro-seconds. 
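
The two lock id shapes produced by trx_i_s_create_lock_id() above, with made-up numbers and plain snprintf() standing in for ut_snprintf(); the buffer size is assumed to satisfy TRX_I_S_LOCK_ID_MAX_LEN + 1:

    #include <stdio.h>

    int
    main(void)
    {
            char lock_id[81]; /* assumed >= TRX_I_S_LOCK_ID_MAX_LEN + 1 */

            /* record lock: trx id, space, page, heap number */
            snprintf(lock_id, sizeof(lock_id), "%lu:%lu:%lu:%lu",
                     1501UL, 0UL, 3UL, 2UL);
            printf("%s\n", lock_id); /* 1501:0:3:2 */

            /* table lock: trx id, table id */
            snprintf(lock_id, sizeof(lock_id), "%lu:%llu",
                     1501UL, 1064ULL);
            printf("%s\n", lock_id); /* 1501:1064 */

            return(0);
    }
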
*/ +UNIV_INTERN ulong srv_max_purge_lag_delay = 0; + +/** The global data structure coordinating a purge */ +UNIV_INTERN trx_purge_t* purge_sys = NULL; + +/** A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +UNIV_INTERN trx_undo_rec_t trx_purge_dummy_rec; + +#ifdef UNIV_PFS_RWLOCK +/* Key to register trx_purge_latch with performance schema */ +UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +/* Key to register purge_sys_bh_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_DEBUG +UNIV_INTERN my_bool srv_purge_view_update_only_debug; +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Builds a purge 'query' graph. The actual purge is performed by executing +this query graph. +@return own: the query graph */ +static +que_t* +trx_purge_graph_build( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + ulint n_purge_threads) /*!< in: number of purge + threads */ +{ + ulint i; + mem_heap_t* heap; + que_fork_t* fork; + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap); + fork->trx = trx; + + for (i = 0; i < n_purge_threads; ++i) { + que_thr_t* thr; + + thr = que_thr_create(fork, heap); + + thr->child = row_purge_node_create(thr, heap); + } + + return(fork); +} + +/********************************************************************//** +Creates the global purge system control structure and inits the history +mutex. */ +UNIV_INTERN +void +trx_purge_sys_create( +/*=================*/ + ulint n_purge_threads, /*!< in: number of purge + threads */ + ib_bh_t* ib_bh) /*!< in, own: UNDO log min + binary heap */ +{ + purge_sys = static_cast<trx_purge_t*>(mem_zalloc(sizeof(*purge_sys))); + + purge_sys->state = PURGE_STATE_INIT; + purge_sys->event = os_event_create(); + + /* Take ownership of ib_bh, we are responsible for freeing it. */ + purge_sys->ib_bh = ib_bh; + + rw_lock_create(trx_purge_latch_key, + &purge_sys->latch, SYNC_PURGE_LATCH); + + mutex_create( + purge_sys_bh_mutex_key, &purge_sys->bh_mutex, + SYNC_PURGE_QUEUE); + + purge_sys->heap = mem_heap_create(256); + + ut_a(n_purge_threads > 0); + + purge_sys->sess = sess_open(); + + purge_sys->trx = purge_sys->sess->trx; + + ut_a(purge_sys->trx->sess == purge_sys->sess); + + /* A purge transaction is not a real transaction, we use a transaction + here only because the query threads code requires it. It is otherwise + quite unnecessary. We should get rid of it eventually. */ + purge_sys->trx->id = 0; + purge_sys->trx->start_time = ut_time(); + purge_sys->trx->state = TRX_STATE_ACTIVE; + purge_sys->trx->op_info = "purge trx"; + + purge_sys->query = trx_purge_graph_build( + purge_sys->trx, n_purge_threads); + + purge_sys->view = read_view_purge_open(purge_sys->prebuilt_clone, + purge_sys->prebuilt_view); +} + +/************************************************************************ +Frees the global purge system control structure. 
*/ +UNIV_INTERN +void +trx_purge_sys_close(void) +/*======================*/ +{ + que_graph_free(purge_sys->query); + + ut_a(purge_sys->trx->id == 0); + ut_a(purge_sys->sess->trx == purge_sys->trx); + + purge_sys->trx->state = TRX_STATE_NOT_STARTED; + + sess_close(purge_sys->sess); + + purge_sys->sess = NULL; + + read_view_free(purge_sys->prebuilt_view); + read_view_free(purge_sys->prebuilt_clone); + + purge_sys->view = NULL; + + rw_lock_free(&purge_sys->latch); + mutex_free(&purge_sys->bh_mutex); + + mem_heap_free(purge_sys->heap); + + ib_bh_free(purge_sys->ib_bh); + + os_event_free(purge_sys->event); + + purge_sys->event = NULL; + + mem_free(purge_sys); + + purge_sys = NULL; +} + +/*================ UNDO LOG HISTORY LIST =============================*/ + +/********************************************************************//** +Adds the update undo log as the first log in the history list. Removes the +update undo log segment from the rseg slot if it is too big for reuse. */ +UNIV_INTERN +void +trx_purge_add_update_undo_to_history( +/*=================================*/ + trx_t* trx, /*!< in: transaction */ + page_t* undo_page, /*!< in: update undo log header page, + x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_undo_t* undo; + trx_rseg_t* rseg; + trx_rsegf_t* rseg_header; + trx_ulogf_t* undo_header; + + undo = trx->update_undo; + rseg = undo->rseg; + + rseg_header = trx_rsegf_get( + undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no, + mtr); + + undo_header = undo_page + undo->hdr_offset; + + if (undo->state != TRX_UNDO_CACHED) { + ulint hist_size; +#ifdef UNIV_DEBUG + trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR; +#endif /* UNIV_DEBUG */ + + /* The undo log segment will not be reused */ + + if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + ut_error; + } + + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED); + + hist_size = mtr_read_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr); + + ut_ad(undo->size == flst_get_len( + seg_header + TRX_UNDO_PAGE_LIST, mtr)); + + mlog_write_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size + undo->size, MLOG_4BYTES, mtr); + } + + /* Add the log as the first in the history list */ + flst_add_first(rseg_header + TRX_RSEG_HISTORY, + undo_header + TRX_UNDO_HISTORY_NODE, mtr); + +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_increment_ulint(&trx_sys->rseg_history_len, 1); +#else + mutex_enter(&trx_sys->mutex); + ++trx_sys->rseg_history_len; + mutex_exit(&trx_sys->mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + srv_wake_purge_thread_if_not_active(); + + /* Write the trx number to the undo log header */ + mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr); + + /* Write information about delete markings to the undo log header */ + + if (!undo->del_marks) { + mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, mtr); + } + + if (rseg->last_page_no == FIL_NULL) { + rseg->last_page_no = undo->hdr_page_no; + rseg->last_offset = undo->hdr_offset; + rseg->last_trx_no = trx->no; + rseg->last_del_marks = undo->del_marks; + } +} + +/**********************************************************************//** +Frees an undo log segment which is in the history list. Cuts the end of the +history list at the youngest undo log in this segment. 
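
The rseg_history_len update in trx_purge_add_update_undo_to_history() above uses GCC atomics when HAVE_ATOMIC_BUILTINS is set and falls back to trx_sys->mutex otherwise. The same pattern in minimal form (the macro name below is local to this sketch):

    #include <stdio.h>
    #include <pthread.h>

    static unsigned long rseg_history_len = 0;
    static pthread_mutex_t fallback_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void
    history_len_add(unsigned long n)
    {
    #ifdef HAVE_ATOMIC_BUILTINS_SKETCH
            __sync_add_and_fetch(&rseg_history_len, n); /* lock-free path */
    #else
            pthread_mutex_lock(&fallback_mutex);        /* mutex fallback */
            rseg_history_len += n;
            pthread_mutex_unlock(&fallback_mutex);
    #endif
    }

    int
    main(void)
    {
            history_len_add(1);
            printf("%lu\n", rseg_history_len);
            return(0);
    }
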
*/ +static +void +trx_purge_free_segment( +/*===================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + fil_addr_t hdr_addr, /*!< in: the file address of log_hdr */ + ulint n_removed_logs) /*!< in: count of how many undo logs we + will cut off from the end of the + history list */ +{ + mtr_t mtr; + trx_rsegf_t* rseg_hdr; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + ulint seg_size; + ulint hist_size; + ibool marked = FALSE; + + /* fputs("Freeing an update undo log segment\n", stderr); */ + + for (;;) { + page_t* undo_page; + + mtr_start(&mtr); + + mutex_enter(&rseg->mutex); + + rseg_hdr = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, &mtr); + + undo_page = trx_undo_page_get( + rseg->space, rseg->zip_size, hdr_addr.page, &mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + log_hdr = undo_page + hdr_addr.boffset; + + /* Mark the last undo log totally purged, so that if the + system crashes, the tail of the undo log will not get accessed + again. The list of pages in the undo log tail gets inconsistent + during the freeing of the segment, and therefore purge should + not try to access them again. */ + + if (!marked) { + mlog_write_ulint( + log_hdr + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, &mtr); + + marked = TRUE; + } + + if (fseg_free_step_not_header( + seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)) { + + break; + } + + mutex_exit(&rseg->mutex); + + mtr_commit(&mtr); + } + + /* The page list may now be inconsistent, but the length field + stored in the list base node tells us how big it was before we + started the freeing. */ + + seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr); + + /* We may free the undo log segment header page; it must be freed + within the same mtr as the undo log header is removed from the + history list: otherwise, in case of a database crash, the segment + could become inaccessible garbage in the file space. */ + + flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr); + +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_decrement_ulint(&trx_sys->rseg_history_len, n_removed_logs); +#else + mutex_enter(&trx_sys->mutex); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&trx_sys->mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + do { + + /* Here we assume that a file segment with just the header + page can be freed in a few steps, so that the buffer pool + is not flooded with bufferfixed pages: see the note in + fsp0fsp.cc. */ + + } while(!fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)); + + hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, &mtr); + ut_ad(hist_size >= seg_size); + + mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + hist_size - seg_size, MLOG_4BYTES, &mtr); + + ut_ad(rseg->curr_size >= seg_size); + + rseg->curr_size -= seg_size; + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); +} + +/********************************************************************//** +Removes unnecessary history data from a rollback segment. 
*/ +static +void +trx_purge_truncate_rseg_history( +/*============================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + const purge_iter_t* limit) /*!< in: truncate offset */ +{ + fil_addr_t hdr_addr; + fil_addr_t prev_hdr_addr; + trx_rsegf_t* rseg_hdr; + page_t* undo_page; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + ulint n_removed_logs = 0; + mtr_t mtr; + trx_id_t undo_trx_no; + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + hdr_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr)); +loop: + if (hdr_addr.page == FIL_NULL) { + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); + + return; + } + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + hdr_addr.page, &mtr); + + log_hdr = undo_page + hdr_addr.boffset; + + undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); + + if (undo_trx_no >= limit->trx_no) { + + if (undo_trx_no == limit->trx_no) { + + trx_undo_truncate_start( + rseg, rseg->space, hdr_addr.page, + hdr_addr.boffset, limit->undo_no); + } + +#ifdef HAVE_ATOMIC_BUILTINS + os_atomic_decrement_ulint( + &trx_sys->rseg_history_len, n_removed_logs); +#else + mutex_enter(&trx_sys->mutex); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&trx_sys->mutex); +#endif /* HAVE_ATOMIC_BUILTINS */ + + flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, + n_removed_logs, &mtr); + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return; + } + + prev_hdr_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); + n_removed_logs++; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE) + && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) { + + /* We can free the whole log segment */ + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + trx_purge_free_segment(rseg, hdr_addr, n_removed_logs); + + n_removed_logs = 0; + } else { + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + hdr_addr = prev_hdr_addr; + + goto loop; +} + +/********************************************************************//** +Removes unnecessary history data from rollback segments. NOTE that when this +function is called, the caller must not have any latches on undo log pages! */ +static +void +trx_purge_truncate_history( +/*========================*/ + purge_iter_t* limit, /*!< in: truncate limit */ + const read_view_t* view) /*!< in: purge view */ +{ + ulint i; + + /* We play safe and set the truncate limit at most to the purge view + low_limit number, though this is not necessary */ + + if (limit->trx_no >= view->low_limit_no) { + limit->trx_no = view->low_limit_no; + limit->undo_no = 0; + } + + ut_ad(limit->trx_no <= purge_sys->view->low_limit_no); + + for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_rseg_t* rseg = trx_sys->rseg_array[i]; + + if (rseg != NULL) { + ut_a(rseg->id == i); + trx_purge_truncate_rseg_history(rseg, limit); + } + } +} + +/***********************************************************************//** +Updates the last not yet purged history log info in rseg when we have purged +a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. 
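
trx_purge_truncate_rseg_history() above starts at the history list's tail (the oldest log), follows prev pointers, and stops at the first log whose trx_no the purge limit still protects. The walk over a toy list:

    #include <stdio.h>

    struct hist_log { unsigned long long trx_no; int prev; }; /* prev index, -1 = head */

    int
    main(void)
    {
            /* index 2 is the list tail = oldest log */
            struct hist_log hist[] = {
                    {900, -1}, {850, 0}, {800, 1}
            };
            unsigned long long limit_trx_no = 870; /* plays limit->trx_no */
            int i = 2;

            while (i != -1 && hist[i].trx_no < limit_trx_no) {
                    printf("remove log with trx_no %llu\n", hist[i].trx_no);
                    i = hist[i].prev;
            }
            /* removes 800 and 850, stops at 900 */
            return(0);
    }
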
*/ +static +void +trx_purge_rseg_get_next_history_log( +/*================================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ulint* n_pages_handled)/*!< in/out: number of UNDO pages + handled */ +{ + const void* ptr; + page_t* undo_page; + trx_ulogf_t* log_hdr; + fil_addr_t prev_log_addr; + trx_id_t trx_no; + ibool del_marks; + mtr_t mtr; + rseg_queue_t rseg_queue; + + mutex_enter(&(rseg->mutex)); + + ut_a(rseg->last_page_no != FIL_NULL); + + purge_sys->iter.trx_no = rseg->last_trx_no + 1; + purge_sys->iter.undo_no = 0; + purge_sys->next_stored = FALSE; + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched( + rseg->space, rseg->zip_size, rseg->last_page_no, &mtr); + + log_hdr = undo_page + rseg->last_offset; + + /* Increase the purge page count by one for every handled log */ + + (*n_pages_handled)++; + + prev_log_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); + + if (prev_log_addr.page == FIL_NULL) { + /* No logs left in the history list */ + + rseg->last_page_no = FIL_NULL; + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + mutex_enter(&trx_sys->mutex); + + /* Add debug code to track history list corruption reported + on the MySQL mailing list on Nov 9, 2004. The fut0lst.cc + file-based list was corrupt. The prev node pointer was + FIL_NULL, even though the list length was over 8 million nodes! + We assume that purge truncates the history list in large + size pieces, and if we here reach the head of the list, the + list cannot be longer than 2000 000 undo logs now. */ + + if (trx_sys->rseg_history_len > 2000000) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: purge reached the" + " head of the history list,\n" + "InnoDB: but its length is still" + " reported as %lu! Make a detailed bug\n" + "InnoDB: report, and submit it" + " to http://bugs.mysql.com\n", + (ulong) trx_sys->rseg_history_len); + ut_ad(0); + } + + mutex_exit(&trx_sys->mutex); + + return; + } + + mutex_exit(&rseg->mutex); + + mtr_commit(&mtr); + + /* Read the trx number and del marks from the previous log header */ + mtr_start(&mtr); + + log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, + prev_log_addr.page, &mtr) + + prev_log_addr.boffset; + + trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); + + del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS); + + mtr_commit(&mtr); + + mutex_enter(&(rseg->mutex)); + + rseg->last_page_no = prev_log_addr.page; + rseg->last_offset = prev_log_addr.boffset; + rseg->last_trx_no = trx_no; + rseg->last_del_marks = del_marks; + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = rseg->last_trx_no; + + /* Purge can also produce events, however these are already ordered + in the rollback segment and any user generated event will be greater + than the events that Purge produces. ie. Purge can never produce + events from an empty rollback segment. */ + + mutex_enter(&purge_sys->bh_mutex); + + ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); + ut_a(ptr != NULL); + + mutex_exit(&purge_sys->bh_mutex); + + mutex_exit(&rseg->mutex); +} + +/***********************************************************************//** +Chooses the rollback segment with the smallest trx_id. +@return zip_size if log is for a compressed table, ULINT_UNDEFINED if + no rollback segments to purge, 0 for non compressed tables. 
*/ +static +ulint +trx_purge_get_rseg_with_min_trx_id( +/*===============================*/ + trx_purge_t* purge_sys) /*!< in/out: purge instance */ + +{ + ulint zip_size = 0; + + mutex_enter(&purge_sys->bh_mutex); + + /* Only purge consumes events from the binary heap, user + threads only produce the events. */ + + if (!ib_bh_is_empty(purge_sys->ib_bh)) { + trx_rseg_t* rseg; + + rseg = ((rseg_queue_t*) ib_bh_first(purge_sys->ib_bh))->rseg; + ib_bh_pop(purge_sys->ib_bh); + + mutex_exit(&purge_sys->bh_mutex); + + purge_sys->rseg = rseg; + } else { + mutex_exit(&purge_sys->bh_mutex); + + purge_sys->rseg = NULL; + + return(ULINT_UNDEFINED); + } + + ut_a(purge_sys->rseg != NULL); + + mutex_enter(&purge_sys->rseg->mutex); + + ut_a(purge_sys->rseg->last_page_no != FIL_NULL); + + /* We assume in purge of externally stored fields that space id is + in the range of UNDO tablespace space ids */ + ut_a(purge_sys->rseg->space <= srv_undo_tablespaces_open); + + zip_size = purge_sys->rseg->zip_size; + + ut_a(purge_sys->iter.trx_no <= purge_sys->rseg->last_trx_no); + + purge_sys->iter.trx_no = purge_sys->rseg->last_trx_no; + purge_sys->hdr_offset = purge_sys->rseg->last_offset; + purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + + mutex_exit(&purge_sys->rseg->mutex); + + return(zip_size); +} + +/***********************************************************************//** +Position the purge sys "iterator" on the undo record to use for purging. */ +static +void +trx_purge_read_undo_rec( +/*====================*/ + trx_purge_t* purge_sys, /*!< in/out: purge instance */ + ulint zip_size) /*!< in: block size or 0 */ +{ + ulint offset; + ulint page_no; + ib_uint64_t undo_no; + + purge_sys->hdr_offset = purge_sys->rseg->last_offset; + page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no; + + if (purge_sys->rseg->last_del_marks) { + mtr_t mtr; + trx_undo_rec_t* undo_rec = NULL; + + mtr_start(&mtr); + + undo_rec = trx_undo_get_first_rec( + purge_sys->rseg->space, + zip_size, + purge_sys->hdr_page_no, + purge_sys->hdr_offset, RW_S_LATCH, &mtr); + + if (undo_rec != NULL) { + offset = page_offset(undo_rec); + undo_no = trx_undo_rec_get_undo_no(undo_rec); + page_no = page_get_page_no(page_align(undo_rec)); + } else { + offset = 0; + undo_no = 0; + } + + mtr_commit(&mtr); + } else { + offset = 0; + undo_no = 0; + } + + purge_sys->offset = offset; + purge_sys->page_no = page_no; + purge_sys->iter.undo_no = undo_no; + + purge_sys->next_stored = TRUE; +} + +/***********************************************************************//** +Chooses the next undo log to purge and updates the info in purge_sys. This +function is used to initialize purge_sys when the next record to purge is +not known, and also to update the purge system info on the next record when +purge has handled the whole undo log for a transaction. */ +static +void +trx_purge_choose_next_log(void) +/*===========================*/ +{ + ulint zip_size; + + ut_ad(purge_sys->next_stored == FALSE); + + zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys); + + if (purge_sys->rseg != NULL) { + trx_purge_read_undo_rec(purge_sys, zip_size); + } else { + /* There is nothing to do yet. */ + os_thread_yield(); + } +} + +/***********************************************************************//** +Gets the next record to purge and updates the info in the purge system. 
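
Purge order hinges on the property used by trx_purge_get_rseg_with_min_trx_id() above: the next records to purge come from the rollback segment whose oldest unpurged log has the smallest trx_no. A linear scan stands in for the ib_bh_* binary heap:

    #include <stdio.h>

    struct rseg_sketch { int id; unsigned long long last_trx_no; };

    int
    main(void)
    {
            struct rseg_sketch rsegs[] = {
                    {0, 900}, {1, 750}, {2, 810}
            };
            int best = 0, i;

            for (i = 1; i < 3; i++) {
                    if (rsegs[i].last_trx_no < rsegs[best].last_trx_no) {
                            best = i;
                    }
            }
            printf("purge next from rseg %d (trx_no %llu)\n",
                   rsegs[best].id, rsegs[best].last_trx_no); /* rseg 1, 750 */
            return(0);
    }
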
+@return copy of an undo log record or pointer to the dummy undo log record */ +static +trx_undo_rec_t* +trx_purge_get_next_rec( +/*===================*/ + ulint* n_pages_handled,/*!< in/out: number of UNDO pages + handled */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + trx_undo_rec_t* rec; + trx_undo_rec_t* rec_copy; + trx_undo_rec_t* rec2; + page_t* undo_page; + page_t* page; + ulint offset; + ulint page_no; + ulint space; + ulint zip_size; + mtr_t mtr; + + ut_ad(purge_sys->next_stored); + ut_ad(purge_sys->iter.trx_no < purge_sys->view->low_limit_no); + + space = purge_sys->rseg->space; + zip_size = purge_sys->rseg->zip_size; + page_no = purge_sys->page_no; + offset = purge_sys->offset; + + if (offset == 0) { + /* It is the dummy undo log record, which means that there is + no need to purge this undo log */ + + trx_purge_rseg_get_next_history_log( + purge_sys->rseg, n_pages_handled); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + return(&trx_purge_dummy_rec); + } + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(space, zip_size, page_no, &mtr); + + rec = undo_page + offset; + + rec2 = rec; + + for (;;) { + ulint type; + trx_undo_rec_t* next_rec; + ulint cmpl_info; + + /* Try first to find the next record which requires a purge + operation from the same page of the same undo log */ + + next_rec = trx_undo_page_get_next_rec( + rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset); + + if (next_rec == NULL) { + rec2 = trx_undo_get_next_rec( + rec2, purge_sys->hdr_page_no, + purge_sys->hdr_offset, &mtr); + break; + } + + rec2 = next_rec; + + type = trx_undo_rec_get_type(rec2); + + if (type == TRX_UNDO_DEL_MARK_REC) { + + break; + } + + cmpl_info = trx_undo_rec_get_cmpl_info(rec2); + + if (trx_undo_rec_get_extern_storage(rec2)) { + break; + } + + if ((type == TRX_UNDO_UPD_EXIST_REC) + && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + break; + } + } + + if (rec2 == NULL) { + mtr_commit(&mtr); + + trx_purge_rseg_get_next_history_log( + purge_sys->rseg, n_pages_handled); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched( + space, zip_size, page_no, &mtr); + + rec = undo_page + offset; + } else { + page = page_align(rec2); + + purge_sys->offset = rec2 - page; + purge_sys->page_no = page_get_page_no(page); + purge_sys->iter.undo_no = trx_undo_rec_get_undo_no(rec2); + + if (undo_page != page) { + /* We advance to a new page of the undo log: */ + (*n_pages_handled)++; + } + } + + rec_copy = trx_undo_rec_copy(rec, heap); + + mtr_commit(&mtr); + + return(rec_copy); +} + +/********************************************************************//** +Fetches the next undo log record from the history list to purge. It must be +released with the corresponding release function. 
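
Illustration (a sketch, separate from the patch): trx_purge_fetch_next_rec() below hands back each record's location as a roll pointer built by trx_undo_build_roll_ptr() from {is_insert, rseg id, page no, offset}. The 7-byte, 1/7/32/16-bit split below is an assumption about that encoding, not quoted from this patch.

#include <stdint.h>
#include <stdio.h>

/* Pack a roll pointer; bit widths (1/7/32/16) are assumed here. */
static uint64_t build_roll_ptr(int is_insert, unsigned rseg_id,
			       uint32_t page_no, uint16_t offset)
{
	return ((uint64_t) (is_insert != 0) << 55)
		| ((uint64_t) (rseg_id & 0x7F) << 48)
		| ((uint64_t) page_no << 16)
		| offset;
}

int main(void)
{
	uint64_t	rp = build_roll_ptr(0, 3, 17, 272);

	printf("roll_ptr = 0x%014llx\n", (unsigned long long) rp);
	printf("rseg=%u page=%u offset=%u\n",
	       (unsigned) ((rp >> 48) & 0x7F),
	       (unsigned) ((rp >> 16) & 0xFFFFFFFF),
	       (unsigned) (rp & 0xFFFF));
	return 0;
}
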
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+static __attribute__((warn_unused_result, nonnull))
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+	roll_ptr_t*	roll_ptr,	/*!< out: roll pointer to undo record */
+	ulint*		n_pages_handled,/*!< in/out: number of UNDO log pages
+					handled */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
+{
+	if (!purge_sys->next_stored) {
+		trx_purge_choose_next_log();
+
+		if (!purge_sys->next_stored) {
+
+			if (srv_print_thread_releases) {
+				fprintf(stderr,
+					"Purge: No logs left in the"
+					" history list\n");
+			}
+
+			return(NULL);
+		}
+	}
+
+	if (purge_sys->iter.trx_no >= purge_sys->view->low_limit_no) {
+
+		return(NULL);
+	}
+
+	/* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+	os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
+
+	*roll_ptr = trx_undo_build_roll_ptr(
+		FALSE, purge_sys->rseg->id,
+		purge_sys->page_no, purge_sys->offset);
+
+	/* The following call will advance the stored values of the
+	purge iterator. */
+
+	return(trx_purge_get_next_rec(n_pages_handled, heap));
+}
+
+/*******************************************************************//**
+Fetches the undo log records for a purge batch and attaches them to the
+purge query threads.
+@return number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(
+/*=======================*/
+	ulint		n_purge_threads,/*!< in: number of purge threads */
+	trx_purge_t*	purge_sys,	/*!< in/out: purge instance */
+	purge_iter_t*	limit,		/*!< out: records read up to */
+	ulint		batch_size)	/*!< in: no. of pages to purge */
+{
+	que_thr_t*	thr;
+	ulint		i = 0;
+	ulint		n_pages_handled = 0;
+	ulint		n_thrs = UT_LIST_GET_LEN(purge_sys->query->thrs);
+
+	ut_a(n_purge_threads > 0);
+
+	*limit = purge_sys->iter;
+
+	/* Debug code to validate some pre-requisites and reset done flag. */
+	for (thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+	     thr != NULL && i < n_purge_threads;
+	     thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+		purge_node_t*		node;
+
+		/* Get the purge node. */
+		node = (purge_node_t*) thr->child;
+
+		ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+		ut_a(node->undo_recs == NULL);
+		ut_a(node->done);
+
+		node->done = FALSE;
+	}
+
+	/* There should never be fewer nodes than threads, the inverse
+	however is allowed because we only use purge threads as needed. */
+	ut_a(i == n_purge_threads);
+
+	/* Fetch and parse the UNDO records. The UNDO records are added
+	to a per purge node vector. */
+	thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+	ut_a(n_thrs > 0 && thr != NULL);
+
+	ut_ad(trx_purge_check_limit());
+
+	i = 0;
+
+	for (;;) {
+		purge_node_t*		node;
+		trx_purge_rec_t*	purge_rec;
+
+		ut_a(!thr->is_active);
+
+		/* Get the purge node. */
+		node = (purge_node_t*) thr->child;
+		ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+
+		purge_rec = static_cast<trx_purge_rec_t*>(
+			mem_heap_zalloc(node->heap, sizeof(*purge_rec)));
+
+		/* Track the max {trx_id, undo_no} for truncating the
+		UNDO logs once we have purged the records. */
+
+		if (purge_sys->iter.trx_no > limit->trx_no
+		    || (purge_sys->iter.trx_no == limit->trx_no
+			&& purge_sys->iter.undo_no >= limit->undo_no)) {
+
+			*limit = purge_sys->iter;
+		}
+
+		/* Fetch the next record, and advance the purge_sys->iter.
*/ + purge_rec->undo_rec = trx_purge_fetch_next_rec( + &purge_rec->roll_ptr, &n_pages_handled, node->heap); + + if (purge_rec->undo_rec != NULL) { + + if (node->undo_recs == NULL) { + node->undo_recs = ib_vector_create( + ib_heap_allocator_create(node->heap), + sizeof(trx_purge_rec_t), + batch_size); + } else { + ut_a(!ib_vector_is_empty(node->undo_recs)); + } + + ib_vector_push(node->undo_recs, purge_rec); + + if (n_pages_handled >= batch_size) { + + break; + } + } else { + break; + } + + thr = UT_LIST_GET_NEXT(thrs, thr); + + if (!(++i % n_purge_threads)) { + thr = UT_LIST_GET_FIRST(purge_sys->query->thrs); + } + + ut_a(thr != NULL); + } + + ut_ad(trx_purge_check_limit()); + + return(n_pages_handled); +} + +/*******************************************************************//** +Calculate the DML delay required. +@return delay in microseconds or ULINT_MAX */ +static +ulint +trx_purge_dml_delay(void) +/*=====================*/ +{ + /* Determine how much data manipulation language (DML) statements + need to be delayed in order to reduce the lagging of the purge + thread. */ + ulint delay = 0; /* in microseconds; default: no delay */ + + /* If purge lag is set (ie. > 0) then calculate the new DML delay. + Note: we do a dirty read of the trx_sys_t data structure here, + without holding trx_sys->mutex. */ + + if (srv_max_purge_lag > 0) { + float ratio; + + ratio = float(trx_sys->rseg_history_len) / srv_max_purge_lag; + + if (ratio > 1.0) { + /* If the history list length exceeds the + srv_max_purge_lag, the data manipulation + statements are delayed by at least 5000 + microseconds. */ + delay = (ulint) ((ratio - .5) * 10000); + } + + if (delay > srv_max_purge_lag_delay) { + delay = srv_max_purge_lag_delay; + } + + MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay); + } + + return(delay); +} + +/*******************************************************************//** +Wait for pending purge jobs to complete. */ +static +void +trx_purge_wait_for_workers_to_complete( +/*===================================*/ + trx_purge_t* purge_sys) /*!< in: purge instance */ +{ + ulint n_submitted = purge_sys->n_submitted; + +#ifdef HAVE_ATOMIC_BUILTINS + /* Ensure that the work queue empties out. */ + while (!os_compare_and_swap_ulint( + &purge_sys->n_completed, n_submitted, n_submitted)) { +#else + mutex_enter(&purge_sys->bh_mutex); + + while (purge_sys->n_completed < n_submitted) { +#endif /* HAVE_ATOMIC_BUILTINS */ + +#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&purge_sys->bh_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS */ + + if (srv_get_task_queue_length() > 0) { + srv_release_threads(SRV_WORKER, 1); + } + + os_thread_yield(); + +#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&purge_sys->bh_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS */ + } + +#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&purge_sys->bh_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS */ + + /* None of the worker threads should be doing any work. */ + ut_a(purge_sys->n_submitted == purge_sys->n_completed); + + /* There should be no outstanding tasks as long + as the worker threads are active. */ + ut_a(srv_get_task_queue_length() == 0); +} + +/******************************************************************//** +Remove old historical changes from the rollback segments. 
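
Illustration (a worked example, separate from the patch): trx_purge_dml_delay() above, as a pure function. With srv_max_purge_lag = 100000 and a history list of 200000 entries, ratio = 2.0, so the delay is (2.0 - 0.5) * 10000 = 15000 microseconds before the srv_max_purge_lag_delay cap is applied.

#include <stdio.h>

/* Mirrors the delay formula above: 0 below the lag threshold,
(ratio - 0.5) * 10000 us above it, clamped to max_delay. */
static unsigned long purge_dml_delay(unsigned long history_len,
				     unsigned long max_lag,
				     unsigned long max_delay)
{
	unsigned long	delay = 0;

	if (max_lag > 0) {
		float	ratio = (float) history_len / (float) max_lag;

		if (ratio > 1.0f) {
			delay = (unsigned long)
				((ratio - 0.5f) * 10000.0f);
		}
		if (delay > max_delay) {
			delay = max_delay;
		}
	}
	return delay;
}

int main(void)
{
	printf("delay = %lu us\n",
	       purge_dml_delay(200000, 100000, 100000));
	return 0;
}

The 0.5 offset means the delay starts at roughly 5000 microseconds as soon as the history list first exceeds the configured lag, rather than ramping up from zero.
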
*/ +static +void +trx_purge_truncate(void) +/*====================*/ +{ + ut_ad(trx_purge_check_limit()); + + if (purge_sys->limit.trx_no == 0) { + trx_purge_truncate_history(&purge_sys->iter, purge_sys->view); + } else { + trx_purge_truncate_history(&purge_sys->limit, purge_sys->view); + } +} + +/*******************************************************************//** +This function runs a purge batch. +@return number of undo log pages handled in the batch */ +UNIV_INTERN +ulint +trx_purge( +/*======*/ + ulint n_purge_threads, /*!< in: number of purge tasks + to submit to the work queue */ + ulint batch_size, /*!< in: the maximum number of records + to purge in one batch */ + bool truncate) /*!< in: truncate history if true */ +{ + que_thr_t* thr = NULL; + ulint n_pages_handled; + + ut_a(n_purge_threads > 0); + + srv_dml_needed_delay = trx_purge_dml_delay(); + + /* The number of tasks submitted should be completed. */ + ut_a(purge_sys->n_submitted == purge_sys->n_completed); + + rw_lock_x_lock(&purge_sys->latch); + + purge_sys->view = NULL; + + mem_heap_empty(purge_sys->heap); + + purge_sys->view = read_view_purge_open(purge_sys->prebuilt_clone, + purge_sys->prebuilt_view); + + rw_lock_x_unlock(&purge_sys->latch); + +#ifdef UNIV_DEBUG + if (srv_purge_view_update_only_debug) { + return(0); + } +#endif + + /* Fetch the UNDO recs that need to be purged. */ + n_pages_handled = trx_purge_attach_undo_recs( + n_purge_threads, purge_sys, &purge_sys->limit, batch_size); + + /* Do we do an asynchronous purge or not ? */ + if (n_purge_threads > 1) { + ulint i = 0; + + /* Submit the tasks to the work queue. */ + for (i = 0; i < n_purge_threads - 1; ++i) { + thr = que_fork_scheduler_round_robin( + purge_sys->query, thr); + + ut_a(thr != NULL); + + srv_que_task_enqueue_low(thr); + } + + thr = que_fork_scheduler_round_robin(purge_sys->query, thr); + ut_a(thr != NULL); + + purge_sys->n_submitted += n_purge_threads - 1; + + goto run_synchronously; + + /* Do it synchronously. */ + } else { + thr = que_fork_scheduler_round_robin(purge_sys->query, NULL); + ut_ad(thr); + +run_synchronously: + ++purge_sys->n_submitted; + + que_run_threads(thr); + + os_atomic_inc_ulint( + &purge_sys->bh_mutex, &purge_sys->n_completed, 1); + + if (n_purge_threads > 1) { + trx_purge_wait_for_workers_to_complete(purge_sys); + } + } + + ut_a(purge_sys->n_submitted == purge_sys->n_completed); + +#ifdef UNIV_DEBUG + rw_lock_x_lock(&purge_sys->latch); + if (purge_sys->limit.trx_no == 0) { + purge_sys->done = purge_sys->iter; + } else { + purge_sys->done = purge_sys->limit; + } + rw_lock_x_unlock(&purge_sys->latch); +#endif /* UNIV_DEBUG */ + + if (truncate) { + trx_purge_truncate(); + } + + MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); + MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled); + + return(n_pages_handled); +} + +/*******************************************************************//** +Get the purge state. +@return purge state. */ +UNIV_INTERN +purge_state_t +trx_purge_state(void) +/*=================*/ +{ + purge_state_t state; + + rw_lock_x_lock(&purge_sys->latch); + + state = purge_sys->state; + + rw_lock_x_unlock(&purge_sys->latch); + + return(state); +} + +/*******************************************************************//** +Stop purge and wait for it to stop, move to PURGE_STATE_STOP. 
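
Illustration (a sketch, separate from the patch): the batch accounting in trx_purge() above. With n_purge_threads = 4 the coordinator enqueues three worker tasks and runs one unit of work itself; the batch is finished when n_completed catches up with n_submitted. Plain counters below stand in for the atomic/mutex-protected fields.

#include <stdio.h>

int main(void)
{
	unsigned	n_purge_threads = 4;
	unsigned	submitted = 0, completed = 0;

	/* Coordinator hands n - 1 tasks to worker threads... */
	submitted += n_purge_threads - 1;
	/* ...and executes one task synchronously itself. */
	submitted += 1;
	completed += 1;

	/* Worker threads would increment 'completed' as they finish. */
	completed += n_purge_threads - 1;

	printf("submitted=%u completed=%u done=%s\n",
	       submitted, completed,
	       submitted == completed ? "yes" : "no");
	return 0;
}
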
*/ +UNIV_INTERN +void +trx_purge_stop(void) +/*================*/ +{ + purge_state_t state; + ib_int64_t sig_count = os_event_reset(purge_sys->event); + + ut_a(srv_n_purge_threads > 0); + + rw_lock_x_lock(&purge_sys->latch); + + ut_a(purge_sys->state != PURGE_STATE_INIT); + ut_a(purge_sys->state != PURGE_STATE_EXIT); + ut_a(purge_sys->state != PURGE_STATE_DISABLED); + + ++purge_sys->n_stop; + + state = purge_sys->state; + + if (state == PURGE_STATE_RUN) { + ib_logf(IB_LOG_LEVEL_INFO, "Stopping purge"); + + /* We need to wakeup the purge thread in case it is suspended, + so that it can acknowledge the state change. */ + + srv_purge_wakeup(); + } + + purge_sys->state = PURGE_STATE_STOP; + + rw_lock_x_unlock(&purge_sys->latch); + + if (state != PURGE_STATE_STOP) { + + /* Wait for purge coordinator to signal that it + is suspended. */ + os_event_wait_low(purge_sys->event, sig_count); + } else { + bool once = true; + + rw_lock_x_lock(&purge_sys->latch); + + /* Wait for purge to signal that it has actually stopped. */ + while (purge_sys->running) { + + if (once) { + ib_logf(IB_LOG_LEVEL_INFO, + "Waiting for purge to stop"); + once = false; + } + + rw_lock_x_unlock(&purge_sys->latch); + + os_thread_sleep(10000); + + rw_lock_x_lock(&purge_sys->latch); + } + + rw_lock_x_unlock(&purge_sys->latch); + } + + MONITOR_INC_VALUE(MONITOR_PURGE_STOP_COUNT, 1); +} + +/*******************************************************************//** +Resume purge, move to PURGE_STATE_RUN. */ +UNIV_INTERN +void +trx_purge_run(void) +/*===============*/ +{ + rw_lock_x_lock(&purge_sys->latch); + + switch(purge_sys->state) { + case PURGE_STATE_INIT: + case PURGE_STATE_EXIT: + case PURGE_STATE_DISABLED: + ut_error; + + case PURGE_STATE_RUN: + case PURGE_STATE_STOP: + break; + } + + if (purge_sys->n_stop > 0) { + + ut_a(purge_sys->state == PURGE_STATE_STOP); + + --purge_sys->n_stop; + + if (purge_sys->n_stop == 0) { + + ib_logf(IB_LOG_LEVEL_INFO, "Resuming purge"); + + purge_sys->state = PURGE_STATE_RUN; + } + + MONITOR_INC_VALUE(MONITOR_PURGE_RESUME_COUNT, 1); + } else { + ut_a(purge_sys->state == PURGE_STATE_RUN); + } + + rw_lock_x_unlock(&purge_sys->latch); + + srv_purge_wakeup(); +} diff --git a/storage/xtradb/trx/trx0rec.cc b/storage/xtradb/trx/trx0rec.cc new file mode 100644 index 00000000000..a698b37c2a6 --- /dev/null +++ b/storage/xtradb/trx/trx0rec.cc @@ -0,0 +1,1656 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0rec.cc +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rec.h" + +#ifdef UNIV_NONINL +#include "trx0rec.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0undo.h" +#include "mtr0log.h" +#ifndef UNIV_HOTBACKUP +#include "dict0dict.h" +#include "ut0mem.h" +#include "read0read.h" +#include "row0ext.h" +#include "row0upd.h" +#include "que0que.h" +#include "trx0purge.h" +#include "trx0rseg.h" +#include "row0row.h" + +/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/ + +/**********************************************************************//** +Writes the mtr log entry of the inserted undo log record on the undo log +page. */ +UNIV_INLINE +void +trx_undof_page_add_undo_rec_log( +/*============================*/ + page_t* undo_page, /*!< in: undo log page */ + ulint old_free, /*!< in: start offset of the inserted entry */ + ulint new_free, /*!< in: end offset of the entry */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + const byte* log_end; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); + + if (log_ptr == NULL) { + + return; + } + + log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; + log_ptr = mlog_write_initial_log_record_fast( + undo_page, MLOG_UNDO_INSERT, log_ptr, mtr); + len = new_free - old_free - 4; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + + if (log_ptr + len <= log_end) { + memcpy(log_ptr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + len); + } else { + mlog_close(mtr, log_ptr); + mlog_catenate_string(mtr, undo_page + old_free + 2, len); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of adding an undo log record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_add_undo_rec( +/*========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page) /*!< in: page or NULL */ +{ + ulint len; + byte* rec; + ulint first_free; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + len = mach_read_from_2(ptr); + ptr += 2; + + if (end_ptr < ptr + len) { + + return(NULL); + } + + if (page == NULL) { + + return(ptr + len); + } + + first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + rec = page + first_free; + + mach_write_to_2(rec, first_free + 4 + len); + mach_write_to_2(rec + 2 + len, first_free); + + mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + first_free + 4 + len); + ut_memcpy(rec + 2, ptr, len); + + return(ptr + len); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Calculates the free space left for extending an undo log record. 
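
Illustration (a sketch, separate from the patch): how trx_undo_parse_add_undo_rec() above lays a replayed record out on the page — a 2-byte "next record" offset, the record body, then a 2-byte "this record" offset, after which the page's first-free pointer advances by len + 4. Offsets below are toy values; write2() mimics the big-endian mach_write_to_2().

#include <stdio.h>
#include <string.h>

/* Big-endian 2-byte write, like mach_write_to_2(). */
static void write2(unsigned char* p, unsigned v)
{
	p[0] = (unsigned char) (v >> 8);
	p[1] = (unsigned char) (v & 0xFF);
}

int main(void)
{
	unsigned char		page[256];
	unsigned		first_free = 100;	/* toy offset */
	const unsigned char	body[] = { 0x11, 0x22, 0x33 };
	unsigned		len = sizeof(body);

	memset(page, 0, sizeof(page));

	write2(page + first_free, first_free + 4 + len);  /* next rec  */
	memcpy(page + first_free + 2, body, len);         /* payload   */
	write2(page + first_free + 2 + len, first_free);  /* this rec  */

	printf("new first_free = %u\n", first_free + 4 + len);
	return 0;
}
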
+@return bytes left */ +UNIV_INLINE +ulint +trx_undo_left( +/*==========*/ + const page_t* page, /*!< in: undo log page */ + const byte* ptr) /*!< in: pointer to page */ +{ + /* The '- 10' is a safety margin, in case we have some small + calculation error below */ + + return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END); +} + +/**********************************************************************//** +Set the next and previous pointers in the undo page for the undo record +that was written to ptr. Update the first free value by the number of bytes +written for this undo record. +@return offset of the inserted entry on the page if succeeded, 0 if fail */ +static +ulint +trx_undo_page_set_next_prev_and_add( +/*================================*/ + page_t* undo_page, /*!< in/out: undo log page */ + byte* ptr, /*!< in: ptr up to where data has been + written on this undo page. */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint first_free; /*!< offset within undo_page */ + ulint end_of_rec; /*!< offset within undo_page */ + byte* ptr_to_first_free; + /* pointer within undo_page + that points to the next free + offset value within undo_page.*/ + + ut_ad(ptr > undo_page); + ut_ad(ptr < undo_page + UNIV_PAGE_SIZE); + + if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) { + + return(0); + } + + ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE; + + first_free = mach_read_from_2(ptr_to_first_free); + + /* Write offset of the previous undo log record */ + mach_write_to_2(ptr, first_free); + ptr += 2; + + end_of_rec = ptr - undo_page; + + /* Write offset of the next undo log record */ + mach_write_to_2(undo_page + first_free, end_of_rec); + + /* Update the offset to first free undo record */ + mach_write_to_2(ptr_to_first_free, end_of_rec); + + /* Write this log entry to the UNDO log */ + trx_undof_page_add_undo_rec_log(undo_page, first_free, + end_of_rec, mtr); + + return(first_free); +} + +/**********************************************************************//** +Reports in the undo log of an insert of a clustered index record. 
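
Illustration (a worked example, separate from the patch): the free-space computation of trx_undo_left() above on a 16 KiB page. FIL_PAGE_DATA_END = 8 is an assumed value here; the extra 10 bytes are the safety margin mentioned in the comment.

#include <stdio.h>

int main(void)
{
	unsigned long	page_size = 16384;	/* UNIV_PAGE_SIZE */
	unsigned long	data_end  = 8;		/* assumed FIL_PAGE_DATA_END */
	unsigned long	used      = 12000;	/* ptr - page, toy value */

	/* Same arithmetic as trx_undo_left(): page size minus what is
	already written, the trailer, and a 10-byte safety margin. */
	printf("bytes left: %lu\n", page_size - used - 10 - data_end);
	return 0;
}
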
+@return offset of the inserted entry on the page on success, 0 on failure */
+static
+ulint
+trx_undo_page_report_insert(
+/*========================*/
+	page_t*		undo_page,	/*!< in: undo log page */
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const dtuple_t*	clust_entry,	/*!< in: index entry which will be
+					inserted into the clustered index */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint		first_free;
+	byte*		ptr;
+	ulint		i;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+			       + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT);
+
+	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				      + TRX_UNDO_PAGE_FREE);
+	ptr = undo_page + first_free;
+
+	ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+	if (trx_undo_left(undo_page, ptr) < 2 + 1 + 11 + 11) {
+
+		/* Not enough space for writing the general parameters */
+
+		return(0);
+	}
+
+	/* Reserve 2 bytes for the pointer to the next undo log record */
+	ptr += 2;
+
+	/* Store first some general parameters to the undo log */
+	*ptr++ = TRX_UNDO_INSERT_REC;
+	ptr += mach_ull_write_much_compressed(ptr, trx->undo_no);
+	ptr += mach_ull_write_much_compressed(ptr, index->table->id);
+	/*----------------------------------------*/
+	/* Store then the fields required to uniquely determine the record
+	to be inserted in the clustered index */
+
+	for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+		const dfield_t*	field	= dtuple_get_nth_field(clust_entry, i);
+		ulint		flen	= dfield_get_len(field);
+
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		ptr += mach_write_compressed(ptr, flen);
+
+		if (flen != UNIV_SQL_NULL) {
+			if (trx_undo_left(undo_page, ptr) < flen) {
+
+				return(0);
+			}
+
+			ut_memcpy(ptr, dfield_get_data(field), flen);
+			ptr += flen;
+		}
+	}
+
+	return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+	trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	ulint*		type,		/*!< out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	ulint*		cmpl_info,	/*!< out: compiler info, relevant only
+					for update type records */
+	bool*		updated_extern,	/*!< out: true if we updated an
+					externally stored field */
+	undo_no_t*	undo_no,	/*!< out: undo log record number */
+	table_id_t*	table_id)	/*!< out: table id */
+{
+	byte*		ptr;
+	ulint		type_cmpl;
+
+	ptr = undo_rec + 2;
+
+	type_cmpl = mach_read_from_1(ptr);
+	ptr++;
+
+	*updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
+	type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
+
+	*type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+	*cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+	*undo_no = mach_ull_read_much_compressed(ptr);
+	ptr += mach_ull_get_much_compressed_size(*undo_no);
+
+	*table_id = mach_ull_read_much_compressed(ptr);
+	ptr += mach_ull_get_much_compressed_size(*table_id);
+
+	return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an undo log record a stored column value.
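
Illustration (a sketch, separate from the patch): decoding the first type byte of an undo record, as trx_undo_rec_get_pars() above does. TRX_UNDO_CMPL_INFO_MULT = 16 and TRX_UNDO_UPD_EXTERN = 128 are assumed values from trx0rec.h, not quoted from this patch.

#include <stdio.h>

#define CMPL_INFO_MULT	16	/* assumed TRX_UNDO_CMPL_INFO_MULT */
#define UPD_EXTERN	128	/* assumed TRX_UNDO_UPD_EXTERN */

int main(void)
{
	unsigned	type_cmpl = 0xAD;	/* example byte */
	int		updated_extern = (type_cmpl & UPD_EXTERN) != 0;

	/* Clear the extern flag, then split type and compiler info. */
	type_cmpl &= ~(unsigned) UPD_EXTERN;

	printf("type=%u cmpl_info=%u extern=%d\n",
	       type_cmpl & (CMPL_INFO_MULT - 1),
	       type_cmpl / CMPL_INFO_MULT,
	       updated_extern);
	return 0;
}
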
+@return remaining part of undo log record after reading these values */ +static +byte* +trx_undo_rec_get_col_val( +/*=====================*/ + byte* ptr, /*!< in: pointer to remaining part of undo log record */ + byte** field, /*!< out: pointer to stored field */ + ulint* len, /*!< out: length of the field, or UNIV_SQL_NULL */ + ulint* orig_len)/*!< out: original length of the locally + stored part of an externally stored column, or 0 */ +{ + *len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*len); + + *orig_len = 0; + + switch (*len) { + case UNIV_SQL_NULL: + *field = NULL; + break; + case UNIV_EXTERN_STORAGE_FIELD: + *orig_len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*orig_len); + *len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*len); + *field = ptr; + ptr += *len; + + ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(*len > *orig_len); + /* @see dtuple_convert_big_rec() */ + ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE); + + /* we do not have access to index->table here + ut_ad(dict_table_get_format(index->table) >= UNIV_FORMAT_B + || *len >= col->max_prefix + + BTR_EXTERN_FIELD_REF_SIZE); + */ + + *len += UNIV_EXTERN_STORAGE_FIELD; + break; + default: + *field = ptr; + if (*len >= UNIV_EXTERN_STORAGE_FIELD) { + ptr += *len - UNIV_EXTERN_STORAGE_FIELD; + } else { + ptr += *len; + } + } + + return(ptr); +} + +/*******************************************************************//** +Builds a row reference from an undo log record. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_get_row_ref( +/*=====================*/ + byte* ptr, /*!< in: remaining part of a copy of an undo log + record, at the start of the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the row reference is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t** ref, /*!< out, own: row reference */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + ulint ref_len; + ulint i; + + ut_ad(index && ptr && ref && heap); + ut_a(dict_index_is_clust(index)); + + ref_len = dict_index_get_n_unique(index); + + *ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(*ref, index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield_t* dfield; + byte* field; + ulint len; + ulint orig_len; + + dfield = dtuple_get_nth_field(*ref, i); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + dfield_set_data(dfield, field, len); + } + + return(ptr); +} + +/*******************************************************************//** +Skips a row reference from an undo log record. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_skip_row_ref( +/*======================*/ + byte* ptr, /*!< in: remaining part in update undo log + record, at the start of the row reference */ + dict_index_t* index) /*!< in: clustered index */ +{ + ulint ref_len; + ulint i; + + ut_ad(index && ptr); + ut_a(dict_index_is_clust(index)); + + ref_len = dict_index_get_n_unique(index); + + for (i = 0; i < ref_len; i++) { + byte* field; + ulint len; + ulint orig_len; + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + } + + return(ptr); +} + +/**********************************************************************//** +Fetch a prefix of an externally stored column, for writing to the undo log +of an update or delete marking of a clustered index record. 
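
Illustration (a sketch, separate from the patch): how a stored column length is classified by trx_undo_rec_get_col_val() above. The sentinel values are assumptions: UNIV_SQL_NULL = 0xFFFFFFFF and UNIV_EXTERN_STORAGE_FIELD = UNIV_SQL_NULL - UNIV_PAGE_SIZE (16384 here).

#include <stdio.h>

#define SQL_NULL	0xFFFFFFFFUL		/* assumed UNIV_SQL_NULL */
#define EXTERN_FIELD	(SQL_NULL - 16384UL)	/* assumed marker base */

/* Same decision order as the switch above: NULL first, then the
exact extern marker, then extern lengths offset by the marker. */
static const char* classify(unsigned long len)
{
	if (len == SQL_NULL) return "SQL NULL";
	if (len == EXTERN_FIELD) return "extern marker (prefix follows)";
	if (len >= EXTERN_FIELD) return "extern, length is len - marker";
	return "ordinary, stored inline";
}

int main(void)
{
	printf("%s\n", classify(40));
	printf("%s\n", classify(SQL_NULL));
	printf("%s\n", classify(EXTERN_FIELD + 120));
	return 0;
}
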
+@return ext_buf */ +static +byte* +trx_undo_page_fetch_ext( +/*====================*/ + byte* ext_buf, /*!< in: buffer to hold the prefix + data and BLOB pointer */ + ulint prefix_len, /*!< in: prefix size to store + in the undo log */ + ulint zip_size, /*!< compressed page size in bytes, + or 0 for uncompressed BLOB */ + const byte* field, /*!< in: an externally stored column */ + ulint* len) /*!< in: length of field; + out: used length of ext_buf */ +{ + /* Fetch the BLOB. */ + ulint ext_len = btr_copy_externally_stored_field_prefix( + ext_buf, prefix_len, zip_size, field, *len); + /* BLOBs should always be nonempty. */ + ut_a(ext_len); + /* Append the BLOB pointer to the prefix. */ + memcpy(ext_buf + ext_len, + field + *len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE; + return(ext_buf); +} + +/**********************************************************************//** +Writes to the undo log a prefix of an externally stored column. +@return undo log position */ +static +byte* +trx_undo_page_report_modify_ext( +/*============================*/ + byte* ptr, /*!< in: undo log position, + at least 15 bytes must be available */ + byte* ext_buf, /*!< in: a buffer of + DICT_MAX_FIELD_LEN_BY_FORMAT() size, + or NULL when should not fetch + a longer prefix */ + ulint prefix_len, /*!< prefix size to store in the + undo log */ + ulint zip_size, /*!< compressed page size in bytes, + or 0 for uncompressed BLOB */ + const byte** field, /*!< in/out: the locally stored part of + the externally stored column */ + ulint* len) /*!< in/out: length of field, in bytes */ +{ + if (ext_buf) { + ut_a(prefix_len > 0); + + /* If an ordering column is externally stored, we will + have to store a longer prefix of the field. In this + case, write to the log a marker followed by the + original length and the real length of the field. */ + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD); + + ptr += mach_write_compressed(ptr, *len); + + *field = trx_undo_page_fetch_ext(ext_buf, prefix_len, zip_size, + *field, len); + + ptr += mach_write_compressed(ptr, *len); + } else { + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD + + *len); + } + + return(ptr); +} + +/**********************************************************************//** +Reports in the undo log of an update or delete marking of a clustered index +record. 
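
Illustration (a sketch, separate from the patch): the ext_buf layout produced by trx_undo_page_fetch_ext() above — a fetched BLOB prefix followed by the 20-byte external field reference (BTR_EXTERN_FIELD_REF_SIZE, assumed here) copied from the end of the locally stored field. All sizes are toy values.

#include <stdio.h>
#include <string.h>

#define FIELD_REF_SIZE	20	/* assumed BTR_EXTERN_FIELD_REF_SIZE */

int main(void)
{
	unsigned char	field[100];			/* locally stored part */
	unsigned char	ext_buf[64 + FIELD_REF_SIZE];
	size_t		prefix_len = 64;		/* fetched BLOB prefix */

	memset(field, 'x', sizeof(field));
	memset(field + sizeof(field) - FIELD_REF_SIZE, 'R', FIELD_REF_SIZE);

	/* Prefix bytes first (stand-in for the fetched BLOB prefix)... */
	memset(ext_buf, 'p', prefix_len);
	/* ...then the field reference from the end of the local field. */
	memcpy(ext_buf + prefix_len,
	       field + sizeof(field) - FIELD_REF_SIZE, FIELD_REF_SIZE);

	printf("undo log stores %zu bytes\n", prefix_len + FIELD_REF_SIZE);
	return 0;
}
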
+@return byte offset of the inserted undo log entry on the page if +succeed, 0 if fail */ +static +ulint +trx_undo_page_report_modify( +/*========================*/ + page_t* undo_page, /*!< in: undo log page */ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: clustered index where update or + delete marking is done */ + const rec_t* rec, /*!< in: clustered index record which + has NOT yet been modified */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector which tells the + columns to be updated; in the case of + a delete, this should be set to NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_table_t* table; + ulint first_free; + byte* ptr; + const byte* field; + ulint flen; + ulint col_no; + ulint type_cmpl; + byte* type_cmpl_ptr; + ulint i; + trx_id_t trx_id; + ibool ignore_prefix = FALSE; + byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE]; + + ut_a(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); + table = index->table; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + ptr = undo_page + first_free; + + ut_ad(first_free <= UNIV_PAGE_SIZE); + + if (trx_undo_left(undo_page, ptr) < 50) { + + /* NOTE: the value 50 must be big enough so that the general + fields written below fit on the undo log page */ + + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + + if (!update) { + ut_ad(!rec_get_deleted_flag(rec, dict_table_is_comp(table))); + type_cmpl = TRX_UNDO_DEL_MARK_REC; + } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) { + type_cmpl = TRX_UNDO_UPD_DEL_REC; + /* We are about to update a delete marked record. + We don't typically need the prefix in this case unless + the delete marking is done by the same transaction + (which we check below). */ + ignore_prefix = TRUE; + } else { + type_cmpl = TRX_UNDO_UPD_EXIST_REC; + } + + type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT; + type_cmpl_ptr = ptr; + + *ptr++ = (byte) type_cmpl; + ptr += mach_ull_write_much_compressed(ptr, trx->undo_no); + + ptr += mach_ull_write_much_compressed(ptr, table->id); + + /*----------------------------------------*/ + /* Store the state of the info bits */ + + *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table)); + + /* Store the values of the system columns */ + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos( + index, DATA_TRX_ID), &flen); + ut_ad(flen == DATA_TRX_ID_LEN); + + trx_id = trx_read_trx_id(field); + + /* If it is an update of a delete marked record, then we are + allowed to ignore blob prefixes if the delete marking was done + by some other trx as it must have committed by now for us to + allow an over-write. 
*/ + if (ignore_prefix) { + ignore_prefix = (trx_id != trx->id); + } + ptr += mach_ull_write_compressed(ptr, trx_id); + + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos( + index, DATA_ROLL_PTR), &flen); + ut_ad(flen == DATA_ROLL_PTR_LEN); + + ptr += mach_ull_write_compressed(ptr, trx_read_roll_ptr(field)); + + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the + record which will be modified in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + field = rec_get_nth_field(rec, offsets, i, &flen); + + /* The ordering columns must not be stored externally. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + ut_ad(dict_index_get_nth_col(index, i)->ord_part); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + + /*----------------------------------------*/ + /* Save to the undo log the old values of the columns to be updated. */ + + if (update) { + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, upd_get_n_fields(update)); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + ulint pos = upd_get_nth_field(update, i)->field_no; + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, pos); + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, &flen); + + if (trx_undo_left(undo_page, ptr) < 15) { + + return(0); + } + + if (rec_offs_nth_extern(offsets, pos)) { + const dict_col_t* col + = dict_index_get_nth_col(index, pos); + ulint prefix_len + = dict_max_field_len_store_undo( + table, col); + + ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE + <= sizeof ext_buf); + + ptr = trx_undo_page_report_modify_ext( + ptr, + col->ord_part + && !ignore_prefix + && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN + ? ext_buf : NULL, prefix_len, + dict_table_zip_size(table), + &field, &flen); + + /* Notify purge that it eventually has to + free the old externally stored field */ + + trx->update_undo->del_marks = TRUE; + + *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN; + } else { + ptr += mach_write_compressed(ptr, flen); + } + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + /*----------------------------------------*/ + /* In the case of a delete marking, and also in the case of an update + where any ordering field of any index changes, store the values of all + columns which occur as ordering fields in any index. This info is used + in the purge of old versions where we use it to build and search the + delete marked index records, to look if we can remove them from the + index tree. Note that starting from 4.0.14 also externally stored + fields can be ordering in some index. Starting from 5.2, we no longer + store REC_MAX_INDEX_COL_LEN first bytes to the undo log record, + but we can construct the column prefix fields in the index by + fetching the first page of the BLOB that is pointed to by the + clustered index. This works also in crash recovery, because all pages + (including BLOBs) are recovered before anything is rolled back. 
*/ + + if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + byte* old_ptr = ptr; + + trx->update_undo->del_marks = TRUE; + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + /* Reserve 2 bytes to write the number of bytes the stored + fields take in this undo record */ + + ptr += 2; + + for (col_no = 0; col_no < dict_table_get_n_cols(table); + col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + + if (col->ord_part) { + ulint pos; + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5 + 15) { + + return(0); + } + + pos = dict_index_get_nth_col_pos(index, + col_no); + ptr += mach_write_compressed(ptr, pos); + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, + &flen); + + if (rec_offs_nth_extern(offsets, pos)) { + const dict_col_t* col = + dict_index_get_nth_col( + index, pos); + ulint prefix_len = + dict_max_field_len_store_undo( + table, col); + + ut_a(prefix_len < sizeof ext_buf); + + ptr = trx_undo_page_report_modify_ext( + ptr, + flen < REC_ANTELOPE_MAX_INDEX_COL_LEN + && !ignore_prefix + ? ext_buf : NULL, prefix_len, + dict_table_zip_size(table), + &field, &flen); + } else { + ptr += mach_write_compressed( + ptr, flen); + } + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) + < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + mach_write_to_2(old_ptr, ptr - old_ptr); + } + + /*----------------------------------------*/ + /* Write pointers to the previous and the next undo log records */ + if (trx_undo_left(undo_page, ptr) < 2) { + + return(0); + } + + mach_write_to_2(ptr, first_free); + ptr += 2; + mach_write_to_2(undo_page + first_free, ptr - undo_page); + + mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + ptr - undo_page); + + /* Write to the REDO log about this change in the UNDO log */ + + trx_undof_page_add_undo_rec_log(undo_page, first_free, + ptr - undo_page, mtr); + return(first_free); +} + +/**********************************************************************//** +Reads from an undo log update record the system field values of the old +version. +@return remaining part of undo log record after reading these values */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_sys_cols( +/*=============================*/ + byte* ptr, /*!< in: remaining part of undo + log record after reading + general parameters */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr, /*!< out: roll ptr */ + ulint* info_bits) /*!< out: info bits state */ +{ + /* Read the state of the info bits */ + *info_bits = mach_read_from_1(ptr); + ptr += 1; + + /* Read the values of the system columns */ + + *trx_id = mach_ull_read_compressed(ptr); + ptr += mach_ull_get_compressed_size(*trx_id); + + *roll_ptr = mach_ull_read_compressed(ptr); + ptr += mach_ull_get_compressed_size(*roll_ptr); + + return(ptr); +} + +/**********************************************************************//** +Reads from an update undo log record the number of updated fields. 
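
Illustration (a sketch, separate from the patch): the shape of the updated-fields section read by the helpers below — a field count, then (field number, length, value) entries. Plain one-byte integers stand in for the compressed mach_* encoding actually used.

#include <stdio.h>

int main(void)
{
	/* n=2; {field 3, len 1, 'A'}, {field 5, len 2, 'B','C'} */
	const unsigned char	rec[] = { 2, 3, 1, 'A', 5, 2, 'B', 'C' };
	const unsigned char*	p = rec;
	unsigned		n = *p++;

	for (unsigned i = 0; i < n; i++) {
		unsigned	field_no = *p++;
		unsigned	len = *p++;

		printf("field %u: %.*s\n",
		       field_no, (int) len, (const char*) p);
		p += len;
	}
	return 0;
}
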
+@return remaining part of undo log record after reading this value */ +UNIV_INLINE +byte* +trx_undo_update_rec_get_n_upd_fields( +/*=================================*/ + byte* ptr, /*!< in: pointer to remaining part of undo log record */ + ulint* n) /*!< out: number of fields */ +{ + *n = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*n); + + return(ptr); +} + +/**********************************************************************//** +Reads from an update undo log record a stored field number. +@return remaining part of undo log record after reading this value */ +UNIV_INLINE +byte* +trx_undo_update_rec_get_field_no( +/*=============================*/ + byte* ptr, /*!< in: pointer to remaining part of undo log record */ + ulint* field_no)/*!< out: field number */ +{ + *field_no = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*field_no); + + return(ptr); +} + +/*******************************************************************//** +Builds an update vector based on a remaining part of an undo log record. +@return remaining part of the record, NULL if an error detected, which +means that the record is corrupted */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_update( +/*===========================*/ + byte* ptr, /*!< in: remaining part in update undo log + record, after reading the row reference + NOTE that this copy of the undo log record must + be preserved as long as the update vector is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC, + TRX_UNDO_UPD_DEL_REC, or + TRX_UNDO_DEL_MARK_REC; in the last case, + only trx id and roll ptr fields are added to + the update vector */ + trx_id_t trx_id, /*!< in: transaction id from this undo record */ + roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */ + ulint info_bits,/*!< in: info bits from this undo record */ + trx_t* trx, /*!< in: transaction */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + upd_t** upd) /*!< out, own: update vector */ +{ + upd_field_t* upd_field; + upd_t* update; + ulint n_fields; + byte* buf; + ulint i; + + ut_a(dict_index_is_clust(index)); + + if (type != TRX_UNDO_DEL_MARK_REC) { + ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields); + } else { + n_fields = 0; + } + + update = upd_create(n_fields + 2, heap); + + update->info_bits = info_bits; + + /* Store first trx id and roll ptr to update vector */ + + upd_field = upd_get_nth_field(update, n_fields); + + buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN)); + + trx_write_trx_id(buf, trx_id); + + upd_field_set_field_no(upd_field, + dict_index_get_sys_col_pos(index, DATA_TRX_ID), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN); + + upd_field = upd_get_nth_field(update, n_fields + 1); + + buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN)); + + trx_write_roll_ptr(buf, roll_ptr); + + upd_field_set_field_no( + upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN); + + /* Store then the updated ordinary columns to the update vector */ + + for (i = 0; i < n_fields; i++) { + + byte* field; + ulint len; + ulint field_no; + ulint orig_len; + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + if (field_no >= dict_index_get_n_fields(index)) { + fprintf(stderr, + "InnoDB: Error: trying to access" + " update undo rec field %lu in ", + 
(ulong) field_no); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, "\n" + "InnoDB: but index has only %lu fields\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n" + "InnoDB: Run also CHECK TABLE ", + (ulong) dict_index_get_n_fields(index)); + ut_print_name(stderr, trx, TRUE, index->table_name); + fprintf(stderr, "\n" + "InnoDB: n_fields = %lu, i = %lu, ptr %p\n", + (ulong) n_fields, (ulong) i, ptr); + ut_ad(0); + *upd = NULL; + return(NULL); + } + + upd_field = upd_get_nth_field(update, i); + + upd_field_set_field_no(upd_field, field_no, index, trx); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + upd_field->orig_len = orig_len; + + if (len == UNIV_SQL_NULL) { + dfield_set_null(&upd_field->new_val); + } else if (len < UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_data(&upd_field->new_val, field, len); + } else { + len -= UNIV_EXTERN_STORAGE_FIELD; + + dfield_set_data(&upd_field->new_val, field, len); + dfield_set_ext(&upd_field->new_val); + } + } + + *upd = update; + + return(ptr); +} + +/*******************************************************************//** +Builds a partial row from an update undo log record, for purge. +It contains the columns which occur as ordering in any index of the table. +Any missing columns are indicated by col->mtype == DATA_MISSING. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_get_partial_row( +/*=========================*/ + byte* ptr, /*!< in: remaining part in update undo log + record of a suitable type, at the start of + the stored index columns; + NOTE that this copy of the undo log record must + be preserved as long as the partial row is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t** row, /*!< out, own: partial row */ + ibool ignore_prefix, /*!< in: flag to indicate if we + expect blob prefixes in undo. Used + only in the assertion. */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + const byte* end_ptr; + ulint row_len; + + ut_ad(index); + ut_ad(ptr); + ut_ad(row); + ut_ad(heap); + ut_ad(dict_index_is_clust(index)); + + row_len = dict_table_get_n_cols(index->table); + + *row = dtuple_create(heap, row_len); + + /* Mark all columns in the row uninitialized, so that + we can distinguish missing fields from fields that are SQL NULL. */ + for (ulint i = 0; i < row_len; i++) { + dfield_get_type(dtuple_get_nth_field(*row, i)) + ->mtype = DATA_MISSING; + } + + end_ptr = ptr + mach_read_from_2(ptr); + ptr += 2; + + while (ptr != end_ptr) { + dfield_t* dfield; + byte* field; + ulint field_no; + const dict_col_t* col; + ulint col_no; + ulint len; + ulint orig_len; + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + col = dict_index_get_nth_col(index, field_no); + col_no = dict_col_get_no(col); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + dfield = dtuple_get_nth_field(*row, col_no); + dict_col_copy_type( + dict_table_get_nth_col(index->table, col_no), + dfield_get_type(dfield)); + dfield_set_data(dfield, field, len); + + if (len != UNIV_SQL_NULL + && len >= UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_len(dfield, + len - UNIV_EXTERN_STORAGE_FIELD); + dfield_set_ext(dfield); + /* If the prefix of this column is indexed, + ensure that enough prefix is stored in the + undo log record. 
*/ + if (!ignore_prefix && col->ord_part) { + ut_a(dfield_get_len(dfield) + >= BTR_EXTERN_FIELD_REF_SIZE); + ut_a(dict_table_get_format(index->table) + >= UNIV_FORMAT_B + || dfield_get_len(dfield) + >= REC_ANTELOPE_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +Erases the unused undo log page end. +@return TRUE if the page contained something, FALSE if it was empty */ +static __attribute__((nonnull)) +ibool +trx_undo_erase_page_end( +/*====================*/ + page_t* undo_page, /*!< in/out: undo page whose end to erase */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint first_free; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + memset(undo_page + first_free, 0xff, + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free); + + mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); + return(first_free != TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); +} + +/***********************************************************//** +Parses a redo log record of erasing of an undo page end. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_erase_page_end( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), /*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + if (page == NULL) { + + return(ptr); + } + + trx_undo_erase_page_end(page, mtr); + + return(ptr); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Writes information to an undo log about an insert, update, or a delete marking +of a clustered index record. This information is used in a rollback of the +transaction and in consistent reads that must look to the history of this +transaction. 
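
Illustration (a sketch, separate from the patch): the effect of trx_undo_erase_page_end() above on the page image — everything from the first free offset up to the page trailer is overwritten with 0xFF. The page size and offsets are toy values; FIL_PAGE_DATA_END = 8 is assumed.

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char	page[256];
	unsigned	first_free = 120;	/* from TRX_UNDO_PAGE_FREE */
	unsigned	data_end = 8;		/* assumed FIL_PAGE_DATA_END */

	memset(page, 0xAA, sizeof(page));
	/* Erase the unused tail so redo replay reproduces the page exactly. */
	memset(page + first_free, 0xFF,
	       sizeof(page) - data_end - first_free);

	printf("erased %u bytes at offset %u\n",
	       (unsigned) (sizeof(page) - data_end - first_free),
	       first_free);
	return 0;
}
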
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +trx_undo_report_row_operation( +/*==========================*/ + ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is + set, does nothing */ + ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or + TRX_UNDO_MODIFY_OP */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* clust_entry, /*!< in: in the case of an insert, + index entry to insert into the + clustered index, otherwise NULL */ + const upd_t* update, /*!< in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + const rec_t* rec, /*!< in: in case of an update or delete + marking, the record in the clustered + index, otherwise NULL */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the + inserted undo log record, + 0 if BTR_NO_UNDO_LOG + flag was specified */ +{ + trx_t* trx; + trx_undo_t* undo; + ulint page_no; + buf_block_t* undo_block; + trx_rseg_t* rseg; + mtr_t mtr; + dberr_t err = DB_SUCCESS; +#ifdef UNIV_DEBUG + int loop_count = 0; +#endif /* UNIV_DEBUG */ + + ut_ad(!srv_read_only_mode); + ut_a(dict_index_is_clust(index)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + + *roll_ptr = 0; + + return(DB_SUCCESS); + } + + ut_ad(thr); + ut_ad((op_type != TRX_UNDO_INSERT_OP) + || (clust_entry && !update && !rec)); + + trx = thr_get_trx(thr); + + /* This table is visible only to the session that created it. */ + if (trx->read_only) { + ut_ad(!srv_read_only_mode); + /* MySQL should block writes to non-temporary tables. */ + ut_a(DICT_TF2_FLAG_IS_SET(index->table, DICT_TF2_TEMPORARY)); + if (trx->rseg == 0) { + trx_assign_rseg(trx); + } + } + + rseg = trx->rseg; + + mtr_start(&mtr); + mutex_enter(&trx->undo_mutex); + + /* If the undo log is not assigned yet, assign one */ + + switch (op_type) { + case TRX_UNDO_INSERT_OP: + undo = trx->insert_undo; + + if (undo == NULL) { + + err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT); + undo = trx->insert_undo; + + if (undo == NULL) { + /* Did not succeed */ + ut_ad(err != DB_SUCCESS); + goto err_exit; + } + + ut_ad(err == DB_SUCCESS); + } + break; + default: + ut_ad(op_type == TRX_UNDO_MODIFY_OP); + + undo = trx->update_undo; + + if (undo == NULL) { + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + undo = trx->update_undo; + + if (undo == NULL) { + /* Did not succeed */ + ut_ad(err != DB_SUCCESS); + goto err_exit; + } + } + + ut_ad(err == DB_SUCCESS); + } + + page_no = undo->last_page_no; + undo_block = buf_page_get_gen( + undo->space, undo->zip_size, page_no, RW_X_LATCH, + undo->guess_block, BUF_GET, __FILE__, __LINE__, &mtr); + buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE); + + do { + page_t* undo_page; + ulint offset; + + undo_page = buf_block_get_frame(undo_block); + ut_ad(page_no == buf_block_get_page_no(undo_block)); + + switch (op_type) { + case TRX_UNDO_INSERT_OP: + offset = trx_undo_page_report_insert( + undo_page, trx, index, clust_entry, &mtr); + break; + default: + ut_ad(op_type == TRX_UNDO_MODIFY_OP); + offset = trx_undo_page_report_modify( + undo_page, trx, index, rec, offsets, update, + cmpl_info, &mtr); + } + + if (UNIV_UNLIKELY(offset == 0)) { + /* The record did not fit on the page. 
We erase the + end segment of the undo log page and write a log + record of it: this is to ensure that in the debug + version the replicate page constructed using the log + records stays identical to the original page */ + + if (!trx_undo_erase_page_end(undo_page, &mtr)) { + /* The record did not fit on an empty + undo page. Discard the freshly allocated + page and return an error. */ + + /* When we remove a page from an undo + log, this is analogous to a + pessimistic insert in a B-tree, and we + must reserve the counterpart of the + tree latch, which is the rseg + mutex. We must commit the mini-transaction + first, because it may be holding lower-level + latches, such as SYNC_FSP and SYNC_FSP_PAGE. */ + + mtr_commit(&mtr); + mtr_start(&mtr); + + mutex_enter(&rseg->mutex); + trx_undo_free_last_page(trx, undo, &mtr); + mutex_exit(&rseg->mutex); + + err = DB_UNDO_RECORD_TOO_BIG; + goto err_exit; + } + + mtr_commit(&mtr); + } else { + /* Success */ + + mtr_commit(&mtr); + + undo->empty = FALSE; + undo->top_page_no = page_no; + undo->top_offset = offset; + undo->top_undo_no = trx->undo_no; + undo->guess_block = undo_block; + + trx->undo_no++; + + mutex_exit(&trx->undo_mutex); + + *roll_ptr = trx_undo_build_roll_ptr( + op_type == TRX_UNDO_INSERT_OP, + rseg->id, page_no, offset); + return(DB_SUCCESS); + } + + ut_ad(page_no == undo->last_page_no); + + /* We have to extend the undo log by one page */ + + ut_ad(++loop_count < 2); + mtr_start(&mtr); + + /* When we add a page to an undo log, this is analogous to + a pessimistic insert in a B-tree, and we must reserve the + counterpart of the tree latch, which is the rseg mutex. */ + + mutex_enter(&rseg->mutex); + undo_block = trx_undo_add_page(trx, undo, &mtr); + mutex_exit(&rseg->mutex); + + page_no = undo->last_page_no; + } while (undo_block != NULL); + + /* Did not succeed: out of space */ + err = DB_OUT_OF_FILE_SPACE; + +err_exit: + mutex_exit(&trx->undo_mutex); + mtr_commit(&mtr); + return(err); +} + +/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ + +/******************************************************************//** +Copies an undo record to heap. This function can be called if we know that +the undo log record exists. +@return own: copy of the record */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_undo_rec_low( +/*======================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + trx_undo_rec_t* undo_rec; + ulint rseg_id; + ulint page_no; + ulint offset; + const page_t* undo_page; + trx_rseg_t* rseg; + ibool is_insert; + mtr_t mtr; + + trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, + &offset); + rseg = trx_rseg_get_on_id(rseg_id); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, + page_no, &mtr); + + undo_rec = trx_undo_rec_copy(undo_page + offset, heap); + + mtr_commit(&mtr); + + return(undo_rec); +} + +/******************************************************************//** +Copies an undo record to heap. + +NOTE: the caller must have latches on the clustered index page. 
+ +@retval true if the undo log has been +truncated and we cannot fetch the old version +@retval false if the undo log record is available */ +static __attribute__((nonnull, warn_unused_result)) +bool +trx_undo_get_undo_rec( +/*==================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ + trx_id_t trx_id, /*!< in: id of the trx that generated + the roll pointer: it points to an + undo log of this transaction */ + trx_undo_rec_t**undo_rec, /*!< out, own: copy of the record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + bool missing_history; + + rw_lock_s_lock(&purge_sys->latch); + missing_history = read_view_sees_trx_id(purge_sys->view, trx_id); + + if (!missing_history) { + *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); + } + + rw_lock_s_unlock(&purge_sys->latch); + + return(missing_history); +} + +#ifdef UNIV_DEBUG +#define ATTRIB_USED_ONLY_IN_DEBUG +#else /* UNIV_DEBUG */ +#define ATTRIB_USED_ONLY_IN_DEBUG __attribute__((unused)) +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Build a previous version of a clustered index record. The caller must +hold a latch on the index page of the clustered index record. +@retval true if previous version was built, or if it was an insert +or the table has been rebuilt +@retval false if the previous version is earlier than purge_view, +which means that it may have been removed */ +UNIV_INTERN +bool +trx_undo_prev_version_build( +/*========================*/ + const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG, + /*!< in: clustered index record in the + index tree */ + mtr_t* index_mtr ATTRIB_USED_ONLY_IN_DEBUG, + /*!< in: mtr which contains the latch to + index_rec page and purge_view */ + const rec_t* rec, /*!< in: version of a clustered index record */ + dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers)/*!< out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted (an error), + or if the purge COULD have removed the version + though it has not yet done so */ +{ + trx_undo_rec_t* undo_rec = NULL; + dtuple_t* entry; + trx_id_t rec_trx_id; + ulint type; + undo_no_t undo_no; + table_id_t table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + upd_t* update; + byte* ptr; + ulint info_bits; + ulint cmpl_info; + bool dummy_extern; + byte* buf; +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&purge_sys->latch, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains_page(index_mtr, index_rec, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_a(dict_index_is_clust(index)); + + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); + + *old_vers = NULL; + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + /* The record rec is the first inserted version */ + return(true); + } + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + if (trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap)) { + /* The undo record may already have been purged, + during purge or semi-consistent read. 
*/ + return(false); + } + + ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + + if (table_id != index->table->id) { + /* The table should have been rebuilt, but purge has + not yet removed the undo log records for the + now-dropped old table (table_id). */ + return(true); + } + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + /* (a) If a clustered index record version is such that the + trx id stamp in it is bigger than purge_sys->view, then the + BLOBs in that version are known to exist (the purge has not + progressed that far); + + (b) if the version is the first version such that trx id in it + is less than purge_sys->view, and it is not delete-marked, + then the BLOBs in that version are known to exist (the purge + cannot have purged the BLOBs referenced by that version + yet). + + This function does not fetch any BLOBs. The callers might, by + possibly invoking row_ext_create() via row_build(). However, + they should have all needed information in the *old_vers + returned by this function. This is because *old_vers is based + on the transaction undo log records. The function + trx_undo_page_fetch_ext() will write BLOB prefixes to the + transaction undo log that are at least as long as the longest + possible column prefix in a secondary index. Thus, secondary + index entries for *old_vers can be constructed without + dereferencing any BLOB pointers. */ + + ptr = trx_undo_rec_skip_row_ref(ptr, index); + + ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, + roll_ptr, info_bits, + NULL, heap, &update); + ut_a(ptr); + +# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets)); +# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (row_upd_changes_field_size_or_external(index, offsets, update)) { + ulint n_ext; + + /* We should confirm the existence of disowned external data, + if the previous version record is delete marked. If the trx_id + of the previous record is seen by purge view, we should treat + it as missing history, because the disowned external data + might be purged already. + + The inherited external data (BLOBs) can be freed (purged) + after trx_id was committed, provided that no view was started + before trx_id. If the purge view can see the committed + delete-marked record by trx_id, no transactions need to access + the BLOB. */ + + /* the row_upd_changes_disowned_external(update) call could be + omitted, but the synchronization on purge_sys->latch is likely + more expensive. */ + + if ((update->info_bits & REC_INFO_DELETED_FLAG) + && row_upd_changes_disowned_external(update)) { + bool missing_extern; + + rw_lock_s_lock(&purge_sys->latch); + missing_extern = read_view_sees_trx_id(purge_sys->view, + trx_id); + rw_lock_s_unlock(&purge_sys->latch); + + if (missing_extern) { + /* treat as a fresh insert, not to + cause assertion error at the caller. */ + return(true); + } + } + + /* We have to set the appropriate extern storage bits in the + old version of the record: the extern bits in rec for those + fields that update does NOT update, as well as the bits for + those fields that update updates to become externally stored + fields. Store the info: */ + + entry = row_rec_to_index_entry( + rec, index, offsets, &n_ext, heap); + n_ext += btr_push_update_extern_fields(entry, update, heap); + /* The page containing the clustered index record + corresponding to entry is latched in mtr. Thus the + following call is safe. 
*/ + row_upd_index_replace_new_col_vals(entry, index, update, heap); + + buf = static_cast<byte*>( + mem_heap_alloc( + heap, + rec_get_converted_size(index, entry, n_ext))); + + *old_vers = rec_convert_dtuple_to_rec(buf, index, + entry, n_ext); + } else { + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, offsets); + row_upd_rec_in_place(*old_vers, index, offsets, update, NULL); + } + + return(true); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/trx/trx0roll.cc b/storage/xtradb/trx/trx0roll.cc new file mode 100644 index 00000000000..a64367c4ba7 --- /dev/null +++ b/storage/xtradb/trx/trx0roll.cc @@ -0,0 +1,1386 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0roll.cc +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0roll.h" + +#ifdef UNIV_NONINL +#include "trx0roll.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "trx0rec.h" +#include "que0que.h" +#include "usr0sess.h" +#include "srv0start.h" +#include "read0read.h" +#include "row0undo.h" +#include "row0mysql.h" +#include "lock0lock.h" +#include "pars0pars.h" +#include "srv0mon.h" +#include "trx0sys.h" + +/** This many pages must be undone before a truncate is tried within +rollback */ +#define TRX_ROLL_TRUNC_THRESHOLD 1 + +/** In crash recovery, the current trx to be rolled back; NULL otherwise */ +static const trx_t* trx_roll_crash_recv_trx = NULL; + +/** In crash recovery we set this to the undo n:o of the current trx to be +rolled back. Then we can print how many % the rollback has progressed. */ +static undo_no_t trx_roll_max_undo_no; + +/** Auxiliary variable which tells the previous progress % we printed */ +static ulint trx_roll_progress_printed_pct; + +/****************************************************************//** +Finishes a transaction rollback. */ +static +void +trx_rollback_finish( +/*================*/ + trx_t* trx); /*!< in: transaction */ + +/*******************************************************************//** +Rollback a transaction used in MySQL. 
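
Both the statement-level and full rollback paths below reduce to one rule: undo records are popped in descending undo-number order until the top record's number drops below a limit, where the limit is the savepoint's least_undo_no for a partial rollback and 0 for a complete one. A toy version of that loop (UndoRec and the vector-backed log are invented for illustration):

#include <cstdint>
#include <vector>

/* Each undo record carries the undo number it was logged under;
rolling back to a savepoint undoes records with undo_no >= limit. */
struct UndoRec { uint64_t undo_no; };

static void rollback_to(std::vector<UndoRec>& undo_log, uint64_t limit)
{
        while (!undo_log.empty() && undo_log.back().undo_no >= limit) {
                /* apply the inverse operation here */
                undo_log.pop_back();
        }
}

int main()
{
        std::vector<UndoRec> log = {{0}, {1}, {2}, {3}};
        rollback_to(log, 2);    /* partial: undoes records 3 and 2 */
        rollback_to(log, 0);    /* complete rollback */
        return (int) log.size();        /* 0 */
}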
*/ +static +void +trx_rollback_to_savepoint_low( +/*==========================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if + partial rollback requested, or NULL for + complete rollback */ +{ + que_thr_t* thr; + mem_heap_t* heap; + roll_node_t* roll_node; + + heap = mem_heap_create(512); + + roll_node = roll_node_create(heap); + + if (savept != NULL) { + roll_node->partial = TRUE; + roll_node->savept = *savept; + assert_trx_in_list(trx); + } else { + assert_trx_nonlocking_or_in_list(trx); + } + + trx->error_state = DB_SUCCESS; + + if (trx->insert_undo || trx->update_undo) { + thr = pars_complete_graph_for_exec(roll_node, trx, heap); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); + + que_run_threads(thr); + + ut_a(roll_node->undo_thr != NULL); + que_run_threads(roll_node->undo_thr); + + /* Free the memory reserved by the undo graph. */ + que_graph_free(static_cast<que_t*>( + roll_node->undo_thr->common.parent)); + } + + if (savept == NULL) { + trx_rollback_finish(trx); + MONITOR_INC(MONITOR_TRX_ROLLBACK); + } else { + trx->lock.que_state = TRX_QUE_RUNNING; + MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT); + } + + ut_a(trx->error_state == DB_SUCCESS); + ut_a(trx->lock.que_state == TRX_QUE_RUNNING); + + mem_heap_free(heap); + + /* There might be work for utility threads.*/ + srv_active_wake_master_thread(); + + MONITOR_DEC(MONITOR_TRX_ACTIVE); +} + +/*******************************************************************//** +Rollback a transaction to a given savepoint or do a complete rollback. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_to_savepoint( +/*======================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if + partial rollback requested, or NULL for + complete rollback */ +{ + ut_ad(!trx_mutex_own(trx)); + + trx_start_if_not_started_xa(trx); + + trx_rollback_to_savepoint_low(trx, savept); + + return(trx->error_state); +} + +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +static +dberr_t +trx_rollback_for_mysql_low( +/*=======================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + trx->op_info = "rollback"; + + /* If we are doing the XA recovery of prepared transactions, + then the transaction object does not have an InnoDB session + object, and we set a dummy session that we use for all MySQL + transactions. */ + + trx_rollback_to_savepoint_low(trx, NULL); + + trx->op_info = ""; + + ut_a(trx->error_state == DB_SUCCESS); + + return(trx->error_state); +} + +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_for_mysql( +/*===================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* We are reading trx->state without holding trx_sys->mutex + here, because the rollback should be invoked for a running + active MySQL transaction (or recovered prepared transaction) + that is associated with the current thread. 
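
The switch that follows dispatches purely on the state snapshot read above: NOT_STARTED is a successful no-op, ACTIVE and (XA-recovered) PREPARED transactions are rolled back, and COMMITTED_IN_MEMORY is a caller error. Condensed into a standalone sketch with stand-in enums:

enum TrxState { NOT_STARTED, ACTIVE, PREPARED, COMMITTED_IN_MEMORY };
enum Status { DB_SUCCESS_, DB_CORRUPTION_ };

static Status do_rollback() { return DB_SUCCESS_; }

static Status rollback_for_client(TrxState state)
{
        switch (state) {
        case NOT_STARTED:
                return DB_SUCCESS_;     /* nothing to undo */
        case ACTIVE:
        case PREPARED:
                return do_rollback();   /* normal and XA-recovered paths */
        case COMMITTED_IN_MEMORY:
                break;                  /* too late to roll back */
        }
        return DB_CORRUPTION_;
}

int main() { return rollback_for_client(ACTIVE) == DB_SUCCESS_ ? 0 : 1; }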
*/ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + ut_ad(trx->in_mysql_trx_list); + return(DB_SUCCESS); + + case TRX_STATE_ACTIVE: + ut_ad(trx->in_mysql_trx_list); + assert_trx_nonlocking_or_in_list(trx); + return(trx_rollback_for_mysql_low(trx)); + + case TRX_STATE_PREPARED: + ut_ad(!trx_is_autocommit_non_locking(trx)); + return(trx_rollback_for_mysql_low(trx)); + + case TRX_STATE_COMMITTED_IN_MEMORY: + assert_trx_in_list(trx); + break; + } + + ut_error; + return(DB_CORRUPTION); +} + +/*******************************************************************//** +Rollback the latest SQL statement for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + dberr_t err; + + /* We are reading trx->state without holding trx_sys->mutex + here, because the statement rollback should be invoked for a + running active MySQL transaction that is associated with the + current thread. */ + ut_ad(trx->in_mysql_trx_list); + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + return(DB_SUCCESS); + case TRX_STATE_ACTIVE: + assert_trx_nonlocking_or_in_list(trx); + + trx->op_info = "rollback of SQL statement"; + + err = trx_rollback_to_savepoint( + trx, &trx->last_sql_stat_start); + + if (trx->fts_trx) { + fts_savepoint_rollback_last_stmt(trx); + } + + /* The following call should not be needed, + but we play it safe: */ + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The statement rollback is only allowed on an ACTIVE + transaction, not a PREPARED or COMMITTED one. */ + break; + } + + ut_error; + return(DB_CORRUPTION); +} + +/*******************************************************************//** +Search for a savepoint using name. +@return savepoint if found else NULL */ +static +trx_named_savept_t* +trx_savepoint_find( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + const char* name) /*!< in: savepoint name */ +{ + trx_named_savept_t* savep; + + for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + savep != NULL; + savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) { + + if (0 == ut_strcmp(savep->name, name)) { + return(savep); + } + } + + return(NULL); +} + +/*******************************************************************//** +Frees a single savepoint struct. */ +static +void +trx_roll_savepoint_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep) /*!< in: savepoint to free */ +{ + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + mem_free(savep->name); + mem_free(savep); +} + +/*******************************************************************//** +Frees savepoint structs starting from savep. */ +UNIV_INTERN +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep) /*!< in: free all savepoints starting + with this savepoint i*/ +{ + while (savep != NULL) { + trx_named_savept_t* next_savep; + + next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + + trx_roll_savepoint_free(trx, savep); + + savep = next_savep; + } +} + +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. 
If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +trx_rollback_to_savepoint_for_mysql_low( +/*====================================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_named_savept_t* savep, /*!< in/out: savepoint */ + ib_int64_t* mysql_binlog_cache_pos) + /*!< out: the MySQL binlog + cache position corresponding + to this savepoint; MySQL needs + this information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + dberr_t err; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->in_mysql_trx_list); + + /* Free all savepoints strictly later than savep. */ + + trx_roll_savepoints_free( + trx, UT_LIST_GET_NEXT(trx_savepoints, savep)); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = "rollback to a savepoint"; + + err = trx_rollback_to_savepoint(trx, &savep->savept); + + /* Store the current undo_no of the transaction so that + we know where to roll back if we have to roll back the + next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); +} + +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + trx_named_savept_t* savep; + + /* We are reading trx->state without holding trx_sys->mutex + here, because the savepoint rollback should be invoked for a + running active MySQL transaction that is associated with the + current thread. */ + ut_ad(trx->in_mysql_trx_list); + + savep = trx_savepoint_find(trx, savepoint_name); + + if (savep == NULL) { + return(DB_NO_SAVEPOINT); + } + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: transaction has a savepoint ", stderr); + ut_print_name(stderr, trx, FALSE, savep->name); + fputs(" though it is not started\n", stderr); + return(DB_ERROR); + case TRX_STATE_ACTIVE: + return(trx_rollback_to_savepoint_for_mysql_low( + trx, savep, mysql_binlog_cache_pos)); + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The savepoint rollback is only allowed on an ACTIVE + transaction, not a PREPARED or COMMITTED one. 
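
Rolling back to a named savepoint therefore needs only two list operations: find the savepoint by name, then free every savepoint set strictly after it; locks taken since the savepoint stay held, as the comment above stresses. A sketch with std::list, where Savepoint stands in for trx_named_savept_t:

#include <cstdint>
#include <iterator>
#include <list>
#include <string>

struct Savepoint {
        std::string     name;
        uint64_t        least_undo_no;
};

/* Roll back to the named savepoint: keep it, drop everything set
after it. Returns false if the name is unknown (DB_NO_SAVEPOINT). */
static bool rollback_to_savepoint(std::list<Savepoint>& savepoints,
                                  const std::string& name,
                                  uint64_t* undo_limit)
{
        for (auto it = savepoints.begin(); it != savepoints.end(); ++it) {
                if (it->name == name) {
                        *undo_limit = it->least_undo_no;
                        /* free all savepoints strictly later than it */
                        savepoints.erase(std::next(it), savepoints.end());
                        return true;
                }
        }
        return false;
}

int main()
{
        std::list<Savepoint> sp = {{"a", 10}, {"b", 20}, {"c", 30}};
        uint64_t limit = 0;
        bool ok = rollback_to_savepoint(sp, "b", &limit);
        return ok && limit == 20 && sp.size() == 2 ? 0 : 1;
}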
*/ + break; + } + + ut_error; + return(DB_CORRUPTION); +} + +/*******************************************************************//** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. +@return always DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_savepoint_for_mysql( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +{ + trx_named_savept_t* savep; + + trx_start_if_not_started_xa(trx); + + savep = trx_savepoint_find(trx, savepoint_name); + + if (savep) { + /* There is a savepoint with the same name: free that */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + } + + /* Create a new savepoint and add it as the last in the list */ + + savep = static_cast<trx_named_savept_t*>(mem_alloc(sizeof(*savep))); + + savep->name = mem_strdup(savepoint_name); + + savep->savept = trx_savept_take(trx); + + savep->mysql_binlog_cache_pos = binlog_cache_pos; + + UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep); + + return(DB_SUCCESS); +} + +/*******************************************************************//** +Releases only the named savepoint. Savepoints which were set after this +savepoint are left as is. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_release_savepoint_for_mysql( +/*============================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name) /*!< in: savepoint name */ +{ + trx_named_savept_t* savep; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->in_mysql_trx_list); + + savep = trx_savepoint_find(trx, savepoint_name); + + if (savep != NULL) { + trx_roll_savepoint_free(trx, savep); + } + + return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT); +} + +/*******************************************************************//** +Determines if this transaction is rolling back an incomplete transaction +in crash recovery. +@return TRUE if trx is an incomplete transaction that is being rolled +back in crash recovery */ +UNIV_INTERN +ibool +trx_is_recv( +/*========*/ + const trx_t* trx) /*!< in: transaction */ +{ + return(trx == trx_roll_crash_recv_trx); +} + +/*******************************************************************//** +Returns a transaction savepoint taken at this point in time. +@return savepoint */ +UNIV_INTERN +trx_savept_t +trx_savept_take( +/*============*/ + trx_t* trx) /*!< in: transaction */ +{ + trx_savept_t savept; + + savept.least_undo_no = trx->undo_no; + + return(savept); +} + +/*******************************************************************//** +Roll back an active transaction. 
*/ +static +void +trx_rollback_active( +/*================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + roll_node_t* roll_node; + dict_table_t* table; + ib_int64_t rows_to_undo; + const char* unit = ""; + ibool dictionary_locked = FALSE; + + heap = mem_heap_create(512); + + fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + + roll_node = roll_node_create(heap); + + thr->child = roll_node; + roll_node->common.parent = thr; + + trx->graph = fork; + + ut_a(thr == que_fork_start_command(fork)); + + mutex_enter(&trx_sys->mutex); + + trx_roll_crash_recv_trx = trx; + + trx_roll_max_undo_no = trx->undo_no; + + trx_roll_progress_printed_pct = 0; + + rows_to_undo = trx_roll_max_undo_no; + + mutex_exit(&trx_sys->mutex); + + if (rows_to_undo > 1000000000) { + rows_to_undo = rows_to_undo / 1000000; + unit = "M"; + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s" + " rows to undo\n", + trx->id, + (ulong) rows_to_undo, unit); + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + row_mysql_lock_data_dictionary(trx); + dictionary_locked = TRUE; + } + + que_run_threads(thr); + ut_a(roll_node->undo_thr != NULL); + + que_run_threads(roll_node->undo_thr); + + trx_rollback_finish(thr_get_trx(roll_node->undo_thr)); + + /* Free the memory reserved by the undo graph */ + que_graph_free(static_cast<que_t*>( + roll_node->undo_thr->common.parent)); + + ut_a(trx->lock.que_state == TRX_QUE_RUNNING); + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE + && trx->table_id != 0) { + + /* If the transaction was for a dictionary operation, + we drop the relevant table only if it is not flagged + as DISCARDED. If it still exists. */ + + table = dict_table_open_on_id( + trx->table_id, dictionary_locked, + DICT_TABLE_OP_NORMAL); + + if (table && !dict_table_is_discarded(table)) { + + dberr_t err; + + /* Ensure that the table doesn't get evicted from the + cache, keeps things simple for drop. */ + + if (table->can_be_evicted) { + dict_table_move_from_lru_to_non_lru(table); + } + + dict_table_close(table, dictionary_locked, FALSE); + + ib_logf(IB_LOG_LEVEL_WARN, + "Dropping table '%s', with id " UINT64PF " " + "in recovery", + table->name, trx->table_id); + + err = row_drop_table_for_mysql(table->name, trx, TRUE); + trx_commit_for_mysql(trx); + + ut_a(err == DB_SUCCESS); + } + } + + if (dictionary_locked) { + row_mysql_unlock_data_dictionary(trx); + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Rollback of trx with id " TRX_ID_FMT " completed", trx->id); + + mem_heap_free(heap); + + trx_roll_crash_recv_trx = NULL; +} + +/*******************************************************************//** +Rollback or clean up any resurrected incomplete transactions. It assumes +that the caller holds the trx_sys_t::mutex and it will release the +lock if it does a clean up or rollback. +@return TRUE if the transaction was cleaned up or rolled back +and trx_sys->mutex was released. */ +static +ibool +trx_rollback_resurrected( +/*=====================*/ + trx_t* trx, /*!< in: transaction to rollback or clean */ + ibool all) /*!< in: FALSE=roll back dictionary transactions; + TRUE=roll back all non-PREPARED transactions */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* The trx->is_recovered flag and trx->state are set + atomically under the protection of the trx->mutex (and + lock_sys->mutex) in lock_trx_release_locks(). 
We do not want + to accidentally clean up a non-recovered transaction here. */ + + trx_mutex_enter(trx); + bool is_recovered = trx->is_recovered; + trx_state_t state = trx->state; + trx_mutex_exit(trx); + + if (!is_recovered) { + return(FALSE); + } + + switch (state) { + case TRX_STATE_COMMITTED_IN_MEMORY: + mutex_exit(&trx_sys->mutex); + fprintf(stderr, + "InnoDB: Cleaning up trx with id " TRX_ID_FMT "\n", + trx->id); + trx_cleanup_at_db_startup(trx); + trx_free_for_background(trx); + return(TRUE); + case TRX_STATE_ACTIVE: + if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + mutex_exit(&trx_sys->mutex); + trx_rollback_active(trx); + trx_free_for_background(trx); + return(TRUE); + } + return(FALSE); + case TRX_STATE_PREPARED: + return(FALSE); + case TRX_STATE_NOT_STARTED: + break; + } + + ut_error; + return(FALSE); +} + +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. */ +UNIV_INTERN +void +trx_rollback_or_clean_recovered( +/*============================*/ + ibool all) /*!< in: FALSE=roll back dictionary transactions; + TRUE=roll back all non-PREPARED transactions */ +{ + trx_t* trx; + + ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO); + + if (trx_sys_get_n_rw_trx() == 0) { + + return; + } + + if (all) { + fprintf(stderr, + "InnoDB: Starting in background the rollback" + " of uncommitted transactions\n"); + } + + /* Note: For XA recovered transactions, we rely on MySQL to + do rollback. They will be in TRX_STATE_PREPARED state. If the server + is shutdown and they are still lingering in trx_sys_t::trx_list + then the shutdown will hang. */ + + /* Loop over the transaction list as long as there are + recovered transactions to clean up or recover. */ + + do { + mutex_enter(&trx_sys->mutex); + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_rw_list(trx); + + /* If this function does a cleanup or rollback + then it will release the trx_sys->mutex, therefore + we need to reacquire it before retrying the loop. */ + + if (trx_rollback_resurrected(trx, all)) { + + mutex_enter(&trx_sys->mutex); + + break; + } + } + + mutex_exit(&trx_sys->mutex); + + } while (trx != NULL); + + if (all) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Rollback of non-prepared" + " transactions completed\n"); + } +} + +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( +/*================================================*/ + void* arg __attribute__((unused))) + /*!< in: a dummy parameter required by + os_thread_create */ +{ + ut_ad(!srv_read_only_mode); + +#ifdef UNIV_PFS_THREAD + pfs_register_thread(trx_rollback_clean_thread_key); +#endif /* UNIV_PFS_THREAD */ + + trx_rollback_or_clean_recovered(TRUE); + + /* We count the number of threads in os_thread_exit(). 
A created + thread should always use that to exit and not use return() to exit. */ + + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + +/*******************************************************************//** +Creates an undo number array. +@return own: undo number array */ +static +trx_undo_arr_t* +trx_undo_arr_create( +/*================*/ + ulint n_cells) /*!< Number of cells */ +{ + trx_undo_arr_t* arr; + mem_heap_t* heap; + ulint sz = sizeof(*arr) + sizeof(*arr->infos) * n_cells; + + heap = mem_heap_create(sz); + + arr = static_cast<trx_undo_arr_t*>(mem_heap_zalloc(heap, sz)); + + arr->n_cells = n_cells; + + arr->infos = (trx_undo_inf_t*) (arr + 1); + + arr->heap = heap; + + return(arr); +} + +/*******************************************************************//** +Frees an undo number array. */ +UNIV_INTERN +void +trx_undo_arr_free( +/*==============*/ + trx_undo_arr_t* arr) /*!< in: undo number array */ +{ + mem_heap_free(arr->heap); +} + +/*******************************************************************//** +Stores info of an undo log record to the array if it is not stored yet. +@return FALSE if the record already existed in the array */ +static +ibool +trx_undo_arr_store_info( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + undo_no_t undo_no)/*!< in: undo number */ +{ + ulint i; + trx_undo_arr_t* arr; + ulint n = 0; + ulint n_used; + trx_undo_inf_t* stored_here = NULL; + + arr = trx->undo_no_arr; + n_used = arr->n_used; + + for (i = 0; i < arr->n_cells; i++) { + trx_undo_inf_t* cell; + + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!cell->in_use) { + if (!stored_here) { + /* Not in use, we may store here */ + cell->undo_no = undo_no; + cell->in_use = TRUE; + + arr->n_used++; + + stored_here = cell; + } + } else { + n++; + + if (cell->undo_no == undo_no) { + + if (stored_here) { + stored_here->in_use = FALSE; + ut_ad(arr->n_used > 0); + arr->n_used--; + } + + ut_ad(arr->n_used == n_used); + + return(FALSE); + } + } + + if (n == n_used && stored_here) { + + ut_ad(arr->n_used == 1 + n_used); + + return(TRUE); + } + } + + ut_error; + + return(FALSE); +} + +/*******************************************************************//** +Removes an undo number from the array. */ +static +void +trx_undo_arr_remove_info( +/*=====================*/ + trx_undo_arr_t* arr, /*!< in: undo number array */ + undo_no_t undo_no)/*!< in: undo number */ +{ + ulint i; + + for (i = 0; i < arr->n_cells; i++) { + + trx_undo_inf_t* cell; + + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use && cell->undo_no == undo_no) { + cell->in_use = FALSE; + ut_ad(arr->n_used > 0); + --arr->n_used; + break; + } + } +} + +/*******************************************************************//** +Gets the biggest undo number in an array. +@return biggest value, 0 if the array is empty */ +static +undo_no_t +trx_undo_arr_get_biggest( +/*=====================*/ + const trx_undo_arr_t* arr) /*!< in: undo number array */ +{ + ulint i; + undo_no_t biggest = 0; + ulint n_checked = 0; + + for (i = 0; i < arr->n_cells && n_checked < arr->n_used; ++i) { + + const trx_undo_inf_t* cell = &arr->infos[i]; + + if (cell->in_use) { + + ++n_checked; + + if (cell->undo_no > biggest) { + + biggest = cell->undo_no; + } + } + } + + return(biggest); +} + +/***********************************************************************//** +Tries truncate the undo logs. 
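
trx_undo_arr_create() above carves the header and its cells out of a single allocation: the cell array begins immediately after the struct, which is what arr->infos = (trx_undo_inf_t*) (arr + 1) expresses. The same layout as a standalone sketch (plain calloc replaces the memory heap; error handling omitted):

#include <cstdlib>

struct Cell { unsigned long long undo_no; bool in_use; };

struct UndoArr {
        size_t  n_cells;
        size_t  n_used;
        Cell*   infos;  /* points just past this struct */
};

static UndoArr* undo_arr_create(size_t n_cells)
{
        /* one zero-filled block: header followed by the cell array */
        size_t sz = sizeof(UndoArr) + sizeof(Cell) * n_cells;
        UndoArr* arr = static_cast<UndoArr*>(calloc(1, sz));
        arr->n_cells = n_cells;
        arr->infos = reinterpret_cast<Cell*>(arr + 1);
        return arr;
}

int main()
{
        UndoArr* arr = undo_arr_create(8);
        arr->infos[0].undo_no = 42;     /* writes inside the same block */
        free(arr);                      /* one free releases everything */
}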
*/ +static +void +trx_roll_try_truncate( +/*==================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + undo_no_t limit; + const trx_undo_arr_t* arr; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&((trx->rseg)->mutex))); + + trx->pages_undone = 0; + + arr = trx->undo_no_arr; + + limit = trx->undo_no; + + if (arr->n_used > 0) { + undo_no_t biggest; + + biggest = trx_undo_arr_get_biggest(arr); + + if (biggest >= limit) { + + limit = biggest + 1; + } + } + + if (trx->insert_undo) { + trx_undo_truncate_end(trx, trx->insert_undo, limit); + } + + if (trx->update_undo) { + trx_undo_truncate_end(trx, trx->update_undo, limit); + } +} + +/***********************************************************************//** +Pops the topmost undo log record in a single undo log and updates the info +about the topmost record in the undo log memory struct. +@return undo log record, the page s-latched */ +static +trx_undo_rec_t* +trx_roll_pop_top_rec( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* undo_page; + ulint offset; + trx_undo_rec_t* prev_rec; + page_t* prev_rec_page; + + ut_ad(mutex_own(&trx->undo_mutex)); + + undo_page = trx_undo_page_get_s_latched( + undo->space, undo->zip_size, undo->top_page_no, mtr); + + offset = undo->top_offset; + + /* fprintf(stderr, "Thread %lu undoing trx " TRX_ID_FMT + " undo record " TRX_ID_FMT "\n", + os_thread_get_curr_id(), trx->id, undo->top_undo_no); */ + + prev_rec = trx_undo_get_prev_rec( + undo_page + offset, undo->hdr_page_no, undo->hdr_offset, + true, mtr); + + if (prev_rec == NULL) { + + undo->empty = TRUE; + } else { + prev_rec_page = page_align(prev_rec); + + if (prev_rec_page != undo_page) { + + trx->pages_undone++; + } + + undo->top_page_no = page_get_page_no(prev_rec_page); + undo->top_offset = prev_rec - prev_rec_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + } + + return(undo_page + offset); +} + +/********************************************************************//** +Pops the topmost record when the two undo logs of a transaction are seen +as a single stack of records ordered by their undo numbers. Inserts the +undo number of the popped undo record to the array of currently processed +undo numbers in the transaction. When the query thread finishes processing +of this undo record, it must be released with trx_undo_rec_release. 
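
Treating the insert and update undo logs as one stack, as described above, needs only a comparison of the two top undo numbers, since each log is itself ordered; the pop also stops once the next undo number would fall below the caller's limit. A vector-backed sketch of that selection rule:

#include <cstdint>
#include <vector>

/* Pick the undo log whose top record has the larger undo number;
nullptr means both are exhausted or the limit has been reached. */
static std::vector<uint64_t>* pick_log(std::vector<uint64_t>* ins,
                                       std::vector<uint64_t>* upd,
                                       uint64_t limit)
{
        std::vector<uint64_t>* undo;

        if (ins->empty()) {
                undo = upd;
        } else if (upd->empty()) {
                undo = ins;
        } else if (upd->back() > ins->back()) {
                undo = upd;
        } else {
                undo = ins;
        }

        if (undo->empty() || limit > undo->back()) {
                return nullptr; /* rollback (to this limit) is done */
        }
        return undo;
}

int main()
{
        std::vector<uint64_t> ins = {0, 2}, upd = {1, 3};
        uint64_t order[4], i = 0;
        while (auto* log = pick_log(&ins, &upd, 0)) {
                order[i++] = log->back();       /* 3, 2, 1, 0 */
                log->pop_back();
        }
        return (int) i - 4;     /* 0 on success */
}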
+@return undo log record copied to heap, NULL if none left, or if the +undo number of the top record would be less than the limit */ +UNIV_INTERN +trx_undo_rec_t* +trx_roll_pop_top_rec_of_trx( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + undo_no_t limit, /*!< in: least undo number we need */ + roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + trx_undo_t* undo; + trx_undo_t* ins_undo; + trx_undo_t* upd_undo; + trx_undo_rec_t* undo_rec; + trx_undo_rec_t* undo_rec_copy; + undo_no_t undo_no; + ibool is_insert; + trx_rseg_t* rseg; + ulint progress_pct; + mtr_t mtr; + + rseg = trx->rseg; +try_again: + mutex_enter(&(trx->undo_mutex)); + + if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) { + mutex_enter(&rseg->mutex); + + trx_roll_try_truncate(trx); + + mutex_exit(&rseg->mutex); + } + + ins_undo = trx->insert_undo; + upd_undo = trx->update_undo; + + if (!ins_undo || ins_undo->empty) { + undo = upd_undo; + } else if (!upd_undo || upd_undo->empty) { + undo = ins_undo; + } else if (upd_undo->top_undo_no > ins_undo->top_undo_no) { + undo = upd_undo; + } else { + undo = ins_undo; + } + + if (!undo || undo->empty || limit > undo->top_undo_no) { + + if ((trx->undo_no_arr)->n_used == 0) { + /* Rollback is ending */ + + mutex_enter(&(rseg->mutex)); + + trx_roll_try_truncate(trx); + + mutex_exit(&(rseg->mutex)); + } + + mutex_exit(&(trx->undo_mutex)); + + return(NULL); + } + + is_insert = (undo == ins_undo); + + *roll_ptr = trx_undo_build_roll_ptr( + is_insert, undo->rseg->id, undo->top_page_no, undo->top_offset); + + mtr_start(&mtr); + + undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr); + + undo_no = trx_undo_rec_get_undo_no(undo_rec); + + ut_ad(undo_no + 1 == trx->undo_no); + + /* We print rollback progress info if we are in a crash recovery + and the transaction has at least 1000 row operations to undo. */ + + if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) { + + progress_pct = 100 - (ulint) + ((undo_no * 100) / trx_roll_max_undo_no); + if (progress_pct != trx_roll_progress_printed_pct) { + if (trx_roll_progress_printed_pct == 0) { + fprintf(stderr, + "\nInnoDB: Progress in percents:" + " %lu", (ulong) progress_pct); + } else { + fprintf(stderr, + " %lu", (ulong) progress_pct); + } + fflush(stderr); + trx_roll_progress_printed_pct = progress_pct; + } + } + + trx->undo_no = undo_no; + + if (!trx_undo_arr_store_info(trx, undo_no)) { + /* A query thread is already processing this undo log record */ + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + goto try_again; + } + + undo_rec_copy = trx_undo_rec_copy(undo_rec, heap); + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + return(undo_rec_copy); +} + +/********************************************************************//** +Reserves an undo log record for a query thread to undo. This should be +called if the query thread gets the undo log record not using the pop +function above. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +trx_undo_rec_reserve( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no)/*!< in: undo number of the record */ +{ + ibool ret; + + mutex_enter(&(trx->undo_mutex)); + + ret = trx_undo_arr_store_info(trx, undo_no); + + mutex_exit(&(trx->undo_mutex)); + + return(ret); +} + +/*******************************************************************//** +Releases a reserved undo record. 
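
The crash-recovery progress figure printed above is the undone fraction in whole percent; undo_no counts down toward zero, so the fraction completed is 100 minus the remaining share, and a value is printed only when it changes. The arithmetic, isolated:

#include <cstdio>

/* undo_no counts down during rollback, so integer math gives the
completed fraction in whole percent. */
static unsigned progress_pct(unsigned long long undo_no,
                             unsigned long long max_undo_no)
{
        return (unsigned) (100 - (undo_no * 100) / max_undo_no);
}

int main()
{
        unsigned last = 0;
        for (unsigned long long undo_no = 4000; undo_no > 0; undo_no -= 400) {
                unsigned pct = progress_pct(undo_no, 4000);
                if (pct != last) {      /* print only on change */
                        printf(" %u", pct);
                        last = pct;
                }
        }
        printf("\n");
}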
*/ +UNIV_INTERN +void +trx_undo_rec_release( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no)/*!< in: undo number */ +{ + trx_undo_arr_t* arr; + + mutex_enter(&(trx->undo_mutex)); + + arr = trx->undo_no_arr; + + trx_undo_arr_remove_info(arr, undo_no); + + mutex_exit(&(trx->undo_mutex)); +} + +/****************************************************************//** +Builds an undo 'query' graph for a transaction. The actual rollback is +performed by executing this query graph like a query subprocedure call. +The reply about the completion of the rollback will be sent by this +graph. +@return own: the query graph */ +static +que_t* +trx_roll_graph_build( +/*=================*/ + trx_t* trx) /*!< in: trx handle */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + + ut_ad(trx_mutex_own(trx)); + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + + thr->child = row_undo_node_create(trx, thr, heap); + + return(fork); +} + +/*********************************************************************//** +Starts a rollback operation, creates the UNDO graph that will do the +actual undo operation. +@return query graph thread that will perform the UNDO operations. */ +static +que_thr_t* +trx_rollback_start( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + ib_id_t roll_limit) /*!< in: rollback to undo no (for + partial undo), 0 if we are rolling back + the entire transaction */ +{ + que_t* roll_graph; + + ut_ad(trx_mutex_own(trx)); + + ut_ad(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0); + + /* Initialize the rollback field in the transaction */ + + trx->roll_limit = roll_limit; + + ut_a(trx->roll_limit <= trx->undo_no); + + trx->pages_undone = 0; + + if (trx->undo_no_arr == NULL) { + /* Single query thread -> 1 */ + trx->undo_no_arr = trx_undo_arr_create(1); + } + + /* Build a 'query' graph which will perform the undo operations */ + + roll_graph = trx_roll_graph_build(trx); + + trx->graph = roll_graph; + + trx->lock.que_state = TRX_QUE_ROLLING_BACK; + + return(que_fork_start_command(roll_graph)); +} + +/****************************************************************//** +Finishes a transaction rollback. */ +static +void +trx_rollback_finish( +/*================*/ + trx_t* trx) /*!< in: transaction */ +{ + ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0); + + trx_commit(trx); + + trx->lock.que_state = TRX_QUE_RUNNING; +} + +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +UNIV_INTERN +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + roll_node_t* node; + + node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node))); + + node->state = ROLL_NODE_SEND; + + node->common.type = QUE_NODE_ROLLBACK; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. 
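
trx_rollback_step() below is a two-phase node: on first entry (ROLL_NODE_SEND) it dispatches the undo graph and parks itself in ROLL_NODE_WAIT; when the query graph re-enters it after the undo thread finishes, it hands control back to its parent. The shape of that state machine, reduced to a sketch:

#include <cassert>

enum NodeState { SEND, WAIT };

struct RollNode {
        NodeState       state = SEND;
        bool            undo_started = false;
};

/* One execution step; returns true while the node still has the undo
phase pending (i.e. it has just dispatched it). */
static bool rollback_step(RollNode& node)
{
        if (node.state == SEND) {
                node.undo_started = true;       /* start the undo graph */
                node.state = WAIT;
                return true;
        }
        assert(node.state == WAIT);
        return false;                           /* hand back to parent */
}

int main()
{
        RollNode node;
        assert(rollback_step(node));    /* first step dispatches */
        assert(!rollback_step(node));   /* second step completes */
}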
+@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + roll_node_t* node; + + node = static_cast<roll_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = ROLL_NODE_SEND; + } + + if (node->state == ROLL_NODE_SEND) { + trx_t* trx; + ib_id_t roll_limit = 0; + + trx = thr_get_trx(thr); + + trx_mutex_enter(trx); + + node->state = ROLL_NODE_WAIT; + + ut_a(node->undo_thr == NULL); + + roll_limit = node->partial ? node->savept.least_undo_no : 0; + + trx_commit_or_rollback_prepare(trx); + + node->undo_thr = trx_rollback_start(trx, roll_limit); + + trx_mutex_exit(trx); + + } else { + ut_ad(node->state == ROLL_NODE_WAIT); + + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/storage/xtradb/trx/trx0rseg.cc b/storage/xtradb/trx/trx0rseg.cc new file mode 100644 index 00000000000..003d1036a8c --- /dev/null +++ b/storage/xtradb/trx/trx0rseg.cc @@ -0,0 +1,425 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0rseg.cc +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rseg.h" + +#ifdef UNIV_NONINL +#include "trx0rseg.ic" +#endif + +#include "trx0undo.h" +#include "fut0lst.h" +#include "srv0srv.h" +#include "trx0purge.h" +#include "ut0bh.h" +#include "srv0mon.h" + +#ifdef UNIV_PFS_MUTEX +/* Key to register rseg_mutex_key with performance schema */ +UNIV_INTERN mysql_pfs_key_t rseg_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/****************************************************************//** +Creates a rollback segment header. This function is called only when +a new rollback segment is created in the database. 
+@return page number of the created segment, FIL_NULL if fail */ +UNIV_INTERN +ulint +trx_rseg_header_create( +/*===================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint max_size, /*!< in: max size in pages */ + ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_no; + trx_rsegf_t* rsegf; + trx_sysf_t* sys_header; + ulint i; + buf_block_t* block; + + ut_ad(mtr); + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL), + MTR_MEMO_X_LOCK)); + + /* Allocate a new file segment for the rollback segment */ + block = fseg_create(space, 0, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr); + + if (block == NULL) { + /* No space left */ + + return(FIL_NULL); + } + + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); + + page_no = buf_block_get_page_no(block); + + /* Get the rollback segment file page */ + rsegf = trx_rsegf_get_new(space, zip_size, page_no, mtr); + + /* Initialize max size field */ + mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size, + MLOG_4BYTES, mtr); + + /* Initialize the history list */ + + mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr); + flst_init(rsegf + TRX_RSEG_HISTORY, mtr); + + /* Reset the undo log slots */ + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + + trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr); + } + + /* Add the rollback segment info to the free slot in + the trx system header */ + + sys_header = trx_sysf_get(mtr); + + trx_sysf_rseg_set_space(sys_header, rseg_slot_no, space, mtr); + trx_sysf_rseg_set_page_no(sys_header, rseg_slot_no, page_no, mtr); + + return(page_no); +} + +/***********************************************************************//** +Free's an instance of the rollback segment in memory. */ +UNIV_INTERN +void +trx_rseg_mem_free( +/*==============*/ + trx_rseg_t* rseg) /* in, own: instance to free */ +{ + trx_undo_t* undo; + trx_undo_t* next_undo; + + mutex_free(&rseg->mutex); + + /* There can't be any active transactions. */ + ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0); + ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0); + + for (undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + undo != NULL; + undo = next_undo) { + + next_undo = UT_LIST_GET_NEXT(undo_list, undo); + + UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + + trx_undo_mem_free(undo); + } + + for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + undo != NULL; + undo = next_undo) { + + next_undo = UT_LIST_GET_NEXT(undo_list, undo); + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + + trx_undo_mem_free(undo); + } + + /* const_cast<trx_rseg_t*>() because this function is + like a destructor. */ + + *((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = NULL; + + mem_free(rseg); +} + +/*************************************************************************** +Creates and initializes a rollback segment object. The values for the +fields are read from the header. The object is inserted to the rseg +list of the trx system object and a pointer is inserted in the rseg +array in the trx system object. 
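
Note that free rollback segment slots need no separate bookkeeping: a slot is free exactly when its page-number field holds the sentinel FIL_NULL (0xFFFFFFFF), so initialization writes the sentinel everywhere and allocation (trx_sysf_rseg_find_free(), later in this commit) scans for it. In miniature, with an illustrative slot count:

#include <cstdint>

static const uint32_t FIL_NULL_ = 0xFFFFFFFF;   /* "no page" sentinel */
static const int N_SLOTS = 128;                 /* illustrative count */

/* Find a free slot, or -1: mirrors trx_sysf_rseg_find_free(), which
returns ULINT_UNDEFINED when every slot is occupied. */
static int find_free_slot(const uint32_t (&slots)[N_SLOTS])
{
        for (int i = 0; i < N_SLOTS; i++) {
                if (slots[i] == FIL_NULL_) {
                        return i;
                }
        }
        return -1;
}

int main()
{
        uint32_t slots[N_SLOTS];
        for (int i = 0; i < N_SLOTS; i++) slots[i] = FIL_NULL_; /* reset */
        slots[0] = 3;   /* first rseg header lives on page 3 */
        return find_free_slot(slots) == 1 ? 0 : 1;
}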
+@return own: rollback segment object */ +static +trx_rseg_t* +trx_rseg_mem_create( +/*================*/ + ulint id, /*!< in: rollback segment id */ + ulint space, /*!< in: space where the segment + placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the segment + header */ + ib_bh_t* ib_bh, /*!< in/out: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint len; + trx_rseg_t* rseg; + fil_addr_t node_addr; + trx_rsegf_t* rseg_header; + trx_ulogf_t* undo_log_hdr; + ulint sum_of_undo_sizes; + + rseg = static_cast<trx_rseg_t*>(mem_zalloc(sizeof(trx_rseg_t))); + + rseg->id = id; + rseg->space = space; + rseg->zip_size = zip_size; + rseg->page_no = page_no; + + mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG); + + /* const_cast<trx_rseg_t*>() because this function is + like a constructor. */ + *((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = rseg; + + rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr); + + rseg->max_size = mtr_read_ulint( + rseg_header + TRX_RSEG_MAX_SIZE, MLOG_4BYTES, mtr); + + /* Initialize the undo log lists according to the rseg header */ + + sum_of_undo_sizes = trx_undo_lists_init(rseg); + + rseg->curr_size = mtr_read_ulint( + rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr) + + 1 + sum_of_undo_sizes; + + len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr); + + if (len > 0) { + rseg_queue_t rseg_queue; + + trx_sys->rseg_history_len += len; + + node_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr)); + + rseg->last_page_no = node_addr.page; + rseg->last_offset = node_addr.boffset; + + undo_log_hdr = trx_undo_page_get( + rseg->space, rseg->zip_size, node_addr.page, + mtr) + node_addr.boffset; + + rseg->last_trx_no = mach_read_from_8( + undo_log_hdr + TRX_UNDO_TRX_NO); + + rseg->last_del_marks = mtr_read_ulint( + undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr); + + rseg_queue.rseg = rseg; + rseg_queue.trx_no = rseg->last_trx_no; + + if (rseg->last_page_no != FIL_NULL) { + const void* ptr; + + /* There is no need to cover this operation by the purge + mutex because we are still bootstrapping. */ + + ptr = ib_bh_push(ib_bh, &rseg_queue); + ut_a(ptr != NULL); + } + } else { + rseg->last_page_no = FIL_NULL; + } + + return(rseg); +} + +/******************************************************************** +Creates the memory copies for the rollback segments and initializes the +rseg array in trx_sys at a database startup. */ +static +void +trx_rseg_create_instance( +/*=====================*/ + trx_sysf_t* sys_header, /*!< in: trx system header */ + ib_bh_t* ib_bh, /*!< in/out: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint i; + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + ulint page_no; + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no != FIL_NULL) { + ulint space; + ulint zip_size; + trx_rseg_t* rseg = NULL; + + ut_a(!trx_rseg_get_on_id(i)); + + space = trx_sysf_rseg_get_space(sys_header, i, mtr); + + zip_size = space ? fil_space_get_zip_size(space) : 0; + + rseg = trx_rseg_mem_create( + i, space, zip_size, page_no, ib_bh, mtr); + + ut_a(rseg->id == i); + } else { + ut_a(trx_sys->rseg_array[i] == NULL); + } + } +} + +/********************************************************************* +Creates a rollback segment. 
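
The ib_bh heap that trx_rseg_mem_create() pushes into is a min binary heap keyed on each segment's oldest history transaction number, so purge can always pick the globally oldest undo log first. An equivalent expressed with std::priority_queue, where a greater-than comparator yields a min-heap (names are stand-ins for rseg_queue_t):

#include <cstdint>
#include <queue>
#include <vector>

struct RsegQueue {
        int             rseg_id;
        uint64_t        trx_no; /* oldest trx number in this rseg's history */
};

struct ByTrxNoGreater {
        bool operator()(const RsegQueue& a, const RsegQueue& b) const {
                return a.trx_no > b.trx_no;     /* min-heap on trx_no */
        }
};

int main()
{
        std::priority_queue<RsegQueue, std::vector<RsegQueue>,
                            ByTrxNoGreater> purge_heap;

        purge_heap.push({0, 700});
        purge_heap.push({1, 250});
        purge_heap.push({2, 980});

        /* purge pops rseg 1 first: it holds the oldest history */
        return purge_heap.top().rseg_id == 1 ? 0 : 1;
}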
+@return pointer to new rollback segment if create successful */ +UNIV_INTERN +trx_rseg_t* +trx_rseg_create( +/*============*/ + ulint space) /*!< in: id of UNDO tablespace */ +{ + mtr_t mtr; + ulint slot_no; + trx_rseg_t* rseg = NULL; + + mtr_start(&mtr); + + /* To obey the latching order, acquire the file space + x-latch before the trx_sys->mutex. */ + mtr_x_lock(fil_space_get_latch(space, NULL), &mtr); + + slot_no = trx_sysf_rseg_find_free(&mtr); + + if (slot_no != ULINT_UNDEFINED) { + ulint id; + ulint page_no; + ulint zip_size; + trx_sysf_t* sys_header; + + page_no = trx_rseg_header_create( + space, 0, ULINT_MAX, slot_no, &mtr); + + ut_a(page_no != FIL_NULL); + + sys_header = trx_sysf_get(&mtr); + + id = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr); + ut_a(id == space); + + zip_size = space ? fil_space_get_zip_size(space) : 0; + + rseg = trx_rseg_mem_create( + slot_no, space, zip_size, page_no, + purge_sys->ib_bh, &mtr); + } + + mtr_commit(&mtr); + + return(rseg); +} + +/*********************************************************************//** +Creates the memory copies for rollback segments and initializes the +rseg array in trx_sys at a database startup. */ +UNIV_INTERN +void +trx_rseg_array_init( +/*================*/ + trx_sysf_t* sys_header, /* in/out: trx system header */ + ib_bh_t* ib_bh, /*!< in: rseg queue */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_sys->rseg_history_len = 0; + + trx_rseg_create_instance(sys_header, ib_bh, mtr); +} + +/******************************************************************** +Get the number of unique rollback tablespaces in use except space id 0. +The last space id will be the sentinel value ULINT_UNDEFINED. The array +will be sorted on space id. Note: space_ids should have have space for +TRX_SYS_N_RSEGS + 1 elements. +@return number of unique rollback tablespaces in use. */ +UNIV_INTERN +ulint +trx_rseg_get_n_undo_tablespaces( +/*============================*/ + ulint* space_ids) /*!< out: array of space ids of + UNDO tablespaces */ +{ + ulint i; + mtr_t mtr; + trx_sysf_t* sys_header; + ulint n_undo_tablespaces = 0; + ulint space_ids_aux[TRX_SYS_N_RSEGS + 1]; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + ulint page_no; + ulint space; + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, &mtr); + + if (page_no == FIL_NULL) { + continue; + } + + space = trx_sysf_rseg_get_space(sys_header, i, &mtr); + + if (space != 0) { + ulint j; + ibool found = FALSE; + + for (j = 0; j < n_undo_tablespaces; ++j) { + if (space_ids[j] == space) { + found = TRUE; + break; + } + } + + if (!found) { + ut_a(n_undo_tablespaces <= i); + space_ids[n_undo_tablespaces++] = space; + } + } + } + + mtr_commit(&mtr); + + ut_a(n_undo_tablespaces <= TRX_SYS_N_RSEGS); + + space_ids[n_undo_tablespaces] = ULINT_UNDEFINED; + + if (n_undo_tablespaces > 0) { + ut_ulint_sort(space_ids, space_ids_aux, 0, n_undo_tablespaces); + } + + return(n_undo_tablespaces); +} diff --git a/storage/xtradb/trx/trx0sys.cc b/storage/xtradb/trx/trx0sys.cc new file mode 100644 index 00000000000..6a03ba5ed57 --- /dev/null +++ b/storage/xtradb/trx/trx0sys.cc @@ -0,0 +1,1323 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0sys.cc +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" + +#ifdef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#ifndef UNIV_HOTBACKUP +#include "fsp0fsp.h" +#include "mtr0log.h" +#include "mtr0log.h" +#include "trx0trx.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0purge.h" +#include "log0log.h" +#include "log0recv.h" +#include "os0file.h" +#include "read0read.h" + +/** The file format tag structure with id and name. */ +struct file_format_t { + ulint id; /*!< id of the file format */ + const char* name; /*!< text representation of the + file format */ + ib_mutex_t mutex; /*!< covers changes to the above + fields */ +}; + +/** The transaction system */ +UNIV_INTERN trx_sys_t* trx_sys = NULL; + +/** In a MySQL replication slave, in crash recovery we store the master log +file name and position here. */ +/* @{ */ +/** Master binlog file name */ +UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +/** Master binlog file position. We have successfully got the updates +up to this position. -1 means that no crash recovery was needed, or +there was no master log position info inside InnoDB.*/ +UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1; +/* @} */ + +/** If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. */ +/* @{ */ +/** Binlog file name */ +UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +/** Binlog file position, or -1 if unknown */ +UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1; +/* @} */ +#endif /* !UNIV_HOTBACKUP */ + +/** List of animal names representing file format. */ +static const char* file_format_name_map[] = { + "Antelope", + "Barracuda", + "Cheetah", + "Dragon", + "Elk", + "Fox", + "Gazelle", + "Hornet", + "Impala", + "Jaguar", + "Kangaroo", + "Leopard", + "Moose", + "Nautilus", + "Ocelot", + "Porpoise", + "Quail", + "Rabbit", + "Shark", + "Tiger", + "Urchin", + "Viper", + "Whale", + "Xenops", + "Yak", + "Zebra" +}; + +/** The number of elements in the file format name array. */ +static const ulint FILE_FORMAT_NAME_N + = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]); + +#ifdef UNIV_PFS_MUTEX +/* Key to register the mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key; +UNIV_INTERN mysql_pfs_key_t trx_sys_mutex_key; +#endif /* UNIV_PFS_RWLOCK */ + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_DEBUG +/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ +UNIV_INTERN uint trx_rseg_n_slots_debug = 0; +#endif + +/** This is used to track the maximum file format id known to InnoDB. It's +updated via SET GLOBAL innodb_file_format_max = 'x' or when we open +or create a table. 
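
The animal names above map file format ids to text purely by position, so a lookup is an array index guarded by the element count. A sketch with the array abbreviated:

#include <cstddef>

static const char* format_names[] = {
        "Antelope", "Barracuda", "Cheetah", /* ... through "Zebra" */
};

static const size_t N_FORMATS
        = sizeof(format_names) / sizeof(format_names[0]);

/* Translate a file format id to its name; out-of-range ids get a
placeholder instead of walking off the array. */
static const char* format_id_to_name(size_t id)
{
        return id < N_FORMATS ? format_names[id] : "Unknown";
}

int main()
{
        return format_id_to_name(1)[0] == 'B' ? 0 : 1;
}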
*/ +static file_format_t file_format_max; + +#ifdef UNIV_DEBUG +/****************************************************************//** +Checks whether a trx is in one of rw_trx_list or ro_trx_list. +@return TRUE if is in */ +UNIV_INTERN +ibool +trx_in_trx_list( +/*============*/ + const trx_t* in_trx) /*!< in: transaction */ +{ + const trx_t* trx; + trx_list_t* trx_list; + + /* Non-locking autocommits should not hold any locks. */ + assert_trx_in_list(in_trx); + + trx_list = in_trx->read_only + ? &trx_sys->ro_trx_list : &trx_sys->rw_trx_list; + + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_ad(trx_assert_started(in_trx)); + + for (trx = UT_LIST_GET_FIRST(*trx_list); + trx != NULL && trx != in_trx; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_list(trx); + ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list)); + } + + return(trx != NULL); +} +#endif /* UNIV_DEBUG */ + +/*****************************************************************//** +Writes the value of max_trx_id to the file based trx system header. */ +UNIV_INTERN +void +trx_sys_flush_max_trx_id(void) +/*==========================*/ +{ + mtr_t mtr; + trx_sysf_t* sys_header; + + ut_ad(mutex_own(&trx_sys->mutex)); + + if (!srv_read_only_mode) { + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + mlog_write_ull( + sys_header + TRX_SYS_TRX_ID_STORE, + trx_sys->max_trx_id, &mtr); + + mtr_commit(&mtr); + } +} + +/*****************************************************************//** +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave updates the latest master binlog position up to which +replication has proceeded. */ +UNIV_INTERN +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + const char* file_name,/*!< in: MySQL log file name */ + ib_int64_t offset, /*!< in: position in that log file */ + ulint field, /*!< in: offset of the MySQL log info field in + the trx sys header */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_sysf_t* sys_header; + + if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) { + + /* We cannot fit the name to the 512 bytes we have reserved */ + + return; + } + + sys_header = trx_sysf_get(mtr); + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD, + TRX_SYS_MYSQL_LOG_MAGIC_N, + MLOG_4BYTES, mtr); + } + + if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME), + file_name)) { + + mlog_write_string(sys_header + field + + TRX_SYS_MYSQL_LOG_NAME, + (byte*) file_name, 1 + ut_strlen(file_name), + mtr); + } + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0 + || (offset >> 32) > 0) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH, + (ulint)(offset >> 32), + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_LOW, + (ulint)(offset & 0xFFFFFFFFUL), + MLOG_4BYTES, mtr); +} + +/*****************************************************************//** +Stores the MySQL binlog offset info in the trx system header if +the magic number shows it valid, and print the info to stderr */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset(void) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint trx_sys_mysql_bin_log_pos_high; + ulint trx_sys_mysql_bin_log_pos_low; + + mtr_start(&mtr); + + sys_header 
= trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + trx_sys_mysql_bin_log_pos_high = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH); + trx_sys_mysql_bin_log_pos_low = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); + + trx_sys_mysql_bin_log_pos + = (((ib_int64_t) trx_sys_mysql_bin_log_pos_high) << 32) + + (ib_int64_t) trx_sys_mysql_bin_log_pos_low; + + ut_memcpy(trx_sys_mysql_bin_log_name, + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN); + + fprintf(stderr, + "InnoDB: Last MySQL binlog file position %lu %lu," + " file name %s\n", + trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low, + trx_sys_mysql_bin_log_name); + + mtr_commit(&mtr); +} + +/*****************************************************************//** +Prints to stderr the MySQL master log offset info in the trx system header if +the magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_master_log_pos(void) +/*====================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + fprintf(stderr, + "InnoDB: In a MySQL replication slave the last" + " master binlog file\n" + "InnoDB: position %lu %lu, file name %s\n", + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + /* Copy the master log position info to global variables we can + use in ha_innobase.cc to initialize glob_mi to right values */ + + ut_memcpy(trx_sys_mysql_master_log_name, + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, + TRX_SYS_MYSQL_LOG_NAME_LEN); + + trx_sys_mysql_master_log_pos + = (((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32) + + ((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW)); + mtr_commit(&mtr); +} + +/****************************************************************//** +Looks for a free slot for a rollback segment in the trx system file copy. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +trx_sysf_rseg_find_free( +/*====================*/ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint i; + trx_sysf_t* sys_header; + + sys_header = trx_sysf_get(mtr); + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + ulint page_no; + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/*****************************************************************//** +Creates the file page for the transaction system. This function is called only +at the database creation, before trx_sys_init. 
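A side note on the slot convention this function establishes, and that
trx_sysf_rseg_find_free() above relies on: the rollback segment slot array
is memset to 0xff at creation, so an unused slot's page number field reads
back as FIL_NULL (all bits set) and counts as free. A minimal standalone
model of that check (hypothetical "model_" names, not the actual InnoDB
code):

	#include <stdint.h>
	#include <string.h>

	enum { MODEL_FIL_NULL = 0xFFFFFFFFu };

	// A slot is free when its 4-byte page number field is all ones,
	// the pattern left behind by memset(ptr, 0xff, len) below.
	static int model_slot_is_free(const unsigned char slot[4])
	{
		uint32_t	page_no;

		memcpy(&page_no, slot, 4);  // all-ones, so byte order is moot
		return page_no == MODEL_FIL_NULL;
	}
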
*/ +static +void +trx_sysf_create( +/*============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_sysf_t* sys_header; + ulint slot_no; + buf_block_t* block; + page_t* page; + ulint page_no; + byte* ptr; + ulint len; + + ut_ad(mtr); + + /* Note that below we first reserve the file space x-latch, and + then enter the kernel: we must do it in this order to conform + to the latching order rules. */ + + mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr); + + /* Create the trx sys file block in a new allocated file segment */ + block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, + MLOG_2BYTES, mtr); + + /* Reset the doublewrite buffer magic number to zero so that we + know that the doublewrite buffer has not yet been created (this + suppresses a Valgrind warning) */ + + mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); + + sys_header = trx_sysf_get(mtr); + + /* Start counting transaction ids from number 1 up */ + mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1); + + /* Reset the rollback segment slots. Old versions of InnoDB + define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect + that the whole array is initialized. */ + ptr = TRX_SYS_RSEGS + sys_header; + len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS) + * TRX_SYS_RSEG_SLOT_SIZE; + memset(ptr, 0xff, len); + ptr += len; + ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END)); + + /* Initialize all of the page. This part used to be uninitialized. */ + memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr); + + mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + + page - sys_header, mtr); + + /* Create the first rollback segment in the SYSTEM tablespace */ + slot_no = trx_sysf_rseg_find_free(mtr); + page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, + mtr); + + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); + ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO); +} + +/*****************************************************************//** +Compare two trx_rseg_t instances on last_trx_no. */ +static +int +trx_rseg_compare_last_trx_no( +/*=========================*/ + const void* p1, /*!< in: elem to compare */ + const void* p2) /*!< in: elem to compare */ +{ + ib_int64_t cmp; + + const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1; + const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2; + + cmp = rseg_q1->trx_no - rseg_q2->trx_no; + + if (cmp < 0) { + return(-1); + } else if (cmp > 0) { + return(1); + } + + return(0); +} + +/*****************************************************************//** +Creates and initializes the central memory structures for the transaction +system. This is called when the database is started. +@return min binary heap of rsegs to purge */ +UNIV_INTERN +ib_bh_t* +trx_sys_init_at_db_start(void) +/*==========================*/ +{ + mtr_t mtr; + ib_bh_t* ib_bh; + trx_sysf_t* sys_header; + ib_uint64_t rows_to_undo = 0; + const char* unit = ""; + + /* We create the min binary heap here and pass ownership to + purge when we init the purge sub-system. Purge is responsible + for freeing the binary heap. 
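The ordering contract of this heap is that of a standard min-heap keyed on
trx_no, which is exactly what trx_rseg_compare_last_trx_no() above encodes.
A simplified standalone model (hypothetical "model_" types; only the
comparator semantics match the real code):

	#include <queue>
	#include <vector>

	struct model_rseg { unsigned long long trx_no; };

	// Inverted comparator so the smallest trx_no is popped first;
	// purge must process rollback segments in increasing trx_no order.
	struct model_rseg_greater {
		bool operator()(const model_rseg& a,
				const model_rseg& b) const {
			return a.trx_no > b.trx_no;
		}
	};

	typedef std::priority_queue<model_rseg, std::vector<model_rseg>,
				    model_rseg_greater> model_rseg_min_heap;
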
*/ + + ib_bh = ib_bh_create( + trx_rseg_compare_last_trx_no, + sizeof(rseg_queue_t), TRX_SYS_N_RSEGS); + + mtr_start(&mtr); + + /* Allocate the trx descriptors array */ + trx_sys->descriptors = static_cast<trx_id_t*>( + ut_malloc(sizeof(trx_id_t) * + TRX_DESCR_ARRAY_INITIAL_SIZE)); + trx_sys->descr_n_max = TRX_DESCR_ARRAY_INITIAL_SIZE; + trx_sys->descr_n_used = 0; + srv_descriptors_memory = TRX_DESCR_ARRAY_INITIAL_SIZE * + sizeof(trx_id_t); + + sys_header = trx_sysf_get(&mtr); + + if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + trx_rseg_array_init(sys_header, ib_bh, &mtr); + } + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in + trx_sys_get_new_trx_id will evaluate to TRUE when the function + is first time called, and the value for trx id will be written + to the disk-based header! Thus trx id values will not overlap when + the database is repeatedly started! */ + + trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN + + ut_uint64_align_up(mach_read_from_8(sys_header + + TRX_SYS_TRX_ID_STORE), + TRX_SYS_TRX_ID_WRITE_MARGIN); + + ut_d(trx_sys->rw_max_trx_id = trx_sys->max_trx_id); + + UT_LIST_INIT(trx_sys->mysql_trx_list); + + trx_dummy_sess = sess_open(); + + trx_lists_init_at_db_start(); + + /* This S lock is not strictly required, it is here only to satisfy + the debug code (assertions). We are still running in single threaded + bootstrap mode. */ + + mutex_enter(&trx_sys->mutex); + + ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0); + + if (UT_LIST_GET_LEN(trx_sys->rw_trx_list) > 0) { + const trx_t* trx; + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + ut_ad(trx->is_recovered); + assert_trx_in_rw_list(trx); + + if (trx_state_eq(trx, TRX_STATE_ACTIVE)) { + rows_to_undo += trx->undo_no; + } + } + + if (rows_to_undo > 1000000000) { + unit = "M"; + rows_to_undo = rows_to_undo / 1000000; + } + + fprintf(stderr, + "InnoDB: %lu transaction(s) which must be" + " rolled back or cleaned up\n" + "InnoDB: in total %lu%s row operations to undo\n", + (ulong) UT_LIST_GET_LEN(trx_sys->rw_trx_list), + (ulong) rows_to_undo, unit); + + fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n", + trx_sys->max_trx_id); + } + + mutex_exit(&trx_sys->mutex); + + UT_LIST_INIT(trx_sys->view_list); + + mtr_commit(&mtr); + + return(ib_bh); +} + +/*****************************************************************//** +Creates the trx_sys instance and initializes ib_bh and mutex. */ +UNIV_INTERN +void +trx_sys_create(void) +/*================*/ +{ + ut_ad(trx_sys == NULL); + + trx_sys = static_cast<trx_sys_t*>(mem_zalloc(sizeof(*trx_sys))); + + mutex_create(trx_sys_mutex_key, &trx_sys->mutex, SYNC_TRX_SYS); +} + +/*****************************************************************//** +Creates and initializes the transaction system at the database creation. */ +UNIV_INTERN +void +trx_sys_create_sys_pages(void) +/*==========================*/ +{ + mtr_t mtr; + + mtr_start(&mtr); + + trx_sysf_create(&mtr); + + mtr_commit(&mtr); +} + +/*****************************************************************//** +Update the file format tag. 
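The on-disk tag is simply the format id biased by
TRX_SYS_FILE_FORMAT_TAG_MAGIC_N, so the reader below can tell a tagged page
apart from a never-tagged one. A round-trip sketch (the magic value here is
a placeholder, not the real constant):

	#include <stdint.h>

	#define MODEL_TAG_MAGIC 0x1122334455667788ULL  // placeholder only

	// Encode: bias the format id by the magic number before writing.
	static uint64_t model_tag_encode(uint64_t format_id)
	{
		return format_id + MODEL_TAG_MAGIC;
	}

	// Decode: subtract the magic; any result >= FILE_FORMAT_NAME_N
	// means the page was never tagged or contains garbage.
	static uint64_t model_tag_decode(uint64_t tag_value)
	{
		return tag_value - MODEL_TAG_MAGIC;
	}
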
+@return always TRUE */ +static +ibool +trx_sys_file_format_max_write( +/*==========================*/ + ulint format_id, /*!< in: file format id */ + const char** name) /*!< out: max file format name, can + be NULL */ +{ + mtr_t mtr; + byte* ptr; + buf_block_t* block; + ib_uint64_t tag_value; + + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (name) { + *name = file_format_max.name; + } + + mlog_write_ull(ptr, tag_value, &mtr); + + mtr_commit(&mtr); + + return(TRUE); +} + +/*****************************************************************//** +Read the file format tag. +@return the file format or ULINT_UNDEFINED if not set. */ +static +ulint +trx_sys_file_format_max_read(void) +/*==============================*/ +{ + mtr_t mtr; + const byte* ptr; + const buf_block_t* block; + ib_id_t file_format_id; + + /* Since this is called during the startup phase it's safe to + read the value without a covering mutex. */ + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + file_format_id = mach_read_from_8(ptr); + + mtr_commit(&mtr); + + file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (file_format_id >= FILE_FORMAT_NAME_N) { + + /* Either it has never been tagged, or garbage in it. */ + return(ULINT_UNDEFINED); + } + + return((ulint) file_format_id); +} + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id) /*!< in: id of the file format */ +{ + ut_a(id < FILE_FORMAT_NAME_N); + + return(file_format_name_map[id]); +} + +/*****************************************************************//** +Check for the max file format tag stored on disk. Note: If max_format_id +is == UNIV_FORMAT_MAX + 1 then we only print a warning. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +trx_sys_file_format_max_check( +/*==========================*/ + ulint max_format_id) /*!< in: max format id to check */ +{ + ulint format_id; + + /* Check the file format in the tablespace. Do not try to + recover if the file format is not supported by the engine + unless forced by the user. */ + format_id = trx_sys_file_format_max_read(); + if (format_id == ULINT_UNDEFINED) { + /* Format ID was not set. Set it to minimum possible + value. */ + format_id = UNIV_FORMAT_MIN; + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Highest supported file format is %s.", + trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX)); + + if (format_id > UNIV_FORMAT_MAX) { + + ut_a(format_id < FILE_FORMAT_NAME_N); + + ib_logf(max_format_id <= UNIV_FORMAT_MAX + ? IB_LOG_LEVEL_ERROR : IB_LOG_LEVEL_WARN, + "The system tablespace is in a file " + "format that this version doesn't support - %s.", + trx_sys_file_format_id_to_name(format_id)); + + if (max_format_id <= UNIV_FORMAT_MAX) { + return(DB_ERROR); + } + } + + format_id = (format_id > max_format_id) ? format_id : max_format_id; + + /* We don't need a mutex here, as this function should only + be called once at start up. 
*/ + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the file format id unconditionally except if it's already the +same value. +@return TRUE if value updated */ +UNIV_INTERN +ibool +trx_sys_file_format_max_set( +/*========================*/ + ulint format_id, /*!< in: file format id */ + const char** name) /*!< out: max file format name or + NULL if not needed. */ +{ + ibool ret = FALSE; + + ut_a(format_id <= UNIV_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + /* Only update if not already same value. */ + if (format_id != file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/********************************************************************//** +Tags the system table space with minimum format id if it has not been +tagged yet. +WARNING: This function is only called during the startup and AFTER the +redo log application during recovery has finished. */ +UNIV_INTERN +void +trx_sys_file_format_tag_init(void) +/*==============================*/ +{ + ulint format_id; + + format_id = trx_sys_file_format_max_read(); + + /* If format_id is not set then set it to the minimum. */ + if (format_id == ULINT_UNDEFINED) { + trx_sys_file_format_max_set(UNIV_FORMAT_MIN, NULL); + } +} + +/********************************************************************//** +Update the file format tag in the system tablespace only if the given +format id is greater than the known max id. +@return TRUE if format_id was bigger than the known max id */ +UNIV_INTERN +ibool +trx_sys_file_format_max_upgrade( +/*============================*/ + const char** name, /*!< out: max file format name */ + ulint format_id) /*!< in: file format identifier */ +{ + ibool ret = FALSE; + + ut_a(name); + ut_a(file_format_max.name != NULL); + ut_a(format_id <= UNIV_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + if (format_id > file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the max format name */ +UNIV_INTERN +const char* +trx_sys_file_format_max_get(void) +/*=============================*/ +{ + return(file_format_max.name); +} + +/*****************************************************************//** +Initializes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_init(void) +/*==========================*/ +{ + mutex_create(file_format_max_mutex_key, + &file_format_max.mutex, SYNC_FILE_FORMAT_TAG); + + /* We don't need a mutex here, as this function should only + be called once at start up. */ + file_format_max.id = UNIV_FORMAT_MIN; + + file_format_max.name = trx_sys_file_format_id_to_name( + file_format_max.id); +} + +/*****************************************************************//** +Closes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_close(void) +/*===========================*/ +{ + /* Does nothing at the moment */ +} + +/********************************************************************* +Creates the rollback segments. +@return number of rollback segments that are active. 
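The tablespace chosen for each newly created segment cycles over the
dedicated undo tablespaces; a worked model of the assignment done in the
body below (hypothetical standalone helper):

	// With n_spaces == 2, successive segments land in spaces 1,2,1,2,...
	// With n_spaces == 0, everything stays in the system tablespace (0).
	static unsigned long model_rseg_space(unsigned long i,
					      unsigned long n_spaces)
	{
		return (n_spaces > 0) ? (i % n_spaces) + 1 : 0;
	}
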
*/ +UNIV_INTERN +ulint +trx_sys_create_rsegs( +/*=================*/ + ulint n_spaces, /*!< number of tablespaces for UNDO logs */ + ulint n_rsegs) /*!< number of rollback segments to create */ +{ + mtr_t mtr; + ulint n_used; + + ut_a(n_spaces < TRX_SYS_N_RSEGS); + ut_a(n_rsegs <= TRX_SYS_N_RSEGS); + + if (srv_read_only_mode) { + return(ULINT_UNDEFINED); + } + + /* This is executed in single-threaded mode therefore it is not + necessary to use the same mtr in trx_rseg_create(). n_used cannot + change while the function is executing. */ + + mtr_start(&mtr); + n_used = trx_sysf_rseg_find_free(&mtr); + mtr_commit(&mtr); + + if (n_used == ULINT_UNDEFINED) { + n_used = TRX_SYS_N_RSEGS; + } + + /* Do not create additional rollback segments if innodb_force_recovery + has been set and the database was not shutdown cleanly. */ + + if (!srv_force_recovery && !recv_needed_recovery && n_used < n_rsegs) { + ulint i; + ulint new_rsegs = n_rsegs - n_used; + + for (i = 0; i < new_rsegs; ++i) { + ulint space; + + /* Tablespace 0 is the system tablespace. All UNDO + log tablespaces start from 1. */ + + if (n_spaces > 0) { + space = (i % n_spaces) + 1; + } else { + space = 0; /* System tablespace */ + } + + if (trx_rseg_create(space) != NULL) { + ++n_used; + } else { + break; + } + } + } + + ib_logf(IB_LOG_LEVEL_INFO, + "%lu rollback segment(s) are active.", n_used); + + return(n_used); +} + +#else /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Prints to stderr the MySQL binlog info in the system header if the +magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset_from_page( +/*========================================*/ + const byte* page) /*!< in: buffer containing the trx + system header page, i.e., page number + TRX_SYS_PAGE_NO in the tablespace */ +{ + const trx_sysf_t* sys_header; + + sys_header = page + TRX_SYS; + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + == TRX_SYS_MYSQL_LOG_MAGIC_N) { + + fprintf(stderr, + "mysqlbackup: Last MySQL binlog file position %lu %lu," + " file name %s\n", + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + } +} + +/*****************************************************************//** +Reads the file format id from the first system table space file. +Even if the call succeeds and returns TRUE, the returned format id +may be ULINT_UNDEFINED signalling that the format id was not present +in the data file. 
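The body below needs a page-aligned read buffer on the stack: it
over-allocates twice the page size and aligns within that area via
ut_align(). The alignment trick in isolation (hypothetical standalone
helper, assuming a power-of-two alignment):

	#include <stdint.h>

	// Round a pointer up to a power-of-two boundary, as ut_align()
	// does for the buf[UNIV_PAGE_SIZE * 2] stack array below.
	static void* model_align_up(void* ptr, uintptr_t align)
	{
		return (void*) (((uintptr_t) ptr + align - 1) & ~(align - 1));
	}
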
+@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_file_format_id( +/*========================*/ + const char *pathname, /*!< in: pathname of the first system + table space file */ + ulint *format_id) /*!< out: file format of the system table + space */ +{ + os_file_t file; + ibool success; + byte buf[UNIV_PAGE_SIZE * 2]; + page_t* page = ut_align(buf, UNIV_PAGE_SIZE); + const byte* ptr; + ib_id_t file_format_id; + + *format_id = ULINT_UNDEFINED; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, + pathname, + OS_FILE_OPEN, + OS_FILE_READ_ONLY, + &success + ); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " mysqlbackup: Error: trying to read system " + "tablespace file format,\n" + " mysqlbackup: but could not open the tablespace " + "file %s!\n", pathname); + return(FALSE); + } + + /* Read the page on which file format is stored */ + + success = os_file_read_no_error_handling( + file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, UNIV_PAGE_SIZE); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " mysqlbackup: Error: trying to read system " + "tablespace file format,\n" + " mysqlbackup: but failed to read the tablespace " + "file %s!\n", pathname); + + os_file_close(file); + return(FALSE); + } + os_file_close(file); + + /* get the file format from the page */ + ptr = page + TRX_SYS_FILE_FORMAT_TAG; + file_format_id = mach_read_from_8(ptr); + file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N; + + if (file_format_id >= FILE_FORMAT_NAME_N) { + + /* Either it has never been tagged, or garbage in it. */ + return(TRUE); + } + + *format_id = (ulint) file_format_id; + + return(TRUE); +} + +/*****************************************************************//** +Reads the file format id from the given per-table data file. 
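The decoding rule applied in the body below: flags == 0 means Antelope;
otherwise bit 0 must be set for the flags to be valid, and the format id is
carried in the bits extracted by (flags / 32) % 128. A worked model
(hypothetical standalone helper):

	// E.g. flags == 0  -> format 0 (Antelope);
	//      flags == 33 -> (33 / 32) % 128 == 1 (Barracuda);
	// returns -1 when the flags are corrupt.
	static int model_flags_to_format_id(unsigned long flags)
	{
		if (flags == 0) {
			return 0;	// pre-flags format, Antelope
		} else if (flags & 1) {
			return (int) ((flags / 32) % 128);
		}
		return -1;		// bad tablespace flags
	}
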
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+	const char *pathname,	/*!< in: pathname of a per-table
+				datafile */
+	ulint *format_id)	/*!< out: file format of the per-table
+				data file */
+{
+	os_file_t	file;
+	ibool		success;
+	byte		buf[UNIV_PAGE_SIZE * 2];
+	page_t*		page = ut_align(buf, UNIV_PAGE_SIZE);
+	const byte*	ptr;
+	ib_uint32_t	flags;
+
+	*format_id = ULINT_UNDEFINED;
+
+	file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key,
+		pathname,
+		OS_FILE_OPEN,
+		OS_FILE_READ_ONLY,
+		&success
+	);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			" mysqlbackup: Error: trying to read per-table "
+			"tablespace format,\n"
+			" mysqlbackup: but could not open the tablespace "
+			"file %s!\n", pathname);
+
+		return(FALSE);
+	}
+
+	/* Read the first page of the per-table datafile */
+
+	success = os_file_read_no_error_handling(file, page, 0, UNIV_PAGE_SIZE);
+
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			" mysqlbackup: Error: trying to read the per-table "
+			"data file format,\n"
+			" mysqlbackup: but failed to read the tablespace "
+			"file %s!\n", pathname);
+
+		os_file_close(file);
+		return(FALSE);
+	}
+	os_file_close(file);
+
+	/* Get the tablespace flags; they are stored in the FSP header
+	of the first page */
+	ptr = page + 54;
+	flags = mach_read_from_4(ptr);
+	if (flags == 0) {
+		/* file format is Antelope */
+		*format_id = 0;
+		return(TRUE);
+	} else if (flags & 1) {
+		/* tablespace flags are ok */
+		*format_id = (flags / 32) % 128;
+		return(TRUE);
+	} else {
+		/* bad tablespace flags */
+		return(FALSE);
+	}
+}
+
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+	const ulint	id)	/*!< in: id of the file format */
+{
+	if (!(id < FILE_FORMAT_NAME_N)) {
+		/* unknown id */
+		return("Unknown");
+	}
+
+	return(file_format_name_map[id]);
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void)
+/*===============*/
+{
+	ulint		i;
+	trx_t*		trx;
+	read_view_t*	view;
+
+	ut_ad(trx_sys != NULL);
+	ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+
+	/* Check that all read views are closed except the read view
+	owned by purge. */
+
+	mutex_enter(&trx_sys->mutex);
+
+	if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
+		fprintf(stderr,
+			"InnoDB: Error: not all read views were closed"
+			" before shutdown:\n"
+			"InnoDB: %lu read views open\n",
+			UT_LIST_GET_LEN(trx_sys->view_list) - 1);
+	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	sess_close(trx_dummy_sess);
+	trx_dummy_sess = NULL;
+
+	trx_purge_sys_close();
+
+	/* Free the double write data structures. */
+	buf_dblwr_free();
+
+	mutex_enter(&trx_sys->mutex);
+
+	ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+
+	/* Only prepared transactions may be left in the system. Free them. */
+	ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == trx_sys->n_prepared_trx);
+
+	while ((trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list)) != NULL) {
+		trx_free_prepared(trx);
+	}
+
+	/* There can't be any active transactions.
*/ + for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_rseg_t* rseg; + + rseg = trx_sys->rseg_array[i]; + + if (rseg != NULL) { + trx_rseg_mem_free(rseg); + } else { + break; + } + } + + view = UT_LIST_GET_FIRST(trx_sys->view_list); + + while (view != NULL) { + read_view_t* prev_view = view; + + view = UT_LIST_GET_NEXT(view_list, prev_view); + + /* Views are allocated from the trx_sys->global_read_view_heap. + So, we simply remove the element here. */ + UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view); + } + + ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == 0); + ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0); + + mutex_exit(&trx_sys->mutex); + + mutex_free(&trx_sys->mutex); + + ut_ad(trx_sys->descr_n_used == 0); + ut_free(trx_sys->descriptors); + + mem_free(trx_sys); + + trx_sys = NULL; +} + +/********************************************************************* +Check if there are any active (non-prepared) transactions. +@return total number of active transactions or 0 if none */ +UNIV_INTERN +ulint +trx_sys_any_active_transactions(void) +/*=================================*/ +{ + ulint total_trx = 0; + + mutex_enter(&trx_sys->mutex); + + total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list) + + UT_LIST_GET_LEN(trx_sys->mysql_trx_list); + + ut_a(total_trx >= trx_sys->n_prepared_trx); + total_trx -= trx_sys->n_prepared_trx; + + mutex_exit(&trx_sys->mutex); + + return(total_trx); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Validate the trx_list_t. +@return TRUE if valid. */ +static +ibool +trx_sys_validate_trx_list_low( +/*===========================*/ + trx_list_t* trx_list) /*!< in: &trx_sys->ro_trx_list + or &trx_sys->rw_trx_list */ +{ + const trx_t* trx; + const trx_t* prev_trx = NULL; + + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_ad(trx_list == &trx_sys->ro_trx_list + || trx_list == &trx_sys->rw_trx_list); + + for (trx = UT_LIST_GET_FIRST(*trx_list); + trx != NULL; + prev_trx = trx, trx = UT_LIST_GET_NEXT(trx_list, prev_trx)) { + + assert_trx_in_list(trx); + ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list)); + + ut_a(prev_trx == NULL || prev_trx->id > trx->id); + } + + return(TRUE); +} + +/*************************************************************//** +Validate the trx_sys_t::ro_trx_list and trx_sys_t::rw_trx_list. +@return TRUE if lists are valid. */ +UNIV_INTERN +ibool +trx_sys_validate_trx_list(void) +/*===========================*/ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_a(trx_sys_validate_trx_list_low(&trx_sys->ro_trx_list)); + ut_a(trx_sys_validate_trx_list_low(&trx_sys->rw_trx_list)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/trx/trx0trx.cc b/storage/xtradb/trx/trx0trx.cc new file mode 100644 index 00000000000..0732e0c0a36 --- /dev/null +++ b/storage/xtradb/trx/trx0trx.cc @@ -0,0 +1,2561 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0types.h"
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+#include "ha_prototypes.h"
+#include "srv0mon.h"
+#include "ut0vec.h"
+
+#include <set>
+
+/** Set of table_id */
+typedef std::set<table_id_t>	table_id_set;
+
+/** Dummy session used currently in MySQL interface */
+UNIV_INTERN sess_t*	trx_dummy_sess = NULL;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	trx_mutex_key;
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	trx_undo_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg)	/*!< in: detailed error message */
+{
+	ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file)	/*!< in: file to read message from */
+{
+	os_file_read_string(file, trx->detailed_error,
+			    sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Callback function for trx_find_descriptor() to compare trx IDs. */
+UNIV_INTERN
+int
+trx_descr_cmp(
+/*==========*/
+	const void *a,	/*!< in: pointer to first comparison argument */
+	const void *b)	/*!< in: pointer to second comparison argument */
+{
+	const trx_id_t*	da = (const trx_id_t*) a;
+	const trx_id_t*	db = (const trx_id_t*) b;
+
+	if (*da < *db) {
+		return -1;
+	} else if (*da > *db) {
+		return 1;
+	}
+
+	return 0;
+}
+
+/*************************************************************//**
+Reserve a slot for a given trx in the global descriptors array.
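The array stays sorted by trx id; since a new id is almost always the
largest, insertion scans backwards from the end and shifts at most a short
tail. A standalone model of that insert (hypothetical "model_" helper;
array growth is omitted, whereas the real code doubles the array on
overflow):

	#include <stdint.h>
	#include <string.h>

	// Sorted-array insert with a backward linear scan, as performed
	// in the function body below.
	static void model_sorted_insert(uint64_t* arr, size_t* n_used,
					uint64_t id)
	{
		size_t	i = *n_used;

		while (i > 0 && arr[i - 1] > id) {
			i--;			// usually zero iterations
		}

		memmove(arr + i + 1, arr + i,
			(*n_used - i) * sizeof(*arr));
		arr[i] = id;
		(*n_used)++;
	}
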
*/ +UNIV_INLINE +void +trx_reserve_descriptor( +/*===================*/ + const trx_t* trx) /*!< in: trx pointer */ +{ + ulint n_used; + ulint n_max; + trx_id_t* descr; + + ut_ad(mutex_own(&trx_sys->mutex) || srv_is_being_started); + ut_ad(srv_is_being_started || + !trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx->id)); + + n_used = trx_sys->descr_n_used + 1; + n_max = trx_sys->descr_n_max; + + if (UNIV_UNLIKELY(n_used > n_max)) { + + n_max = n_max * 2; + + trx_sys->descriptors = static_cast<trx_id_t*>( + ut_realloc(trx_sys->descriptors, + n_max * sizeof(trx_id_t))); + + trx_sys->descr_n_max = n_max; + srv_descriptors_memory = n_max * sizeof(trx_id_t); + } + + descr = trx_sys->descriptors + n_used - 1; + + if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) { + + /* Find the slot where it should be inserted. We could use a + binary search, but in reality linear search should be faster, + because the slot we are looking for is near the array end. */ + + trx_id_t* tdescr; + + for (tdescr = descr - 1; + tdescr >= trx_sys->descriptors && *tdescr > trx->id; + tdescr--) { + } + + tdescr++; + + ut_memmove(tdescr + 1, tdescr, (descr - tdescr) * + sizeof(trx_id_t)); + + descr = tdescr; + } + + *descr = trx->id; + + trx_sys->descr_n_used = n_used; +} + +/*************************************************************//** +Release a slot for a given trx in the global descriptors array. */ +UNIV_INTERN +void +trx_release_descriptor( +/*===================*/ + trx_t* trx) /*!< in: trx pointer */ +{ + ulint size; + trx_id_t* descr; + + ut_ad(mutex_own(&trx_sys->mutex)); + + if (UNIV_LIKELY(trx->in_trx_serial_list)) { + + UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list, + trx); + trx->in_trx_serial_list = false; + } + + descr = trx_find_descriptor(trx_sys->descriptors, + trx_sys->descr_n_used, + trx->id); + + if (UNIV_UNLIKELY(descr == NULL)) { + + return; + } + + size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) * + sizeof(trx_id_t); + + if (UNIV_LIKELY(size > 0)) { + + ut_memmove(descr, descr + 1, size); + } + + trx_sys->descr_n_used--; +} + +/****************************************************************//** +Creates and initializes a transaction object. It must be explicitly +started with trx_start_if_not_started() before using it. The default +isolation level is TRX_ISO_REPEATABLE_READ. 
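Typical lifecycle, a usage sketch only (error handling omitted; the entry
points named here appear later in this file and in trx0trx.h):

	// trx_allocate_for_background() wraps trx_create(); the
	// transaction must be started explicitly before use and freed
	// again afterwards.
	trx_t*	trx = trx_allocate_for_background();

	trx_start_if_not_started(trx);
	// ... reads and writes under trx ...
	trx_commit_for_mysql(trx);

	trx_free_for_background(trx);
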
+@return transaction instance, should never be NULL */ +static +trx_t* +trx_create(void) +/*============*/ +{ + trx_t* trx; + mem_heap_t* heap; + ib_alloc_t* heap_alloc; + + trx = static_cast<trx_t*>(mem_zalloc(sizeof(*trx))); + + mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX); + + trx->magic_n = TRX_MAGIC_N; + + trx->state = TRX_STATE_NOT_STARTED; + + trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->no = TRX_ID_MAX; + trx->in_trx_serial_list = false; + + trx->support_xa = TRUE; + + trx->fake_changes = FALSE; + + trx->check_foreigns = TRUE; + trx->check_unique_secondary = TRUE; + + trx->dict_operation = TRX_DICT_OP_NONE; + + trx->idle_start = 0; + trx->last_stmt_start = 0; + + mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO); + + trx->error_state = DB_SUCCESS; + + trx->lock.que_state = TRX_QUE_RUNNING; + + trx->lock.lock_heap = mem_heap_create_typed( + 256, MEM_HEAP_FOR_LOCK_HEAP); + + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + + trx->io_reads = 0; + trx->io_read = 0; + trx->io_reads_wait_timer = 0; + trx->lock_que_wait_timer = 0; + trx->innodb_que_wait_timer = 0; + trx->distinct_page_access = 0; + trx->distinct_page_access_hash = NULL; + trx->take_stats = FALSE; + + trx->xid.formatID = -1; + + trx->op_info = ""; + + trx->api_trx = false; + + trx->api_auto_commit = false; + + trx->read_write = true; + + heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8); + heap_alloc = ib_heap_allocator_create(heap); + + /* Remember to free the vector explicitly in trx_free(). */ + trx->autoinc_locks = ib_vector_create(heap_alloc, sizeof(void**), 4); + + /* Remember to free the vector explicitly in trx_free(). */ + heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 128); + heap_alloc = ib_heap_allocator_create(heap); + + trx->lock.table_locks = ib_vector_create( + heap_alloc, sizeof(void**), 32); + + return(trx); +} + +/********************************************************************//** +Creates a transaction object for background operations by the master thread. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_background(void) +/*=============================*/ +{ + trx_t* trx; + + trx = trx_create(); + + trx->sess = trx_dummy_sess; + + return(trx); +} + +/********************************************************************//** +Creates a transaction object for MySQL. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_mysql(void) +/*========================*/ +{ + trx_t* trx; + + trx = trx_allocate_for_background(); + + mutex_enter(&trx_sys->mutex); + + ut_d(trx->in_mysql_trx_list = TRUE); + UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + mutex_exit(&trx_sys->mutex); + + if (UNIV_UNLIKELY(trx->take_stats)) { + trx->distinct_page_access_hash + = static_cast<byte *>(mem_alloc(DPAH_SIZE)); + memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); + } + + return(trx); +} + +/********************************************************************//** +Frees a transaction object without releasing the corresponding descriptor. +Should be used by callers that already own trx_sys->mutex. 
*/ +static +void +trx_free_low( +/*=========*/ + trx_t* trx) /*!< in, own: trx object */ +{ + ut_a(trx->magic_n == TRX_MAGIC_N); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_mysql_trx_list); + + mutex_free(&trx->undo_mutex); + + if (trx->undo_no_arr != NULL) { + trx_undo_arr_free(trx->undo_no_arr); + } + + ut_a(trx->lock.wait_lock == NULL); + ut_a(trx->lock.wait_thr == NULL); + + ut_a(!trx->has_search_latch); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); +#endif + + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock.lock_heap) { + mem_heap_free(trx->lock.lock_heap); + } + + ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + /* We allocated a dedicated heap for the vector. */ + ib_vector_free(trx->autoinc_locks); + + if (trx->lock.table_locks != NULL) { + /* We allocated a dedicated heap for the vector. */ + ib_vector_free(trx->lock.table_locks); + } + + mutex_free(&trx->mutex); + + read_view_free(trx->prebuilt_view); + + mem_free(trx); +} + +/********************************************************************//** +Frees a transaction object. */ +static +void +trx_free( +/*=========*/ + trx_t* trx) /*!< in, own: trx object */ +{ + mutex_enter(&trx_sys->mutex); + trx_release_descriptor(trx); + mutex_exit(&trx_sys->mutex); + + trx_free_low(trx); +} + +/********************************************************************//** +Frees a transaction object of a background operation of the master thread. */ +UNIV_INTERN +void +trx_free_for_background( +/*====================*/ + trx_t* trx) /*!< in, own: trx object */ +{ + + if (trx->distinct_page_access_hash) + { + mem_free(trx->distinct_page_access_hash); + trx->distinct_page_access_hash= NULL; + } + + if (trx->declared_to_be_inside_innodb) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Freeing a trx (%p, " TRX_ID_FMT ") which is declared " + "to be processing inside InnoDB", trx, trx->id); + + trx_print(stderr, trx, 600); + putc('\n', stderr); + + /* This is an error but not a fatal error. We must keep + the counters like srv_conc_n_threads accurate. */ + srv_conc_force_exit_innodb(trx); + } + + if (trx->n_mysql_tables_in_use != 0 + || trx->mysql_n_tables_locked != 0) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "MySQL is freeing a thd though " + "trx->n_mysql_tables_in_use is %lu and " + "trx->mysql_n_tables_locked is %lu.", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + + trx_print(stderr, trx, 600); + ut_print_buf(stderr, trx, sizeof(trx_t)); + putc('\n', stderr); + } + + ut_a(trx->state == TRX_STATE_NOT_STARTED); + ut_a(trx->insert_undo == NULL); + ut_a(trx->update_undo == NULL); + ut_a(trx->read_view == NULL); + + trx_free(trx); +} + +/********************************************************************//** +At shutdown, frees a transaction object that is in the PREPARED state. */ +UNIV_INTERN +void +trx_free_prepared( +/*==============*/ + trx_t* trx) /*!< in, own: trx object */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_a(trx->magic_n == TRX_MAGIC_N); + + trx_undo_free_prepared(trx); + + assert_trx_in_rw_list(trx); + + ut_a(!trx->read_only); + + UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx); + ut_d(trx->in_rw_trx_list = FALSE); + + trx_release_descriptor(trx); + + /* Undo trx_resurrect_table_locks(). 
*/ + UT_LIST_INIT(trx->lock.trx_locks); + + trx_free_low(trx); + + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list)); +} + +/********************************************************************//** +Frees a transaction object for MySQL. */ +UNIV_INTERN +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx) /*!< in, own: trx object */ +{ + if (trx->distinct_page_access_hash) + { + mem_free(trx->distinct_page_access_hash); + trx->distinct_page_access_hash= NULL; + } + + mutex_enter(&trx_sys->mutex); + + ut_ad(trx->in_mysql_trx_list); + ut_d(trx->in_mysql_trx_list = FALSE); + UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + ut_ad(trx_sys_validate_trx_list()); + + mutex_exit(&trx_sys->mutex); + + trx_free_for_background(trx); +} + +/****************************************************************//** +Inserts the trx handle in the trx system trx list in the right position. +The list is sorted on the trx id so that the biggest id is at the list +start. This function is used at the database startup to insert incomplete +transactions to the list. */ +static +void +trx_list_rw_insert_ordered( +/*=======================*/ + trx_t* trx) /*!< in: trx handle */ +{ + trx_t* trx2; + + ut_ad(!trx->read_only); + + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); + + ut_a(srv_is_being_started); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + ut_ad(trx->is_recovered); + + for (trx2 = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx2 != NULL; + trx2 = UT_LIST_GET_NEXT(trx_list, trx2)) { + + assert_trx_in_rw_list(trx2); + + if (trx->id >= trx2->id) { + + ut_ad(trx->id > trx2->id); + break; + } + } + + if (trx2 != NULL) { + trx2 = UT_LIST_GET_PREV(trx_list, trx2); + + if (trx2 == NULL) { + UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx); + } else { + UT_LIST_INSERT_AFTER( + trx_list, trx_sys->rw_trx_list, trx2, trx); + } + } else { + UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx); + } + +#ifdef UNIV_DEBUG + if (trx->id > trx_sys->rw_max_trx_id) { + trx_sys->rw_max_trx_id = trx->id; + } +#endif /* UNIV_DEBUG */ + + ut_ad(!trx->in_rw_trx_list); + ut_d(trx->in_rw_trx_list = TRUE); +} + +/****************************************************************//** +Resurrect the table locks for a resurrected transaction. */ +static +void +trx_resurrect_table_locks( +/*======================*/ + trx_t* trx, /*!< in/out: transaction */ + const trx_undo_t* undo) /*!< in: undo log */ +{ + mtr_t mtr; + page_t* undo_page; + trx_undo_rec_t* undo_rec; + table_id_set tables; + + ut_ad(undo == trx->insert_undo || undo == trx->update_undo); + + if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) + || undo->empty) { + return; + } + + mtr_start(&mtr); + /* trx_rseg_mem_create() may have acquired an X-latch on this + page, so we cannot acquire an S-latch. */ + undo_page = trx_undo_page_get( + undo->space, undo->zip_size, undo->top_page_no, &mtr); + undo_rec = undo_page + undo->top_offset; + + do { + ulint type; + ulint cmpl_info; + bool updated_extern; + undo_no_t undo_no; + table_id_t table_id; + + page_t* undo_rec_page = page_align(undo_rec); + + if (undo_rec_page != undo_page) { + if (!mtr_memo_release(&mtr, + buf_block_align(undo_page), + MTR_MEMO_PAGE_X_FIX)) { + /* The page of the previous undo_rec + should have been latched by + trx_undo_page_get() or + trx_undo_get_prev_rec(). 
*/
+				ut_ad(0);
+			}
+
+			undo_page = undo_rec_page;
+		}
+
+		trx_undo_rec_get_pars(
+			undo_rec, &type, &cmpl_info,
+			&updated_extern, &undo_no, &table_id);
+		tables.insert(table_id);
+
+		undo_rec = trx_undo_get_prev_rec(
+			undo_rec, undo->hdr_page_no,
+			undo->hdr_offset, false, &mtr);
+	} while (undo_rec);
+
+	mtr_commit(&mtr);
+
+	for (table_id_set::const_iterator i = tables.begin();
+	     i != tables.end(); i++) {
+		if (dict_table_t* table = dict_table_open_on_id(
+			    *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
+			if (table->ibd_file_missing
+			    || dict_table_is_temporary(table)) {
+				mutex_enter(&dict_sys->mutex);
+				dict_table_close(table, TRUE, FALSE);
+				dict_table_remove_from_cache(table);
+				mutex_exit(&dict_sys->mutex);
+				continue;
+			}
+
+			lock_table_ix_resurrect(table, trx);
+
+			DBUG_PRINT("ib_trx",
+				   ("resurrect " TRX_ID_FMT
+				    " table '%s' IX lock from %s undo",
+				    trx->id, table->name,
+				    undo == trx->insert_undo
+				    ? "insert" : "update"));
+
+			dict_table_close(table, FALSE, FALSE);
+		}
+	}
+}
+
+/****************************************************************//**
+Resurrect the transactions that were doing inserts at the time of the
+crash; they need to be undone.
+@return trx_t instance */
+static
+trx_t*
+trx_resurrect_insert(
+/*=================*/
+	trx_undo_t*	undo,	/*!< in: entry to UNDO */
+	trx_rseg_t*	rseg)	/*!< in: rollback segment */
+{
+	trx_t*	trx;
+
+	trx = trx_allocate_for_background();
+
+	trx->rseg = rseg;
+	trx->xid = undo->xid;
+	trx->id = undo->trx_id;
+	trx->insert_undo = undo;
+	trx->is_recovered = TRUE;
+
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state != TRX_UNDO_ACTIVE) {
+
+		/* Prepared transactions are left in the prepared state
+		waiting for a commit or abort decision from MySQL */
+
+		if (undo->state == TRX_UNDO_PREPARED) {
+
+			fprintf(stderr,
+				"InnoDB: Transaction " TRX_ID_FMT " was in the"
+				" XA prepared state.\n", trx->id);
+
+			if (srv_force_recovery == 0) {
+
+				trx->state = TRX_STATE_PREPARED;
+				trx_sys->n_prepared_trx++;
+				trx_sys->n_prepared_recovered_trx++;
+			} else {
+				fprintf(stderr,
+					"InnoDB: Since innodb_force_recovery"
+					" > 0, we will roll it back anyway.\n");
+
+				trx->state = TRX_STATE_ACTIVE;
+			}
+		} else {
+			trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+		}
+
+		/* We give a dummy value for the trx no; this should have no
+		relevance since purge is not interested in committed
+		transaction numbers, unless they are in the history
+		list, in which case it looks up the number from the
+		disk-based undo log structure */
+
+		trx->no = trx->id;
+	} else {
+		trx->state = TRX_STATE_ACTIVE;
+
+		/* A running transaction always has the number
+		field initialized to TRX_ID_MAX */
+
+		trx->no = TRX_ID_MAX;
+	}
+
+	/* trx_start_low() is not called when resurrecting, so we need
+	to initialize the start time here. */
+	if (trx->state == TRX_STATE_ACTIVE
+	    || trx->state == TRX_STATE_PREPARED) {
+		trx->start_time = ut_time();
+	}
+
+	if (undo->dict_operation) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		trx->table_id = undo->table_id;
+	}
+
+	if (!undo->empty) {
+		trx->undo_no = undo->top_undo_no + 1;
+	}
+
+	return(trx);
+}
+
+/****************************************************************//**
+Prepared transactions are left in the prepared state waiting for a
+commit or abort decision from MySQL */
+static
+void
+trx_resurrect_update_in_prepared_state(
+/*===================================*/
+	trx_t*			trx,	/*!< in,out: transaction */
+	const trx_undo_t*	undo)	/*!< in: update UNDO record */
+{
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state == TRX_UNDO_PREPARED) {
+		fprintf(stderr,
+			"InnoDB: Transaction " TRX_ID_FMT
+			" was in the XA prepared state.\n", trx->id);
+
+		if (srv_force_recovery == 0) {
+			if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
+				trx_sys->n_prepared_trx++;
+				trx_sys->n_prepared_recovered_trx++;
+			} else {
+				ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+			}
+
+			trx->state = TRX_STATE_PREPARED;
+		} else {
+			fprintf(stderr,
+				"InnoDB: Since innodb_force_recovery"
+				" > 0, we will roll it back anyway.\n");
+
+			trx->state = TRX_STATE_ACTIVE;
+		}
+	} else {
+		trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+	}
+}
+
+/****************************************************************//**
+Resurrect the transactions that were doing updates at the time of the
+crash; they need to be undone. */
+static
+void
+trx_resurrect_update(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	trx_undo_t*	undo,	/*!< in/out: update UNDO record */
+	trx_rseg_t*	rseg)	/*!< in/out: rollback segment */
+{
+	trx->rseg = rseg;
+	trx->xid = undo->xid;
+	trx->id = undo->trx_id;
+	trx->update_undo = undo;
+	trx->is_recovered = TRUE;
+
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state != TRX_UNDO_ACTIVE) {
+		trx_resurrect_update_in_prepared_state(trx, undo);
+
+		/* We give a dummy value for the trx number */
+
+		trx->no = trx->id;
+
+	} else {
+		trx->state = TRX_STATE_ACTIVE;
+
+		/* A running transaction always has the number field
+		initialized to TRX_ID_MAX */
+
+		trx->no = TRX_ID_MAX;
+	}
+
+	/* trx_start_low() is not called when resurrecting, so we need
+	to initialize the start time here. */
+	if (trx->state == TRX_STATE_ACTIVE
+	    || trx->state == TRX_STATE_PREPARED) {
+		trx->start_time = ut_time();
+	}
+
+	if (undo->dict_operation) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		trx->table_id = undo->table_id;
+	}
+
+	if (!undo->empty && undo->top_undo_no >= trx->undo_no) {
+
+		trx->undo_no = undo->top_undo_no + 1;
+	}
+}
+
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+	ulint		i;
+
+	ut_a(srv_is_being_started);
+
+	UT_LIST_INIT(trx_sys->ro_trx_list);
+	UT_LIST_INIT(trx_sys->rw_trx_list);
+	UT_LIST_INIT(trx_sys->trx_serial_list);
+
+	/* Look through the rollback segments to see if there exist
+	undo logs for transactions */
+
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		trx_undo_t*	undo;
+		trx_rseg_t*	rseg;
+
+		rseg = trx_sys->rseg_array[i];
+
+		if (rseg == NULL) {
+			continue;
+		}
+
+		/* Resurrect transactions that were doing inserts. */
+		for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+		     undo != NULL;
+		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+			trx_t*	trx;
+
+			trx = trx_resurrect_insert(undo, rseg);
+
+			if (trx->state == TRX_STATE_ACTIVE ||
+			    trx->state == TRX_STATE_PREPARED) {
+
+				trx_reserve_descriptor(trx);
+			}
+			trx_list_rw_insert_ordered(trx);
+
+			trx_resurrect_table_locks(trx, undo);
+		}
+
+		/* Resurrect transactions that were doing updates.
*/ + for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list); + undo != NULL; + undo = UT_LIST_GET_NEXT(undo_list, undo)) { + trx_t* trx; + ibool trx_created; + + /* Check the trx_sys->rw_trx_list first. */ + mutex_enter(&trx_sys->mutex); + trx = trx_get_rw_trx_by_id(undo->trx_id); + mutex_exit(&trx_sys->mutex); + + if (trx == NULL) { + trx = trx_allocate_for_background(); + trx_created = TRUE; + } else { + trx_created = FALSE; + } + + trx_resurrect_update(trx, undo, rseg); + + if (trx_created) { + if (trx->state == TRX_STATE_ACTIVE || + trx->state == TRX_STATE_PREPARED) { + + trx_reserve_descriptor(trx); + } + trx_list_rw_insert_ordered(trx); + } + + trx_resurrect_table_locks(trx, undo); + } + } +} + +/******************************************************************//** +Assigns a rollback segment to a transaction in a round-robin fashion. +@return assigned rollback segment instance */ +static +trx_rseg_t* +trx_assign_rseg_low( +/*================*/ + ulong max_undo_logs, /*!< in: maximum number of UNDO logs to use */ + ulint n_tablespaces) /*!< in: number of rollback tablespaces */ +{ + ulint i; + trx_rseg_t* rseg; + static ulint latest_rseg = 0; + + if (srv_read_only_mode) { + ut_a(max_undo_logs == ULONG_UNDEFINED); + return(NULL); + } + + /* This breaks true round robin but that should be OK. */ + + ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS); + + i = latest_rseg++; + i %= max_undo_logs; + + /* Note: The assumption here is that there can't be any gaps in + the array. Once we implement more flexible rollback segment + management this may not hold. The assertion checks for that case. */ + + if (trx_sys->rseg_array[0] == NULL) { + return(NULL); + } + + /* Skip the system tablespace if we have more than one tablespace + defined for rollback segments. We want all UNDO records to be in + the non-system tablespaces. */ + + do { + rseg = trx_sys->rseg_array[i]; + ut_a(rseg == NULL || i == rseg->id); + + i = (rseg == NULL) ? 0 : i + 1; + + } while (rseg == NULL + || (rseg->space == 0 + && n_tablespaces > 0 + && trx_sys->rseg_array[1] != NULL)); + + return(rseg); +} + +/****************************************************************//** +Assign a read-only transaction a rollback-segment, if it is attempting +to write to a TEMPORARY table. */ +UNIV_INTERN +void +trx_assign_rseg( +/*============*/ + trx_t* trx) /*!< A read-only transaction that + needs to be assigned a RBS. */ +{ + ut_a(trx->rseg == 0); + ut_a(trx->read_only); + ut_a(!srv_read_only_mode); + ut_a(!trx_is_autocommit_non_locking(trx)); + + trx->rseg = trx_assign_rseg_low(srv_undo_logs, srv_undo_tablespaces); +} + +/****************************************************************//** +Starts a transaction. 
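The body first classifies the new transaction before linking it into a
list; the read-only decision reduces to the following model (a simplified
sketch with hypothetical "model_" names; the DDL and API special cases are
folded into the session flag):

	// A transaction is read-only if the session or the server says
	// so; an autocommit transaction that never requested locks is
	// also treated as read-only and kept off the rw list.
	static bool model_is_read_only(bool session_ro, bool srv_ro,
				       bool auto_commit, int will_lock)
	{
		bool	ro = session_ro || srv_ro;

		if (auto_commit && will_lock == 0) {
			ro = true;
		}

		return(ro);
	}
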
*/
+static
+void
+trx_start_low(
+/*==========*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ut_ad(trx->rseg == NULL);
+
+	ut_ad(trx->start_file != 0);
+	ut_ad(trx->start_line != 0);
+	ut_ad(!trx->is_recovered);
+	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+	/* Check whether it is an AUTOCOMMIT SELECT */
+	trx->auto_commit = (trx->api_trx && trx->api_auto_commit)
+			   || thd_trx_is_auto_commit(trx->mysql_thd);
+
+	trx->read_only =
+		(trx->api_trx && !trx->read_write)
+		|| (!trx->ddl && thd_trx_is_read_only(trx->mysql_thd))
+		|| srv_read_only_mode;
+
+	if (!trx->auto_commit) {
+		++trx->will_lock;
+	} else if (trx->will_lock == 0) {
+		trx->read_only = TRUE;
+	}
+
+	if (!trx->read_only) {
+		trx->rseg = trx_assign_rseg_low(
+			srv_undo_logs, srv_undo_tablespaces);
+	}
+
+	/* The initial value for trx->no, TRX_ID_MAX, is used in
+	read_view_open_now. */
+
+	trx->no = TRX_ID_MAX;
+
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	ut_a(ib_vector_is_empty(trx->lock.table_locks));
+
+	mutex_enter(&trx_sys->mutex);
+
+	/* If this transaction came from trx_allocate_for_mysql(),
+	trx->in_mysql_trx_list would hold. In that case, the trx->state
+	change must be protected by the trx_sys->mutex, so that
+	lock_print_info_all_transactions() will have a consistent view. */
+
+	trx->state = TRX_STATE_ACTIVE;
+
+	trx->id = trx_sys_get_new_trx_id();
+
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(!trx->in_ro_trx_list);
+
+	if (trx->read_only) {
+
+		/* Note: The trx_sys_t::ro_trx_list doesn't really need to
+		be ordered; we should exploit this using a list type that
+		doesn't need a list-wide lock to increase concurrency. */
+
+		if (!trx_is_autocommit_non_locking(trx)) {
+			UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx);
+			ut_d(trx->in_ro_trx_list = TRUE);
+		}
+	} else {
+
+		ut_ad(trx->rseg != NULL
+		      || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+		ut_ad(!trx_is_autocommit_non_locking(trx));
+		UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
+		ut_d(trx->in_rw_trx_list = TRUE);
+
+#ifdef UNIV_DEBUG
+		if (trx->id > trx_sys->rw_max_trx_id) {
+			trx_sys->rw_max_trx_id = trx->id;
+		}
+#endif /* UNIV_DEBUG */
+
+		trx_reserve_descriptor(trx);
+	}
+
+	ut_ad(trx_sys_validate_trx_list());
+
+	mutex_exit(&trx_sys->mutex);
+
+	trx->start_time = ut_time();
+
+	MONITOR_INC(MONITOR_TRX_ACTIVE);
+}
+
+/****************************************************************//**
+Set the transaction serialisation number. */
+static
+void
+trx_serialisation_number_get(
+/*=========================*/
+	trx_t*		trx)	/*!< in: transaction */
+{
+	trx_rseg_t*	rseg;
+
+	rseg = trx->rseg;
+
+	ut_ad(mutex_own(&rseg->mutex));
+
+	mutex_enter(&trx_sys->mutex);
+
+	trx->no = trx_sys_get_new_trx_id();
+
+	if (UNIV_LIKELY(!trx->in_trx_serial_list)) {
+
+		UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list,
+				 trx);
+
+		trx->in_trx_serial_list = true;
+	}
+
+	/* If the rollback segment is not empty then the
+	new trx_t::no can't be less than any trx_t::no
+	already in the rollback segment. User threads only
+	produce events when a rollback segment is empty. */
+
+	if (rseg->last_page_no == FIL_NULL) {
+		void*		ptr;
+		rseg_queue_t	rseg_queue;
+
+		rseg_queue.rseg = rseg;
+		rseg_queue.trx_no = trx->no;
+
+		mutex_enter(&purge_sys->bh_mutex);
+
+		/* This is to reduce the pressure on the trx_sys_t::mutex
+		though in reality it should make very little (read no)
+		difference because this code path is only taken when the
+		rollback segment is empty.
*/ + + mutex_exit(&trx_sys->mutex); + + ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); + ut_a(ptr); + + mutex_exit(&purge_sys->bh_mutex); + } else { + mutex_exit(&trx_sys->mutex); + } +} + +/****************************************************************//** +Assign the transaction its history serialisation number and write the +update UNDO log record to the assigned rollback segment. */ +static __attribute__((nonnull)) +void +trx_write_serialisation_history( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + trx_rseg_t* rseg; + + rseg = trx->rseg; + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as committed in the file + based domain, at the serialization point of the log sequence + number lsn obtained below. */ + + if (trx->update_undo != NULL) { + page_t* undo_hdr_page; + trx_undo_t* undo = trx->update_undo; + + /* We have to hold the rseg mutex because update + log headers have to be put to the history list in the + (serialisation) order of the UNDO trx number. This is + required for the purge in-memory data structures too. */ + + mutex_enter(&rseg->mutex); + + /* Assign the transaction serialisation number and also + update the purge min binary heap if this is the first + UNDO log being written to the assigned rollback segment. */ + + trx_serialisation_number_get(trx); + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction commit for this transaction. */ + + undo_hdr_page = trx_undo_set_state_at_finish(undo, mtr); + + trx_undo_update_cleanup(trx, undo_hdr_page, mtr); + } else { + mutex_enter(&rseg->mutex); + } + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_finish(trx->insert_undo, mtr); + } + + mutex_exit(&rseg->mutex); + + MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); + + /* Update the latest MySQL binlog name and offset info + in trx sys header if MySQL binlogging is on or the database + server is a MySQL replication slave */ + + if (trx->mysql_log_file_name + && trx->mysql_log_file_name[0] != '\0') { + + trx_sys_update_mysql_binlog_offset( + trx->mysql_log_file_name, + trx->mysql_log_offset, + TRX_SYS_MYSQL_LOG_INFO, mtr); + + trx->mysql_log_file_name = NULL; + } +} + +/******************************************************************** +Finalize a transaction containing updates for a FTS table. */ +static __attribute__((nonnull)) +void +trx_finalize_for_fts_table( +/*=======================*/ + fts_trx_table_t* ftt) /* in: FTS trx table */ +{ + fts_t* fts = ftt->table->fts; + fts_doc_ids_t* doc_ids = ftt->added_doc_ids; + + mutex_enter(&fts->bg_threads_mutex); + + if (fts->fts_status & BG_THREAD_STOP) { + /* The table is about to be dropped, no use + adding anything to its work queue. */ + + mutex_exit(&fts->bg_threads_mutex); + } else { + mem_heap_t* heap; + mutex_exit(&fts->bg_threads_mutex); + + ut_a(fts->add_wq); + + heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg); + + ib_wqueue_add(fts->add_wq, doc_ids, heap); + + /* fts_trx_table_t no longer owns the list. */ + ftt->added_doc_ids = NULL; + } +} + +/******************************************************************//** +Finalize a transaction containing updates to FTS tables. 
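+A sketch of the hand-off performed below (the consumer side is an
+assumption based on the ut0wqueue interface, which is not shown in this
+file):
+
+	producer, at commit:
+		ib_wqueue_add(fts->add_wq, doc_ids, heap);
+		ftt->added_doc_ids = NULL;	ownership transferred
+
+	consumer, background FTS thread:
+		doc_ids = static_cast<fts_doc_ids_t*>(
+			ib_wqueue_wait(fts->add_wq));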
*/
+static __attribute__((nonnull))
+void
+trx_finalize_for_fts(
+/*=================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	is_commit)	/*!< in: true if the transaction was
+				committed, false if it was rolled back. */
+{
+	if (is_commit) {
+		const ib_rbt_node_t*	node;
+		ib_rbt_t*		tables;
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_last(trx->fts_trx->savepoints));
+
+		tables = savepoint->tables;
+
+		for (node = rbt_first(tables);
+		     node;
+		     node = rbt_next(tables, node)) {
+			fts_trx_table_t**	ftt;
+
+			ftt = rbt_value(fts_trx_table_t*, node);
+
+			if ((*ftt)->added_doc_ids) {
+				trx_finalize_for_fts_table(*ftt);
+			}
+		}
+	}
+
+	fts_trx_free(trx->fts_trx);
+	trx->fts_trx = NULL;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed_low(
+/*========================*/
+	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
+			flushed. */
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ulint	flush_log_at_trx_commit;
+
+	flush_log_at_trx_commit = srv_use_global_flush_log_at_trx_commit
+		? thd_flush_log_at_trx_commit(NULL)
+		: thd_flush_log_at_trx_commit(trx->mysql_thd);
+
+	switch (flush_log_at_trx_commit) {
+	case 0:
+		/* Do nothing */
+		break;
+	case 1:
+		/* Write the log and optionally flush it to disk */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+				srv_unix_file_flush_method != SRV_UNIX_NOSYNC);
+		break;
+	case 2:
+		/* Write the log but do not flush it to disk */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+
+		break;
+	default:
+		ut_error;
+	}
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static __attribute__((nonnull))
+void
+trx_flush_log_if_needed(
+/*====================*/
+	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
+			flushed. */
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	trx->op_info = "flushing log";
+	trx_flush_log_if_needed_low(lsn, trx);
+	trx->op_info = "";
+}
+
+/****************************************************************//**
+Commits a transaction in memory. */
+static __attribute__((nonnull))
+void
+trx_commit_in_memory(
+/*=================*/
+	trx_t*	trx,	/*!< in/out: transaction */
+	lsn_t	lsn)	/*!< in: log sequence number of the mini-transaction
+			commit of trx_write_serialisation_history(), or 0
+			if the transaction did not modify anything */
+{
+	trx->must_flush_log_later = FALSE;
+
+	if (trx_is_autocommit_non_locking(trx)) {
+		ut_ad(trx->read_only);
+		ut_a(!trx->is_recovered);
+		ut_ad(trx->rseg == NULL);
+		ut_ad(!trx->in_ro_trx_list);
+		ut_ad(!trx->in_rw_trx_list);
+
+		/* Note: We are asserting without holding the lock mutex. But
+		that is OK because this transaction is not waiting and cannot
+		be rolled back, and no new locks can (or should) be added,
+		because it is flagged as a non-locking read-only
+		transaction. */
+
+		ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+		/* This state change is not protected by any mutex, therefore
+		there is an inherent race here around state transition during
+		printouts. We ignore this race for the sake of efficiency.
+		However, the trx_sys_t::mutex will protect the trx_t instance
+		and it cannot be removed from the mysql_trx_list and freed
+		without first acquiring the trx_sys_t::mutex.
*/ + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + + trx->state = TRX_STATE_NOT_STARTED; + + read_view_remove(trx->global_read_view, false); + + MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); + } else { + lock_trx_release_locks(trx); + + /* Remove the transaction from the list of active + transactions now that it no longer holds any user locks. */ + + ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); + + mutex_enter(&trx_sys->mutex); + + assert_trx_in_list(trx); + + if (trx->read_only) { + UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx); + ut_d(trx->in_ro_trx_list = FALSE); + MONITOR_INC(MONITOR_TRX_RO_COMMIT); + } else { + UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx); + ut_d(trx->in_rw_trx_list = FALSE); + ut_ad(trx_sys->descr_n_used <= + UT_LIST_GET_LEN(trx_sys->rw_trx_list)); + MONITOR_INC(MONITOR_TRX_RW_COMMIT); + } + + /* If this transaction came from trx_allocate_for_mysql(), + trx->in_mysql_trx_list would hold. In that case, the + trx->state change must be protected by trx_sys->mutex, so that + lock_print_info_all_transactions() will have a consistent + view. */ + + trx->state = TRX_STATE_NOT_STARTED; + + /* We already own the trx_sys_t::mutex, by doing it here we + avoid a potential context switch later. */ + read_view_remove(trx->global_read_view, true); + + ut_ad(trx_sys_validate_trx_list()); + + mutex_exit(&trx_sys->mutex); + } + + if (trx->global_read_view != NULL) { + + trx->global_read_view = NULL; + } + + trx->read_view = NULL; + + if (lsn) { + ulint flush_log_at_trx_commit; + + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + if (srv_use_global_flush_log_at_trx_commit) { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL); + } else { + flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd); + } + + /* NOTE that we could possibly make a group commit more + efficient here: call os_thread_yield here to allow also other + trxs to come to commit! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if + the OS does not crash. We may also flush the log files to + disk, making the transaction durable also at an OS crash or a + power outage. + + The idea in InnoDB's group commit is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which commits the whole + group. Note that this group commit will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + If we are calling trx_commit() under prepare_commit_mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the mutex. This is to make the + group commit algorithm to work. Otherwise, the prepare_commit + mutex would serialize all commits and prevent a group of + transactions from gathering. */ + + if (trx->flush_log_later) { + /* Do nothing yet */ + trx->must_flush_log_later = TRUE; + } else if (flush_log_at_trx_commit == 0 + || thd_requested_durability(trx->mysql_thd) + == HA_IGNORE_DURABILITY) { + /* Do nothing */ + } else { + trx_flush_log_if_needed(lsn, trx); + } + + trx->commit_lsn = lsn; + + /* Tell server some activity has happened, since the trx + does changes something. 
Background utility threads like + master thread, purge thread or page_cleaner thread might + have some work to do. */ + srv_active_wake_master_thread(); + } + + /* undo_no is non-zero if we're doing the final commit. */ + bool not_rollback = trx->undo_no != 0; + /* Free all savepoints, starting from the first. */ + trx_named_savept_t* savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + trx_roll_savepoints_free(trx, savep); + + trx->rseg = NULL; + trx->undo_no = 0; + trx->last_sql_stat_start.least_undo_no = 0; + + trx->ddl = false; +#ifdef UNIV_DEBUG + ut_ad(trx->start_file != 0); + ut_ad(trx->start_line != 0); + trx->start_file = 0; + trx->start_line = 0; +#endif /* UNIV_DEBUG */ + + trx->will_lock = 0; + trx->read_only = FALSE; + trx->auto_commit = FALSE; + + if (trx->fts_trx) { + trx_finalize_for_fts(trx, not_rollback); + } + + ut_ad(trx->lock.wait_thr == NULL); + ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + + trx->dict_operation = TRX_DICT_OP_NONE; + + trx->error_state = DB_SUCCESS; + + /* trx->in_mysql_trx_list would hold between + trx_allocate_for_mysql() and trx_free_for_mysql(). It does not + hold for recovered transactions or system transactions. */ +} + +/****************************************************************//** +Commits a transaction and a mini-transaction. */ +UNIV_INTERN +void +trx_commit_low( +/*===========*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction (will be committed), + or NULL if trx made no modifications */ +{ + lsn_t lsn; + + assert_trx_nonlocking_or_in_list(trx); + ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)); + ut_ad(!mtr || mtr->state == MTR_ACTIVE); + ut_ad(!mtr == !(trx->insert_undo || trx->update_undo)); + + /* undo_no is non-zero if we're doing the final commit. */ + if (trx->fts_trx && trx->undo_no != 0) { + dberr_t error; + + ut_a(!trx_is_autocommit_non_locking(trx)); + + error = fts_commit(trx); + + /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY + instead of dying. This is a possible scenario if there + is a crash between insert to DELETED table committing + and transaction committing. The fix would be able to + return error from this function */ + if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) { + /* FTS-FIXME: once we can return values from this + function, we should do so and signal an error + instead of just dying. */ + + ut_error; + } + } + + if (mtr) { + trx_write_serialisation_history(trx, mtr); + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this + log sequence number. The transaction becomes 'durable' when + we write the log to disk, but in the logical sense the commit + in the file-based data structures (undo logs etc.) happens + here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come + in exactly the same order as commit lsn's, if the transactions + have different rollback segments. To get exactly the same + order we should hold the kernel mutex up to this point, + adding to the contention of the kernel mutex. However, if + a transaction T2 is able to see modifications made by + a transaction T1, T2 will always get a bigger transaction + number and a bigger commit lsn than T1. 
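+
+	An illustration with made-up numbers and two rollback segments:
+
+		T1: trx->no = 100, commit lsn = 5100
+		T2: trx->no = 101, commit lsn = 5050
+
+	Here the serialisation order (by trx->no) and the lsn order
+	disagree, which is allowed because T2 never read T1's changes.
+	Had T2 seen a modification made by T1, both no(T2) > no(T1) and
+	lsn(T2) > lsn(T1) would hold.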
*/ + + /*--------------*/ + mtr_commit(mtr); + /*--------------*/ + lsn = mtr->end_lsn; + } else { + lsn = 0; + } + + trx_commit_in_memory(trx, lsn); +} + +/****************************************************************//** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit( +/*=======*/ + trx_t* trx) /*!< in/out: transaction */ +{ + mtr_t local_mtr; + mtr_t* mtr; + + if (trx->insert_undo || trx->update_undo) { + mtr = &local_mtr; + mtr_start(mtr); + } else { + mtr = NULL; + } + + trx_commit_low(trx, mtr); +} + +/****************************************************************//** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, and we cannot roll it back. */ +UNIV_INTERN +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx) /*!< in: transaction */ +{ + ut_ad(trx->is_recovered); + + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + trx->rseg = NULL; + trx->undo_no = 0; + trx->last_sql_stat_start.least_undo_no = 0; + + mutex_enter(&trx_sys->mutex); + + ut_a(!trx->read_only); + + UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx); + ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list)); + + assert_trx_in_rw_list(trx); + ut_d(trx->in_rw_trx_list = FALSE); + + trx->state = TRX_STATE_NOT_STARTED; + trx_release_descriptor(trx); + + mutex_exit(&trx_sys->mutex); + + /* Change the transaction state without mutex protection, now + that it no longer is in the trx_list. Recovered transactions + are never placed in the mysql_trx_list. */ + ut_ad(trx->is_recovered); + ut_ad(!trx->in_ro_trx_list); + ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_mysql_trx_list); +} + +/********************************************************************//** +Assigns a read view for a consistent read query. All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a new started transaction. +@return consistent read view */ +UNIV_INTERN +read_view_t* +trx_assign_read_view( +/*=================*/ + trx_t* trx) /*!< in: active transaction */ +{ + ut_ad(trx->state == TRX_STATE_ACTIVE); + + if (trx->read_view != NULL) { + return(trx->read_view); + } + + trx->read_view = read_view_open_now(trx->id, trx->prebuilt_view); + trx->global_read_view = trx->read_view; + + return(trx->read_view); +} + +/********************************************************************//** +Clones the read view from another transaction. All consistent reads within +the receiver transaction will get the same read view as the donor transaction +@return read view clone */ +UNIV_INTERN +read_view_t* +trx_clone_read_view( +/*================*/ + trx_t* trx, /*!< in: receiver transaction */ + trx_t* from_trx) /*!< in: donor transaction */ +{ + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + ut_ad(trx_mutex_own(from_trx)); + ut_ad(trx->read_view == NULL); + + if (from_trx->state != TRX_STATE_ACTIVE || + from_trx->read_view == NULL) { + + return(NULL); + } + + trx->read_view = read_view_clone(from_trx->read_view, + trx->prebuilt_view); + + read_view_add(trx->read_view); + + trx->global_read_view = trx->read_view; + + return(trx->read_view); +} + +/****************************************************************//** +Prepares a transaction for commit/rollback. 
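+A minimal caller sketch, mirroring trx_commit_step() further below:
+
+	trx_commit_or_rollback_prepare(trx);
+
+	trx->lock.que_state = TRX_QUE_COMMITTING;
+	trx_commit(trx);
+	trx->lock.que_state = TRX_QUE_RUNNING;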
*/ +UNIV_INTERN +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* We are reading trx->state without holding trx_sys->mutex + here, because the commit or rollback should be invoked for a + running (or recovered prepared) transaction that is associated + with the current thread. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + /* If the trx is in a lock wait state, moves the waiting + query thread to the suspended state */ + + if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + ulint sec; + ulint ms; + ib_uint64_t now; + + ut_a(trx->lock.wait_thr != NULL); + trx->lock.wait_thr->state = QUE_THR_SUSPENDED; + trx->lock.wait_thr = NULL; + + if (UNIV_UNLIKELY(trx->take_stats)) { + ut_usectime(&sec, &ms); + now = (ib_uint64_t)sec * 1000000 + ms; + trx->lock_que_wait_timer + += (ulint) + (now - trx->lock_que_wait_ustarted); + } + + trx->lock.que_state = TRX_QUE_RUNNING; + } + + ut_a(trx->lock.n_active_thrs == 1); + return; + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*********************************************************************//** +Creates a commit command node struct. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node))); + node->common.type = QUE_NODE_COMMIT; + node->state = COMMIT_NODE_SEND; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = COMMIT_NODE_SEND; + } + + if (node->state == COMMIT_NODE_SEND) { + trx_t* trx; + + node->state = COMMIT_NODE_WAIT; + + trx = thr_get_trx(thr); + + ut_a(trx->lock.wait_thr == NULL); + ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT); + + trx_commit_or_rollback_prepare(trx); + + trx->lock.que_state = TRX_QUE_COMMITTING; + + trx_commit(trx); + + ut_ad(trx->lock.wait_thr == NULL); + + trx->lock.que_state = TRX_QUE_RUNNING; + + thr = NULL; + } else { + ut_ad(node->state == COMMIT_NODE_WAIT); + + node->state = COMMIT_NODE_SEND; + + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* Because we do not do the commit by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + /* Update the info whether we should skip XA steps that eat + CPU time. + + For the duration of the transaction trx->support_xa is + not reread from thd so any changes in the value take + effect in the next transaction. 
This is to avoid a + scenario where some undo log records generated by a + transaction contain XA information and other undo log + records, generated by the same transaction do not. */ + trx->support_xa = thd_supports_xa(trx->mysql_thd); + + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); + + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + trx->op_info = "committing"; + trx_commit(trx); + MONITOR_DEC(MONITOR_TRX_ACTIVE); + trx->op_info = ""; + return(DB_SUCCESS); + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + ut_error; + return(DB_CORRUPTION); +} + +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +UNIV_INTERN +void +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(trx); + + if (!trx->must_flush_log_later + || thd_requested_durability(trx->mysql_thd) + == HA_IGNORE_DURABILITY) { + return; + } + + trx_flush_log_if_needed(trx->commit_lsn, trx); + + trx->must_flush_log_later = FALSE; +} + +/**********************************************************************//** +Marks the latest SQL statement ended. */ +UNIV_INTERN +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx) /*!< in: trx handle */ +{ + ut_a(trx); + + switch (trx->state) { + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + case TRX_STATE_NOT_STARTED: + trx->undo_no = 0; + /* fall through */ + case TRX_STATE_ACTIVE: + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + + if (trx->fts_trx) { + fts_savepoint_laststmt_refresh(trx); + } + + return; + } + + ut_error; +} + +/**********************************************************************//** +Prints info about a transaction. +Caller must hold trx_sys->mutex. */ +UNIV_INTERN +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: lock_number_of_rows_locked(&trx->lock) */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size) + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ +{ + ibool newline; + const char* op_info; + + ut_ad(mutex_own(&trx_sys->mutex)); + + fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id); + + /* trx->state cannot change from or to NOT_STARTED while we + are holding the trx_sys->mutex. It may change from ACTIVE to + PREPARED or COMMITTED. 
*/ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + fputs(", not started", f); + goto state_ok; + case TRX_STATE_ACTIVE: + fprintf(f, ", ACTIVE %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_PREPARED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_COMMITTED_IN_MEMORY: + fputs(", COMMITTED IN MEMORY", f); + goto state_ok; + } + fprintf(f, ", state %lu", (ulong) trx->state); + ut_ad(0); +state_ok: + + /* prevent a race condition */ + op_info = trx->op_info; + + if (*op_info) { + putc(' ', f); + fputs(op_info, f); + } + + if (trx->is_recovered) { + fputs(" recovered trx", f); + } + + if (trx->declared_to_be_inside_innodb) { + fprintf(f, ", thread declared inside InnoDB %lu", + (ulong) trx->n_tickets_to_enter_innodb); + } + + putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } + + newline = TRUE; + + /* trx->lock.que_state of an ACTIVE transaction may change + while we are not holding trx->mutex. We perform a dirty read + for performance reasons. */ + + switch (trx->lock.que_state) { + case TRX_QUE_RUNNING: + newline = FALSE; break; + case TRX_QUE_LOCK_WAIT: + fputs("LOCK WAIT ", f); break; + case TRX_QUE_ROLLING_BACK: + fputs("ROLLING BACK ", f); break; + case TRX_QUE_COMMITTING: + fputs("COMMITTING ", f); break; + default: + fprintf(f, "que state %lu ", (ulong) trx->lock.que_state); + } + + if (n_trx_locks > 0 || heap_size > 400) { + newline = TRUE; + + fprintf(f, "%lu lock struct(s), heap size %lu," + " %lu row lock(s)", + (ulong) n_trx_locks, + (ulong) heap_size, + (ulong) n_rec_locks); + } + + if (trx->has_search_latch) { + newline = TRUE; + fputs(", holds adaptive hash latch", f); + } + + if (trx->undo_no != 0) { + newline = TRUE; + fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no); + } + + if (newline) { + putc('\n', f); + } + + if (trx->mysql_thd != NULL) { + innobase_mysql_print_thd( + f, trx->mysql_thd, static_cast<uint>(max_query_len)); + } +} + +/**********************************************************************//** +Prints info about a transaction. +The caller must hold lock_sys->mutex and trx_sys->mutex. +When possible, use trx_print() instead. */ +UNIV_INTERN +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ut_ad(lock_mutex_own()); + ut_ad(mutex_own(&trx_sys->mutex)); + + trx_print_low(f, trx, max_query_len, + lock_number_of_rows_locked(&trx->lock), + UT_LIST_GET_LEN(trx->lock.trx_locks), + mem_heap_get_size(trx->lock.lock_heap)); +} + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys->mutex and trx_sys->mutex. 
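+A minimal usage sketch (the 600-byte query length cap is an arbitrary
+illustrative value):
+
+	trx_print(stderr, trx, 600);
+
+The caller needs no latches; both mutexes are taken and released inside.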
*/ +UNIV_INTERN +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ulint n_rec_locks; + ulint n_trx_locks; + ulint heap_size; + + lock_mutex_enter(); + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size = mem_heap_get_size(trx->lock.lock_heap); + lock_mutex_exit(); + + mutex_enter(&trx_sys->mutex); + trx_print_low(f, trx, max_query_len, + n_rec_locks, n_trx_locks, heap_size); + mutex_exit(&trx_sys->mutex); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Asserts that a transaction has been started. +The caller must hold trx_sys->mutex. +@return TRUE if started */ +UNIV_INTERN +ibool +trx_assert_started( +/*===============*/ + const trx_t* trx) /*!< in: transaction */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* Non-locking autocommits should not hold any locks and this + function is only called from the locking code. */ + assert_trx_in_list(trx); + + /* trx->state can change from or to NOT_STARTED while we are holding + trx_sys->mutex for non-locking autocommit selects but not for other + types of transactions. It may change from ACTIVE to PREPARED. Unless + we are holding lock_sys->mutex, it may also change to COMMITTED. */ + + switch (trx->state) { + case TRX_STATE_PREPARED: + return(TRUE); + + case TRX_STATE_ACTIVE: + case TRX_STATE_COMMITTED_IN_MEMORY: + return(TRUE); + + case TRX_STATE_NOT_STARTED: + break; + } + + ut_error; + return(FALSE); +} +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. +@return TRUE if weight(a) >= weight(b) */ +UNIV_INTERN +ibool +trx_weight_ge( +/*==========*/ + const trx_t* a, /*!< in: the first transaction to be compared */ + const trx_t* b) /*!< in: the second transaction to be compared */ +{ + ibool a_notrans_edit; + ibool b_notrans_edit; + + /* If mysql_thd is NULL for a transaction we assume that it has + not edited non-transactional tables. */ + + a_notrans_edit = a->mysql_thd != NULL + && thd_has_edited_nontrans_tables(a->mysql_thd); + + b_notrans_edit = b->mysql_thd != NULL + && thd_has_edited_nontrans_tables(b->mysql_thd); + + if (a_notrans_edit != b_notrans_edit) { + + return(a_notrans_edit); + } + + /* Either both had edited non-transactional tables or both had + not, we fall back to comparing the number of altered/locked + rows. */ + +#if 0 + fprintf(stderr, + "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n", + __func__, + a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks), + b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks)); +#endif + + return(TRX_WEIGHT(a) >= TRX_WEIGHT(b)); +} + +/****************************************************************//** +Prepares a transaction. */ +static +void +trx_prepare( +/*========*/ + trx_t* trx) /*!< in/out: transaction */ +{ + trx_rseg_t* rseg; + lsn_t lsn; + mtr_t mtr; + + rseg = trx->rseg; + /* Only fresh user transactions can be prepared. + Recovered transactions cannot. 
*/ + ut_a(!trx->is_recovered); + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mtr_start(&mtr); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the + file-based world, at the serialization point of lsn. */ + + mutex_enter(&rseg->mutex); + + if (trx->insert_undo != NULL) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. */ + + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + if (trx->update_undo) { + trx_undo_set_state_at_prepare( + trx, trx->update_undo, &mtr); + } + + mutex_exit(&rseg->mutex); + + /*--------------*/ + mtr_commit(&mtr); /* This mtr commit makes the + transaction prepared in the file-based + world */ + /*--------------*/ + lsn = mtr.end_lsn; + ut_ad(lsn); + } else { + lsn = 0; + } + + /*--------------------------------------*/ + ut_a(trx->state == TRX_STATE_ACTIVE); + mutex_enter(&trx_sys->mutex); + trx->state = TRX_STATE_PREPARED; + trx_sys->n_prepared_trx++; + mutex_exit(&trx_sys->mutex); + /*--------------------------------------*/ + + if (lsn) { + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + TODO: find out if MySQL holds some mutex when calling this. + That would spoil our group prepare algorithm. */ + + trx_flush_log_if_needed(lsn, trx); + } +} + +/**********************************************************************//** +Does the transaction prepare for MySQL. */ +UNIV_INTERN +void +trx_prepare_for_mysql( +/*==================*/ + trx_t* trx) /*!< in/out: trx handle */ +{ + trx_start_if_not_started_xa(trx); + + trx->op_info = "preparing"; + + trx_prepare(trx); + + trx->op_info = ""; +} + +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions stored in xid_list */ +UNIV_INTERN +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + ulint len) /*!< in: number of slots in xid_list */ +{ + const trx_t* trx; + ulint count = 0; + + ut_ad(xid_list); + ut_ad(len); + + /* We should set those transactions which are in the prepared state + to the xid_list */ + + mutex_enter(&trx_sys->mutex); + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_rw_list(trx); + + /* The state of a read-write transaction cannot change + from or to NOT_STARTED while we are holding the + trx_sys->mutex. It may change to PREPARED, but not if + trx->is_recovered. It may also change to COMMITTED. 
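+
+	A hedged sketch of how a caller pairs this function with
+	trx_get_trx_by_xid() below; the buffer size and the decision to
+	commit rather than roll back are illustrative assumptions:
+
+		XID	xid_list[16];
+		int	n = trx_recover_for_mysql(xid_list, 16);
+
+		for (int i = 0; i < n; i++) {
+			trx_t*	trx = trx_get_trx_by_xid(&xid_list[i]);
+
+			if (trx != NULL) {
+				trx_commit_for_mysql(trx);
+			}
+		}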
*/ + if (trx_state_eq(trx, TRX_STATE_PREPARED)) { + xid_list[count] = trx->xid; + + if (count == 0) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Starting recovery for" + " XA transactions...\n"); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction " TRX_ID_FMT " in" + " prepared state after recovery\n", + trx->id); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction contains changes" + " to " TRX_ID_FMT " rows\n", + trx->undo_no); + + count++; + + if (count == len) { + break; + } + } + } + + mutex_exit(&trx_sys->mutex); + + if (count > 0){ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: %d transactions in prepared state" + " after recovery\n", + int (count)); + } + + return(int (count)); +} + +/*******************************************************************//** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state +@return trx on match, the trx->xid will be invalidated; +note that the trx may have been committed, unless the caller is +holding lock_sys->mutex */ +static __attribute__((nonnull, warn_unused_result)) +trx_t* +trx_get_trx_by_xid_low( +/*===================*/ + const XID* xid) /*!< in: X/Open XA transaction + identifier */ +{ + trx_t* trx; + + ut_ad(mutex_own(&trx_sys->mutex)); + + for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + assert_trx_in_rw_list(trx); + + /* Compare two X/Open XA transaction id's: their + length should be the same and binary comparison + of gtrid_length+bqual_length bytes should be + the same */ + + if (trx->is_recovered + && trx_state_eq(trx, TRX_STATE_PREPARED) + && xid->gtrid_length == trx->xid.gtrid_length + && xid->bqual_length == trx->xid.bqual_length + && memcmp(xid->data, trx->xid.data, + xid->gtrid_length + xid->bqual_length) == 0) { + + /* Invalidate the XID, so that subsequent calls + will not find it. */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; + break; + } + } + + return(trx); +} + +/*******************************************************************//** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state +@return trx or NULL; on match, the trx->xid will be invalidated; +note that the trx may have been committed, unless the caller is +holding lock_sys->mutex */ +UNIV_INTERN +trx_t* +trx_get_trx_by_xid( +/*===============*/ + const XID* xid) /*!< in: X/Open XA transaction identifier */ +{ + trx_t* trx; + + if (xid == NULL) { + + return(NULL); + } + + mutex_enter(&trx_sys->mutex); + + /* Recovered/Resurrected transactions are always only on the + trx_sys_t::rw_trx_list. */ + trx = trx_get_trx_by_xid_low(xid); + + mutex_exit(&trx_sys->mutex); + + return(trx); +} + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_xa_low( +/*============================*/ + trx_t* trx) /*!< in: transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + + /* Update the info whether we should skip XA steps + that eat CPU time. + + For the duration of the transaction trx->support_xa is + not reread from thd so any changes in the value take + effect in the next transaction. This is to avoid a + scenario where some undo generated by a transaction, + has XA stuff, and other undo, generated by the same + transaction, doesn't. 
*/ + trx->support_xa = thd_supports_xa(trx->mysql_thd); + + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + return; + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_low( +/*=========================*/ + trx_t* trx) /*!< in: transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx); + /* fall through */ + case TRX_STATE_ACTIVE: + return; + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Starts the transaction for a DDL operation. */ +UNIV_INTERN +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op) /*!< in: dictionary operation type */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + /* Flag this transaction as a dictionary operation, so that + the data dictionary will be locked in crash recovery. */ + + trx_set_dict_operation(trx, op); + + /* Ensure it is not flagged as an auto-commit-non-locking + transation. */ + trx->will_lock = 1; + + trx->ddl = true; + + trx_start_low(trx); + return; + + case TRX_STATE_ACTIVE: + /* We have this start if not started idiom, therefore we + can't add stronger checks here. */ + trx->ddl = true; + + ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); + ut_ad(trx->will_lock > 0); + return; + case TRX_STATE_PREPARED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + diff --git a/storage/xtradb/trx/trx0undo.cc b/storage/xtradb/trx/trx0undo.cc new file mode 100644 index 00000000000..290271c6cab --- /dev/null +++ b/storage/xtradb/trx/trx0undo.cc @@ -0,0 +1,2026 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0undo.cc +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0undo.h" + +#ifdef UNIV_NONINL +#include "trx0undo.ic" +#endif + +#include "fsp0fsp.h" +#ifndef UNIV_HOTBACKUP +#include "mach0data.h" +#include "mtr0log.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0rec.h" +#include "trx0purge.h" +#include "srv0mon.h" + +/* How should the old versions in the history list be managed? 
+	----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+	However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+	A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+	When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may also have to
+purge old versions which might still be needed by some consistent read.
+How do we trigger the start of a purge? When a transaction writes to an
+undo log, it may notice that the space is running out. When a read view
+is closed, it may make some history superfluous. The server can have a
+utility which periodically checks if it can purge some history.
+	In a parallelized purge we have the problem that a query thread
+can remove a delete-marked clustered index record before another query
+thread has processed an earlier version of the record, which then cannot
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we also store in the update and delete-mark
+undo records the columns necessary to construct the secondary index
+entries which are modified.
+	We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+   -------------------------------------------------------------------
+latches?
+-------
+The contention on the trx_sys_t::mutex should be minimized. When a
+transaction does its first insert or modification in an index, an undo log
+is assigned for it. Then we must have an x-latch on the rollback segment
+header.
+	When the transaction makes more modifications or rolls back, the
+undo log is protected by the undo_mutex in the transaction.
+	When the transaction commits, its insert undo log is either reset and
+cached for fast reuse, or freed. In these cases we must have an x-latch on
+the rollback segment page. The update undo log is put to the history list. If
+it is not suitable for reuse, its slot in the rollback segment is reset. In
+both cases, an x-latch must be acquired on the rollback segment.
+	The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages.
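+	A condensed restatement of the rules above:
+
+		assign an undo log:	x-latch rollback segment header
+		further writes:		trx->undo_mutex
+		commit:			x-latch rollback segment page
+		purge, stepping:	s-latches on undo log pages
+		purge, truncate:	x-latches on the rollback segment
+					and on individual pages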
*/ +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Initializes the fields in an undo log segment page. */ +static +void +trx_undo_page_init( +/*===============*/ + page_t* undo_page, /*!< in: undo log segment page */ + ulint type, /*!< in: undo log segment type */ + mtr_t* mtr); /*!< in: mtr */ + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Creates and initializes an undo log memory object. +@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open XA transaction identification*/ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header byte offset on page */ +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! +@return undo log header byte offset on page */ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + page_t* undo_page, /*!< in/out: insert undo log segment + header page, x-latched */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. */ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /*!< in: header page of an undo log of size 1 */ + mtr_t* mtr); /*!< in: mtr */ + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Gets the previous record in an undo log from the previous page. +@return undo log record, the page s-latched, NULL if none */ +static +trx_undo_rec_t* +trx_undo_get_prev_rec_from_prev_page( +/*=================================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + bool shared, /*!< in: true=S-latch, false=X-latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + ulint prev_page_no; + page_t* prev_page; + page_t* undo_page; + + undo_page = page_align(rec); + + prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + + if (prev_page_no == FIL_NULL) { + + return(NULL); + } + + space = page_get_space_id(undo_page); + zip_size = fil_space_get_zip_size(space); + + buf_block_t* block = buf_page_get(space, zip_size, prev_page_no, + shared ? RW_S_LATCH : RW_X_LATCH, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + prev_page = buf_block_get_frame(block); + + return(trx_undo_page_get_last_rec(prev_page, page_no, offset)); +} + +/***********************************************************************//** +Gets the previous record in an undo log. 
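+A hedged call sketch (the mtr must be the same mini-transaction that
+latched the page holding rec; passing shared=true takes S-latches,
+false takes X-latches):
+
+	prev = trx_undo_get_prev_rec(rec, page_no, offset, true, &mtr);
+
+	if (prev == NULL) {
+		rec was the first record of this undo log
+	}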
+@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_prev_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + bool shared, /*!< in: true=S-latch, false=X-latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_undo_rec_t* prev_rec; + + prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset); + + if (prev_rec) { + + return(prev_rec); + } + + /* We have to go to the previous undo log page to look for the + previous record */ + + return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset, + shared, mtr)); +} + +/***********************************************************************//** +Gets the next record in an undo log from the next page. +@return undo log record, the page latched, NULL if none */ +static +trx_undo_rec_t* +trx_undo_get_next_rec_from_next_page( +/*=================================*/ + ulint space, /*!< in: undo log header space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* undo_page, /*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + ulint mode, /*!< in: latch mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_ulogf_t* log_hdr; + ulint next_page_no; + page_t* next_page; + ulint next; + + if (page_no == page_get_page_no(undo_page)) { + + log_hdr = undo_page + offset; + next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); + + if (next != 0) { + + return(NULL); + } + } + + next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + if (next_page_no == FIL_NULL) { + + return(NULL); + } + + if (mode == RW_S_LATCH) { + next_page = trx_undo_page_get_s_latched(space, zip_size, + next_page_no, mtr); + } else { + ut_ad(mode == RW_X_LATCH); + next_page = trx_undo_page_get(space, zip_size, + next_page_no, mtr); + } + + return(trx_undo_page_get_first_rec(next_page, page_no, offset)); +} + +/***********************************************************************//** +Gets the next record in an undo log. +@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_next_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint space; + ulint zip_size; + trx_undo_rec_t* next_rec; + + next_rec = trx_undo_page_get_next_rec(rec, page_no, offset); + + if (next_rec) { + return(next_rec); + } + + space = page_get_space_id(page_align(rec)); + zip_size = fil_space_get_zip_size(space); + + return(trx_undo_get_next_rec_from_next_page(space, zip_size, + page_align(rec), + page_no, offset, + RW_S_LATCH, mtr)); +} + +/***********************************************************************//** +Gets the first record in an undo log. 
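+A minimal forward-scan sketch combining this function with
+trx_undo_get_next_rec() above (error handling omitted; every page stays
+latched until the mtr commits):
+
+	mtr_start(&mtr);
+
+	rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
+				     RW_S_LATCH, &mtr);
+
+	while (rec != NULL) {
+		... process one undo record ...
+
+		rec = trx_undo_get_next_rec(rec, page_no, offset, &mtr);
+	}
+
+	mtr_commit(&mtr);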
+@return undo log record, the page latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_first_rec( +/*===================*/ + ulint space, /*!< in: undo log header space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + + if (mode == RW_S_LATCH) { + undo_page = trx_undo_page_get_s_latched(space, zip_size, + page_no, mtr); + } else { + undo_page = trx_undo_page_get(space, zip_size, page_no, mtr); + } + + rec = trx_undo_page_get_first_rec(undo_page, page_no, offset); + + if (rec) { + return(rec); + } + + return(trx_undo_get_next_rec_from_next_page(space, zip_size, + undo_page, page_no, offset, + mode, mtr)); +} + +/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/ + +/**********************************************************************//** +Writes the mtr log entry of an undo log page initialization. */ +UNIV_INLINE +void +trx_undo_page_init_log( +/*===================*/ + page_t* undo_page, /*!< in: undo log page */ + ulint type, /*!< in: undo log type */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr); + + mlog_catenate_ulint_compressed(mtr, type); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_page_init_log(undo_page,type,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses the redo log entry of an undo log page initialization. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_page_init( +/*=====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ulint type; + + ptr = mach_parse_compressed(ptr, end_ptr, &type); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + trx_undo_page_init(page, type, mtr); + } + + return(ptr); +} + +/********************************************************************//** +Initializes the fields in an undo log segment page. */ +static +void +trx_undo_page_init( +/*===============*/ + page_t* undo_page, /*!< in: undo log segment page */ + ulint type, /*!< in: undo log segment type */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + + fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG); + + trx_undo_page_init_log(undo_page, type, mtr); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Creates a new undo log segment in file. 
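+A hedged caller sketch (slot bookkeeping simplified; the rollback segment
+mutex must already be held, as the assertion below requires):
+
+	err = trx_undo_seg_create(rseg, rseg_hdr, TRX_UNDO_INSERT,
+				  &id, &undo_page, mtr);
+
+	if (err != DB_SUCCESS) {
+		either DB_TOO_MANY_CONCURRENT_TRXS (no free slot)
+		or DB_OUT_OF_FILE_SPACE (segment allocation failed)
+	}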
+@return DB_SUCCESS if page creation OK possible error codes are: +DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +trx_undo_seg_create( +/*================*/ + trx_rseg_t* rseg __attribute__((unused)),/*!< in: rollback segment */ + trx_rsegf_t* rseg_hdr,/*!< in: rollback segment header, page + x-latched */ + ulint type, /*!< in: type of the segment: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + ulint* id, /*!< out: slot index within rseg header */ + page_t** undo_page, + /*!< out: segment header page x-latched, NULL + if there was an error */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint slot_no; + ulint space; + buf_block_t* block; + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + ulint n_reserved; + ibool success; + dberr_t err = DB_SUCCESS; + + ut_ad(mtr && id && rseg_hdr); + ut_ad(mutex_own(&(rseg->mutex))); + + /* fputs(type == TRX_UNDO_INSERT + ? "Creating insert undo log segment\n" + : "Creating update undo log segment\n", stderr); */ + slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr); + + if (slot_no == ULINT_UNDEFINED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: cannot find a free slot for" + " an undo log. Do you have too\n" + "InnoDB: many active transactions" + " running concurrently?\n"); + + return(DB_TOO_MANY_CONCURRENT_TRXS); + } + + space = page_get_space_id(page_align(rseg_hdr)); + + success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, + mtr); + if (!success) { + + return(DB_OUT_OF_FILE_SPACE); + } + + /* Allocate a new file segment for the undo log */ + block = fseg_create_general(space, 0, + TRX_UNDO_SEG_HDR + + TRX_UNDO_FSEG_HEADER, TRUE, mtr); + + fil_space_release_free_extents(space, n_reserved); + + if (block == NULL) { + /* No space left */ + + return(DB_OUT_OF_FILE_SPACE); + } + + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + *undo_page = buf_block_get_frame(block); + + page_hdr = *undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = *undo_page + TRX_UNDO_SEG_HDR; + + trx_undo_page_init(*undo_page, type, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE, + MLOG_2BYTES, mtr); + + mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr); + + flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr); + + flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST, + page_hdr + TRX_UNDO_PAGE_NODE, mtr); + + trx_rsegf_set_nth_undo(rseg_hdr, slot_no, + page_get_page_no(*undo_page), mtr); + *id = slot_no; + + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); + + return(err); +} + +/**********************************************************************//** +Writes the mtr log entry of an undo log header initialization. */ +UNIV_INLINE +void +trx_undo_header_create_log( +/*=======================*/ + const page_t* undo_page, /*!< in: undo log header page */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr); + + mlog_catenate_ull_compressed(mtr, trx_id); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_header_create_log(undo_page,trx_id,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +Creates a new undo log header in file. NOTE that this function has its own +log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of +this function! 
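+The reason for the warning: crash recovery replays MLOG_UNDO_HDR_CREATE
+by calling this same function from trx_undo_parse_page_header(), so the
+bytes it writes must stay identical between runtime and recovery.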
+@return header byte offset on page */ +static +ulint +trx_undo_header_create( +/*===================*/ + page_t* undo_page, /*!< in/out: undo log segment + header page, x-latched; it is + assumed that there is + TRX_UNDO_LOG_XA_HDR_SIZE bytes + free space on it */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint prev_log; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + + if (prev_log != 0) { + prev_log_hdr = undo_page + prev_log; + + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free); + } + + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free); + + log_hdr = undo_page + free; + + mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE); + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); + mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log); + + /* Write the log record about the header creation */ + trx_undo_header_create_log(undo_page, trx_id, mtr); + + return(free); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Write X/Open XA Transaction Identification (XID) to undo log header */ +static +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/*!< in: undo log header */ + const XID* xid, /*!< in: X/Open XA Transaction Identification */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, + (ulint) xid->formatID, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN, + (ulint) xid->gtrid_length, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, + (ulint) xid->bqual_length, MLOG_4BYTES, mtr); + + mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data, + XIDDATASIZE, mtr); +} + +/********************************************************************//** +Read X/Open XA Transaction Identification (XID) from undo log header */ +static +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/*!< in: undo log header */ + XID* xid) /*!< out: X/Open XA Transaction Identification */ +{ + xid->formatID = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); + + xid->gtrid_length + = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN); + xid->bqual_length + = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN); + + memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE); +} + +/***************************************************************//** +Adds space for the XA XID after an undo log old-style header. 
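+A worked illustration of the offset arithmetic in the function below: if
+the old-style header begins at byte offset hdr on the page, then on entry
+free == hdr + TRX_UNDO_LOG_OLD_HDR_SIZE (this is asserted), and the three
+fields TRX_UNDO_PAGE_START, TRX_UNDO_PAGE_FREE and TRX_UNDO_LOG_START are
+all advanced to
+
+	new_free = hdr + TRX_UNDO_LOG_XA_HDR_SIZE
+
+so the bytes between the two offsets become the XID fields.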
*/ +static +void +trx_undo_header_add_space_for_xid( +/*==============================*/ + page_t* undo_page,/*!< in: undo log segment header page */ + trx_ulogf_t* log_hdr,/*!< in: undo log header */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + ulint free; + ulint new_free; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + /* free is now the end offset of the old style undo log header */ + + ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE); + + new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE + - TRX_UNDO_LOG_OLD_HDR_SIZE); + + /* Add space for a XID after the header, update the free offset + fields on the undo log page and in the undo log header */ + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free, + MLOG_2BYTES, mtr); +} + +/**********************************************************************//** +Writes the mtr log entry of an undo log header reuse. */ +UNIV_INLINE +void +trx_undo_insert_header_reuse_log( +/*=============================*/ + const page_t* undo_page, /*!< in: undo log header page */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr); + + mlog_catenate_ull_compressed(mtr, trx_id); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_insert_header_reuse_log(undo_page,trx_id,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses the redo log entry of an undo log page header create or reuse. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_page_header( +/*=======================*/ + ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + trx_id_t trx_id; + /* Silence a GCC warning about possibly uninitialized variable + when mach_ull_parse_compressed() is not inlined. */ + ut_d(trx_id = 0); + /* Declare the variable uninitialized in Valgrind, so that the + above initialization will not mask any bugs. */ + UNIV_MEM_INVALID(&trx_id, sizeof trx_id); + + ptr = mach_ull_parse_compressed(ptr, end_ptr, &trx_id); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + if (type == MLOG_UNDO_HDR_CREATE) { + trx_undo_header_create(page, trx_id, mtr); + } else { + ut_ad(type == MLOG_UNDO_HDR_REUSE); + trx_undo_insert_header_reuse(page, trx_id, mtr); + } + } + + return(ptr); +} + +/***************************************************************//** +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! 
+@return undo log header byte offset on page */ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + page_t* undo_page, /*!< in/out: insert undo log segment + header page, x-latched */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + /* Insert undo data is not needed after commit: we may free all + the space on the page */ + + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_INSERT); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + log_hdr = undo_page + free; + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + /* Write the log record MLOG_UNDO_HDR_REUSE */ + trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr); + + return(free); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Writes the redo log entry of an update undo log header discard. */ +UNIV_INLINE +void +trx_undo_discard_latest_log( +/*========================*/ + page_t* undo_page, /*!< in: undo log header page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr); +} +#else /* !UNIV_HOTBACKUP */ +# define trx_undo_discard_latest_log(undo_page, mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses the redo log entry of an undo log page header discard. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_discard_latest( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), /*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(end_ptr); + + if (page) { + trx_undo_discard_latest_update_undo(page, mtr); + } + + return(ptr); +} + +/**********************************************************************//** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. 
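
trx_undo_parse_page_header() and trx_undo_parse_discard_latest() above follow the common InnoDB redo-parsing contract: consume bytes between ptr and end_ptr, return the advanced pointer, and return NULL when the record is incomplete so recovery can wait for more log before touching the page. The contract in miniature, with a fixed 2-byte payload standing in for the compressed trx id of the real record (a sketch, not the actual record format):

    #include <stddef.h>

    typedef unsigned char byte;

    /* Parse a record consisting of one 16-bit big-endian value.
    Returns the position after the record, or NULL if the buffer
    ends before the record does. */
    static byte*
    parse_u16_record(byte* ptr, byte* end_ptr, unsigned* val)
    {
        if (end_ptr < ptr + 2) {
            return(NULL);  /* incomplete: retry with more log */
        }

        *val = ((unsigned) ptr[0] << 8) | ptr[1];

        return(ptr + 2);  /* consumed exactly the record's bytes */
    }
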
*/ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /*!< in: header page of an undo log of size 1 */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint free; + ulint prev_hdr_offset; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + log_hdr = undo_page + free; + + prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG); + + if (prev_hdr_offset != 0) { + prev_log_hdr = undo_page + prev_hdr_offset; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + mach_read_from_2(prev_log_hdr + + TRX_UNDO_LOG_START)); + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0); + } + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED); + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset); + + trx_undo_discard_latest_log(undo_page, mtr); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Tries to add a page to the undo log segment where the undo log is placed. +@return X-latched block if success, else NULL */ +UNIV_INTERN +buf_block_t* +trx_undo_add_page( +/*==============*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory object */ + mtr_t* mtr) /*!< in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + buf_block_t* new_block; + page_t* new_page; + trx_rseg_t* rseg; + ulint n_reserved; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&(trx->rseg->mutex))); + + rseg = trx->rseg; + + if (rseg->curr_size == rseg->max_size) { + + return(NULL); + } + + header_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + if (!fsp_reserve_free_extents(&n_reserved, undo->space, 1, + FSP_UNDO, mtr)) { + + return(NULL); + } + + new_block = fseg_alloc_free_page_general( + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + header_page, + undo->top_page_no + 1, FSP_UP, TRUE, mtr, mtr); + + fil_space_release_free_extents(undo->space, n_reserved); + + if (new_block == NULL) { + + /* No space left */ + + return(NULL); + } + + ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); + buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE); + undo->last_page_no = buf_block_get_page_no(new_block); + + new_page = buf_block_get_frame(new_block); + + trx_undo_page_init(new_page, undo->type, mtr); + + flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + undo->size++; + rseg->curr_size++; + + return(new_block); +} + +/********************************************************************//** +Frees an undo log page that is not the header page. 
+@return last page number in remaining log */ +static +ulint +trx_undo_free_page( +/*===============*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ibool in_history, /*!< in: TRUE if the undo log is in the history + list */ + ulint space, /*!< in: space */ + ulint hdr_page_no, /*!< in: header page number */ + ulint page_no, /*!< in: page number to free: must not be the + header page */ + mtr_t* mtr) /*!< in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + page_t* undo_page; + fil_addr_t last_addr; + trx_rsegf_t* rseg_header; + ulint hist_size; + ulint zip_size; + + ut_a(hdr_page_no != page_no); + ut_ad(mutex_own(&(rseg->mutex))); + + zip_size = rseg->zip_size; + + undo_page = trx_undo_page_get(space, zip_size, page_no, mtr); + + header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr); + + flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + + fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, + space, page_no, mtr); + + last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR + + TRX_UNDO_PAGE_LIST, mtr); + rseg->curr_size--; + + if (in_history) { + rseg_header = trx_rsegf_get(space, zip_size, + rseg->page_no, mtr); + + hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr); + ut_ad(hist_size > 0); + mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size - 1, MLOG_4BYTES, mtr); + } + + return(last_addr.page); +} + +/********************************************************************//** +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN +void +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ +{ + ut_ad(mutex_own(&trx->undo_mutex)); + ut_ad(undo->hdr_page_no != undo->last_page_no); + ut_ad(undo->size > 0); + + undo->last_page_no = trx_undo_free_page( + undo->rseg, FALSE, undo->space, + undo->hdr_page_no, undo->last_page_no, mtr); + + undo->size--; +} + +/********************************************************************//** +Empties an undo log header page of undo records for that undo log. Other +undo logs may still have records on that page, if it is an update undo log. */ +static +void +trx_undo_empty_header_page( +/*=======================*/ + ulint space, /*!< in: space */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint hdr_page_no, /*!< in: header page number */ + ulint hdr_offset, /*!< in: header offset */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* header_page; + trx_ulogf_t* log_hdr; + ulint end; + + header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr); + + log_hdr = header_page + hdr_offset; + + end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr); +} + +/***********************************************************************//** +Truncates an undo log from the end. This function is used during a rollback +to free space from an undo log. 
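
In trx_undo_truncate_end_func() below, records are scanned from the end of the last page towards its start, because undo numbers increase towards the page end; whole trailing pages are freed with trx_undo_free_last_page(), and the final surviving page is merely cut at the first record below the limit. The invariant is easiest to see on a flat array (a sketch; real records are variable-length and span pages):

    #include <stddef.h>

    /* Drop every trailing element whose undo number is >= limit.
    Returns the new element count. */
    static size_t
    truncate_end(const unsigned long* undo_nos, size_t n, unsigned long limit)
    {
        /* undo numbers grow towards the end, so trim from the tail
        until the first record below the limit survives */
        while (n > 0 && undo_nos[n - 1] >= limit) {
            n--;
        }

        return(n);
    }
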
*/ +UNIV_INTERN +void +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in: undo log */ + undo_no_t limit) /*!< in: all undo records with undo number + >= this value should be truncated */ +{ + page_t* undo_page; + ulint last_page_no; + trx_undo_rec_t* rec; + trx_undo_rec_t* trunc_here; + mtr_t mtr; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&(trx->rseg->mutex))); + + for (;;) { + mtr_start(&mtr); + + trunc_here = NULL; + + last_page_no = undo->last_page_no; + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + last_page_no, &mtr); + + rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no, + undo->hdr_offset); + while (rec) { + if (trx_undo_rec_get_undo_no(rec) >= limit) { + /* Truncate at least this record off, maybe + more */ + trunc_here = rec; + } else { + goto function_exit; + } + + rec = trx_undo_page_get_prev_rec(rec, + undo->hdr_page_no, + undo->hdr_offset); + } + + if (last_page_no == undo->hdr_page_no) { + + goto function_exit; + } + + ut_ad(last_page_no == undo->last_page_no); + trx_undo_free_last_page(trx, undo, &mtr); + + mtr_commit(&mtr); + } + +function_exit: + if (trunc_here) { + mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE, + trunc_here - undo_page, MLOG_2BYTES, &mtr); + } + + mtr_commit(&mtr); +} + +/***********************************************************************//** +Truncates an undo log from the start. This function is used during a purge +operation. */ +UNIV_INTERN +void +trx_undo_truncate_start( +/*====================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ulint space, /*!< in: space id of the log */ + ulint hdr_page_no, /*!< in: header page number */ + ulint hdr_offset, /*!< in: header offset on the page */ + undo_no_t limit) /*!< in: all undo pages with + undo numbers < this value + should be truncated; NOTE that + the function only frees whole + pages; the header page is not + freed, but emptied, if all the + records there are < limit */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + trx_undo_rec_t* last_rec; + ulint page_no; + mtr_t mtr; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (!limit) { + + return; + } +loop: + mtr_start(&mtr); + + rec = trx_undo_get_first_rec(space, rseg->zip_size, + hdr_page_no, hdr_offset, + RW_X_LATCH, &mtr); + if (rec == NULL) { + /* Already empty */ + + mtr_commit(&mtr); + + return; + } + + undo_page = page_align(rec); + + last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no, + hdr_offset); + if (trx_undo_rec_get_undo_no(last_rec) >= limit) { + + mtr_commit(&mtr); + + return; + } + + page_no = page_get_page_no(undo_page); + + if (page_no == hdr_page_no) { + trx_undo_empty_header_page(space, rseg->zip_size, + hdr_page_no, hdr_offset, + &mtr); + } else { + trx_undo_free_page(rseg, TRUE, space, hdr_page_no, + page_no, &mtr); + } + + mtr_commit(&mtr); + + goto loop; +} + +/**********************************************************************//** +Frees an undo log segment which is not in the history list. 
*/ +static +void +trx_undo_seg_free( +/*==============*/ + trx_undo_t* undo) /*!< in: undo log */ +{ + trx_rseg_t* rseg; + fseg_header_t* file_seg; + trx_rsegf_t* rseg_header; + trx_usegf_t* seg_header; + ibool finished; + mtr_t mtr; + + rseg = undo->rseg; + + do { + + mtr_start(&mtr); + + mutex_enter(&(rseg->mutex)); + + seg_header = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, + &mtr) + TRX_UNDO_SEG_HDR; + + file_seg = seg_header + TRX_UNDO_FSEG_HEADER; + + finished = fseg_free_step(file_seg, &mtr); + + if (finished) { + /* Update the rseg header */ + rseg_header = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, + &mtr); + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, + &mtr); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED); + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } while (!finished); +} + +/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/ + +/********************************************************************//** +Creates and initializes an undo log memory object according to the values +in the header in file, when the database is started. The memory object is +inserted in the appropriate list of rseg. +@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create_at_db_start( +/*============================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + ulint page_no,/*!< in: undo log segment page number */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* undo_page; + trx_upagef_t* page_header; + trx_usegf_t* seg_header; + trx_ulogf_t* undo_header; + trx_undo_t* undo; + ulint type; + ulint state; + trx_id_t trx_id; + ulint offset; + fil_addr_t last_addr; + page_t* last_page; + trx_undo_rec_t* rec; + XID xid; + ibool xid_exists = FALSE; + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + page_no, mtr); + + page_header = undo_page + TRX_UNDO_PAGE_HDR; + + type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES, + mtr); + seg_header = undo_page + TRX_UNDO_SEG_HDR; + + state = mach_read_from_2(seg_header + TRX_UNDO_STATE); + + offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG); + + undo_header = undo_page + offset; + + trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID); + + xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS, + MLOG_1BYTE, mtr); + + /* Read X/Open XA transaction identification if it exists, or + set it to NULL. */ + + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; + + if (xid_exists == TRUE) { + trx_undo_read_xid(undo_header, &xid); + } + + mutex_enter(&(rseg->mutex)); + + undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, + page_no, offset); + mutex_exit(&(rseg->mutex)); + + undo->dict_operation = mtr_read_ulint( + undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr); + + undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID); + undo->state = state; + undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + /* If the log segment is being freed, the page list is inconsistent! 
*/ + if (state == TRX_UNDO_TO_FREE) { + + goto add_to_list; + } + + last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + undo->last_page_no = last_addr.page; + undo->top_page_no = last_addr.page; + + last_page = trx_undo_page_get(rseg->space, rseg->zip_size, + undo->last_page_no, mtr); + + rec = trx_undo_page_get_last_rec(last_page, page_no, offset); + + if (rec == NULL) { + undo->empty = TRUE; + } else { + undo->empty = FALSE; + undo->top_offset = rec - last_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(rec); + } +add_to_list: + if (type == TRX_UNDO_INSERT) { + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached, + undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + } + } else { + ut_ad(type == TRX_UNDO_UPDATE); + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached, + undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + } + } + + return(undo); +} + +/********************************************************************//** +Initializes the undo log lists for a rollback segment memory copy. This +function is only called when the database is started or a new rollback +segment is created. +@return the combined size of undo log segments in pages */ +UNIV_INTERN +ulint +trx_undo_lists_init( +/*================*/ + trx_rseg_t* rseg) /*!< in: rollback segment memory object */ +{ + ulint size = 0; + trx_rsegf_t* rseg_header; + ulint i; + mtr_t mtr; + + UT_LIST_INIT(rseg->update_undo_list); + UT_LIST_INIT(rseg->update_undo_cached); + UT_LIST_INIT(rseg->insert_undo_list); + UT_LIST_INIT(rseg->insert_undo_cached); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get_new( + rseg->space, rseg->zip_size, rseg->page_no, &mtr); + + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + ulint page_no; + + page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr); + + /* In forced recovery: try to avoid operations which look + at database pages; undo logs are rapidly changing data, and + the probability that they are in an inconsistent state is + high */ + + if (page_no != FIL_NULL + && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + + trx_undo_t* undo; + + undo = trx_undo_mem_create_at_db_start( + rseg, i, page_no, &mtr); + + size += undo->size; + + mtr_commit(&mtr); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, + &mtr); + + /* Found a used slot */ + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); + } + } + + mtr_commit(&mtr); + + return(size); +} + +/********************************************************************//** +Creates and initializes an undo log memory object. 
+@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open transaction identification */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header byte offset on page */ +{ + trx_undo_t* undo; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo = static_cast<trx_undo_t*>(mem_alloc(sizeof(*undo))); + + if (undo == NULL) { + + return(NULL); + } + + undo->id = id; + undo->type = type; + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->rseg = rseg; + + undo->space = rseg->space; + undo->zip_size = rseg->zip_size; + undo->hdr_page_no = page_no; + undo->hdr_offset = offset; + undo->last_page_no = page_no; + undo->size = 1; + + undo->empty = TRUE; + undo->top_page_no = page_no; + undo->guess_block = NULL; + + return(undo); +} + +/********************************************************************//** +Initializes a cached undo log object for new use. */ +static +void +trx_undo_mem_init_for_reuse( +/*========================*/ + trx_undo_t* undo, /*!< in: undo log to init */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open XA transaction identification*/ + ulint offset) /*!< in: undo log header byte offset on page */ +{ + ut_ad(mutex_own(&((undo->rseg)->mutex))); + + if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + + mem_analyze_corruption(undo); + ut_error; + } + + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->hdr_offset = offset; + undo->empty = TRUE; +} + +/********************************************************************//** +Frees an undo log memory copy. */ +UNIV_INTERN +void +trx_undo_mem_free( +/*==============*/ + trx_undo_t* undo) /*!< in: the undo object to be freed */ +{ + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); + ut_error; + } + + mem_free(undo); +} + +/**********************************************************************//** +Creates a new undo log. 
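
trx_undo_create() below is the slow path: trx_undo_assign_undo(), further down, first calls trx_undo_reuse_cached() and only creates a fresh log when the cache list is empty. Reduced to a free list of fixed-size objects, the reuse-or-allocate shape looks like this (all names illustrative):

    #include <stdlib.h>

    struct obj {
        struct obj* next;
        int payload;
    };

    static struct obj* cache_head;  /* like rseg->insert_undo_cached */

    static struct obj*
    obj_assign(void)
    {
        struct obj* o = cache_head;

        if (o != NULL) {
            /* reuse a cached object, re-initialising its state */
            cache_head = o->next;
            o->next = NULL;
            o->payload = 0;
        } else {
            /* cache empty: allocate afresh; NULL here is the
            analogue of DB_OUT_OF_MEMORY */
            o = (struct obj*) calloc(1, sizeof(*o));
        }

        return(o);
    }

    static void
    obj_release(struct obj* o)
    {
        /* a finished object goes back on the cache list, like an
        undo log left in the TRX_UNDO_CACHED state */
        o->next = cache_head;
        cache_head = o;
    }
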
+@return DB_SUCCESS if successful in creating the new undo lob object, +possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS +DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +trx_undo_create( +/*============*/ + trx_t* trx, /*!< in: transaction */ + trx_rseg_t* rseg, /*!< in: rollback segment memory copy */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open transaction identification*/ + trx_undo_t** undo, /*!< out: the new undo log object, undefined + * if did not succeed */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_rsegf_t* rseg_header; + ulint page_no; + ulint offset; + ulint id; + page_t* undo_page; + dberr_t err; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (rseg->curr_size == rseg->max_size) { + + return(DB_OUT_OF_FILE_SPACE); + } + + rseg->curr_size++; + + rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, rseg->page_no, + mtr); + + err = trx_undo_seg_create(rseg, rseg_header, type, &id, + &undo_page, mtr); + + if (err != DB_SUCCESS) { + /* Did not succeed */ + + rseg->curr_size--; + + return(err); + } + + page_no = page_get_page_no(undo_page); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } + + *undo = trx_undo_mem_create(rseg, id, type, trx_id, xid, + page_no, offset); + if (*undo == NULL) { + + err = DB_OUT_OF_MEMORY; + } + + return(err); +} + +/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/ + +/********************************************************************//** +Reuses a cached undo log. +@return the undo log memory object, NULL if none cached */ +static +trx_undo_t* +trx_undo_reuse_cached( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is used */ + const XID* xid, /*!< in: X/Open XA transaction identification */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_undo_t* undo; + page_t* undo_page; + ulint offset; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (type == TRX_UNDO_INSERT) { + + undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + } else { + ut_ad(type == TRX_UNDO_UPDATE); + + undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + } + + ut_ad(undo->size == 1); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + if (type == TRX_UNDO_INSERT) { + offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid( + undo_page, undo_page + offset, mtr); + } + } else { + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_UPDATE); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + 
trx_undo_header_add_space_for_xid( + undo_page, undo_page + offset, mtr); + } + } + + trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); + + return(undo); +} + +/**********************************************************************//** +Marks an undo log header as a header of a data dictionary operation +transaction. */ +static +void +trx_undo_mark_as_dict_operation( +/*============================*/ + trx_t* trx, /*!< in: dict op transaction */ + trx_undo_t* undo, /*!< in: assigned undo log */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* hdr_page; + + hdr_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + ut_error; + case TRX_DICT_OP_INDEX: + /* Do not discard the table on recovery. */ + undo->table_id = 0; + break; + case TRX_DICT_OP_TABLE: + undo->table_id = trx->table_id; + break; + } + + mlog_write_ulint(hdr_page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + TRUE, MLOG_1BYTE, mtr); + + mlog_write_ull(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID, + undo->table_id, mtr); + + undo->dict_operation = TRUE; +} + +/**********************************************************************//** +Assigns an undo log for a transaction. A new undo log is created or a cached +undo log reused. +@return DB_SUCCESS if undo log assign successful, possible error codes +are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY +DB_OUT_OF_MEMORY */ +UNIV_INTERN +dberr_t +trx_undo_assign_undo( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + mtr_t mtr; + dberr_t err = DB_SUCCESS; + + ut_ad(trx); + + if (trx->rseg == NULL) { + return(DB_READ_ONLY); + } + + rseg = trx->rseg; + + ut_ad(mutex_own(&(trx->undo_mutex))); + + mtr_start(&mtr); + + mutex_enter(&rseg->mutex); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_too_many_trx", + err = DB_TOO_MANY_CONCURRENT_TRXS; + goto func_exit; + ); + + undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid, + &mtr); + if (undo == NULL) { + err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid, + &undo, &mtr); + if (err != DB_SUCCESS) { + + goto func_exit; + } + } + + if (type == TRX_UNDO_INSERT) { + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo); + ut_ad(trx->insert_undo == NULL); + trx->insert_undo = undo; + } else { + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo); + ut_ad(trx->update_undo == NULL); + trx->update_undo = undo; + } + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + trx_undo_mark_as_dict_operation(trx, undo, &mtr); + } + +func_exit: + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return(err); +} + +/******************************************************************//** +Sets the state of the undo log segment at a transaction finish. 
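
trx_undo_set_state_at_finish() below chooses among three terminal states: a one-page segment that is still mostly empty is kept for reuse (TRX_UNDO_CACHED); an insert undo log, whose data is worthless after commit, can be freed outright (TRX_UNDO_TO_FREE); an update undo log must survive until purge (TRX_UNDO_TO_PURGE). The decision in isolation (the reuse limit here is a placeholder, not the on-page constant):

    enum undo_state { UNDO_CACHED, UNDO_TO_FREE, UNDO_TO_PURGE };
    enum undo_type { UNDO_INSERT, UNDO_UPDATE };

    #define PAGE_REUSE_LIMIT 4096  /* placeholder for TRX_UNDO_PAGE_REUSE_LIMIT */

    static enum undo_state
    state_at_finish(unsigned n_pages, unsigned free_offset, enum undo_type type)
    {
        if (n_pages == 1 && free_offset < PAGE_REUSE_LIMIT) {
            return(UNDO_CACHED);  /* small: keep the segment for reuse */
        }

        return(type == UNDO_INSERT ? UNDO_TO_FREE : UNDO_TO_PURGE);
    }
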
+@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_finish( +/*=========================*/ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + page_t* undo_page; + ulint state; + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + if (undo->size == 1 + && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE) + < TRX_UNDO_PAGE_REUSE_LIMIT) { + + state = TRX_UNDO_CACHED; + + } else if (undo->type == TRX_UNDO_INSERT) { + + state = TRX_UNDO_TO_FREE; + } else { + state = TRX_UNDO_TO_PURGE; + } + + undo->state = state; + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr); + + return(undo_page); +} + +/******************************************************************//** +Sets the state of the undo log segment at a transaction prepare. +@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_ulogf_t* undo_header; + page_t* undo_page; + ulint offset; + + ut_ad(trx && undo && mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + /*------------------------------*/ + undo->state = TRX_UNDO_PREPARED; + undo->xid = trx->xid; + /*------------------------------*/ + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state, + MLOG_2BYTES, mtr); + + offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + undo_header = undo_page + offset; + + mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS, + TRUE, MLOG_1BYTE, mtr); + + trx_undo_write_xid(undo_header, &undo->xid, mtr); + + return(undo_page); +} + +/**********************************************************************//** +Adds the update undo log header as the first in the history list, and +frees the memory object, or puts it to the list of cached update undo log +segments. */ +UNIV_INTERN +void +trx_undo_update_cleanup( +/*====================*/ + trx_t* trx, /*!< in: trx owning the update undo log */ + page_t* undo_page, /*!< in: update undo log header page, + x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + + undo = trx->update_undo; + rseg = trx->rseg; + + ut_ad(mutex_own(&(rseg->mutex))); + + trx_purge_add_update_undo_to_history(trx, undo_page, mtr); + + UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo); + + trx->update_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); + + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + } else { + ut_ad(undo->state == TRX_UNDO_TO_PURGE); + + trx_undo_mem_free(undo); + } +} + +/******************************************************************//** +Frees or caches an insert undo log after a transaction commit or rollback. 
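
trx_undo_insert_cleanup() below releases rseg->mutex around the slow trx_undo_seg_free() call and takes it again before adjusting rseg->curr_size, so other transactions are not stalled while file pages are returned. The drop-and-relock shape, sketched with POSIX threads (InnoDB uses its own mutex type; the names are illustrative):

    #include <pthread.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static long curr_size;

    static void
    slow_free(long n)
    {
        (void) n;  /* stands in for the multi-page segment free */
    }

    static void
    cleanup(long size)
    {
        pthread_mutex_lock(&m);
        /* ... unlink the object from the shared lists ... */
        pthread_mutex_unlock(&m);  /* drop the lock for the slow part */

        slow_free(size);

        pthread_mutex_lock(&m);
        curr_size -= size;         /* shared counter: lock needed again */
        pthread_mutex_unlock(&m);
    }
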
+Knowledge of inserts is not needed after a commit or rollback, therefore +the data can be discarded. */ +UNIV_INTERN +void +trx_undo_insert_cleanup( +/*====================*/ + trx_t* trx) /*!< in: transaction handle */ +{ + trx_undo_t* undo; + trx_rseg_t* rseg; + + undo = trx->insert_undo; + ut_ad(undo); + + rseg = trx->rseg; + + mutex_enter(&(rseg->mutex)); + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo); + trx->insert_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo); + + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + } else { + ut_ad(undo->state == TRX_UNDO_TO_FREE); + + /* Delete first the undo log segment in the file */ + + mutex_exit(&(rseg->mutex)); + + trx_undo_seg_free(undo); + + mutex_enter(&(rseg->mutex)); + + ut_ad(rseg->curr_size > undo->size); + + rseg->curr_size -= undo->size; + + trx_undo_mem_free(undo); + } + + mutex_exit(&(rseg->mutex)); +} + +/********************************************************************//** +At shutdown, frees the undo logs of a PREPARED transaction. */ +UNIV_INTERN +void +trx_undo_free_prepared( +/*===================*/ + trx_t* trx) /*!< in/out: PREPARED transaction */ +{ + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + + if (trx->update_undo) { + ut_a(trx->update_undo->state == TRX_UNDO_PREPARED); + UT_LIST_REMOVE(undo_list, trx->rseg->update_undo_list, + trx->update_undo); + trx_undo_mem_free(trx->update_undo); + } + if (trx->insert_undo) { + ut_a(trx->insert_undo->state == TRX_UNDO_PREPARED); + UT_LIST_REMOVE(undo_list, trx->rseg->insert_undo_list, + trx->insert_undo); + trx_undo_mem_free(trx->insert_undo); + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/usr/usr0sess.cc b/storage/xtradb/usr/usr0sess.cc new file mode 100644 index 00000000000..ab7ba6bea09 --- /dev/null +++ b/storage/xtradb/usr/usr0sess.cc @@ -0,0 +1,68 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file usr/usr0sess.cc +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ + +#include "usr0sess.h" + +#ifdef UNIV_NONINL +#include "usr0sess.ic" +#endif + +#include "trx0trx.h" + +/*********************************************************************//** +Opens a session. 
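
A related note on trx_undo_seg_free(), defined further up and called from trx_undo_insert_cleanup() above: the file segment is released one fseg_free_step() at a time, each step wrapped in its own mini-transaction, so that no single mtr touches too many pages or holds the rollback segment mutex for long. The control pattern on its own (free_one_step() stands in for fseg_free_step(); the mtr and mutex calls are modelled as comments):

    #include <stdbool.h>

    struct segment {
        int pages_left;
    };

    /* Free at most one page; return true when the segment is gone.
    Mimics the contract of fseg_free_step(). */
    static bool
    free_one_step(struct segment* seg)
    {
        if (seg->pages_left > 0) {
            seg->pages_left--;
        }

        return(seg->pages_left == 0);
    }

    static void
    seg_free(struct segment* seg)
    {
        bool finished;

        do {
            /* mtr_start() and mutex_enter() would go here */
            finished = free_one_step(seg);
            /* mutex_exit() and mtr_commit() would go here */
        } while (!finished);
    }
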
+@return own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void)
+/*===========*/
+{
+ sess_t* sess;
+
+ sess = static_cast<sess_t*>(mem_zalloc(sizeof(*sess)));
+
+ sess->state = SESS_ACTIVE;
+
+ sess->trx = trx_allocate_for_background();
+ sess->trx->sess = sess;
+
+ UT_LIST_INIT(sess->graphs);
+
+ return(sess);
+}
+
+/*********************************************************************//**
+Closes a session, freeing the memory occupied by it. */
+UNIV_INTERN
+void
+sess_close(
+/*=======*/
+ sess_t* sess) /*!< in, own: session object */
+{
+ ut_a(UT_LIST_GET_LEN(sess->graphs) == 0);
+
+ trx_free_for_background(sess->trx);
+ mem_free(sess);
+} diff --git a/storage/xtradb/ut/ut0bh.cc b/storage/xtradb/ut/ut0bh.cc new file mode 100644 index 00000000000..1a3038a0d71 --- /dev/null +++ b/storage/xtradb/ut/ut0bh.cc @@ -0,0 +1,159 @@ +/***************************************************************************//**
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file ut/ut0bh.cc
+Binary min-heap implementation.
+
+Created 2010-05-28 by Sunny Bains
+*******************************************************/
+
+#include "ut0bh.h"
+#include "ut0mem.h"
+
+#ifdef UNIV_NONINL
+#include "ut0bh.ic"
+#endif
+
+#include <string.h>
+
+/**********************************************************************//**
+Create a binary heap.
+@return a new binary heap */
+UNIV_INTERN
+ib_bh_t*
+ib_bh_create(
+/*=========*/
+ ib_bh_cmp_t compare, /*!< in: comparator */
+ ulint sizeof_elem, /*!< in: size of one element */
+ ulint max_elems) /*!< in: max elements allowed */
+{
+ ulint sz;
+ ib_bh_t* ib_bh;
+
+ sz = sizeof(*ib_bh) + (sizeof_elem * max_elems);
+
+ ib_bh = (ib_bh_t*) ut_malloc(sz);
+ memset(ib_bh, 0x0, sz);
+
+ ib_bh->compare = compare;
+ ib_bh->max_elems = max_elems;
+ ib_bh->sizeof_elem = sizeof_elem;
+
+ return(ib_bh);
+}
+
+/**********************************************************************//**
+Free a binary heap.
+Releases the memory allocated by ib_bh_create(). */
+UNIV_INTERN
+void
+ib_bh_free(
+/*=======*/
+ ib_bh_t* ib_bh) /*!< in/own: instance */
+{
+ ut_free(ib_bh);
+}
+
+/**********************************************************************//**
+Add an element to the binary heap. Note: The element is copied.
+@return pointer to added element or NULL if full.
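
ib_bh_push() below percolates the new element towards the root while the comparator reports a larger parent, and ib_bh_pop() then walks the hole back down from the root. For comparison, a textbook 0-based sift-up over plain ints; note that it takes (i - 1) / 2 as the parent index, where ut0bh computes i >> 1 over its own slot layout:

    #include <assert.h>

    /* Insert v into a binary min-heap of ints stored in a[0..n-1].
    Returns the new size. */
    static int
    heap_push(int a[], int n, int v)
    {
        int i = n;

        while (i > 0 && a[(i - 1) / 2] > v) {
            a[i] = a[(i - 1) / 2];  /* pull the larger parent down */
            i = (i - 1) / 2;
        }
        a[i] = v;

        return(n + 1);
    }

    int main(void)
    {
        int a[8];
        int n = 0;

        n = heap_push(a, n, 5);
        n = heap_push(a, n, 1);
        n = heap_push(a, n, 3);

        assert(a[0] == 1);  /* minimum sits at the root */
        return(0);
    }
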
*/ +UNIV_INTERN +void* +ib_bh_push( +/*=======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + const void* elem) /*!< in: element to add */ +{ + void* ptr; + + if (ib_bh_is_full(ib_bh)) { + return(NULL); + } else if (ib_bh_is_empty(ib_bh)) { + ++ib_bh->n_elems; + return(ib_bh_set(ib_bh, 0, elem)); + } else { + ulint i; + + i = ib_bh->n_elems; + + ++ib_bh->n_elems; + + for (ptr = ib_bh_get(ib_bh, i >> 1); + i > 0 && ib_bh->compare(ptr, elem) > 0; + i >>= 1, ptr = ib_bh_get(ib_bh, i >> 1)) { + + ib_bh_set(ib_bh, i, ptr); + } + + ptr = ib_bh_set(ib_bh, i, elem); + } + + return(ptr); +} + +/**********************************************************************//** +Remove the first element from the binary heap. */ +UNIV_INTERN +void +ib_bh_pop( +/*======*/ + ib_bh_t* ib_bh) /*!< in/out: instance */ +{ + byte* ptr; + byte* last; + ulint parent = 0; + + if (ib_bh_is_empty(ib_bh)) { + return; + } else if (ib_bh_size(ib_bh) == 1) { + --ib_bh->n_elems; + return; + } + + last = (byte*) ib_bh_last(ib_bh); + + /* Start from the child node */ + ptr = (byte*) ib_bh_get(ib_bh, 1); + + while (ptr < last) { + /* If the "right" child node is < "left" child node */ + if (ib_bh->compare(ptr + ib_bh->sizeof_elem, ptr) < 0) { + ptr += ib_bh->sizeof_elem; + } + + if (ib_bh->compare(last, ptr) <= 0) { + break; + } + + ib_bh_set(ib_bh, parent, ptr); + + parent = (ptr - (byte*) ib_bh_first(ib_bh)) + / ib_bh->sizeof_elem; + + if ((parent << 1) >= ib_bh_size(ib_bh)) { + break; + } + + ptr = (byte*) ib_bh_get(ib_bh, parent << 1); + } + + --ib_bh->n_elems; + + ib_bh_set(ib_bh, parent, last); +} diff --git a/storage/xtradb/ut/ut0byte.cc b/storage/xtradb/ut/ut0byte.cc new file mode 100644 index 00000000000..bc592edc6bf --- /dev/null +++ b/storage/xtradb/ut/ut0byte.cc @@ -0,0 +1,30 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***************************************************************//** +@file ut/ut0byte.cc +Byte utilities + +Created 5/11/1994 Heikki Tuuri +********************************************************************/ + +#include "ut0byte.h" + +#ifdef UNIV_NONINL +#include "ut0byte.ic" +#endif diff --git a/storage/xtradb/ut/ut0crc32.cc b/storage/xtradb/ut/ut0crc32.cc new file mode 100644 index 00000000000..1caf27ebae3 --- /dev/null +++ b/storage/xtradb/ut/ut0crc32.cc @@ -0,0 +1,318 @@ +/***************************************************************************** + +Copyright (C) 2009, 2010 Facebook, Inc. All Rights Reserved. +Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***************************************************************//** +@file ut/ut0crc32.cc +CRC32 implementation from Facebook, based on the zlib implementation. + +Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and +mysys/my_perf.c, contributed by Facebook under the following license. +********************************************************************/ + +/* Copyright (C) 2009-2010 Facebook, Inc. All Rights Reserved. + + Dual licensed under BSD license and GPLv2. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */ + +/* The below CRC32 implementation is based on the implementation included with + * zlib with modifications to process 8 bytes at a time and using SSE 4.2 + * extentions when available. The polynomial constant has been changed to + * match the one used by SSE 4.2 and does not return the same value as the + * version used by zlib. This implementation only supports 64-bit + * little-endian processors. The original zlib copyright notice follows. 
*/ + +/* crc32.c -- compute the CRC-32 of a buf stream + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster + * CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results in about a + * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +#include "univ.i" +#include "ut0crc32.h" + +#include <string.h> + +ib_ut_crc32_t ut_crc32; + +/* Precalculated table used to generate the CRC32 if the CPU does not +have support for it */ +static ib_uint32_t ut_crc32_slice8_table[8][256]; +static ibool ut_crc32_slice8_table_initialized = FALSE; + +/* Flag that tells whether the CPU supports CRC32 or not */ +UNIV_INTERN bool ut_crc32_sse2_enabled = false; + +/********************************************************************//** +Initializes the table that is used to generate the CRC32 if the CPU does +not have support for it. */ +static +void +ut_crc32_slice8_table_init() +/*========================*/ +{ + /* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */ + static const ib_uint32_t poly = 0x82f63b78; + ib_uint32_t n; + ib_uint32_t k; + ib_uint32_t c; + + for (n = 0; n < 256; n++) { + c = n; + for (k = 0; k < 8; k++) { + c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1); + } + ut_crc32_slice8_table[0][n] = c; + } + + for (n = 0; n < 256; n++) { + c = ut_crc32_slice8_table[0][n]; + for (k = 1; k < 8; k++) { + c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8); + ut_crc32_slice8_table[k][n] = c; + } + } + + ut_crc32_slice8_table_initialized = TRUE; +} + +#if defined(__GNUC__) && defined(__x86_64__) +/********************************************************************//** +Fetches CPU info */ +static +void +ut_cpuid( +/*=====*/ + ib_uint32_t vend[3], /*!< out: CPU vendor */ + ib_uint32_t* model, /*!< out: CPU model */ + ib_uint32_t* family, /*!< out: CPU family */ + ib_uint32_t* stepping, /*!< out: CPU stepping */ + ib_uint32_t* features_ecx, /*!< out: CPU features ecx */ + ib_uint32_t* features_edx) /*!< out: CPU features edx */ +{ + ib_uint32_t sig; + asm("cpuid" : "=b" (vend[0]), "=c" (vend[2]), "=d" (vend[1]) : "a" (0)); + asm("cpuid" : "=a" (sig), "=c" (*features_ecx), "=d" (*features_edx) + : "a" (1) + : "ebx"); + + *model = ((sig >> 4) & 0xF); + *family = ((sig >> 8) & 0xF); + *stepping = (sig & 0xF); + + if (memcmp(vend, "GenuineIntel", 12) == 0 + || (memcmp(vend, "AuthenticAMD", 12) == 0 && *family == 0xF)) { + + *model += (((sig >> 16) & 0xF) << 4); + *family += ((sig >> 20) & 0xFF); + } +} + +/* opcodes taken from objdump of "crc32b (%%rdx), %%rcx" +for RHEL4 support (GCC 3 doesn't support this instruction) */ +#define ut_crc32_sse42_byte \ + asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf0, 0x0a" \ + : "=c"(crc) : "c"(crc), "d"(buf)); \ + len--, buf++ + +/* opcodes taken from objdump of "crc32q (%%rdx), %%rcx" +for RHEL4 support (GCC 3 doesn't support this instruction) */ +#define ut_crc32_sse42_quadword \ + asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf1, 0x0a" \ + : "=c"(crc) : "c"(crc), "d"(buf)); \ + len -= 8, buf += 8 +#endif /* defined(__GNUC__) && defined(__x86_64__) */ + +/********************************************************************//** +Calculates CRC32 using CPU instructions. 
+@return CRC-32C (polynomial 0x11EDC6F41) */ +UNIV_INLINE +ib_uint32_t +ut_crc32_sse42( +/*===========*/ + const byte* buf, /*!< in: data over which to calculate CRC32 */ + ulint len) /*!< in: data length */ +{ +#if defined(__GNUC__) && defined(__x86_64__) + ib_uint64_t crc = (ib_uint32_t) (-1); + + ut_a(ut_crc32_sse2_enabled); + + while (len && ((ulint) buf & 7)) { + ut_crc32_sse42_byte; + } + + while (len >= 32) { + ut_crc32_sse42_quadword; + ut_crc32_sse42_quadword; + ut_crc32_sse42_quadword; + ut_crc32_sse42_quadword; + } + + while (len >= 8) { + ut_crc32_sse42_quadword; + } + + while (len) { + ut_crc32_sse42_byte; + } + + return((ib_uint32_t) ((~crc) & 0xFFFFFFFF)); +#else + ut_error; + /* silence compiler warning about unused parameters */ + return((ib_uint32_t) buf[len]); +#endif /* defined(__GNUC__) && defined(__x86_64__) */ +} + +#define ut_crc32_slice8_byte \ + crc = (crc >> 8) ^ ut_crc32_slice8_table[0][(crc ^ *buf++) & 0xFF]; \ + len-- + +#define ut_crc32_slice8_quadword \ + crc ^= *(ib_uint64_t*) buf; \ + crc = ut_crc32_slice8_table[7][(crc ) & 0xFF] ^ \ + ut_crc32_slice8_table[6][(crc >> 8) & 0xFF] ^ \ + ut_crc32_slice8_table[5][(crc >> 16) & 0xFF] ^ \ + ut_crc32_slice8_table[4][(crc >> 24) & 0xFF] ^ \ + ut_crc32_slice8_table[3][(crc >> 32) & 0xFF] ^ \ + ut_crc32_slice8_table[2][(crc >> 40) & 0xFF] ^ \ + ut_crc32_slice8_table[1][(crc >> 48) & 0xFF] ^ \ + ut_crc32_slice8_table[0][(crc >> 56)]; \ + len -= 8, buf += 8 + +/********************************************************************//** +Calculates CRC32 manually. +@return CRC-32C (polynomial 0x11EDC6F41) */ +UNIV_INLINE +ib_uint32_t +ut_crc32_slice8( +/*============*/ + const byte* buf, /*!< in: data over which to calculate CRC32 */ + ulint len) /*!< in: data length */ +{ + ib_uint64_t crc = (ib_uint32_t) (-1); + + ut_a(ut_crc32_slice8_table_initialized); + + while (len && ((ulint) buf & 7)) { + ut_crc32_slice8_byte; + } + + while (len >= 32) { + ut_crc32_slice8_quadword; + ut_crc32_slice8_quadword; + ut_crc32_slice8_quadword; + ut_crc32_slice8_quadword; + } + + while (len >= 8) { + ut_crc32_slice8_quadword; + } + + while (len) { + ut_crc32_slice8_byte; + } + + return((ib_uint32_t) ((~crc) & 0xFFFFFFFF)); +} + +/********************************************************************//** +Initializes the data structures used by ut_crc32(). Does not do any +allocations, would not hurt if called twice, but would be pointless. */ +UNIV_INTERN +void +ut_crc32_init() +/*===========*/ +{ +#if defined(__GNUC__) && defined(__x86_64__) + ib_uint32_t vend[3]; + ib_uint32_t model; + ib_uint32_t family; + ib_uint32_t stepping; + ib_uint32_t features_ecx; + ib_uint32_t features_edx; + + ut_cpuid(vend, &model, &family, &stepping, + &features_ecx, &features_edx); + + /* Valgrind does not understand the CRC32 instructions: + + vex amd64->IR: unhandled instruction bytes: 0xF2 0x48 0xF 0x38 0xF0 0xA + valgrind: Unrecognised instruction at address 0xad3db5. + Your program just tried to execute an instruction that Valgrind + did not recognise. There are two possible reasons for this. + 1. Your program has a bug and erroneously jumped to a non-code + location. If you are running Memcheck and you just saw a + warning about a bad jump, it's probably your program's fault. + 2. The instruction is legitimate but Valgrind doesn't handle it, + i.e. it's Valgrind's fault. If you think this is the case or + you are not sure, please let us know and we'll try to fix it. 
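
Both paths above compute CRC-32C, the Castagnoli polynomial, whose bit-reversed form 0x82F63B78 seeds the table generator. A bitwise reference implementation is handy for cross-checking either path against the standard test vector (slow: one bit per iteration):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Plain one-bit-at-a-time CRC-32C; same polynomial as SSE4.2 crc32. */
    static uint32_t
    crc32c_ref(const unsigned char* buf, size_t len)
    {
        uint32_t crc = 0xFFFFFFFFu;

        while (len--) {
            int k;

            crc ^= *buf++;
            for (k = 0; k < 8; k++) {
                crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u
                                : (crc >> 1);
            }
        }

        return(~crc);
    }

    int main(void)
    {
        /* the well-known CRC-32C check value for "123456789" */
        const char* s = "123456789";

        assert(crc32c_ref((const unsigned char*) s, strlen(s)) == 0xE3069283u);
        return(0);
    }
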
+ Either way, Valgrind will now raise a SIGILL signal which will + probably kill your program. + + */ +#ifndef UNIV_DEBUG_VALGRIND + ut_crc32_sse2_enabled = (features_ecx >> 20) & 1; +#endif /* UNIV_DEBUG_VALGRIND */ + +#endif /* defined(__GNUC__) && defined(__x86_64__) */ + + if (ut_crc32_sse2_enabled) { + ut_crc32 = ut_crc32_sse42; + } else { + ut_crc32_slice8_table_init(); + ut_crc32 = ut_crc32_slice8; + } +} diff --git a/storage/xtradb/ut/ut0dbg.cc b/storage/xtradb/ut/ut0dbg.cc new file mode 100644 index 00000000000..a1cad144da4 --- /dev/null +++ b/storage/xtradb/ut/ut0dbg.cc @@ -0,0 +1,139 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*****************************************************************//** +@file ut/ut0dbg.cc +Debug utilities for Innobase. + +Created 1/30/1994 Heikki Tuuri +**********************************************************************/ + +#include "univ.i" +#include "ut0dbg.h" +#ifndef UNIV_HOTBACKUP +# include "ha_prototypes.h" +#endif /* !UNIV_HOTBACKUP */ + +#if defined(__GNUC__) && (__GNUC__ > 2) +#else +/** This is used to eliminate compiler warnings */ +UNIV_INTERN ulint ut_dbg_zero = 0; +#endif + +/*************************************************************//** +Report a failed assertion. */ +UNIV_INTERN +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /*!< in: the failed assertion (optional) */ + const char* file, /*!< in: source file containing the assertion */ + ulint line) /*!< in: line number of the assertion */ +{ + ut_print_timestamp(stderr); +#ifdef UNIV_HOTBACKUP + fprintf(stderr, " InnoDB: Assertion failure in file %s line %lu\n", + file, line); +#else /* UNIV_HOTBACKUP */ + fprintf(stderr, + " InnoDB: Assertion failure in thread %lu" + " in file %s line %lu\n", + os_thread_pf(os_thread_get_curr_id()), + innobase_basename(file), line); +#endif /* UNIV_HOTBACKUP */ + if (expr) { + fprintf(stderr, + "InnoDB: Failing assertion: %s\n", expr); + } + + fputs("InnoDB: We intentionally generate a memory trap.\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com.\n" + "InnoDB: If you get repeated assertion failures" + " or crashes, even\n" + "InnoDB: immediately after the mysqld startup, there may be\n" + "InnoDB: corruption in the InnoDB tablespace. 
Please refer to\n" + "InnoDB: " REFMAN "forcing-innodb-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +#include <unistd.h> + +#ifndef timersub +#define timersub(a, b, r) \ + do { \ + (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((r)->tv_usec < 0) { \ + (r)->tv_sec--; \ + (r)->tv_usec += 1000000; \ + } \ + } while (0) +#endif /* timersub */ + +/*******************************************************************//** +Resets a speedo (records the current time in it). */ +UNIV_INTERN +void +speedo_reset( +/*=========*/ + speedo_t* speedo) /*!< out: speedo */ +{ + gettimeofday(&speedo->tv, NULL); + + getrusage(RUSAGE_SELF, &speedo->ru); +} + +/*******************************************************************//** +Shows the time elapsed and usage statistics since the last reset of a +speedo. */ +UNIV_INTERN +void +speedo_show( +/*========*/ + const speedo_t* speedo) /*!< in: speedo */ +{ + struct rusage ru_now; + struct timeval tv_now; + struct timeval tv_diff; + + getrusage(RUSAGE_SELF, &ru_now); + + gettimeofday(&tv_now, NULL); + +#define PRINT_TIMEVAL(prefix, tvp) \ + fprintf(stderr, "%s% 5ld.%06ld sec\n", \ + prefix, (tvp)->tv_sec, (tvp)->tv_usec) + + timersub(&tv_now, &speedo->tv, &tv_diff); + PRINT_TIMEVAL("real", &tv_diff); + + timersub(&ru_now.ru_utime, &speedo->ru.ru_utime, &tv_diff); + PRINT_TIMEVAL("user", &tv_diff); + + timersub(&ru_now.ru_stime, &speedo->ru.ru_stime, &tv_diff); + PRINT_TIMEVAL("sys ", &tv_diff); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/ut/ut0list.cc b/storage/xtradb/ut/ut0list.cc new file mode 100644 index 00000000000..f906061d185 --- /dev/null +++ b/storage/xtradb/ut/ut0list.cc @@ -0,0 +1,203 @@ +/***************************************************************************** + +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file ut/ut0list.cc +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +#include "ut0list.h" +#ifdef UNIV_NONINL +#include "ut0list.ic" +#endif + +/****************************************************************//** +Create a new list. +@return list */ +UNIV_INTERN +ib_list_t* +ib_list_create(void) +/*=================*/ +{ + ib_list_t* list; + + list = static_cast<ib_list_t*>(mem_alloc(sizeof(*list))); + + list->first = NULL; + list->last = NULL; + list->is_heap_list = FALSE; + + return(list); +} + +/****************************************************************//** +Create a new list using the given heap. 
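+The list header itself is allocated from the heap and is reclaimed when the
+heap is freed, so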
ib_list_free MUST NOT BE CALLED for +lists created with this function. +@return list */ +UNIV_INTERN +ib_list_t* +ib_list_create_heap( +/*================*/ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + ib_list_t* list; + + list = static_cast<ib_list_t*>(mem_heap_alloc(heap, sizeof(*list))); + + list->first = NULL; + list->last = NULL; + list->is_heap_list = TRUE; + + return(list); +} + +/****************************************************************//** +Free a list. */ +UNIV_INTERN +void +ib_list_free( +/*=========*/ + ib_list_t* list) /*!< in: list */ +{ + ut_a(!list->is_heap_list); + + /* We don't check that the list is empty because it's entirely valid + to e.g. have all the nodes allocated from a single heap that is then + freed after the list itself is freed. */ + + mem_free(list); +} + +/****************************************************************//** +Add the data to the start of the list. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_first( +/*==============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + return(ib_list_add_after(list, ib_list_get_first(list), data, heap)); +} + +/****************************************************************//** +Add the data to the end of the list. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_last( +/*=============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + return(ib_list_add_after(list, ib_list_get_last(list), data, heap)); +} + +/****************************************************************//** +Add the data after the indicated node. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_after( +/*==============*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* prev_node, /*!< in: node preceding new node (can + be NULL) */ + void* data, /*!< in: data */ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + ib_list_node_t* node; + + node = static_cast<ib_list_node_t*>( + mem_heap_alloc(heap, sizeof(*node))); + + node->data = data; + + if (!list->first) { + /* Empty list. */ + + ut_a(!prev_node); + + node->prev = NULL; + node->next = NULL; + + list->first = node; + list->last = node; + } else if (!prev_node) { + /* Start of list. */ + + node->prev = NULL; + node->next = list->first; + + list->first->prev = node; + + list->first = node; + } else { + /* Middle or end of list. */ + + node->prev = prev_node; + node->next = prev_node->next; + + prev_node->next = node; + + if (node->next) { + node->next->prev = node; + } else { + list->last = node; + } + } + + return(node); +} + +/****************************************************************//** +Remove the node from the list. */ +UNIV_INTERN +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* node) /*!< in: node to remove */ +{ + if (node->prev) { + node->prev->next = node->next; + } else { + /* First item in list. */ + + ut_ad(list->first == node); + + list->first = node->next; + } + + if (node->next) { + node->next->prev = node->prev; + } else { + /* Last item in list. 
*/ + + ut_ad(list->last == node); + + list->last = node->prev; + } + + node->prev = node->next = NULL; +} diff --git a/storage/xtradb/ut/ut0mem.cc b/storage/xtradb/ut/ut0mem.cc new file mode 100644 index 00000000000..2bb5d9ce332 --- /dev/null +++ b/storage/xtradb/ut/ut0mem.cc @@ -0,0 +1,609 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file ut/ut0mem.cc +Memory primitives + +Created 5/11/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0mem.h" + +#ifdef UNIV_NONINL +#include "ut0mem.ic" +#endif + +#ifndef UNIV_HOTBACKUP +# include "os0thread.h" +# include "srv0srv.h" + +#include <stdlib.h> + +/** The total amount of memory currently allocated from the operating +system with os_mem_alloc_large() or malloc(). Does not count malloc() +if srv_use_sys_malloc is set. Protected by ut_list_mutex. */ +UNIV_INTERN ulint ut_total_allocated_memory = 0; + +/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */ +UNIV_INTERN os_fast_mutex_t ut_list_mutex; + +#ifdef UNIV_PFS_MUTEX +/* Key to register server_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t ut_list_mutex_key; +#endif + +/** Dynamically allocated memory block */ +struct ut_mem_block_t{ + UT_LIST_NODE_T(ut_mem_block_t) mem_block_list; + /*!< mem block list node */ + ulint size; /*!< size of allocated memory */ + ulint magic_n;/*!< magic number (UT_MEM_MAGIC_N) */ +}; + +/** The value of ut_mem_block_t::magic_n. Used in detecting +memory corruption. */ +#define UT_MEM_MAGIC_N 1601650166 + +/** List of all memory blocks allocated from the operating system +with malloc. Protected by ut_list_mutex. */ +static UT_LIST_BASE_NODE_T(ut_mem_block_t) ut_mem_block_list; + +/** Flag: has ut_mem_block_list been initialized? */ +static ibool ut_mem_block_list_inited = FALSE; + +/** A dummy pointer for generating a null pointer exception in +ut_malloc_low() */ +static ulint* ut_mem_null_ptr = NULL; + +/**********************************************************************//** +Initializes the mem block list at database startup. */ +UNIV_INTERN +void +ut_mem_init(void) +/*=============*/ +{ + ut_a(!ut_mem_block_list_inited); + os_fast_mutex_init(ut_list_mutex_key, &ut_list_mutex); + UT_LIST_INIT(ut_mem_block_list); + ut_mem_block_list_inited = TRUE; +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Allocates memory. 
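+If srv_use_sys_malloc is set, this is a thin wrapper around malloc().
+Otherwise the block is prepended with a ut_mem_block_t header, counted in
+ut_total_allocated_memory and linked into ut_mem_block_list; a failed
+allocation is retried once a second for up to 60 seconds before giving up.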
+@return own: allocated memory */ +UNIV_INTERN +void* +ut_malloc_low( +/*==========*/ + ulint n, /*!< in: number of bytes to allocate */ + ibool assert_on_error)/*!< in: if TRUE, we crash mysqld if the + memory cannot be allocated */ +{ +#ifndef UNIV_HOTBACKUP + ulint retry_count; + void* ret; + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + ret = malloc(n); + ut_a(ret || !assert_on_error); + + return(ret); + } + + ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */ + ut_a(ut_mem_block_list_inited); + + retry_count = 0; +retry: + os_fast_mutex_lock(&ut_list_mutex); + + ret = malloc(n + sizeof(ut_mem_block_t)); + + if (ret == NULL && retry_count < 60) { + if (retry_count == 0) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: cannot allocate" + " %lu bytes of\n" + "InnoDB: memory with malloc!" + " Total allocated memory\n" + "InnoDB: by InnoDB %lu bytes." + " Operating system errno: %lu\n" + "InnoDB: Check if you should" + " increase the swap file or\n" + "InnoDB: ulimits of your operating system.\n" + "InnoDB: On FreeBSD check you" + " have compiled the OS with\n" + "InnoDB: a big enough maximum process size.\n" + "InnoDB: Note that in most 32-bit" + " computers the process\n" + "InnoDB: memory space is limited" + " to 2 GB or 4 GB.\n" + "InnoDB: We keep retrying" + " the allocation for 60 seconds...\n", + (ulong) n, (ulong) ut_total_allocated_memory, +#ifdef __WIN__ + (ulong) GetLastError() +#else + (ulong) errno +#endif + ); + } + + os_fast_mutex_unlock(&ut_list_mutex); + + /* Sleep for a second and retry the allocation; maybe this is + just a temporary shortage of memory */ + + os_thread_sleep(1000000); + + retry_count++; + + goto retry; + } + + if (ret == NULL) { + /* Flush stderr to make more probable that the error + message gets in the error file before we generate a seg + fault */ + + fflush(stderr); + + os_fast_mutex_unlock(&ut_list_mutex); + + /* Make an intentional seg fault so that we get a stack + trace */ + if (assert_on_error) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: We now intentionally" + " generate a seg fault so that\n" + "InnoDB: on Linux we get a stack trace.\n"); + + if (*ut_mem_null_ptr) ut_mem_null_ptr = 0; + } else { + return(NULL); + } + } + + UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t)); + + ((ut_mem_block_t*) ret)->size = n + sizeof(ut_mem_block_t); + ((ut_mem_block_t*) ret)->magic_n = UT_MEM_MAGIC_N; + + ut_total_allocated_memory += n + sizeof(ut_mem_block_t); + + UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list, + ((ut_mem_block_t*) ret)); + os_fast_mutex_unlock(&ut_list_mutex); + + return((void*)((byte*) ret + sizeof(ut_mem_block_t))); +#else /* !UNIV_HOTBACKUP */ + void* ret = malloc(n); + ut_a(ret || !assert_on_error); + + return(ret); +#endif /* !UNIV_HOTBACKUP */ +} + +/**********************************************************************//** +Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is +a nop. 
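+A minimal usage sketch (p is a hypothetical example pointer):
+
+	void*	p = ut_malloc(100);
+	...
+	ut_free(p);
+
+When srv_use_sys_malloc is not set, the pointer must be one returned by
+ut_malloc(), since the block header behind it is validated and unlinked.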
*/ +UNIV_INTERN +void +ut_free( +/*====*/ + void* ptr) /*!< in, own: memory block, can be NULL */ +{ +#ifndef UNIV_HOTBACKUP + ut_mem_block_t* block; + + if (ptr == NULL) { + return; + } else if (UNIV_LIKELY(srv_use_sys_malloc)) { + free(ptr); + return; + } + + block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t)); + + os_fast_mutex_lock(&ut_list_mutex); + + ut_a(block->magic_n == UT_MEM_MAGIC_N); + ut_a(ut_total_allocated_memory >= block->size); + + ut_total_allocated_memory -= block->size; + + UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block); + free(block); + + os_fast_mutex_unlock(&ut_list_mutex); +#else /* !UNIV_HOTBACKUP */ + free(ptr); +#endif /* !UNIV_HOTBACKUP */ +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not +use this function because the allocation functions in mem0mem.h are the +recommended ones in InnoDB. + +man realloc in Linux, 2004: + + realloc() changes the size of the memory block pointed to + by ptr to size bytes. The contents will be unchanged to + the minimum of the old and new sizes; newly allocated mem- + ory will be uninitialized. If ptr is NULL, the call is + equivalent to malloc(size); if size is equal to zero, the + call is equivalent to free(ptr). Unless ptr is NULL, it + must have been returned by an earlier call to malloc(), + calloc() or realloc(). + +RETURN VALUE + realloc() returns a pointer to the newly allocated memory, + which is suitably aligned for any kind of variable and may + be different from ptr, or NULL if the request fails. If + size was equal to 0, either NULL or a pointer suitable to + be passed to free() is returned. If realloc() fails the + original block is left untouched - it is not freed or + moved. +@return own: pointer to new mem block or NULL */ +UNIV_INTERN +void* +ut_realloc( +/*=======*/ + void* ptr, /*!< in: pointer to old block or NULL */ + ulint size) /*!< in: desired size */ +{ + ut_mem_block_t* block; + ulint old_size; + ulint min_size; + void* new_ptr; + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + return(realloc(ptr, size)); + } + + if (ptr == NULL) { + + return(ut_malloc(size)); + } + + if (size == 0) { + ut_free(ptr); + + return(NULL); + } + + block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t)); + + ut_a(block->magic_n == UT_MEM_MAGIC_N); + + old_size = block->size - sizeof(ut_mem_block_t); + + if (size < old_size) { + min_size = size; + } else { + min_size = old_size; + } + + new_ptr = ut_malloc(size); + + if (new_ptr == NULL) { + + return(NULL); + } + + /* Copy the old data from ptr */ + ut_memcpy(new_ptr, ptr, min_size); + + ut_free(ptr); + + return(new_ptr); +} + +/**********************************************************************//** +Frees in shutdown all allocated memory not freed yet. 
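+Must be called once, at shutdown, after all other threads have stopped:
+the list mutex is freed before the block list is walked.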
*/ +UNIV_INTERN +void +ut_free_all_mem(void) +/*=================*/ +{ + ut_mem_block_t* block; + + ut_a(ut_mem_block_list_inited); + ut_mem_block_list_inited = FALSE; + os_fast_mutex_free(&ut_list_mutex); + + while ((block = UT_LIST_GET_FIRST(ut_mem_block_list))) { + + ut_a(block->magic_n == UT_MEM_MAGIC_N); + ut_a(ut_total_allocated_memory >= block->size); + + ut_total_allocated_memory -= block->size; + + UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block); + free(block); + } + + if (ut_total_allocated_memory != 0) { + fprintf(stderr, + "InnoDB: Warning: after shutdown" + " total allocated memory is %lu\n", + (ulong) ut_total_allocated_memory); + } + + ut_mem_block_list_inited = FALSE; +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Copies up to size - 1 characters from the NUL-terminated string src to +dst, NUL-terminating the result. Returns strlen(src), so truncation +occurred if the return value >= size. +@return strlen(src) */ +UNIV_INTERN +ulint +ut_strlcpy( +/*=======*/ + char* dst, /*!< in: destination buffer */ + const char* src, /*!< in: source buffer */ + ulint size) /*!< in: size of destination buffer */ +{ + ulint src_size = strlen(src); + + if (size != 0) { + ulint n = ut_min(src_size, size - 1); + + memcpy(dst, src, n); + dst[n] = '\0'; + } + + return(src_size); +} + +/**********************************************************************//** +Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last +(size - 1) bytes of src, not the first. +@return strlen(src) */ +UNIV_INTERN +ulint +ut_strlcpy_rev( +/*===========*/ + char* dst, /*!< in: destination buffer */ + const char* src, /*!< in: source buffer */ + ulint size) /*!< in: size of destination buffer */ +{ + ulint src_size = strlen(src); + + if (size != 0) { + ulint n = ut_min(src_size, size - 1); + + memcpy(dst, src + src_size - n, n + 1); + } + + return(src_size); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Return the number of times s2 occurs in s1. Overlapping instances of s2 +are only counted once. +@return the number of times s2 occurs in s1 */ +UNIV_INTERN +ulint +ut_strcount( +/*========*/ + const char* s1, /*!< in: string to search in */ + const char* s2) /*!< in: string to search for */ +{ + ulint count = 0; + ulint len = strlen(s2); + + if (len == 0) { + + return(0); + } + + for (;;) { + s1 = strstr(s1, s2); + + if (!s1) { + + break; + } + + count++; + s1 += len; + } + + return(count); +} + +/******************************************************************** +Concatenate 3 strings.*/ + +char* +ut_str3cat( +/*=======*/ + /* out, own: concatenated string, must be + freed with mem_free() */ + const char* s1, /* in: string 1 */ + const char* s2, /* in: string 2 */ + const char* s3) /* in: string 3 */ +{ + char* s; + ulint s1_len = strlen(s1); + ulint s2_len = strlen(s2); + ulint s3_len = strlen(s3); + + s = static_cast<char*>(mem_alloc(s1_len + s2_len + s3_len + 1)); + + memcpy(s, s1, s1_len); + memcpy(s + s1_len, s2, s2_len); + memcpy(s + s1_len + s2_len, s3, s3_len); + + s[s1_len + s2_len + s3_len] = '\0'; + + return(s); +} +/**********************************************************************//** +Replace every occurrence of s1 in str with s2. Overlapping instances of s1 +are only replaced once. 
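+For example, replacing "aa" with "b" in "aaa" yields "ba", not "bb".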
+@return own: modified string, must be freed with mem_free() */ +UNIV_INTERN +char* +ut_strreplace( +/*==========*/ + const char* str, /*!< in: string to operate on */ + const char* s1, /*!< in: string to replace */ + const char* s2) /*!< in: string to replace s1 with */ +{ + char* new_str; + char* ptr; + const char* str_end; + ulint str_len = strlen(str); + ulint s1_len = strlen(s1); + ulint s2_len = strlen(s2); + ulint count = 0; + int len_delta = (int) s2_len - (int) s1_len; + + str_end = str + str_len; + + if (len_delta <= 0) { + len_delta = 0; + } else { + count = ut_strcount(str, s1); + } + + new_str = static_cast<char*>( + mem_alloc(str_len + count * len_delta + 1)); + + ptr = new_str; + + while (str) { + const char* next = strstr(str, s1); + + if (!next) { + next = str_end; + } + + memcpy(ptr, str, next - str); + ptr += next - str; + + if (next == str_end) { + + break; + } + + memcpy(ptr, s2, s2_len); + ptr += s2_len; + + str = next + s1_len; + } + + *ptr = '\0'; + + return(new_str); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +void +test_ut_str_sql_format() +{ + char buf[128]; + ulint ret; + +#define CALL_AND_TEST(str, str_len, buf, buf_size, ret_expected, buf_expected)\ + do {\ + ibool ok = TRUE;\ + memset(buf, 'x', 10);\ + buf[10] = '\0';\ + fprintf(stderr, "TESTING \"%s\", %lu, %lu\n",\ + str, (ulint) str_len, (ulint) buf_size);\ + ret = ut_str_sql_format(str, str_len, buf, buf_size);\ + if (ret != ret_expected) {\ + fprintf(stderr, "expected ret %lu, got %lu\n",\ + (ulint) ret_expected, ret);\ + ok = FALSE;\ + }\ + if (strcmp((char*) buf, buf_expected) != 0) {\ + fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\ + buf_expected, buf);\ + ok = FALSE;\ + }\ + if (ok) {\ + fprintf(stderr, "OK: %lu, \"%s\"\n\n",\ + (ulint) ret, buf);\ + } else {\ + return;\ + }\ + } while (0) + + CALL_AND_TEST("abcd", 4, buf, 0, 0, "xxxxxxxxxx"); + + CALL_AND_TEST("abcd", 4, buf, 1, 1, ""); + + CALL_AND_TEST("abcd", 4, buf, 2, 1, ""); + + CALL_AND_TEST("abcd", 0, buf, 3, 3, "''"); + CALL_AND_TEST("abcd", 1, buf, 3, 1, ""); + CALL_AND_TEST("abcd", 2, buf, 3, 1, ""); + CALL_AND_TEST("abcd", 3, buf, 3, 1, ""); + CALL_AND_TEST("abcd", 4, buf, 3, 1, ""); + + CALL_AND_TEST("abcd", 0, buf, 4, 3, "''"); + CALL_AND_TEST("abcd", 1, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcd", 2, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcd", 3, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcd", 4, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcde", 5, buf, 4, 4, "'a'"); + CALL_AND_TEST("'", 1, buf, 4, 3, "''"); + CALL_AND_TEST("''", 2, buf, 4, 3, "''"); + CALL_AND_TEST("a'", 2, buf, 4, 4, "'a'"); + CALL_AND_TEST("'a", 2, buf, 4, 3, "''"); + CALL_AND_TEST("ab", 2, buf, 4, 4, "'a'"); + + CALL_AND_TEST("abcdef", 0, buf, 5, 3, "''"); + CALL_AND_TEST("abcdef", 1, buf, 5, 4, "'a'"); + CALL_AND_TEST("abcdef", 2, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 3, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 4, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 5, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 6, buf, 5, 5, "'ab'"); + CALL_AND_TEST("'", 1, buf, 5, 5, "''''"); + CALL_AND_TEST("''", 2, buf, 5, 5, "''''"); + CALL_AND_TEST("a'", 2, buf, 5, 4, "'a'"); + CALL_AND_TEST("'a", 2, buf, 5, 5, "''''"); + CALL_AND_TEST("ab", 2, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abc", 3, buf, 5, 5, "'ab'"); + + CALL_AND_TEST("ab", 2, buf, 6, 5, "'ab'"); + + CALL_AND_TEST("a'b'c", 5, buf, 32, 10, "'a''b''c'"); + CALL_AND_TEST("a'b'c'", 6, buf, 32, 12, "'a''b''c'''"); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ +#endif /* !UNIV_HOTBACKUP */ diff --git 
a/storage/xtradb/ut/ut0rbt.cc b/storage/xtradb/ut/ut0rbt.cc new file mode 100644 index 00000000000..a6c02a8514a --- /dev/null +++ b/storage/xtradb/ut/ut0rbt.cc @@ -0,0 +1,1353 @@ +/***************************************************************************//** + +Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/********************************************************************//** +Red-Black tree implementation + +(c) 2007 Oracle/Innobase Oy + +Created 2007-03-20 Sunny Bains +***********************************************************************/ + +#include "ut0rbt.h" + +/**********************************************************************//** +Definition of a red-black tree +============================== + +A red-black tree is a binary search tree which has the following +red-black properties: + + 1. Every node is either red or black. + 2. Every leaf (NULL - in our case tree->nil) is black. + 3. If a node is red, then both its children are black. + 4. Every simple path from a node to a descendant leaf contains the + same number of black nodes. + + from (3) above, the implication is that on any path from the root + to a leaf, red nodes must not be adjacent. + + However, any number of black nodes may appear in a sequence. + */ + +#if defined(IB_RBT_TESTING) +#warning "Testing enabled!" +#endif + +#define ROOT(t) (t->root->left) + +/**********************************************************************//** +Print out the sub-tree recursively. */ +static +void +rbt_print_subtree( +/*==============*/ + const ib_rbt_t* tree, /*!< in: tree to traverse */ + const ib_rbt_node_t* node, /*!< in: node to print */ + ib_rbt_print_node print) /*!< in: print key function */ +{ + /* FIXME: Doesn't do anything yet */ + if (node != tree->nil) { + print(node); + rbt_print_subtree(tree, node->left, print); + rbt_print_subtree(tree, node->right, print); + } +} + +/**********************************************************************//** +Verify that the keys are in order. +@return TRUE if OK,
FALSE if not ordered */ +static +ibool +rbt_check_ordering( +/*===============*/ + const ib_rbt_t* tree) /*!< in: tree to verfify */ +{ + const ib_rbt_node_t* node; + const ib_rbt_node_t* prev = NULL; + + /* Iterate over all the nodes, comparing each node with the prev */ + for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) { + + if (prev) { + int result; + + if (tree->cmp_arg) { + result = tree->compare_with_arg( + tree->cmp_arg, prev->value, + node->value); + } else { + result = tree->compare( + prev->value, node->value); + } + + if (result >= 0) { + return(FALSE); + } + } + + prev = node; + } + + return(TRUE); +} + +/**********************************************************************//** +Check that every path from the root to the leaves has the same count. +Count is expressed in the number of black nodes. +@return 0 on failure else black height of the subtree */ +static +ibool +rbt_count_black_nodes( +/*==================*/ + const ib_rbt_t* tree, /*!< in: tree to verify */ + const ib_rbt_node_t* node) /*!< in: start of sub-tree */ +{ + ulint result; + + if (node != tree->nil) { + ulint left_height = rbt_count_black_nodes(tree, node->left); + + ulint right_height = rbt_count_black_nodes(tree, node->right); + + if (left_height == 0 + || right_height == 0 + || left_height != right_height) { + + result = 0; + } else if (node->color == IB_RBT_RED) { + + /* Case 3 */ + if (node->left->color != IB_RBT_BLACK + || node->right->color != IB_RBT_BLACK) { + + result = 0; + } else { + result = left_height; + } + /* Check if it's anything other than RED or BLACK. */ + } else if (node->color != IB_RBT_BLACK) { + + result = 0; + } else { + + result = right_height + 1; + } + } else { + result = 1; + } + + return(result); +} + +/**********************************************************************//** +Turn the node's right child's left sub-tree into node's right sub-tree. +This will also make node's right child it's parent. */ +static +void +rbt_rotate_left( +/*============*/ + const ib_rbt_node_t* nil, /*!< in: nil node of the tree */ + ib_rbt_node_t* node) /*!< in: node to rotate */ +{ + ib_rbt_node_t* right = node->right; + + node->right = right->left; + + if (right->left != nil) { + right->left->parent = node; + } + + /* Right's new parent was node's parent. */ + right->parent = node->parent; + + /* Since root's parent is tree->nil and root->parent->left points + back to root, we can avoid the check. */ + if (node == node->parent->left) { + /* Node was on the left of its parent. */ + node->parent->left = right; + } else { + /* Node must have been on the right. */ + node->parent->right = right; + } + + /* Finally, put node on right's left. */ + right->left = node; + node->parent = right; +} + +/**********************************************************************//** +Turn the node's left child's right sub-tree into node's left sub-tree. +This also make node's left child it's parent. */ +static +void +rbt_rotate_right( +/*=============*/ + const ib_rbt_node_t* nil, /*!< in: nil node of tree */ + ib_rbt_node_t* node) /*!< in: node to rotate */ +{ + ib_rbt_node_t* left = node->left; + + node->left = left->right; + + if (left->right != nil) { + left->right->parent = node; + } + + /* Left's new parent was node's parent. */ + left->parent = node->parent; + + /* Since root's parent is tree->nil and root->parent->left points + back to root, we can avoid the check. */ + if (node == node->parent->right) { + /* Node was on the left of its parent. 
*/ + node->parent->right = left; + } else { + /* Node must have been on the left. */ + node->parent->left = left; + } + + /* Finally, put node on left's right. */ + left->right = node; + node->parent = left; +} + +/**********************************************************************//** +Append a node to the tree. */ +static +ib_rbt_node_t* +rbt_tree_add_child( +/*===============*/ + const ib_rbt_t* tree, + ib_rbt_bound_t* parent, + ib_rbt_node_t* node) +{ + /* Cast away the const. */ + ib_rbt_node_t* last = (ib_rbt_node_t*) parent->last; + + if (last == tree->root || parent->result < 0) { + last->left = node; + } else { + /* FIXME: We don't handle duplicates (yet)! */ + ut_a(parent->result != 0); + + last->right = node; + } + + node->parent = last; + + return(node); +} + +/**********************************************************************//** +Generic binary tree insert */ +static +ib_rbt_node_t* +rbt_tree_insert( +/*============*/ + ib_rbt_t* tree, + const void* key, + ib_rbt_node_t* node) +{ + ib_rbt_bound_t parent; + ib_rbt_node_t* current = ROOT(tree); + + parent.result = 0; + parent.last = tree->root; + + /* Regular binary search. */ + while (current != tree->nil) { + + parent.last = current; + + if (tree->cmp_arg) { + parent.result = tree->compare_with_arg( + tree->cmp_arg, key, current->value); + } else { + parent.result = tree->compare(key, current->value); + } + + if (parent.result < 0) { + current = current->left; + } else { + current = current->right; + } + } + + ut_a(current == tree->nil); + + rbt_tree_add_child(tree, &parent, node); + + return(node); +} + +/**********************************************************************//** +Balance a tree after inserting a node. */ +static +void +rbt_balance_tree( +/*=============*/ + const ib_rbt_t* tree, /*!< in: tree to balance */ + ib_rbt_node_t* node) /*!< in: node that was inserted */ +{ + const ib_rbt_node_t* nil = tree->nil; + ib_rbt_node_t* parent = node->parent; + + /* Restore the red-black property. */ + node->color = IB_RBT_RED; + + while (node != ROOT(tree) && parent->color == IB_RBT_RED) { + ib_rbt_node_t* grand_parent = parent->parent; + + if (parent == grand_parent->left) { + ib_rbt_node_t* uncle = grand_parent->right; + + if (uncle->color == IB_RBT_RED) { + + /* Case 1 - change the colors. */ + uncle->color = IB_RBT_BLACK; + parent->color = IB_RBT_BLACK; + grand_parent->color = IB_RBT_RED; + + /* Move node up the tree. */ + node = grand_parent; + + } else { + + if (node == parent->right) { + /* Right is a black node and node is + to the right, case 2 - move node + up and rotate. */ + node = parent; + rbt_rotate_left(nil, node); + } + + grand_parent = node->parent->parent; + + /* Case 3. */ + node->parent->color = IB_RBT_BLACK; + grand_parent->color = IB_RBT_RED; + + rbt_rotate_right(nil, grand_parent); + } + + } else { + ib_rbt_node_t* uncle = grand_parent->left; + + if (uncle->color == IB_RBT_RED) { + + /* Case 1 - change the colors. */ + uncle->color = IB_RBT_BLACK; + parent->color = IB_RBT_BLACK; + grand_parent->color = IB_RBT_RED; + + /* Move node up the tree. */ + node = grand_parent; + + } else { + + if (node == parent->left) { + /* Left is a black node and node is to + the right, case 2 - move node up and + rotate. */ + node = parent; + rbt_rotate_right(nil, node); + } + + grand_parent = node->parent->parent; + + /* Case 3. */ + node->parent->color = IB_RBT_BLACK; + grand_parent->color = IB_RBT_RED; + + rbt_rotate_left(nil, grand_parent); + } + } + + parent = node->parent; + } + + /* Color the root black. 
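+ The loop can exit with a red root (e.g. right after the first
+ insert); forcing it black is always safe, since it adds one black
+ node to every root-leaf path alike and so cannot break property 4.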
*/ + ROOT(tree)->color = IB_RBT_BLACK; +} + +/**********************************************************************//** +Find the given node's successor. +@return successor node or NULL if no successor */ +static +ib_rbt_node_t* +rbt_find_successor( +/*===============*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* current) /*!< in: this is declared const + because it can be called via + rbt_next() */ +{ + const ib_rbt_node_t* nil = tree->nil; + ib_rbt_node_t* next = current->right; + + /* Is there a sub-tree to the right that we can follow. */ + if (next != nil) { + + /* Follow the left most links of the current right child. */ + while (next->left != nil) { + next = next->left; + } + + } else { /* We will have to go up the tree to find the successor. */ + ib_rbt_node_t* parent = current->parent; + + /* Cast away the const. */ + next = (ib_rbt_node_t*) current; + + while (parent != tree->root && next == parent->right) { + next = parent; + parent = next->parent; + } + + next = (parent == tree->root) ? NULL : parent; + } + + return(next); +} + +/**********************************************************************//** +Find the given node's precedecessor. +@return predecessor node or NULL if no predecesor */ +static +ib_rbt_node_t* +rbt_find_predecessor( +/*=================*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* current) /*!< in: this is declared const + because it can be called via + rbt_prev() */ +{ + const ib_rbt_node_t* nil = tree->nil; + ib_rbt_node_t* prev = current->left; + + /* Is there a sub-tree to the left that we can follow. */ + if (prev != nil) { + + /* Follow the right most links of the current left child. */ + while (prev->right != nil) { + prev = prev->right; + } + + } else { /* We will have to go up the tree to find the precedecessor. */ + ib_rbt_node_t* parent = current->parent; + + /* Cast away the const. */ + prev = (ib_rbt_node_t*) current; + + while (parent != tree->root && prev == parent->left) { + prev = parent; + parent = prev->parent; + } + + prev = (parent == tree->root) ? NULL : parent; + } + + return(prev); +} + +/**********************************************************************//** +Replace node with child. After applying transformations eject becomes +an orphan. */ +static +void +rbt_eject_node( +/*===========*/ + ib_rbt_node_t* eject, /*!< in: node to eject */ + ib_rbt_node_t* node) /*!< in: node to replace with */ +{ + /* Update the to be ejected node's parent's child pointers. */ + if (eject->parent->left == eject) { + eject->parent->left = node; + } else if (eject->parent->right == eject) { + eject->parent->right = node; + } else { + ut_a(0); + } + /* eject is now an orphan but otherwise its pointers + and color are left intact. */ + + node->parent = eject->parent; +} + +/**********************************************************************//** +Replace a node with another node. */ +static +void +rbt_replace_node( +/*=============*/ + ib_rbt_node_t* replace, /*!< in: node to replace */ + ib_rbt_node_t* node) /*!< in: node to replace with */ +{ + ib_rbt_color_t color = node->color; + + /* Update the node pointers. */ + node->left = replace->left; + node->right = replace->right; + + /* Update the child node pointers. */ + node->left->parent = node; + node->right->parent = node; + + /* Make the parent of replace point to node. */ + rbt_eject_node(replace, node); + + /* Swap the colors. 
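+ The incoming node takes over both the position and the color of the
+ node it replaces, so black-heights along every path are unchanged;
+ the ejected node carries away the incoming node's original color.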
*/ + node->color = replace->color; + replace->color = color; +} + +/**********************************************************************//** +Detach node from the tree replacing it with one of it's children. +@return the child node that now occupies the position of the detached node */ +static +ib_rbt_node_t* +rbt_detach_node( +/*============*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_node_t* node) /*!< in: node to detach */ +{ + ib_rbt_node_t* child; + const ib_rbt_node_t* nil = tree->nil; + + if (node->left != nil && node->right != nil) { + /* Case where the node to be deleted has two children. */ + ib_rbt_node_t* successor = rbt_find_successor(tree, node); + + ut_a(successor != nil); + ut_a(successor->parent != nil); + ut_a(successor->left == nil); + + child = successor->right; + + /* Remove the successor node and replace with its child. */ + rbt_eject_node(successor, child); + + /* Replace the node to delete with its successor node. */ + rbt_replace_node(node, successor); + } else { + ut_a(node->left == nil || node->right == nil); + + child = (node->left != nil) ? node->left : node->right; + + /* Replace the node to delete with one of it's children. */ + rbt_eject_node(node, child); + } + + /* Reset the node links. */ + node->parent = node->right = node->left = tree->nil; + + return(child); +} + +/**********************************************************************//** +Rebalance the right sub-tree after deletion. +@return node to rebalance if more rebalancing required else NULL */ +static +ib_rbt_node_t* +rbt_balance_right( +/*==============*/ + const ib_rbt_node_t* nil, /*!< in: rb tree nil node */ + ib_rbt_node_t* parent, /*!< in: parent node */ + ib_rbt_node_t* sibling) /*!< in: sibling node */ +{ + ib_rbt_node_t* node = NULL; + + ut_a(sibling != nil); + + /* Case 3. */ + if (sibling->color == IB_RBT_RED) { + + parent->color = IB_RBT_RED; + sibling->color = IB_RBT_BLACK; + + rbt_rotate_left(nil, parent); + + sibling = parent->right; + + ut_a(sibling != nil); + } + + /* Since this will violate case 3 because of the change above. */ + if (sibling->left->color == IB_RBT_BLACK + && sibling->right->color == IB_RBT_BLACK) { + + node = parent; /* Parent needs to be rebalanced too. */ + sibling->color = IB_RBT_RED; + + } else { + if (sibling->right->color == IB_RBT_BLACK) { + + ut_a(sibling->left->color == IB_RBT_RED); + + sibling->color = IB_RBT_RED; + sibling->left->color = IB_RBT_BLACK; + + rbt_rotate_right(nil, sibling); + + sibling = parent->right; + ut_a(sibling != nil); + } + + sibling->color = parent->color; + sibling->right->color = IB_RBT_BLACK; + + parent->color = IB_RBT_BLACK; + + rbt_rotate_left(nil, parent); + } + + return(node); +} + +/**********************************************************************//** +Rebalance the left sub-tree after deletion. +@return node to rebalance if more rebalancing required else NULL */ +static +ib_rbt_node_t* +rbt_balance_left( +/*=============*/ + const ib_rbt_node_t* nil, /*!< in: rb tree nil node */ + ib_rbt_node_t* parent, /*!< in: parent node */ + ib_rbt_node_t* sibling) /*!< in: sibling node */ +{ + ib_rbt_node_t* node = NULL; + + ut_a(sibling != nil); + + /* Case 3. */ + if (sibling->color == IB_RBT_RED) { + + parent->color = IB_RBT_RED; + sibling->color = IB_RBT_BLACK; + + rbt_rotate_right(nil, parent); + sibling = parent->left; + + ut_a(sibling != nil); + } + + /* Since this will violate case 3 because of the change above. 
*/ + if (sibling->right->color == IB_RBT_BLACK + && sibling->left->color == IB_RBT_BLACK) { + + node = parent; /* Parent needs to be rebalanced too. */ + sibling->color = IB_RBT_RED; + + } else { + if (sibling->left->color == IB_RBT_BLACK) { + + ut_a(sibling->right->color == IB_RBT_RED); + + sibling->color = IB_RBT_RED; + sibling->right->color = IB_RBT_BLACK; + + rbt_rotate_left(nil, sibling); + + sibling = parent->left; + + ut_a(sibling != nil); + } + + sibling->color = parent->color; + sibling->left->color = IB_RBT_BLACK; + + parent->color = IB_RBT_BLACK; + + rbt_rotate_right(nil, parent); + } + + return(node); +} + +/**********************************************************************//** +Delete the node and rebalance the tree if necessary */ +static +void +rbt_remove_node_and_rebalance( +/*==========================*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_node_t* node) /*!< in: node to remove */ +{ + /* Detach node and get the node that will be used + as rebalance start. */ + ib_rbt_node_t* child = rbt_detach_node(tree, node); + + if (node->color == IB_RBT_BLACK) { + ib_rbt_node_t* last = child; + + ROOT(tree)->color = IB_RBT_RED; + + while (child && child->color == IB_RBT_BLACK) { + ib_rbt_node_t* parent = child->parent; + + /* Did the deletion cause an imbalance in the + parents left sub-tree. */ + if (parent->left == child) { + + child = rbt_balance_right( + tree->nil, parent, parent->right); + + } else if (parent->right == child) { + + child = rbt_balance_left( + tree->nil, parent, parent->left); + + } else { + ut_error; + } + + if (child) { + last = child; + } + } + + ut_a(last); + + last->color = IB_RBT_BLACK; + ROOT(tree)->color = IB_RBT_BLACK; + } + + /* Note that we have removed a node from the tree. */ + --tree->n_nodes; +} + +/**********************************************************************//** +Recursively free the nodes. */ +static +void +rbt_free_node( +/*==========*/ + ib_rbt_node_t* node, /*!< in: node to free */ + ib_rbt_node_t* nil) /*!< in: rb tree nil node */ +{ + if (node != nil) { + rbt_free_node(node->left, nil); + rbt_free_node(node->right, nil); + + ut_free(node); + } +} + +/**********************************************************************//** +Free all the nodes and free the tree. */ +UNIV_INTERN +void +rbt_free( +/*=====*/ + ib_rbt_t* tree) /*!< in: rb tree to free */ +{ + rbt_free_node(tree->root, tree->nil); + ut_free(tree->nil); + ut_free(tree); +} + +/**********************************************************************//** +Create an instance of a red black tree, whose comparison function takes +an argument +@return an empty rb tree */ +UNIV_INTERN +ib_rbt_t* +rbt_create_arg_cmp( +/*===============*/ + size_t sizeof_value, /*!< in: sizeof data item */ + ib_rbt_arg_compare + compare, /*!< in: fn to compare items */ + void* cmp_arg) /*!< in: compare fn arg */ +{ + ib_rbt_t* tree; + + ut_a(cmp_arg); + + tree = rbt_create(sizeof_value, NULL); + tree->cmp_arg = cmp_arg; + tree->compare_with_arg = compare; + + return(tree); +} + +/**********************************************************************//** +Create an instance of a red black tree. +@return an empty rb tree */ +UNIV_INTERN +ib_rbt_t* +rbt_create( +/*=======*/ + size_t sizeof_value, /*!< in: sizeof data item */ + ib_rbt_compare compare) /*!< in: fn to compare items */ +{ + ib_rbt_t* tree; + ib_rbt_node_t* node; + + tree = (ib_rbt_t*) ut_malloc(sizeof(*tree)); + memset(tree, 0, sizeof(*tree)); + + tree->sizeof_value = sizeof_value; + + /* Create the sentinel (NIL) node. 
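+ A single shared sentinel stands in for every NULL child pointer, so
+ the rebalancing code can test colors and follow parent links without
+ special-casing empty subtrees.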
*/ + node = tree->nil = (ib_rbt_node_t*) ut_malloc(sizeof(*node)); + memset(node, 0, sizeof(*node)); + + node->color = IB_RBT_BLACK; + node->parent = node->left = node->right = node; + + /* Create the "fake" root, the real root node will be the + left child of this node. */ + node = tree->root = (ib_rbt_node_t*) ut_malloc(sizeof(*node)); + memset(node, 0, sizeof(*node)); + + node->color = IB_RBT_BLACK; + node->parent = node->left = node->right = tree->nil; + + tree->compare = compare; + + return(tree); +} + +/**********************************************************************//** +Generic insert of a value in the rb tree. +@return inserted node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_insert( +/*=======*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const void* key, /*!< in: key for ordering */ + const void* value) /*!< in: value of key, this value + is copied to the node */ +{ + ib_rbt_node_t* node; + + /* Create the node that will hold the value data. */ + node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree)); + + memcpy(node->value, value, tree->sizeof_value); + node->parent = node->left = node->right = tree->nil; + + /* Insert in the tree in the usual way. */ + rbt_tree_insert(tree, key, node); + rbt_balance_tree(tree, node); + + ++tree->n_nodes; + + return(node); +} + +/**********************************************************************//** +Add a new node to the tree, useful for data that is pre-sorted. +@return appended node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_add_node( +/*=========*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: bounds */ + const void* value) /*!< in: this value is copied + to the node */ +{ + ib_rbt_node_t* node; + + /* Create the node that will hold the value data */ + node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree)); + + memcpy(node->value, value, tree->sizeof_value); + return(rbt_add_preallocated_node(tree, parent, node)); +} + +/****************************************************************//** +Add a new caller-provided node to tree at the specified position. +The node must have its key fields initialized correctly. +@return added node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_add_preallocated_node( +/*======================*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + ib_rbt_node_t* node) /*!< in: node */ +{ + node->parent = node->left = node->right = tree->nil; + + /* If tree is empty */ + if (parent->last == NULL) { + parent->last = tree->root; + } + + /* Append the node, the hope here is that the caller knows + what s/he is doing. */ + rbt_tree_add_child(tree, parent, node); + rbt_balance_tree(tree, node); + + ++tree->n_nodes; + +#if defined(IB_RBT_TESTING) + ut_a(rbt_validate(tree)); +#endif + return(node); +} + + +/**********************************************************************//** +Find a matching node in the rb tree. +@return NULL if not found else the node where key was found */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_lookup( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const void* key) /*!< in: key to use for search */ +{ + const ib_rbt_node_t* current = ROOT(tree); + + /* Regular binary search. 
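+ The descent mirrors rbt_tree_insert() above, but stops early on an
+ exact match.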
*/ + while (current != tree->nil) { + int result; + + if (tree->cmp_arg) { + result = tree->compare_with_arg( + tree->cmp_arg, key, current->value); + } else { + result = tree->compare(key, current->value); + } + + if (result < 0) { + current = current->left; + } else if (result > 0) { + current = current->right; + } else { + break; + } + } + + return(current != tree->nil ? current : NULL); +} + +/**********************************************************************//** +Delete a node indentified by key. +@return TRUE if success FALSE if not found */ +UNIV_INTERN +ibool +rbt_delete( +/*=======*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const void* key) /*!< in: key to delete */ +{ + ibool deleted = FALSE; + ib_rbt_node_t* node = (ib_rbt_node_t*) rbt_lookup(tree, key); + + if (node) { + rbt_remove_node_and_rebalance(tree, node); + + ut_free(node); + deleted = TRUE; + } + + return(deleted); +} + +/**********************************************************************//** +Remove a node from the rb tree, the node is not free'd, that is the +callers responsibility. +@return deleted node but without the const */ +UNIV_INTERN +ib_rbt_node_t* +rbt_remove_node( +/*============*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* const_node) /*!< in: node to delete, this + is a fudge and declared const + because the caller can access + only const nodes */ +{ + /* Cast away the const. */ + rbt_remove_node_and_rebalance(tree, (ib_rbt_node_t*) const_node); + + /* This is to make it easier to do something like this: + ut_free(rbt_remove_node(node)); + */ + + return((ib_rbt_node_t*) const_node); +} + +/**********************************************************************//** +Find the node that has the lowest key that is >= key. +@return node satisfying the lower bound constraint or NULL */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_lower_bound( +/*============*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const void* key) /*!< in: key to search */ +{ + ib_rbt_node_t* lb_node = NULL; + ib_rbt_node_t* current = ROOT(tree); + + while (current != tree->nil) { + int result; + + if (tree->cmp_arg) { + result = tree->compare_with_arg( + tree->cmp_arg, key, current->value); + } else { + result = tree->compare(key, current->value); + } + + if (result > 0) { + + current = current->right; + + } else if (result < 0) { + + lb_node = current; + current = current->left; + + } else { + lb_node = current; + break; + } + } + + return(lb_node); +} + +/**********************************************************************//** +Find the node that has the greatest key that is <= key. +@return node satisfying the upper bound constraint or NULL */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_upper_bound( +/*============*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const void* key) /*!< in: key to search */ +{ + ib_rbt_node_t* ub_node = NULL; + ib_rbt_node_t* current = ROOT(tree); + + while (current != tree->nil) { + int result; + + if (tree->cmp_arg) { + result = tree->compare_with_arg( + tree->cmp_arg, key, current->value); + } else { + result = tree->compare(key, current->value); + } + + if (result > 0) { + + ub_node = current; + current = current->right; + + } else if (result < 0) { + + current = current->left; + + } else { + ub_node = current; + break; + } + } + + return(ub_node); +} + +/**********************************************************************//** +Find the node that has the greatest key that is <= key. 
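+The function also fills in parent->last and parent->result with the last
+node visited and the last comparison outcome, so one typical
+insert-if-absent pattern (the one rbt_merge_uniq() below uses) is:
+
+	ib_rbt_bound_t	parent;
+
+	if (rbt_search(tree, &parent, key) != 0) {
+		rbt_add_node(tree, &parent, value);
+	}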
+@return value of result */ +UNIV_INTERN +int +rbt_search( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key) /*!< in: key to search */ +{ + ib_rbt_node_t* current = ROOT(tree); + + /* Every thing is greater than the NULL root. */ + parent->result = 1; + parent->last = NULL; + + while (current != tree->nil) { + + parent->last = current; + + if (tree->cmp_arg) { + parent->result = tree->compare_with_arg( + tree->cmp_arg, key, current->value); + } else { + parent->result = tree->compare(key, current->value); + } + + if (parent->result > 0) { + current = current->right; + } else if (parent->result < 0) { + current = current->left; + } else { + break; + } + } + + return(parent->result); +} + +/**********************************************************************//** +Find the node that has the greatest key that is <= key. But use the +supplied comparison function. +@return value of result */ +UNIV_INTERN +int +rbt_search_cmp( +/*===========*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key, /*!< in: key to search */ + ib_rbt_compare compare, /*!< in: fn to compare items */ + ib_rbt_arg_compare + arg_compare) /*!< in: fn to compare items + with argument */ +{ + ib_rbt_node_t* current = ROOT(tree); + + /* Every thing is greater than the NULL root. */ + parent->result = 1; + parent->last = NULL; + + while (current != tree->nil) { + + parent->last = current; + + if (arg_compare) { + ut_ad(tree->cmp_arg); + parent->result = arg_compare( + tree->cmp_arg, key, current->value); + } else { + parent->result = compare(key, current->value); + } + + if (parent->result > 0) { + current = current->right; + } else if (parent->result < 0) { + current = current->left; + } else { + break; + } + } + + return(parent->result); +} + +/**********************************************************************//** +Return the left most node in the tree. */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_first( +/*======*/ + /* out leftmost node or NULL */ + const ib_rbt_t* tree) /* in: rb tree */ +{ + ib_rbt_node_t* first = NULL; + ib_rbt_node_t* current = ROOT(tree); + + while (current != tree->nil) { + first = current; + current = current->left; + } + + return(first); +} + +/**********************************************************************//** +Return the right most node in the tree. +@return the rightmost node or NULL */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_last( +/*=====*/ + const ib_rbt_t* tree) /*!< in: rb tree */ +{ + ib_rbt_node_t* last = NULL; + ib_rbt_node_t* current = ROOT(tree); + + while (current != tree->nil) { + last = current; + current = current->right; + } + + return(last); +} + +/**********************************************************************//** +Return the next node. +@return node next from current */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_next( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* current) /*!< in: current node */ +{ + return(current ? rbt_find_successor(tree, current) : NULL); +} + +/**********************************************************************//** +Return the previous node. +@return node prev from current */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_prev( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* current) /*!< in: current node */ +{ + return(current ? 
rbt_find_predecessor(tree, current) : NULL); +} + +/**********************************************************************//** +Reset the tree. Delete all the nodes. */ +UNIV_INTERN +void +rbt_clear( +/*======*/ + ib_rbt_t* tree) /*!< in: rb tree */ +{ + rbt_free_node(ROOT(tree), tree->nil); + rbt_reset(tree); +} + +/****************************************************************//** +Clear the tree without deleting and freeing its nodes. */ +UNIV_INTERN +void +rbt_reset( +/*======*/ + ib_rbt_t* tree) /*!< in: rb tree */ +{ + tree->n_nodes = 0; + tree->root->left = tree->root->right = tree->nil; +} + +/**********************************************************************//** +Merge the node from dst into src. Return the number of nodes merged. +@return no. of recs merged */ +UNIV_INTERN +ulint +rbt_merge_uniq( +/*===========*/ + ib_rbt_t* dst, /*!< in: dst rb tree */ + const ib_rbt_t* src) /*!< in: src rb tree */ +{ + ib_rbt_bound_t parent; + ulint n_merged = 0; + const ib_rbt_node_t* src_node = rbt_first(src); + + if (rbt_empty(src) || dst == src) { + return(0); + } + + for (/* No op */; src_node; src_node = rbt_next(src, src_node)) { + + if (rbt_search(dst, &parent, src_node->value) != 0) { + rbt_add_node(dst, &parent, src_node->value); + ++n_merged; + } + } + + return(n_merged); +} + +/**********************************************************************//** +Merge the node from dst into src. Return the number of nodes merged. +Delete the nodes from src after copying node to dst. As a side effect +the duplicates will be left untouched in the src. +@return no. of recs merged */ +UNIV_INTERN +ulint +rbt_merge_uniq_destructive( +/*=======================*/ + ib_rbt_t* dst, /*!< in: dst rb tree */ + ib_rbt_t* src) /*!< in: src rb tree */ +{ + ib_rbt_bound_t parent; + ib_rbt_node_t* src_node; + ulint old_size = rbt_size(dst); + + if (rbt_empty(src) || dst == src) { + return(0); + } + + for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) { + ib_rbt_node_t* prev = src_node; + + src_node = (ib_rbt_node_t*) rbt_next(src, prev); + + /* Skip duplicates. */ + if (rbt_search(dst, &parent, prev->value) != 0) { + + /* Remove and reset the node but preserve + the node (data) value. */ + rbt_remove_node_and_rebalance(src, prev); + + /* The nil should be taken from the dst tree. */ + prev->parent = prev->left = prev->right = dst->nil; + rbt_tree_add_child(dst, &parent, prev); + rbt_balance_tree(dst, prev); + + ++dst->n_nodes; + } + } + +#if defined(IB_RBT_TESTING) + ut_a(rbt_validate(dst)); + ut_a(rbt_validate(src)); +#endif + return(rbt_size(dst) - old_size); +} + +/**********************************************************************//** +Check that every path from the root to the leaves has the same count and +the tree nodes are in order. +@return TRUE if OK FALSE otherwise */ +UNIV_INTERN +ibool +rbt_validate( +/*=========*/ + const ib_rbt_t* tree) /*!< in: RB tree to validate */ +{ + if (rbt_count_black_nodes(tree, ROOT(tree)) > 0) { + return(rbt_check_ordering(tree)); + } + + return(FALSE); +} + +/**********************************************************************//** +Iterate over the tree in depth first order. 
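+The traversal is pre-order: each node is printed before its left and
+right subtrees (see rbt_print_subtree() above).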
*/ +UNIV_INTERN +void +rbt_print( +/*======*/ + const ib_rbt_t* tree, /*!< in: tree to traverse */ + ib_rbt_print_node print) /*!< in: print function */ +{ + rbt_print_subtree(tree, ROOT(tree), print); +} diff --git a/storage/xtradb/ut/ut0rnd.cc b/storage/xtradb/ut/ut0rnd.cc new file mode 100644 index 00000000000..3b4d7381181 --- /dev/null +++ b/storage/xtradb/ut/ut0rnd.cc @@ -0,0 +1,97 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***************************************************************//** +@file ut/ut0rnd.cc +Random numbers and hashing + +Created 5/11/1994 Heikki Tuuri +********************************************************************/ + +#include "ut0rnd.h" + +#ifdef UNIV_NONINL +#include "ut0rnd.ic" +#endif + +/** These random numbers are used in ut_find_prime */ +/*@{*/ +#define UT_RANDOM_1 1.0412321 +#define UT_RANDOM_2 1.1131347 +#define UT_RANDOM_3 1.0132677 +/*@}*/ + +/** Seed value of ut_rnd_gen_ulint(). */ +UNIV_INTERN ulint ut_rnd_ulint_counter = 65654363; + +/***********************************************************//** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. +@return prime */ +UNIV_INTERN +ulint +ut_find_prime( +/*==========*/ + ulint n) /*!< in: positive number > 100 */ +{ + ulint pow2; + ulint i; + + n += 100; + + pow2 = 1; + while (pow2 * 2 < n) { + pow2 = 2 * pow2; + } + + if ((double) n < 1.05 * (double) pow2) { + n = (ulint) ((double) n * UT_RANDOM_1); + } + + pow2 = 2 * pow2; + + if ((double) n > 0.95 * (double) pow2) { + n = (ulint) ((double) n * UT_RANDOM_2); + } + + if (n > pow2 - 20) { + n += 30; + } + + /* Now we have n far enough from powers of 2. To make + n more random (especially, if it was not near + a power of 2), we then multiply it by a random number. */ + + n = (ulint) ((double) n * UT_RANDOM_3); + + for (;; n++) { + i = 2; + while (i * i <= n) { + if (n % i == 0) { + goto next_n; + } + i++; + } + + /* Found a prime */ + break; +next_n: ; + } + + return(n); +} diff --git a/storage/xtradb/ut/ut0ut.cc b/storage/xtradb/ut/ut0ut.cc new file mode 100644 index 00000000000..931b61d1a7c --- /dev/null +++ b/storage/xtradb/ut/ut0ut.cc @@ -0,0 +1,839 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
diff --git a/storage/xtradb/ut/ut0ut.cc b/storage/xtradb/ut/ut0ut.cc new file mode 100644 index 00000000000..931b61d1a7c --- /dev/null +++ b/storage/xtradb/ut/ut0ut.cc @@ -0,0 +1,839 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***************************************************************//** +@file ut/ut0ut.cc +Various utilities for Innobase. + +Created 5/11/1994 Heikki Tuuri +********************************************************************/ + +#include "ut0ut.h" + +#ifndef UNIV_INNOCHECKSUM + +#include "ut0sort.h" +#include "os0thread.h" /* thread-ID */ + +#ifdef UNIV_NONINL +#include "ut0ut.ic" +#endif + +#include <stdarg.h> +#include <string.h> +#include <ctype.h> + +#ifndef UNIV_HOTBACKUP +# include "btr0types.h" +# include "trx0trx.h" +# include "ha_prototypes.h" +# include "mysql_com.h" /* NAME_LEN */ +#endif /* UNIV_HOTBACKUP */ + +/** A constant to prevent the compiler from optimizing ut_delay() away. */ +UNIV_INTERN ibool ut_always_false = FALSE; + +#ifdef __WIN__ +/*****************************************************************//** +NOTE: The Windows epoch starts from 1601/01/01 whereas the Unix +epoch starts from 1970/01/01. For the selection of the constant see: +http://support.microsoft.com/kb/167296/ */ +#define WIN_TO_UNIX_DELTA_USEC ((ib_int64_t) 11644473600000000ULL) + + +/*****************************************************************//** +This is the Windows version of gettimeofday(2). +@return 0 if all OK else -1 */ +static +int +ut_gettimeofday( +/*============*/ + struct timeval* tv, /*!< out: Values are relative to Unix epoch */ + void* tz) /*!< in: not used */ +{ + FILETIME ft; + ib_int64_t tm; + + if (!tv) { + errno = EINVAL; + return(-1); + } + + GetSystemTimeAsFileTime(&ft); + + tm = (ib_int64_t) ft.dwHighDateTime << 32; + tm |= ft.dwLowDateTime; + + ut_a(tm >= 0); /* If tm wraps over to negative, the quotient / 10 + does not work */ + + tm /= 10; /* Convert from 100 nsec periods to usec */ + + /* If we don't convert to the Unix epoch the value for + struct timeval::tv_sec will overflow. */ + tm -= WIN_TO_UNIX_DELTA_USEC; + + tv->tv_sec = (long) (tm / 1000000L); + tv->tv_usec = (long) (tm % 1000000L); + + return(0); +} +#else +/** An alias for gettimeofday(2). On Microsoft Windows, we have to +reimplement this function. */ +#define ut_gettimeofday gettimeofday +#endif + +/**********************************************************//** +Returns system time. We do not specify the format of the time returned: +the only way to manipulate it is to use the function ut_difftime. +@return system time */ +UNIV_INTERN +ib_time_t +ut_time(void) +/*=========*/ +{ + return(time(NULL)); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Returns the system time in seconds and microseconds since the Epoch. +Upon successful completion, the value 0 is returned; otherwise the +value -1 is returned and the global variable errno is set to indicate the +error. 
+@return 0 on success, -1 otherwise */ +UNIV_INTERN +int +ut_usectime( +/*========*/ + ulint* sec, /*!< out: seconds since the Epoch */ + ulint* ms) /*!< out: microseconds since the Epoch+*sec */ +{ + struct timeval tv; + int ret; + int errno_gettimeofday; + int i; + + for (i = 0; i < 10; i++) { + + ret = ut_gettimeofday(&tv, NULL); + + if (ret == -1) { + errno_gettimeofday = errno; + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: gettimeofday(): %s\n", + strerror(errno_gettimeofday)); + os_thread_sleep(100000); /* 0.1 sec */ + errno = errno_gettimeofday; + } else { + break; + } + } + + if (ret != -1) { + *sec = (ulint) tv.tv_sec; + *ms = (ulint) tv.tv_usec; + } + + return(ret); +} + +/**********************************************************//** +Returns the number of microseconds since epoch. Similar to +time(3), the return value is also stored in *tloc, provided +that tloc is non-NULL. +@return us since epoch */ +UNIV_INTERN +ullint +ut_time_us( +/*=======*/ + ullint* tloc) /*!< out: us since epoch, if non-NULL */ +{ + struct timeval tv; + ullint us; + + ut_gettimeofday(&tv, NULL); + + us = (ullint) tv.tv_sec * 1000000 + tv.tv_usec; + + if (tloc != NULL) { + *tloc = us; + } + + return(us); +} + +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +UNIV_INTERN +ulint +ut_time_ms(void) +/*============*/ +{ + struct timeval tv; + + ut_gettimeofday(&tv, NULL); + + return((ulint) tv.tv_sec * 1000 + tv.tv_usec / 1000); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************//** +Returns the difference of two times in seconds. +@return time2 - time1 expressed in seconds */ +UNIV_INTERN +double +ut_difftime( +/*========*/ + ib_time_t time2, /*!< in: time */ + ib_time_t time1) /*!< in: time */ +{ + return(difftime(time2, time1)); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/**********************************************************//** +Prints a timestamp to a file. */ +UNIV_INTERN +void +ut_print_timestamp( +/*===============*/ + FILE* file) /*!< in: file where to print */ +{ + ulint thread_id = 0; + +#ifndef UNIV_INNOCHECKSUM + thread_id = os_thread_pf(os_thread_get_curr_id()); +#endif + +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx", + (int) cal_tm.wYear, + (int) cal_tm.wMonth, + (int) cal_tm.wDay, + (int) cal_tm.wHour, + (int) cal_tm.wMinute, + (int) cal_tm.wSecond, + thread_id); +#else + struct tm* cal_tm_ptr; + time_t tm; + +#ifdef HAVE_LOCALTIME_R + struct tm cal_tm; + time(&tm); + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + time(&tm); + cal_tm_ptr = localtime(&tm); +#endif + fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx", + cal_tm_ptr->tm_year + 1900, + cal_tm_ptr->tm_mon + 1, + cal_tm_ptr->tm_mday, + cal_tm_ptr->tm_hour, + cal_tm_ptr->tm_min, + cal_tm_ptr->tm_sec, + thread_id); +#endif +} + +#ifndef UNIV_INNOCHECKSUM + +/**********************************************************//** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. 
*/ +UNIV_INTERN +void +ut_sprintf_timestamp( +/*=================*/ + char* buf) /*!< in: buffer where to sprintf */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + sprintf(buf, "%02d%02d%02d %2d:%02d:%02d", + (int) cal_tm.wYear % 100, + (int) cal_tm.wMonth, + (int) cal_tm.wDay, + (int) cal_tm.wHour, + (int) cal_tm.wMinute, + (int) cal_tm.wSecond); +#else + struct tm* cal_tm_ptr; + time_t tm; + +#ifdef HAVE_LOCALTIME_R + struct tm cal_tm; + time(&tm); + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + time(&tm); + cal_tm_ptr = localtime(&tm); +#endif + sprintf(buf, "%02d%02d%02d %2d:%02d:%02d", + cal_tm_ptr->tm_year % 100, + cal_tm_ptr->tm_mon + 1, + cal_tm_ptr->tm_mday, + cal_tm_ptr->tm_hour, + cal_tm_ptr->tm_min, + cal_tm_ptr->tm_sec); +#endif +} + +#ifdef UNIV_HOTBACKUP +/**********************************************************//** +Sprintfs a timestamp to a buffer with no spaces and with ':' characters +replaced by '_'. */ +UNIV_INTERN +void +ut_sprintf_timestamp_without_extra_chars( +/*=====================================*/ + char* buf) /*!< in: buffer where to sprintf */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d", + (int) cal_tm.wYear % 100, + (int) cal_tm.wMonth, + (int) cal_tm.wDay, + (int) cal_tm.wHour, + (int) cal_tm.wMinute, + (int) cal_tm.wSecond); +#else + struct tm* cal_tm_ptr; + time_t tm; + +#ifdef HAVE_LOCALTIME_R + struct tm cal_tm; + time(&tm); + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + time(&tm); + cal_tm_ptr = localtime(&tm); +#endif + sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d", + cal_tm_ptr->tm_year % 100, + cal_tm_ptr->tm_mon + 1, + cal_tm_ptr->tm_mday, + cal_tm_ptr->tm_hour, + cal_tm_ptr->tm_min, + cal_tm_ptr->tm_sec); +#endif +} + +/**********************************************************//** +Returns current year, month, day. */ +UNIV_INTERN +void +ut_get_year_month_day( +/*==================*/ + ulint* year, /*!< out: current year */ + ulint* month, /*!< out: month */ + ulint* day) /*!< out: day */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + *year = (ulint) cal_tm.wYear; + *month = (ulint) cal_tm.wMonth; + *day = (ulint) cal_tm.wDay; +#else + struct tm* cal_tm_ptr; + time_t tm; + +#ifdef HAVE_LOCALTIME_R + struct tm cal_tm; + time(&tm); + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + time(&tm); + cal_tm_ptr = localtime(&tm); +#endif + *year = (ulint) cal_tm_ptr->tm_year + 1900; + *month = (ulint) cal_tm_ptr->tm_mon + 1; + *day = (ulint) cal_tm_ptr->tm_mday; +#endif +} +#endif /* UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Runs an idle loop on CPU. The argument gives the desired delay +in microseconds on 100 MHz Pentium + Visual C++. +@return dummy value */ +UNIV_INTERN +ulint +ut_delay( +/*=====*/ + ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */ +{ + ulint i, j; + + j = 0; + + for (i = 0; i < delay * 50; i++) { + j += i; + UT_RELAX_CPU(); + } + + if (ut_always_false) { + ut_always_false = (ibool) j; + } + + return(j); +} +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. 
*/ +UNIV_INTERN +void +ut_print_buf( +/*=========*/ + FILE* file, /*!< in: file where to print */ + const void* buf, /*!< in: memory buffer */ + ulint len) /*!< in: length of the buffer */ +{ + const byte* data; + ulint i; + + UNIV_MEM_ASSERT_RW(buf, len); + + fprintf(file, " len %lu; hex ", len); + + for (data = (const byte*) buf, i = 0; i < len; i++) { + fprintf(file, "%02lx", (ulong)*data++); + } + + fputs("; asc ", file); + + data = (const byte*) buf; + + for (i = 0; i < len; i++) { + int c = (int) *data++; + putc(isprint(c) ? c : ' ', file); + } + + putc(';', file); +} + +/**********************************************************************//** +Sort function for ulint arrays. */ +UNIV_INTERN +void +ut_ulint_sort( +/*==========*/ + ulint* arr, /*!< in/out: array to sort */ + ulint* aux_arr, /*!< in/out: aux array to use in sort */ + ulint low, /*!< in: lower bound */ + ulint high) /*!< in: upper bound */ +{ + UT_SORT_FUNCTION_BODY(ut_ulint_sort, arr, aux_arr, low, high, + ut_ulint_cmp); +} + +/*************************************************************//** +Calculates fast the number rounded up to the nearest power of 2. +@return first power of 2 which is >= n */ +UNIV_INTERN +ulint +ut_2_power_up( +/*==========*/ + ulint n) /*!< in: number != 0 */ +{ + ulint res; + + res = 1; + + ut_ad(n > 0); + + while (res < n) { + res = res * 2; + } + + return(res); +} + +/**********************************************************************//** +Outputs a NUL-terminated file name, quoted with apostrophes. */ +UNIV_INTERN +void +ut_print_filename( +/*==============*/ + FILE* f, /*!< in: output stream */ + const char* name) /*!< in: name to print */ +{ + putc('\'', f); + for (;;) { + int c = *name++; + switch (c) { + case 0: + goto done; + case '\'': + putc(c, f); + /* fall through */ + default: + putc(c, f); + } + } +done: + putc('\'', f); +} +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_name( +/*==========*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ibool table_id,/*!< in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name) /*!< in: name to print */ +{ + ut_print_namel(f, trx, table_id, name, strlen(name)); +} + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_namel( +/*===========*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction (NULL=no quotes) */ + ibool table_id,/*!< in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name, /*!< in: name to print */ + ulint namelen)/*!< in: length of name */ +{ + /* 2 * NAME_LEN for database and table name, + and some slack for the #mysql50# prefix and quotes */ + char buf[3 * NAME_LEN]; + const char* bufend; + + bufend = innobase_convert_name(buf, sizeof buf, + name, namelen, + trx ? 
trx->mysql_thd : NULL, + table_id); + + fwrite(buf, 1, bufend - buf, f); +} + +/**********************************************************************//** +Formats a table or index name, quoted as an SQL identifier. If the name +contains a slash '/', the result will contain two identifiers separated by +a period (.), as in SQL database_name.identifier. +@return pointer to 'formatted' */ +UNIV_INTERN +char* +ut_format_name( +/*===========*/ + const char* name, /*!< in: table or index name, must be + '\0'-terminated */ + ibool is_table, /*!< in: if TRUE then 'name' is a table + name */ + char* formatted, /*!< out: formatted result, will be + '\0'-terminated */ + ulint formatted_size) /*!< out: no more than this number of + bytes will be written to 'formatted' */ +{ + switch (formatted_size) { + case 1: + formatted[0] = '\0'; + /* FALL-THROUGH */ + case 0: + return(formatted); + } + + char* end; + + end = innobase_convert_name(formatted, formatted_size, + name, strlen(name), NULL, is_table); + + /* If the space in 'formatted' was completely used, then sacrifice + the last character in order to write '\0' at the end. */ + if ((ulint) (end - formatted) == formatted_size) { + end--; + } + + ut_a((ulint) (end - formatted) < formatted_size); + + *end = '\0'; + + return(formatted); +} + +/**********************************************************************//** +Catenate files. */ +UNIV_INTERN +void +ut_copy_file( +/*=========*/ + FILE* dest, /*!< in: output file */ + FILE* src) /*!< in: input file to be appended to output */ +{ + long len = ftell(src); + char buf[4096]; + + rewind(src); + do { + size_t maxs = len < (long) sizeof buf + ? (size_t) len + : sizeof buf; + size_t size = fread(buf, 1, maxs, src); + fwrite(buf, 1, size, dest); + len -= (long) size; + if (size < maxs) { + break; + } + } while (len > 0); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef __WIN__ +# include <stdarg.h> +/**********************************************************************//** +A substitute for vsnprintf(3), formatted output conversion into +a limited buffer. Note: this function DOES NOT return the number of +characters that would have been printed if the buffer was unlimited because +VC's _vsnprintf() returns -1 in this case and we would need to call +_vscprintf() in addition to estimate that but we would need another copy +of "ap" for that and VC does not provide va_copy(). */ +UNIV_INTERN +void +ut_vsnprintf( +/*=========*/ + char* str, /*!< out: string */ + size_t size, /*!< in: str size */ + const char* fmt, /*!< in: format */ + va_list ap) /*!< in: format values */ +{ + _vsnprintf(str, size, fmt, ap); + str[size - 1] = '\0'; +} + +/**********************************************************************//** +A substitute for snprintf(3), formatted output conversion into +a limited buffer. +@return number of characters that would have been printed if the size +were unlimited, not including the terminating '\0'. */ +UNIV_INTERN +int +ut_snprintf( +/*========*/ + char* str, /*!< out: string */ + size_t size, /*!< in: str size */ + const char* fmt, /*!< in: format */ + ...) 
/*!< in: format values */ +{ + int res; + va_list ap1; + va_list ap2; + + va_start(ap1, fmt); + va_start(ap2, fmt); + + res = _vscprintf(fmt, ap1); + ut_a(res != -1); + + if (size > 0) { + _vsnprintf(str, size, fmt, ap2); + + if ((size_t) res >= size) { + str[size - 1] = '\0'; + } + } + + va_end(ap1); + va_end(ap2); + + return(res); +} +#endif /* __WIN__ */ + +/*************************************************************//** +Convert an error number to a human readable text message. The +returned string is static and should not be freed or modified. +@return string, describing the error */ +UNIV_INTERN +const char* +ut_strerr( +/*======*/ + dberr_t num) /*!< in: error number */ +{ + switch (num) { + case DB_SUCCESS: + return("Success"); + case DB_SUCCESS_LOCKED_REC: + return("Success, record lock created"); + case DB_ERROR: + return("Generic error"); + case DB_READ_ONLY: + return("Read only transaction"); + case DB_INTERRUPTED: + return("Operation interrupted"); + case DB_OUT_OF_MEMORY: + return("Cannot allocate memory"); + case DB_OUT_OF_FILE_SPACE: + return("Out of disk space"); + case DB_LOCK_WAIT: + return("Lock wait"); + case DB_DEADLOCK: + return("Deadlock"); + case DB_ROLLBACK: + return("Rollback"); + case DB_DUPLICATE_KEY: + return("Duplicate key"); + case DB_QUE_THR_SUSPENDED: + return("The queue thread has been suspended"); + case DB_MISSING_HISTORY: + return("Required history data has been deleted"); + case DB_CLUSTER_NOT_FOUND: + return("Cluster not found"); + case DB_TABLE_NOT_FOUND: + return("Table not found"); + case DB_MUST_GET_MORE_FILE_SPACE: + return("More file space needed"); + case DB_TABLE_IS_BEING_USED: + return("Table is being used"); + case DB_TOO_BIG_RECORD: + return("Record too big"); + case DB_TOO_BIG_INDEX_COL: + return("Index columns size too big"); + case DB_LOCK_WAIT_TIMEOUT: + return("Lock wait timeout"); + case DB_NO_REFERENCED_ROW: + return("Referenced key value not found"); + case DB_ROW_IS_REFERENCED: + return("Row is referenced"); + case DB_CANNOT_ADD_CONSTRAINT: + return("Cannot add constraint"); + case DB_CORRUPTION: + return("Data structure corruption"); + case DB_CANNOT_DROP_CONSTRAINT: + return("Cannot drop constraint"); + case DB_NO_SAVEPOINT: + return("No such savepoint"); + case DB_TABLESPACE_EXISTS: + return("Tablespace already exists"); + case DB_TABLESPACE_DELETED: + return("Tablespace deleted or being deleted"); + case DB_TABLESPACE_NOT_FOUND: + return("Tablespace not found"); + case DB_LOCK_TABLE_FULL: + return("Lock structs have exhausted the buffer pool"); + case DB_FOREIGN_DUPLICATE_KEY: + return("Foreign key activated with duplicate keys"); + case DB_FOREIGN_EXCEED_MAX_CASCADE: + return("Foreign key cascade delete/update exceeds max depth"); + case DB_TOO_MANY_CONCURRENT_TRXS: + return("Too many concurrent transactions"); + case DB_UNSUPPORTED: + return("Unsupported"); + case DB_INVALID_NULL: + return("NULL value encountered in NOT NULL column"); + case DB_STATS_DO_NOT_EXIST: + return("Persistent statistics do not exist"); + case DB_FAIL: + return("Failed, retry may succeed"); + case DB_OVERFLOW: + return("Overflow"); + case DB_UNDERFLOW: + return("Underflow"); + case DB_STRONG_FAIL: + return("Failed, retry will not succeed"); + case DB_ZIP_OVERFLOW: + return("Zip overflow"); + case DB_RECORD_NOT_FOUND: + return("Record not found"); + case DB_CHILD_NO_INDEX: + return("No index on referencing keys in referencing table"); + case DB_PARENT_NO_INDEX: + return("No index on referenced keys in referenced table"); + case DB_FTS_INVALID_DOCID: 
+ return("FTS Doc ID cannot be zero"); + case DB_INDEX_CORRUPT: + return("Index corrupted"); + case DB_UNDO_RECORD_TOO_BIG: + return("Undo record too big"); + case DB_END_OF_INDEX: + return("End of index"); + case DB_IO_ERROR: + return("I/O error"); + case DB_TABLE_IN_FK_CHECK: + return("Table is being used in foreign key check"); + case DB_DATA_MISMATCH: + return("data mismatch"); + case DB_SCHEMA_NOT_LOCKED: + return("schema not locked"); + case DB_NOT_FOUND: + return("not found"); + case DB_ONLINE_LOG_TOO_BIG: + return("Log size exceeded during online index creation"); + case DB_DICT_CHANGED: + return("Table dictionary has changed"); + case DB_IDENTIFIER_TOO_LONG: + return("Identifier name is too long"); + case DB_FTS_EXCEED_RESULT_CACHE_LIMIT: + return("FTS query exceeds result cache limit"); + case DB_TEMP_FILE_WRITE_FAILURE: + return("Temp file write failure"); + case DB_FTS_TOO_MANY_WORDS_IN_PHRASE: + return("Too many words in a FTS phrase or proximity search"); + + /* Do not add a default: label, so that the compiler produces a + warning if new codes are added to the enum but not handled here. */ + } + + /* We abort here: if an unknown error code is given, memory + corruption may have happened and someone's error-code variable + may have been overwritten with bogus data. */ + ut_error; + + /* NOT REACHED */ + return("Unknown error"); +} +#endif /* !UNIV_INNOCHECKSUM */
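[Example; not part of the patch.] ut_strerr() returns a pointer to a static string, so callers print it directly and must never free or modify it; the deliberately missing default: case means any new dberr_t value triggers a compiler warning rather than a silent "Unknown error". A minimal sketch of an error path, where report_error() is an illustrative helper:

static void
report_error(dberr_t err)	/* illustrative helper */
{
	if (err != DB_SUCCESS) {
		/* The returned string is static: do not free it. */
		fprintf(stderr,
			"InnoDB: operation failed: %s (error %d)\n",
			ut_strerr(err), (int) err);
	}
}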
diff --git a/storage/xtradb/ut/ut0vec.cc b/storage/xtradb/ut/ut0vec.cc new file mode 100644 index 00000000000..5842d9f1c0e --- /dev/null +++ b/storage/xtradb/ut/ut0vec.cc @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file ut/ut0vec.cc +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#include "ut0vec.h" +#ifdef UNIV_NONINL +#include "ut0vec.ic" +#endif +#include "mem0mem.h" + +/******************************************************************** +Create a new vector with the given initial size. */ +UNIV_INTERN +ib_vector_t* +ib_vector_create( +/*=============*/ + /* out: vector */ + ib_alloc_t* allocator, /* in: vector allocator */ + ulint sizeof_value, /* in: size of data item */ + ulint size) /* in: initial size */ +{ + ib_vector_t* vec; + + ut_a(size > 0); + + vec = static_cast<ib_vector_t*>( allocator->mem_malloc(allocator, sizeof(*vec))); + + vec->used = 0; + vec->total = size; + vec->allocator = allocator; + vec->sizeof_value = sizeof_value; + + vec->data = static_cast<void*>( allocator->mem_malloc(allocator, vec->sizeof_value * size)); + + return(vec); +} + +/******************************************************************** +Resize the vector. Currently the vector can only grow: each resize +doubles the number of elements it can hold. */ +UNIV_INTERN +void +ib_vector_resize( +/*=============*/ + ib_vector_t* vec) /* in: vector */ +{ + ulint new_total = vec->total * 2; + ulint old_size = vec->used * vec->sizeof_value; + ulint new_size = new_total * vec->sizeof_value; + + vec->data = static_cast<void*>(vec->allocator->mem_resize( vec->allocator, vec->data, old_size, new_size)); + + vec->total = new_total; +}
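[Example; not part of the patch.] The vector stores elements by value (sizeof_value bytes each) and doubles its capacity whenever it fills up. A hedged sketch, assuming ib_heap_allocator_create() and ib_vector_push() from ut0vec.h/ut0vec.ic, neither of which is defined in this file:

static void
vector_sketch(void)
{
	mem_heap_t*	heap = mem_heap_create(1024);
	ib_alloc_t*	alloc = ib_heap_allocator_create(heap);	/* assumed API */
	ib_vector_t*	vec = ib_vector_create(alloc, sizeof(ulint), 4);
	ulint		i;

	for (i = 0; i < 10; i++) {
		/* The 5th push overflows the initial 4 slots and
		triggers ib_vector_resize(): total grows 4 -> 8,
		and later 8 -> 16. */
		ib_vector_push(vec, &i);	/* assumed API */
	}

	/* Heap-backed vectors are released with their heap. */
	mem_heap_free(heap);
}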
diff --git a/storage/xtradb/ut/ut0wqueue.cc b/storage/xtradb/ut/ut0wqueue.cc new file mode 100644 index 00000000000..d1ba36b3b00 --- /dev/null +++ b/storage/xtradb/ut/ut0wqueue.cc @@ -0,0 +1,175 @@ +/***************************************************************************** + +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#include "ut0wqueue.h" + +/*******************************************************************//** +@file ut/ut0wqueue.cc +A work queue + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/****************************************************************//** +Create a new work queue. +@return work queue */ +UNIV_INTERN +ib_wqueue_t* +ib_wqueue_create(void) +/*===================*/ +{ + ib_wqueue_t* wq = static_cast<ib_wqueue_t*>(mem_alloc(sizeof(*wq))); + + /* Function ib_wqueue_create() is not used anywhere yet, so it is + not necessary to instrument this mutex. */ + mutex_create(PFS_NOT_INSTRUMENTED, &wq->mutex, SYNC_WORK_QUEUE); + + wq->items = ib_list_create(); + wq->event = os_event_create(); + + return(wq); +} + +/****************************************************************//** +Free a work queue. */ +UNIV_INTERN +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq) /*!< in: work queue */ +{ + mutex_free(&wq->mutex); + ib_list_free(wq->items); + os_event_free(wq->event); + + mem_free(wq); +} + +/****************************************************************//** +Add a work item to the queue. */ +UNIV_INTERN +void +ib_wqueue_add( +/*==========*/ + ib_wqueue_t* wq, /*!< in: work queue */ + void* item, /*!< in: work item */ + mem_heap_t* heap) /*!< in: memory heap to use for allocating the + list node */ +{ + mutex_enter(&wq->mutex); + + ib_list_add_last(wq->items, item, heap); + os_event_set(wq->event); + + mutex_exit(&wq->mutex); +} + +/****************************************************************//** +Wait for a work item to appear in the queue. +@return work item */ +UNIV_INTERN +void* +ib_wqueue_wait( +/*===========*/ + ib_wqueue_t* wq) /*!< in: work queue */ +{ + ib_list_node_t* node; + + for (;;) { + os_event_wait(wq->event); + + mutex_enter(&wq->mutex); + + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + if (!ib_list_get_first(wq->items)) { + /* We must reset the event when the list + gets emptied. */ + os_event_reset(wq->event); + } + + break; + } + + mutex_exit(&wq->mutex); + } + + mutex_exit(&wq->mutex); + + return(node->data); +} + + +/******************************************************************** +Wait for a work item to appear in the queue within the specified time. */ + +void* +ib_wqueue_timedwait( +/*================*/ + /* out: work item or NULL on timeout */ + ib_wqueue_t* wq, /* in: work queue */ + ib_time_t wait_in_usecs) /* in: wait time in microseconds */ +{ + ib_list_node_t* node = NULL; + + for (;;) { + ulint error; + ib_int64_t sig_count; + + mutex_enter(&wq->mutex); + + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + mutex_exit(&wq->mutex); + break; + } + + sig_count = os_event_reset(wq->event); + + mutex_exit(&wq->mutex); + + error = os_event_wait_time_low(wq->event, + (ulint) wait_in_usecs, + sig_count); + + if (error == OS_SYNC_TIME_EXCEEDED) { + break; + } + } + + return(node ? node->data : NULL); +} + +/******************************************************************** +Check if queue is empty. */ + +ibool +ib_wqueue_is_empty( +/*===============*/ + /* out: TRUE if queue empty + else FALSE */ + const ib_wqueue_t* wq) /* in: work queue */ +{ + return(ib_list_is_empty(wq->items)); +}
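[Example; not part of the patch.] To close, a hedged producer/consumer sketch for the queue above, assuming mem_heap_create()/mem_heap_free() from mem0mem.h for the list-node allocation that ib_wqueue_add() requires; the item type is arbitrary:

static void
wqueue_sketch(void)
{
	ib_wqueue_t*	wq = ib_wqueue_create();
	mem_heap_t*	heap = mem_heap_create(512);	/* assumed from mem0mem.h */
	ulint		work = 42;

	/* Producer side: append the item and signal wq->event. */
	ib_wqueue_add(wq, &work, heap);

	/* Consumer side: blocks until an item is available.
	ib_wqueue_timedwait(wq, 1000000) would instead give up after
	about one second and return NULL. */
	ulint*	item = static_cast<ulint*>(ib_wqueue_wait(wq));

	ut_a(*item == 42);
	ut_a(ib_wqueue_is_empty(wq));

	ib_wqueue_free(wq);
	mem_heap_free(heap);
}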